1st labs

2015-09-27 21:53:27 +01:00 · 2015-09-27 21:53:27 +01:00 · c35e81a211
commit c35e81a211
parent fd35da1b86
7 changed files with 1211 additions and 0 deletions
--- a/data/.dropbox.attr
+++ b/data/.dropbox.attr
@ -0,0 +1 @@
 {}
--- a/data/HadSSP_daily_qc.txt
+++ b/data/HadSSP_daily_qc.txt
--- a/data/mnist_eval.pkl.gz
+++ b/data/mnist_eval.pkl.gz
--- a/data/mnist_train.pkl.gz
+++ b/data/mnist_train.pkl.gz
--- a/data/mnist_valid.pkl.gz
+++ b/data/mnist_valid.pkl.gz
--- a/mlp/init.py
+++ b/mlp/init.py
--- a/mlp/dataset.py
+++ b/mlp/dataset.py
@ -0,0 +1,187 @@
 # Machine Learning Practical (INFR11119),
 # Pawel Swietojanski, University of Edinburgh
 import cPickle
 import gzip
 import numpy
 import os
 class DataProvider(object):
    """
    Data provider defines an interface for our
    generic data-independent readers.

    A provider is its own iterator: each step is expected to return
    ``batch_size`` elements, and subclasses supply the data-specific
    part by overriding next().
    """

    def __init__(self, batch_size, randomize=True):
        """
        :param batch_size: int, specifies the number
               of elements returned at each step
        :param randomize: bool, shuffles examples prior
               to iteration, so they are presented in random
               order for stochastic gradient descent training
        """
        self.batch_size = batch_size
        self.randomize = randomize
        # Offset of the first element of the next step.
        self._curr_idx = 0

    def reset(self):
        """
        Resets the provider to the initial state to
        use in another epoch
        :return: None
        """
        self._curr_idx = 0

    def __randomize(self):
        """
        Data-specific implementation of shuffling mechanism.

        NOTE: the double leading underscore triggers name mangling, so
        this method is private to DataProvider; a method of the same
        name in a subclass is a separate method, not an override.

        :raises NotImplementedError: always
        """
        raise NotImplementedError()

    def __iter__(self):
        """
        :return: the provider itself, which acts as its own iterator
        """
        return self

    def next(self):
        """
        Data-specific iteration mechanism (Python 2 iterator protocol).
        :raises NotImplementedError: always, subclasses must override it
        """
        raise NotImplementedError()

    def __next__(self):
        """
        Python 3 spelling of the iterator protocol.

        Delegates to next() at call time (rather than aliasing it), so
        subclasses that only override next() become iterable under
        Python 3 as well.
        :return: whatever next() returns
        """
        return self.next()
 class MNISTDataProvider(DataProvider):
    """
    The class iterates over MNIST digits dataset, in possibly
    random order.
    """

    def __init__(self, dset,
                 batch_size=10,
                 max_num_examples=-1,
                 randomize=True):
        """
        :param dset: str, one of 'train', 'valid' or 'eval'; selects the
               file ./data/mnist_<dset>.pkl.gz, resolved against the
               current working directory
        :param batch_size: int, number of examples per mini-batch
        :param max_num_examples: int, when positive, iteration stops
               before a mini-batch would go past this many examples;
               non-positive values (the default) disable the limit
        :param randomize: bool, present the examples in shuffled order
        :raises AssertionError: when dset is not one of the accepted
               names or the data file does not exist
        """
        super(MNISTDataProvider, self).\
            __init__(batch_size, randomize)

        assert dset in ['train', 'valid', 'eval'], (
            "Expected dset to be either 'train', "
            "'valid' or 'eval' got %s" % dset
        )

        dset_path = './data/mnist_%s.pkl.gz' % dset
        assert os.path.isfile(dset_path), (
            "File %s was expected to exist!." % dset_path
        )

        # NOTE: unpickling can execute arbitrary code, so only trusted
        # data files must be placed under ./data.
        with gzip.open(dset_path) as f:
            x, t = cPickle.load(f)

        self._max_num_examples = max_num_examples
        self.x = x
        self.t = t
        self.num_classes = 10

        self._rand_idx = None
        if self.randomize:
            self._rand_idx = self.__randomize()

    def reset(self):
        """
        Rewinds to the first mini-batch and, when randomisation is on,
        draws a new shuffling order for the next epoch.
        :return: None
        """
        super(MNISTDataProvider, self).reset()
        if self.randomize:
            self._rand_idx = self.__randomize()

    def __randomize(self):
        """
        :return: numpy array holding a random permutation of the
                 indices 0 .. self.x.shape[0] - 1
        """
        assert isinstance(self.x, numpy.ndarray)
        # numpy.random has no `permute`; `permutation(n)` returns a
        # shuffled copy of arange(n).
        return numpy.random.permutation(self.x.shape[0])

    def next(self):
        """
        Returns the next mini-batch.

        :return: tuple (x, t) holding the rows of self.x and self.t
                 selected for this mini-batch
        :raises StopIteration: when fewer than batch_size examples are
                 left, or when the mini-batch would go past a positive
                 max_num_examples
        """
        batch_end = self._curr_idx + self.batch_size

        has_enough = batch_end <= self.x.shape[0]
        presented_max = (self._max_num_examples > 0 and
                         batch_end > self._max_num_examples)

        if not has_enough or presented_max:
            raise StopIteration()

        if self._rand_idx is not None:
            range_idx = self._rand_idx[self._curr_idx:batch_end]
        else:
            range_idx = numpy.arange(self._curr_idx, batch_end)

        rval_x = self.x[range_idx]
        rval_t = self.t[range_idx]

        self._curr_idx = batch_end

        # Targets are returned exactly as loaded; the one-of-k
        # conversion (__to_one_of_k) is not implemented yet.
        return rval_x, rval_t

    def __to_one_of_k(self, y):
        """
        Placeholder for the one-of-k conversion of the targets.
        :raises NotImplementedError: always
        """
        raise NotImplementedError('Write me!')
 class FuncDataProvider(DataProvider):
    """
    Provides noisy samples of a list of functions.

    Every function in fn_list is evaluated on the same evenly spaced
    grid of x values; the targets are drawn from normal distributions
    whose means are those function values and whose standard
    deviations come from std_list.
    """

    def __init__(self,
                 fn_list=[lambda x: x ** 2, lambda x: numpy.sin(x)],
                 std_list=[0.1, 0.1],
                 x_from=0.0,
                 x_to=1.0,
                 points_per_fn=200,
                 batch_size=10,
                 randomize=True):
        """
        :param fn_list: list of callables, each maps the numpy array of
               grid points to an array of means (only read, never
               modified here)
        :param std_list: list of standard deviations, paired with
               fn_list by position (only read, never modified here)
        :param x_from: float, first grid point
        :param x_to: float, last grid point
        :param points_per_fn: int, number of grid points per function
        :param batch_size: int, number of examples per mini-batch
        :param randomize: bool, present the examples in shuffled order
        """
        super(FuncDataProvider, self).__init__(batch_size, randomize)

        def sample_points(y, std):
            # A single vectorised draw replaces the per-element loop;
            # the cast keeps the samples in the dtype of the means.
            y = numpy.asarray(y)
            return numpy.random.normal(y, std).astype(y.dtype)

        x = numpy.linspace(x_from, x_to, points_per_fn, dtype=numpy.float32)
        means = [fn(x) for fn in fn_list]
        y = [sample_points(mean, std) for mean, std in zip(means, std_list)]

        self.x_orig = x
        self.y_class = y
        # The grid is repeated once per sampled function, so that
        # self.x[i] is the input that produced self.y[i].
        self.x = numpy.concatenate([x] * len(y))
        self.y = numpy.concatenate(y)

        self._rand_idx = self.__randomize() if self.randomize else None

    def reset(self):
        """
        Rewinds to the first mini-batch and, when randomisation is on,
        draws a new shuffling order for the next epoch.
        :return: None
        """
        super(FuncDataProvider, self).reset()
        if self.randomize:
            self._rand_idx = self.__randomize()

    def __randomize(self):
        """
        :return: numpy array holding a random permutation of the
                 indices 0 .. self.x.shape[0] - 1
        """
        assert isinstance(self.x, numpy.ndarray)
        # numpy.random has no `permute`; `permutation(n)` returns a
        # shuffled copy of arange(n).
        return numpy.random.permutation(self.x.shape[0])

    def next(self):
        """
        Returns the next mini-batch.

        :return: tuple (x, y) holding the entries of self.x and self.y
                 selected for this mini-batch
        :raises StopIteration: when fewer than batch_size examples
                 are left
        """
        batch_end = self._curr_idx + self.batch_size

        # Strictly greater: a mini-batch that ends exactly on the last
        # example is complete and must still be returned.
        if batch_end > self.x.shape[0]:
            raise StopIteration()

        if self._rand_idx is not None:
            range_idx = self._rand_idx[self._curr_idx:batch_end]
        else:
            range_idx = numpy.arange(self._curr_idx, batch_end)

        x = self.x[range_idx]
        y = self.y[range_idx]
        self._curr_idx = batch_end
        return x, y