mlpractical/mlp/dataset.py


# Machine Learning Practical (INFR11119),
# Pawel Swietojanski, University of Edinburgh

import cPickle
import gzip
import numpy
import os


class DataProvider(object):
    """
    Data provider defines an interface for our
    generic data-independent readers.
    """
    def __init__(self, batch_size, randomize=True):
        """
        :param batch_size: int, specifies the number
               of elements returned at each step
        :param randomize: bool, shuffles examples prior
               to iteration, so they are presented in random
               order for stochastic gradient descent training
        :return:
        """
        self.batch_size = batch_size
        self.randomize = randomize
        self._curr_idx = 0

    def reset(self):
        """
        Resets the provider to the initial state to
        use in another epoch
        :return: None
        """
        self._curr_idx = 0

    def __randomize(self):
        """
        Data-specific implementation of shuffling mechanism
        :return:
        """
        raise NotImplementedError()

    def __iter__(self):
        """
        This method says an object is iterable.
        """
        return self

    def next(self):
        """
        Data-specific iteration mechanism. Called each step 
        (i.e. each iteration in a loop)
        unitl StopIteration() exception is raised.
        :return:
        """
        raise NotImplementedError()


class MNISTDataProvider(DataProvider):
    """
    The class iterates over MNIST digits dataset, in possibly
    random order.
    """
    def __init__(self, dset,
                 batch_size=10,
                 max_num_examples=-1,
                 randomize=True):

        super(MNISTDataProvider, self).\
            __init__(batch_size, randomize)

        assert dset in ['train', 'valid', 'eval'], (
            "Expected dset to be either 'train', "
            "'valid' or 'eval' got %s" % dset
        )

        dset_path = './data/mnist_%s.pkl.gz' % dset
        assert os.path.isfile(dset_path), (
            "File %s was expected to exist!." % dset_path
        )

        with gzip.open(dset_path) as f:
            x, t = cPickle.load(f)

        self._max_num_examples = max_num_examples
        self.x = x
        self.t = t
        self.num_classes = 10

        self._rand_idx = None
        if self.randomize:
            self._rand_idx = self.__randomize()

    def reset(self):
        super(MNISTDataProvider, self).reset()
        if self.randomize:
            self._rand_idx = self.__randomize()

    def __randomize(self):
        assert isinstance(self.x, numpy.ndarray)
        return numpy.random.permutation(numpy.arange(0, self.x.shape[0]))

    def next(self):
        
        has_enough = (self._curr_idx + self.batch_size) <= self.x.shape[0]
        presented_max = (self._max_num_examples > 0 and
                         self._curr_idx + self.batch_size > self._max_num_examples)

        if not has_enough or presented_max:
            raise StopIteration()

        if self._rand_idx is not None:
            range_idx = \
                self._rand_idx[self._curr_idx:self._curr_idx + self.batch_size]
        else:
            range_idx = \
                numpy.arange(self._curr_idx, self._curr_idx + self.batch_size)

        rval_x = self.x[range_idx]
        rval_t = self.t[range_idx]

        self._curr_idx += self.batch_size

        return rval_x, self.__to_one_of_k(rval_y)
        return rval_x, rval_t

    def __to_one_of_k(self, y):
        rval = numpy.zeros((y.shape[0], self.num_classes), dtype=numpy.float32)
        for i in xrange(y.shape[0]):
            rval[i, y[i]] = 1
        return rval


class MetOfficeDataProvider_(DataProvider):
    """
    The class iterates over South Scotland Weather, in possibly
    random order.
    """
    def __init__(self, window_size,
                 batch_size=10,
                 max_num_batches=-1,
                 randomize=True):

        super(MetOfficeDataProvider_, self).\
            __init__(batch_size, randomize)

        dset_path = './data/HadSSP_daily_qc.txt'
        assert os.path.isfile(dset_path), (
            "File %s was expected to exist!." % dset_path
        )

        raw = numpy.loadtxt(dset_path, skiprows=3, usecols=range(2, 32))
        
        self.window_size = windows_size
        #filter out all missing datapoints and
        #flatten a matrix to a vector, so we will get
        #a time preserving representation of measurments
        #with self.x[0] being the first day and self.x[-1] the last
        self.x = raw[raw < 0].flatten()
        self._max_num_examples = max_num_examples
        
        self._rand_idx = None
        if self.randomize:
            self._rand_idx = self.__randomize()

    def reset(self):
        super(MetOfficeDataProvider_, self).reset()
        if self.randomize:
            self._rand_idx = self.__randomize()

    def __randomize(self):
        assert isinstance(self.x, numpy.ndarray)
        # we generate random indexes starting from window_size, i.e. 10th absolute element
        # in the self.x vector, as we later during minibatch preparation slice 
        # the self.x container backwards, i.e. given we want to get a training 
        # data-point for 11th day, we look at 10 preeceding days. 
        # Note, we cannot do this, for example, for the 5th day as
        # we do not have enough observations to make an input (10 days) to the model
        return numpy.random.permutation(numpy.arange(self.window_size, self.x.shape[0]))

    def next(self):
        
        has_enough = (self._curr_idx + self.batch_size) <= self.x.shape[0]
        presented_max = (self._max_num_examples > 0 and
                         self._curr_idx + self.batch_size > self._max_num_examples)

        if not has_enough or presented_max:
            raise StopIteration()

        if self._rand_idx is not None:
            range_idx = \
                self._rand_idx[self._curr_idx:self._curr_idx + self.batch_size]
        else:
            range_idx = \
                numpy.arange(self._curr_idx, self._curr_idx + self.batch_size)

        #build slicing matrix of size minibatch, which will contain batch_size
        #rows, each keeping indexes that selects windows_size+1 [for (x,t)] elements
        #from data vector (self.x) that itself stays always sorted w.r.t time
        range_slices = numpy.zeros((self.batch_size, self.window_size + 1))
        for i in xrange(0, self.batch_size):
            range_slices[i,:] = \
                numpy.arange(range_idx[i], range_idx[i] - self.window_size - 1, -1)[::-1]
        
        #here we use advanced indexing to select slices from observation vector
        #last column of rval_x makes our targets t
        rval_x = self.x[range_slices]

        self._curr_idx += self.batch_size

        return rval_x[:,:-1], rval[:,-1]

    
class FuncDataProvider(DataProvider):
    """
    Function gets as an argument a list of functions defining the means
    of a normal distribution to sample from.
    """
    def __init__(self,
                 fn_list=[lambda x: x ** 2, lambda x: numpy.sin(x)],
                 std_list=[0.1, 0.1],
                 x_from = 0.0,
                 x_to = 1.0,
                 points_per_fn=200,
                 batch_size=10,
                 randomize=True):
        """
        """

        super(FuncDataProvider, self).__init__(batch_size, randomize)

        def sample_points(y, std):
            ys = numpy.zeros_like(y)
            for i in xrange(y.shape[0]):
                ys[i] = numpy.random.normal(y[i], std)
            return ys

        x = numpy.linspace(x_from, x_to, points_per_fn, dtype=numpy.float32)
        means = [fn(x) for fn in fn_list]
        y = [sample_points(mean, std) for mean, std in zip(means, std_list)]

        self.x_orig = x
        self.y_class = y

        self.x = numpy.concatenate([x for ys in y])
        self.y = numpy.concatenate([ys for ys in y])

        if self.randomize:
            self._rand_idx = self.__randomize()
        else:
            self._rand_idx = None

    def __randomize(self):
        assert isinstance(self.x, numpy.ndarray)
        return numpy.random.permutation(numpy.arange(0, self.x.shape[0]))

    def __iter__(self):
        return self

    def next(self):
        if (self._curr_idx + self.batch_size) >= self.x.shape[0]:
            raise StopIteration()

        if self._rand_idx is not None:
            range_idx = self._rand_idx[self._curr_idx:self._curr_idx + self.batch_size]
        else:
            range_idx = numpy.arange(self._curr_idx, self._curr_idx + self.batch_size)

        x = self.x[range_idx]
        y = self.y[range_idx]

        self._curr_idx += self.batch_size

        return x, y
1st labs 2015-09-27 22:53:27 +02:00
			`# Machine Learning Practical (INFR11119),`
			`# Pawel Swietojanski, University of Edinburgh`

			`import cPickle`
			`import gzip`
			`import numpy`
			`import os`


			`class DataProvider(object):`
			`"""`
			`Data provider defines an interface for our`
			`generic data-independent readers.`
			`"""`
			`def __init__(self, batch_size, randomize=True):`
			`"""`
			`:param batch_size: int, specifies the number`
			`of elements returned at each step`
			`:param randomize: bool, shuffles examples prior`
			`to iteration, so they are presented in random`
			`order for stochastic gradient descent training`
			`:return:`
			`"""`
			`self.batch_size = batch_size`
			`self.randomize = randomize`
			`self._curr_idx = 0`

			`def reset(self):`
			`"""`
			`Resets the provider to the initial state to`
			`use in another epoch`
			`:return: None`
			`"""`
			`self._curr_idx = 0`

			`def __randomize(self):`
			`"""`
			`Data-specific implementation of shuffling mechanism`
			`:return:`
			`"""`
			`raise NotImplementedError()`

			`def __iter__(self):`
changes to dataset 2015-10-01 14:26:08 +02:00			`"""`
			`This method says an object is iterable.`
			`"""`
1st labs 2015-09-27 22:53:27 +02:00			`return self`

			`def next(self):`
			`"""`
changes to dataset 2015-10-01 14:26:08 +02:00			`Data-specific iteration mechanism. Called each step`
			`(i.e. each iteration in a loop)`
			`unitl StopIteration() exception is raised.`
1st labs 2015-09-27 22:53:27 +02:00			`:return:`
			`"""`
			`raise NotImplementedError()`


			`class MNISTDataProvider(DataProvider):`
			`"""`
			`The class iterates over MNIST digits dataset, in possibly`
			`random order.`
			`"""`
			`def __init__(self, dset,`
			`batch_size=10,`
			`max_num_examples=-1,`
			`randomize=True):`

			`super(MNISTDataProvider, self).\`
			`__init__(batch_size, randomize)`

			`assert dset in ['train', 'valid', 'eval'], (`
			`"Expected dset to be either 'train', "`
			`"'valid' or 'eval' got %s" % dset`
			`)`

			`dset_path = './data/mnist_%s.pkl.gz' % dset`
			`assert os.path.isfile(dset_path), (`
			`"File %s was expected to exist!." % dset_path`
			`)`

			`with gzip.open(dset_path) as f:`
			`x, t = cPickle.load(f)`

			`self._max_num_examples = max_num_examples`
			`self.x = x`
			`self.t = t`
			`self.num_classes = 10`

			`self._rand_idx = None`
			`if self.randomize:`
			`self._rand_idx = self.__randomize()`

			`def reset(self):`
			`super(MNISTDataProvider, self).reset()`
			`if self.randomize:`
			`self._rand_idx = self.__randomize()`

			`def __randomize(self):`
			`assert isinstance(self.x, numpy.ndarray)`
changes to dataset 2015-10-01 14:26:08 +02:00			`return numpy.random.permutation(numpy.arange(0, self.x.shape[0]))`
1st labs 2015-09-27 22:53:27 +02:00
			`def next(self):`

			`has_enough = (self._curr_idx + self.batch_size) <= self.x.shape[0]`
			`presented_max = (self._max_num_examples > 0 and`
			`self._curr_idx + self.batch_size > self._max_num_examples)`

			`if not has_enough or presented_max:`
			`raise StopIteration()`

			`if self._rand_idx is not None:`
			`range_idx = \`
			`self._rand_idx[self._curr_idx:self._curr_idx + self.batch_size]`
			`else:`
			`range_idx = \`
			`numpy.arange(self._curr_idx, self._curr_idx + self.batch_size)`

			`rval_x = self.x[range_idx]`
			`rval_t = self.t[range_idx]`

			`self._curr_idx += self.batch_size`

changes to dataset 2015-10-01 14:26:08 +02:00			`return rval_x, self.__to_one_of_k(rval_y)`
1st labs 2015-09-27 22:53:27 +02:00			`return rval_x, rval_t`

			`def __to_one_of_k(self, y):`
changes to dataset 2015-10-01 14:26:08 +02:00			`rval = numpy.zeros((y.shape[0], self.num_classes), dtype=numpy.float32)`
			`for i in xrange(y.shape[0]):`
			`rval[i, y[i]] = 1`
			`return rval`


			`class MetOfficeDataProvider_(DataProvider):`
			`"""`
			`The class iterates over South Scotland Weather, in possibly`
			`random order.`
			`"""`
			`def __init__(self, window_size,`
			`batch_size=10,`
			`max_num_batches=-1,`
			`randomize=True):`

			`super(MetOfficeDataProvider_, self).\`
			`__init__(batch_size, randomize)`

			`dset_path = './data/HadSSP_daily_qc.txt'`
			`assert os.path.isfile(dset_path), (`
			`"File %s was expected to exist!." % dset_path`
			`)`
1st labs 2015-09-27 22:53:27 +02:00
changes to dataset 2015-10-01 14:26:08 +02:00			`raw = numpy.loadtxt(dset_path, skiprows=3, usecols=range(2, 32))`

			`self.window_size = windows_size`
			`#filter out all missing datapoints and`
			`#flatten a matrix to a vector, so we will get`
			`#a time preserving representation of measurments`
			`#with self.x[0] being the first day and self.x[-1] the last`
			`self.x = raw[raw < 0].flatten()`
			`self._max_num_examples = max_num_examples`

			`self._rand_idx = None`
			`if self.randomize:`
			`self._rand_idx = self.__randomize()`

			`def reset(self):`
			`super(MetOfficeDataProvider_, self).reset()`
			`if self.randomize:`
			`self._rand_idx = self.__randomize()`
1st labs 2015-09-27 22:53:27 +02:00
changes to dataset 2015-10-01 14:26:08 +02:00			`def __randomize(self):`
			`assert isinstance(self.x, numpy.ndarray)`
			`# we generate random indexes starting from window_size, i.e. 10th absolute element`
			`# in the self.x vector, as we later during minibatch preparation slice`
			`# the self.x container backwards, i.e. given we want to get a training`
			`# data-point for 11th day, we look at 10 preeceding days.`
			`# Note, we cannot do this, for example, for the 5th day as`
			`# we do not have enough observations to make an input (10 days) to the model`
			`return numpy.random.permutation(numpy.arange(self.window_size, self.x.shape[0]))`

			`def next(self):`

			`has_enough = (self._curr_idx + self.batch_size) <= self.x.shape[0]`
			`presented_max = (self._max_num_examples > 0 and`
			`self._curr_idx + self.batch_size > self._max_num_examples)`

			`if not has_enough or presented_max:`
			`raise StopIteration()`

			`if self._rand_idx is not None:`
			`range_idx = \`
			`self._rand_idx[self._curr_idx:self._curr_idx + self.batch_size]`
			`else:`
			`range_idx = \`
			`numpy.arange(self._curr_idx, self._curr_idx + self.batch_size)`

			`#build slicing matrix of size minibatch, which will contain batch_size`
			`#rows, each keeping indexes that selects windows_size+1 [for (x,t)] elements`
			`#from data vector (self.x) that itself stays always sorted w.r.t time`
			`range_slices = numpy.zeros((self.batch_size, self.window_size + 1))`
			`for i in xrange(0, self.batch_size):`
			`range_slices[i,:] = \`
			`numpy.arange(range_idx[i], range_idx[i] - self.window_size - 1, -1)[::-1]`

			`#here we use advanced indexing to select slices from observation vector`
			`#last column of rval_x makes our targets t`
			`rval_x = self.x[range_slices]`

			`self._curr_idx += self.batch_size`

			`return rval_x[:,:-1], rval[:,-1]`


1st labs 2015-09-27 22:53:27 +02:00			`class FuncDataProvider(DataProvider):`
			`"""`
changes to dataset 2015-10-01 14:26:08 +02:00			`Function gets as an argument a list of functions defining the means`
			`of a normal distribution to sample from.`
1st labs 2015-09-27 22:53:27 +02:00			`"""`
			`def __init__(self,`
			`fn_list=[lambda x: x ** 2, lambda x: numpy.sin(x)],`
			`std_list=[0.1, 0.1],`
			`x_from = 0.0,`
			`x_to = 1.0,`
			`points_per_fn=200,`
			`batch_size=10,`
			`randomize=True):`
changes to dataset 2015-10-01 14:26:08 +02:00			`"""`
			`"""`
1st labs 2015-09-27 22:53:27 +02:00
			`super(FuncDataProvider, self).__init__(batch_size, randomize)`

			`def sample_points(y, std):`
			`ys = numpy.zeros_like(y)`
			`for i in xrange(y.shape[0]):`
			`ys[i] = numpy.random.normal(y[i], std)`
			`return ys`

			`x = numpy.linspace(x_from, x_to, points_per_fn, dtype=numpy.float32)`
			`means = [fn(x) for fn in fn_list]`
			`y = [sample_points(mean, std) for mean, std in zip(means, std_list)]`

			`self.x_orig = x`
			`self.y_class = y`

			`self.x = numpy.concatenate([x for ys in y])`
			`self.y = numpy.concatenate([ys for ys in y])`

			`if self.randomize:`
			`self._rand_idx = self.__randomize()`
			`else:`
			`self._rand_idx = None`

			`def __randomize(self):`
			`assert isinstance(self.x, numpy.ndarray)`
changes to dataset 2015-10-01 14:26:08 +02:00			`return numpy.random.permutation(numpy.arange(0, self.x.shape[0]))`
1st labs 2015-09-27 22:53:27 +02:00
			`def __iter__(self):`
			`return self`

			`def next(self):`
			`if (self._curr_idx + self.batch_size) >= self.x.shape[0]:`
			`raise StopIteration()`

			`if self._rand_idx is not None:`
			`range_idx = self._rand_idx[self._curr_idx:self._curr_idx + self.batch_size]`
			`else:`
			`range_idx = numpy.arange(self._curr_idx, self._curr_idx + self.batch_size)`

			`x = self.x[range_idx]`
			`y = self.y[range_idx]`

			`self._curr_idx += self.batch_size`

			`return x, y`