This commit is contained in:
AntreasAntoniou 2018-09-13 02:28:00 +01:00
parent 8f9961669b
commit 2d54935183
23 changed files with 4509 additions and 0 deletions

14
README.md Normal file
View File

@ -0,0 +1,14 @@
# Machine Learning Practical
This repository contains the code for the University of Edinburgh [School of Informatics](http://www.inf.ed.ac.uk) course [Machine Learning Practical](http://www.inf.ed.ac.uk/teaching/courses/mlp/).
This assignment-based course is focused on the implementation and evaluation of machine learning systems. Students who do this course will have experience in the design, implementation, training, and evaluation of machine learning systems.
The code in this repository is split into:
* a Python package `mlp`, a [NumPy](http://www.numpy.org/) based neural network package designed specifically for the course that students will implement parts of and extend during the course labs and assignments,
* a series of [Jupyter](http://jupyter.org/) notebooks in the `notebooks` directory containing explanatory material and coding exercises to be completed during the course labs.
## Getting set up
Detailed instructions for setting up a development environment for the course are given in [this file](notes/environment-set-up.md). Students doing the course will spend part of the first lab getting their own environment set up.

1023
data/HadSSP_daily_qc.txt Normal file

File diff suppressed because it is too large Load Diff

BIN
data/mnist-test.npz Normal file

Binary file not shown.

BIN
data/mnist-train.npz Normal file

Binary file not shown.

BIN
data/mnist-valid.npz Normal file

Binary file not shown.

6
mlp/__init__.py Normal file
View File

@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
"""Machine Learning Practical package."""
# Authors credited with writing the course framework code.
__authors__ = ['Pawel Swietojanski', 'Steve Renals', 'Matt Graham']
# Default random number generator seed used when callers do not supply an
# explicitly seeded RandomState (e.g. in mlp.data_providers.DataProvider).
DEFAULT_SEED = 123456 # Default random number generator seed if none provided.

206
mlp/data_providers.py Normal file
View File

@ -0,0 +1,206 @@
# -*- coding: utf-8 -*-
"""Data providers.
This module provides classes for loading datasets and iterating over batches of
data points.
"""
import pickle
import gzip
import numpy as np
import os
from mlp import DEFAULT_SEED
class DataProvider(object):
    """Generic data provider.

    Wraps paired `inputs` / `targets` arrays and yields them in fixed-size
    mini-batches, optionally reshuffling the data order at the start of each
    epoch. Instances are their own iterators: `__iter__` returns `self` and
    `__next__` produces `(inputs_batch, targets_batch)` pairs until the epoch
    is exhausted.
    """

    def __init__(self, inputs, targets, batch_size, max_num_batches=-1,
                 shuffle_order=True, rng=None):
        """Create a new data provider object.

        Args:
            inputs (ndarray): Array of data input features of shape
                (num_data, input_dim).
            targets (ndarray): Array of data output targets of shape
                (num_data, output_dim) or (num_data,) if output_dim == 1.
            batch_size (int): Number of data points to include in each batch.
            max_num_batches (int): Maximum number of batches to iterate over
                in an epoch. If `max_num_batches * batch_size > num_data` then
                only as many batches as the data can be split into will be
                used. If set to -1 all of the data will be used.
            shuffle_order (bool): Whether to randomly permute the order of
                the data before each epoch.
            rng (RandomState): A seeded random number generator.
        """
        self.inputs = inputs
        self.targets = targets
        assert batch_size > 0, 'batch_size should be > 0'
        self.batch_size = batch_size
        assert max_num_batches != 0 and not max_num_batches < -1, (
            'max_num_batches should be -1 or > 0')
        self.max_num_batches = max_num_batches
        # maximum possible number of batches is equal to number of whole times
        # batch_size divides in to the number of data points which can be
        # found using integer division
        possible_num_batches = self.inputs.shape[0] // batch_size
        if self.max_num_batches == -1:
            self.num_batches = possible_num_batches
        else:
            self.num_batches = min(self.max_num_batches, possible_num_batches)
        self.shuffle_order = shuffle_order
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng
        self.reset()

    def __iter__(self):
        """Implements Python iterator interface.

        This should return an object implementing a `next` method which steps
        through a sequence returning one element at a time and raising
        `StopIteration` when at the end of the sequence. Here the object
        returned is the DataProvider itself.
        """
        return self

    def reset(self):
        """Resets the provider to the initial state to use in a new epoch."""
        self._curr_batch = 0
        if self.shuffle_order:
            self.shuffle()

    def shuffle(self):
        """Randomly shuffles order of data."""
        # apply the same permutation to inputs and targets so pairs stay aligned
        new_order = self.rng.permutation(self.inputs.shape[0])
        self.inputs = self.inputs[new_order]
        self.targets = self.targets[new_order]

    def next(self):
        """Returns next data batch or raises `StopIteration` if at end."""
        if self._curr_batch + 1 > self.num_batches:
            # no more batches in current iteration through data set so reset
            # the dataset for another pass and indicate iteration is at end
            self.reset()
            raise StopIteration()
        # create an index slice corresponding to current batch number
        batch_slice = slice(self._curr_batch * self.batch_size,
                            (self._curr_batch + 1) * self.batch_size)
        inputs_batch = self.inputs[batch_slice]
        targets_batch = self.targets[batch_slice]
        self._curr_batch += 1
        return inputs_batch, targets_batch

    def __next__(self):
        """Python 3 iterator protocol: delegates to `next`.

        Without this, `__iter__` returning `self` is useless under Python 3 -
        `for batch in provider` would raise `TypeError` - and every subclass
        had to define its own identical `__next__` wrapper.
        """
        return self.next()
class MNISTDataProvider(DataProvider):
    """Data provider for MNIST handwritten digit images."""

    def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
                 shuffle_order=True, rng=None):
        """Create a new MNIST data provider object.

        Args:
            which_set: One of 'train', 'valid' or 'eval'. Determines which
                portion of the MNIST data this object should provide.
            batch_size (int): Number of data points to include in each batch.
            max_num_batches (int): Maximum number of batches to iterate over
                in an epoch. If `max_num_batches * batch_size > num_data` then
                only as many batches as the data can be split into will be
                used. If set to -1 all of the data will be used.
            shuffle_order (bool): Whether to randomly permute the order of
                the data before each epoch.
            rng (RandomState): A seeded random number generator.
        """
        # reject anything other than the three known data splits up front
        assert which_set in ('train', 'valid', 'eval'), (
            'Expected which_set to be either train, valid or eval. '
            'Got {0}'.format(which_set)
        )
        self.which_set = which_set
        self.num_classes = 10
        # the MLP_DATA_DIR environment variable locates the data directory;
        # os.path.join keeps the path separator correct for the current OS
        file_name = 'mnist-{0}.npz'.format(which_set)
        data_path = os.path.join(os.environ['MLP_DATA_DIR'], file_name)
        assert os.path.isfile(data_path), (
            'Data file does not exist at expected path: ' + data_path
        )
        # read the inputs / targets arrays out of the compressed numpy archive
        arrays = np.load(data_path)
        inputs = arrays['inputs'].astype(np.float32)
        targets = arrays['targets']
        # hand the loaded arrays over to the generic provider machinery
        super(MNISTDataProvider, self).__init__(
            inputs, targets, batch_size, max_num_batches, shuffle_order, rng)

    def __next__(self):
        """Python 3 iterator protocol: delegates to `next`."""
        return self.next()

    def to_one_of_k(self, int_targets):
        """Converts integer coded class target to 1 of K coded targets.

        Args:
            int_targets (ndarray): Array of integer coded class targets (i.e.
                where an integer from 0 to `num_classes` - 1 is used to
                indicate which is the correct class). This should be of shape
                (num_data,).

        Returns:
            Array of 1 of K coded targets i.e. an array of shape
            (num_data, num_classes) where for each row all elements are equal
            to zero except for the column corresponding to the correct class
            which is equal to one.
        """
        # left as an exercise - to be implemented during the labs
        raise NotImplementedError()
class MetOfficeDataProvider(DataProvider):
    """South Scotland Met Office weather data provider."""

    def __init__(self, window_size, batch_size=10, max_num_batches=-1,
                 shuffle_order=True, rng=None):
        """Create a new Met Office data provider object.

        Args:
            window_size (int): Size of windows to split weather time series
                data into. The constructed input features will be the first
                `window_size - 1` entries in each window and the target outputs
                the last entry in each window.
            batch_size (int): Number of data points to include in each batch.
            max_num_batches (int): Maximum number of batches to iterate over
                in an epoch. If `max_num_batches * batch_size > num_data` then
                only as many batches as the data can be split into will be
                used. If set to -1 all of the data will be used.
            shuffle_order (bool): Whether to randomly permute the order of
                the data before each epoch.
            rng (RandomState): A seeded random number generator.
        """
        self.window_size = window_size
        # need at least one input entry plus one target entry per window
        assert window_size > 1, 'window_size must be at least 2.'
        data_path = os.path.join(
            os.environ['MLP_DATA_DIR'], 'HadSSP_daily_qc.txt')
        assert os.path.isfile(data_path), (
            'Data file does not exist at expected path: ' + data_path
        )
        # TODO (lab exercise): load raw data from the text file
        # TODO: filter out all missing datapoints and flatten to a vector
        # TODO: normalise data to zero mean, unit standard deviation
        # TODO: convert from flat sequence to windowed data
        # TODO: inputs are the first (window_size - 1) entries in each window
        # TODO: targets are the last entry in each window
        # then initialise the base class with the inputs and targets arrays:
        # super(MetOfficeDataProvider, self).__init__(
        #     inputs, targets, batch_size, max_num_batches, shuffle_order, rng)

    def __next__(self):
        """Python 3 iterator protocol: delegates to `next`."""
        return self.next()

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 200 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 69 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 62 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 61 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 73 KiB

Binary file not shown.

441
notes/environment-set-up.md Normal file
View File

@ -0,0 +1,441 @@
# Environment set up
*The instructions below are intentionally verbose as they try to explain the reasoning behind our choice of environment set up and to explain what each command we are asking you to run does. If you are already confident using bash, Conda environments and Git you may wish to instead use the much shorter [minimal set-up instructions](#minimal-set-up-instructions-for-dice) at the end which skip the explanations.*
In this course we will be using [Python 3](https://www.python.org/) for all the labs and coursework assignments. In particular we will be making heavy use of the numerical computing libraries [NumPy](http://www.numpy.org/) and [SciPy](http://www.scipy.org/), and the interactive notebook application [Jupyter](http://jupyter.org/).
A common headache in software projects is ensuring the correct versions of all dependencies are available on the current development system. Often you may be working on several distinct projects simultaneously each with its own potentially conflicting dependencies on external libraries. Additionally you may be working across multiple different machines (for example a personal laptop and University computers) with possibly different operating systems. Further, as is the case in Informatics on DICE, you may not have root-level access to a system you are working on and so not be able to install software at a system-wide level and system updates may cause library versions to be changed to incompatible versions.
One way of overcoming these issues is to use project-specific *virtual environments*. In this context a virtual environment is an isolated development environment where the external dependencies of a project can be installed and managed independent of the system-wide versions (and those of the environments of other projects).
There are several virtual environment solutions available in the Python eco-system, including the native [pyvenv](https://docs.python.org/3/library/venv.html) in Python 3 and the popular [virtualenv](https://virtualenv.pypa.io/en/stable/). Also related is [pip](https://pip.pypa.io/en/stable/) a Python package manager natively included in Python 2.7.9 and above.
Here we will instead use the environment capabilities of the [Conda](http://conda.pydata.org/docs/) package management system. Unlike pip and virtualenv/pyvenv, Conda is not limited to managing Python packages but is a language and platform agnostic package manager. Both NumPy and SciPy have many non-Python external dependencies and their performance is very dependent on correctly linking to optimised linear algebra libraries.
Conda can handle installation of the Python libraries we will be using and all their external dependencies, in particular allowing easy installation of [optimised numerical computing libraries](https://docs.continuum.io/mkl-optimizations/). Further Conda can easily be installed on Linux, OSX and Windows systems meaning if you wish to set up an environment on a personal machine as well this should be easy to do whatever your operating system of choice is.
There are several options available for installing Conda on a system. Here we will use the Python 3 version of [Miniconda](http://conda.pydata.org/miniconda.html), which installs just Conda and its dependencies. An alternative is to install the [Anaconda Python distribution](https://docs.continuum.io/anaconda/), which installs Conda and a large selection of popular Python packages. As we will require only a small subset of these packages we will use the more barebones Miniconda to avoid eating into your DICE disk quota too much, however if installing on a personal machine you may wish to consider Anaconda if you want to explore other Python packages.
## Installing Miniconda
We provide instructions here for getting an environment with all the required dependencies running on computers running
the School of Informatics [DICE desktop](http://computing.help.inf.ed.ac.uk/dice-platform). The same instructions
should be usable on other Linux distributions such as Ubuntu and Linux Mint with minimal adjustments.
For those wishing to install on a personal Windows or OSX machine, the initial instructions for setting up Conda will
differ slightly - you should instead select the relevant installer for your system from [here](http://conda.pydata.org/miniconda.html) and following the corresponding installation instructions from [here](http://conda.pydata.org/docs/install/quick.html). After Conda is installed the [remaining instructions](#creating-the-conda-environment) should be broadly the same across different systems.
*Note: Although we are happy for you to additionally set up an environment on a personal machine, you should still set up a DICE environment now as this will make sure you are able to use shared computing resources later in the course. Also although we have tried to note when the required commands will differ on non-DICE systems, these instructions have only been tested on DICE and we will not be able to offer any support in labs on getting set up on a non-DICE system.*
---
Open a bash terminal (`Applications > Terminal` on DICE).
We first need to download the latest 64-bit Python 3 Miniconda install script:
```
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
```
This uses `wget` a command-line tool for downloading files.
Now run the install script:
```
bash Miniconda3-latest-Linux-x86_64.sh
```
You will first be asked to review the software license agreement. Assuming you choose to agree, you will then be asked
to choose an install location for Miniconda. The default is to install in the root of your home directory
`~/miniconda3`. We recommend going with this default unless you have a particular reason to do otherwise.
You will then be asked whether to prepend the Miniconda binaries directory to the `PATH` system environment variable
definition in `.bashrc`. As the DICE bash start-up mechanism differs from the standard set up
([details here](http://computing.help.inf.ed.ac.uk/dice-bash)), on DICE you should respond `no` here as we will set up the addition to `PATH` manually in the next step. On other Linux distributions you may choose to accept the default.
On DICE, manually append the Miniconda binaries directory to `PATH` in `~/.benv` using
```
echo "export PATH=\""\$PATH":$HOME/miniconda3/bin\"" >> ~/.benv
```
For those who this appears a bit opaque to and want to know what is going on see here <sup id="a1">[1](#f1)</sup>.
We now need to `source` the updated `~/.benv` so that the `PATH` variable in the current terminal session is updated:
```
source ~/.benv
```
From the next time you log in all future terminal sessions should have the updated `PATH` loaded by default.
## Creating the Conda environment
You should now have a working Conda installation. If you run
```
conda --help
```
from a terminal you should see the Conda help page displayed. If you get a `No command 'conda' found` error you should check you have set up your `PATH` variable correctly (you can get a demonstrator to help you do this).
Assuming Conda is working, we will now create our Conda environment:
```
conda create -n mlp python=3
```
This bootstraps a new Conda environment named `mlp` with a minimal Python 3 install. You will be presented with a 'package plan' listing the packages to be installed and asked whether to proceed: type `y` then enter.
We will now *activate* our created environment:
```
source activate mlp
```
or on Windows only
```
activate mlp
```
When an environment is activated its name will be prepended on to the prompt which should now look something like `(mlp) [machine-name]:~$` on DICE.
**You need to run this `source activate mlp` command every time you wish to activate the `mlp` environment in a terminal (for example at the beginning of each lab)**. When the environment is activated, the environment will be searched first when running commands so that e.g. `python` will launch the Python interpreter installed locally in the `mlp` environment rather than a system-wide version.
If you wish to deactivate an environment loaded in the current terminal e.g. to launch the system Python interpreter, you can run `source deactivate` (just `deactivate` on Windows).
We will now install the dependencies for the course into the new environment:
```
conda install numpy scipy matplotlib jupyter
```
Again you will be given a list of the packages to be installed and asked to confirm whether to proceed. Enter `y` then wait for the packages to install (this should take around five minutes). In addition to Jupyter, NumPy and SciPy which we have already mentioned, we are also installing [matplotlib](http://matplotlib.org/) a plotting and visualisation library.
Once the installation is finished, to recover some disk space we can clear the package tarballs Conda just downloaded:
```
conda clean -t
```
These tarballs are usually cached to allow quicker installation into additional environments however we will only be using a single environment here so there is no need to keep them on disk.
## Getting the course code and a short introduction to Git
The next step in getting our environment set up will be to download the course code. This is available in a Git repository on Github:
https://github.com/CSTR-Edinburgh/mlpractical
[Git](https://git-scm.com/) is a distributed version control system and [Github](https://github.com) a popular site for hosting Git repositories. We will be using Git to distribute the code for all the labs and assignments. We will explain all the necessary `git` commands as we go, though those new to Git may find [this concise guide by Roger Dudler](http://rogerdudler.github.io/git-guide/) or [this slightly longer one from Atlassian](https://www.atlassian.com/git/tutorials/) useful.
---
***Non-DICE systems only:***
Git is installed by default on DICE desktops. If you are running a system which does not have Git installed, you can use Conda to install it in your environment using:
```
conda install git
```
---
We will now go over the process of [cloning](https://www.atlassian.com/git/tutorials/setting-up-a-repository/git-clone) a local copy of the `mlpractical` repository.
---
**Confident Git users only:**
For those who have their own Github account and are confident Git users, you may wish to consider instead [creating a private fork](http://stackoverflow.com/a/30352360) of the `CSTR-Edinburgh/mlpractical` repository on Github. This is not required for the course, however it will allow you to push your local commits to Github making it easier to for example sync your work between DICE computers and a personal machine.
**Note you should NOT create a public fork using the default forking mechanism on Github as this will make any commits you push to the fork publicly available which creates a risk of plagiarism.**
If you are already familiar with Git you may wish to skip over the explanatory sections below, though you should read [the section on how we will use branches to separate the code for different labs](#branching-explanation).
---
By default we will assume here you are cloning to your home directory however if you have an existing system for organising your workspace feel free to keep to that. **If you clone the repository to a path other than `~/mlpractical` however you will need to adjust all references to `~/mlpractical` in the commands below accordingly.**
To clone the `mlpractical` repository to the home directory run
```
git clone https://github.com/CSTR-Edinburgh/mlpractical.git ~/mlpractical
```
This will create a new `mlpractical` subdirectory with a local copy of the repository in it. Enter the directory and list all its contents, including hidden files, by running:
```
cd ~/mlpractical
ls -a # Windows equivalent: dir /a
```
For the most part this will look much like any other directory, with there being the following three non-hidden sub-directories:
* `data`: Data files used in the labs and assignments.
* `mlp`: The custom Python package we will use in this course.
* `notebooks`: The Jupyter notebook files for each lab and coursework.
Additionally there exists a hidden `.git` subdirectory (on Unix systems by default files and directories prepended with a period '.' are hidden). This directory contains the repository history database and various configuration files and references. Unless you are sure you know what you are doing you generally should not edit any of the files in this directory directly. Generally most configuration options can be enacted more safely using a `git config` command.
For instance to globally set the user name and email used in commits you can run:
```
git config --global user.name "[your name]"
git config --global user.email "[matric-number]@sms.ed.ac.uk"
```
*Note this is meant as an example of a `git config` command - you do not need to run this command though there is no harm in doing so.*
From the `~/mlpractical` directory if you now run:
`git status`
a status message containing information about your local clone of the repository should be displayed.
Providing you have not made any changes yet, all that will be displayed is the name of the current *branch* (we will explain what a branch is to those new to Git in a little while), a message that the branch is up to date with the remote repository and that there is nothing to commit in the working directory.
The two key concepts you will need to know about Git for this course are *commits* and *branches*.
A *commit* in Git is a snapshot of the state of the project. The snapshots are recorded in the repository history and allow us to track changes to the code over time and rollback changes if necessary. In Git there is a three stage process to creating a new commit.
1. The relevant edits are made to files in the working directory and any new files created.
2. The files with changes to be committed (including any new files) are added to the *staging area* by running:
```
git add file1 file2 ...
```
3. Finally the *staged changes* are used to create a new commit by running
```
git commit -m "A commit message describing the changes."
```
This writes the staged changes as a new commit in the repository history. We can see a log of the details of previous commits by running:
```
git log
```
Although it is not a requirement of the course for you to make regular commits of your work, we strongly recommend you do as it is a good habit to get into and will make recovery from accidental deletions etc. much easier.
The other key Git concept you will need to know about are *branches*. A branch in Git represents an independent line of development of a project. When a repository is first created it will contain a single branch, named `master` by default. Commits to this branch form a linear series of snapshots of the project.
A new branch is created from a commit on an existing branch. Any commits made to this new branch then evolve as an independent and parallel line of changes - that is commits to the new branch will not affect the old branch and vice versa.
A typical Git workflow in a software development setting would be to create a new branch whenever making changes to a project, for example to fix a bug or implement a new feature. These changes are then isolated from the main code base allowing regular commits without worrying about making unstable changes to the main code base. Key to this workflow is the ability to *merge* commits from a branch into another branch, e.g. when it is decided a new feature is sufficiently developed to be added to the main code base. Although merging branches is a key aspect of using Git in many projects, as dealing with merge conflicts when two branches both make changes to the same parts of files can be a somewhat tricky process, we will here generally try to avoid the need for merges.
<p id='branching-explanation'>We will therefore use branches here in a slightly non-standard way. The code for each week's lab and for each of the assignments will be maintained in a separate branch. This will allow us to stage the release of the notebooks and code for each lab and assignment while allowing you to commit the changes you make to the code each week without having to merge those changes when new code is released. Similarly this structure will allow us to release updated notebooks from previous labs with proposed solutions without overwriting your own work.</p>
To list the branches present in the local repository, run:
```
git branch
```
This will display a list of branches with a `*` next to the current branch. To switch to a different existing branch in the local repository run
```
git checkout branch-name
```
This will change the code in the working directory to the current state of the checked out branch. Any files added to the staging area and committed will then create a new commit on this branch.
You should make sure you are on the first lab branch now by running:
```
git checkout mlp2017-8/lab1
```
## Installing the `mlp` Python package
In your local repository we noted above the presence of a `mlp` subdirectory. This contains the custom Python package implementing the NumPy based neural network framework we will be using in this course.
In order to make the modules in this package available in your environment we need to install it. A [setuptools](https://setuptools.readthedocs.io/en/latest/) `setup.py` script is provided in the root of the `mlpractical` directory for this purpose.
The standard way to install a Python package using a `setup.py` script is to run `python setup.py install`. This creates a copy of the package in the `site-packages` directory of the currently active Python environment.
As we will be updating the code in the `mlp` package during the course of the labs this would require you to re-run `python setup.py install` every time a change is made to the package. Instead therefore you should install the package in development mode by running:
```
python setup.py develop
```
Instead of copying the package, this will instead create a symbolic link to the copy in the local repository. This means any changes made will be immediately available without the need to reinstall the package.
---
**Aside on importing/reloading Python modules:**
Note that after the first time a Python module is loaded into an interpreter instance, using for example:
```
import mlp
```
Running the `import` statement any further times will have no effect even if the underlying module code has been changed. To reload an already imported module we instead need to use the [`reload`](https://docs.python.org/2.7/library/functions.html#reload) function, e.g.
```
reload(mlp)
```
**Note: To be clear as this has caused some confusion in previous labs the above `import ...` / `reload(...)` statements should NOT be run directly in a bash terminal. They are examples Python statements - you could run them in a terminal by first loading a Python interpreter using:**
```
python
```
**however you do not need to do so now. This is meant as information to help you later when importing modules as there was some confusion last year about the difference between `import` and `reload`.**
---
## Adding a data directory variable to the environment
We observed previously the presence of a `data` subdirectory in the local repository. This directory holds the data files that will be used in the course. To enable the data loaders in the `mlp` package to locate these data files we need to set a `MLP_DATA_DIR` environment variable pointing to this directory.
Assuming you used the recommended Miniconda install location and cloned the `mlpractical` repository to your home directory, this variable can be automatically defined when activating the environment by running the following commands (on non-Windows systems):
```
cd ~/miniconda3/envs/mlp
mkdir -p ./etc/conda/activate.d
mkdir -p ./etc/conda/deactivate.d
echo -e '#!/bin/sh\n' >> ./etc/conda/activate.d/env_vars.sh
echo "export MLP_DATA_DIR=$HOME/mlpractical/data" >> ./etc/conda/activate.d/env_vars.sh
echo -e '#!/bin/sh\n' >> ./etc/conda/deactivate.d/env_vars.sh
echo 'unset MLP_DATA_DIR' >> ./etc/conda/deactivate.d/env_vars.sh
export MLP_DATA_DIR=$HOME/mlpractical/data
```
And on Windows systems (replacing the `[]` placeholders with the relevant paths):
```
cd [path-to-conda-root]\envs\mlp
mkdir .\etc\conda\activate.d
mkdir .\etc\conda\deactivate.d
@echo "set MLP_DATA_DIR=[path-to-local-repository]\data" >> .\etc\conda\activate.d\env_vars.bat
@echo "set MLP_DATA_DIR=" >> .\etc\conda\deactivate.d\env_vars.bat
set MLP_DATA_DIR=[path-to-local-repository]\data
```
## Loading the first lab notebook
Your environment is now all set up so you can move on to the introductory exercises in the first lab notebook.
One of the dependencies you installed in your environment earlier was Jupyter. Jupyter notebooks allow combining formatted text with runnable code cells and visualisation of the code output in an intuitive web application interface. Although originally specific to Python (under the previous moniker IPython notebooks) the notebook interface has now been abstracted making them available to a wide range of languages.
There will be a Jupyter notebook available for each lab and assignment in this course, with a combination of explanatory sections for you to read through which will complement the material covered in lectures, as well as series of practical coding exercises to be written and run in the notebook interface. The first lab notebook will cover some of the basics of the notebook interface.
To open a notebook, you first need to launch a Jupyter notebook server instance. From within the `mlpractical` directory containing your local copy of the repository (and with the `mlp` environment activated) run:
```
jupyter notebook
```
This will start a notebook server instance in the current terminal (with a series of status messages being streamed to the terminal output) and launch a browser window which will load the notebook application interface.
By default the notebook interface will show a list of the files in the directory the notebook server was launched from when first loaded. If you click on the `notebooks` directory in this file list, a list of files in this directory should then be displayed. Click the `01_Introduction.ipynb` entry to load the first notebook.
# Minimal set-up instructions for DICE
Below are instructions for setting up the environment without additional explanation. These are intentionally terse and if you do not understand what a particular command is doing you might be better following the more detailed instructions above which explain each step.
---
Start a new bash terminal. Download the latest 64-bit Python 3 Miniconda install script:
```
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
```
Run the install script:
```
bash Miniconda3-latest-Linux-x86_64.sh
```
Review the software license agreement and choose whether to accept. Assuming you accept, you will be asked to choose an install location for Miniconda. The default is to install in the root of your home directory `~/miniconda3`. We will assume below you have used this default. **If you use a different path you will need to adjust the paths in the commands below to suit.**
You will then be asked whether to prepend the Miniconda binaries directory to the `PATH` system environment variable definition in `.bashrc`. You should respond `no` here as we will set up the addition to `PATH` manually in the next step.
Append the Miniconda binaries directory to `PATH` manually in `~/.benv`:
```
echo "export PATH=\""\$PATH":$HOME/miniconda3/bin\"" >> ~/.benv
```
`source` the updated `~/.benv`:
```
source ~/.benv
```
Create a new `mlp` Conda environment:
```
conda create -n mlp python=3
```
Activate our created environment:
```
source activate mlp
```
Install the dependencies for the course into the new environment:
```
conda install numpy scipy matplotlib jupyter
```
Clear the package tarballs Conda just downloaded:
```
conda clean -t
```
Clone the course repository to your home directory:
```
git clone https://github.com/CSTR-Edinburgh/mlpractical.git ~/mlpractical
```
Make sure we are on the first lab branch
```
cd ~/mlpractical
git checkout mlp2017-8/lab1
```
Install the `mlp` package in the environment in develop mode
```
python ~/mlpractical/setup.py develop
```
Add an `MLP_DATA_DIR` variable to the environment
```
cd ~/miniconda3/envs/mlp
mkdir -p ./etc/conda/activate.d
mkdir -p ./etc/conda/deactivate.d
echo -e '#!/bin/sh\n' >> ./etc/conda/activate.d/env_vars.sh
echo "export MLP_DATA_DIR=$HOME/mlpractical/data" >> ./etc/conda/activate.d/env_vars.sh
echo -e '#!/bin/sh\n' >> ./etc/conda/deactivate.d/env_vars.sh
echo 'unset MLP_DATA_DIR' >> ./etc/conda/deactivate.d/env_vars.sh
export MLP_DATA_DIR=$HOME/mlpractical/data
```
Environment is now set up. Load the notebook server from `mlpractical` directory
```
cd ~/mlpractical
jupyter notebook
```
and then open the first lab notebook from the `notebooks` directory.
---
<b id="f1">[1]</b> The `echo` command causes the following text to be streamed to an output (standard terminal output by default). Here we use the append redirection operator `>>` to redirect the `echo` output to a file `~/.benv`, with it being appended to the end of the current file. The text actually added is `export PATH="$PATH:[your-home-directory]/miniconda/bin"` with the `\"` being used to escape the quote characters. The `export` command defines system-wide environment variables (more rigorously those inherited by child shells) with `PATH` being the environment variable defining where `bash` searches for executables as a colon-seperated list of directories. Here we add the Miniconda binary directory to the end of the current `PATH` definition. [](#a1)

View File

@ -0,0 +1,55 @@
# Getting started in a lab on DICE computers
Once your [environment is set up](environment-set-up.md), at the beginning of each lab you should be able to follow the steps below to get the lab notebook for that session running.
Open a terminal window (`Applications > Terminal`).
We first need to activate our `mlp` Conda environment:
```
source activate mlp
```
We now need to fetch any new code for the lab from the Github repository and create a new branch for this lab's work. First change into the `mlpractical` repository directory (if you cloned the repository to a different directory than the default you will need to adjust the command below accordingly):
```
cd ~/mlpractical
```
If you have not yet committed the changes you made to the current branch in the previous lab you should do so now. You can check if you have changes not yet committed by running `git status`. If there are files with changes to be committed (they will appear in red) you should first add them to the staging area using
```
git add path/to/file1 path/to/file2
```
then commit them with a descriptive commit message using
```
git commit -m "Description of changes e.g. Exercises for first lab notebook."
```
We are now ready to fetch any updated code from the remote repository on Github. This can be done by running
```
git fetch origin
```
This should display a message indicating a new branch has been found and fetched, named `origin/mlp2017-8/lab[n]` where `[n]` is the relevant lab number e.g. `origin/mlp2017-8/lab2` for the second lab.
We now need to create and checkout a new local branch from the remote branch fetched above. This can be done by running
```
git checkout -b lab[n] origin/mlp2017-8/lab[n]
```
where again `lab[n]` corresponds to the relevant lab number fetched above e.g. `lab2`. This command creates a new local branch named `lab[n]` from the fetched branch on the remote repository `origin/mlp2017-8/lab[n]`.
Inside the `notebooks` directory there should now be a new notebook for today's lab. The notebook for the previous lab will now also have proposed solutions filled in.
To get started with the new notebook from the `~/mlpractical` directory start up a Jupyter notebook server
```
jupyter notebook
```
then open the new notebook from the dashboard.

29
notes/quota-issue.md Normal file
View File

@ -0,0 +1,29 @@
# Exceeded quota problems on DICE
Apologies to those who may have issues with having insufficient quota space on DICE in the labs on Monday (25th September).
This was caused by the [dynamic AFS quota system](http://computing.help.inf.ed.ac.uk/dynamic-afs-quotas) which only initially allocates users a subset of their maximum quota and then checks hourly to increase this quota as needed. Unfortunately the amount of disk space needed to store the temporary files used in installing the course dependencies exceeded the current dynamic quota for some people. This meant when running the `conda install ...` command it exited with a quota exceeded error.
Those who experienced that issue should now have sufficient quota space available. From any DICE computer, if you run in a terminal
```
source activate mlp
conda remove -y numpy scipy matplotlib jupyter
conda install -y numpy scipy matplotlib jupyter
conda clean -t -y
```
this should clean out the old partially installed packages and reinstall them from scratch which should now run to completion without a quota exceeded error.
Your homespace can be accessed from any Informatics computer running DICE (e.g. any of the computers in the [Forrest Hill labs](http://web.inf.ed.ac.uk/infweb/student-services/ito/students/year2/student-support/facilities/computer-labs) which are open-access outside of booked lab sessions or for those who know how to use SSH you can [log in remotely](http://computing.help.inf.ed.ac.uk/external-login)). You can therefore finish your environment set up prior to the next lab if you want though it is also fine to wait till the beginning of the next lab (it will take around 5 minutes to complete the installation).
At this point assuming you ran through the rest of the instructions to clone the Git repository to your homespace and install the `mlp` package (i.e. the instructions from [here](https://github.com/CSTR-Edinburgh/mlpractical/blob/mlp2016-7/lab1/environment-set-up.md#getting-the-course-code-and-a-short-introduction-to-git) on-wards), you should have a fully working environment.
Once your environment is set up in all future labs you will only need to activate it to get started. So at the beginning of each subsequent lab we will ask you to do something like the following
```
source activate mlp # Activate the mlp environment
cd ~/mlpractical # Change the current directory to mlpractical repository
git checkout mlp2017-8/lab[...] # Checkout the branch for this week's lab
jupyter notebook # Launch the notebook server
```

View File

@ -0,0 +1,84 @@
# Running Jupyter notebooks over SSH
Below is a guide for how to start a Jupyter notebook server remotely on one of the shared-use `student.compute` servers and to connect to it on a local machine by port-forwarding over SSH. It is assumed you already have a SSH client set up on the machine you are connecting from and that you are familiar with how to use SSH. These instructions have been written for use with a SSH client running within a terminal session - although it may be possible to replicate the relevant commands within a GUI based SSH client, you will need to figure out how to do this yourself. They were written and tested on Ubuntu 14.04 and no attempt has been made to test them on other operating systems.
## Securing your notebook server
Before running a Jupyter notebook server instance on one of the shared compute servers you **must** make sure you have secured your server by configuring it to use a password and to communicate that password between the browser client and server by secure HTTP. This can be done by running the `secure-notebook-server.sh` bash script provided in the `scripts` directory of the `mlpractical` repository. You can either do this when logged on to DICE in one of the labs or after connecting to DICE remotely over SSH as described below.
To run the script, in a DICE terminal enter the `mlpractical` repository directory and run
```
bash scripts/secure-notebook-server.sh
```
As this script creates a self-signed certificate to set up the secure HTTP encrypted communication between the browser and server, you will be shown a security warning when you load up the URL the notebooks are being served on.
If you want to manually secure the notebook server yourself or to create a certificate which will stop the security warnings appearing you can refer to the [relevant official Jupyter documentation page](http://jupyter-notebook.readthedocs.io/en/latest/public_server.html).
## Connecting to a remote `student.compute` server over SSH
To start an SSH session, open a terminal window and run
```
ssh [dice-username]@student.ssh.inf.ed.ac.uk
```
If this is the first time you have logged on to the SSH gateway server from this computer you will be asked to confirm you wish to connect and an ECDSA key fingerprint will be printed. You can check this against the reference values on the [school help pages](http://computing.help.inf.ed.ac.uk/external-login).
You will then be asked to enter your password. This is the same password you usually use to log on to DICE.
Assuming you enter the correct password, you will at this point be logged in to the SSH *gateway server*. As the message printed when you log in points out this is intended only for accessing the Informatics network externally and you should **not** attempt to work on this server. You should log in to one of the `student.compute` shared-use servers by running
```
ssh student.compute
```
You should now be logged on to one of the shared-use compute servers. The name of the server you are logged on to will appear at the bash prompt e.g.
```
ashbury:~$
```
You will need to know the name of the remote server you are using later on.
## Starting a notebook server on the remote computer
You should now activate your `mlp` Conda environment by running
```
source activate mlp
```
Now move in to the `mlpractical` local repository directory e.g. by running
```
cd ~/mlpractical
```
if you chose the default of putting the repository in your home directory.
We will now launch a notebook server on the remote compute-server. There are two key differences in the command we use to do this compared to how we usually start up a server on a local machine. First as the server will be running remotely you should set the `--no-browser` option as this will prevent the remote server attempting open a browser to connect to the notebook server.
Secondly we will prefix the command with `nice`. `nice` is a shell command which alters the scheduling priority of the process it is used to start. Its important to use `nice` when running on the shared `student.compute` servers to make sure they remain usable by all of the students who need to run jobs on them. You can set a priority level between 10 (highest priority) and 19 (lowest priority) using the `-n` argument. Running the command below will start up a notebook server at the lowest priority level.
```
nice -n 19 jupyter notebook --no-browser
```
Once the notebook server starts running you should take note of the port it is being served on as indicated in the `The Jupyter Notebook is running at: https://localhost:[port]/` message.
## Forwarding a connection to the notebook server over SSH
Now that the notebook server is running on the remote server you need to connect to it on your local machine. We will do this by forwarding the port the notebook server is being run on over SSH to you local machine. As all external connections from outside the `inf.ed.ac.uk` domain have to go via the SSH gateway server we need to go via this gateway server.
In a **new terminal window / tab** run the command below with the `[...]` placeholders substituted with the appropriate values to securely forward the specified port on the remote server to your local machine and bind it to a local port. You should choose `[remote-port]` to be the port the notebook server is running on on the remote server, `[local-port]` to be a currently unused port on your local machine, and `[remote-server-name]` to be the host name of the remote server the notebook server is being run on.
```
ssh -N -o ProxyCommand="ssh -q [dice-username]@student.ssh.inf.ed.ac.uk nc [remote-server-name] 22" \
-L [local-port]:localhost:[remote-port] [dice-username]@[remote-server-name]
```
You will be asked to enter your (DICE) password twice, once to log on to the gateway server and a second time to log on to the remote compute server.
Assuming you enter your password both times correctly, the remote port will now be getting forwarded to the specified local port on your computer. If you now open up a browser on your computer and go to `https://localhost:[local-port]` you should (potentially after seeing a security warning about the self-signed certificate) now be asked to enter the notebook server password you specified earlier. Once you enter this password you should be able to access the notebook dashboard and open and edit notebooks as you usually do in laboratories.
When you are finished working you should both close down the notebook server by entering `Ctrl+C` twice in the terminal window the SSH session you used to start up the notebook server is running and halt the port forwarding command by entering `Ctrl+C` in the terminal it is running in.

View File

@ -0,0 +1,73 @@
#!/bin/bash
# Configure a Jupyter notebook server to use password authentication and a
# self-signed HTTPS certificate so it can be exposed safely on a shared machine.
#
# Usage: bash secure-notebook-server.sh [jupyter-dir] [openssl-config-path]
#   jupyter-dir          Jupyter configuration directory (default: ~/.jupyter)
#   openssl-config-path  OpenSSL config file (default: $CONDA_PREFIX/ssl/openssl.cnf)

# Make sure a Conda environment is active as later steps assume it is.
[ -z "$CONDA_PREFIX" ] && echo "Need to have Conda environment activated." && exit 1

if [ "$#" -gt 2 ]; then
    echo "Usage: bash secure-notebook-server.sh [jupyter-path] [open-ssl-config-path]"
    exit 1
fi

# If specified, read the Jupyter directory from the first argument.
JUPYTER_DIR=${1:-"$HOME/.jupyter"}

# If specified, read the OpenSSL config file path from the second argument.
# This is needed due to a bug in how Conda handles the config path.
export OPENSSL_CONF=${2:-"$CONDA_PREFIX/ssl/openssl.cnf"}

SEPARATOR="=================================================================\n"
CONFIG_FILE="$JUPYTER_DIR/jupyter_notebook_config.py"

# Create a default config file if one does not already exist.
if [ ! -f "$CONFIG_FILE" ]; then
    echo "No existing notebook configuration file found, creating new one ..."
    printf "$SEPARATOR"
    jupyter notebook --generate-config
    printf "$SEPARATOR"
    echo "... notebook configuration file created."
fi

# Get the user to enter the notebook server password and hash it.
echo "Getting notebook server password hash. Enter password when prompted ..."
printf "$SEPARATOR"
HASH=$(python -c "from notebook.auth import passwd; print(passwd());")
printf "$SEPARATOR"
echo "... got password hash."

# Generate a self-signed OpenSSL certificate and key file.
echo "Creating certificate file ..."
printf "$SEPARATOR"
CERT_INFO="/C=UK/ST=Scotland/L=Edinburgh/O=University of Edinburgh/OU=School of Informatics/CN=$USER/emailAddress=$USER@sms.ed.ac.uk"
# NOTE: a 2048-bit RSA key is generated here rather than the 1024 bits
# previously hard-coded, as 1024-bit RSA is considered too weak for HTTPS.
openssl req \
    -x509 -nodes -days 365 \
    -subj "$CERT_INFO" \
    -newkey rsa:2048 -keyout "$JUPYTER_DIR/key.key" \
    -out "$JUPYTER_DIR/cert.pem"
printf "$SEPARATOR"
echo "... certificate created."

# Restrict the private key file to owner read/write only.
chmod 600 "$JUPYTER_DIR/key.key"

# Add the password hash and certificate + key file paths to the config file.
# For each setting: if the config file already mentions it, rewrite the line
# in place with sed; otherwise append a new setting line.
# (Bug fix: the original script appended the literal strings DST_PSW /
# DST_CRT / DST_KEY instead of expanding the variables' values.)
echo "Setting up configuration file..."
printf "$SEPARATOR"

echo " adding password hash"
SRC_PSW="^#\?c\.NotebookApp\.password[ ]*=[ ]*u['"'"'"]\(sha1:[a-fA-F0-9]\+\)\?['"'"'"]"
DST_PSW="c.NotebookApp.password = u'$HASH'"
if ! grep -q "c.NotebookApp.password" "$CONFIG_FILE"; then
    echo "$DST_PSW" >> "$CONFIG_FILE"
else
    sed -i "s/$SRC_PSW/$DST_PSW/" "$CONFIG_FILE"
fi

echo " adding certificate file path"
SRC_CRT="^#\?c\.NotebookApp\.certfile[ ]*=[ ]*u['"'"'"]\([^'"'"'"]+\)\?['"'"'"]"
DST_CRT="c.NotebookApp.certfile = u'$JUPYTER_DIR/cert.pem'"
if ! grep -q "c.NotebookApp.certfile" "$CONFIG_FILE"; then
    echo "$DST_CRT" >> "$CONFIG_FILE"
else
    sed -i "s|$SRC_CRT|$DST_CRT|" "$CONFIG_FILE"
fi

echo " adding key file path"
SRC_KEY="^#\?c\.NotebookApp\.keyfile[ ]*=[ ]*u['"'"'"]\([^'"'"'"]+\)\?['"'"'"]"
DST_KEY="c.NotebookApp.keyfile = u'$JUPYTER_DIR/key.key'"
if ! grep -q "c.NotebookApp.keyfile" "$CONFIG_FILE"; then
    echo "$DST_KEY" >> "$CONFIG_FILE"
else
    sed -i "s|$SRC_KEY|$DST_KEY|" "$CONFIG_FILE"
fi

printf "$SEPARATOR"
echo "... finished setting up configuration file."

13
setup.py Normal file
View File

@ -0,0 +1,13 @@
""" Setup script for mlp package. """
from setuptools import setup
setup(
name = "mlp",
author = "Pawel Swietojanski, Steve Renals, Matt Graham and Antreas Antoniou",
description = ("Neural network framework for University of Edinburgh "
"School of Informatics Machine Learning Practical course."),
url = "https://github.com/CSTR-Edinburgh/mlpractical",
packages=['mlp']
)