Update
parent be1f124dff, commit 45a2df1b11
BIN  report/figures/VGG38_BN_RC_acc.pdf (new file; binary file not shown)
BIN  report/figures/VGG38_BN_RC_loss.pdf (new file; binary file not shown)
BIN  report/figures/accuracy_plot.pdf (new file; binary file not shown)
BIN  report/figures/grad_flow_vgg08.pdf (new file; binary file not shown)
BIN  report/figures/gradplot_38.pdf (new file; binary file not shown)
BIN  report/figures/gradplot_38_watermarked.pdf (new file; binary file not shown)
BIN  report/figures/gradplot_38bnrc.pdf (new file; binary file not shown)
BIN  report/figures/loss_plot.pdf (new file; binary file not shown)
report/mlp-cw1-questions.tex (deleted)
@@ -1,231 +0,0 @@
%% REPLACE SXXXXXX with your student number
|
|
||||||
\def\studentNumber{SXXXXXX}
|
|
||||||
|
|
||||||
|
|
||||||
%% START of YOUR ANSWERS
|
|
||||||
%% Add answers to the questions below, by replacing the text inside the brackets {} for \youranswer{ "Text to be replaced with your answer." }.
|
|
||||||
%
|
|
||||||
% Do not delete the commands for adding figures and tables. Instead fill in the missing values with your experiment results, and replace the images with your own respective figures.
|
|
||||||
%
|
|
||||||
% You can generally delete the placeholder text, such as for example the text "Question Figure 2 - Replace the images ..."
|
|
||||||
%
|
|
||||||
% There are 18 TEXT QUESTIONS (a few of the short first ones have their answers added to both the Introduction and the Abstract). Replace the text inside the brackets of the command \youranswer with your answer to the question.
|
|
||||||
%
|
|
||||||
% There are also 3 "questions" to replace some placeholder FIGURES with your own, and 3 "questions" asking you to fill in the missing entries in the TABLES provided.
|
|
||||||
%
|
|
||||||
% NOTE! that questions are ordered by the order of appearance of their answers in the text, and not by the order you should tackle them. Specifically, you cannot answer Questions 2, 3, and 4 before concluding all of the relevant experiments and analysis. Similarly, you should fill in the TABLES and FIGURES before discussing the results presented there.
|
|
||||||
%
|
|
||||||
% NOTE! If for some reason you do not manage to produce results for some FIGURES and TABLES, then you can get partial marks by discussing your expectations of the results in the relevant TEXT QUESTIONS (for example Question 8 makes use of Table 1 and Figure 2).
|
|
||||||
%
|
|
||||||
% Please refer to the coursework specification for more details.
|
|
||||||
|
|
||||||
|
|
||||||
%% - - - - - - - - - - - - TEXT QUESTIONS - - - - - - - - - - - -
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
%% Question 1:
|
|
||||||
\newcommand{\questionOne} {
|
|
||||||
\youranswer{Question 1 - Explain what these figures contain and how the curves evolve, and spot where overfitting occurs. Reason based on the min/max points and velocities (direction and magnitude of change) of the accuracy and error curves}
|
|
||||||
}
|
|
||||||
|
|
||||||
%% Question 2:
|
|
||||||
\newcommand{\questionTwo} {
|
|
||||||
\youranswer{Question 2 - Present your network width experiment results by using the relevant figure and table}
|
|
||||||
}
|
|
||||||
|
|
||||||
%% Question 3:
|
|
||||||
\newcommand{\questionThree} {
|
|
||||||
\youranswer{Question 3 - Discuss whether varying width affects the results in a consistent way, and whether the results are expected and match well with the prior knowledge (by which we mean your expectations as are formed from the relevant Theory and literature)}
|
|
||||||
}
|
|
||||||
|
|
||||||
%% Question 4:
|
|
||||||
\newcommand{\questionFour} {
|
|
||||||
\youranswer{Question 4 - Present your network depth experiment results by using the relevant figure and table}
|
|
||||||
}
|
|
||||||
|
|
||||||
%% Question 5:
|
|
||||||
\newcommand{\questionFive} {
|
|
||||||
\youranswer{Question 5 - Discuss whether varying depth affects the results in a consistent way, and whether the results are expected and match well with the prior knowledge (by which we mean your expectations as are formed from the relevant Theory and literature)}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
%% Question 6:
|
|
||||||
\newcommand{\questionSix} {
|
|
||||||
\youranswer{Question 6 - Explain the experimental details (e.g. hyperparameters), discuss the results in terms of their generalisation performance and overfitting. Select and test the best performing model as part of this analysis.}
|
|
||||||
}
|
|
||||||
|
|
||||||
%% Question 7:
|
|
||||||
\newcommand{\questionSeven} {
|
|
||||||
\youranswer{Question 7 - Assume you were able to run 8 further training instances (8 specific hyperparameter configurations) where you could combine Dropout and L1, and/or Dropout and L2 regularisation. Which 8 runs would you pick and what question(s) would you aim to answer? Make sure you define the experiment setup, including any relevant hyperparameters}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
%% Question 8:
|
|
||||||
\newcommand{\questionEight} {
|
|
||||||
\youranswer{Question 8 - Briefly draw your conclusions based on the results from the previous sections (what are the take-away messages?), discussing them in the context of the overall literature, and conclude your report with a recommendation for future directions}
|
|
||||||
}
|
|
||||||
|
|
||||||
%% - - - - - - - - - - - - FIGURES - - - - - - - - - - - -
|
|
||||||
|
|
||||||
%% Question Figure 2:
|
|
||||||
\newcommand{\questionFigureTwo} {
|
|
||||||
\youranswer{Question Figure 2 - Replace the images in Figure 2 with figures depicting the accuracy and error, training and validation curves for your experiments varying the number of hidden units.
|
|
||||||
%
|
|
||||||
\begin{figure}[t]
|
|
||||||
\centering
|
|
||||||
\begin{subfigure}{\linewidth}
|
|
||||||
\includegraphics[width=\linewidth]{figures/empty_acc_curve_width.png}
|
|
||||||
\caption{accuracy by epoch}
|
|
||||||
\label{fig:width_acccurves}
|
|
||||||
\end{subfigure}
|
|
||||||
\begin{subfigure}{\linewidth}
|
|
||||||
\centering
|
|
||||||
\includegraphics[width=\linewidth]{figures/empty_error_curve_width.png}
|
|
||||||
\caption{error by epoch}
|
|
||||||
\label{fig:width_errorcurves}
|
|
||||||
\end{subfigure}
|
|
||||||
\caption{Training and validation curves in terms of classification accuracy (a) and cross-entropy error (b) on the EMNIST dataset for different network widths.}
|
|
||||||
\label{fig:width}
|
|
||||||
\end{figure}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
%% Question Figure 3:
|
|
||||||
\newcommand{\questionFigureThree} {
|
|
||||||
\youranswer{Question Figure 3 - Replace these images with figures depicting the accuracy and error, training and validation curves for your experiments varying the number of hidden layers.
|
|
||||||
%
|
|
||||||
\begin{figure}[t]
|
|
||||||
\centering
|
|
||||||
\begin{subfigure}{\linewidth}
|
|
||||||
\includegraphics[width=\linewidth]{figures/empty_acc_curve_depth.png}
|
|
||||||
\caption{accuracy by epoch}
|
|
||||||
\label{fig:depth_acccurves}
|
|
||||||
\end{subfigure}
|
|
||||||
\begin{subfigure}{\linewidth}
|
|
||||||
\centering
|
|
||||||
\includegraphics[width=\linewidth]{figures/empty_error_curve_depth.png}
|
|
||||||
\caption{error by epoch}
|
|
||||||
\label{fig:depth_errorcurves}
|
|
||||||
\end{subfigure}
|
|
||||||
\caption{Training and validation curves in terms of classification accuracy (a) and cross-entropy error (b) on the EMNIST dataset for different network depths.}
|
|
||||||
\label{fig:depth}
|
|
||||||
\end{figure}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
%% Question Figure 4:
|
|
||||||
\newcommand{\questionFigureFour} {
|
|
||||||
\youranswer{Question Figure 4 - Replace these images with figures depicting the Validation Accuracy and Generalisation Gap (difference between validation and training error) for each of the experiment results varying the Dropout inclusion rate, and L1/L2 weight penalty depicted in Table 3 (including any results you have filled in).
|
|
||||||
%
|
|
||||||
\begin{figure*}[t]
|
|
||||||
\centering
|
|
||||||
\begin{subfigure}{.475\linewidth}
|
|
||||||
\includegraphics[width=\linewidth]{figures/empty_dropout_plot.png}
|
|
||||||
\caption{Accuracy and error by inclusion probability.}
|
|
||||||
\label{fig:dropoutrates}
|
|
||||||
\end{subfigure}
|
|
||||||
\begin{subfigure}{.475\linewidth}
|
|
||||||
\centering
|
|
||||||
\includegraphics[width=\linewidth]{figures/empty_wd_plot.png}
|
|
||||||
\caption{Accuracy and error by weight penalty.}
|
|
||||||
\label{fig:weightrates}
|
|
||||||
\end{subfigure}
|
|
||||||
\caption{Accuracy and error by regularisation strength of each method (Dropout and L1/L2 Regularisation).}
|
|
||||||
\label{fig:hp_search}
|
|
||||||
\end{figure*}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
%% - - - - - - - - - - - - TABLES - - - - - - - - - - - -
|
|
||||||
|
|
||||||
%% Question Table 1:
|
|
||||||
\newcommand{\questionTableOne} {
|
|
||||||
\youranswer{
|
|
||||||
Question Table 1 - Fill in Table 1 with the results from your experiments varying the number of hidden units.
|
|
||||||
%
|
|
||||||
\begin{table}[t]
|
|
||||||
\centering
|
|
||||||
\begin{tabular}{c|ccc}
|
|
||||||
\toprule
|
|
||||||
\# Hidden Units & Val. Acc. & Train Error & Val. Error\\
|
|
||||||
\midrule
|
|
||||||
32 & --.- & -.--- & -.--- \\
|
|
||||||
64 & --.- & -.--- & -.--- \\
|
|
||||||
128 & --.- & -.--- & -.--- \\
|
|
||||||
\bottomrule
|
|
||||||
\end{tabular}
|
|
||||||
\caption{Validation accuracy (\%) and training/validation error (in terms of cross-entropy error) for varying network widths on the EMNIST dataset.}
|
|
||||||
\label{tab:width_exp}
|
|
||||||
\end{table}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
%% Question Table 2:
|
|
||||||
\newcommand{\questionTableTwo} {
|
|
||||||
\youranswer{
|
|
||||||
Question Table 2 - Fill in Table 2 with the results from your experiments varying the number of hidden layers.
|
|
||||||
%
|
|
||||||
\begin{table}[t]
|
|
||||||
\centering
|
|
||||||
\begin{tabular}{c|ccc}
|
|
||||||
\toprule
|
|
||||||
\# Hidden Layers & Val. Acc. & Train Error & Val. Error \\
|
|
||||||
\midrule
|
|
||||||
1 & --.- & -.--- & -.--- \\
|
|
||||||
2 & --.- & -.--- & -.--- \\
|
|
||||||
3 & --.- & -.--- & -.--- \\
|
|
||||||
\bottomrule
|
|
||||||
\end{tabular}
|
|
||||||
\caption{Validation accuracy (\%) and training/validation error (in terms of cross-entropy error) for varying network depths on the EMNIST dataset.}
|
|
||||||
\label{tab:depth_exps}
|
|
||||||
\end{table}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
%% Question Table 3:
|
|
||||||
\newcommand{\questionTableThree} {
|
|
||||||
\youranswer{
|
|
||||||
Question Table 3 - Fill in Table 3 with the results from your experiments for the missing hyperparameter values for each of L1 regularisation, L2 regularisation, Dropout and label smoothing (use the values shown on the table).
|
|
||||||
%
|
|
||||||
\begin{table*}[t]
|
|
||||||
\centering
|
|
||||||
\begin{tabular}{c|c|ccc}
|
|
||||||
\toprule
|
|
||||||
Model & Hyperparameter value(s) & Validation accuracy & Train Error & Validation Error \\
|
|
||||||
\midrule
|
|
||||||
\midrule
|
|
||||||
Baseline & - & 83.7 & 0.241 & 0.533 \\
|
|
||||||
\midrule
|
|
||||||
\multirow{4}*{Dropout}
|
|
||||||
& 0.6 & 80.7 & 0.549 & 0.593 \\
|
|
||||||
& 0.7 & --.- & -.--- & -.--- \\
|
|
||||||
& 0.85 & 85.1 & 0.329 & 0.434 \\
|
|
||||||
& 0.97 & 85.4 & 0.244 & 0.457 \\
|
|
||||||
\midrule
|
|
||||||
\multirow{4}*{L1 penalty}
|
|
||||||
& 5e-4 & 79.5 & 0.642 & 0.658 \\
|
|
||||||
& 1e-3 & --.- & -.--- & -.--- \\
|
|
||||||
& 5e-3 & 2.41 & 3.850 & 3.850 \\
|
|
||||||
& 5e-2 & 2.20 & 3.850 & 3.850 \\
|
|
||||||
\midrule
|
|
||||||
\multirow{4}*{L2 penalty}
|
|
||||||
& 5e-4 & 85.1 & 0.306 & 0.460 \\
|
|
||||||
& 1e-3 & --.- & -.--- & -.--- \\
|
|
||||||
& 5e-3 & 81.3 & 0.586 & 0.607 \\
|
|
||||||
& 5e-2 & 39.2 & 2.258 & 2.256 \\
|
|
||||||
\midrule
|
|
||||||
Label smoothing & 0.1 & --.- & -.--- & -.--- \\
|
|
||||||
\bottomrule
|
|
||||||
\end{tabular}
|
|
||||||
\caption{Results of all hyperparameter search experiments. \emph{italics} indicate the best results per series (Dropout, L1 Regularisation, L2 Regularisation, Label smoothing) and \textbf{bold} indicates the best overall.}
|
|
||||||
\label{tab:hp_search}
|
|
||||||
\end{table*}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
%% END of YOUR ANSWERS
|
|
report/mlp-cw1-template.tex (deleted)
@@ -1,238 +0,0 @@
%% Template for MLP Coursework 1 / 14 October 2024
|
|
||||||
|
|
||||||
%% Based on LaTeX template for ICML 2017 - example_paper.tex at
|
|
||||||
%% https://2017.icml.cc/Conferences/2017/StyleAuthorInstructions
|
|
||||||
|
|
||||||
\documentclass{article}
|
|
||||||
\input{mlp2022_includes}
|
|
||||||
|
|
||||||
|
|
||||||
\definecolor{red}{rgb}{0.95,0.4,0.4}
|
|
||||||
\definecolor{blue}{rgb}{0.4,0.4,0.95}
|
|
||||||
\definecolor{orange}{rgb}{1, 0.65, 0}
|
|
||||||
|
|
||||||
\newcommand{\youranswer}[1]{{\color{red} \bf[#1]}} %your answer:
|
|
||||||
|
|
||||||
|
|
||||||
%% START of YOUR ANSWERS
|
|
||||||
\input{mlp-cw1-questions}
|
|
||||||
%% END of YOUR ANSWERS
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
%% Do not change anything in this file. Add your answers to mlp-cw1-questions.tex
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
\begin{document}
|
|
||||||
|
|
||||||
\twocolumn[
|
|
||||||
\mlptitle{MLP Coursework 1}
|
|
||||||
\centerline{\studentNumber}
|
|
||||||
\vskip 7mm
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
\begin{abstract}
|
|
||||||
In this report we study the problem of overfitting, which is the training regime where performance increases on the training set but decreases on validation data. Overfitting prevents our trained model from generalizing well to unseen data.
|
|
||||||
We first analyse the given example and discuss the probable causes of the underlying problem.
|
|
||||||
Then we investigate how the depth and width of a neural network can affect overfitting in a feedforward architecture and observe that increasing width and depth tend to enable further overfitting.
|
|
||||||
Next we discuss how two standard methods, Dropout and Weight Penalty, can mitigate overfitting, describe their implementation, and use them in our experiments to reduce overfitting on the EMNIST dataset.
|
|
||||||
Based on our results, we ultimately find that both dropout and weight penalty are able to mitigate overfitting.
|
|
||||||
Finally, we conclude the report with our observations and related work.
|
|
||||||
Our main findings indicate that preventing overfitting is achievable through regularization, although combining different methods together is not straightforward.
|
|
||||||
\end{abstract}
|
|
||||||
|
|
||||||
|
|
||||||
\section{Introduction}
|
|
||||||
\label{sec:intro}
|
|
||||||
In this report we focus on a common and important problem while training machine learning models known as overfitting, or overtraining, which is the training regime where performance increases on the training set but decreases on unseen data.
|
|
||||||
We first start with analysing the given problem in Figure~\ref{fig:example}, study it in different architectures and then investigate different strategies to mitigate the problem.
|
|
||||||
In particular, Section~\ref{sec:task1} identifies and discusses the given problem, and investigates the effect of network width and depth in terms of generalization gap (see Ch.~5 in \citealt{Goodfellow-et-al-2016}) and generalization performance.
|
|
||||||
Section \ref{sec:task2.1} introduces two regularization techniques to alleviate overfitting: Dropout \cite{srivastava2014dropout} and L1/L2 Weight Penalties (see Section~7.1 in \citealt{Goodfellow-et-al-2016}).
|
|
||||||
We first explain them in detail and discuss why they are used for alleviating overfitting.
|
|
||||||
In Section~\ref{sec:task2.2}, we incorporate each of them and their various combinations to a three hidden layer\footnote{We denote all layers as hidden except the final (output) one. This means that depth of a network is equal to the number of its hidden layers + 1.} neural network, train it on the EMNIST dataset, which contains 131,600 images of characters and digits, each of size 28x28, from 47 classes.
|
|
||||||
We evaluate them in terms of generalization gap and performance, and discuss the results and effectiveness of the tested regularization strategies.
|
|
||||||
Our results show that both dropout and weight penalty are able to mitigate overfitting.
|
|
||||||
|
|
||||||
Finally, we conclude our study in Section~\ref{sec:concl}, noting that preventing overfitting is achievable through regularization, although combining different methods together is not straightforward.
|
|
||||||
|
|
||||||
|
|
||||||
\section{Problem identification}
|
|
||||||
\label{sec:task1}
|
|
||||||
|
|
||||||
\begin{figure}[t]
|
|
||||||
\centering
|
|
||||||
\begin{subfigure}{\linewidth}
|
|
||||||
\includegraphics[width=\linewidth]{figures/fig1_acc.png}
|
|
||||||
\caption{accuracy by epoch}
|
|
||||||
\label{fig:example_acccurves}
|
|
||||||
\end{subfigure}
|
|
||||||
\begin{subfigure}{\linewidth}
|
|
||||||
\centering
|
|
||||||
\includegraphics[width=\linewidth]{figures/fig1_err.png}
|
|
||||||
\caption{error by epoch}
|
|
||||||
\label{fig:example_errorcurves}
|
|
||||||
\end{subfigure}
|
|
||||||
\caption{Training and validation curves in terms of classification accuracy (a) and cross-entropy error (b) on the EMNIST dataset for the baseline model.}
|
|
||||||
\label{fig:example}
|
|
||||||
\end{figure}
|
|
||||||
|
|
||||||
Overfitting to training data is a very common and important issue that needs to be dealt with when training neural networks or other machine learning models in general (see Ch.~5 in \citealt{Goodfellow-et-al-2016}).
|
|
||||||
A model is said to be overfitting when, as training progresses, its performance on the training data keeps improving while its performance on validation data degrades.
|
|
||||||
Effectively, the model stops learning patterns that are relevant to the task and instead starts to memorize specificities of individual training samples that do not transfer to new samples.
|
|
||||||
Overfitting leads to poor generalization performance on unseen data, as performance on validation data is indicative of performance on test data and (to an extent) during deployment.
|
|
||||||
|
|
||||||
Although it eventually happens in all gradient-based training, overfitting is most often caused by models that are too large with respect to the amount and diversity of the training data. The more free parameters the model has, the easier it is for the model to memorize complex patterns that only apply to a small number of samples.
|
|
||||||
A prominent symptom of overfitting is the generalization gap, defined as the difference between the validation and training error.
|
|
||||||
A steady increase in this quantity is usually interpreted as the model entering the overfitting regime.
|
|
||||||
|
|
||||||
|
|
||||||
Figure~\ref{fig:example_acccurves} and \ref{fig:example_errorcurves} show a prototypical example of overfitting.
|
|
||||||
We see in Figure~\ref{fig:example_acccurves} that \questionOne.
|
|
||||||
|
|
||||||
The extent to which our model overfits depends on many factors, for example the quality and quantity of the training set and the complexity of the model.
|
|
||||||
If we have sufficiently many diverse training samples, or if our model contains few hidden units, it will in general be less prone to overfitting.
|
|
||||||
Any form of regularization will also limit the extent to which the model overfits.
|
|
||||||
|
|
||||||
|
|
||||||
\subsection{Network width}
|
|
||||||
|
|
||||||
\questionTableOne
|
|
||||||
\questionFigureTwo
|
|
||||||
|
|
||||||
First we investigate the effect of increasing the number of hidden units in a single hidden layer network when training on the EMNIST dataset.
|
|
||||||
The network is trained using the Adam optimizer
|
|
||||||
with a learning rate of $9 \times 10^{-4}$ and a batch size of 100, for a total of 100 epochs.
|
|
||||||
|
|
||||||
The input layer is of size 784, and output layer consists of 47 units.
|
|
||||||
Three different models were trained, with a single hidden layer of 32, 64 and 128 ReLU hidden units respectively.
|
|
||||||
Figure~\ref{fig:width} depicts the error and accuracy curves over 100 epochs for the models with varying numbers of hidden units.
|
|
||||||
Table~\ref{tab:width_exp} reports the final accuracy, training error, and validation error.
|
|
||||||
We observe that \questionTwo.
|
|
||||||
|
|
||||||
\questionThree.
|
|
||||||
|
|
||||||
|
|
||||||
\subsection{Network depth}
|
|
||||||
|
|
||||||
\questionTableTwo
|
|
||||||
\questionFigureThree
|
|
||||||
|
|
||||||
Next we investigate the effect of varying the number of hidden layers in the network.
|
|
||||||
Table~\ref{tab:depth_exps} and Figure~\ref{fig:depth} depict results from training three models with one, two and three hidden layers respectively, each with 128 ReLU hidden units.
|
|
||||||
As with previous experiments, they are trained with the Adam optimizer with a learning rate of $9 \times 10^{-4}$ and a batch size of 100.
|
|
||||||
|
|
||||||
We observe that \questionFour.
|
|
||||||
|
|
||||||
\questionFive.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
\section{Regularization}
|
|
||||||
\label{sec:task2.1}
|
|
||||||
|
|
||||||
In this section, we investigate three regularization methods to alleviate the overfitting problem, specifically dropout layers, the L1 and L2 weight penalties and label smoothing.
|
|
||||||
|
|
||||||
|
|
||||||
\subsection{Dropout}
|
|
||||||
|
|
||||||
Dropout~\cite{srivastava2014dropout} is a stochastic method that randomly inactivates neurons in a neural network according to a hyperparameter, the inclusion rate (\textit{i.e.} the probability that a unit is kept).
|
|
||||||
Dropout is commonly represented by an additional layer inserted between the linear layer and activation function.
|
|
||||||
Its forward pass during training is defined as follows:
|
|
||||||
\begin{align}
|
|
||||||
\text{mask} &\sim \text{Bernoulli}(p)\\
\bm{y}' &= \text{mask} \odot \bm{y}
|
|
||||||
\end{align}
|
|
||||||
where $\bm{y}, \bm{y}' \in \mathbb{R}^d$ are the output of the linear layer before and after applying dropout, respectively.
|
|
||||||
$\text{mask} \in \mathbb{R}^d$ is a mask vector randomly sampled from the Bernoulli distribution with inclusion probability $p$, and $\odot$ denotes the element-wise multiplication.
|
|
||||||
|
|
||||||
At inference time, stochasticity is not desired, so no neurons are dropped.
|
|
||||||
To account for the change in expectations of the output values, we scale them down by the inclusion probability $p$:
|
|
||||||
\begin{align}
|
|
||||||
\bm{y}' &= p \, \bm{y}
|
|
||||||
\end{align}
|
|
||||||
|
|
||||||
As there is no nonlinear calculation involved, the backward propagation is just the element-wise product of the gradients with respect to the layer outputs and the mask created in the forward pass.
|
|
||||||
The backward propagation for dropout is therefore formulated as follows:
|
|
||||||
\begin{align}
|
|
||||||
\frac{\partial \bm{y}'}{\partial \bm{y}} = \text{mask}
|
|
||||||
\end{align}
|
|
||||||
|
|
||||||
Dropout is an easy-to-implement and highly scalable method.
It can be implemented as a self-contained layer and placed after any layer of the neural network.
|
|
||||||
Dropout reduces the co-dependence of hidden units across layers, so that the neurons of the next layer do not rely on only a few features from the previous layer.
Instead, it forces the network to extract diverse features and to distribute information more evenly among them.
|
|
||||||
By randomly dropping neurons during training, dropout uses a different subset of the whole architecture at each step, so it can also be viewed as bagging many sub-networks and averaging their outputs.
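For concreteness, a minimal NumPy sketch of the dropout layer described above could look as follows (an illustrative example of ours, not the coursework framework's implementation; the function names and the choice of NumPy are assumptions):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

def dropout_forward(y, p, training=True):
    # Training: keep each unit with inclusion probability p.
    if training:
        mask = rng.binomial(1, p, size=y.shape).astype(y.dtype)
        return mask * y, mask
    # Inference: keep all units and scale the outputs by p
    # to preserve their expected value.
    return p * y, None

def dropout_backward(grad_out, mask):
    # Backward pass: element-wise product of the upstream
    # gradients and the mask sampled in the forward pass.
    return grad_out * mask

# Example usage on a small batch of activations.
y = rng.standard_normal((4, 3))
out, mask = dropout_forward(y, p=0.85)
grad_wrt_y = dropout_backward(np.ones_like(out), mask)
\end{verbatim}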
|
|
||||||
|
|
||||||
|
|
||||||
\subsection{Weight penalty}
|
|
||||||
|
|
||||||
L1 and L2 regularization~\cite{ng2004feature} are simple but effective methods to mitigate overfitting to training data.
|
|
||||||
They can be formulated as adding a penalty term to the cost function, based on the L1 norm or the squared L2 norm of the weights respectively, while leaving the rest of the training procedure unchanged.
The idea is to penalize large weights and thereby explicitly constrain their magnitude.
|
|
||||||
The optimization problem takes a different form:
|
|
||||||
\begin{align}
|
|
||||||
\text{L1: } & \text{min}_{\bm{w}} \; E_{\text{data}}(\bm{X}, \bm{y}, \bm{w}) + \lambda ||w||_1\\
|
|
||||||
\text{L2: } & \text{min}_{\bm{w}} \; E_{\text{data}}(\bm{X}, \bm{y}, \bm{w}) + \lambda ||w||^2_2
|
|
||||||
\end{align}
|
|
||||||
where $E_{\text{data}}$ denotes the cross entropy error function, and $\{\bm{X}, \bm{y}\}$ denotes the input and target training pairs.
|
|
||||||
$\lambda$ controls the strength of regularization.
|
|
||||||
|
|
||||||
Weight penalties work by constraining the scale of the parameters and preventing them from growing too large, thereby avoiding overly sensitive behaviour on unseen data.
|
|
||||||
While L1 and L2 regularization are similar to each other in calculation, they have different effects.
|
|
||||||
The gradient magnitude of the L1 penalty does not depend on the weight value, so it tends to drive small weights exactly to 0 and can act as a form of feature selection, whereas the L2 penalty shrinks each weight in proportion to its magnitude.
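Concretely, the gradient contributions of the two penalty terms are
\begin{align}
\frac{\partial}{\partial \bm{w}} \lambda ||\bm{w}||_1 &= \lambda \, \mathrm{sign}(\bm{w}),\\
\frac{\partial}{\partial \bm{w}} \lambda ||\bm{w}||^2_2 &= 2 \lambda \bm{w},
\end{align}
so the L1 term applies a constant-magnitude pull towards zero (with a subgradient at zero), while the L2 term shrinks each weight in proportion to its current value.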
|
|
||||||
|
|
||||||
|
|
||||||
\subsection{Label smoothing}
|
|
||||||
Label smoothing regularizes a model with a softmax over $K$ output classes by replacing the hard target labels 0 and 1 with $\frac{\alpha}{K-1}$ and $1-\alpha$ respectively.
|
|
||||||
$\alpha$ is typically set to a small number such as $0.1$.
|
|
||||||
\begin{equation}
|
|
||||||
\tilde{t}_k =
\begin{cases}
|
|
||||||
\frac{\alpha}{K-1}, & \quad \text{if} \quad t_k=0\\
|
|
||||||
1 - \alpha, & \quad \text{if} \quad t_k=1
|
|
||||||
\end{cases}
|
|
||||||
\end{equation}
|
|
||||||
The standard cross-entropy error is typically used with these \emph{soft} targets to train the neural network.
|
|
||||||
Hence, implementing label smoothing requires only modifying the targets of the training set.
|
|
||||||
This strategy may prevent a neural network from developing very large weights by discouraging very confident output values.
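With these soft targets $\tilde{t}_k$, the per-example cross-entropy for a sample of true class $y$ and predicted softmax probabilities $p_k$ becomes
\begin{equation}
E = -\sum_{k=1}^{K} \tilde{t}_k \ln p_k = -(1-\alpha) \ln p_y - \frac{\alpha}{K-1} \sum_{k \neq y} \ln p_k,
\end{equation}
which keeps a small loss contribution from every class and is no longer minimised by pushing the predicted probability of the true class all the way to 1.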
|
|
||||||
|
|
||||||
\section{Balanced EMNIST Experiments}
|
|
||||||
|
|
||||||
\questionTableThree
|
|
||||||
|
|
||||||
\questionFigureFour
|
|
||||||
|
|
||||||
\label{sec:task2.2}
|
|
||||||
|
|
||||||
Here we evaluate the effectiveness of the given regularization methods for reducing overfitting on the EMNIST dataset.
|
|
||||||
We build a baseline architecture with three hidden layers, each with 128 neurons, which suffers from overfitting as shown in section \ref{sec:task1}.
|
|
||||||
|
|
||||||
Here we train the network with a lower learning rate of $10^{-4}$, as the previous runs were overfitting after only a handful of epochs.
|
|
||||||
Results for the new baseline (cf.\ Table~\ref{tab:hp_search}) confirm that the lower learning rate helps, so all further experiments use it.
|
|
||||||
|
|
||||||
Next, we apply dropout, L1 regularization, or L2 regularization to our baseline and search for good hyperparameters on the validation set.
We also apply label smoothing with $\alpha=0.1$ to our baseline.
|
|
||||||
We summarize all the experimental results in Table~\ref{tab:hp_search}. For each method except label smoothing, we plot the relationship between generalization gap and validation accuracy in Figure~\ref{fig:hp_search}.
|
|
||||||
|
|
||||||
First we analyze the three methods separately, training each over a set of hyperparameters and comparing their best-performing results.
|
|
||||||
|
|
||||||
\questionSix.
|
|
||||||
|
|
||||||
\questionSeven.
|
|
||||||
|
|
||||||
|
|
||||||
\section{Conclusion}
|
|
||||||
\label{sec:concl}
|
|
||||||
|
|
||||||
\questionEight.
|
|
||||||
|
|
||||||
\newpage
|
|
||||||
\bibliography{refs}
|
|
||||||
|
|
||||||
\end{document}
|
|
||||||
|
|
report/mlp-cw2-questions.tex (new file, 148 lines)
@@ -0,0 +1,148 @@
|
%% REPLACE sXXXXXXX with your student number
|
||||||
|
\def\studentNumber{sXXXXXXX}
|
||||||
|
|
||||||
|
|
||||||
|
%% START of YOUR ANSWERS
|
||||||
|
%% Add answers to the questions below, by replacing the text inside the brackets {} for \youranswer{ "Text to be replaced with your answer." }.
|
||||||
|
%
|
||||||
|
% Do not delete the commands for adding figures and tables. Instead fill in the missing values with your experiment results, and replace the images with your own respective figures.
|
||||||
|
%
|
||||||
|
% You can generally delete the placeholder text, such as for example the text "Question Figure 3 - Replace the images ..."
|
||||||
|
%
|
||||||
|
% There are 5 TEXT QUESTIONS. Replace the text inside the brackets of the command \youranswer with your answer to the question.
|
||||||
|
%
|
||||||
|
% There are also 3 "questions" to replace some placeholder FIGURES with your own, and 1 "question" asking you to fill in the missing entries in the TABLE provided.
|
||||||
|
%
|
||||||
|
% NOTE! that questions are ordered by the order of appearance of their answers in the text, and not necessarily by the order you should tackle them. You should attempt to fill in the TABLE and FIGURES before discussing the results presented there.
|
||||||
|
%
|
||||||
|
% NOTE! If for some reason you do not manage to produce results for some FIGURES and the TABLE, then you can get partial marks by discussing your expectations of the results in the relevant TEXT QUESTIONS. The TABLE specifically has enough information in it already for you to draw meaningful conclusions.
|
||||||
|
%
|
||||||
|
% Please refer to the coursework specification for more details.
|
||||||
|
|
||||||
|
|
||||||
|
%% - - - - - - - - - - - - TEXT QUESTIONS - - - - - - - - - - - -
|
||||||
|
|
||||||
|
%% Question 1:
|
||||||
|
\newcommand{\questionOne} {
|
||||||
|
\youranswer{Question 1 - Use Figures 1, 2, and 3 to identify the Vanishing Gradient Problem (which of these models suffers from it, and what are the consequences depicted?).
|
||||||
|
|
||||||
|
The average length for an answer to this question is approximately 1/5 of the columns in a 2-column page}
|
||||||
|
}
|
||||||
|
|
||||||
|
%% Question 2:
|
||||||
|
\newcommand{\questionTwo} {
|
||||||
|
\youranswer{Question 2 - Consider these results (including Figure 1 from \cite{he2016deep}). Discuss the relation between network capacity and overfitting, and whether, and how, this is reflected in these results. What other factors may have led to this difference in performance?
|
||||||
|
|
||||||
|
The average length for an answer to this question is
|
||||||
|
approximately 1/5 of the columns in a 2-column page}
|
||||||
|
}
|
||||||
|
|
||||||
|
%% Question 3:
|
||||||
|
\newcommand{\questionThree} {
|
||||||
|
\youranswer{Question 3 - In this coursework, we didn't incorporate residual connections to the downsampling layers. Explain and justify what would need to be changed in order to add residual connections to the downsampling layers. Give and explain 2 ways of incorporating these changes and discuss pros and cons of each.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
%% Question 4:
|
||||||
|
\newcommand{\questionFour} {
|
||||||
|
\youranswer{Question 4 - Present and discuss the experiment results (all of the results and not just the ones you had to fill in) in Table 1 and Figures 4 and 5 (you may use any of the other Figures if you think they are relevant to your analysis). You will have to determine what data are relevant to the discussion, and what information can be extracted from it. Also, discuss what further experiments you would have run on any combination of VGG08, VGG38, BN, RC in order to
|
||||||
|
\begin{itemize}
|
||||||
|
\item Improve performance of the model trained (explain why you expect your suggested experiments will help with this).
|
||||||
|
\item Learn more about the behaviour of BN and RC (explain what you are trying to learn and how).
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
The average length for an answer to this question is approximately 1 column of a 2-column page
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
%% Question 5:
|
||||||
|
\newcommand{\questionFive} {
|
||||||
|
\youranswer{Question 5 - Briefly draw your conclusions based on the results from the previous sections (what are the take-away messages?) and conclude your report with a recommendation for future work.
|
||||||
|
|
||||||
|
Good recommendations for future work also draw on the broader literature (the papers already referenced are good starting points). Great recommendations for future work are not just incremental (an example of an incremental suggestion would be: ``we could also train with different learning rates'') but instead also identify meaningful questions or, in other words, questions with answers that might be somewhat more generally applicable.
|
||||||
|
|
||||||
|
For example, \citep{huang2017densely} end with \begin{quote}``Because of their compact internal representations and reduced feature redundancy, DenseNets may be good feature extractors for various computer vision tasks that build on convolutional features, e.g., [4,5].''\end{quote}
|
||||||
|
|
||||||
|
while \cite{bengio1993problem} state in their conclusions that \begin{quote}``There remains theoretical questions to be considered, such as whether the problem with simple gradient descent discussed in this paper would be observed with chaotic attractors that are not hyperbolic.''\\\end{quote}
|
||||||
|
|
||||||
|
The length of this question description is indicative of the average length of a conclusion section}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
%% - - - - - - - - - - - - FIGURES - - - - - - - - - - - -
|
||||||
|
|
||||||
|
%% Question Figure 3:
|
||||||
|
\newcommand{\questionFigureThree} {
|
||||||
|
\youranswer{Question Figure 3 - Replace this image with a figure depicting the average gradient across layers, for the VGG38 model.
|
||||||
|
|
||||||
|
\textit{(The provided figure is correct, and can be used in your analysis. It is partially obscured so you can get credit for producing your own copy).}
|
||||||
|
%
|
||||||
|
\begin{figure}[t]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{figures/gradplot_38_watermarked.pdf}
|
||||||
|
\caption{Gradient Flow on VGG38}
|
||||||
|
\label{fig:avg_grad_flow_38}
|
||||||
|
\end{figure}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
%% Question Figure 4:
|
||||||
|
\newcommand{\questionFigureFour} {
|
||||||
|
\youranswer{Question Figure 4 - Replace this image with a figure depicting the training curves for the model with the best performance \textit{across experiments you have available (you don't need to run the experiments for the models we already give you results for)}. Edit the caption so that it clearly identifies the model and what is depicted.
|
||||||
|
%
|
||||||
|
\begin{figure}[t]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{example-image-duck}
|
||||||
|
\caption{Training curves for ? ? ?}
|
||||||
|
\label{fig:training_curves_bestModel}
|
||||||
|
\end{figure}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
%% Question Figure 5:
|
||||||
|
\newcommand{\questionFigureFive} {
|
||||||
|
\youranswer{Question Figure 5 - Replace this image with a figure depicting the average gradient across layers, for the model with the best performance \textit{across experiments you have available (you don't need to run the experiments for the models we already give you results for)}. Edit the caption so that it clearly identifies the model and what is depicted.
|
||||||
|
%
|
||||||
|
\begin{figure}[t]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{example-image-duck}
|
||||||
|
\caption{Gradient Flow on ? ? ?}
|
||||||
|
\label{fig:avg_grad_flow_bestModel}
|
||||||
|
\end{figure}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
%% - - - - - - - - - - - - TABLES - - - - - - - - - - - -
|
||||||
|
|
||||||
|
%% Question Table 1:
|
||||||
|
\newcommand{\questionTableOne} {
|
||||||
|
\youranswer{
|
||||||
|
Question Table 1 - Fill in Table 1 with the results from your experiments on
|
||||||
|
\begin{enumerate}
|
||||||
|
\item \textit{VGG38 BN (LR 1e-3)}, and
|
||||||
|
\item \textit{VGG38 BN + RC (LR 1e-2)}.
|
||||||
|
\end{enumerate}
|
||||||
|
%
|
||||||
|
\begin{table*}[t]
|
||||||
|
\centering
|
||||||
|
\begin{tabular}{lr|ccccc}
|
||||||
|
\toprule
|
||||||
|
Model & LR & \# Params & Train loss & Train acc & Val loss & Val acc \\
|
||||||
|
\midrule
|
||||||
|
VGG08 & 1e-3 & 60 K & 1.74 & 51.59 & 1.95 & 46.84 \\
|
||||||
|
VGG38 & 1e-3 & 336 K & 4.61 & 00.01 & 4.61 & 00.01 \\
|
||||||
|
VGG38 BN & 1e-3 & ? & ? & ? & ? & ? \\
|
||||||
|
VGG38 RC & 1e-3 & 336 K & 1.33 & 61.52 & 1.84 & 52.32 \\
|
||||||
|
VGG38 BN + RC & 1e-3 & 339 K & 1.26 & 62.99 & 1.73 & 53.76 \\
|
||||||
|
VGG38 BN & 1e-2 & 339 K & 1.70 & 52.28 & 1.99 & 46.72 \\
|
||||||
|
VGG38 BN + RC & 1e-2 & ? & ? & ? & ? & ? \\
|
||||||
|
\bottomrule
|
||||||
|
\end{tabular}
|
||||||
|
\caption{Experiment results (number of model parameters, Training and Validation loss and accuracy) for different combinations of VGG08, VGG38, Batch Normalisation (BN), and Residual Connections (RC), LR is learning rate.}
|
||||||
|
\label{tab:CIFAR_results}
|
||||||
|
\end{table*}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
%% END of YOUR ANSWERS
|
report/mlp-cw2-template.tex (new file, 314 lines)
@@ -0,0 +1,314 @@
|
%% Template for MLP Coursework 2 / 13 November 2023
|
||||||
|
|
||||||
|
%% Based on LaTeX template for ICML 2017 - example_paper.tex at
|
||||||
|
%% https://2017.icml.cc/Conferences/2017/StyleAuthorInstructions
|
||||||
|
|
||||||
|
\documentclass{article}
|
||||||
|
\input{mlp2022_includes}
|
||||||
|
|
||||||
|
|
||||||
|
\definecolor{red}{rgb}{0.95,0.4,0.4}
|
||||||
|
\definecolor{blue}{rgb}{0.4,0.4,0.95}
|
||||||
|
\definecolor{orange}{rgb}{1, 0.65, 0}
|
||||||
|
|
||||||
|
\newcommand{\youranswer}[1]{{\color{red} \bf[#1]}} %your answer:
|
||||||
|
|
||||||
|
|
||||||
|
%% START of YOUR ANSWERS
|
||||||
|
\input{mlp-cw2-questions}
|
||||||
|
%% END of YOUR ANSWERS
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
%% Do not change anything in this file. Add your answers to mlp-cw2-questions.tex
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
|
||||||
|
\twocolumn[
|
||||||
|
\mlptitle{MLP Coursework 2}
|
||||||
|
\centerline{\studentNumber}
|
||||||
|
\vskip 7mm
|
||||||
|
]
|
||||||
|
|
||||||
|
\begin{abstract}
|
||||||
|
Deep neural networks have become the state-of-the-art
|
||||||
|
in many standard computer vision problems thanks to their powerful
|
||||||
|
representations and the availability of large labeled datasets.
|
||||||
|
While very deep networks allow for learning more levels of abstraction from the data, training these models successfully is challenging due to problematic gradient flow through the layers, known as the vanishing/exploding gradient problem.
|
||||||
|
In this report, we first analyze this problem in VGG models with 8 and 38 hidden layers on the CIFAR100 image dataset, by monitoring the gradient flow during training.
|
||||||
|
We explore known solutions to this problem including batch normalization or residual connections, and explain their theory and implementation details.
|
||||||
|
Our experiments show that batch normalization and residual connections effectively address the aforementioned problem and hence enable a deeper model to outperform shallower ones in the same experimental setup.
|
||||||
|
\end{abstract}
|
||||||
|
|
||||||
|
\section{Introduction}
|
||||||
|
\label{sec:intro}
|
||||||
|
Despite the remarkable progress of modern convolutional neural networks (CNNs) in image classification problems~\cite{simonyan2014very, he2016deep}, training very deep networks is a challenging procedure.
|
||||||
|
One of the major problems is the Vanishing Gradient Problem (VGP), a phenomenon where the gradients of the error function with respect to the network weights shrink to zero as they are backpropagated to earlier layers, hence preventing effective weight updates.
|
||||||
|
This phenomenon is prevalent and has been extensively studied in various deep neural networks including feedforward networks~\cite{glorot2010understanding}, RNNs~\cite{bengio1993problem}, and CNNs~\cite{he2016deep}.
|
||||||
|
Multiple solutions have been proposed to mitigate this problem by using weight initialization strategies~\cite{glorot2010understanding},
|
||||||
|
activation functions~\cite{glorot2010understanding}, input normalization~\cite{bishop1995neural},
|
||||||
|
batch normalization~\cite{ioffe2015batch}, and shortcut connections \cite{he2016deep, huang2017densely}.
|
||||||
|
|
||||||
|
This report focuses on diagnosing the VGP occurring in the VGG38 model\footnote{VGG stands for the Visual Geometry Group at the University of Oxford.} and addressing it by implementing two standard solutions.
|
||||||
|
In particular, we first study a ``broken'' network in terms of its gradient flow, i.e.\ the L1 norm of the gradients with respect to the weights of each layer, and contrast it with that of the healthy, shallower VGG08 to pinpoint the problem.
|
||||||
|
Next, we review two standard solutions for this problem, batch normalization (BN)~\cite{ioffe2015batch} and residual connections (RC)~\cite{he2016deep} in detail and discuss how they can address the gradient problem.
|
||||||
|
We then incorporate batch normalization (denoted as VGG38+BN), residual connections (denoted as VGG38+RC), and their combination (denoted as VGG38+BN+RC) into the given VGG38 architecture.
|
||||||
|
We train the resulting three configurations, together with the VGG08 and VGG38 baselines, on the CIFAR100 (pronounced `see far 100') dataset and present the results.
|
||||||
|
The results show that, though using BN or RC separately does mitigate the vanishing/exploding gradient problem and thereby enables effective training of the VGG38 model, the best results are obtained by combining the two.
|
||||||
|
|
||||||
|
%
|
||||||
|
|
||||||
|
|
||||||
|
\section{Identifying training problems of a deep CNN}
|
||||||
|
\label{sec:task1}
|
||||||
|
|
||||||
|
\begin{figure}[t]
|
||||||
|
\begin{subfigure}{\linewidth}
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{figures/loss_plot.pdf}
|
||||||
|
\caption{Cross entropy error per epoch}
|
||||||
|
\label{fig:loss_curves}
|
||||||
|
\end{subfigure}
|
||||||
|
|
||||||
|
\begin{subfigure}{\linewidth}
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{figures/accuracy_plot.pdf}
|
||||||
|
\caption{Classification accuracy per epoch}
|
||||||
|
\label{fig:acc_curves}
|
||||||
|
\end{subfigure}
|
||||||
|
\caption{Training curves for VGG08 and VGG38 in terms of (a) cross-entropy error and (b) classification accuracy}
|
||||||
|
\label{fig:curves}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\begin{figure}[t]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{figures/grad_flow_vgg08.pdf}
|
||||||
|
\caption{Gradient flow on VGG08}
|
||||||
|
\label{fig:grad_flow_08}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\questionFigureThree
|
||||||
|
|
||||||
|
Concretely, training deep neural networks typically involves three steps: forward
|
||||||
|
pass, backward pass (or backpropagation algorithm~\cite{rumelhart1986learning}) and weight update.
|
||||||
|
The first step involves passing the input $\bx^{(0)}$ to the network and producing
|
||||||
|
the network prediction and also the error value.
|
||||||
|
In detail, each layer takes in the output of the previous layer and applies
|
||||||
|
a non-linear transformation:
|
||||||
|
\begin{equation}
|
||||||
|
\label{eq.fprop}
|
||||||
|
\bx^{(l)} = f^{(l)}(\bx^{(l-1)}; W^{(l)})
|
||||||
|
\end{equation}
|
||||||
|
where $(l)$ denotes the $l$-th layer in an $L$-layer deep network,
|
||||||
|
$f^{(l)}(\cdot,W^{(l)})$ is a non-linear transformation for layer $l$, and $W^{(l)}$ are the weights of layer $l$.
|
||||||
|
For instance, $f^{(l)}$ is typically a convolution operation followed by an activation function in convolutional neural networks.
|
||||||
|
The second step involves the backpropagation algorithm, where we calculate the gradient of an error function $E$ (\textit{e.g.} cross-entropy) for each layer's weight as follows:
|
||||||
|
|
||||||
|
\begin{equation}
|
||||||
|
\label{eq.bprop}
|
||||||
|
\frac{\partial E}{\partial W^{(l)}} = \frac{\partial E}{\partial \bx^{(L)}} \frac{\partial \bx^{(L)}}{\partial \bx^{(L-1)}} \dots \frac{\partial \bx^{(l+1)}}{\partial \bx^{(l)}}\frac{\partial \bx^{(l)}}{\partial W^{(l)}}.
|
||||||
|
\end{equation}
|
||||||
|
|
||||||
|
This step includes consecutive tensor multiplications between multiple
|
||||||
|
partial derivative terms.
|
||||||
|
The final step involves updating model weights by using the computed
|
||||||
|
$\frac{\partial E}{\partial W^{(l)}}$ with an update rule.
|
||||||
|
The exact update rule depends on the optimizer.
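For plain stochastic gradient descent, for example, the update is
\begin{equation}
W^{(l)} \leftarrow W^{(l)} - \eta \, \frac{\partial E}{\partial W^{(l)}},
\end{equation}
where $\eta$ is the learning rate; adaptive optimizers such as Adam rescale this step using running statistics of the gradients.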
|
||||||
|
|
||||||
|
A notorious problem for training deep neural networks is the vanishing/exploding gradient
|
||||||
|
problem~\cite{bengio1993problem} that typically occurs in the backpropagation step when some of the partial derivative terms in Eq.~\ref{eq.bprop} contain values larger or smaller than 1 in magnitude.
|
||||||
|
In this case, due to the many consecutive multiplications, the gradients \textit{w.r.t.} the weights can become exponentially small (close to 0) or exponentially large (approaching infinity), preventing effective learning of the network weights.
|
||||||
|
|
||||||
|
|
||||||
|
%
|
||||||
|
|
||||||
|
|
||||||
|
Figures~\ref{fig:grad_flow_08} and \ref{fig:grad_flow_38} depict the gradient flows through VGG architectures \cite{simonyan2014very} with 8 and 38 layers respectively, trained and evaluated for a total of 100 epochs on the CIFAR100 dataset. \questionOne.
|
||||||
|
|
||||||
|
|
||||||
|
\section{Background Literature}
|
||||||
|
\label{sec:lit_rev}
|
||||||
|
In this section we will highlight some of the most influential
|
||||||
|
papers that have been central to overcoming the VGP in
|
||||||
|
deep CNNs.
|
||||||
|
|
||||||
|
\paragraph{Batch Normalization}\cite{ioffe2015batch}
|
||||||
|
BN seeks to solve the problem of
|
||||||
|
internal covariate shift (ICS), whereby the distribution of each layer's
|
||||||
|
inputs changes during training, as the parameters of the previous layers change.
|
||||||
|
The authors argue that without batch normalization, the distribution of
|
||||||
|
each layer's inputs can vary significantly due to the stochastic nature of randomly sampling mini-batches from the
|
||||||
|
training set.
|
||||||
|
Layers in the network must hence continuously adapt to these high-variance distributions, which hinders the rate of convergence of gradient-based optimizers.
|
||||||
|
This optimization problem is exacerbated further with network depth due
|
||||||
|
to the updating of parameters at layer $l$ being dependent on
|
||||||
|
the previous $l-1$ layers.
|
||||||
|
|
||||||
|
It is hence beneficial to embed the normalization of
|
||||||
|
training data into the network architecture after work from
|
||||||
|
LeCun \emph{et al.} showed that training converges faster with
|
||||||
|
this addition \cite{lecun2012efficient}. Through standardizing
|
||||||
|
the inputs to each layer, we take a step towards achieving
|
||||||
|
the fixed distributions of inputs that remove the ill effects
|
||||||
|
of ICS. Ioffe and Szegedy demonstrate the effectiveness of
|
||||||
|
their technique through training an ensemble of BN
|
||||||
|
networks which achieve an accuracy on the ImageNet classification
|
||||||
|
task exceeding that of humans in 14 times fewer
|
||||||
|
training steps than the state-of-the-art of the time.
|
||||||
|
It should be noted, however, that the exact reason for BN’s effectiveness is still not completely understood and it is
|
||||||
|
an open research question~\cite{santurkar2018does}.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
\paragraph{Residual networks (ResNet)}\cite{he2016deep} A well-known way of mitigating the VGP is proposed by He~\emph{et al.} in \cite{he2016deep}. In their paper, the authors depict the error curves of a 20-layer and a 56-layer network to motivate their method. Both the training and testing errors of the 56-layer network are significantly higher than those of the shallower one.
|
||||||
|
|
||||||
|
\questionTwo.
|
||||||
|
|
||||||
|
Residual networks, colloquially
|
||||||
|
known as ResNets, aim to alleviate VGP through the
|
||||||
|
incorporation of skip connections that bypass the linear
|
||||||
|
transformations into the network architecture.
|
||||||
|
The authors argue that this new mapping is significantly easier
|
||||||
|
to optimize since if an identity mapping were optimal, the
|
||||||
|
network could comfortably learn to push the residual to
|
||||||
|
zero rather than attempting to fit an identity mapping via
|
||||||
|
a stack of nonlinear layers.
|
||||||
|
They bolster their argument
|
||||||
|
by successfully training ResNets with depths exceeding
|
||||||
|
1000 layers on the CIFAR10 dataset.
|
||||||
|
Prior to their work, training even a 100-layer network was regarded
|
||||||
|
as a great challenge within the deep learning community.
|
||||||
|
The addition of skip connections solves the VGP through
|
||||||
|
enabling information to flow more freely throughout the
|
||||||
|
network architecture without adding extra
parameters or computational complexity.
|
||||||
|
|
||||||
|
\section{Solution overview}
|
||||||
|
\subsection{Batch normalization}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
BN has been a standard component in the state-of-the-art
|
||||||
|
convolutional neural networks \cite{he2016deep,huang2017densely}.
|
||||||
|
% As mentioned in Section~\ref{sec:lit_rev},
|
||||||
|
Concretely, BN is a
|
||||||
|
layer transformation that is performed to whiten the activations
|
||||||
|
originating from each layer.
|
||||||
|
As computing full dataset statistics at each training iteration
|
||||||
|
would be computationally expensive, BN computes batch statistics
|
||||||
|
to approximate them.
|
||||||
|
Given a minibatch of $B$ training samples and their feature maps
|
||||||
|
$X = (\bx^1, \bx^2,\ldots , \bx^B)$ at an arbitrary layer where $X \in \mathbb{R}^{B\times H \times W \times C}$, $H, W$ are the height, width of the feature map and $C$ is the number of channels, the batch normalization first computes the following statistics:
|
||||||
|
|
||||||
|
\begin{align}
|
||||||
|
\label{eq.bnstats}
|
||||||
|
\mu_c &= \frac{1}{BWH} \sum_{n=1}^{B}\sum_{i,j=1}^{H,W} \bx_{cij}^{n}\\
|
||||||
|
\sigma^2_c &= \frac{1}{BWH}
|
||||||
|
\sum_{n=1}^{B}\sum_{i,j=1}^{H,W} (\bx_{cij}^{n} - \mu_{c})^2
|
||||||
|
\end{align} where $c$ indexes the channels and $i$, $j$ index the spatial ($y$, $x$) coordinates of the feature maps, and $\bm{\mu}$ and $\bm{\sigma}^2$ are the per-channel mean and variance of the batch.
|
||||||
|
|
||||||
|
BN applies the following operation on each feature map in batch B for every $c,i,j$:
|
||||||
|
\begin{equation}
|
||||||
|
\label{eq.bnop}
|
||||||
|
\text{BN}(\bx_{cij}) = \gamma_{c} \, \frac{\bx_{cij} - \mu_{c}}{\sqrt{\sigma^2_{c} + \epsilon}} + \beta_{c}
|
||||||
|
\end{equation} where $\gamma \in \mathbb{R}^C$ and $\beta\in \mathbb{R}^C$ are learnable parameters and $\epsilon$ is a small constant introduced to ensure numerical stability.
|
||||||
|
|
||||||
|
At inference time, using batch statistics is a poor choice as it introduces noise in the evaluation and might not even be well defined. Therefore, $\bm{\mu}$ and $\bm{\sigma}$ are replaced by running averages of the mean and variance computed during training, which is a better approximation of the full dataset statistics.
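As an illustration, a minimal NumPy sketch of the training-time batch normalization above, assuming the $B \times H \times W \times C$ layout used in Eq.~\ref{eq.bnstats} (an example of ours, not the coursework framework's implementation), is:
\begin{verbatim}
import numpy as np

def batchnorm_forward(x, gamma, beta, eps=1e-5):
    # x has shape (B, H, W, C); statistics are computed per
    # channel over the batch and spatial dimensions.
    mu = x.mean(axis=(0, 1, 2))               # shape (C,)
    var = x.var(axis=(0, 1, 2))               # shape (C,)
    x_hat = (x - mu) / np.sqrt(var + eps)     # normalise
    return gamma * x_hat + beta               # scale and shift

# Example usage with random feature maps.
rng = np.random.default_rng(0)
x = rng.standard_normal((100, 8, 8, 16))
out = batchnorm_forward(x, gamma=np.ones(16), beta=np.zeros(16))
\end{verbatim}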
|
||||||
|
|
||||||
|
Recent work
|
||||||
|
has shown that BatchNorm has a more fundamental
|
||||||
|
benefit of smoothing the optimization landscape during
|
||||||
|
training \cite{santurkar2018does} thus enhancing the predictive
|
||||||
|
power of gradients as our guide to the global minimum.
|
||||||
|
Furthermore, a smoother optimization landscape should
|
||||||
|
additionally enable the use of a wider range of learning
|
||||||
|
rates and initialization schemes which is congruent with the
|
||||||
|
findings of Ioffe and Szegedy in the original BatchNorm
|
||||||
|
paper~\cite{ioffe2015batch}.
|
||||||
|
|
||||||
|
|
||||||
|
\subsection{Residual connections}
|
||||||
|
|
||||||
|
Residual connections are another approach used in the state-of-the-art Residual Networks~\cite{he2016deep} to tackle the vanishing gradient problem.
|
||||||
|
Introduced by He \emph{et al.}~\cite{he2016deep}, a residual block consists of a
|
||||||
|
convolution (or group of convolutions) layer, ``short-circuited'' with an identity mapping.
|
||||||
|
More precisely, given a mapping $F^{(b)}$ that denotes the transformation of the block $b$ (multiple consecutive layers), $F^{(b)}$ is applied to its input
|
||||||
|
feature map $\bx^{(b-1)}$ as $\bx^{(b)} = \bx^{(b-1)} + F^{(b)}(\bx^{(b-1)})$.
|
||||||
|
|
||||||
|
Intuitively, stacking residual blocks creates an architecture where the input of each block
is given two paths: passing through the convolutions or skipping to the next block. A residual network can therefore be seen as an ensemble model averaging every sub-network
|
||||||
|
created by choosing one of the two paths. The skip connections allow gradients to flow
|
||||||
|
easily into early layers, since
|
||||||
|
\begin{equation}
|
||||||
|
\frac{\partial \bx^{(b)}}{\partial \bx^{(b-1)}} = \mathbbm{1} + \frac{\partial F^{(b)}(\bx^{(b-1)})}{\partial \bx^{(b-1)}}
|
||||||
|
\label{eq.grad_skip}
|
||||||
|
\end{equation} where $\bx^{(b-1)} \in \mathbb{R}^{C \times H \times W }$ and $\mathbbm{1}$ is a $\mathbb{R}^{C \times H \times W}$-dimensional tensor with entries 1 where $C$, $H$ and $W$ denote the number of feature maps, its height and width respectively.
|
||||||
|
Importantly, the identity term $\mathbbm{1}$ prevents the gradient from vanishing completely.
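A minimal sketch of this forward computation (ours, with a toy shape-preserving transformation standing in for the block's convolutions) is:
\begin{verbatim}
import numpy as np

def residual_block(x, f):
    # Output = input + F(input); the identity path lets
    # gradients flow directly to earlier layers.
    return x + f(x)

# Example usage: a toy shape-preserving transformation F.
rng = np.random.default_rng(0)
W = 0.1 * rng.standard_normal((16, 16))
f = lambda h: np.maximum(h @ W, 0.0)   # linear map + ReLU
x = rng.standard_normal((4, 16))
out = residual_block(x, f)
\end{verbatim}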
|
||||||
|
|
||||||
|
|
||||||
|
\section{Experiment Setup}
|
||||||
|
|
||||||
|
\questionFigureFour
|
||||||
|
|
||||||
|
\questionFigureFive
|
||||||
|
|
||||||
|
\questionTableOne
|
||||||
|
|
||||||
|
We conduct our experiment on the CIFAR100 dataset \cite{krizhevsky2009learning},
|
||||||
|
which consists of 60,000 32x32 colour images from 100 different classes. The number of samples per class is balanced, and the
|
||||||
|
samples are split into training, validation, and test set while
|
||||||
|
maintaining balanced class proportions. In total, there are 47,500; 2,500; and 10,000 instances in the training, validation,
|
||||||
|
and test set, respectively. Moreover, we apply data augmentation strategies (cropping, horizontal flipping) to improve the generalization of the model.
|
||||||
|
|
||||||
|
With the goal of understanding whether BN or skip connections
|
||||||
|
help fighting vanishing gradients, we first test these
|
||||||
|
methods independently, before combining them in an attempt
|
||||||
|
to fully exploit the depth of the VGG38 model.
|
||||||
|
|
||||||
|
All experiments are conducted using the Adam optimizer with the default
|
||||||
|
learning rate (1e-3) unless otherwise specified, cosine annealing, and a batch size of 100
|
||||||
|
for 100 epochs.
|
||||||
|
Additionally, training images are augmented with random
|
||||||
|
cropping and horizontal flipping.
|
||||||
|
Note that we do not use data augmentation at test time.
|
||||||
|
These hyperparameters along with the augmentation strategy are used
|
||||||
|
to produce the results shown in Fig.~\ref{fig:curves}.
|
||||||
|
|
||||||
|
When used, BN is applied after each convolutional layer, before the Leaky ReLU non-linearity.
Similarly, the skip connections are applied from before the convolution layer to before the final activation function of the block, as per Fig.~2 of \cite{he2016deep}.
Note that adding residual connections between the feature maps before and after downsampling requires special treatment, as there is a dimension mismatch between them.
Therefore, in this coursework, we do not use residual connections in the downsampling blocks; batch normalization, however, is still applied in these blocks.
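
The ordering described above can be made concrete with one more minimal sketch (again a PyTorch-style illustration under assumed layer sizes, not the provided codebase), in which the shortcut is added only when the input and output feature maps have matching shapes:
\begin{verbatim}
import torch.nn as nn

class ConvBNBlock(nn.Module):
    """Conv -> BatchNorm -> LeakyReLU, with an optional identity shortcut."""
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                              stride=stride, padding=1)
        self.bn = nn.BatchNorm2d(out_channels)  # BN right after the convolution
        self.act = nn.LeakyReLU()
        # Shortcut only when shapes match, i.e. not in downsampling blocks
        self.use_skip = (in_channels == out_channels) and (stride == 1)

    def forward(self, x):
        out = self.bn(self.conv(x))
        if self.use_skip:
            out = out + x   # add the shortcut before the final activation
        return self.act(out)
\end{verbatim}
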
\subsection{Residual Connections to Downsampling Layers}
\label{subsec:rescimp}

\questionThree.

\section{Results and Discussion}
\label{sec:disc}

\questionFour.

\section{Conclusion}
\label{sec:concl}

\questionFive.

\bibliography{refs}

\end{document}

@@ -64,7 +64,7 @@
 %%%%%%%%%%%%%%%%%%%%

 \NeedsTeXFormat{LaTeX2e}
-\ProvidesPackage{mlp2022}[2022/10/16 MLP Coursework Style File]
+\ProvidesPackage{mlp2022}[2021/10/16 MLP Coursework Style File]

 % Use fancyhdr package
 \RequirePackage{fancyhdr}
@@ -89,7 +89,7 @@
 \hypersetup{ %
 pdftitle={},
 pdfauthor={},
-pdfsubject={MLP Coursework 2022-23},
+pdfsubject={MLP Coursework 2021-22},
 pdfkeywords={},
 pdfborder=0 0 0,
 pdfpagemode=UseNone,

@@ -25,8 +25,6 @@
 \usepackage{color}
 \usepackage{booktabs} % To thicken table lines
 \usepackage{multirow} % Multirow cells in table
-\usepackage{soul}
-\usepackage{bm}

 % Packages hyperref and algorithmic misbehave sometimes. We can fix
 % this with the following command.
@@ -35,9 +33,10 @@

 % Set up MLP coursework style (based on ICML style)
 \usepackage{mlp2022}
-\mlptitlerunning{MLP Coursework 1 (\studentNumber)}
+\mlptitlerunning{MLP Coursework 2 (\studentNumber)}
 \bibliographystyle{icml2017}
+\usepackage{bm,bbm}
+\usepackage{soul}

 \DeclareMathOperator{\softmax}{softmax}
 \DeclareMathOperator{\sigmoid}{sigmoid}
@@ -47,3 +46,5 @@
 \DeclareMathOperator{\elu}{elu}
 \DeclareMathOperator{\selu}{selu}
 \DeclareMathOperator{\maxout}{maxout}
+\newcommand{\bx}{\bm{x}}

130 report/refs.bib

@@ -35,9 +35,129 @@
 year={2004}
 }

-@inproceedings{loshchilov2019decoupled,
-  title={Decoupled weight decay regularization},
-  author={Loshchilov, Ilya and Hutter, Frank},
-  booktitle={International Conference on Learning Representations (ICLR)},
-  year={2019}
-}
+@article{simonyan2014very,
+  title={Very deep convolutional networks for large-scale image recognition},
+  author={Simonyan, Karen and Zisserman, Andrew},
+  journal={arXiv preprint arXiv:1409.1556},
+  year={2014}
+}
+
+@inproceedings{he2016deep,
+  title={Deep residual learning for image recognition},
+  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+  pages={770--778},
+  year={2016}
+}
+
+@inproceedings{glorot2010understanding,
+  title={Understanding the difficulty of training deep feedforward neural networks},
+  author={Glorot, Xavier and Bengio, Yoshua},
+  booktitle={Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics},
+  pages={249--256},
+  year={2010},
+  organization={JMLR Workshop and Conference Proceedings}
+}
+
+@inproceedings{bengio1993problem,
+  title={The problem of learning long-term dependencies in recurrent networks},
+  author={Bengio, Yoshua and Frasconi, Paolo and Simard, Patrice},
+  booktitle={IEEE International Conference on Neural Networks},
+  pages={1183--1188},
+  year={1993},
+  organization={IEEE}
+}
+
+@inproceedings{ide2017improvement,
+  title={Improvement of learning for CNN with ReLU activation by sparse regularization},
+  author={Ide, Hidenori and Kurita, Takio},
+  booktitle={2017 International Joint Conference on Neural Networks (IJCNN)},
+  pages={2684--2691},
+  year={2017},
+  organization={IEEE}
+}
+
+@inproceedings{ioffe2015batch,
+  title={Batch normalization: Accelerating deep network training by reducing internal covariate shift},
+  author={Ioffe, Sergey and Szegedy, Christian},
+  booktitle={International Conference on Machine Learning},
+  pages={448--456},
+  year={2015},
+  organization={PMLR}
+}
+
+@inproceedings{huang2017densely,
+  title={Densely connected convolutional networks},
+  author={Huang, Gao and Liu, Zhuang and Van Der Maaten, Laurens and Weinberger, Kilian Q},
+  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+  pages={4700--4708},
+  year={2017}
+}
+
+@article{rumelhart1986learning,
+  title={Learning representations by back-propagating errors},
+  author={Rumelhart, David E and Hinton, Geoffrey E and Williams, Ronald J},
+  journal={Nature},
+  volume={323},
+  number={6088},
+  pages={533--536},
+  year={1986},
+  publisher={Nature Publishing Group}
+}
+
+@inproceedings{du2019gradient,
+  title={Gradient descent finds global minima of deep neural networks},
+  author={Du, Simon and Lee, Jason and Li, Haochuan and Wang, Liwei and Zhai, Xiyu},
+  booktitle={International Conference on Machine Learning},
+  pages={1675--1685},
+  year={2019},
+  organization={PMLR}
+}
+
+@inproceedings{pascanu2013difficulty,
+  title={On the difficulty of training recurrent neural networks},
+  author={Pascanu, Razvan and Mikolov, Tomas and Bengio, Yoshua},
+  booktitle={International Conference on Machine Learning},
+  pages={1310--1318},
+  year={2013},
+  organization={PMLR}
+}
+
+@article{li2017visualizing,
+  title={Visualizing the loss landscape of neural nets},
+  author={Li, Hao and Xu, Zheng and Taylor, Gavin and Studer, Christoph and Goldstein, Tom},
+  journal={arXiv preprint arXiv:1712.09913},
+  year={2017}
+}
+
+@inproceedings{santurkar2018does,
+  title={How does batch normalization help optimization?},
+  author={Santurkar, Shibani and Tsipras, Dimitris and Ilyas, Andrew and M{\k{a}}dry, Aleksander},
+  booktitle={Proceedings of the 32nd International Conference on Neural Information Processing Systems},
+  pages={2488--2498},
+  year={2018}
+}
+
+@article{krizhevsky2009learning,
+  title={Learning multiple layers of features from tiny images},
+  author={Krizhevsky, Alex and Hinton, Geoffrey and others},
+  year={2009},
+  publisher={Citeseer}
+}
+
+@incollection{lecun2012efficient,
+  title={Efficient backprop},
+  author={LeCun, Yann A and Bottou, L{\'e}on and Orr, Genevieve B and M{\"u}ller, Klaus-Robert},
+  booktitle={Neural Networks: Tricks of the Trade},
+  pages={9--48},
+  year={2012},
+  publisher={Springer}
+}
+
+@book{bishop1995neural,
+  title={Neural networks for pattern recognition},
+  author={Bishop, Christopher M and others},
+  year={1995},
+  publisher={Oxford University Press}
+}