final changes

Anton Lydike 2024-11-22 09:26:24 +00:00
parent c29681b4ba
commit 46ca7c6dfd
6 changed files with 157 additions and 13 deletions

report/.gitignore vendored (2 changes)
View File

@@ -1,2 +1,4 @@
*.fls
*.fdb_latexmk
s2759177/
*.zip

View File

@@ -0,0 +1,101 @@
train_acc,train_loss,val_acc,val_loss
0.009600000000000001,4.609349,0.0104,4.6072426
0.009326315789473684,4.6068563,0.0092,4.606588
0.009747368421052631,4.6062207,0.0084,4.606326
0.009621052631578947,4.6059957,0.0076,4.6067405
0.009873684210526314,4.605887,0.0076,4.6068487
0.009136842105263157,4.605854,0.008,4.6074386
0.009536842105263158,4.605795,0.007200000000000001,4.6064863
0.009578947368421051,4.6057415,0.006400000000000001,4.6065035
0.009410526315789473,4.6058245,0.0076,4.606772
0.009094736842105263,4.6057224,0.007600000000000001,4.6064925
0.00911578947368421,4.605707,0.007200000000000001,4.6067533
0.009852631578947368,4.605685,0.007200000000000001,4.6068745
0.01031578947368421,4.6056952,0.0072,4.6067533
0.009789473684210527,4.6057863,0.0072,4.6070247
0.01031578947368421,4.6056023,0.0064,4.607134
0.010189473684210526,4.605698,0.0064,4.606934
0.009957894736842107,4.605643,0.006400000000000001,4.6068535
0.009452631578947369,4.605595,0.0064,4.6070676
0.009368421052631578,4.6057224,0.008,4.6070356
0.010210526315789474,4.6056094,0.009600000000000001,4.6070833
0.009557894736842105,4.6056895,0.0076,4.6069493
0.009600000000000001,4.605709,0.008400000000000001,4.60693
0.00985263157894737,4.6055284,0.0084,4.6068263
0.009200000000000002,4.60564,0.0076,4.6071053
0.009031578947368422,4.6056323,0.008400000000000001,4.606731
0.009663157894736842,4.60559,0.0068,4.6069546
0.008484210526315789,4.605676,0.009600000000000001,4.6063976
0.0096,4.605595,0.011200000000000002,4.6067076
0.00951578947368421,4.605619,0.0096,4.6068506
0.009242105263157895,4.6056657,0.0072,4.6067576
0.009326315789473684,4.6055913,0.012,4.6070724
0.01023157894736842,4.605646,0.012000000000000002,4.6066885
0.009494736842105262,4.605563,0.0072,4.6067305
0.009810526315789474,4.6055746,0.007200000000000001,4.6067824
0.010147368421052632,4.605596,0.0072,4.607214
0.009536842105263156,4.6055007,0.007200000000000001,4.607186
0.009452631578947369,4.605547,0.0072,4.607297
0.009578947368421055,4.6055694,0.0072,4.607313
0.009410526315789475,4.6055374,0.0072,4.60726
0.00985263157894737,4.605587,0.0072,4.6072307
0.009389473684210526,4.605559,0.0072,4.607227
0.009852631578947368,4.6055884,0.008,4.6070976
0.008968421052631579,4.6055803,0.008,4.607156
0.009536842105263158,4.605502,0.0076,4.6073594
0.009410526315789473,4.6055517,0.008,4.607176
0.01,4.6055126,0.006400000000000001,4.606937
0.009915789473684213,4.6055126,0.008,4.607185
0.009305263157894737,4.605594,0.0064,4.606834
0.009326315789473684,4.6054907,0.008,4.6070714
0.009094736842105263,4.6055007,0.0076,4.6068645
0.009052631578947368,4.6055903,0.008400000000000001,4.606755
0.010294736842105263,4.605449,0.008,4.6068816
0.009578947368421055,4.6054883,0.0064,4.6067166
0.009452631578947369,4.60552,0.01,4.6066008
0.008821052631578948,4.6054573,0.009600000000000001,4.6065955
0.008968421052631579,4.605544,0.008,4.6063676
0.010147368421052632,4.605516,0.0064,4.6068606
0.009600000000000001,4.6054597,0.0096,4.6072354
0.01008421052631579,4.605526,0.0076,4.6074166
0.010126315789473685,4.6054554,0.0076,4.6074657
0.009705263157894736,4.6054635,0.0088,4.607237
0.009726315789473684,4.605516,0.007200000000000001,4.606978
0.009894736842105262,4.6054883,0.0072,4.607135
0.009663157894736842,4.605501,0.007200000000000001,4.607015
0.00976842105263158,4.605536,0.008,4.6073785
0.009473684210526316,4.6055303,0.009600000000000001,4.6070166
0.009347368421052632,4.6054993,0.0076,4.607084
0.009178947368421054,4.6054535,0.0084,4.6070604
0.008842105263157892,4.605507,0.0076,4.6069884
0.009726315789473684,4.6055107,0.007599999999999999,4.6069903
0.009536842105263156,4.6054244,0.0084,4.6070695
0.009452631578947369,4.605474,0.0072,4.607035
0.009621052631578949,4.605444,0.0076,4.6071277
0.010084210526315791,4.6054263,0.0076,4.6071534
0.009326315789473686,4.605477,0.0088,4.607115
0.009010526315789472,4.60548,0.0076,4.6072206
0.010042105263157897,4.605475,0.0076,4.607185
0.00976842105263158,4.6054463,0.008400000000000001,4.6071196
0.01,4.605421,0.008,4.6069384
0.009536842105263156,4.605482,0.008,4.607035
0.009915789473684213,4.6054354,0.008,4.6071534
0.010042105263157894,4.6054177,0.007200000000000001,4.607074
0.009242105263157895,4.605473,0.0072,4.606825
0.009726315789473684,4.6054006,0.0072,4.606701
0.009684210526315788,4.6054583,0.0104,4.606925
0.009642105263157895,4.6054606,0.0104,4.6068645
0.00936842105263158,4.605405,0.0076,4.606976
0.009263157894736843,4.605455,0.0076,4.606981
0.00905263157894737,4.6054463,0.0092,4.6070757
0.009915789473684213,4.605465,0.0068000000000000005,4.607151
0.009389473684210526,4.605481,0.008400000000000001,4.606995
0.009789473684210527,4.605436,0.0068000000000000005,4.6071105
0.010273684210526315,4.605466,0.007200000000000001,4.606909
0.009789473684210527,4.605443,0.0072,4.6066866
0.009957894736842107,4.6053886,0.0076,4.606541
0.010168421052631578,4.605481,0.006400000000000001,4.606732
0.009242105263157894,4.605444,0.006400000000000001,4.606939
0.009621052631578949,4.6054454,0.008,4.606915
0.00976842105263158,4.60547,0.0076,4.6068935
0.009873684210526316,4.6055245,0.0064,4.6072345
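The file above logs one row per epoch with the columns train_acc, train_loss, val_acc, val_loss. As a minimal sketch only (not part of this commit; the input path and output file names are placeholders), curves of the kind referenced by the report's accuracy and loss figures could be generated from such a file like this:

# Illustrative only: plot per-epoch accuracy and loss curves from a metrics CSV
# with columns train_acc,train_loss,val_acc,val_loss. Paths are placeholders.
import csv

import matplotlib.pyplot as plt

epochs = []
metrics = {"train_acc": [], "train_loss": [], "val_acc": [], "val_loss": []}
with open("results/summary.csv") as f:  # placeholder path
    for i, row in enumerate(csv.DictReader(f), start=1):
        epochs.append(i)
        for key in metrics:
            metrics[key].append(float(row[key]))

for stem, ylabel, outfile in [
    ("acc", "classification accuracy", "accuracy_performance.pdf"),
    ("loss", "cross-entropy loss", "loss_performance.pdf"),
]:
    plt.figure()
    plt.plot(epochs, metrics[f"train_{stem}"], label="train")
    plt.plot(epochs, metrics[f"val_{stem}"], label="val")
    plt.xlabel("epoch")
    plt.ylabel(ylabel)
    plt.legend()
    plt.savefig(outfile)  # placeholder output names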

View File

@@ -0,0 +1,2 @@
test_acc,test_loss
0.01,4.6053004
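One point of reference for these numbers (an observation, not part of the commit): a loss of about $4.605$ together with an accuracy of about $0.01$ corresponds to chance-level predictions, assuming a 100-class task such as CIFAR-100, since a uniform prediction over 100 classes yields a cross-entropy of

\[
-\ln\!\left(\tfrac{1}{100}\right) = \ln(100) \approx 4.6052
\]

and an expected accuracy of $1/100 = 0.01$.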

Binary file not shown.

View File

@@ -62,22 +62,29 @@ The difference between these two methods is that the first approach using a $1\t
%
% The average length for an answer to this question is approximately 1 of the columns in a 2-column page
\newcommand{\questionFour} {
\youranswer{
Our results demonstrate the effectiveness of batch normalization (BN) and residual connections (RC) as proposed by \cite{he2016deep}, enabling the training of deep convolutional networks, as shown by the significant improvement in training and validation performance for VGG38 when incorporating these techniques. Table~\ref{tab:CIFAR_results} highlights that adding BN alone (VGG38 BN) reduces both training and validation losses compared to the baseline VGG38, with validation accuracy increasing from near-zero to $47.68\%$ at a learning rate (LR) of $1\mathrm{e}{-3}$. Adding RC further enhances performance, as seen in VGG38 RC achieving $52.32\%$ validation accuracy under the same conditions. The combination of BN and RC (VGG38 BN + RC) yields the best results, achieving $53.76\%$ validation accuracy with an LR of $1\mathrm{e}{-3}$. BN + RC appears to benefit greatly from a higher learning rate, improving further to $58.20\%$ at an LR of $1\mathrm{e}{-2}$. BN alone, however, deteriorates at higher learning rates, as evidenced by lower validation accuracy, emphasizing the stabilizing role of RC. \autoref{fig:training_curves_bestModel} confirms the synergy of BN and RC, with the VGG38 BN + RC model reaching $74\%$ training accuracy and plateauing near $60\%$ validation accuracy. \autoref{fig:avg_grad_flow_bestModel} illustrates stable gradient flow, with BN mitigating vanishing gradients and RC maintaining gradient propagation through deeper layers, particularly in the later stages of the network.
While this work did not evaluate residual connections on downsampling layers, a thorough evaluation of both methods put forth earlier would be required to complete the picture, highlighting how exactly residual connections in downsampling layers affect gradient flow, feature learning, and overall performance. Such an evaluation would clarify whether the additional computational cost of using $1\times 1$ convolutions for matching dimensions is justified by improved accuracy, or whether the simpler pooling-based approach suffices, particularly for tasks where computational efficiency is crucial.
}
}
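To make the architectural discussion above concrete, here is a minimal PyTorch-style sketch of a residual block with batch normalization; it is illustrative only and not the coursework implementation, and the class name and layer sizes are assumptions. The projection branch corresponds to the $1\times 1$ convolution option for downsampling stages discussed in the answer, while the identity branch is used when shapes already match.

# Illustrative sketch (not the coursework code): residual block with batch
# normalization and an optional 1x1-convolution projection shortcut.
import torch.nn as nn


class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        if stride != 1 or in_channels != out_channels:
            # Downsampling case: match spatial size and channel count with a 1x1 conv.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            # Identity shortcut when input and output shapes already agree.
            self.shortcut = nn.Identity()

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + self.shortcut(x))


# Example: ResidualBlock(64, 128, stride=2) halves the spatial resolution while
# doubling the channel count, using the projection shortcut.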
%% Question 5:
% Briefly draw your conclusions based on the results from the previous sections (what are the take-away messages?) and conclude your report with a recommendation for future work.
%
% Good recommendations for future work also draw on the broader literature (the papers already referenced are good starting points). Great recommendations for future work are not just incremental (an example of an incremental suggestion would be: ``we could also train with different learning rates'') but instead also identify meaningful questions or, in other words, questions with answers that might be somewhat more generally applicable.
%
% For example, \citep{huang2017densely} end with \begin{quote}``Because of their compact internal representations and reduced feature redundancy, DenseNets may be good feature extractors for various computer vision tasks that build on convolutional features, e.g., [4,5].''\end{quote}
%
% while \cite{bengio1993problem} state in their conclusions that \begin{quote}``There remains theoretical questions to be considered, such as whether the problem with simple gradient descent discussed in this paper would be observed with chaotic attractors that are not hyperbolic.''\\\end{quote}
%
% The length of this question description is indicative of the average length of a conclusion section
\newcommand{\questionFive} {
\youranswer{
The results presented showcase a clear solution to the vanishing gradient problem. With batch normalization and residual connections, we are able to train much deeper neural networks effectively, as evidenced by the improved performance of VGG38 with these modifications. The combination of BN and RC not only stabilizes gradient flow but also enhances both training and validation accuracy, particularly when paired with an appropriate learning rate. These findings reinforce the utility of architectural innovations like those proposed in \cite{he2016deep} and \cite{ioffe2015batch}, which have become foundational in modern deep learning.
While these methods appear to enable the training of deeper neural networks, the critical question of how these architectural enhancements generalize across different datasets and tasks remains open. Future work could investigate the effectiveness of BN and RC in scenarios involving large-scale datasets, such as ImageNet, or in domains like natural language processing and generative models, where deep architectures also face optimization challenges. Additionally, exploring the interplay between residual connections and emerging techniques like attention mechanisms \citep{vaswani2017attention} might uncover further synergies. Beyond this, understanding the theoretical underpinnings of how residual connections influence optimization landscapes and gradient flow could yield insights applicable to designing novel architectures.}
}
@@ -102,9 +109,20 @@ The length of this question description is indicative of the average length of a
\newcommand{\questionFigureFour} {
\youranswer{
\begin{figure}[t]
    \begin{subfigure}{\linewidth}
        \centering
        \includegraphics[width=\linewidth]{figures/VGG38_BN_RC_loss_performance.pdf}
        \caption{Cross entropy error per epoch}
        \label{fig:vgg38_loss_curves}
    \end{subfigure}
    \begin{subfigure}{\linewidth}
        \centering
        \includegraphics[width=\linewidth]{figures/VGG38_BN_RC_accuracy_performance.pdf}
        \caption{Classification accuracy per epoch}
        \label{fig:vgg38_acc_curves}
    \end{subfigure}
    \caption{Training curves for the 38 layer CNN with batch normalization and residual connections, trained with LR of $0.01$}
    \label{fig:training_curves_bestModel}
\end{figure}
}
}
@@ -117,7 +135,7 @@ The length of this question description is indicative of the average length of a
\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{figures/gradplot_38_bn_rc.pdf}
    \caption{Gradient Flow for the 38 layer CNN with batch normalization and residual connections, trained with LR of $0.01$}
    \label{fig:avg_grad_flow_bestModel}
\end{figure}
}
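Since the figure above reports average gradient flow per layer, a minimal sketch of how such per-layer gradient magnitudes can be collected after a backward pass is shown below; it is illustrative only (the function name and the model/criterion in the usage comment are assumptions), not the coursework plotting code.

# Illustrative only: collect mean absolute gradient per weight tensor after a
# backward pass, the kind of quantity shown in a gradient-flow plot.
import torch.nn as nn


def average_gradient_per_layer(model: nn.Module) -> dict:
    """Return {parameter name: mean |grad|} for weights that received a gradient."""
    flows = {}
    for name, param in model.named_parameters():
        if param.grad is not None and name.endswith("weight"):
            flows[name] = param.grad.abs().mean().item()
    return flows


# Usage (hypothetical model, criterion and batch):
#   loss = criterion(model(images), targets)
#   loss.backward()
#   grad_flow = average_gradient_per_layer(model)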

View File

@@ -161,3 +161,24 @@
  year={1995},
  publisher={Oxford university press}
}
@article{vaswani2017attention,
author = {Ashish Vaswani and
Noam Shazeer and
Niki Parmar and
Jakob Uszkoreit and
Llion Jones and
Aidan N. Gomez and
Lukasz Kaiser and
Illia Polosukhin},
title = {Attention Is All You Need},
journal = {CoRR},
volume = {abs/1706.03762},
year = {2017},
url = {http://arxiv.org/abs/1706.03762},
eprinttype = {arXiv},
eprint = {1706.03762},
timestamp = {Sat, 23 Jan 2021 01:20:40 +0100},
biburl = {https://dblp.org/rec/journals/corr/VaswaniSPUJGKP17.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}