final changes

This commit is contained in:
Anton Lydike 2024-11-22 09:26:24 +00:00
parent c29681b4ba
commit 46ca7c6dfd
6 changed files with 157 additions and 13 deletions

2
report/.gitignore vendored
View File

@ -1,2 +1,4 @@
*.fls *.fls
*.fdb_latexmk *.fdb_latexmk
s2759177/
*.zip

View File

@ -0,0 +1,101 @@
train_acc,train_loss,val_acc,val_loss
0.009600000000000001,4.609349,0.0104,4.6072426
0.009326315789473684,4.6068563,0.0092,4.606588
0.009747368421052631,4.6062207,0.0084,4.606326
0.009621052631578947,4.6059957,0.0076,4.6067405
0.009873684210526314,4.605887,0.0076,4.6068487
0.009136842105263157,4.605854,0.008,4.6074386
0.009536842105263158,4.605795,0.007200000000000001,4.6064863
0.009578947368421051,4.6057415,0.006400000000000001,4.6065035
0.009410526315789473,4.6058245,0.0076,4.606772
0.009094736842105263,4.6057224,0.007600000000000001,4.6064925
0.00911578947368421,4.605707,0.007200000000000001,4.6067533
0.009852631578947368,4.605685,0.007200000000000001,4.6068745
0.01031578947368421,4.6056952,0.0072,4.6067533
0.009789473684210527,4.6057863,0.0072,4.6070247
0.01031578947368421,4.6056023,0.0064,4.607134
0.010189473684210526,4.605698,0.0064,4.606934
0.009957894736842107,4.605643,0.006400000000000001,4.6068535
0.009452631578947369,4.605595,0.0064,4.6070676
0.009368421052631578,4.6057224,0.008,4.6070356
0.010210526315789474,4.6056094,0.009600000000000001,4.6070833
0.009557894736842105,4.6056895,0.0076,4.6069493
0.009600000000000001,4.605709,0.008400000000000001,4.60693
0.00985263157894737,4.6055284,0.0084,4.6068263
0.009200000000000002,4.60564,0.0076,4.6071053
0.009031578947368422,4.6056323,0.008400000000000001,4.606731
0.009663157894736842,4.60559,0.0068,4.6069546
0.008484210526315789,4.605676,0.009600000000000001,4.6063976
0.0096,4.605595,0.011200000000000002,4.6067076
0.00951578947368421,4.605619,0.0096,4.6068506
0.009242105263157895,4.6056657,0.0072,4.6067576
0.009326315789473684,4.6055913,0.012,4.6070724
0.01023157894736842,4.605646,0.012000000000000002,4.6066885
0.009494736842105262,4.605563,0.0072,4.6067305
0.009810526315789474,4.6055746,0.007200000000000001,4.6067824
0.010147368421052632,4.605596,0.0072,4.607214
0.009536842105263156,4.6055007,0.007200000000000001,4.607186
0.009452631578947369,4.605547,0.0072,4.607297
0.009578947368421055,4.6055694,0.0072,4.607313
0.009410526315789475,4.6055374,0.0072,4.60726
0.00985263157894737,4.605587,0.0072,4.6072307
0.009389473684210526,4.605559,0.0072,4.607227
0.009852631578947368,4.6055884,0.008,4.6070976
0.008968421052631579,4.6055803,0.008,4.607156
0.009536842105263158,4.605502,0.0076,4.6073594
0.009410526315789473,4.6055517,0.008,4.607176
0.01,4.6055126,0.006400000000000001,4.606937
0.009915789473684213,4.6055126,0.008,4.607185
0.009305263157894737,4.605594,0.0064,4.606834
0.009326315789473684,4.6054907,0.008,4.6070714
0.009094736842105263,4.6055007,0.0076,4.6068645
0.009052631578947368,4.6055903,0.008400000000000001,4.606755
0.010294736842105263,4.605449,0.008,4.6068816
0.009578947368421055,4.6054883,0.0064,4.6067166
0.009452631578947369,4.60552,0.01,4.6066008
0.008821052631578948,4.6054573,0.009600000000000001,4.6065955
0.008968421052631579,4.605544,0.008,4.6063676
0.010147368421052632,4.605516,0.0064,4.6068606
0.009600000000000001,4.6054597,0.0096,4.6072354
0.01008421052631579,4.605526,0.0076,4.6074166
0.010126315789473685,4.6054554,0.0076,4.6074657
0.009705263157894736,4.6054635,0.0088,4.607237
0.009726315789473684,4.605516,0.007200000000000001,4.606978
0.009894736842105262,4.6054883,0.0072,4.607135
0.009663157894736842,4.605501,0.007200000000000001,4.607015
0.00976842105263158,4.605536,0.008,4.6073785
0.009473684210526316,4.6055303,0.009600000000000001,4.6070166
0.009347368421052632,4.6054993,0.0076,4.607084
0.009178947368421054,4.6054535,0.0084,4.6070604
0.008842105263157892,4.605507,0.0076,4.6069884
0.009726315789473684,4.6055107,0.007599999999999999,4.6069903
0.009536842105263156,4.6054244,0.0084,4.6070695
0.009452631578947369,4.605474,0.0072,4.607035
0.009621052631578949,4.605444,0.0076,4.6071277
0.010084210526315791,4.6054263,0.0076,4.6071534
0.009326315789473686,4.605477,0.0088,4.607115
0.009010526315789472,4.60548,0.0076,4.6072206
0.010042105263157897,4.605475,0.0076,4.607185
0.00976842105263158,4.6054463,0.008400000000000001,4.6071196
0.01,4.605421,0.008,4.6069384
0.009536842105263156,4.605482,0.008,4.607035
0.009915789473684213,4.6054354,0.008,4.6071534
0.010042105263157894,4.6054177,0.007200000000000001,4.607074
0.009242105263157895,4.605473,0.0072,4.606825
0.009726315789473684,4.6054006,0.0072,4.606701
0.009684210526315788,4.6054583,0.0104,4.606925
0.009642105263157895,4.6054606,0.0104,4.6068645
0.00936842105263158,4.605405,0.0076,4.606976
0.009263157894736843,4.605455,0.0076,4.606981
0.00905263157894737,4.6054463,0.0092,4.6070757
0.009915789473684213,4.605465,0.0068000000000000005,4.607151
0.009389473684210526,4.605481,0.008400000000000001,4.606995
0.009789473684210527,4.605436,0.0068000000000000005,4.6071105
0.010273684210526315,4.605466,0.007200000000000001,4.606909
0.009789473684210527,4.605443,0.0072,4.6066866
0.009957894736842107,4.6053886,0.0076,4.606541
0.010168421052631578,4.605481,0.006400000000000001,4.606732
0.009242105263157894,4.605444,0.006400000000000001,4.606939
0.009621052631578949,4.6054454,0.008,4.606915
0.00976842105263158,4.60547,0.0076,4.6068935
0.009873684210526316,4.6055245,0.0064,4.6072345
1 train_acc train_loss val_acc val_loss
2 0.009600000000000001 4.609349 0.0104 4.6072426
3 0.009326315789473684 4.6068563 0.0092 4.606588
4 0.009747368421052631 4.6062207 0.0084 4.606326
5 0.009621052631578947 4.6059957 0.0076 4.6067405
6 0.009873684210526314 4.605887 0.0076 4.6068487
7 0.009136842105263157 4.605854 0.008 4.6074386
8 0.009536842105263158 4.605795 0.007200000000000001 4.6064863
9 0.009578947368421051 4.6057415 0.006400000000000001 4.6065035
10 0.009410526315789473 4.6058245 0.0076 4.606772
11 0.009094736842105263 4.6057224 0.007600000000000001 4.6064925
12 0.00911578947368421 4.605707 0.007200000000000001 4.6067533
13 0.009852631578947368 4.605685 0.007200000000000001 4.6068745
14 0.01031578947368421 4.6056952 0.0072 4.6067533
15 0.009789473684210527 4.6057863 0.0072 4.6070247
16 0.01031578947368421 4.6056023 0.0064 4.607134
17 0.010189473684210526 4.605698 0.0064 4.606934
18 0.009957894736842107 4.605643 0.006400000000000001 4.6068535
19 0.009452631578947369 4.605595 0.0064 4.6070676
20 0.009368421052631578 4.6057224 0.008 4.6070356
21 0.010210526315789474 4.6056094 0.009600000000000001 4.6070833
22 0.009557894736842105 4.6056895 0.0076 4.6069493
23 0.009600000000000001 4.605709 0.008400000000000001 4.60693
24 0.00985263157894737 4.6055284 0.0084 4.6068263
25 0.009200000000000002 4.60564 0.0076 4.6071053
26 0.009031578947368422 4.6056323 0.008400000000000001 4.606731
27 0.009663157894736842 4.60559 0.0068 4.6069546
28 0.008484210526315789 4.605676 0.009600000000000001 4.6063976
29 0.0096 4.605595 0.011200000000000002 4.6067076
30 0.00951578947368421 4.605619 0.0096 4.6068506
31 0.009242105263157895 4.6056657 0.0072 4.6067576
32 0.009326315789473684 4.6055913 0.012 4.6070724
33 0.01023157894736842 4.605646 0.012000000000000002 4.6066885
34 0.009494736842105262 4.605563 0.0072 4.6067305
35 0.009810526315789474 4.6055746 0.007200000000000001 4.6067824
36 0.010147368421052632 4.605596 0.0072 4.607214
37 0.009536842105263156 4.6055007 0.007200000000000001 4.607186
38 0.009452631578947369 4.605547 0.0072 4.607297
39 0.009578947368421055 4.6055694 0.0072 4.607313
40 0.009410526315789475 4.6055374 0.0072 4.60726
41 0.00985263157894737 4.605587 0.0072 4.6072307
42 0.009389473684210526 4.605559 0.0072 4.607227
43 0.009852631578947368 4.6055884 0.008 4.6070976
44 0.008968421052631579 4.6055803 0.008 4.607156
45 0.009536842105263158 4.605502 0.0076 4.6073594
46 0.009410526315789473 4.6055517 0.008 4.607176
47 0.01 4.6055126 0.006400000000000001 4.606937
48 0.009915789473684213 4.6055126 0.008 4.607185
49 0.009305263157894737 4.605594 0.0064 4.606834
50 0.009326315789473684 4.6054907 0.008 4.6070714
51 0.009094736842105263 4.6055007 0.0076 4.6068645
52 0.009052631578947368 4.6055903 0.008400000000000001 4.606755
53 0.010294736842105263 4.605449 0.008 4.6068816
54 0.009578947368421055 4.6054883 0.0064 4.6067166
55 0.009452631578947369 4.60552 0.01 4.6066008
56 0.008821052631578948 4.6054573 0.009600000000000001 4.6065955
57 0.008968421052631579 4.605544 0.008 4.6063676
58 0.010147368421052632 4.605516 0.0064 4.6068606
59 0.009600000000000001 4.6054597 0.0096 4.6072354
60 0.01008421052631579 4.605526 0.0076 4.6074166
61 0.010126315789473685 4.6054554 0.0076 4.6074657
62 0.009705263157894736 4.6054635 0.0088 4.607237
63 0.009726315789473684 4.605516 0.007200000000000001 4.606978
64 0.009894736842105262 4.6054883 0.0072 4.607135
65 0.009663157894736842 4.605501 0.007200000000000001 4.607015
66 0.00976842105263158 4.605536 0.008 4.6073785
67 0.009473684210526316 4.6055303 0.009600000000000001 4.6070166
68 0.009347368421052632 4.6054993 0.0076 4.607084
69 0.009178947368421054 4.6054535 0.0084 4.6070604
70 0.008842105263157892 4.605507 0.0076 4.6069884
71 0.009726315789473684 4.6055107 0.007599999999999999 4.6069903
72 0.009536842105263156 4.6054244 0.0084 4.6070695
73 0.009452631578947369 4.605474 0.0072 4.607035
74 0.009621052631578949 4.605444 0.0076 4.6071277
75 0.010084210526315791 4.6054263 0.0076 4.6071534
76 0.009326315789473686 4.605477 0.0088 4.607115
77 0.009010526315789472 4.60548 0.0076 4.6072206
78 0.010042105263157897 4.605475 0.0076 4.607185
79 0.00976842105263158 4.6054463 0.008400000000000001 4.6071196
80 0.01 4.605421 0.008 4.6069384
81 0.009536842105263156 4.605482 0.008 4.607035
82 0.009915789473684213 4.6054354 0.008 4.6071534
83 0.010042105263157894 4.6054177 0.007200000000000001 4.607074
84 0.009242105263157895 4.605473 0.0072 4.606825
85 0.009726315789473684 4.6054006 0.0072 4.606701
86 0.009684210526315788 4.6054583 0.0104 4.606925
87 0.009642105263157895 4.6054606 0.0104 4.6068645
88 0.00936842105263158 4.605405 0.0076 4.606976
89 0.009263157894736843 4.605455 0.0076 4.606981
90 0.00905263157894737 4.6054463 0.0092 4.6070757
91 0.009915789473684213 4.605465 0.0068000000000000005 4.607151
92 0.009389473684210526 4.605481 0.008400000000000001 4.606995
93 0.009789473684210527 4.605436 0.0068000000000000005 4.6071105
94 0.010273684210526315 4.605466 0.007200000000000001 4.606909
95 0.009789473684210527 4.605443 0.0072 4.6066866
96 0.009957894736842107 4.6053886 0.0076 4.606541
97 0.010168421052631578 4.605481 0.006400000000000001 4.606732
98 0.009242105263157894 4.605444 0.006400000000000001 4.606939
99 0.009621052631578949 4.6054454 0.008 4.606915
100 0.00976842105263158 4.60547 0.0076 4.6068935
101 0.009873684210526316 4.6055245 0.0064 4.6072345

View File

@ -0,0 +1,2 @@
test_acc,test_loss
0.01,4.6053004
1 test_acc test_loss
2 0.01 4.6053004

Binary file not shown.

View File

@ -62,22 +62,29 @@ The difference between these two methods is that the first approach using a $1\t
% %
% The average length for an answer to this question is approximately 1 of the columns in a 2-column page % The average length for an answer to this question is approximately 1 of the columns in a 2-column page
\newcommand{\questionFour} { \newcommand{\questionFour} {
\youranswer{test1 \youranswer{
Our results demonstrate the effectiveness of batch normalization and residual connections as proposed by \cite{he2016deep}, enabling effective training of deep convolutional networks as shown by the significant improvement in training and validation performance for VGG38 when incorporating these techniques. Table~\ref{tab:CIFAR_results} highlights that adding BN alone (VGG38 BN) reduces both training and validation losses compared to the baseline VGG38, with validation accuracy increasing from near-zero to $47.68\%$ at a learning rate (LR) of $1\mathrm{e}{-3}$. Adding RC further enhances performance, as seen in VGG38 RC achieving $52.32\%$ validation accuracy under the same conditions. The combination of BN and RC (VGG38 BN + RC) yields the best results, achieving $53.76\%$ validation accuracy with an LR of $1\mathrm{e}{-3}$. BN+RC appears to benefit greatly from a higher learning rate, as it improves further to $58.20\%$ at an LR of $1\mathrm{e}{-2}$. BN alone, however, deteriorates at higher learning rates, as evidenced by lower validation accuracy, emphasizing the stabilizing role of RC. \autoref{fig:training_curves_bestModel} confirms the synergy of BN and RC, with the VGG38 BN + RC model reaching $74\%$ training accuracy and plateauing near $60\%$ validation accuracy. \autoref{fig:avg_grad_flow_bestModel} illustrates stable gradient flow, with BN mitigating vanishing gradients and RC maintaining gradient propagation through deeper layers, particularly in the later stages of the network.
This work did not evaluate residual connections on downsampling layers. A thorough evaluation of both methods put forth earlier would be required to complete the picture, highlighting how exactly residual connections in downsampling layers affect gradient flow, feature learning, and overall performance. Such an evaluation would clarify whether the additional computational cost of using $1\times 1$ convolutions for matching dimensions is justified by improved accuracy or if the simpler pooling-based approach suffices, particularly for tasks where computational efficiency is crucial.
} }
} }
%% Question 5: %% Question 5:
% Briefly draw your conclusions based on the results from the previous sections (what are the take-away messages?) and conclude your report with a recommendation for future work.
%
% Good recommendations for future work also draw on the broader literature (the papers already referenced are good starting points). Great recommendations for future work are not just incremental (an example of an incremental suggestion would be: ``we could also train with different learning rates'') but instead also identify meaningful questions or, in other words, questions with answers that might be somewhat more generally applicable.
%
% For example, \citep{huang2017densely} end with \begin{quote}``Because of their compact internal representations and reduced feature redundancy, DenseNets may be good feature extractors for various computer vision tasks that build on convolutional features, e.g., [4,5].''\end{quote}
%
% while \cite{bengio1993problem} state in their conclusions that \begin{quote}``There remains theoretical questions to be considered, such as whether the problem with simple gradient descent discussed in this paper would be observed with chaotic attractors that are not hyperbolic.''\\\end{quote}
%
% The length of this question description is indicative of the average length of a conclusion section
\newcommand{\questionFive} { \newcommand{\questionFive} {
\youranswer{Question 5 - Briefly draw your conclusions based on the results from the previous sections (what are the take-away messages?) and conclude your report with a recommendation for future work. \youranswer{
The results presented showcase a clear solution to the vanishing gradient problem. With batch normalization and residual connections, we are able to train much deeper neural networks effectively, as evidenced by the improved performance of VGG38 with these modifications. The combination of BN and RC not only stabilizes gradient flow but also enhances both training and validation accuracy, particularly when paired with an appropriate learning rate. These findings reinforce the utility of architectural innovations like those proposed in \cite{he2016deep} and \cite{ioffe2015batch}, which have become foundational in modern deep learning.
Good recommendations for future work also draw on the broader literature (the papers already referenced are good starting points). Great recommendations for future work are not just incremental (an example of an incremental suggestion would be: ``we could also train with different learning rates'') but instead also identify meaningful questions or, in other words, questions with answers that might be somewhat more generally applicable. While these methods appear to enable training of deeper neural networks, the critical question of how these architectural enhancements generalize across different datasets and tasks remains open. Future work could investigate the effectiveness of BN and RC in scenarios involving large-scale datasets, such as ImageNet, or in domains like natural language processing and generative models, where deep architectures also face optimization challenges. Additionally, exploring the interplay between residual connections and emerging techniques like attention mechanisms \citep{vaswani2017attention} might uncover further synergies. Beyond this, understanding the theoretical underpinnings of how residual connections influence optimization landscapes and gradient flow could yield insights applicable to designing novel architectures.}
For example, \citep{huang2017densely} end with \begin{quote}``Because of their compact internal representations and reduced feature redundancy, DenseNets may be good feature extractors for various computer vision tasks that build on convolutional features, e.g., [4,5].''\end{quote}
while \cite{bengio1993problem} state in their conclusions that \begin{quote}``There remains theoretical questions to be considered, such as whether the problem with simple gradient descent discussed in this paper would be observed with chaotic attractors that are not hyperbolic.''\\\end{quote}
The length of this question description is indicative of the average length of a conclusion section}
} }
@ -102,9 +109,20 @@ The length of this question description is indicative of the average length of a
\newcommand{\questionFigureFour} { \newcommand{\questionFigureFour} {
\youranswer{ \youranswer{
\begin{figure}[t] \begin{figure}[t]
\centering \begin{subfigure}{\linewidth}
\includegraphics[width=\linewidth]{figures/VGG38_BN_RC_accuracy_performance.pdf} \centering
\caption{Training curves for 38 layer CNN with batch normalisation and residual connections, trained with LR of $0.01$} \includegraphics[width=\linewidth]{figures/VGG38_BN_RC_loss_performance.pdf}
\caption{Cross entropy error per epoch}
\label{fig:vgg38_loss_curves}
\end{subfigure}
\begin{subfigure}{\linewidth}
\centering
\includegraphics[width=\linewidth]{figures/VGG38_BN_RC_accuracy_performance.pdf}
\caption{Classification accuracy per epoch}
\label{fig:vgg38_acc_curves}
\end{subfigure}
\caption{Training curves for the 38 layer CNN with batch normalization and residual connections, trained with LR of $0.01$}
\label{fig:training_curves_bestModel} \label{fig:training_curves_bestModel}
\end{figure} \end{figure}
} }
@ -117,7 +135,7 @@ The length of this question description is indicative of the average length of a
\begin{figure}[t] \begin{figure}[t]
\centering \centering
\includegraphics[width=\linewidth]{figures/gradplot_38_bn_rc.pdf} \includegraphics[width=\linewidth]{figures/gradplot_38_bn_rc.pdf}
\caption{Gradient Flow for 38 layer CNN with batch normalisation and residual connections, trained with LR of $0.01$} \caption{Gradient Flow for the 38 layer CNN with batch normalization and residual connections, trained with LR of $0.01$}
\label{fig:avg_grad_flow_bestModel} \label{fig:avg_grad_flow_bestModel}
\end{figure} \end{figure}
} }

View File

@ -161,3 +161,24 @@
year={1995}, year={1995},
publisher={Oxford university press} publisher={Oxford university press}
} }
@article{vaswani2017attention,
author = {Ashish Vaswani and
Noam Shazeer and
Niki Parmar and
Jakob Uszkoreit and
Llion Jones and
Aidan N. Gomez and
Lukasz Kaiser and
Illia Polosukhin},
title = {Attention Is All You Need},
journal = {CoRR},
volume = {abs/1706.03762},
year = {2017},
url = {http://arxiv.org/abs/1706.03762},
eprinttype = {arXiv},
eprint = {1706.03762},
timestamp = {Sat, 23 Jan 2021 01:20:40 +0100},
biburl = {https://dblp.org/rec/journals/corr/VaswaniSPUJGKP17.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}