@unpublished{heiss2019,
series = {arXiv},
author = {Heiss, Jakob and Teichmann, Josef and Wutte, Hanna},
publisher = {Cornell University},
year = {2019},
language = {en},
copyright = {In Copyright - Non-Commercial Use Permitted},
keywords = {early stopping; implicit regularization; machine learning; neural networks; spline; regression; gradient descent; artificial intelligence},
size = {53 p.},
address = {Ithaca, NY},
abstract = {Today, various forms of neural networks are trained to perform approximation tasks in many fields. However, the solutions obtained are not fully understood. Empirical results suggest that typical training algorithms favor regularized solutions. These observations motivate us to analyze properties of the solutions found by gradient descent initialized close to zero, that is frequently employed to perform the training task. As a starting point, we consider one dimensional (shallow) ReLU neural networks in which weights are chosen randomly and only the terminal layer is trained. We show that the resulting solution converges to the smooth spline interpolation of the training data as the number of hidden nodes tends to infinity. Moreover, we derive a correspondence between the early stopped gradient descent and the smoothing spline regression. This might give valuable insight on the properties of the solutions obtained using gradient descent methods in general settings.},
doi = {10.3929/ethz-b-000402003},
title = {How Implicit Regularization of Neural Networks Affects the Learned Function Part I},
archivePrefix = {arXiv},
eprint = {1911.02903}
}
@article{Dropout,
author = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov},
title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
journal = {Journal of Machine Learning Research},
year = 2014,
volume = 15,
number = 56,
pages = {1929--1958},
url = {http://jmlr.org/papers/v15/srivastava14a.html}
}
@article{ADADELTA,
author = {Matthew D. Zeiler},
title = {{ADADELTA:} An Adaptive Learning Rate Method},
journal = {CoRR},
volume = {abs/1212.5701},
year = 2012,
url = {http://arxiv.org/abs/1212.5701},
archivePrefix = {arXiv},
eprint = {1212.5701},
timestamp = {Mon, 13 Aug 2018 16:45:57 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1212-5701.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{backprop,
author={Rumelhart, David E.
and Hinton, Geoffrey E.
and Williams, Ronald J.},
title={Learning representations by back-propagating errors},
journal={Nature},
year={1986},
month={Oct},
day={01},
volume={323},
number={6088},
pages={533--536},
abstract={We describe a new learning procedure, back-propagation, for networks of neurone-like units. The procedure repeatedly adjusts the weights of the connections in the network so as to minimize a measure of the difference between the actual output vector of the net and the desired output vector. As a result of the weight adjustments, internal `hidden' units which are not part of the input or output come to represent important features of the task domain, and the regularities in the task are captured by the interactions of these units. The ability to create useful new features distinguishes back-propagation from earlier, simpler methods such as the perceptron-convergence procedure.},
issn={1476-4687},
doi={10.1038/323533a0},
url={https://doi.org/10.1038/323533a0}
}