% One field per line; authors are stored in "Last, First" form.
% Acronyms inside titles are wrapped in braces so that styles which
% sentence-case titles do not lowercase them.
% The arXiv preprint keeps its human-readable journal string (classic styles
% print it) and additionally carries eprint/eprinttype for tools that read them.

@inproceedings{goodfellow2013maxout,
  author       = {Goodfellow, Ian and Warde-Farley, David and Mirza, Mehdi and Courville, Aaron and Bengio, Yoshua},
  title        = {Maxout networks},
  booktitle    = {International Conference on Machine Learning},
  pages        = {1319--1327},
  year         = {2013},
  organization = {PMLR},
}

@article{srivastava2014dropout,
  author    = {Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and Sutskever, Ilya and Salakhutdinov, Ruslan},
  title     = {Dropout: a simple way to prevent neural networks from overfitting},
  journal   = {Journal of Machine Learning Research},
  volume    = {15},
  number    = {1},
  pages     = {1929--1958},
  year      = {2014},
  publisher = {JMLR.org},
}

@book{Goodfellow-et-al-2016,
  author    = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
  title     = {Deep Learning},
  publisher = {MIT Press},
  year      = {2016},
  note      = {\url{http://www.deeplearningbook.org}},
}

@inproceedings{ng2004feature,
  author    = {Ng, Andrew Y.},
  title     = {Feature selection, {L1} vs. {L2} regularization, and rotational invariance},
  booktitle = {Proceedings of the Twenty-First International Conference on Machine Learning},
  pages     = {78},
  year      = {2004},
}

@article{simonyan2014very,
  author     = {Simonyan, Karen and Zisserman, Andrew},
  title      = {Very deep convolutional networks for large-scale image recognition},
  journal    = {arXiv preprint arXiv:1409.1556},
  year       = {2014},
  eprint     = {1409.1556},
  eprinttype = {arXiv},
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016},
}

@inproceedings{glorot2010understanding,
  author       = {Glorot, Xavier and Bengio, Yoshua},
  title        = {Understanding the difficulty of training deep feedforward neural networks},
  booktitle    = {Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics},
  pages        = {249--256},
  year         = {2010},
  organization = {JMLR Workshop and Conference Proceedings},
}

@inproceedings{bengio1993problem,
  author       = {Bengio, Yoshua and Frasconi, Paolo and Simard, Patrice},
  title        = {The problem of learning long-term dependencies in recurrent networks},
  booktitle    = {IEEE International Conference on Neural Networks},
  pages        = {1183--1188},
  year         = {1993},
  organization = {IEEE},
}

@inproceedings{ide2017improvement,
  author       = {Ide, Hidenori and Kurita, Takio},
  title        = {Improvement of learning for {CNN} with {ReLU} activation by sparse regularization},
  booktitle    = {2017 International Joint Conference on Neural Networks (IJCNN)},
  pages        = {2684--2691},
  year         = {2017},
  organization = {IEEE},
}

@inproceedings{ioffe2015batch,
  author       = {Ioffe, Sergey and Szegedy, Christian},
  title        = {Batch normalization: Accelerating deep network training by reducing internal covariate shift},
  booktitle    = {International Conference on Machine Learning},
  pages        = {448--456},
  year         = {2015},
  organization = {PMLR},
}

@inproceedings{huang2017densely,
  author    = {Huang, Gao and Liu, Zhuang and van der Maaten, Laurens and Weinberger, Kilian Q.},
  title     = {Densely connected convolutional networks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {4700--4708},
  year      = {2017},
}

@article{rumelhart1986learning,
  author    = {Rumelhart, David E. and Hinton, Geoffrey E. and Williams, Ronald J.},
  title     = {Learning representations by back-propagating errors},
  journal   = {Nature},
  volume    = {323},
  number    = {6088},
  pages     = {533--536},
  year      = {1986},
  publisher = {Nature Publishing Group},
}

@inproceedings{du2019gradient,
  author       = {Du, Simon and Lee, Jason and Li, Haochuan and Wang, Liwei and Zhai, Xiyu},
  title        = {Gradient descent finds global minima of deep neural networks},
  booktitle    = {International Conference on Machine Learning},
  pages        = {1675--1685},
  year         = {2019},
  organization = {PMLR},
}

@inproceedings{pascanu2013difficulty,
  author       = {Pascanu, Razvan and Mikolov, Tomas and Bengio, Yoshua},
  title        = {On the difficulty of training recurrent neural networks},
  booktitle    = {International Conference on Machine Learning},
  pages        = {1310--1318},
  year         = {2013},
  organization = {PMLR},
}
% One field per line; authors are stored in "Last, First" form.
% The arXiv preprint by Li et al. keeps its human-readable journal string
% (classic styles print it) and additionally carries eprint/eprinttype.

@article{li2017visualizing,
  author     = {Li, Hao and Xu, Zheng and Taylor, Gavin and Studer, Christoph and Goldstein, Tom},
  title      = {Visualizing the loss landscape of neural nets},
  journal    = {arXiv preprint arXiv:1712.09913},
  year       = {2017},
  eprint     = {1712.09913},
  eprinttype = {arXiv},
}

@inproceedings{santurkar2018does,
  author    = {Santurkar, Shibani and Tsipras, Dimitris and Ilyas, Andrew and M{\k{a}}dry, Aleksander},
  title     = {How does batch normalization help optimization?},
  booktitle = {Proceedings of the 32nd International Conference on Neural Information Processing Systems},
  pages     = {2488--2498},
  year      = {2018},
}

% Technical report: typed as a report with its institution, replacing the
% former article entry whose journal field was empty.
% NOTE(review): author list reduced to the report's sole author; confirm
% against the report's title page.
@techreport{krizhevsky2009learning,
  author      = {Krizhevsky, Alex},
  title       = {Learning multiple layers of features from tiny images},
  institution = {University of Toronto},
  year        = {2009},
}

@incollection{lecun2012efficient,
  author    = {LeCun, Yann A. and Bottou, L{\'e}on and Orr, Genevieve B. and M{\"u}ller, Klaus-Robert},
  title     = {Efficient backprop},
  booktitle = {Neural Networks: Tricks of the Trade},
  pages     = {9--48},
  year      = {2012},
  publisher = {Springer},
}

% NOTE(review): single-author book; the exported "and others" was dropped.
@book{bishop1995neural,
  author    = {Bishop, Christopher M.},
  title     = {Neural networks for pattern recognition},
  publisher = {Oxford University Press},
  year      = {1995},
}

@article{vaswani2017attention,
  author     = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  title      = {Attention Is All You Need},
  journal    = {CoRR},
  volume     = {abs/1706.03762},
  year       = {2017},
  url        = {http://arxiv.org/abs/1706.03762},
  eprinttype = {arXiv},
  eprint     = {1706.03762},
  timestamp  = {Sat, 23 Jan 2021 01:20:40 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/VaswaniSPUJGKP17.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}