Compare commits
2 Commits: cb9777f037 ... 340e0017c4

Author | SHA1 | Date
---|---|---
 | 340e0017c4 |
 | 06d93ef937 |
.gitignore | 4 | vendored
@@ -27,3 +27,7 @@ main-blx.bib

# no slurm logs
*slurm*.out

# no plot data
*.csv
*.mean
TF/.ipynb_checkpoints/Untitled-checkpoint.ipynb | 6 | Normal file
@@ -0,0 +1,6 @@
{
 "cells": [],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 2
}
TF/Untitled.ipynb | 266 | Normal file
@@ -0,0 +1,266 @@
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'numpy'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m--------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-2-d9bbc8b73862>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrcdefaults\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlines\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLine2D\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'numpy'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"plt.rcdefaults()\n",
|
||||
"from matplotlib.lines import Line2D\n",
|
||||
"from matplotlib.patches import Rectangle\n",
|
||||
"from matplotlib.patches import Circle\n",
|
||||
"\n",
|
||||
"NumDots = 4\n",
|
||||
"NumConvMax = 8\n",
|
||||
"NumFcMax = 20\n",
|
||||
"White = 1.\n",
|
||||
"Light = 0.7\n",
|
||||
"Medium = 0.5\n",
|
||||
"Dark = 0.3\n",
|
||||
"Darker = 0.15\n",
|
||||
"Black = 0.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def add_layer(patches, colors, size=(24, 24), num=5,\n",
|
||||
" top_left=[0, 0],\n",
|
||||
" loc_diff=[3, -3],\n",
|
||||
" ):\n",
|
||||
" # add a rectangle\n",
|
||||
" top_left = np.array(top_left)\n",
|
||||
" loc_diff = np.array(loc_diff)\n",
|
||||
" loc_start = top_left - np.array([0, size[0]])\n",
|
||||
" for ind in range(num):\n",
|
||||
" patches.append(Rectangle(loc_start + ind * loc_diff, size[1], size[0]))\n",
|
||||
" if ind % 2:\n",
|
||||
" colors.append(Medium)\n",
|
||||
" else:\n",
|
||||
" colors.append(Light)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def add_layer_with_omission(patches, colors, size=(24, 24),\n",
|
||||
" num=5, num_max=8,\n",
|
||||
" num_dots=4,\n",
|
||||
" top_left=[0, 0],\n",
|
||||
" loc_diff=[3, -3],\n",
|
||||
" ):\n",
|
||||
" # add a rectangle\n",
|
||||
" top_left = np.array(top_left)\n",
|
||||
" loc_diff = np.array(loc_diff)\n",
|
||||
" loc_start = top_left - np.array([0, size[0]])\n",
|
||||
" this_num = min(num, num_max)\n",
|
||||
" start_omit = (this_num - num_dots) // 2\n",
|
||||
" end_omit = this_num - start_omit\n",
|
||||
" start_omit -= 1\n",
|
||||
" for ind in range(this_num):\n",
|
||||
" if (num > num_max) and (start_omit < ind < end_omit):\n",
|
||||
" omit = True\n",
|
||||
" else:\n",
|
||||
" omit = False\n",
|
||||
"\n",
|
||||
" if omit:\n",
|
||||
" patches.append(\n",
|
||||
" Circle(loc_start + ind * loc_diff + np.array(size) / 2, 0.5))\n",
|
||||
" else:\n",
|
||||
" patches.append(Rectangle(loc_start + ind * loc_diff,\n",
|
||||
" size[1], size[0]))\n",
|
||||
"\n",
|
||||
" if omit:\n",
|
||||
" colors.append(Black)\n",
|
||||
" elif ind % 2:\n",
|
||||
" colors.append(Medium)\n",
|
||||
" else:\n",
|
||||
" colors.append(Light)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def add_mapping(patches, colors, start_ratio, end_ratio, patch_size, ind_bgn,\n",
|
||||
" top_left_list, loc_diff_list, num_show_list, size_list):\n",
|
||||
"\n",
|
||||
" start_loc = top_left_list[ind_bgn] \\\n",
|
||||
" + (num_show_list[ind_bgn] - 1) * np.array(loc_diff_list[ind_bgn]) \\\n",
|
||||
" + np.array([start_ratio[0] * (size_list[ind_bgn][1] - patch_size[1]),\n",
|
||||
" - start_ratio[1] * (size_list[ind_bgn][0] - patch_size[0])]\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" end_loc = top_left_list[ind_bgn + 1] \\\n",
|
||||
" + (num_show_list[ind_bgn + 1] - 1) * np.array(\n",
|
||||
" loc_diff_list[ind_bgn + 1]) \\\n",
|
||||
" + np.array([end_ratio[0] * size_list[ind_bgn + 1][1],\n",
|
||||
" - end_ratio[1] * size_list[ind_bgn + 1][0]])\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" patches.append(Rectangle(start_loc, patch_size[1], -patch_size[0]))\n",
|
||||
" colors.append(Dark)\n",
|
||||
" patches.append(Line2D([start_loc[0], end_loc[0]],\n",
|
||||
" [start_loc[1], end_loc[1]]))\n",
|
||||
" colors.append(Darker)\n",
|
||||
" patches.append(Line2D([start_loc[0] + patch_size[1], end_loc[0]],\n",
|
||||
" [start_loc[1], end_loc[1]]))\n",
|
||||
" colors.append(Darker)\n",
|
||||
" patches.append(Line2D([start_loc[0], end_loc[0]],\n",
|
||||
" [start_loc[1] - patch_size[0], end_loc[1]]))\n",
|
||||
" colors.append(Darker)\n",
|
||||
" patches.append(Line2D([start_loc[0] + patch_size[1], end_loc[0]],\n",
|
||||
" [start_loc[1] - patch_size[0], end_loc[1]]))\n",
|
||||
" colors.append(Darker)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def label(xy, text, xy_off=[0, 4]):\n",
|
||||
" plt.text(xy[0] + xy_off[0], xy[1] + xy_off[1], text,\n",
|
||||
" family='sans-serif', size=8)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == '__main__':\n",
|
||||
"\n",
|
||||
" fc_unit_size = 2\n",
|
||||
" layer_width = 40\n",
|
||||
" flag_omit = True\n",
|
||||
"\n",
|
||||
" patches = []\n",
|
||||
" colors = []\n",
|
||||
"\n",
|
||||
" fig, ax = plt.subplots()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" ############################\n",
|
||||
" # conv layers\n",
|
||||
" size_list = [(32, 32), (18, 18), (10, 10), (6, 6), (4, 4)]\n",
|
||||
" num_list = [3, 32, 32, 48, 48]\n",
|
||||
" x_diff_list = [0, layer_width, layer_width, layer_width, layer_width]\n",
|
||||
" text_list = ['Inputs'] + ['Feature\\nmaps'] * (len(size_list) - 1)\n",
|
||||
" loc_diff_list = [[3, -3]] * len(size_list)\n",
|
||||
"\n",
|
||||
" num_show_list = list(map(min, num_list, [NumConvMax] * len(num_list)))\n",
|
||||
" top_left_list = np.c_[np.cumsum(x_diff_list), np.zeros(len(x_diff_list))]\n",
|
||||
"\n",
|
||||
" for ind in range(len(size_list)-1,-1,-1):\n",
|
||||
" if flag_omit:\n",
|
||||
" add_layer_with_omission(patches, colors, size=size_list[ind],\n",
|
||||
" num=num_list[ind],\n",
|
||||
" num_max=NumConvMax,\n",
|
||||
" num_dots=NumDots,\n",
|
||||
" top_left=top_left_list[ind],\n",
|
||||
" loc_diff=loc_diff_list[ind])\n",
|
||||
" else:\n",
|
||||
" add_layer(patches, colors, size=size_list[ind],\n",
|
||||
" num=num_show_list[ind],\n",
|
||||
" top_left=top_left_list[ind], loc_diff=loc_diff_list[ind])\n",
|
||||
" label(top_left_list[ind], text_list[ind] + '\\n{}@{}x{}'.format(\n",
|
||||
" num_list[ind], size_list[ind][0], size_list[ind][1]))\n",
|
||||
"\n",
|
||||
" ############################\n",
|
||||
" # in between layers\n",
|
||||
" start_ratio_list = [[0.4, 0.5], [0.4, 0.8], [0.4, 0.5], [0.4, 0.8]]\n",
|
||||
" end_ratio_list = [[0.4, 0.5], [0.4, 0.8], [0.4, 0.5], [0.4, 0.8]]\n",
|
||||
" patch_size_list = [(5, 5), (2, 2), (5, 5), (2, 2)]\n",
|
||||
" ind_bgn_list = range(len(patch_size_list))\n",
|
||||
" text_list = ['Convolution', 'Max-pooling', 'Convolution', 'Max-pooling']\n",
|
||||
"\n",
|
||||
" for ind in range(len(patch_size_list)):\n",
|
||||
" add_mapping(\n",
|
||||
" patches, colors, start_ratio_list[ind], end_ratio_list[ind],\n",
|
||||
" patch_size_list[ind], ind,\n",
|
||||
" top_left_list, loc_diff_list, num_show_list, size_list)\n",
|
||||
" label(top_left_list[ind], text_list[ind] + '\\n{}x{} kernel'.format(\n",
|
||||
" patch_size_list[ind][0], patch_size_list[ind][1]), xy_off=[26, -65]\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" ############################\n",
|
||||
" # fully connected layers\n",
|
||||
" size_list = [(fc_unit_size, fc_unit_size)] * 3\n",
|
||||
" num_list = [768, 500, 2]\n",
|
||||
" num_show_list = list(map(min, num_list, [NumFcMax] * len(num_list)))\n",
|
||||
" x_diff_list = [sum(x_diff_list) + layer_width, layer_width, layer_width]\n",
|
||||
" top_left_list = np.c_[np.cumsum(x_diff_list), np.zeros(len(x_diff_list))]\n",
|
||||
" loc_diff_list = [[fc_unit_size, -fc_unit_size]] * len(top_left_list)\n",
|
||||
" text_list = ['Hidden\\nunits'] * (len(size_list) - 1) + ['Outputs']\n",
|
||||
"\n",
|
||||
" for ind in range(len(size_list)):\n",
|
||||
" if flag_omit:\n",
|
||||
" add_layer_with_omission(patches, colors, size=size_list[ind],\n",
|
||||
" num=num_list[ind],\n",
|
||||
" num_max=NumFcMax,\n",
|
||||
" num_dots=NumDots,\n",
|
||||
" top_left=top_left_list[ind],\n",
|
||||
" loc_diff=loc_diff_list[ind])\n",
|
||||
" else:\n",
|
||||
" add_layer(patches, colors, size=size_list[ind],\n",
|
||||
" num=num_show_list[ind],\n",
|
||||
" top_left=top_left_list[ind],\n",
|
||||
" loc_diff=loc_diff_list[ind])\n",
|
||||
" label(top_left_list[ind], text_list[ind] + '\\n{}'.format(\n",
|
||||
" num_list[ind]))\n",
|
||||
"\n",
|
||||
" text_list = ['Flatten\\n', 'Fully\\nconnected', 'Fully\\nconnected']\n",
|
||||
"\n",
|
||||
" for ind in range(len(size_list)):\n",
|
||||
" label(top_left_list[ind], text_list[ind], xy_off=[-10, -65])\n",
|
||||
"\n",
|
||||
" ############################\n",
|
||||
" for patch, color in zip(patches, colors):\n",
|
||||
" patch.set_color(color * np.ones(3))\n",
|
||||
" if isinstance(patch, Line2D):\n",
|
||||
" ax.add_line(patch)\n",
|
||||
" else:\n",
|
||||
" patch.set_edgecolor(Black * np.ones(3))\n",
|
||||
" ax.add_patch(patch)\n",
|
||||
"\n",
|
||||
" plt.tight_layout()\n",
|
||||
" plt.axis('equal')\n",
|
||||
" plt.axis('off')\n",
|
||||
" plt.show()\n",
|
||||
" fig.set_size_inches(8, 2.5)\n",
|
||||
"\n",
|
||||
" # fig_dir = './'\n",
|
||||
" # fig_ext = '.png'\n",
|
||||
" # fig.savefig(os.path.join(fig_dir, 'convnet_fig' + fig_ext),\n",
|
||||
" # bbox_inches='tight', pad_inches=0)\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
TF/cnn.py | 217 | Normal file
@@ -0,0 +1,217 @@
import os
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
plt.rcdefaults()
|
||||
from matplotlib.lines import Line2D
|
||||
from matplotlib.patches import Rectangle
|
||||
from matplotlib.patches import Circle
|
||||
|
||||
NumDots = 4
|
||||
NumConvMax = 8
|
||||
NumFcMax = 20
|
||||
White = 1.
|
||||
Light = 0.7
|
||||
Medium = 0.5
|
||||
Dark = 0.3
|
||||
Darker = 0.15
|
||||
Black = 0.
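# grayscale levels in [0, 1]; each value is later expanded to an RGB triple via color * np.ones(3) when the patches are drawn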
|
||||
|
||||
|
||||
def add_layer(patches, colors, size=(24, 24), num=5,
|
||||
top_left=[0, 0],
|
||||
loc_diff=[3, -3],
|
||||
):
|
||||
# add a rectangle
|
||||
top_left = np.array(top_left)
|
||||
loc_diff = np.array(loc_diff)
|
||||
loc_start = top_left - np.array([0, size[0]])
|
||||
for ind in range(num):
|
||||
patches.append(Rectangle(loc_start + ind * loc_diff, size[1], size[0]))
|
||||
if ind % 2:
|
||||
colors.append(Medium)
|
||||
else:
|
||||
colors.append(Light)
|
||||
|
||||
|
||||
def add_layer_with_omission(patches, colors, size=(24, 24),
|
||||
num=5, num_max=8,
|
||||
num_dots=4,
|
||||
top_left=[0, 0],
|
||||
loc_diff=[3, -3],
|
||||
):
|
||||
# add a rectangle
|
||||
top_left = np.array(top_left)
|
||||
loc_diff = np.array(loc_diff)
|
||||
loc_start = top_left - np.array([0, size[0]])
|
||||
this_num = min(num, num_max)
|
||||
start_omit = (this_num - num_dots) // 2
|
||||
end_omit = this_num - start_omit
|
||||
start_omit -= 1
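# when num exceeds num_max, indices strictly between start_omit and end_omit are drawn as small dots (circles) instead of rectangles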
|
||||
for ind in range(this_num):
|
||||
if (num > num_max) and (start_omit < ind < end_omit):
|
||||
omit = True
|
||||
else:
|
||||
omit = False
|
||||
|
||||
if omit:
|
||||
patches.append(
|
||||
Circle(loc_start + ind * loc_diff + np.array(size) / 2, 0.5))
|
||||
else:
|
||||
patches.append(Rectangle(loc_start + ind * loc_diff,
|
||||
size[1], size[0]))
|
||||
|
||||
if omit:
|
||||
colors.append(Black)
|
||||
elif ind % 2:
|
||||
colors.append(Medium)
|
||||
else:
|
||||
colors.append(Light)
|
||||
|
||||
|
||||
def add_mapping(patches, colors, start_ratio, end_ratio, patch_size, ind_bgn,
|
||||
top_left_list, loc_diff_list, num_show_list, size_list):
|
||||
|
||||
start_loc = top_left_list[ind_bgn] \
|
||||
+ (num_show_list[ind_bgn] - 1) * np.array(loc_diff_list[ind_bgn]) \
|
||||
+ np.array([start_ratio[0] * (size_list[ind_bgn][1] - patch_size[1]),
|
||||
- start_ratio[1] * (size_list[ind_bgn][0] - patch_size[0])]
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
end_loc = top_left_list[ind_bgn + 1] \
|
||||
+ (num_show_list[ind_bgn + 1] - 1) * np.array(
|
||||
loc_diff_list[ind_bgn + 1]) \
|
||||
+ np.array([end_ratio[0] * size_list[ind_bgn + 1][1],
|
||||
- end_ratio[1] * size_list[ind_bgn + 1][0]])
|
||||
|
||||
|
||||
patches.append(Rectangle(start_loc, patch_size[1], -patch_size[0]))
|
||||
colors.append(Dark)
|
||||
patches.append(Line2D([start_loc[0], end_loc[0]],
|
||||
[start_loc[1], end_loc[1]]))
|
||||
colors.append(Darker)
|
||||
patches.append(Line2D([start_loc[0] + patch_size[1], end_loc[0]],
|
||||
[start_loc[1], end_loc[1]]))
|
||||
colors.append(Darker)
|
||||
patches.append(Line2D([start_loc[0], end_loc[0]],
|
||||
[start_loc[1] - patch_size[0], end_loc[1]]))
|
||||
colors.append(Darker)
|
||||
patches.append(Line2D([start_loc[0] + patch_size[1], end_loc[0]],
|
||||
[start_loc[1] - patch_size[0], end_loc[1]]))
|
||||
colors.append(Darker)
|
||||
|
||||
|
||||
|
||||
def label(xy, text, xy_off=[0, 4]):
|
||||
plt.text(xy[0] + xy_off[0], xy[1] + xy_off[1], text,
|
||||
family='sans-serif', size=8)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
fc_unit_size = 2
|
||||
layer_width = 40
|
||||
flag_omit = False
|
||||
|
||||
patches = []
|
||||
colors = []
|
||||
|
||||
fig, ax = plt.subplots()
|
||||
|
||||
|
||||
############################
|
||||
# conv layers
|
||||
size_list = [(28, 28), (28, 28), (28,28), (14, 14), (14,14), (14,14), (7,7)]
|
||||
num_list = [1, 32, 32, 32, 64, 64, 64]
|
||||
x_diff_list = [0, layer_width, layer_width, layer_width, layer_width, layer_width, layer_width]
|
||||
text_list = ['Inputs'] + ['Feature\nmaps'] * (len(size_list) - 1)
|
||||
loc_diff_list = [[3, -3]] * len(size_list)
|
||||
|
||||
num_show_list = list(map(min, num_list, [NumConvMax] * len(num_list)))
|
||||
top_left_list = np.c_[np.cumsum(x_diff_list), np.zeros(len(x_diff_list))]
|
||||
|
||||
for ind in range(len(size_list)-1,-1,-1):
|
||||
if flag_omit:
|
||||
add_layer_with_omission(patches, colors, size=size_list[ind],
|
||||
num=num_list[ind],
|
||||
num_max=NumConvMax,
|
||||
num_dots=NumDots,
|
||||
top_left=top_left_list[ind],
|
||||
loc_diff=loc_diff_list[ind])
|
||||
else:
|
||||
add_layer(patches, colors, size=size_list[ind],
|
||||
num=num_show_list[ind],
|
||||
top_left=top_left_list[ind], loc_diff=loc_diff_list[ind])
|
||||
label(top_left_list[ind], text_list[ind] + '\n{}@{}x{}'.format(
|
||||
num_list[ind], size_list[ind][0], size_list[ind][1]))
|
||||
|
||||
############################
|
||||
# in between layers
|
||||
start_ratio_list = [[0.4, 0.5], [0.4, 0.8], [0.4,0.8], [0.4, 0.5], [0.4, 0.8],[0.4,0.8]]
|
||||
end_ratio_list = [[0.4, 0.5], [0.4, 0.8], [0.4,0.8], [0.4, 0.5], [0.4, 0.8],[0.4,0.8]]
|
||||
patch_size_list = [(3, 3), (3, 3), (2, 2), (3,3), (3, 3), (2, 2)]
|
||||
ind_bgn_list = range(len(patch_size_list))
|
||||
text_list = ['Conv.', 'Conv.', 'Max-pool.', 'Conv.', 'Conv.', 'Max-pool.']
|
||||
|
||||
for ind in range(len(patch_size_list)):
|
||||
add_mapping(
|
||||
patches, colors, start_ratio_list[ind], end_ratio_list[ind],
|
||||
patch_size_list[ind], ind,
|
||||
top_left_list, loc_diff_list, num_show_list, size_list)
|
||||
label(top_left_list[ind], text_list[ind] + '\n{}x{} kernel'.format(
|
||||
patch_size_list[ind][0], patch_size_list[ind][1]), xy_off=[26, -65]
|
||||
)
|
||||
|
||||
|
||||
############################
|
||||
# fully connected layers
|
||||
size_list = [(fc_unit_size, fc_unit_size)] * 2
|
||||
num_list = [256, 10]
|
||||
num_show_list = list(map(min, num_list, [NumFcMax] * len(num_list)))
|
||||
x_diff_list = [sum(x_diff_list) + layer_width, layer_width, layer_width]
|
||||
top_left_list = np.c_[np.cumsum(x_diff_list), np.zeros(len(x_diff_list))]
|
||||
loc_diff_list = [[fc_unit_size, -fc_unit_size]] * len(top_left_list)
|
||||
text_list = ['Hidden\nunits'] * (len(size_list) - 1) + ['Outputs']
|
||||
|
||||
for ind in range(len(size_list)):
|
||||
if flag_omit:
|
||||
add_layer_with_omission(patches, colors, size=size_list[ind],
|
||||
num=num_list[ind],
|
||||
num_max=NumFcMax,
|
||||
num_dots=NumDots,
|
||||
top_left=top_left_list[ind],
|
||||
loc_diff=loc_diff_list[ind])
|
||||
else:
|
||||
add_layer(patches, colors, size=size_list[ind],
|
||||
num=num_show_list[ind],
|
||||
top_left=top_left_list[ind],
|
||||
loc_diff=loc_diff_list[ind])
|
||||
label(top_left_list[ind], text_list[ind] + '\n{}'.format(
|
||||
num_list[ind]))
|
||||
|
||||
text_list = ['Flatten\n', 'Fully\nconnected']
|
||||
|
||||
for ind in range(len(size_list)):
|
||||
label(top_left_list[ind], text_list[ind], xy_off=[-10, -65])
|
||||
|
||||
############################
|
||||
for patch, color in zip(patches, colors):
|
||||
patch.set_color(color * np.ones(3))
|
||||
if isinstance(patch, Line2D):
|
||||
ax.add_line(patch)
|
||||
else:
|
||||
patch.set_edgecolor(Black * np.ones(3))
|
||||
ax.add_patch(patch)
|
||||
|
||||
plt.tight_layout()
|
||||
plt.axis('equal')
|
||||
plt.axis('off')
|
||||
# plt.show()
|
||||
fig.set_size_inches(8, 2.5)
|
||||
|
||||
fig_dir = '/home/tobi/Masterarbeit/TeX/Plots/Data/'
|
||||
fig_ext = '.pdf'
|
||||
fig.savefig(os.path.join(fig_dir, 'cnn_fashion_fig' + fig_ext),
|
||||
bbox_inches='tight', pad_inches=0)
|
TF/pashion.py | 74 | Normal file
@@ -0,0 +1,74 @@
import tensorflow as tf
|
||||
import numpy as np
|
||||
from tensorflow.keras.callbacks import CSVLogger
|
||||
from tensorflow.keras.preprocessing.image import ImageDataGenerator
|
||||
mnist = tf.keras.datasets.fashion_mnist
|
||||
|
||||
(x_train, y_train), (x_test, y_test) = mnist.load_data()
|
||||
x_train = x_train.reshape(x_train.shape[0], 28, 28, 1)
|
||||
x_test = x_test.reshape(x_test.shape[0], 28, 28, 1)
|
||||
x_train, x_test = x_train / 255.0, x_test / 255.0
|
||||
|
||||
#y_train = tf.keras.utils.to_categorical(y_train)
|
||||
y_test = tf.keras.utils.to_categorical(y_test)
|
||||
|
||||
def get_random_sample(a, b, number_of_samples=10):
|
||||
x = []
|
||||
y = []
|
||||
for category_number in range(0,10):
|
||||
# get all samples of a category
|
||||
train_data_category = a[b==category_number]
|
||||
# pick a number of random samples from the category
|
||||
train_data_category = train_data_category[np.random.randint(train_data_category.shape[0],
|
||||
size=number_of_samples), :]
|
||||
x.extend(train_data_category)
|
||||
y.append([category_number]*number_of_samples)
|
||||
|
||||
return np.asarray(x).reshape(-1, 28, 28, 1), np.asarray(y).reshape(10*number_of_samples,1)
|
||||
|
||||
for i in ['1']:
|
||||
|
||||
model = tf.keras.Sequential()
|
||||
|
||||
model.add(tf.keras.layers.Conv2D(filters = 32, kernel_size = (3, 3), activation='relu',
|
||||
input_shape = (28, 28, 1), padding='same'))
|
||||
model.add(tf.keras.layers.Conv2D(filters = 32, kernel_size = (2, 2), activation='relu', padding = 'same'))
|
||||
model.add(tf.keras.layers.MaxPool2D(strides=(2,2)))
|
||||
|
||||
model.add(tf.keras.layers.Conv2D(filters = 64, kernel_size = (3, 3), activation='relu', padding='same'))
|
||||
model.add(tf.keras.layers.Conv2D(filters = 64, kernel_size = (3, 3), activation='relu', padding='same'))
|
||||
model.add(tf.keras.layers.MaxPool2D(strides=(2,2)))
|
||||
|
||||
model.add(tf.keras.layers.Flatten())
|
||||
|
||||
model.add(tf.keras.layers.Dense(256, activation='relu'))
|
||||
model.add(tf.keras.layers.Dropout(0.2))
|
||||
model.add(tf.keras.layers.Dense(10, activation='softmax'))
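# this stack mirrors the architecture sketched in TF/cnn.py: two convolutional blocks (32 and 64 filters) with max-pooling, a 256-unit dense layer and a 10-class softmax output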
|
||||
|
||||
|
||||
model.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3), loss="categorical_crossentropy", metrics=["accuracy"])
|
||||
|
||||
|
||||
x_train_, y_train_ = get_random_sample(x_train, y_train, number_of_samples=100)
|
||||
y_train_ = tf.keras.utils.to_categorical(y_train_)
|
||||
print(np.shape(y_train_))
|
||||
|
||||
datagen = ImageDataGenerator(
|
||||
rotation_range = 15,
|
||||
zoom_range = 0.1,
|
||||
width_shift_range=2,
|
||||
height_shift_range=2,
|
||||
shear_range = 0.5,
|
||||
fill_mode = 'constant',
|
||||
cval = 0)
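# augmentation: small random rotations, zooms, shears and two-pixel shifts; pixels created at the borders are filled with the constant value 0 (black)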
|
||||
print(model.summary())
|
||||
#x_test_ = np.append(x_train[300:],x_test).reshape(x_train[300:].shape[0]+x_test.shape[0],28,28,1)
|
||||
#y_test_ = np.append(y_train[300:],y_test).reshape(y_train[300:].shape[0]+y_test.shape[0],10)
|
||||
|
||||
# csv_logger = CSVLogger('output/fashion_exacly_like_novatec__'+i+'.log')
|
||||
# history = model.fit(datagen.flow(x_train, tf.keras.utils.to_categorical(y_train), batch_size=20), validation_data=(x_test, y_test), epochs=125, steps_per_epoch = x_train_.shape[0]//20, callbacks=[csv_logger])
|
||||
# history = model.fit(datagen.flow(x_train, tf.keras.utils.to_categorical(y_train), batch_size=30),steps_per_epoch=2000,
|
||||
# validation_data=(x_test, y_test),
|
||||
# epochs=125, callbacks=[csv_logger],
|
||||
# shuffle=True)
|
||||
|
TF/random_sample.py | 60 | Normal file
@@ -0,0 +1,60 @@
import tensorflow as tf
|
||||
import numpy as np
|
||||
from tensorflow.keras.callbacks import CSVLogger
|
||||
from tensorflow.keras.preprocessing.image import ImageDataGenerator
|
||||
mnist = tf.keras.datasets.mnist
|
||||
|
||||
(x_train, y_train), (x_test, y_test) = mnist.load_data()
|
||||
x_train = x_train.reshape(x_train.shape[0], 28, 28, 1)
|
||||
x_train = x_train / 255.0
|
||||
x_test = x_test.reshape(x_test.shape[0], 28, 28, 1)
|
||||
x_test = x_test / 255.0
|
||||
|
||||
#y_train = tf.keras.utils.to_categorical(y_train)
|
||||
y_test = tf.keras.utils.to_categorical(y_test)
|
||||
|
||||
def get_random_sample(a, b, number_of_samples=10):
|
||||
x = []
|
||||
y = []
|
||||
for category_number in range(0,10):
|
||||
# get all samples of a category
|
||||
train_data_category = a[b==category_number]
|
||||
# pick a number of random samples from the category
|
||||
train_data_category = train_data_category[np.random.randint(train_data_category.shape[0],
|
||||
size=number_of_samples), :]
|
||||
x.extend(train_data_category)
|
||||
y.append([category_number]*number_of_samples)
|
||||
|
||||
return np.asarray(x).reshape(-1, 28, 28, 1), np.asarray(y).reshape(10*number_of_samples,1)
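# get_random_sample draws number_of_samples random images per digit class, returning a class-balanced subset of (images, labels)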
|
||||
for j in [0.0]:
|
||||
for i in ['1','2','3','4','5','6','7','8','9','0']:
|
||||
|
||||
model = tf.keras.models.Sequential()
|
||||
model.add(tf.keras.layers.Conv2D(24,kernel_size=5,padding='same',activation='relu',
|
||||
input_shape=(28,28,1)))
|
||||
model.add(tf.keras.layers.MaxPool2D())
|
||||
model.add(tf.keras.layers.Conv2D(64,kernel_size=5,padding='same',activation='relu'))
|
||||
model.add(tf.keras.layers.MaxPool2D(padding='same'))
|
||||
model.add(tf.keras.layers.Flatten())
|
||||
model.add(tf.keras.layers.Dense(256, activation='relu'))
|
||||
model.add(tf.keras.layers.Dropout(j))
|
||||
model.add(tf.keras.layers.Dense(10, activation='softmax'))
|
||||
model.compile(optimizer='adam', loss="categorical_crossentropy", metrics=["accuracy"])
|
||||
print(model.summary())
|
||||
for n in [10,100]:
|
||||
x_train_, y_train_ = get_random_sample(x_train, y_train, number_of_samples=n)
|
||||
y_train_ = tf.keras.utils.to_categorical(y_train_)
|
||||
|
||||
datagen = ImageDataGenerator(
|
||||
rotation_range = 30,
|
||||
zoom_range = 0.15,
|
||||
width_shift_range=2,
|
||||
height_shift_range=2,
|
||||
shear_range = 1)
|
||||
|
||||
#x_test_ = np.append(x_train[300:],x_test).reshape(x_train[300:].shape[0]+x_test.shape[0],28,28,1)
|
||||
#y_test_ = np.append(y_train[300:],y_test).reshape(y_train[300:].shape[0]+y_test.shape[0],10)
|
||||
|
||||
# csv_logger = CSVLogger('Sample/adam_dropout_'+str(j).replace('.',"")+'_'+str(n)+'_'+i+'.log')
|
||||
# history = model.fit(datagen.flow(x_train_, y_train_, batch_size=50), validation_data=(x_test, y_test), epochs=125, callbacks=[csv_logger], steps_per_epoch = x_train_.shape[0]//50)
|
||||
# history = model.fit(x_train_, y_train_, validation_data=(x_test, y_test), epochs=125, callbacks=[csv_logger])
|
@@ -17,6 +17,6 @@ model.compile(optimizer='adam',
               loss=loss_fn,
               metrics=['accuracy'])

-model.fit(x_train, y_train, epochs=5)
+model.fit(x_train, y_train, epochs=10)
TeX/#main.lof# | 37 | Normal file
@@ -0,0 +1,37 @@
|
||||
\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax
|
||||
\babel@toc {english}{}
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {2.1}{\ignorespaces Illustration of a neural network}}{2}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {2.2}{\ignorespaces Plots of the activation functions\relax }}{4}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {2.3}{\ignorespaces Structure of a single neuron\relax }}{4}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {3.1}{\ignorespaces Overfitting of shallow neural networks}}{10}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {3.2}{\ignorespaces Comparison of shallow neural networks and regression splines}}{21}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.1}{\ignorespaces Signal smoothing using convolution}}{23}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.2}{\ignorespaces Channel separation of color image}}{24}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.3}{\ignorespaces Convolution applied on image}}{25}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.4}{\ignorespaces MNIST data set}}{29}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.5}{\ignorespaces architecture\relax }}{29}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.6}{\ignorespaces Performance comparison of SGD and GD}}{30}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.7}{\ignorespaces Performance comparison of training algorithms}}{35}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.8}{\ignorespaces Image data generation}}{37}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.9}{\ignorespaces Performance comparison of overfitting measures}}{38}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.10}{\ignorespaces Fashion MNIST data set}}{39}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.11}{\ignorespaces \relax }}{41}%
|
||||
\defcounter {refsection}{0}\relax
|
||||
\contentsline {figure}{\numberline {4.12}{\ignorespaces Sample pictures of the MNIST fashion dataset, one per class.\relax }}{41}%
|
TeX/Figures/Data/min_max.txt | 58 | Executable file
@@ -0,0 +1,58 @@
datagen_dropout_02_1
|
||||
test
|
||||
0.6604& 0.5175& 0.60136& 0.002348447
|
||||
|
||||
datagen_dropout_00_1
|
||||
test
|
||||
0.6704& 0.4878& 0.58621& 0.003600539
|
||||
|
||||
dropout_02_1
|
||||
test
|
||||
0.5312& 0.4224& 0.47137& 0.001175149
|
||||
|
||||
default_1
|
||||
test
|
||||
0.5633& 0.3230& 0.45702& 0.004021449
|
||||
|
||||
datagen_dropout_02_10
|
||||
test
|
||||
0.9441& 0.9061& 0.92322& 0.00015
|
||||
train
|
||||
1& 0.97& 0.989& 1e-04
|
||||
|
||||
datagen_dropout_00_10
|
||||
test
|
||||
0.931& 0.9018& 0.9185& 6e-05
|
||||
train
|
||||
1& 0.97& 0.99& 0.00013
|
||||
|
||||
dropout_02_10
|
||||
test
|
||||
0.9423& 0.9081& 0.92696& 0.00013
|
||||
train
|
||||
1& 0.99& 0.992& 2e-05
|
||||
|
||||
default_10
|
||||
test
|
||||
0.8585& 0.8148& 0.83771& 0.00027
|
||||
train
|
||||
1& 1& 1& 0
|
||||
|
||||
datagen_dropout_02_100
|
||||
test
|
||||
0.9805& 0.9727& 0.97826& 0
|
||||
train
|
||||
|
||||
datagen_dropout_00_100
|
||||
test
|
||||
0.981& 0.9702& 0.9769& 1e-05
|
||||
train
|
||||
|
||||
dropout_02_100
|
||||
test
|
||||
0.9796& 0.9719& 0.97703& 1e-05
|
||||
train
|
||||
|
||||
default_100
|
||||
test
|
||||
0.9637& 0.9506& 0.95823& 2e-05
|
TeX/Figures/RN_vs_RS.tex | 141 | Normal file
@@ -0,0 +1,141 @@
\pgfplotsset{
|
||||
compat=1.11,
|
||||
legend image code/.code={
|
||||
\draw[mark repeat=2,mark phase=2]
|
||||
plot coordinates {
|
||||
(0cm,0cm)
|
||||
(0.075cm,0cm) %% default is (0.3cm,0cm)
|
||||
(0.15cm,0cm) %% default is (0.6cm,0cm)
|
||||
};%
|
||||
}
|
||||
}
|
||||
\begin{figure}
|
||||
\begin{subfigure}[b]{0.5\textwidth}
|
||||
\begin{subfigure}[b]{\textwidth}
|
||||
\begin{adjustbox}{width=\textwidth, height=0.25\textheight}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[
|
||||
ytick = {-1, 0, 1, 2},
|
||||
yticklabels = {$-1$, $\phantom{-0.}0$, $1$, $2$},]
|
||||
\addplot table [x=x, y=y, col sep=comma, only marks,
|
||||
forget plot] {Figures/Data/sin_6.csv};
|
||||
\addplot [black, line width=2pt] table [x=x, y=y, col
|
||||
sep=comma, mark=none] {Figures/Data/matlab_0.csv};
|
||||
\addplot [red, line width = 1.5pt, dashed] table [x=x_n_5000_tl_0.0,
|
||||
y=y_n_5000_tl_0.0, col sep=comma, mark=none] {Figures/Data/scala_out_sin.csv};
|
||||
\addlegendentry{$f_1^{*, 0.1}$};
|
||||
\addlegendentry{$\mathcal{RN}_w^{\tilde{\lambda}}$};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption{$\lambda = 0.1$}
|
||||
\end{subfigure}\\
|
||||
\begin{subfigure}[b]{\textwidth}
|
||||
\begin{adjustbox}{width=\textwidth, height=0.25\textheight}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}
|
||||
\addplot table [x=x, y=y, col sep=comma, only marks,
|
||||
forget plot] {Figures/Data/sin_6.csv};
|
||||
\addplot [black, line width=2pt] table [x=x, y=y, col sep=comma, mark=none] {Figures/Data/matlab_1.csv};
|
||||
\addplot [red, line width = 1.5pt, dashed] table [x=x_n_5000_tl_1.0,
|
||||
y=y_n_5000_tl_1.0, col sep=comma, mark=none] {Figures/Data/scala_out_sin.csv};
|
||||
\addlegendentry{$f_1^{*, 1.0}$};
|
||||
\addlegendentry{$\mathcal{RN}_w^{\tilde{\lambda}}$};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption{$\lambda = 1.0$}
|
||||
\end{subfigure}\\
|
||||
\begin{subfigure}[b]{\textwidth}
|
||||
\begin{adjustbox}{width=\textwidth, height=0.25\textheight}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}
|
||||
\addplot table [x=x, y=y, col sep=comma, only marks,
|
||||
forget plot] {Figures/Data/sin_6.csv};
|
||||
\addplot [black, line width=2pt] table [x=x, y=y, col sep=comma, mark=none] {Figures/Data/matlab_3.csv};
|
||||
\addplot [red, line width = 1.5pt, dashed] table [x=x_n_5000_tl_3.0,
|
||||
y=y_n_5000_tl_3.0, col sep=comma, mark=none] {Figures/Data/scala_out_sin.csv};
|
||||
\addlegendentry{$f_1^{*, 3.0}$};
|
||||
\addlegendentry{$\mathcal{RN}_w^{\tilde{\lambda}}$};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption{$\lambda = 3.0$}
|
||||
\end{subfigure}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[b]{0.5\textwidth}
|
||||
\begin{subfigure}[b]{\textwidth}
|
||||
\begin{adjustbox}{width=\textwidth, height=0.245\textheight}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[
|
||||
ytick = {-2,-1, 0, 1, 2},
|
||||
yticklabels = {$-2$,$-1$, $\phantom{-0.}0$, $1$, $2$},]
|
||||
\addplot table [x=x, y=y, col sep=comma, only marks,
|
||||
forget plot] {Figures/Data/data_sin_d_t.csv};
|
||||
\addplot [black, line width=2pt] table [x=x, y=y, col sep=comma, mark=none] {Figures/Data/matlab_sin_d_01.csv};
|
||||
\addplot [red, line width = 1.5pt, dashed] table [x=x_n_5000_tl_0.1,
|
||||
y=y_n_5000_tl_0.1, col sep=comma, mark=none] {Figures/Data/scala_out_d_1_t.csv};
|
||||
\addlegendentry{$f_1^{*, 0.1}$};
|
||||
\addlegendentry{$\mathcal{RN}_w^{\tilde{\lambda}}$};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption{$\lambda = 0.1$}
|
||||
\end{subfigure}\\
|
||||
\begin{subfigure}[b]{\textwidth}
|
||||
\begin{adjustbox}{width=\textwidth, height=0.25\textheight}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}
|
||||
\addplot table [x=x, y=y, col sep=comma, only marks,
|
||||
forget plot] {Figures/Data/data_sin_d_t.csv};
|
||||
\addplot [black, line width=2pt] table [x=x, y=y, col sep=comma, mark=none] {Figures/Data/matlab_sin_d_1.csv};
|
||||
\addplot [red, line width = 1.5pt, dashed] table [x=x_n_5000_tl_1.0,
|
||||
y=y_n_5000_tl_1.0, col sep=comma, mark=none] {Figures/Data/scala_out_d_1_t.csv};
|
||||
\addlegendentry{$f_1^{*, 1.0}$};
|
||||
\addlegendentry{$\mathcal{RN}_w^{\tilde{\lambda},*}$};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption{$\lambda = 1.0$}
|
||||
\end{subfigure}\\
|
||||
\begin{subfigure}[b]{\textwidth}
|
||||
\begin{adjustbox}{width=\textwidth, height=0.25\textheight}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}
|
||||
\addplot table [x=x, y=y, col sep=comma, only marks,
|
||||
forget plot] {Figures/Data/data_sin_d_t.csv};
|
||||
\addplot [black, line width=2pt] table [x=x, y=y, col sep=comma, mark=none] {Figures/Data/matlab_sin_d_3.csv};
|
||||
\addplot [red, line width = 1.5pt, dashed] table [x=x_n_5000_tl_3.0,
|
||||
y=y_n_5000_tl_3.0, col sep=comma, mark=none] {Figures/Data/scala_out_d_1_t.csv};
|
||||
\addlegendentry{$f_1^{*, 3.0}$};
|
||||
\addlegendentry{$\mathcal{RN}_w^{\tilde{\lambda}}$};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption{$\lambda = 3.0$}
|
||||
\end{subfigure}
|
||||
\end{subfigure}
|
||||
\caption[Comparison of shallow neural networks and regression
  splines]{% In these figures the behaviour stated in ... is
  % visualized in two examples. For $(a), (b), (c)$ six values of the
  % sine function, equidistantly spaced on $[-\pi, \pi]$, have been used
  % as training data. For $(d),(e),(f)$ 15 equidistant values have been
  % used, where $y_i^{train} = \sin(x_i^{train}) + \varepsilon_i$ and
  % $\varepsilon_i \sim \mathcal{N}(0, 0.3)$. For
  % $\mathcal{RN}_w^{\tilde{\lambda}, *}$ the random weights are
  % distributed as follows
  % \begin{align*}
  % \xi_k &\sim
  % \end{align*}
  Ridge penalized neural network compared to a regression spline,
  trained on $\text{data}_A$ in a), b), c) and on
  $\text{data}_B$ in d), e), f).
  The parameters of each are given above.
}
|
||||
\label{fig:rn_vs_rs}
|
||||
\end{figure}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master:
|
||||
%%% End:
|
TeX/Figures/SGD_vs_GD.tex | 93 | Normal file
@@ -0,0 +1,93 @@
\pgfplotsset{
|
||||
compat=1.11,
|
||||
legend image code/.code={
|
||||
\draw[mark repeat=2,mark phase=2]
|
||||
plot coordinates {
|
||||
(0cm,0cm)
|
||||
(0.0cm,0cm) %% default is (0.3cm,0cm)
|
||||
(0.0cm,0cm) %% default is (0.6cm,0cm)
|
||||
};%
|
||||
}
|
||||
}
|
||||
\begin{figure}
|
||||
\begin{subfigure}[h!]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.6\textwidth,
|
||||
xtick = {1, 3, 5,7,9,11,13,15,17,19},
|
||||
xticklabels = {$2$, $4$, $6$, $8$,
|
||||
$10$,$12$,$14$,$16$,$18$,$20$},
|
||||
xlabel = {training epoch}, ylabel = {classification accuracy}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma] {Figures/Data/GD_01.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma] {Figures/Data/GD_05.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma] {Figures/Data/GD_1.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma]
|
||||
{Figures/Data/SGD_01_b32.log};
|
||||
|
||||
\addlegendentry{GD$_{0.01}$}
|
||||
\addlegendentry{GD$_{0.05}$}
|
||||
\addlegendentry{GD$_{0.1}$}
|
||||
\addlegendentry{SGD$_{0.01}$}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
%\caption{Classification accuracy}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[b]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.6\textwidth,
|
||||
ytick = {0, 1, 2, 3, 4},
|
||||
yticklabels = {$0$, $1$, $\phantom{0.}2$, $3$, $4$},
|
||||
xtick = {1, 3, 5,7,9,11,13,15,17,19},
|
||||
xticklabels = {$2$, $4$, $6$, $8$,
|
||||
$10$,$12$,$14$,$16$,$18$,$20$},
|
||||
xlabel = {training epoch}, ylabel = {error measure\vphantom{fy}}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_loss, col sep=comma] {Figures/Data/GD_01.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_loss, col sep=comma] {Figures/Data/GD_05.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_loss, col sep=comma] {Figures/Data/GD_1.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_loss, col sep=comma] {Figures/Data/SGD_01_b32.log};
|
||||
|
||||
\addlegendentry{GD$_{0.01}$}
|
||||
\addlegendentry{GD$_{0.05}$}
|
||||
\addlegendentry{GD$_{0.1}$}
|
||||
\addlegendentry{SGD$_{0.01}$}
|
||||
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{Performance metrics during training}
|
||||
\end{subfigure}
|
||||
% \\~\\
|
||||
\caption[Performance comparison of SGD and GD]{The neural network given in ?? trained with different
  algorithms on the MNIST handwritten digits data set. For gradient
  descent the learning rates 0.01, 0.05 and 0.1 are used (GD$_{\cdot}$). For
  stochastic gradient descent a batch size of 32 and a learning rate
  of 0.01 is used (SGD$_{0.01}$).}
|
||||
\label{fig:sgd_vs_gd}
|
||||
\end{figure}
|
||||
|
||||
\begin{table}[h]
|
||||
\begin{tabu} to \textwidth {@{} *4{X[c]}c*4{X[c]} @{}}
|
||||
\multicolumn{4}{c}{Classification Accuracy}
|
||||
&~&\multicolumn{4}{c}{Error Measure}
|
||||
\\\cline{1-4}\cline{6-9}
|
||||
GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$&&GD$_{0.01}$&GD$_{0.05}$&GD$_{0.1}$&SGD$_{0.01}$
|
||||
\\\cline{1-4}\cline{6-9}
|
||||
\multicolumn{9}{c}{test}\\
|
||||
0.265&0.633&0.203&0.989&&2.267&1.947&3.91&0.032
|
||||
\end{tabu}
|
||||
\caption{Performance metrics of the networks trained in
|
||||
Figure~\ref{fig:sgd_vs_gd} after 20 training epochs.}
|
||||
\label{table:sgd_vs_gd}
|
||||
\end{table}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "../main"
|
||||
%%% End:
|
TeX/Figures/_region_.tex | 71 | Normal file
@@ -0,0 +1,71 @@
\message{ !name(pfg_test.tex)}\documentclass{article}
|
||||
\usepackage{pgfplots}
|
||||
\usepackage{filecontents}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{adjustbox}
|
||||
\usepackage{xcolor}
|
||||
\usepackage{graphicx}
|
||||
\usetikzlibrary{calc, 3d}
|
||||
|
||||
\begin{document}
|
||||
|
||||
\message{ !name(pfg_test.tex) !offset(6) }
|
||||
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption{True position (\textcolor{red}{red}), distorted data (black)}
|
||||
\end{figure}
|
||||
\begin{center}
|
||||
\begin{figure}[h]
|
||||
\begin{subfigure}{0.49\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/klammern.jpg}
|
||||
\caption{Original Picture}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.49\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/image_conv4.png}
|
||||
\caption{test}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.49\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/image_conv5.png}
|
||||
\caption{test}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.49\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/image_conv6.png}
|
||||
\caption{test}
|
||||
\end{subfigure}
|
||||
\end{figure}
|
||||
\end{center}
|
||||
|
||||
\begin{figure}
|
||||
\begin{adjustbox}{width=\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{scope}[x = (0:1cm), y=(90:1cm), z=(15:-0.5cm)]
|
||||
\node[canvas is xy plane at z=0, transform shape] at (0,0)
|
||||
{\includegraphics[width=5cm]{Data/klammern_r.jpg}};
|
||||
\node[canvas is xy plane at z=2, transform shape] at (0,-0.2)
|
||||
{\includegraphics[width=5cm]{Data/klammern_g.jpg}};
|
||||
\node[canvas is xy plane at z=4, transform shape] at (0,-0.4)
|
||||
{\includegraphics[width=5cm]{Data/klammern_b.jpg}};
|
||||
\node[canvas is xy plane at z=4, transform shape] at (-8,-0.2)
|
||||
{\includegraphics[width=5.3cm]{Data/klammern_rgb.jpg}};
|
||||
\end{scope}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption{On the right the red, green and blue channels of the picture
  are displayed. In order to better visualize the color channels, the
  black and white picture of each channel has been colored in the
  respective color. Combining the layers results in the image on the
  left.}
|
||||
\end{figure}
|
||||
|
||||
|
||||
|
||||
\message{ !name(pfg_test.tex) !offset(3) }
|
||||
|
||||
\end{document}
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: t
|
||||
%%% End:
|
TeX/Figures/fashion_mnist.tex | 53 | Normal file
@@ -0,0 +1,53 @@
\begin{figure}[h]
|
||||
\centering
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Figures/Data/fashion_mnist0.pdf}
|
||||
\caption{T-shirt/top}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Figures/Data/fashion_mnist1.pdf}
|
||||
\caption{Trousers}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Figures/Data/fashion_mnist2.pdf}
|
||||
\caption{Pullover}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Figures/Data/fashion_mnist3.pdf}
|
||||
\caption{Dress}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Figures/Data/fashion_mnist4.pdf}
|
||||
\caption{Coat}
|
||||
\end{subfigure}\\
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Figures/Data/fashion_mnist5.pdf}
|
||||
\caption{Sandal}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Figures/Data/fashion_mnist6.pdf}
|
||||
\caption{Shirt}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Figures/Data/fashion_mnist7.pdf}
|
||||
\caption{Sneaker}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Figures/Data/fashion_mnist8.pdf}
|
||||
\caption{Bag}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Figures/Data/fashion_mnist9.pdf}
|
||||
\caption{Ankle boot}
|
||||
\end{subfigure}
|
||||
\caption[Fashion MNIST data set]{The fashion MNIST data set contains 70.000
  preprocessed product images from Zalando, which are categorized as
  T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt,
  Sneaker, Bag and Ankle boot. Of these images 60.000 are used as training images, while
  the rest are used to validate the models trained.}
|
||||
\label{fig:fashionMNIST}
|
||||
\end{figure}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "../main"
|
||||
%%% End:
|
TeX/Figures/gen_dropout.tex | 83 | Normal file
@@ -0,0 +1,83 @@
\pgfplotsset{
|
||||
compat=1.11,
|
||||
legend image code/.code={
|
||||
\draw[mark repeat=2,mark phase=2]
|
||||
plot coordinates {
|
||||
(0cm,0cm)
|
||||
(0.15cm,0cm) %% default is (0.3cm,0cm)
|
||||
(0.3cm,0cm) %% default is (0.6cm,0cm)
|
||||
};%
|
||||
}
|
||||
}
|
||||
\begin{figure}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\small
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = 0.975\textwidth,
|
||||
height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width =1.25pt}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Figures/Data/adam_datagen_full_mean.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Figures/Data/adam_datagen_dropout_02_full_mean.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Figures/Data/adam_datagen_dropout_04_full_mean.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Figures/Data/adam_dropout_02_full_mean.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Figures/Data/adam_dropout_04_full_mean.log};
|
||||
\addplot [dashed] table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Figures/Data/adam_full_mean.log};
|
||||
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.4}}
|
||||
\addlegendentry{\footnotesize{D. 0.2}}
|
||||
\addlegendentry{\footnotesize{D. 0.4}}
|
||||
\addlegendentry{\footnotesize{Default}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{Classification accuracy}
|
||||
\vspace{.25cm}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[h]{1.0\linewidth}
|
||||
\begin{tabu} to \textwidth {@{}lc*5{X[c]}@{}}
|
||||
\Tstrut \Bstrut & \textsc{\,Adam\,} & D. 0.2 & D. 0.4 & G. &G.+D.\,0.2 & G.+D.\,0.4 \\
|
||||
\hline
|
||||
\multicolumn{7}{c}{Test Accuracy}\Bstrut \\
|
||||
\cline{2-7}
|
||||
mean \Tstrut & 0.9914 & 0.9923 & 0.9930 & 0.9937 & 0.9938 & 0.9943 \\
|
||||
max & 0.9926 & 0.9930 & 0.9934 & 0.9946 & 0.9955 & 0.9956 \\
|
||||
min & 0.9887 & 0.9909 & 0.9922 & 0.9929 & 0.9929 & 0.9934 \\
|
||||
\hline
|
||||
\multicolumn{7}{c}{Training Accuracy}\Bstrut \\
|
||||
\cline{2-7}
|
||||
mean \Tstrut & 0.9994 & 0.9991 & 0.9989 & 0.9967 & 0.9954 & 0.9926 \\
|
||||
max & 0.9996 & 0.9996 & 0.9992 & 0.9979 & 0.9971 & 0.9937 \\
|
||||
min & 0.9992 & 0.9990 & 0.9984 & 0.9947 & 0.9926 & 0.9908 \\
|
||||
\end{tabu}
|
||||
\caption{Mean, maximum and minimum accuracy after 48 epochs of training.}
|
||||
\label{fig:gen_dropout_b}
|
||||
\end{subfigure}
|
||||
\caption[Performance comparison of overfitting measures]{Accuracy for the net given in ... with Dropout (D.),
  data generation (G.), a combination, or neither (Default) implemented and trained
  with \textsc{Adam}. For each epoch either all 60.000 training samples
  were used or, with data generation, 10.000 steps each using a
  batch of 60 generated data points. For each configuration the
  model was trained 5 times and the average accuracies at each epoch
  are given in (a). Mean, maximum and minimum values of accuracy on
  the test and training set are given in (b).}
|
||||
\label{fig:gen_dropout}
|
||||
\end{figure}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "../main"
|
||||
%%% End:
|
TeX/Figures/mnist.tex | 41 | Normal file
@@ -0,0 +1,41 @@
\begin{figure}[h]
|
||||
\centering
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist0.pdf}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist1.pdf}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist2.pdf}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist3.pdf}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist4.pdf}
|
||||
\end{subfigure}\\
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist5.pdf}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist6.pdf}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist7.pdf}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist8.pdf}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist9.pdf}
|
||||
\end{subfigure}
|
||||
\caption[MNIST data set]{The MNIST data set contains 70.000 images of preprocessed handwritten
|
||||
digits. Of these images 60.000 are used as training images, while
|
||||
the rest are used to validate the models trained.}
|
||||
\label{fig:MNIST}
|
||||
\end{figure}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "../main"
|
||||
%%% End:
|
TeX/Figures/pfg_test.tex | 297 | Normal file
@@ -0,0 +1,297 @@
\documentclass[a4paper, 12pt, draft=true]{article}
|
||||
\usepackage{pgfplots}
|
||||
\usepackage{filecontents}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{adjustbox}
|
||||
\usepackage{xcolor}
|
||||
\usepackage{tabu}
|
||||
\usepackage{showframe}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{titlecaps}
|
||||
\usetikzlibrary{calc, 3d}
|
||||
\usepgfplotslibrary{colorbrewer}
|
||||
|
||||
\newcommand\Tstrut{\rule{0pt}{2.6ex}} % = `top' strut
|
||||
\newcommand\Bstrut{\rule[-0.9ex]{0pt}{0pt}} % = `bottom' strut
|
||||
|
||||
\begin{document}
|
||||
\pgfplotsset{
|
||||
compat=1.11,
|
||||
legend image code/.code={
|
||||
\draw[mark repeat=2,mark phase=2]
|
||||
plot coordinates {
|
||||
(0cm,0cm)
|
||||
(0.3cm,0cm) %% default is (0.3cm,0cm)
|
||||
(0.6cm,0cm) %% default is (0.6cm,0cm)
|
||||
};%
|
||||
}
|
||||
}
|
||||
\begin{figure}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}]
|
||||
% \addplot [dashed] table
|
||||
% [x=epoch, y=accuracy, col sep=comma, mark = none]
|
||||
% {Data/adam_datagen_full.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_1.mean};
|
||||
% \addplot [dashed] table
|
||||
% [x=epoch, y=accuracy, col sep=comma, mark = none]
|
||||
% {Data/adam_datagen_dropout_02_full.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_1.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_02_1.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_02_1.mean};
|
||||
|
||||
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.4}}
|
||||
\addlegendentry{\footnotesize{D. 0.2}}
|
||||
\addlegendentry{\footnotesize{D. 0.4}}
|
||||
\addlegendentry{\footnotesize{Default}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{1 sample per class}
|
||||
\vspace{0.25cm}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_00_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_02_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_00_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_02_10.mean};
|
||||
|
||||
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.4}}
|
||||
\addlegendentry{\footnotesize{D. 0.2}}
|
||||
\addlegendentry{\footnotesize{D. 0.4}}
|
||||
\addlegendentry{\footnotesize{Default}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{10 samples per class}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth,
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
xlabel = {epoch}, ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}, ymin = {0.92}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_00_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_02_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_00_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_02_100.mean};
|
||||
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.4}}
|
||||
\addlegendentry{\footnotesize{D. 0.2}}
|
||||
\addlegendentry{\footnotesize{D. 0.4}}
|
||||
\addlegendentry{\footnotesize{Default}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{100 samples per class}
|
||||
\vspace{.25cm}
|
||||
\end{subfigure}
|
||||
\caption{Accuracy for the net given in ... with Dropout (D.),
|
||||
data generation (G.), a combination, or neither (Default) implemented and trained
|
||||
with \textsc{Adam}. For each epoch the 60.000 training samples
|
||||
were used, or for data generation 10.000 steps with each using
|
||||
batches of 60 generated data points. For each configuration the
|
||||
model was trained 5 times and the average accuracies at each epoch
|
||||
are given in (a). Mean, maximum and minimum values of accuracy on
|
||||
the test and training set are given in (b).}
|
||||
\end{figure}
|
||||
\begin{table}
|
||||
\centering
|
||||
\begin{tabu} to \textwidth {@{}l*4{X[c]}@{}}
|
||||
\Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max \Tstrut & 0.5633 & 0.5312 & 0.6704 & 0.6604 \\
|
||||
min & 0.3230 & 0.4224 & 0.4878 & 0.5175 \\
|
||||
mean & 0.4570 & 0.4714 & 0.5862 & 0.6014 \\
|
||||
var & 0.0040 & 0.0012 & 0.0036 & 0.0023 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max \Tstrut & 0.8585 & 0.9423 & 0.9310 & 0.9441 \\
|
||||
min & 0.8148 & 0.9081 & 0.9018 & 0.9061 \\
|
||||
mean & 0.8377 & 0.9270 & 0.9185 & 0.9232 \\
|
||||
var & 2.7e-4 & 1.3e-4 & 6e-05 & 1.5e-4 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max & 0.9637 & 0.9796 & 0.9810 & 0.9805 \\
|
||||
min & 0.9506 & 0.9719 & 0.9702 & 0.9727 \\
|
||||
mean & 0.9582 & 0.9770 & 0.9769 & 0.9783 \\
|
||||
var & 2e-05 & 1e-05 & 1e-05 & 0 \\
|
||||
\hline
|
||||
\end{tabu}
|
||||
\caption{Values of the test accuracy for the model trained 10 times
on random training sets containing 1, 10 and 100 data points per
class.}
|
||||
\end{table}
|
||||
|
||||
\begin{center}
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/mnist0.pdf}
|
||||
\caption{original\\image}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/mnist_gen_zoom.pdf}
|
||||
\caption{random\\zoom}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/mnist_gen_shear.pdf}
|
||||
\caption{random\\shear}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/mnist_gen_rotation.pdf}
|
||||
\caption{random\\rotation}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/mnist_gen_shift.pdf}
|
||||
\caption{random\\positional shift}
|
||||
\end{subfigure}\\
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/mnist5.pdf}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/mnist6.pdf}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/mnist7.pdf}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/mnist8.pdf}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Data/mnist9.pdf}
|
||||
\end{subfigure}
|
||||
\caption{The MNIST data set contains 70,000 images of preprocessed handwritten
digits. Of these images 60,000 are used as training images, while
the rest are used to validate the models trained.}
|
||||
\end{figure}
|
||||
\end{center}
|
||||
|
||||
\begin{figure}
|
||||
\begin{adjustbox}{width=\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{scope}[x = (0:1cm), y=(90:1cm), z=(15:-0.5cm)]
|
||||
\node[canvas is xy plane at z=0, transform shape] at (0,0)
|
||||
{\includegraphics[width=5cm]{Data/klammern_r.jpg}};
|
||||
\node[canvas is xy plane at z=2, transform shape] at (0,-0.2)
|
||||
{\includegraphics[width=5cm]{Data/klammern_g.jpg}};
|
||||
\node[canvas is xy plane at z=4, transform shape] at (0,-0.4)
|
||||
{\includegraphics[width=5cm]{Data/klammern_b.jpg}};
|
||||
\node[canvas is xy plane at z=4, transform shape] at (-8,-0.2)
|
||||
{\includegraphics[width=5.3cm]{Data/klammern_rgb.jpg}};
|
||||
\end{scope}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption{On the right the red, green and blue channels of the picture
are displayed. In order to better visualize the color channels the
black and white picture of each channel has been colored in the
respective color. Combining the layers results in the image on the
left.}
|
||||
\end{figure}
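The separation of an image into its color channels as described in the caption above can be sketched in a few lines of NumPy; the array name and size below are made up for illustration and are not taken from the data used in the figure.
\begin{lstlisting}[language=iPython]
import numpy as np

# img: hypothetical RGB image as an array of shape (height, width, 3)
img = np.random.randint(0, 256, size=(64, 64, 3), dtype=np.uint8)

red, green, blue = img[..., 0], img[..., 1], img[..., 2]

# stacking the channels along the last axis recovers the original image
recombined = np.stack([red, green, blue], axis=-1)
assert np.array_equal(img, recombined)
\end{lstlisting}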
|
||||
|
||||
\begin{figure}
|
||||
\centering
|
||||
\begin{subfigure}{\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{Data/convnet_fig.pdf}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{.45\linewidth}
|
||||
\centering
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[enlargelimits=false, width=\textwidth]
|
||||
\addplot[domain=-5:5, samples=100]{tanh(x)};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{.45\linewidth}
|
||||
\centering
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[enlargelimits=false, width=\textwidth,
|
||||
ytick={0,2,4},yticklabels={\hphantom{4.}0,2,4}, ymin=-1]
|
||||
\addplot[domain=-5:5, samples=100]{max(0,x)};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{.45\linewidth}
|
||||
\centering
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[enlargelimits=false, width=\textwidth, ymin=-1,
|
||||
ytick={0,2,4},yticklabels={$\hphantom{-5.}0$,2,4}]
|
||||
\addplot[domain=-5:5, samples=100]{max(0,x)+ 0.1*min(0,x)};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\end{subfigure}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[enlargelimits=false]
|
||||
\addplot [domain=-5:5, samples=101,unbounded coords=jump]{1/(1+exp(-x))};
|
||||
\addplot[domain=-5:5, samples=100]{tanh(x)};
|
||||
\addplot[domain=-5:5, samples=100]{max(0,x)};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[enlargelimits=false]
|
||||
\addplot[domain=-2*pi:2*pi, samples=100]{cos(deg(x))};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
|
||||
\end{document}
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: t
|
||||
%%% End:
|
78
TeX/Figures/sdg_comparison.tex
Normal file
@ -0,0 +1,78 @@
|
||||
\pgfplotsset{
|
||||
compat=1.11,
|
||||
legend image code/.code={
|
||||
\draw[mark repeat=2,mark phase=2]
|
||||
plot coordinates {
|
||||
(0cm,0cm)
|
||||
(0.0cm,0cm) %% default is (0.3cm,0cm)
|
||||
(0.0cm,0cm) %% default is (0.6cm,0cm)
|
||||
};%
|
||||
}
|
||||
}
|
||||
\begin{figure}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.6\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east},
|
||||
xlabel = {epoch}, ylabel = {Classification Accuracy}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Figures/Data/adagrad.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Figures/Data/adadelta.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Figures/Data/adam.log};
|
||||
|
||||
\addlegendentry{\footnotesize{ADAGRAD}}
|
||||
\addlegendentry{\footnotesize{ADADELTA}}
|
||||
\addlegendentry{\footnotesize{ADAM}}
|
||||
\addlegendentry{SGD$_{0.01}$}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
%\caption{Classification accuracy}
|
||||
\vspace{.25cm}
|
||||
\end{subfigure}
|
||||
% \begin{subfigure}[b]{\textwidth}
|
||||
% \begin{tikzpicture}
|
||||
% \begin{axis}[tick style = {draw = none}, width = \textwidth,
|
||||
% height = 0.6\textwidth, ymax = 0.5,
|
||||
% xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels =
|
||||
% {0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
|
||||
% \addplot table
|
||||
% [x=epoch, y=val_loss, col sep=comma, mark = none] {Figures/Data/adagrad.log};
|
||||
% \addplot table
|
||||
% [x=epoch, y=val_loss, col sep=comma, mark = none] {Figures/Data/adadelta.log};
|
||||
% \addplot table
|
||||
% [x=epoch, y=val_loss, col sep=comma, mark = none] {Figures/Data/adam.log};
|
||||
|
||||
% \addlegendentry{\footnotesize{ADAGRAD}}
|
||||
% \addlegendentry{\footnotesize{ADADELTA}}
|
||||
% \addlegendentry{\footnotesize{ADAM}}
|
||||
% \addlegendentry{SGD$_{0.01}$}
|
||||
|
||||
% \end{axis}
|
||||
% \end{tikzpicture}
|
||||
% \caption{Performance metrics during training}
|
||||
% \vspace{.25cm}
|
||||
% \end{subfigure}
|
||||
\begin{subfigure}[b]{1.0\linewidth}
|
||||
\begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
|
||||
\multicolumn{3}{c}{Classification Accuracy}
|
||||
&~&\multicolumn{3}{c}{Error Measure}
|
||||
\\\cline{1-3}\cline{5-7}
|
||||
ADAGRAD&ADADELTA&ADAM&&ADAGRAD&ADADELTA&ADAM
|
||||
\\\cline{1-3}\cline{5-7}
|
||||
1&1&1&&1&1&1
|
||||
\end{tabu}
|
||||
\caption{Performance metrics after 20 epochs}
|
||||
\end{subfigure}
|
||||
\caption[Performance comparison of training algorithms]{Classification accuracy on the test set and ... Performance metrics of the network given in ... trained
with different optimization algorithms}
|
||||
\label{fig:comp_alg}
|
||||
\end{figure}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "../main"
|
||||
%%% End:
|
45
TeX/Figures/sin_conv.tex
Normal file
@ -0,0 +1,45 @@
|
||||
\begin{figure}
|
||||
\centering
|
||||
\begin{subfigure}[b]{0.49\textwidth}
|
||||
\centering
|
||||
\begin{adjustbox}{width=\textwidth, height=0.25\textheight}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[tick style = {draw = none}, xticklabel = \empty,
|
||||
yticklabel=\empty]
|
||||
\addplot [mark options={scale = 0.7}, mark = o] table
|
||||
[x=x_d,y=y_d, col sep = comma] {Figures/Data/sin_conv.csv};
|
||||
\addplot [red, mark=x] table [x=x_i, y=y_i, col sep=comma] {Figures/Data/sin_conv.csv};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption{True position (\textcolor{red}{red}), distorted position data (black)}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[b]{0.49\textwidth}
|
||||
\centering
|
||||
\begin{adjustbox}{width=\textwidth, height=0.25\textheight}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[tick style = {draw = none}, xticklabel = \empty,
|
||||
yticklabel=\empty]
|
||||
\addplot [mark options={scale = 0.7}, mark = o] table [x=x,y=y, col
|
||||
sep = comma] {Figures/Data/sin_conv.csv};
|
||||
\addplot [red, mark=x] table [x=x_i, y=y_i, col sep=comma] {Figures/Data/sin_conv.csv};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption{True position (\textcolor{red}{red}), filtered position data (black)}
|
||||
\end{subfigure}
|
||||
\caption[Signal smoothing using convolution]{Example of noise reduction using convolution with simulated
positional data. The filter
$g(i)=\left(\nicefrac{1}{3},\nicefrac{1}{4},\nicefrac{1}{5},\nicefrac{1}{6},\nicefrac{1}{20}\right)_{(i-1)}$
is chosen and applied to the $x$ and $y$ coordinate
data separately. The convolution of both signals with $g$
improves the MSE of the positions from 0.196 to 0.170 and
visibly smooths the data.
}
|
||||
\label{fig:sin_conv}
|
||||
\end{figure}
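A minimal NumPy sketch of this kind of smoothing is given below; the noisy signal is simulated here and is not the data shown in the figure.
\begin{lstlisting}[language=iPython]
import numpy as np

# filter g from the caption; the weights sum to exactly one
g = np.array([1/3, 1/4, 1/5, 1/6, 1/20])

# simulated noisy positional data
t = np.linspace(0, 2 * np.pi, 100)
x_noisy = np.sin(t) + np.random.normal(scale=0.1, size=t.size)

# discrete convolution with g, keeping the original signal length
x_filtered = np.convolve(x_noisy, g, mode='same')
\end{lstlisting}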
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "../main"
|
||||
%%% End:
|
175
TeX/Figures/test.tex
Normal file
@ -0,0 +1,175 @@
|
||||
\documentclass{report}
|
||||
\usepackage[utf8]{inputenc}
|
||||
\usepackage[english]{babel}
|
||||
\usepackage[T1]{fontenc}
|
||||
|
||||
\usepackage{xcolor}
|
||||
\definecolor{maroon}{cmyk}{0, 0.87, 0.68, 0.32}
|
||||
\definecolor{halfgray}{gray}{0.55}
|
||||
\definecolor{ipython_frame}{RGB}{207, 207, 207}
|
||||
\definecolor{ipython_bg}{RGB}{247, 247, 247}
|
||||
\definecolor{ipython_red}{RGB}{186, 33, 33}
|
||||
\definecolor{ipython_green}{RGB}{0, 128, 0}
|
||||
\definecolor{ipython_cyan}{RGB}{64, 128, 128}
|
||||
\definecolor{ipython_purple}{RGB}{170, 34, 255}
|
||||
|
||||
\usepackage{listings}
|
||||
\lstset{
|
||||
breaklines=true,
|
||||
%
|
||||
extendedchars=true,
|
||||
literate=
|
||||
{á}{{\'a}}1 {é}{{\'e}}1 {í}{{\'i}}1 {ó}{{\'o}}1 {ú}{{\'u}}1
|
||||
{Á}{{\'A}}1 {É}{{\'E}}1 {Í}{{\'I}}1 {Ó}{{\'O}}1 {Ú}{{\'U}}1
|
||||
{à}{{\`a}}1 {è}{{\`e}}1 {ì}{{\`i}}1 {ò}{{\`o}}1 {ù}{{\`u}}1
|
||||
{À}{{\`A}}1 {È}{{\'E}}1 {Ì}{{\`I}}1 {Ò}{{\`O}}1 {Ù}{{\`U}}1
|
||||
{ä}{{\"a}}1 {ë}{{\"e}}1 {ï}{{\"i}}1 {ö}{{\"o}}1 {ü}{{\"u}}1
|
||||
{Ä}{{\"A}}1 {Ë}{{\"E}}1 {Ï}{{\"I}}1 {Ö}{{\"O}}1 {Ü}{{\"U}}1
|
||||
{â}{{\^a}}1 {ê}{{\^e}}1 {î}{{\^i}}1 {ô}{{\^o}}1 {û}{{\^u}}1
|
||||
{Â}{{\^A}}1 {Ê}{{\^E}}1 {Î}{{\^I}}1 {Ô}{{\^O}}1 {Û}{{\^U}}1
|
||||
{œ}{{\oe}}1 {Œ}{{\OE}}1 {æ}{{\ae}}1 {Æ}{{\AE}}1 {ß}{{\ss}}1
|
||||
{ç}{{\c c}}1 {Ç}{{\c C}}1 {ø}{{\o}}1 {å}{{\r a}}1 {Å}{{\r A}}1
|
||||
{€}{{\EUR}}1 {£}{{\pounds}}1
|
||||
}
|
||||
|
||||
%%
|
||||
%% Python definition (c) 1998 Michael Weber
|
||||
%% Additional definitions (2013) Alexis Dimitriadis
|
||||
%% modified by me (should not have empty lines)
|
||||
%%
|
||||
\lstdefinelanguage{iPython}{
|
||||
morekeywords={access,and,break,class,continue,def,del,elif,else,except,exec,finally,for,from,global,if,import,in,is,lambda,not,or,pass,print,raise,return,try,while},%
|
||||
%
|
||||
% Built-ins
|
||||
morekeywords=[2]{abs,all,any,basestring,bin,bool,bytearray,callable,chr,classmethod,cmp,compile,complex,delattr,dict,dir,divmod,enumerate,eval,execfile,file,filter,float,format,frozenset,getattr,globals,hasattr,hash,help,hex,id,input,int,isinstance,issubclass,iter,len,list,locals,long,map,max,memoryview,min,next,object,oct,open,ord,pow,property,range,raw_input,reduce,reload,repr,reversed,round,set,setattr,slice,sorted,staticmethod,str,sum,super,tuple,type,unichr,unicode,vars,xrange,zip,apply,buffer,coerce,intern},%
|
||||
%
|
||||
sensitive=true,%
|
||||
morecomment=[l]\#,%
|
||||
morestring=[b]',%
|
||||
morestring=[b]",%
|
||||
%
|
||||
morestring=[s]{'''}{'''},% used for documentation text (multiline strings)
|
||||
morestring=[s]{"""}{"""},% added by Philipp Matthias Hahn
|
||||
%
|
||||
morestring=[s]{r'}{'},% `raw' strings
|
||||
morestring=[s]{r"}{"},%
|
||||
morestring=[s]{r'''}{'''},%
|
||||
morestring=[s]{r"""}{"""},%
|
||||
morestring=[s]{u'}{'},% unicode strings
|
||||
morestring=[s]{u"}{"},%
|
||||
morestring=[s]{u'''}{'''},%
|
||||
morestring=[s]{u"""}{"""},%
|
||||
%
|
||||
% {replace}{replacement}{length of replace}
|
||||
% *{-}{-}{1} will not replace in comments and so on
|
||||
literate=
|
||||
{á}{{\'a}}1 {é}{{\'e}}1 {í}{{\'i}}1 {ó}{{\'o}}1 {ú}{{\'u}}1
|
||||
{Á}{{\'A}}1 {É}{{\'E}}1 {Í}{{\'I}}1 {Ó}{{\'O}}1 {Ú}{{\'U}}1
|
||||
{à}{{\`a}}1 {è}{{\`e}}1 {ì}{{\`i}}1 {ò}{{\`o}}1 {ù}{{\`u}}1
|
||||
{À}{{\`A}}1 {È}{{\'E}}1 {Ì}{{\`I}}1 {Ò}{{\`O}}1 {Ù}{{\`U}}1
|
||||
{ä}{{\"a}}1 {ë}{{\"e}}1 {ï}{{\"i}}1 {ö}{{\"o}}1 {ü}{{\"u}}1
|
||||
{Ä}{{\"A}}1 {Ë}{{\"E}}1 {Ï}{{\"I}}1 {Ö}{{\"O}}1 {Ü}{{\"U}}1
|
||||
{â}{{\^a}}1 {ê}{{\^e}}1 {î}{{\^i}}1 {ô}{{\^o}}1 {û}{{\^u}}1
|
||||
{Â}{{\^A}}1 {Ê}{{\^E}}1 {Î}{{\^I}}1 {Ô}{{\^O}}1 {Û}{{\^U}}1
|
||||
{œ}{{\oe}}1 {Œ}{{\OE}}1 {æ}{{\ae}}1 {Æ}{{\AE}}1 {ß}{{\ss}}1
|
||||
{ç}{{\c c}}1 {Ç}{{\c C}}1 {ø}{{\o}}1 {å}{{\r a}}1 {Å}{{\r A}}1
|
||||
{€}{{\EUR}}1 {£}{{\pounds}}1
|
||||
%
|
||||
{^}{{{\color{ipython_purple}\^{}}}}1
|
||||
{=}{{{\color{ipython_purple}=}}}1
|
||||
%
|
||||
{+}{{{\color{ipython_purple}+}}}1
|
||||
{*}{{{\color{ipython_purple}$^\ast$}}}1
|
||||
{/}{{{\color{ipython_purple}/}}}1
|
||||
%
|
||||
{+=}{{{+=}}}1
|
||||
{-=}{{{-=}}}1
|
||||
{*=}{{{$^\ast$=}}}1
|
||||
{/=}{{{/=}}}1,
|
||||
literate=
|
||||
*{-}{{{\color{ipython_purple}-}}}1
|
||||
{?}{{{\color{ipython_purple}?}}}1,
|
||||
%
|
||||
identifierstyle=\color{black}\ttfamily,
|
||||
commentstyle=\color{ipython_cyan}\ttfamily,
|
||||
stringstyle=\color{ipython_red}\ttfamily,
|
||||
keepspaces=true,
|
||||
showspaces=false,
|
||||
showstringspaces=false,
|
||||
%
|
||||
rulecolor=\color{ipython_frame},
|
||||
frame=single,
|
||||
frameround={t}{t}{t}{t},
|
||||
framexleftmargin=6mm,
|
||||
numbers=left,
|
||||
numberstyle=\tiny\color{halfgray},
|
||||
%
|
||||
%
|
||||
backgroundcolor=\color{ipython_bg},
|
||||
% extendedchars=true,
|
||||
basicstyle=\scriptsize,
|
||||
keywordstyle=\color{ipython_green}\ttfamily,
|
||||
}
|
||||
|
||||
\begin{document}
|
||||
\begin{lstlisting}[language=iPython]
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
from tensorflow.keras.callbacks import CSVLogger
|
||||
from tensorflow.keras.preprocessing.image import ImageDataGenerator
|
||||
|
||||
mnist = tf.keras.datasets.mnist
|
||||
|
||||
(x_train, y_train), (x_test, y_test) = mnist.load_data()
|
||||
x_train = x_train.reshape(x_train.shape[0], 28, 28, 1)
|
||||
x_train = x_train / 255.0
|
||||
x_test = x_test.reshape(x_test.shape[0], 28, 28, 1)
|
||||
x_test = x_test / 255.0
|
||||
|
||||
y_train = tf.keras.utils.to_categorical(y_train)
|
||||
y_test = tf.keras.utils.to_categorical(y_test)
|
||||
|
||||
model = tf.keras.models.Sequential()
|
||||
model.add(tf.keras.layers.Conv2D(24,kernel_size=5,padding='same',activation='relu',input_shape=(28,28,1)))
|
||||
model.add(tf.keras.layers.MaxPool2D())
|
||||
model.add(tf.keras.layers.Conv2D(64,kernel_size=5,padding='same',activation='relu'))
|
||||
model.add(tf.keras.layers.MaxPool2D(padding='same'))
|
||||
model.add(tf.keras.layers.Flatten())
|
||||
model.add(tf.keras.layers.Dense(256, activation='relu'))
|
||||
model.add(tf.keras.layers.Dropout(j))  # j: dropout rate of the run (0, 0.2 or 0.4)
|
||||
model.add(tf.keras.layers.Dense(10, activation='softmax'))
|
||||
model.compile(optimizer='adam', loss="categorical_crossentropy",
|
||||
metrics=["accuracy"])
|
||||
|
||||
datagen = ImageDataGenerator(
|
||||
rotation_range = 30,
|
||||
zoom_range = 0.15,
|
||||
width_shift_range=2,
|
||||
height_shift_range=2,
|
||||
shear_range = 1)
|
||||
|
||||
csv_logger = CSVLogger(<Target File>)
|
||||
|
||||
history = model.fit(datagen.flow(x_train_, y_train_, batch_size=50),
|
||||
validation_data=(x_test, y_test), epochs=125,
|
||||
callbacks=[csv_logger],
|
||||
steps_per_epoch = x_train_.shape[0]//50)
|
||||
|
||||
\end{lstlisting}
|
||||
\begin{lstlisting}[language=iPython]
|
||||
def get_random_sample(a, b, number_of_samples=10):
|
||||
x = []
|
||||
y = []
|
||||
for category_number in range(0,10):
|
||||
# get all samples of a category
|
||||
train_data_category = a[b==category_number]
|
||||
# pick a number of random samples from the category
|
||||
train_data_category = train_data_category[np.random.randint(
|
||||
train_data_category.shape[0], size=number_of_samples), :]
|
||||
x.extend(train_data_category)
|
||||
y.append([category_number]*number_of_samples)
|
||||
|
||||
return (np.asarray(x).reshape(-1, 28, 28, 1),
|
||||
np.asarray(y).reshape(10*number_of_samples,1))
|
||||
\end{lstlisting}
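A hypothetical usage sketch for the helper above, showing how reduced training sets such as \texttt{x\_train\_} and \texttt{y\_train\_} referenced in the first listing could be drawn; it assumes \texttt{y\_train} was already one-hot encoded as above.
\begin{lstlisting}[language=iPython]
# recover integer labels from the one-hot encoding
labels = np.argmax(y_train, axis=1)
# draw 10 random samples per class
x_train_, y_train_ = get_random_sample(x_train, labels, number_of_samples=10)
y_train_ = tf.keras.utils.to_categorical(y_train_, num_classes=10)
\end{lstlisting}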
|
||||
\end{document}
|
5
TeX/Figures/y.tex
Normal file
@ -0,0 +1,5 @@
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "../main"
|
||||
%%% End:
|
58
TeX/Plots/Data/min_max.txt
Executable file
@ -0,0 +1,58 @@
|
||||
datagen_dropout_02_1
|
||||
test
|
||||
0.6604& 0.5175& 0.60136& 0.002348447
|
||||
|
||||
datagen_dropout_00_1
|
||||
test
|
||||
0.6704& 0.4878& 0.58621& 0.003600539
|
||||
|
||||
dropout_02_1
|
||||
test
|
||||
0.5312& 0.4224& 0.47137& 0.001175149
|
||||
|
||||
default_1
|
||||
test
|
||||
0.5633& 0.3230& 0.45702& 0.004021449
|
||||
|
||||
datagen_dropout_02_10
|
||||
test
|
||||
0.9441& 0.9061& 0.92322& 0.00015
|
||||
train
|
||||
1& 0.97& 0.989& 1e-04
|
||||
|
||||
datagen_dropout_00_10
|
||||
test
|
||||
0.931& 0.9018& 0.9185& 6e-05
|
||||
train
|
||||
1& 0.97& 0.99& 0.00013
|
||||
|
||||
dropout_02_10
|
||||
test
|
||||
0.9423& 0.9081& 0.92696& 0.00013
|
||||
train
|
||||
1& 0.99& 0.992& 2e-05
|
||||
|
||||
default_10
|
||||
test
|
||||
0.8585& 0.8148& 0.83771& 0.00027
|
||||
train
|
||||
1& 1& 1& 0
|
||||
|
||||
datagen_dropout_02_100
|
||||
test
|
||||
0.9805& 0.9727& 0.97826& 0
|
||||
train
|
||||
|
||||
datagen_dropout_00_100
|
||||
test
|
||||
0.981& 0.9702& 0.9769& 1e-05
|
||||
train
|
||||
|
||||
dropout_02_100
|
||||
test
|
||||
0.9796& 0.9719& 0.97703& 1e-05
|
||||
train
|
||||
|
||||
default_100
|
||||
test
|
||||
0.9637& 0.9506& 0.95823& 2e-05
|
@ -115,7 +115,9 @@ plot coordinates {
|
||||
\caption{$\lambda = 3.0$}
|
||||
\end{subfigure}
|
||||
\end{subfigure}
|
||||
\caption{% In these Figures the behaviour stated in ... is visualized
|
||||
\caption[Comparison of shallow neural networks and regression
|
||||
splines]{% In these Figures the behaviour stated in ... is
|
||||
% visualized
|
||||
% in two examples. For $(a), (b), (c)$ six values of sinus equidistantly
|
||||
% spaced on $[-\pi, \pi]$ have been used as training data. For
|
||||
% $(d),(e),(f)$ 15 equidistant values have been used, where
|
||||
@ -131,6 +133,7 @@ plot coordinates {
|
||||
$\text{data}_B$ in d), e), f).
|
||||
The Parameters of each are given above.
|
||||
}
|
||||
\label{fig:rn_vs_rs}
|
||||
\end{figure}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
|
@ -65,7 +65,7 @@ plot coordinates {
|
||||
\caption{Performance metrics during training}
|
||||
\end{subfigure}
|
||||
% \\~\\
|
||||
\caption{The neural network given in ?? trained with different
|
||||
\caption[Performance comparison of SGD and GD]{The neural network given in ?? trained with different
algorithms on the MNIST handwritten digits data set. For gradient
descent the learning rates 0.01, 0.05 and 0.1 are used (GD$_{\cdot}$). For
stochastic gradient descent a batch size of 32 and learning rate
|
||||
|
@ -40,7 +40,7 @@
|
||||
\includegraphics[width=\textwidth]{Plots/Data/fashion_mnist9.pdf}
|
||||
\caption{Ankle boot}
|
||||
\end{subfigure}
|
||||
\caption{The fashtion MNIST data set contains 70.000 images of
|
||||
\caption[Fashion MNIST data set]{The fashion MNIST data set contains 70,000 images of
preprocessed product images from Zalando, which are categorized as
T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt,
Sneaker, Bag, Ankle boot. Of these images 60,000 are used as training images, while
|
||||
|
@ -51,7 +51,7 @@ plot coordinates {
|
||||
\begin{tabu} to \textwidth {@{}lc*5{X[c]}@{}}
|
||||
\Tstrut \Bstrut & \textsc{\,Adam\,} & D. 0.2 & D. 0.4 & G. &G.+D.\,0.2 & G.+D.\,0.4 \\
|
||||
\hline
|
||||
\multicolumn{7}{c}{Classification Accuracy}\Bstrut \\
|
||||
\multicolumn{7}{c}{Test Accuracy}\Bstrut \\
|
||||
\cline{2-7}
|
||||
mean \Tstrut & 0.9914 & 0.9923 & 0.9930 & 0.9937 & 0.9938 & 0.9943 \\
|
||||
max & 0.9926 & 0.9930 & 0.9934 & 0.9946 & 0.9955 & 0.9956 \\
|
||||
@ -64,8 +64,9 @@ plot coordinates {
|
||||
min & 0.9992 & 0.9990 & 0.9984 & 0.9947 & 0.9926 & 0.9908 \\
|
||||
\end{tabu}
|
||||
\caption{Mean and maximum accuracy after 48 epochs of training.}
|
||||
\label{fig:gen_dropout_b}
|
||||
\end{subfigure}
|
||||
\caption{Accuracy for the net given in ... with Dropout (D.),
|
||||
\caption[Performance comparison of overfitting measures]{Accuracy for the net given in ... with Dropout (D.),
|
||||
data generation (G.), a combination, or neither (Default) implemented and trained
|
||||
with \textsc{Adam}. For each epoch the 60,000 training samples
were used, or, for data generation, 10,000 steps each using
|
||||
@ -73,6 +74,7 @@ plot coordinates {
|
||||
model was trained 5 times and the average accuracies at each epoch
|
||||
are given in (a). Mean, maximum and minimum values of accuracy on
|
||||
the test and training set are given in (b).}
|
||||
\label{fig:gen_dropout}
|
||||
\end{figure}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
|
@ -30,7 +30,7 @@
|
||||
\begin{subfigure}{0.19\textwidth}
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist9.pdf}
|
||||
\end{subfigure}
|
||||
\caption{The MNIST data set contains 70.000 images of preprocessed handwritten
|
||||
\caption[MNIST data set]{The MNIST data set contains 70,000 images of preprocessed handwritten
digits. Of these images 60,000 are used as training images, while
|
||||
the rest are used to validate the models trained.}
|
||||
\label{fig:MNIST}
|
||||
|
@ -5,7 +5,9 @@
|
||||
\usepackage{adjustbox}
|
||||
\usepackage{xcolor}
|
||||
\usepackage{tabu}
|
||||
\usepackage{showframe}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{titlecaps}
|
||||
\usetikzlibrary{calc, 3d}
|
||||
\usepgfplotslibrary{colorbrewer}
|
||||
|
||||
@ -29,33 +31,29 @@ plot coordinates {
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.6\textwidth, ymin = 0.988, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
xlabel = {epoch}, ylabel = {Classification Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width =1.25pt}]
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}]
|
||||
% \addplot [dashed] table
|
||||
% [x=epoch, y=accuracy, col sep=comma, mark = none]
|
||||
% {Data/adam_datagen_full.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_full_mean.log};
|
||||
{Data/adam_1.mean};
|
||||
% \addplot [dashed] table
|
||||
% [x=epoch, y=accuracy, col sep=comma, mark = none]
|
||||
% {Data/adam_datagen_dropout_02_full.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_02_full_mean.log};
|
||||
{Data/adam_datagen_1.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_04_full_mean.log};
|
||||
{Data/adam_datagen_dropout_02_1.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_02_full_mean.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_04_full_mean.log};
|
||||
\addplot [dashed] table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_full_mean.log};
|
||||
{Data/adam_dropout_02_1.mean};
|
||||
|
||||
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||
@ -65,26 +63,72 @@ plot coordinates {
|
||||
\addlegendentry{\footnotesize{Default}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{Classification accuracy}
|
||||
\vspace{.25cm}
|
||||
\caption{1 sample per class}
|
||||
\vspace{0.25cm}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[h]{1.0\linewidth}
|
||||
\begin{tabu} to \textwidth {@{}lc*5{X[c]}@{}}
|
||||
\Tstrut \Bstrut & \textsc{\,Adam\,} & D. 0.2 & D. 0.4 & G. &G.+D.\,0.2 & G.+D.\,0.4 \\
|
||||
\hline
|
||||
\multicolumn{7}{c}{Classification Accuracy}\Bstrut \\
|
||||
\cline{2-7}
|
||||
mean \Tstrut & 0.9914 & 0.9923 & 0.9930 & 0.9937 & 0.9938 & 0.9943 \\
|
||||
max & 0.9926 & 0.9930 & 0.9934 & 0.9946 & 0.9955 & 0.9956 \\
|
||||
min & 0.9887 & 0.9909 & 0.9922 & 0.9929 & 0.9929 & 0.9934 \\
|
||||
\hline
|
||||
\multicolumn{7}{c}{Training Accuracy}\Bstrut \\
|
||||
\cline{2-7}
|
||||
mean \Tstrut & 0.9994 & 0.9991 & 0.9989 & 0.9967 & 0.9954 & 0.9926 \\
|
||||
max & 0.9996 & 0.9996 & 0.9992 & 0.9979 & 0.9971 & 0.9937 \\
|
||||
min & 0.9992 & 0.9990 & 0.9984 & 0.9947 & 0.9926 & 0.9908 \\
|
||||
\end{tabu}
|
||||
\caption{Mean and maximum accuracy after 48 epochs of training.}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_00_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_02_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_00_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_02_10.mean};
|
||||
|
||||
|
||||
\addlegendentry{\footnotesize{Default}}
\addlegendentry{\footnotesize{D. 0.2}}
\addlegendentry{\footnotesize{G.}}
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{10 samples per class}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth,
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
xlabel = {epoch}, ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}, ymin = {0.92}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_00_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_dropout_02_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_00_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Data/adam_datagen_dropout_02_100.mean};
|
||||
|
||||
\addlegendentry{\footnotesize{Default}}
\addlegendentry{\footnotesize{D. 0.2}}
\addlegendentry{\footnotesize{G.}}
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{100 samples per class}
|
||||
\vspace{.25cm}
|
||||
\end{subfigure}
|
||||
\caption{Accuracy for the net given in ... with Dropout (D.),
|
||||
data generation (G.), a combination, or neither (Default) implemented and trained
|
||||
@ -95,6 +139,40 @@ plot coordinates {
|
||||
are given in (a). Mean, maximum and minimum values of accuracy on
|
||||
the test and training set are given in (b).}
|
||||
\end{figure}
|
||||
\begin{table}
|
||||
\centering
|
||||
\begin{tabu} to \textwidth {@{}l*4{X[c]}@{}}
|
||||
\Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max \Tstrut & 0.5633 & 0.5312 & 0.6704 & 0.6604 \\
|
||||
min & 0.3230 & 0.4224 & 0.4878 & 0.5175 \\
|
||||
mean & 0.4570 & 0.4714 & 0.5862 & 0.6014 \\
|
||||
var & 0.0040 & 0.0012 & 0.0036 & 0.0023 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max \Tstrut & 0.8585 & 0.9423 & 0.9310 & 0.9441 \\
|
||||
min & 0.8148 & 0.9081 & 0.9018 & 0.9061 \\
|
||||
mean & 0.8377 & 0.9270 & 0.9185 & 0.9232 \\
|
||||
var & 2.7e-4 & 1.3e-4 & 6e-05 & 1.5e-4 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max & 0.9637 & 0.9796 & 0.9810 & 0.9805 \\
|
||||
min & 0.9506 & 0.9719 & 0.9702 & 0.9727 \\
|
||||
mean & 0.9582 & 0.9770 & 0.9769 & 0.9783 \\
|
||||
var & 2e-05 & 1e-05 & 1e-05 & 0 \\
|
||||
\hline
|
||||
\end{tabu}
|
||||
\caption{Values of the test accuracy for the model trained 10 times
on random training sets containing 1, 10 and 100 data points per
class.}
|
||||
\end{table}
|
||||
|
||||
\begin{center}
|
||||
\begin{figure}[h]
|
||||
|
@ -10,7 +10,7 @@ plot coordinates {
|
||||
}
|
||||
}
|
||||
\begin{figure}
|
||||
\begin{subfigure}[b]{\textwidth}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.6\textwidth, ymin = 0.92, legend style={at={(0.9825,0.75)},anchor=north east},
|
||||
@ -32,30 +32,31 @@ plot coordinates {
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
%\caption{Classification accuracy}
|
||||
\vspace{.25cm}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[b]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.6\textwidth, ymax = 0.5,
|
||||
xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels =
|
||||
{0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adagrad.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adadelta.log};
|
||||
\addplot table
|
||||
[x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adam.log};
|
||||
% \begin{subfigure}[b]{\textwidth}
|
||||
% \begin{tikzpicture}
|
||||
% \begin{axis}[tick style = {draw = none}, width = \textwidth,
|
||||
% height = 0.6\textwidth, ymax = 0.5,
|
||||
% xlabel = {epoch}, ylabel = {Error Measure\vphantom{y}},ytick ={0,0.1,0.2,0.3,0.4,0.45,0.5}, yticklabels =
|
||||
% {0,0.1,0.2,0.3,0.4,\phantom{0.94},0.5}]
|
||||
% \addplot table
|
||||
% [x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adagrad.log};
|
||||
% \addplot table
|
||||
% [x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adadelta.log};
|
||||
% \addplot table
|
||||
% [x=epoch, y=val_loss, col sep=comma, mark = none] {Plots/Data/adam.log};
|
||||
|
||||
\addlegendentry{\footnotesize{ADAGRAD}}
|
||||
\addlegendentry{\footnotesize{ADADELTA}}
|
||||
\addlegendentry{\footnotesize{ADAM}}
|
||||
\addlegendentry{SGD$_{0.01}$}
|
||||
% \addlegendentry{\footnotesize{ADAGRAD}}
|
||||
% \addlegendentry{\footnotesize{ADADELTA}}
|
||||
% \addlegendentry{\footnotesize{ADAM}}
|
||||
% \addlegendentry{SGD$_{0.01}$}
|
||||
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{Performance metrics during training}
|
||||
\end{subfigure}
|
||||
\\~\\
|
||||
% \end{axis}
|
||||
% \end{tikzpicture}
|
||||
% \caption{Performance metrics during training}
|
||||
% \vspace{.25cm}
|
||||
% \end{subfigure}
|
||||
\begin{subfigure}[b]{1.0\linewidth}
|
||||
\begin{tabu} to \textwidth {@{} *3{X[c]}c*3{X[c]} @{}}
|
||||
\multicolumn{3}{c}{Classification Accuracy}
|
||||
@ -67,8 +68,9 @@ plot coordinates {
|
||||
\end{tabu}
|
||||
\caption{Performance metrics after 20 epochs}
|
||||
\end{subfigure}
|
||||
\caption{Classification accuracy on the test set and ...Performance metrics of the network given in ... trained
|
||||
\caption[Performance comparison of training algorithms]{Classification accuracy on the test set and ... Performance metrics of the network given in ... trained
|
||||
with different optimization algorithms}
|
||||
\label{fig:comp_alg}
|
||||
\end{figure}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
|
@ -28,7 +28,7 @@
|
||||
\end{adjustbox}
|
||||
\caption{True position (\textcolor{red}{red}), filtered position data (black)}
|
||||
\end{subfigure}
|
||||
\caption{Example for noise reduction using convolution with simulated
|
||||
\caption[Signal smoothing using convolution]{Example for noise reduction using convolution with simulated
|
||||
positional data. As filter
|
||||
$g(i)=\left(\nicefrac{1}{3},\nicefrac{1}{4},\nicefrac{1}{5},\nicefrac{1}{6},\nicefrac{1}{20}\right)_{(i-1)}$
|
||||
is chosen and applied to the $x$ and $y$ coordinate
|
||||
|
@ -176,4 +176,29 @@ url={https://openreview.net/forum?id=rkgz2aEKDr}
|
||||
timestamp = {Thu, 25 Jul 2019 14:25:37 +0200},
|
||||
biburl = {https://dblp.org/rec/journals/corr/KingmaB14.bib},
|
||||
bibsource = {dblp computer science bibliography, https://dblp.org}
|
||||
}
|
||||
|
||||
@article{transfer_learning,
|
||||
author = {Zhao,Wei},
|
||||
title = {Research on the deep learning of the small sample data based on transfer learning},
|
||||
journal = {AIP Conference Proceedings},
|
||||
volume = {1864},
|
||||
number = {1},
|
||||
pages = {020018},
|
||||
year = {2017},
|
||||
doi = {10.1063/1.4992835},
|
||||
URL = {https://aip.scitation.org/doi/abs/10.1063/1.4992835},
|
||||
eprint = {https://aip.scitation.org/doi/pdf/10.1063/1.4992835}
|
||||
}
|
||||
|
||||
@article{gan,
|
||||
title = "GAN-based synthetic medical image augmentation for increased CNN performance in liver lesion classification",
|
||||
journal = "Neurocomputing",
|
||||
volume = 321,
|
||||
pages = "321 - 331",
|
||||
year = 2018,
|
||||
issn = "0925-2312",
|
||||
doi = "https://doi.org/10.1016/j.neucom.2018.09.013",
|
||||
url = "http://www.sciencedirect.com/science/article/pii/S0925231218310749",
|
||||
author = "Maayan Frid-Adar and Idit Diamant and Eyal Klang and Michal Amitai and Jacob Goldberger and Hayit Greenspan"
|
||||
}
|
@ -85,7 +85,7 @@ channel (color) $c$ to the respective value $v$
|
||||
\end{scope}
|
||||
\end{tikzpicture}
|
||||
\end{adjustbox}
|
||||
\caption{On the right the red, green and blue chances of the picture
|
||||
\caption[Channel separation of color image]{On the right the red, green and blue channels of the picture
|
||||
are displayed. In order to better visualize the color channels the
|
||||
black and white picture of each channel has been colored in the
|
||||
respective color. Combining the layers results in the image on the
|
||||
@ -177,7 +177,7 @@ wise. Examples of convolution with both kernels are given in Figure~\ref{fig:img
|
||||
% \includegraphics[width=\textwidth]{Plots/Data/image_conv6.png}
|
||||
% \caption{test}
|
||||
% \end{subfigure}
|
||||
\caption{Convolution of original greyscale Image (a) with different
|
||||
\caption[Convolution applied on image]{Convolution of original greyscale image (a) with different
|
||||
kernels. In (b) and (c) Gaussian kernels of size 11 and stated
|
||||
$\sigma^2$ are used. In (d) - (f) the above defined Sobel Operator
|
||||
kernels are used.}
|
||||
@ -186,7 +186,7 @@ wise. Examples of convolution with both kernels are given in Figure~\ref{fig:img
|
||||
\clearpage
|
||||
\newpage
|
||||
\subsection{Convolutional NN}
|
||||
\todo{Introduction to CNNs}
|
||||
\todo{Introduction to CNNs, amount of parameters}
|
||||
% Conventional neural network as described in chapter .. are made up of
|
||||
% fully connected layers, meaning each node in a layer is influenced by
|
||||
% all nodes of the previous layer. If one wants to extract information
|
||||
@ -219,11 +219,11 @@ The usage of multiple filters results in multiple outputs of the same
|
||||
size as the input. These are often called channels. Depending on the
|
||||
size of the filters this can result in the dimension of the output
|
||||
being one larger than the input.
|
||||
However for convolutional layers following a convolutional layer the
|
||||
However for convolutional layers that are preceded by convolutional layers the
|
||||
size of the filter is often chosen to coincide with the amount of channels
|
||||
of the output of the previous layer without using padding in this
|
||||
direction in order to prevent gaining additional
|
||||
dimensions\todo{odd} in the output.
|
||||
dimensions\todo{explain filters spanning the full depth better} in the output.
|
||||
This can also be used to flatten certain less interesting channels of
the input, such as, for example, the color channels.
Thus filters used in convolutional networks usually have the same
|
||||
@ -264,11 +264,11 @@ reduced in size by extracting a single value from a
|
||||
neighborhood \todo{moving...}... . The resulting output size is dependent on
|
||||
the offset of the neighborhoods used. A popular choice is max-pooling, where the
largest value in a neighborhood is used.
|
||||
|
||||
This construct allows for extraction of features from the input while
|
||||
using far less input variables.
|
||||
|
||||
... \todo{Example with a small image, ideally the one from above}
|
||||
\todo{small graphic}
|
||||
The combination of convolution and pooling layers allows for
|
||||
extraction of features from the input in the form of feature maps while
|
||||
using relatively few parameters that need to be trained.
|
||||
\todo{Example of feature maps}
|
||||
|
||||
\subsubsection{Parallels to the Visual Cortex in Mammals}
|
||||
|
||||
@ -447,11 +447,15 @@ algorithm (\textsc{AdaGrad}, \textcite{ADAGRAD})
|
||||
laying the groundwork. Here for each parameter update the learning rate
is given by a constant
$\gamma$ divided by the sum of the squares of the past partial
|
||||
derivatives in this parameter. This results in a monotonously
|
||||
decreasing learning rate for each parameter. This results in a faster
|
||||
decaying learning rate for parameters with large updates, where as
|
||||
derivatives in this parameter. This results in a monotonically decaying
learning rate with faster
decay for parameters with large updates, whereas
|
||||
parameters with small updates experience smaller decay. The \textsc{AdaGrad}
|
||||
algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
|
||||
algorithm is given in Algorithm~\ref{alg:ADAGRAD}. Note that while
|
||||
this algorithm is still based upon the idea of gradient descent it no
|
||||
longer takes steps in the direction of the gradient while
|
||||
updating. Due to the individual learning rates for each parameter, only
the direction/sign of the update for each single parameter remains the same.
|
||||
|
||||
\begin{algorithm}[H]
|
||||
\SetAlgoLined
|
||||
@ -461,29 +465,64 @@ algorithm is given in Algorithm~\ref{alg:ADAGRAD}.
|
||||
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
|
||||
Compute Gradient: $g_t$\;
|
||||
Compute Update: $\Delta x_{t,i} \leftarrow
|
||||
-\frac{\gamma}{\norm{g_{1:t,i}}_2 + \varepsilon} g_t, \forall i =
|
||||
-\frac{\gamma}{\norm{g_{1:t,i}}_2 + \varepsilon} g_{t,i}, \forall i =
|
||||
1, \dots,p$\;
|
||||
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
|
||||
}
|
||||
\caption{\textls{\textsc{AdaGrad}}}
|
||||
\caption{\textsc{AdaGrad}}
|
||||
\label{alg:ADAGRAD}
|
||||
\end{algorithm}
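For illustration, the per-parameter update of \textsc{AdaGrad} can be written as a few lines of NumPy. This is only a sketch under our own naming (\texttt{adagrad\_update}, \texttt{grad\_history}); it is not the implementation used for the experiments below.
\begin{lstlisting}[language=iPython]
import numpy as np

def adagrad_update(x, grad_history, g, gamma=0.01, eps=1e-8):
    # accumulate the squared past gradients g_1,...,g_t per parameter
    grad_history = grad_history + g**2
    # scale the constant rate gamma by the root of the accumulated squares
    x = x - gamma / (np.sqrt(grad_history) + eps) * g
    return x, grad_history
\end{lstlisting}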
|
||||
|
||||
Building on \textsc{AdaGrad} \textcite{ADADELTA} developed the ... (\textsc{AdaDelta})
|
||||
Building on \textsc{AdaGrad} \textcite{ADADELTA} developed the
|
||||
\textsc{AdaDelta} algorithm
|
||||
in order to improve upon the two main drawbacks of \textsc{AdaGrad}, being the
|
||||
continual decay of the learning rate and the need for a manually
|
||||
selected global learning rate $\gamma$.
|
||||
As \textsc{AdaGrad} uses division by the accumulated squared gradients, the learning rate will
eventually become vanishingly small.
|
||||
In order to ensure that even after a significant number of iterations
|
||||
learning continues to make progress instead of summing the gradients a
|
||||
exponentially decaying average of the past gradients is used to ....
|
||||
learning continues to make progress, instead of summing the squared gradients an
exponentially decaying average of the past squared gradients is used for
regularizing the learning rate, resulting in
|
||||
\begin{align*}
|
||||
E[g^2]_t & = \rho E[g^2]_{t-1} + (1-\rho) g_t^2, \\
|
||||
\Delta x_t & = -\frac{\gamma}{\sqrt{E[g^2]_t + \varepsilon}} g_t,
|
||||
\end{align*}
|
||||
for a decay rate $\rho$.
|
||||
Additionally the fixed global learning rate $\gamma$ is substituted by
|
||||
an exponentially decaying average of the past parameter updates.
|
||||
The usage of the past parameter updates is motivated by ensuring that
|
||||
if the parameter vector had some hypothetical units they would be matched
|
||||
by those of the parameter update $\Delta x_t$. This proper
|
||||
\todo{explain unit}
|
||||
hypothetical units of the parameter vector match those of the
|
||||
parameter update $\Delta x_t$. When only using the
|
||||
gradient with a scalar learning rate as in SGD, the resulting unit of
|
||||
the parameter update is:
|
||||
\[
|
||||
\text{units of } \Delta x \propto \text{units of } g \propto
|
||||
\frac{\partial f}{\partial x} \propto \frac{1}{\text{units of } x},
|
||||
\]
|
||||
assuming the cost function $f$ is unitless. \textsc{AdaGrad} does not
have correct units either, since the update is given by a ratio of gradient
quantities, resulting in a unitless parameter update. If however
|
||||
Hessian information or an approximation thereof is used to scale the
|
||||
gradients the unit of the updates will be correct:
|
||||
\[
|
||||
\text{units of } \Delta x \propto H^{-1} g \propto
|
||||
\frac{\frac{\partial f}{\partial x}}{\frac{\partial ^2 f}{\partial
|
||||
x^2}} \propto \text{units of } x
|
||||
\]
|
||||
Since using the second derivative results in correct units, Newton's
|
||||
method (assuming a diagonal Hessian) is rearranged to determine the
|
||||
quantities involved in the inverse of the second derivative:
|
||||
\[
|
||||
\Delta x = \frac{\frac{\partial f}{\partial x}}{\frac{\partial ^2
|
||||
f}{\partial x^2}} \iff \frac{1}{\frac{\partial^2 f}{\partial
|
||||
x^2}} = \frac{\Delta x}{\frac{\partial f}{\partial x}}.
|
||||
\]
|
||||
As the root mean square of the past gradients is already used in the
denominator of the learning rate, an exponentially decaying root mean
square of the past updates is used to obtain a $\Delta x$ quantity for
the numerator, resulting in the correct unit of the update. The full
algorithm is given by Algorithm~\ref{alg:adadelta}.
|
||||
|
||||
\begin{algorithm}[H]
|
||||
\SetAlgoLined
|
||||
@ -501,23 +540,24 @@ by these of the parameter update $\Delta x_t$. This proper
|
||||
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
|
||||
}
|
||||
\caption{\textsc{AdaDelta}, \textcite{ADADELTA}}
|
||||
\label{alg:gd}
|
||||
\label{alg:adadelta}
|
||||
\end{algorithm}
|
||||
|
||||
While the stochastic gradient algorithm is less susceptible to local
|
||||
While the stochastic gradient algorithm is less susceptible to getting
|
||||
stuck in local
|
||||
extrema than gradient descent, the problem still persists, especially
|
||||
with saddle points. \textcite{DBLP:journals/corr/Dauphinpgcgb14}
|
||||
for saddle points with steep .... \textcite{DBLP:journals/corr/Dauphinpgcgb14}
|
||||
|
||||
An approach to the problem of ``getting stuck'' in saddle points or
local minima/maxima is the addition of momentum to SGD. Instead of
using the actual gradient for the parameter update, an average over the
past gradients is used. In order to avoid the need to store the past
values, usually an exponentially decaying average is used, resulting in
|
||||
Algorithm~\ref{alg_momentum}. This is comparable of following the path
|
||||
of a marble with mass rolling down the SLOPE of the error
|
||||
function. The decay rate for the average is comparable to the TRÄGHEIT
|
||||
Algorithm~\ref{alg:sgd_m}. This is comparable to following the path
|
||||
of a marble with mass rolling down the slope of the error
|
||||
function. The decay rate for the average is comparable to the inertia
|
||||
of the marble.
|
||||
This results in the algorithm being able to escape ... due to the
|
||||
This results in the algorithm being able to escape some local extrema due to the
|
||||
built-up momentum gained while approaching it.
|
||||
|
||||
% \begin{itemize}
|
||||
@ -539,14 +579,26 @@ build up momentum from approaching it.
|
||||
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
|
||||
}
|
||||
\caption{SGD with momentum}
|
||||
\label{alg:gd}
|
||||
\label{alg:sgd_m}
|
||||
\end{algorithm}
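A sketch of one SGD step with momentum as described above; the decay rate $\beta$ and the names are ours, and the exact formulation in Algorithm~\ref{alg:sgd_m} may differ slightly.
\begin{lstlisting}[language=iPython]
import numpy as np

def sgd_momentum_update(x, velocity, g, gamma=0.01, beta=0.9):
    # exponentially decaying average of the past gradients
    velocity = beta * velocity + (1 - beta) * g
    # step along the averaged direction instead of the raw gradient
    return x - gamma * velocity, velocity
\end{lstlisting}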
|
||||
|
||||
In an effort to combine the properties of the momentum method and the
|
||||
automatically adapted learning rate of \textsc{AdaDelta}, \textcite{ADAM}
|
||||
developed the \textsc{Adam} algorithm. The
|
||||
|
||||
Problems / Improvements ADAM \textcite{rADAM}
|
||||
developed the \textsc{Adam} algorithm, given in
|
||||
Algorithm~\ref{alg:adam}. Here the exponentially decaying
|
||||
root mean square of the gradients is still used for scaling the update and
|
||||
combined with the momentum method. Both terms are normalized such that
|
||||
the ... are the first and second moment of the gradient. However the term used in
|
||||
\textsc{AdaDelta} to ensure correct units is dropped for a scalar
|
||||
global learning rate. This results in .. hyperparameters, however the
|
||||
algorithm seems to be exceptionally stable with the recommended
|
||||
parameters of ... and is a very reliable algorithm for training
|
||||
neural networks.
|
||||
However the \textsc{Adam} algorithm can have problems with high
|
||||
variance of the adaptive learning rate early in training.
|
||||
\textcite{rADAM} try to address these issues with the Rectified Adam
|
||||
algorithm
|
||||
\todo{do I want to include this?}
|
||||
|
||||
|
||||
\begin{algorithm}[H]
|
||||
@ -556,21 +608,27 @@ Problems / Improvements ADAM \textcite{rADAM}
|
||||
Initialize accumulation variables $m_0 = 0$, $v_0 = 0$\;
|
||||
\For{$t \in \left\{1,\dots,T\right\};\, t+1$}{
|
||||
Compute Gradient: $g_t$\;
|
||||
Accumulate first and second Moment of the Gradient:
|
||||
\begin{align*}
|
||||
m_t &\leftarrow \beta_1 m_{t-1} + (1 - \beta_1) g_t \\
|
||||
v_t &\leftarrow \beta_2 v_{t-1} + (1 - \beta_2) g_t^2\;
|
||||
\end{align*}
|
||||
Compute Update: $\Delta x_t \leftarrow -\frac{\sqrt{E[\Delta
|
||||
x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} g_t$\;
|
||||
Accumulate Updates: $E[\Delta x^2]_t \leftarrow \rho E[\Delta
|
||||
x^2]_{t-1} + (1+p)\Delta x_t^2$\;
|
||||
Accumulate first Moment of the Gradient and correct for bias:
|
||||
$m_t \leftarrow \beta_1 m_{t-1} + (1 - \beta_1) g_t;$\hspace{\linewidth}
|
||||
$\hat{m}_t \leftarrow \frac{m_t}{1-\beta_1^t}$\;
|
||||
Accumulate second Moment of the Gradient and correct for bias:
|
||||
$v_t \leftarrow \beta_2 v_{t-1} + (1 - \beta_2)g_t^2;$\hspace{\linewidth}
|
||||
$\hat{v}_t \leftarrow \frac{v_t}{1-\beta_2^t}$\;
|
||||
Compute Update: $\Delta x_t \leftarrow
|
||||
-\frac{\alpha}{\sqrt{\hat{v}_t + \varepsilon}}
|
||||
\hat{m}_t$\;
|
||||
Apply Update: $x_{t+1} \leftarrow x_t + \Delta x_t$\;
|
||||
}
|
||||
\caption{ADAM, \cite{ADAM}}
|
||||
\label{alg:gd}
|
||||
\label{alg:adam}
|
||||
\end{algorithm}
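The update of Algorithm~\ref{alg:adam} translates directly into a short NumPy sketch; the names and default parameters below are ours, chosen to match the commonly recommended values.
\begin{lstlisting}[language=iPython]
import numpy as np

def adam_update(x, m, v, g, t, alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    m = beta1 * m + (1 - beta1) * g        # first moment of the gradient
    v = beta2 * v + (1 - beta2) * g**2     # second moment of the gradient
    m_hat = m / (1 - beta1**t)             # bias correction
    v_hat = v / (1 - beta2**t)
    x = x - alpha / np.sqrt(v_hat + eps) * m_hat
    return x, m, v
\end{lstlisting}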
|
||||
|
||||
In order to get an understanding of the performance of the above
|
||||
discussed training algorithms the neural network given in ... has been
|
||||
trained on the ... and the results are given in
|
||||
Figure~\ref{fig:comp_alg}.
|
||||
Here it can be seen that the ADAM algorithm performs far better than
|
||||
the other algorithms, with AdaGrad and AdaDelta following... bla bla
|
||||
|
||||
|
||||
\input{Plots/sdg_comparison.tex}
|
||||
@ -594,15 +652,27 @@ Problems / Improvements ADAM \textcite{rADAM}
|
||||
% \cite{Dropout}
|
||||
|
||||
Similarly to shallow networks overfitting still can impact the quality of
|
||||
convolutional neural networks. A popular way to combat this problem is
|
||||
by introducing noise into the training of the model. This is a
|
||||
successful strategy for ofter models as well, the a conglomerate of
|
||||
descision trees grown on bootstrapped trainig samples benefit greatly
|
||||
of randomizing the features available to use in each training
|
||||
iteration (Hastie, Bachelorarbeit??).
|
||||
There are two approaches to introduce noise to the model during
|
||||
learning, either by manipulating the model it self or by manipulating
|
||||
the input data.
|
||||
convolutional neural networks.
|
||||
Popular ways to combat this problem for a .. of models are averaging
over multiple models trained on subsets (bootstrap) or introducing
noise directly during the training (for example random forests, where a
conglomerate of decision trees benefits greatly from randomizing the
features available in each training iteration).
We explore implementations of these approaches for neural networks,
namely dropout for simulating a conglomerate of networks and
introducing noise during training by slightly altering the input
pictures.
|
||||
% A popular way to combat this problem is
|
||||
% by introducing noise into the training of the model.
|
||||
% This can be done in a variety
|
||||
% This is a
|
||||
% successful strategy for ofter models as well, the a conglomerate of
|
||||
% descision trees grown on bootstrapped trainig samples benefit greatly
|
||||
% of randomizing the features available to use in each training
|
||||
% iteration (Hastie, Bachelorarbeit??).
|
||||
% There are two approaches to introduce noise to the model during
|
||||
% learning, either by manipulating the model it self or by manipulating
|
||||
% the input data.
|
||||
\subsubsection{Dropout}
|
||||
If a neural network has enough hidden nodes there will be sets of
|
||||
weights that accurately fit the training set (proof for a small
|
||||
@ -690,21 +760,35 @@ mirroring.
|
||||
\includegraphics[width=\textwidth]{Plots/Data/mnist_gen_shift.pdf}
|
||||
\caption{random\\positional shift}
|
||||
\end{subfigure}
|
||||
\caption{Example for the manipuations used in ... As all images are
|
||||
\caption[Image data generation]{Example for the manipulations used in ... As all images are
of the same intensity, brightness manipulation does not seem
... Additionally mirroring is not used for ... reasons.}
|
||||
\end{figure}
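A hypothetical sketch, reusing the \texttt{datagen} and \texttt{x\_train} objects defined in the training listing, of how such augmented example images can be produced for inspection:
\begin{lstlisting}[language=iPython]
import matplotlib.pyplot as plt

# draw one randomly augmented variant of the first training image
augmented = next(datagen.flow(x_train[:1], batch_size=1))
plt.imshow(augmented[0, :, :, 0], cmap='gray')
plt.axis('off')
plt.show()
\end{lstlisting}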
|
||||
|
||||
In order to compare the benefits obtained from implementing these
|
||||
measures we have trained the network given in ... on the same problem
|
||||
and implemented different combinations of the measures. The results
|
||||
are given in Figure~\ref{fig:gen_dropout}. Here it can be seen that ...
|
||||
and implemented different combinations of data generation and dropout. The results
are given in Figure~\ref{fig:gen_dropout}. For each scenario the
model was trained five times and the performance measures were
averaged. It can be seen that implementing the measures does indeed
increase the performance of the model. Implementing data generation on
its own seems to have a larger impact than dropout, and applying both
increases the accuracy even further.
|
||||
|
||||
The better performance stems most likely from reduced overfitting. The
reduction in overfitting can be seen in
Figure~\ref{fig:gen_dropout}~(\subref{fig:gen_dropout_b}), as the training
accuracy decreases while the test accuracy increases. However, utilizing
data generation as well as dropout with a probability of 0.4 seems to
be too aggressive an approach, as the training accuracy drops below the
test accuracy.
|
||||
|
||||
\input{Plots/gen_dropout.tex}
|
||||
|
||||
\todo{Compare different dropout rates on MNIST or similar, subset as
training set?}
|
||||
|
||||
\clearpage
|
||||
\subsubsection{\titlecap{effectiveness for small training sets}}
|
||||
|
||||
For some applications (medical problems with a small number of patients)
|
||||
@ -726,13 +810,141 @@ full dataset: ... per class\\
|
||||
100 per class
|
||||
10 per class
|
||||
|
||||
the results for training .. are given in ... Here can be seen...
|
||||
the results for training .. are given in ... Here it can be seen... that
|
||||
for small training sets data generation has a large impact on the accuracy.
|
||||
|
||||
\begin{table}
|
||||
\centering
|
||||
\begin{tabu} to \textwidth {@{}l*4{X[c]}@{}}
|
||||
\Tstrut \Bstrut & \textsc{Adam} & D. 0.2 & Gen & Gen.+D. 0.2 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 1 sample}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max \Tstrut & 0.5633 & 0.5312 & 0.6704 & 0.6604 \\
|
||||
min & 0.3230 & 0.4224 & 0.4878 & 0.5175 \\
|
||||
mean & 0.4570 & 0.4714 & 0.5862 & 0.6014 \\
|
||||
var & 0.0040 & 0.0012 & 0.0036 & 0.0023 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 10 samples}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max \Tstrut & 0.8585 & 0.9423 & 0.9310 & 0.9441 \\
|
||||
min & 0.8148 & 0.9081 & 0.9018 & 0.9061 \\
|
||||
mean & 0.8377 & 0.9270 & 0.9185 & 0.9232 \\
|
||||
var & 2.7e-4 & 1.3e-4 & 6e-05 & 1.5e-4 \\
|
||||
\hline
|
||||
&
|
||||
\multicolumn{4}{c}{\titlecap{test accuracy for 100 samples}}\Bstrut \\
|
||||
\cline{2-5}
|
||||
max & 0.9637 & 0.9796 & 0.9810 & 0.9805 \\
|
||||
min & 0.9506 & 0.9719 & 0.9702 & 0.9727 \\
|
||||
mean & 0.9582 & 0.9770 & 0.9769 & 0.9783 \\
|
||||
var & 2e-05 & 1e-05 & 1e-05 & 0 \\
|
||||
\hline
|
||||
\end{tabu}
|
||||
\caption{Values of the test accuracy of the model trained 10 times
|
||||
of random training sets containing 1, 10 and 100 data points per
|
||||
class.}
|
||||
\end{table}
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\missingfigure{datagen digits}
|
||||
\caption{Sample pictures of the MNIST fashion dataset, one per
class.}
|
||||
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_1.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_dropout_02_1.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_datagen_1.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_datagen_dropout_02_1.mean};
|
||||
|
||||
|
||||
\addlegendentry{\footnotesize{Default}}
|
||||
\addlegendentry{\footnotesize{D. 0.2}}
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G. + D. 0.2}}
|
||||
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{1 sample per class}
|
||||
\vspace{0.25cm}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = \textwidth,
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_dropout_00_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_dropout_02_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_datagen_dropout_00_10.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_datagen_dropout_02_10.mean};
|
||||
|
||||
|
||||
\addlegendentry{\footnotesize{Default}}
|
||||
\addlegendentry{\footnotesize{D. 0.2}}
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G + D. 0.2}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{10 samples per class}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}[h]{\textwidth}
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[legend cell align={left},yticklabel style={/pgf/number format/fixed,
|
||||
/pgf/number format/precision=3},tick style = {draw = none}, width = 0.9875\textwidth,
|
||||
height = 0.35\textwidth, legend style={at={(0.9825,0.0175)},anchor=south east},
|
||||
xlabel = {epoch}, ylabel = {Test Accuracy}, cycle
|
||||
list/Dark2, every axis plot/.append style={line width
|
||||
=1.25pt}, ymin = {0.92}]
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_dropout_00_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_dropout_02_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_datagen_dropout_00_100.mean};
|
||||
\addplot table
|
||||
[x=epoch, y=val_accuracy, col sep=comma, mark = none]
|
||||
{Plots/Data/adam_datagen_dropout_02_100.mean};
|
||||
|
||||
\addlegendentry{\footnotesize{Default}}
|
||||
\addlegendentry{\footnotesize{D. 0.2}}
|
||||
\addlegendentry{\footnotesize{G.}}
|
||||
\addlegendentry{\footnotesize{G + D. 0.2}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{100 samples per class}
|
||||
\vspace{.25cm}
|
||||
\end{subfigure}
|
||||
\caption{Mean test accuracy over the course of training for training
sets containing 1, 10 and 100 samples per class and different
combinations of dropout (D.) and data generation (G.).}
|
||||
\label{mnist fashion}
|
||||
\end{figure}
|
||||
|
||||
@ -752,6 +964,8 @@ the results for training .. are given in ... Here can be seen...
|
||||
\item Transfer learning, use network trained on different task and
|
||||
repurpose it / train it with the training data
|
||||
\end{itemize}
|
||||
\textcite{transfer_learning}
|
||||
\textcite{gan}
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
|
@ -2,13 +2,18 @@
|
||||
\section{Introduction to Neural Networks}
|
||||
|
||||
Neural Networks (NN) are a mathematical construct inspired by the
connection of neurons in the brains of mammals. They consist of an
array of neurons that receive inputs and compute an accumulated
output. These neurons are arranged in layers, with one input and one
output layer and an arbitrary amount of hidden layers between them.
The number of neurons in the in- and output layers corresponds to the
desired dimensions of the in- and outputs of the model.
In conventional neural networks the information is passed ... from the
input layer towards the output layer, hence they are often called feed
forward networks. Each neuron in a layer has the outputs of all
neurons in the preceding layer as input (fully connected). An
illustration of an example neural network is given in
Figure~\ref{fig:nn} and one of a neuron in Figure~\ref{fig:neuron}.
|
||||
|
||||
\tikzset{%
|
||||
every neuron/.style={
|
||||
@ -79,10 +84,11 @@ except for the input layer, which recieves the components of the input.
|
||||
\node[fill=white,scale=1.5,inner xsep=10pt,inner ysep=10mm] at ($(hidden1-1)!.5!(hidden2-2)$) {$\dots$};
|
||||
|
||||
\end{tikzpicture}}%}
|
||||
\caption{Illustration of a neural network with $d_i$ inputs, $l$
|
||||
\caption[Illustration of a neural network]{Illustration of a neural network with $d_i$ inputs, $l$
|
||||
hidden layers with $n_{\cdot}$ nodes in each layer, as well as
|
||||
$d_o$ outputs.
|
||||
}
|
||||
\label{fig:nn}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Nonlinearity of Neural Networks}
|
||||
@ -91,35 +97,39 @@ The arguably most important feature of neural networks that sets them
|
||||
apart from linear models is the activation function implemented in the
neurons. As seen in Figure~\ref{fig:neuron}, an activation function
$\sigma$ is applied to the weighted sum of the inputs, resulting in the
output of the $k$-th neuron in a layer being given by
\[
  o_k = \sigma\left(b_k + \sum_{j=1}^m w_{k,j} i_j\right)
\]
for weights $w_{k,j}$ and biases $b_k$.
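As a small illustration of this formula, a NumPy sketch of a single neuron and of a fully connected layer; the ReLU used as $\sigma$ here is just one of the activation functions discussed below, and all names are illustrative.
\begin{verbatim}
import numpy as np

def sigma(x):
    # placeholder activation; any of the functions discussed below works here
    return np.maximum(0.0, x)

def neuron_output(w_k, b_k, i):
    # o_k = sigma(b_k + sum_j w_{k,j} i_j) for one neuron
    return sigma(b_k + np.dot(w_k, i))

def layer_output(W, b, i):
    # all neurons of a fully connected layer at once: o = sigma(b + W i)
    return sigma(b + W @ i)
\end{verbatim}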
|
||||
The activation function is usually chosen nonlinear (a linear one
would result in the entire model collapsing into a linear one\todo{proof?}),
which allows the network to better model data where the relation of in-
and output is of nonlinear nature.
There are two types of activation functions, saturating and
non-saturating ones. Popular examples for the former are sigmoid
functions, where most commonly the standard logistic function or the
tangens hyperbolicus are used, as they have easy to compute derivatives,
which is desirable for gradient based optimization algorithms. The
standard logistic function (often referred to simply as sigmoid
function) is given by
|
||||
\[
  f(x) = \frac{1}{1+e^{-x}}
\]
and has a range of $(0,1)$. Its usage as an activation function is
motivated by modeling neurons which are close to inactive until a
certain threshold, where they grow in intensity until they are fully
active, which is similar to the behavior of neurons in
brains\todo{improve wording}. The tangens hyperbolicus is given by
\[
  \tanh(x) = \frac{e^{2x}-1}{e^{2x}+1}.
\]
|
||||
|
||||
The downside of these saturating activation functions is that given
their saturating nature their derivatives are close to zero for large
or small input values, which can slow or hinder the progress of
gradient based methods.

The non-saturating activation functions commonly used are the rectified
linear unit (ReLU) or the leaky ReLU. The ReLU is given by
|
||||
@ -127,11 +137,12 @@ linear using (ReLU) or the leaky RelU. The ReLU is given by
|
||||
r(x) = \max\left\{0, x\right\}.
|
||||
\]
|
||||
This has the benefit of having a constant derivative for values larger
than zero. However the derivative being zero for negative values has
the same downside for fitting the model with gradient based methods.
The leaky ReLU is an attempt to counteract this problem by assigning a
small constant derivative to all values smaller than zero and for a
scalar $\alpha$ is given by
\[
  l(x) = \max\left\{0, x\right\} + \alpha \min \left\{0, x\right\}.
\]
In order to illustrate these functions, plots of them are given in Figure~\ref{fig:activation}.
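The four activation functions can also be written down directly; a NumPy sketch mirroring the formulas above, with $\alpha = 0.1$ as in the plot (the function names are illustrative only).
\begin{verbatim}
import numpy as np

def logistic(x):                  # standard logistic function, range (0, 1)
    return 1.0 / (1.0 + np.exp(-x))

def tanh(x):                      # tangens hyperbolicus
    return (np.exp(2.0 * x) - 1.0) / (np.exp(2.0 * x) + 1.0)

def relu(x):                      # rectified linear unit
    return np.maximum(0.0, x)

def leaky_relu(x, alpha=0.1):     # leaky ReLU, slope alpha for x < 0
    return np.maximum(0.0, x) + alpha * np.minimum(0.0, x)
\end{verbatim}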
|
||||
|
||||
@ -144,6 +155,7 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
|
||||
\addplot [domain=-5:5, samples=101,unbounded coords=jump]{1/(1+exp(-x))};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{\titlecap{standard logistic function}}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{.45\linewidth}
|
||||
\centering
|
||||
@ -152,6 +164,7 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
|
||||
\addplot[domain=-5:5, samples=100]{tanh(x)};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{\titlecap{tangens hyperbolicus}}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{.45\linewidth}
|
||||
\centering
|
||||
@ -161,6 +174,7 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
|
||||
\addplot[domain=-5:5, samples=100]{max(0,x)};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{ReLU}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{.45\linewidth}
|
||||
\centering
|
||||
@ -170,8 +184,9 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
|
||||
\addplot[domain=-5:5, samples=100]{max(0,x)+ 0.1*min(0,x)};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{Leaky ReLU, $\alpha = 0.1$}
|
||||
\end{subfigure}
|
||||
\caption{Plots of the activation functions.}
|
||||
\label{fig:activation}
|
||||
\end{figure}
|
||||
|
||||
@ -266,24 +281,28 @@ In order to illustrate these functions plots of them are given in Figure~\ref{fi
|
||||
\clearpage
|
||||
\subsection{Training Neural Networks}
|
||||
|
||||
After a neural network model is designed, like most statistical models
it has to be fit to the data. In the machine learning context this is
often called ``training'', as due to the complexity and amount of
variables in these models they are fitted iteratively to the data,
``learning'' the properties of the data better with each iteration.
As neural networks are a parametric model we need to fit them to input
data in order to get meaningful output from the network. In order to
do this we first need to discuss how we interpret the output of the
neural network.
|
||||
|
||||
There are two main categories of machine learning models, being
supervised and unsupervised learners. Unsupervised learners learn
structure in the data without guidance from outside (such as labeling
data beforehand for training); popular examples of this are clustering
algorithms\todo{source}. Supervised learners on the other hand are, as
the name suggests, supervised during learning. This generally amounts to
using data with the expected response (label) attached to each
data-point in fitting the model, where usually some distance between
the model output and the labels is minimized.
|
||||
% After a neural network model is designed, like most statistical models
|
||||
% it has to be fit to the data. In the machine learning context this is
|
||||
% often called ``training'' as due to the complexity and amount of
|
||||
% variables in these models they are fitted iteratively to the data,
|
||||
% ``learing'' the properties of the data better with each iteration.
|
||||
|
||||
\subsubsection{Interpreting the Output / Classification vs Regression
/ Nonlinearity in last layer}
|
||||
% There are two main categories of machine learning models, being
|
||||
% supervised and unsupervised learners. Unsupervised learners learn
|
||||
% structure in the data without guidance form outside (as labeling data
|
||||
% beforehand for training) popular examples of this are clustering
|
||||
% algorithms\todo{quelle}. Supervised learners on the other hand are as
|
||||
% the name suggest supervised during learning. This generally amounts to
|
||||
% using data with the expected response (label) attached to each
|
||||
% data-point in fitting the model, where usually some distance between
|
||||
% the model output and the labels is minimized.
|
||||
|
||||
\subsubsection{\titlecap{nonlinearity in last layer}}
|
||||
|
||||
Given the nature of the neural net, the outputs of the last layer are
real numbers. For regression tasks this is desirable, for
|
||||
@ -316,6 +335,13 @@ and the individual values sum to one, thus the output can be interpreted as
|
||||
a probability for each class given the input.
In addition to being differentiable this allows for evaluating the
certainty of a prediction, rather than just whether it is accurate.
A similar effect is obtained when for a binary or two class problem the
sigmoid function
|
||||
\[
|
||||
f(x) = \frac{1}{1 + e^{-x}}
|
||||
\]
|
||||
is used and the output $f(x)$ is interpreted as the probability for
|
||||
the first class and $1-f(x)$ for the second class.
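A NumPy sketch of both output nonlinearities; the shift by the maximum in the softmax does not change the result (it uses the additive invariance touched on in the note below) and is only a standard numerical-stability measure, and the names here are illustrative.
\begin{verbatim}
import numpy as np

def softmax(z):
    # interpret last-layer values z as class probabilities
    e = np.exp(z - np.max(z))     # shifting by max(z) changes nothing,
    return e / e.sum()            # but avoids overflow

def sigmoid(z):
    # two-class case: P(class 1) = sigmoid(z), P(class 2) = 1 - sigmoid(z)
    return 1.0 / (1.0 + np.exp(-z))

p = softmax(np.array([2.0, 0.5, -1.0]))
print(p, p.sum())                 # nonnegative values summing to one
\end{verbatim}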
|
||||
|
||||
\todo{maybe additive invariance}
|
||||
% Another property that makes softmax attractive is the invariance to addition
|
||||
@ -372,7 +398,7 @@ common in time series models. \todo{komisch}
|
||||
|
||||
As discussed above the output of a neural network for a classification
|
||||
problem can be interpreted as a probability distribution over the classes
|
||||
conditioned on the input. In this case it is desirable to
|
||||
use error functions designed to compare probability distributions. A
|
||||
widespread error function for this use case is the cross entropy (\textcite{PRML}),
|
||||
which for two discrete distributions $p, q$ with the same realm $C$ is given by
|
||||
@ -392,15 +418,17 @@ $f$ we get the loss function
|
||||
|
||||
\subsubsection{Gradient Descent Algorithm}
|
||||
|
||||
Trying to find the optimal parameters for fitting the model to the data
can be a hard problem. Given the complex nature of a neural network
with many layers and neurons it is hard to predict the impact of
single parameters on the accuracy of the output.
Thus applying numeric optimization algorithms is the only
feasible way to fit the model. An attractive algorithm for training
neural networks is gradient descent, where each parameter $\theta_i$ is
iteratively changed according to the gradient of the error
measure and a step size $\gamma$. For this all parameters are
initialized (often randomly or close to zero) and then iteratively
updated until a certain stopping criterion is hit, mostly either being a fixed
number of iterations or a desired upper limit for the error measure.
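A minimal sketch of this iterative scheme for a generic differentiable error measure; the gradient function is assumed to be given, and a small-update criterion stands in here for the error-based stopping rule.
\begin{verbatim}
import numpy as np

def gradient_descent(grad_L, theta0, gamma=0.01, max_iter=1000, tol=1e-6):
    # theta <- theta - gamma * grad L(theta), stopped after a fixed number
    # of iterations or once the update becomes very small
    theta = np.asarray(theta0, dtype=float)
    for _ in range(max_iter):
        step = gamma * grad_L(theta)
        theta = theta - step
        if np.linalg.norm(step) < tol:
            break
    return theta

# toy example: L(theta) = ||theta||^2 has gradient 2 * theta
print(gradient_descent(lambda t: 2.0 * t, theta0=[3.0, -2.0]))
\end{verbatim}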
|
||||
% For a function $f_\theta$ with parameters $\theta \in \mathbb{R}^n$
|
||||
% and a error function $L(f_\theta)$ the gradient descent algorithm is
|
||||
@ -450,6 +478,7 @@ introduced by \textcite{backprop}.
|
||||
\[
|
||||
\frac{\partial L(...)}{}
|
||||
\]
|
||||
\todo{Write down backpropagation properly}
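Until the above is written out properly, a minimal sketch of the chain rule that backpropagation applies layer by layer may serve as a placeholder; the layer superscript $(l)$ and the weighted sums $z^{(l)}_k$ are generic notation not introduced elsewhere in this text:
\[
  \frac{\partial L}{\partial w^{(l)}_{k,j}}
  = \frac{\partial L}{\partial o^{(l)}_k} \,
    \sigma'\left(z^{(l)}_k\right) o^{(l-1)}_j,
  \qquad
  \frac{\partial L}{\partial o^{(l-1)}_j}
  = \sum_k \frac{\partial L}{\partial o^{(l)}_k} \,
    \sigma'\left(z^{(l)}_k\right) w^{(l)}_{k,j},
\]
where $z^{(l)}_k = b^{(l)}_k + \sum_j w^{(l)}_{k,j} o^{(l-1)}_j$. The second
identity propagates the derivative of the loss backwards through the layers,
which is what gives the algorithm its name.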
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
|
@ -34,7 +34,7 @@
|
||||
\usepackage{todonotes}
|
||||
\usepackage{lipsum}
|
||||
\usepackage[ruled,vlined]{algorithm2e}
|
||||
%\usepackage{showframe}
|
||||
\usepackage{showframe}
|
||||
\usepackage[protrusion=true, expansion=true, kerning=true, letterspace
|
||||
= 150]{microtype}
|
||||
\usepackage{titlecaps}
|
||||
@ -113,6 +113,8 @@
|
||||
\newpage
|
||||
%\setcounter{tocdepth}{4}
|
||||
\tableofcontents
|
||||
\clearpage
|
||||
\listoffigures
|
||||
\listoftodos
|
||||
\newpage
|
||||
\pagenumbering{arabic}
|
||||
|
5
TeX/papers
Normal file
5
TeX/papers
Normal file
@ -0,0 +1,5 @@
|
||||
Robust error measure for supervised neural network learning with outliers
|
||||
|
||||
Learning rate decay https://arxiv.org/pdf/1908.01878.pdf
|
||||
|
||||
Best mnist https://arxiv.org/abs/1805.01890
|
523
TeX/theo_3_8.tex
523
TeX/theo_3_8.tex
@ -6,14 +6,15 @@
|
||||
%%% End:
|
||||
\section{Shallow Neural Networks}
|
||||
|
||||
In order to get some understanding of the behavior of neural networks
we examine a simple class of networks in this chapter. We consider
networks that contain only one hidden layer and have a single output
node. We call these networks shallow neural networks.
|
||||
\begin{Definition}[Shallow neural network]
|
||||
For a input dimension $d$ and a Lipschitz continuous activation function $\sigma:
|
||||
\mathbb{R} \to \mathbb{R}$ we define a shallow neural network with
|
||||
@ -84,15 +85,16 @@ with
|
||||
% \end{figure}
|
||||
|
||||
As neural networks with a large amount of nodes have a large amount of
parameters that can be tuned, they can often fit the data quite well. If
a ReLU activation function
\[
  \sigma(x) \coloneqq \max{(0, x)}
\]
is chosen, one can easily prove that if the amount of hidden nodes
exceeds the amount of data points in the training data, a shallow
network trained on MSE will perfectly fit the data.
|
||||
\begin{Theorem}[Shallow neural network can fit data perfectly]
|
||||
For training data of size $t$
|
||||
\[
|
||||
\left(x_i^{\text{train}}, y_i^{\text{train}}\right) \in \mathbb{R}^d
|
||||
@ -150,17 +152,18 @@ on MSE will perfectly fit the data.
|
||||
\label{theo:overfit}
|
||||
\end{Theorem}
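A small numerical illustration of this statement; this is a sketch of the idea only, not the construction used in the proof, and it adds a constant offset $y_1$ to the network output, which may differ from the exact network definition above.
\begin{verbatim}
import numpy as np

def interpolating_shallow_relu(x_train, y_train):
    # f(x) = y_1 + sum_k c_k * relu(x - x_k): a piecewise linear function
    # with kinks at the training points that passes through all of them
    order = np.argsort(x_train)
    x, y = np.asarray(x_train)[order], np.asarray(y_train)[order]
    slopes = np.diff(y) / np.diff(x)          # slope on each interval
    c = np.diff(slopes, prepend=0.0)          # change of slope at each kink
    def f(t):
        t = np.atleast_1d(np.asarray(t, dtype=float))
        return y[0] + np.sum(c * np.maximum(0.0, t[:, None] - x[:-1]), axis=1)
    return f

x_tr = np.array([-2.0, -0.5, 1.0, 2.5])
y_tr = np.sin(x_tr)
f = interpolating_shallow_relu(x_tr, y_tr)
print(np.allclose(f(x_tr), y_tr))             # True, i.e. zero training MSE
\end{verbatim}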
|
||||
|
||||
However this behavior is often not desired as overfit models generally
have bad generalization properties, especially if noise is present in
the data. This effect is illustrated in
Figure~\ref{fig:overfit}. Here a shallow neural network that perfectly fits the
training data regarding MSE is\todo{wording}
constructed according to the proof of Theorem~\ref{theo:overfit} and
compared to a regression spline
(Definition~\ref{def:wrs}). While the neural network
fits the data better than the spline, the spline represents the
underlying mechanism that was used to generate the data more accurately. The better
generalization of the spline compared to the network is further
demonstrated by the better validation error computed on newly generated
test data.
|
||||
In order to improve the accuracy of the model we want to reduce
|
||||
overfitting. A possible way to achieve this is by explicitly
|
||||
@ -168,7 +171,7 @@ regularizing the network through the cost function as done with
|
||||
ridge penalized networks
|
||||
(Definition~\ref{def:rpnn}) where large weights $w$ are punished. In
|
||||
Theorem~\ref{theo:main1} we will
|
||||
prove that this will result in the network converging to
|
||||
prove that this will result in the shallow neural network converging to
|
||||
regressions splines as the amount of nodes in the hidden layer is
|
||||
increased.
|
||||
|
||||
@ -205,7 +208,7 @@ plot coordinates {
|
||||
\addlegendentry{\footnotesize{spline}};
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
\caption{For data of the form $y=\sin(\frac{x+\pi}{2 \pi}) +
|
||||
\caption[Overfitting of shallow neural networks]{For data of the form $y=\sin(\frac{x+\pi}{2 \pi}) +
|
||||
\varepsilon,~ \varepsilon \sim \mathcal{N}(0,0.4)$
|
||||
(\textcolor{blue}{blue dots}) the neural network constructed
|
||||
according to the proof of Theorem~\ref{theo:overfit} (black) and the
|
||||
@ -224,14 +227,24 @@ plot coordinates {
|
||||
Networks}
|
||||
|
||||
|
||||
This section is based on \textcite{heiss2019}. We will analyze the
connection between randomized shallow
Neural Networks with one dimensional input, with a ReLU as activation
function for all neurons, and regression splines.
We will see that the punishment of the size of the weights in training
the randomized shallow
Neural Network will result in a learned function that minimizes the second
derivative as the amount of hidden nodes is grown to infinity. In order
to properly formulate this relation we will first need to introduce
some definitions; all neural networks introduced in the following will
use a ReLU as activation at all neurons.
|
||||
|
||||
A randomized shallow network is characterized by only the weight
|
||||
parameter of the output layer being trainable, whereas the other
|
||||
parameters are random numbers.
|
||||
|
||||
\begin{Definition}[Randomized shallow neural network]
|
||||
For an input dimension $d$, let $n \in \mathbb{N}$ be the number of
|
||||
@ -244,11 +257,20 @@ some definitions.
|
||||
\]
|
||||
\label{def:rsnn}
|
||||
\end{Definition}
|
||||
We call a one dimensional randomized shallow neural network where the
$L^2$ norm of the trainable weights $w$ is penalized in the loss
function a ridge penalized neural network.
|
||||
|
||||
% We call a randomized shallow neural network where the size of the trainable
|
||||
% weights is punished in the error function a ridge penalized
|
||||
% neural network. For a tuning parameter $\tilde{\lambda}$ .. the extent
|
||||
% of penalization we get:
|
||||
\begin{Definition}[Ridge penalized Neural Network]
|
||||
\label{def:rpnn}
|
||||
Let $\mathcal{RN}_{w, \omega}$ be a randomized shallow neural
network, as introduced in Definition~\ref{def:rsnn}, and
$\tilde{\lambda} \in \mathbb{R}$ a tuning parameter. Then the optimal
ridge penalized
network is given by
|
||||
\[
|
||||
\mathcal{RN}^{*, \tilde{\lambda}}_{\omega}(x) \coloneqq
|
||||
@ -263,9 +285,8 @@ some definitions.
|
||||
\tilde{\lambda} \norm{w}_2^2\right\}}_{\eqqcolon F_n^{\tilde{\lambda}}(\mathcal{RN}_{w,\omega})}.
|
||||
\]
|
||||
\end{Definition}
|
||||
In the ridge penalized Neural Network large weights are penalized, the
extent of which can be tuned with the parameter $\tilde{\lambda}$. If
the amount of hidden nodes $n$ is larger than the amount of
training samples $N$ then for
|
||||
$\tilde{\lambda} \to 0$ the network will interpolate the data while
|
||||
having minimal weights, resulting in the \textit{minimum norm
|
||||
network} $\mathcal{RN}_{w^{\text{min}}, \omega}$.
|
||||
@ -280,15 +301,109 @@ having minimal weights, resulting in the \textit{minimum norm
|
||||
\left\{1,\dots,N\right\}.
|
||||
\]
|
||||
For $\tilde{\lambda} \to \infty$ the learned
function will resemble the data less and less and, with the weights
approaching $0$, will converge to the constant $0$ function.
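A minimal NumPy sketch of such a ridge penalized randomized shallow network: the hidden parameters stay at their random values (the distributions chosen here are placeholders, the actual assumptions are stated further below) and only the output weights $w$ are obtained, here in closed form from the penalized least squares problem.
\begin{verbatim}
import numpy as np

def ridge_rsn(x_train, y_train, n_hidden=1000, lam=1e-3, seed=0):
    # hidden parameters (v, b) stay random; only the output weights w
    # are fit, penalized by lam * ||w||^2 (closed form solution)
    rng = np.random.default_rng(seed)
    v = rng.normal(size=n_hidden)
    b = rng.uniform(-3.0, 3.0, size=n_hidden)
    def features(x):
        return np.maximum(0.0, b + v * np.asarray(x, dtype=float)[:, None])
    Phi = features(x_train)
    w = np.linalg.solve(Phi.T @ Phi + lam * np.eye(n_hidden), Phi.T @ y_train)
    return lambda x: features(x) @ w

x_tr = np.linspace(-np.pi, np.pi, 15)
y_tr = np.sin(x_tr) + 0.1 * np.random.default_rng(1).normal(size=x_tr.size)
rn = ridge_rsn(x_tr, y_tr)
\end{verbatim}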
|
||||
|
||||
In order to make the notation more convenient, in the following the
$\omega$ used to express the realised random parameters will no longer
be explicitly mentioned.
|
||||
|
||||
We call a function that minimizes the squared distance between the
training points and the function, regularized\todo{right word} by the
second derivative of the function, a regression spline.
|
||||
|
||||
\begin{Definition}[Regression Spline]
|
||||
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
|
||||
\left\{1,\dots,N\right\}$ be training data. For a given $\lambda \in
\mathbb{R}$ the regression spline is given by
|
||||
\[
|
||||
f^{*,\lambda} :\in \argmin_{f \in
|
||||
\mathcal{C}^2}\left\{\sum_{i=1}^N
|
||||
\left(f\left(x_i^{\text{train}}\right) -
|
||||
y_i^{\text{train}}\right)^2 + \lambda \int f^{''}(x)^2dx\right\}.
|
||||
\]
|
||||
\end{Definition}
|
||||
|
||||
We will show that for specific hyper parameters the ridge penalized
|
||||
shallow neural networks converge to a slightly modified variant of the
|
||||
regression spline. We will need to incorporate the densities of the
|
||||
random parameters in the loss function of the spline to ensure
|
||||
convergence. Thus we define
|
||||
the adapted weighted regression spline where the loss for the second
|
||||
derivative is weighted by a function $g$ and the support of the second
|
||||
derivative of $f$ has to be a subset of the support of $g$. The formal
|
||||
definition is given in Definition~\ref{def:wrs}.
|
||||
|
||||
% We will later ... the converging .. of the ridge penalized shallow
|
||||
% neural network, in order to do so we will need a slightly modified
|
||||
% version of the regression
|
||||
% spline that allows for weighting the penalty term for the second
|
||||
% derivative with a weight function $g$. This is needed to ...the
|
||||
% distributions of the random parameters ... We call this the adapted
|
||||
% weighted regression spline.
|
||||
|
||||
% Now we take a look at weighted regression splines. Later we will prove
|
||||
% that the ridge penalized neural network as defined in
|
||||
% Definition~\ref{def:rpnn} converges a weighted regression spline, as
|
||||
% the amount of hidden nodes is grown to inifity.
|
||||
|
||||
\begin{Definition}[Adapted Weighted regression spline]
|
||||
\label{def:wrs}
|
||||
Let $x_i^{\text{train}}, y_i^{\text{train}} \in \mathbb{R}, i \in
|
||||
\left\{1,\dots,N\right\}$ be trainig data. For a given $\lambda \in \mathbb{R}_{>0}$
|
||||
and a function $g: \mathbb{R} \to \mathbb{R}_{>0}$ the weighted
|
||||
regression spline $f^{*, \lambda}_g$ is given by
|
||||
|
||||
\[
|
||||
f^{*, \lambda}_g :\in \argmin_{\substack{f \in \mathcal{C}^2(\mathbb{R})
|
||||
\\ \supp(f'') \subseteq \supp(g)}} \underbrace{\left\{ \overbrace{\sum_{i =
|
||||
1}^N \left(f(x_i^{\text{train}}) - y_i^{\text{train}}\right)^2}^{L(f)} +
|
||||
\lambda g(0) \int_{\supp(g)}\frac{\left(f''(x)\right)^2}{g(x)}
|
||||
dx\right\}}_{\eqqcolon F^{\lambda, g}(f)}.
|
||||
\]
|
||||
\todo{Anforderung an Ableitung von f, doch nicht?}
|
||||
\end{Definition}
|
||||
|
||||
Similarly to ridge weight penalized neural networks the parameter
|
||||
$\lambda$ controls a trade-off between accuracy on the training data
|
||||
and smoothness or low second derivative. For $g \equiv 1$ and $\lambda \to 0$ the
|
||||
resulting function $f^{*, 0+}$ will interpolate the training data while minimizing
|
||||
the second derivative. Such a function is known as cubic spline
|
||||
interpolation.
|
||||
\todo{cite cubic spline}
|
||||
|
||||
\[
|
||||
f^{*, 0+} \text{ smooth spline interpolation: }
|
||||
\]
|
||||
\[
|
||||
f^{*, 0+} \coloneqq \lim_{\lambda \to 0+} f^{*, \lambda}_1 \in
|
||||
\argmin_{\substack{f \in \mathcal{C}^2\mathbb{R}, \\ f(x_i^{\text{train}}) =
|
||||
y_i^{\text{train}}}} = \left( \int _{\mathbb{R}} (f''(x))^2dx\right).
|
||||
\]
|
||||
|
||||
For $\lambda \to \infty$ on the other hand $f^{*, \lambda}_g$ converges
|
||||
to linear regression of the data.
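For intuition, both limiting cases can be computed with standard tools; a sketch using SciPy, where the natural cubic spline realizes the $\lambda \to 0+$ interpolation limit and \texttt{make\_smoothing\_spline} (assumed to be available in recent SciPy versions) minimizes the unweighted objective $\sum_i (f(x_i^{\text{train}})-y_i^{\text{train}})^2 + \lambda \int f''(x)^2 dx$.
\begin{verbatim}
import numpy as np
from scipy.interpolate import CubicSpline, make_smoothing_spline

x_tr = np.linspace(-np.pi, np.pi, 15)
y_tr = np.sin(x_tr) + 0.1 * np.random.default_rng(2).normal(size=x_tr.size)

# lambda -> 0+ (and g == 1): natural cubic spline interpolating the data
f_interp = CubicSpline(x_tr, y_tr, bc_type="natural")

# lambda > 0: smoothing spline minimizing
#   sum_i (y_i - f(x_i))^2 + lam * int f''(x)^2 dx
# (make_smoothing_spline is assumed available, SciPy >= 1.10)
f_smooth = make_smoothing_spline(x_tr, y_tr, lam=0.1)
\end{verbatim}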
|
||||
|
||||
We use two intermediary functions in order to show the convergence of
|
||||
the ridge penalized shallow neural network to adapted regression splines.
|
||||
% In order to show that ridge penalized shallow neural networks converge
|
||||
% to adapted regression splines for a growing amount of hidden nodes we
|
||||
% define two intermediary functions.
|
||||
One being a smooth approximation of
|
||||
the neural network, and a randomized shallow neural network designed
|
||||
to approximate a spline.
|
||||
In order to properly construct these functions we need to take into
consideration the points where the slope of the network changes.
As we use the ReLU activation, the function learned by the network
will possess kinks, i.e.\ points where its derivative is discontinuous,
at the positions where a neuron in the hidden layer gets activated
(its input changes sign from negative to positive). We formalize these
points as kinks in Definition~\ref{def:kink}.
|
||||
\begin{Definition}
|
||||
\label{def:kink}
|
||||
Let $\mathcal{RN}_w$ be a randomized shallow Neural
|
||||
Network according to Definition~\ref{def:rsnn}, then kinks depending on the random parameters can
|
||||
Network according to Definition~\ref{def:rsnn}, then kinks depending
|
||||
on the random parameters can
|
||||
be observed.
|
||||
\[
|
||||
\mathcal{RN}_w(x) = \sum_{k = 1}^n w_k \sigma(b_k + v_kx)
|
||||
@ -307,15 +422,14 @@ be explizitly mentioned.
|
||||
\end{enumerate}
|
||||
\end{Definition}
|
||||
|
||||
In order to later prove the connection between randomised shallow
|
||||
Neural Networks and regression splines, we first take a look at a
|
||||
smooth approximation of the RSNN.
|
||||
Using the density of the kinks we construct a kernel and smooth the
network by applying the kernel similarly to a convolution.
|
||||
|
||||
\begin{Definition}[Smooth Approximation of Randomized Shallow Neural
|
||||
Network]
|
||||
\label{def:srsnn}
|
||||
Let $RS_{w}$ be a randomized shallow Neural Network according to
|
||||
Definition~\ref{def:RSNN} with weights $w$ and kinks $\xi_k$ with
|
||||
Definition~\ref{def:rsnn} with weights $w$ and kinks $\xi_k$ with
|
||||
corresponding kink density $g_{\xi}$ as given by
|
||||
Definition~\ref{def:kink}.
|
||||
In order to smooth the RSNN consider following kernel for every $x$:
|
||||
@ -338,53 +452,19 @@ satisfies $\int_{\mathbb{R}}\kappa_x dx = 1$. While $f^w$ looks highly
|
||||
similar to a convolution, it differs slightly as the kernel $\kappa_x(s)$
|
||||
is dependent on $x$. Therefore only $f^w = (\mathcal{RN}_w *
|
||||
\kappa_x)(x)$ is well defined, while $\mathcal{RN}_w * \kappa$ is not.
|
||||
We use $f^{w^{*,\tilde{\lambda}}}$ to describe the spline
approximating the ... ridge penalized network
$\mathcal{RN}^{*,\tilde{\lambda}}$.
Next we construct a randomized shallow neural network which
approximates a spline independently of the realization of the random
parameters. In order to achieve this we ...
|
||||
|
||||
|
||||
\begin{Definition}[Spline approximating Randomised Shallow Neural
|
||||
Network]
|
||||
\label{def:sann}
|
||||
Let $\mathcal{RN}$ be a randomised shallow Neural Network according
|
||||
to Definition~\ref{def:RSNN} and $f^{*, \lambda}_g$ be the weighted
|
||||
to Definition~\ref{def:rsnn} and $f^{*, \lambda}_g$ be the weighted
|
||||
regression spline as introduced in Definition~\ref{def:wrs}. Then
|
||||
the randomised shallow neural network approximating $f^{*,
|
||||
\lambda}_g$ is given by
|
||||
@ -399,9 +479,8 @@ to linear regression of the data.
|
||||
\end{Definition}
|
||||
|
||||
The approximating nature of the network in
|
||||
Definition~\ref{def:sann} can be seen by examining the first
derivative of $\mathcal{RN}_{\tilde{w}}(x)$, which is given by
|
||||
\begin{align}
|
||||
\frac{\partial \mathcal{RN}_{\tilde{w}}}{\partial x}
|
||||
\Big{|}_{x} &= \sum_k^n \tilde{w}_k \mathds{1}_{\left\{b_k + v_k x >
|
||||
@ -411,16 +490,18 @@ by
|
||||
\xi_k < x}} \frac{v_k^2}{g_{\xi}(\xi_k) \mathbb{E}[v^2 \vert \xi
|
||||
= \xi_k]} (f_g^{*, \lambda})''(\xi_k). \label{eq:derivnn}
|
||||
\end{align}
|
||||
\todo{proper derivative notation}
As the expression (\ref{eq:derivnn}) behaves similarly to a
Riemann sum, for $n \to \infty$ it will converge in probability to the
first derivative of $f^{*,\lambda}_g$. A formal proof of this behaviour
is given in Lemma~\ref{lem:s0}.
|
||||
|
||||
In order to ensure that the functions used in the proof of the convergence
are well defined we need to assume some properties of the random
parameters and their densities.
|
||||
% In order to formulate the theorem describing the convergence of $RN_w$
|
||||
% we need to make a couple of assumptions.
|
||||
% \todo{Bessere Formulierung}
|
||||
|
||||
\begin{Assumption}~
|
||||
\label{ass:theo38}
|
||||
@ -440,8 +521,8 @@ we need to make a couple of assumptions.
|
||||
\end{enumerate}
|
||||
\end{Assumption}
|
||||
|
||||
As we will prove the convergence in the Sobolev space, we hereby
introduce it and the corresponding induced norm.
|
||||
|
||||
\begin{Definition}[Sobolev Space]
|
||||
For $K \subset \mathbb{R}^n$ open and $1 \leq p \leq \infty$ we
|
||||
@ -473,9 +554,10 @@ introduce it and its inuced\todo{richtiges wort?} norm.
|
||||
\]
|
||||
\end{Definition}
|
||||
|
||||
With the important definitions and assumptions in place we can now
formulate the main theorem ... the convergence of ridge penalized
random neural networks to adapted regression splines when the
parameters are chosen accordingly.
|
||||
|
||||
\begin{Theorem}[Ridge weight penalty corresponds to weighted regression spline]
|
||||
\label{theo:main1}
|
||||
@ -498,7 +580,8 @@ With these assumption in place we can formulate the main theorem.
|
||||
\tilde{\lambda} & \coloneqq \lambda n g(0).
|
||||
\end{align*}
|
||||
\end{Theorem}
|
||||
As mentioned above we will prove Theorem~\ref{theo:main1} utilizing
the ... functions. We show that
|
||||
\begin{equation}
|
||||
\label{eq:main2}
|
||||
\plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f^{w^*}}_{W^{1,
|
||||
@ -509,10 +592,10 @@ and
|
||||
\label{eq:main3}
|
||||
\plimn \norm{f^{w^*} - f_g^{*, \lambda}}_{W^{1,\infty}(K)} = 0
|
||||
\end{equation}
|
||||
and then get (\ref{eq:main1}) using the triangle inequality. In
order to prove (\ref{eq:main2}) and (\ref{eq:main3}) we will need to
introduce a number of auxiliary lemmata, proofs of which will be
provided in the appendix.
|
||||
|
||||
|
||||
|
||||
@ -534,7 +617,7 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
\exists C_K^2 \in \mathbb{R}_{>0} : \norm{f}_{W^{1,\infty}(K)} \leq
|
||||
C_K^2 \norm{f''}_{L^2(K)}.
|
||||
\end{equation*}
|
||||
% \proof
|
||||
\proof The proof is given in the appendix...
|
||||
% With the fundamental theorem of calculus, if
|
||||
% \(\norm{f}_{L^{\infty}(K)}<\infty\) we get
|
||||
% \begin{equation}
|
||||
@ -555,17 +638,17 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
% get (\ref{eq:pti1}).
|
||||
% By using the Hölder inequality, we can proof the second claim.
|
||||
% \begin{align*}
|
||||
% \norm{f'}_{L^{\infty}(K)} &= \sup_{x \in K} \abs{\int_a^bf''(y)
|
||||
% \mathds{1}_{[a,x]}(y)dy} \leq \sup_{x \in
|
||||
% K}\norm{f''\mathds{1}_{[a,x]}}_{L^1(K)}\\
|
||||
% &\hspace{-6pt} \stackrel{\text{Hölder}}{\leq} sup_{x
|
||||
% \in
|
||||
% K}\norm{f''}_{L^2(K)}\norm{\mathds{1}_{[a,x]}}_{L^2(K)}
|
||||
% = \abs{b-a}\norm{f''}_{L^2(K)}.
|
||||
% \end{align*}
|
||||
% Thus (\ref{eq:pti2}) follows with \(C_K^2 \coloneqq
|
||||
% \abs{b-a}C_K^{\infty}\).
|
||||
% \qed
|
||||
% \norm{f'}_{L^{\infty}(K)} &= \sup_{x \in K} \abs{\int_a^bf''(y)
|
||||
% \mathds{1}_{[a,x]}(y)dy} \leq \sup_{x \in
|
||||
% K}\norm{f''\mathds{1}_{[a,x]}}_{L^1(K)}\\
|
||||
% &\hspace{-6pt} \stackrel{\text{Hölder}}{\leq} sup_{x
|
||||
% \in
|
||||
% K}\norm{f''}_{L^2(K)}\norm{\mathds{1}_{[a,x]}}_{L^2(K)}
|
||||
% = \abs{b-a}\norm{f''}_{L^2(K)}.
|
||||
% \end{align*}
|
||||
% Thus (\ref{eq:pti2}) follows with \(C_K^2 \coloneqq
|
||||
% \abs{b-a}C_K^{\infty}\).
|
||||
% \qed
|
||||
\end{Lemma}
|
||||
|
||||
\begin{Lemma}
|
||||
@ -584,62 +667,62 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
\mathbb{E}\left[\varphi(\xi, v) \vert \xi = x \right] dx
|
||||
\]
|
||||
uniformly in \(T \in K\).
|
||||
% \proof
|
||||
% For \(T \leq C_{g_{\xi}}^l\) both sides equal 0, so it is sufficient to
|
||||
% consider \(T > C_{g_{\xi}}^l\). With \(\varphi\) and
|
||||
% \(\nicefrac{1}{g_{\xi}}\) uniformly continous in \(\xi\),
|
||||
% \begin{equation}
|
||||
% \label{eq:psi_stet}
|
||||
% \forall \varepsilon > 0 : \exists \delta(\varepsilon) : \forall
|
||||
% \abs{\xi - \xi'} < \delta(\varepsilon) : \abs{\varphi(\xi, v)
|
||||
% \frac{1}{g_{\xi}(\xi)} - \varphi(\xi', v)
|
||||
% \frac{1}{g_{\xi}(\xi')}} < \varepsilon
|
||||
% \end{equation}
|
||||
% uniformly in \(v\). In order to
|
||||
% save space we use the notation \((a \wedge b) \coloneqq \min\{a,b\}\) for $a$ and $b
|
||||
% \in \mathbb{R}$. W.l.o.g. assume \(\sup(g_{\xi})\) in an
|
||||
% intervall. By splitting the interval in disjoint strips of length \(\delta
|
||||
% \leq \delta(\varepsilon)\) we get:
|
||||
|
||||
% \[
|
||||
% \underbrace{\sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
|
||||
% \frac{\bar{h}_k}{2}}_{\circled{1}} =
|
||||
% \underbrace{\sum_{l \in \mathbb{Z}:
|
||||
% \left[\delta l, \delta (l + 1)\right] \subseteq
|
||||
% \left[C_{g_{\xi}}^l, C_{g_{\xi}}^u \wedge T
|
||||
% \right]}}_{\coloneqq \, l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
|
||||
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
|
||||
% \varphi\left(\xi_k, v_k\right)\frac{\bar{h}_k}{2} \right)
|
||||
% \]
|
||||
% Using (\ref{eq:psi_stet}) we can approximate $\circled{1}$ by
|
||||
% \begin{align*}
|
||||
% \circled{1} & \approx \sum_{l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
|
||||
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
|
||||
% \left(\varphi\left(l\delta, v_k\right)\frac{1}{g_{\xi}(l\delta)}
|
||||
% \pm \varepsilon\right)\frac{1}{n} \underbrace{\frac{\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}{\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}}_{=
|
||||
% 1}\right) \\
|
||||
% % \intertext{}
|
||||
% &= \sum_{l \in I_{\delta}} \left( \frac{ \sum_{ \substack{k \in \kappa\\
|
||||
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
|
||||
% \varphi\left(l\delta, v_k\right)}
|
||||
% {\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}\frac{\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l +
|
||||
% 1)]\right\}}}{ng_{\xi}(l\delta)}\right) \pm \varepsilon .\\
|
||||
% \intertext{We use the mean to approximate the number of kinks in
|
||||
% each $\delta$-strip, as it follows a bonomial distribution this
|
||||
% amounts to
|
||||
% \[
|
||||
% \mathbb{E}\left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l,
|
||||
% \delta(l + 1)]\right\}\right]} = n \int_{[\delta l, \delta (l +
|
||||
% 1)]} g_{\xi}(x)dx \approx n (\delta g_{\xi}(l\delta) \pm
|
||||
% \tilde{\varepsilon}).
|
||||
% \]
|
||||
% Bla Bla Bla $v_k$}
|
||||
% \circled{1} & \approx
|
||||
% \end{align*}
|
||||
\proof The proof is given in appendix...
|
||||
% For \(T \leq C_{g_{\xi}}^l\) both sides equal 0, so it is sufficient to
|
||||
% consider \(T > C_{g_{\xi}}^l\). With \(\varphi\) and
|
||||
% \(\nicefrac{1}{g_{\xi}}\) uniformly continous in \(\xi\),
|
||||
% \begin{equation}
|
||||
% \label{eq:psi_stet}
|
||||
% \forall \varepsilon > 0 : \exists \delta(\varepsilon) : \forall
|
||||
% \abs{\xi - \xi'} < \delta(\varepsilon) : \abs{\varphi(\xi, v)
|
||||
% \frac{1}{g_{\xi}(\xi)} - \varphi(\xi', v)
|
||||
% \frac{1}{g_{\xi}(\xi')}} < \varepsilon
|
||||
% \end{equation}
|
||||
% uniformly in \(v\). In order to
|
||||
% save space we use the notation \((a \wedge b) \coloneqq \min\{a,b\}\) for $a$ and $b
|
||||
% \in \mathbb{R}$. W.l.o.g. assume \(\sup(g_{\xi})\) in an
|
||||
% intervall. By splitting the interval in disjoint strips of length \(\delta
|
||||
% \leq \delta(\varepsilon)\) we get:
|
||||
|
||||
% \[
|
||||
% \underbrace{\sum_{k \in \kappa : \xi_k < T} \varphi(\xi_k, v_k)
|
||||
% \frac{\bar{h}_k}{2}}_{\circled{1}} =
|
||||
% \underbrace{\sum_{l \in \mathbb{Z}:
|
||||
% \left[\delta l, \delta (l + 1)\right] \subseteq
|
||||
% \left[C_{g_{\xi}}^l, C_{g_{\xi}}^u \wedge T
|
||||
% \right]}}_{\coloneqq \, l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
|
||||
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
|
||||
% \varphi\left(\xi_k, v_k\right)\frac{\bar{h}_k}{2} \right)
|
||||
% \]
|
||||
% Using (\ref{eq:psi_stet}) we can approximate $\circled{1}$ by
|
||||
% \begin{align*}
|
||||
% \circled{1} & \approx \sum_{l \in I_{\delta}} \left( \, \sum_{\substack{k \in \kappa\\
|
||||
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
|
||||
% \left(\varphi\left(l\delta, v_k\right)\frac{1}{g_{\xi}(l\delta)}
|
||||
% \pm \varepsilon\right)\frac{1}{n} \underbrace{\frac{\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}{\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}}_{=
|
||||
% 1}\right) \\
|
||||
% \intertext{}
|
||||
% &= \sum_{l \in I_{\delta}} \left( \frac{ \sum_{ \substack{k \in \kappa\\
|
||||
% \xi_k \in \left[\delta l, \delta (l + 1)\right]}}
|
||||
% \varphi\left(l\delta, v_k\right)}
|
||||
% {\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l + 1)]\right\}}}\frac{\abs{\left\{m \in
|
||||
% \kappa : \xi_m \in [\delta l, \delta(l +
|
||||
% 1)]\right\}}}{ng_{\xi}(l\delta)}\right) \pm \varepsilon .\\
|
||||
% \intertext{We use the mean to approximate the number of kinks in
|
||||
% each $\delta$-strip, as it follows a bonomial distribution this
|
||||
% amounts to
|
||||
% \[
|
||||
% \mathbb{E}\left[\abs{\left\{m \in \kappa : \xi_m \in [\delta l,
|
||||
% \delta(l + 1)]\right\}\right]} = n \int_{[\delta l, \delta (l +
|
||||
% 1)]} g_{\xi}(x)dx \approx n (\delta g_{\xi}(l\delta) \pm
|
||||
% \tilde{\varepsilon}).
|
||||
% \]
|
||||
% Bla Bla Bla $v_k$}
|
||||
% \circled{1} & \approx
|
||||
% \end{align*}
|
||||
\end{Lemma}
|
||||
|
||||
\begin{Lemma}[Step 0]
|
||||
@ -666,18 +749,18 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
\begin{align*}
|
||||
\plimn \frac{\partial \mathcal{RN}_{\tilde{w}}}{\partial x}
|
||||
\stackrel{(\ref{eq:derivnn})}{=}
|
||||
& \plimn \sum_{\substack{k \in \mathbb{N} \\
|
||||
& \plimn \sum_{\substack{k \in \mathbb{N} \\
|
||||
\xi_k < x}} \frac{v_k^2}{\mathbb{E}[v^2 \vert \xi
|
||||
= \xi_k]} (f_g^{*, \lambda})''(\xi_k) h_{k,n}
|
||||
\stackrel{\text{Lemma}~\ref{lem:cnvh}}{=} \\
|
||||
\stackrel{\text{Lemma}~\ref{lem:cnvh}}{=} \\
|
||||
\stackrel{\phantom{(\ref{eq:derivnn})}}{=}
|
||||
&
|
||||
&
|
||||
\int_{\min\left\{C_{g_{\xi}}^l,T\right\}}^{min\left\{C_{g_{\xi}}^u,T\right\}}
|
||||
\mathbb{E}\left[\frac{v^2}{\mathbb{E}[v^2|\xi = z]} (f^{*,
|
||||
\lambda}_w)''(\xi) \vert
|
||||
\xi = x \right] dx \equals^{\text{Tower-}}_{\text{property}} \\
|
||||
\stackrel{\phantom{(\ref{eq:derivnn})}}{=}
|
||||
&
|
||||
&
|
||||
\int_{\min\left\{C_{g_{\xi}}^l,
|
||||
T\right\}}^{min\left\{C_{g_{\xi}}^u,T\right\}}(f^{*,\lambda}_w)''(x)
|
||||
dx.
|
||||
@ -685,6 +768,7 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
By the fundamental theorem of calculus and $\supp(f') \subset
|
||||
\supp(f)$, (\ref{eq:s0}) follows with Lemma~\ref{lem:pieq}.
|
||||
\qed
|
||||
\label{lem:s0}
|
||||
\end{Lemma}
|
||||
|
||||
\begin{Lemma}[Step 2]
|
||||
@ -696,19 +780,22 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
F^{\lambda, g}(f^{*, \lambda}_g) = 0.
|
||||
\]
|
||||
\proof
|
||||
The proof is given in the appendix...
|
||||
\label{lem:s2}
|
||||
\end{Lemma}
|
||||
|
||||
\begin{Lemma}[Step 3]
|
||||
For any $\lambda > 0$ and training data $(x_i^{\text{train}},
|
||||
y_i^{\text{train}}) \in \mathbb{R}^2, \, i \in
|
||||
\left\{1,\dots,N\right\}$, with $w^*$ as
defined in Definition~\ref{def:rpnn} and $\tilde{\lambda}$ as
defined in Theorem~\ref{theo:main1}, it holds
|
||||
\[
|
||||
\plimn \norm{\mathcal{RN}^{*,\tilde{\lambda}} -
|
||||
f^{w*, \tilde{\lambda}}}_{W^{1,\infty}(K)} = 0.
|
||||
\]
|
||||
\proof The proof is given in Appendix ..
|
||||
\label{lem:s3}
|
||||
\end{Lemma}
|
||||
|
||||
\begin{Lemma}[Step 4]
|
||||
@ -718,9 +805,11 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
defined in Definition~\ref{def:rpnn} and Theroem~\ref{theo:main1}
|
||||
respectively, it holds
|
||||
\[
|
||||
\plimn \abs{F_n^{\lambda}(\mathcal{RN}^{*,\tilde{\lambda}}) -
|
||||
\plimn \abs{F_n^{\tilde{\lambda}}(\mathcal{RN}^{*,\tilde{\lambda}}) -
|
||||
F^{\lambda, g}(f^{w*, \tilde{\lambda}})} = 0.
|
||||
\]
|
||||
\proof The proof is given in appendix...
|
||||
\label{lem:s4}
|
||||
\end{Lemma}
|
||||
|
||||
\begin{Lemma}[Step 7]
|
||||
@ -735,11 +824,81 @@ provided in the appendix, as they would SPRENGEN DEN RAHMEN.
|
||||
\[
|
||||
\plimn \norm{f^n - f^{*, \lambda}} = 0.
|
||||
\]
|
||||
\proof The proof is given in appendix ...
|
||||
\label{lem:s7}
|
||||
\end{Lemma}
|
||||
Using these lemmata we can now prove Theorem~\ref{theo:main1}. We
|
||||
start by showing that the error measure of the smooth approximation of
|
||||
the ridge penalized randomized shallow neural network $F^{\lambda,
|
||||
g}\left(f^{w^{*,\tilde{\lambda}}}\right)$
|
||||
will converge in probability to the error measure of the adapted weighted regression
|
||||
spline $F^{\lambda, g}\left(f^{*,\lambda}\right)$ for the specified
|
||||
parameters.
|
||||
|
||||
|
||||
Using Lemma~\ref{lem:s4} we get that for every $P \in (0,1)$ and
|
||||
$\varepsilon > 0$ there exists a $n_1 \in \mathbb{N}$ such that
|
||||
\[
|
||||
\mathbb{P}\left[F^{\lambda, g}\left(f^{w^{*,\tilde{\lambda}}}\right) \in
|
||||
F_n^{\tilde{\lambda}}\left(\mathcal{RN}^{*,\tilde{\lambda}}\right)
|
||||
+[-\varepsilon, \varepsilon]\right] > P, \forall n \in \mathbb{N}_{> n_1}.
|
||||
\]
|
||||
As $\mathcal{RN}^{*,\tilde{\lambda}}$ is the optimal network for
|
||||
$F_n^{\tilde{\lambda}}$ we know that
|
||||
\[
|
||||
F_n^{\tilde{\lambda}}\left(\mathcal{RN}^{*,\tilde{\lambda}}\right)
|
||||
\leq F_n^{\tilde{\lambda}}\left(\mathcal{RN}_{\tilde{w}}\right).
|
||||
\]
|
||||
Using Lemma~\ref{lem:s2} we get that for every $P \in (0,1)$ and
|
||||
$\varepsilon > 0$ there exists a $n_2 \in \mathbb{N}$ such that
|
||||
\[
|
||||
\mathbb{P}\left[F_n^{\tilde{\lambda}}\left(\mathcal{RN}_{\tilde{w}}\right)
|
||||
\in F^{\lambda, g}\left(f^{*,\lambda}_g\right)+[-\varepsilon,
|
||||
\varepsilon]\right] > P, \forall n \in \mathbb{N}_{> n_2}.
|
||||
\]
|
||||
If we combine these ... we get that for every $P \in (0,1)$ and
|
||||
$\varepsilon > 0$ and $n_3 \geq
|
||||
\max\left\{n_1,n_2\right\}$
|
||||
\[
|
||||
\mathbb{P}\left[F^{\lambda,
|
||||
g}\left(f^{w^{*,\tilde{\lambda}}}\right) \leq F^{\lambda,
|
||||
g}\left(f^{*,\lambda}_g\right)+2\varepsilon\right] > P, \forall
|
||||
n \in \mathbb{N}_{> n_3}.
|
||||
\]
|
||||
As ... is in ... and ... is optimal we know that
|
||||
\[
|
||||
F^{\lambda, g}\left(f^{*,\lambda}_g\right) \leq F^{\lambda, g}\left(f^{w^{*,\tilde{\lambda}}}\right)
|
||||
\]
|
||||
and thus get with the squeeze theorem
|
||||
\[
|
||||
\plimn F^{\lambda, g}\left(f^{w^{*,\tilde{\lambda}}}\right) = F^{\lambda, g}\left(f^{*,\lambda}_g\right).
|
||||
\]
|
||||
We can now use Lemma~\ref{lem:s7} to follow that
|
||||
\begin{equation}
|
||||
\plimn \norm{f^{w^{*,\tilde{\lambda}}} - f^{*,\lambda}_g}
|
||||
_{W^{1,\infty}} = 0.
|
||||
\label{eq:main2}
|
||||
\end{equation}
|
||||
Now by using the triangle inequality with Lemma~\ref{lem:s3} and
|
||||
(\ref{eq:main2}) we get
|
||||
\begin{align*}
|
||||
\plimn \norm{\mathcal{RN}^{*, \tilde{\lambda}} - f_g^{*,\lambda}}
|
||||
\leq& \plimn \bigg(\norm{\mathcal{RN}^{*, \tilde{\lambda}} -
|
||||
f_g^{w^{*,\tilde{\lambda}}}}_{W^{1,\infty}}\\
|
||||
&+ \norm{f^{w^{*,\tilde{\lambda}}} - f^{*,\lambda}_g}
|
||||
_{W^{1,\infty}}\bigg) = 0
|
||||
\end{align*}
|
||||
and thus have proven Theorem~\ref{theo:main1}.
|
||||
We now know that randomized shallow neural networks behave similarly to
spline regression if we regularize the size of the weights during
training.
\textcite{heiss2019} further explore a connection between ridge penalized
networks and randomized shallow neural networks which are only trained
for a certain number of epochs using gradient descent.
And ... that the effect of weight regularization can be achieved by
training for a certain amount of iterations this ... between adapted
weighted regression splines and randomized shallow neural networks
where training is stopped early.
|
||||
|
||||
\newpage
|
||||
\subsection{Simulations}
|
||||
@ -755,7 +914,7 @@ data have been generated.
|
||||
y_{i, A}^{\text{train}} &\coloneqq \sin( x_{i, A}^{\text{train}}). \phantom{(i - 1),
|
||||
i \in \left\{1, \dots, 6\right\}}
|
||||
\end{align*}
|
||||
\item $\text{data}_B = (x_{i, B}^{\text{train}}, y_{i,
|
||||
B}^{\text{train}})$ with
|
||||
\begin{align*}
|
||||
x_{i, B}^{\text{train}} &\coloneqq \pi\frac{i - 8}{7},
|
||||
@ -785,9 +944,9 @@ been calculated with Matlab's ..... As ... minimizes
|
||||
the smoothing parameter used for fitting is $\bar{\lambda} =
\frac{1}{1 + \lambda}$. The parameter $\tilde{\lambda}$ for training
the networks is chosen as defined in Theorem~\ref{theo:main1} and each
one is trained on the full training data for 5000 epochs using
gradient descent. The
results are given in Figure~\ref{fig:rs_vs_rs}; here it can be seen that in
the interval of the training data $[-\pi, \pi]$ the neural network and
smoothing spline are nearly identical, coinciding with the proposition.
|
||||
|
||||
|