correcting a few typos #10

Merged: 1 commit, Feb 20, 2021
Binary file modified .DS_Store
Binary file modified w05_gps/.DS_Store
Binary file modified w05_gps/t01_bayes_lm.pdf
w05_gps/t01_bayes_lm.tex: 2 changes (1 addition, 1 deletion)
@@ -213,7 +213,7 @@

\item Based on the posterior distribution,
$\thetab \mid \Xmat, \ydat \sim \normaldist(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\ydat, \bm{A}^{-1})$,
we can derive the predictive distribution for a new observations $\x_*$.
we can derive the predictive distribution for a new observation $\x_*$.

\lz
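
For reference, the posterior and predictive formulas in this hunk compute directly. A minimal NumPy sketch, assuming A = X^T X / sigma^2 + Sigma_p^{-1} as in the standard Bayesian linear model derivation (the excerpt does not define A, and all names below are illustrative):

```python
import numpy as np

def bayes_lm_predict(X, y, x_star, sigma2=1.0, Sigma_p=None):
    """Posterior of theta and predictive distribution at a new input x_*.

    Assumes A = X^T X / sigma^2 + Sigma_p^{-1}; the slide excerpt does
    not define A, so this follows the standard Bayesian LM derivation.
    """
    p = X.shape[1]
    if Sigma_p is None:
        Sigma_p = np.eye(p)                    # assumed unit Gaussian prior on theta
    A = X.T @ X / sigma2 + np.linalg.inv(Sigma_p)
    A_inv = np.linalg.inv(A)
    theta_mean = A_inv @ X.T @ y / sigma2      # posterior mean of theta
    f_mean = x_star @ theta_mean               # predictive mean  x_*^T theta_mean
    f_var = x_star @ A_inv @ x_star            # predictive variance x_*^T A^{-1} x_*
    return theta_mean, A_inv, f_mean, f_var
```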

Binary file modified w05_gps/t02_basic.pdf
Binary file modified w05_gps/t03_covariance.pdf
w05_gps/t03_covariance.tex: 4 changes (2 additions, 2 deletions)
@@ -149,7 +149,7 @@
\begin{itemize}
\item $k$ is \textbf{stationary} if it depends only on $\bm{d} =\x -\x^\prime$ and is denoted by $k(\bm{d})$.

\item $k$ is \textbf{isotropic} if it depends only on $r = \|\x - \x^\prime\|$and is denoted by $k(r)$.
\item $k$ is \textbf{isotropic} if it depends only on $r = \|\x - \x^\prime\|$ and is denoted by $k(r)$.

\item $k$ is a \textbf{dot product} if it depends only on $\x^T \x^\prime$.
\end{itemize}
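
As a concrete illustration of the three kernel classes listed in this hunk, a small sketch; the squared-exponential forms are my choice of standard examples, not taken from the slides (the squared exponential happens to be both stationary and isotropic):

```python
import numpy as np

# Stationary: depends only on d = x - x' (squared-exponential form).
def k_stationary(x, x_prime, ls=1.0):
    d = x - x_prime
    return np.exp(-0.5 * np.sum(d ** 2) / ls ** 2)

# Isotropic: depends only on r = ||x - x'||.
def k_isotropic(x, x_prime, ls=1.0):
    r = np.linalg.norm(x - x_prime)
    return np.exp(-0.5 * (r / ls) ** 2)

# Dot product: depends only on x^T x'.
def k_dot(x, x_prime):
    return x @ x_prime
```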
@@ -284,7 +284,7 @@
\vspace{2mm}
\item If $\ls_i$ is very large, the covariance will become almost independent of that input, effectively removing it from inference.
\vspace{2mm}
\item If the features are on different scales, the data can be automatically \textbf{rescaled} by estimating $\ls_1,\dots, \ls_p$
\item If the features are on different scales, the data can be automatically \textbf{rescaled} by estimating $\ls_1,\dots, \ls_p$.

\end{itemize}
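
The automatic rescaling described in this hunk is what an ARD (automatic relevance determination) kernel implements; a minimal sketch under the assumption of a squared-exponential form, with `ls` standing in for the lengthscales l_1, ..., l_p:

```python
import numpy as np

def k_ard(x, x_prime, ls):
    """Squared-exponential kernel with one lengthscale per input dimension.

    ls[i] plays the role of l_i in the slides: a very large ls[i]
    effectively removes feature i from the covariance.
    """
    z = (x - x_prime) / ls          # rescale each feature by its lengthscale
    return np.exp(-0.5 * np.sum(z ** 2))
```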

Binary file modified w05_gps/t04_prediction.pdf
w05_gps/t04_prediction.tex: 2 changes (1 addition, 1 deletion)
@@ -128,7 +128,7 @@

\item As the posterior is Gaussian, the maximum a-posteriori estimate (i.e., the mode of the posterior distribution) is:

\large $$\bm{k}_{*}^{T}\Kmat^{-1}\bm{f}$$.
\large $$\bm{k}_{*}^{T}\Kmat^{-1}\bm{f}.$$

\end{itemize}
\end{frame}
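
The MAP estimate k_*^T K^{-1} f from this hunk can be computed with a Cholesky solve rather than an explicit inverse; a short sketch (helper names are mine):

```python
import numpy as np
from scipy.linalg import cho_factor, cho_solve

def gp_map_estimate(K, k_star, f):
    """MAP estimate k_*^T K^{-1} f of the GP posterior at a test point.

    K:      n x n kernel matrix of the training inputs
    k_star: n-vector of kernel values k(x_i, x_*)
    f:      n-vector of function values at the training inputs
    """
    # Solve K alpha = f via Cholesky instead of forming K^{-1} explicitly.
    alpha = cho_solve(cho_factor(K, lower=True), f)
    return k_star @ alpha
```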
Binary file modified w05_gps/t05_training.pdf
Binary file modified w05_gps/t06_mean.pdf
Binary file modified w05_gps/t07_covariance_adv.pdf
Binary file modified w05_gps/t08_additional.pdf
Binary file modified w05_gps/t09_classification.pdf
w05_gps/t09_classification.tex: 11 changes (6 additions, 5 deletions)
@@ -87,7 +87,7 @@

\vspace{15mm}

\item Frm the GP assumption, we can assert that $p(\bm{f}\mid\Xmat) \sim \mathcal{N}\left(0, \bm{K}\right)$. Hence, we have:
\item From the GP assumption, we can assert that $p(\bm{f}\mid\Xmat) \sim \mathcal{N}\left(0, \bm{K}\right)$. Hence, we have:
\vspace{-2mm}
$$\log p(\bm{f}\mid\Xmat, y) \propto \log p(\ydat\mid\bm{f}) - \frac{1}{2} \bm{f}^\top \bm{K}^{-1} \bm{f} - \frac{1}{2} \log |\bm{K}| - \frac{n}{2} \log 2 \pi.$$
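
A sketch of this unnormalized log posterior as a function; the excerpt does not fix the likelihood, so a logistic (Bernoulli) likelihood for labels in {-1, +1} is assumed here:

```python
import numpy as np

def gp_log_posterior(f, K, y):
    """log p(f | X, y) up to a constant, matching the formula above.

    The likelihood log p(y | f) is taken to be logistic (Bernoulli) with
    labels y in {-1, +1}, an assumption not fixed by the excerpt.
    """
    log_lik = -np.sum(np.log1p(np.exp(-y * f)))   # sum_i log sigmoid(y_i * f_i)
    _, logdet = np.linalg.slogdet(K)
    quad = -0.5 * f @ np.linalg.solve(K, f)
    return log_lik + quad - 0.5 * logdet - 0.5 * len(f) * np.log(2 * np.pi)
```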

@@ -123,16 +123,17 @@
\vspace{-2mm}
$$\frac{1}{2} \|\thetab\|^2 + C \sumin \loss(\yI{i},f(\xI{i})),$$

\vspace{-2mm}
where $\loss(\y,f(\x)) = \max\{0, 1-f(\x)\cdot\y\}$ is the Hinge loss.


\vspace{.5cm}
\item Plugging that in, the optimization objective would be:
\vspace{-2mm}
$$\frac{1}{2} \bm{f}^\top \bm{K}^{-1} \bm{f} + C \sumin \loss(\yI{i},f(\xI{i})),$$
\vspace{-2mm}
where $\loss(\y,f(\x)) = \max\{0, 1-f(\x)\cdot\y\}$ is the Hinge loss.
$$\frac{1}{2} \bm{f}^\top \bm{K}^{-1} \bm{f} + C \sumin \loss(\yI{i},f(\xI{i})).$$


\vspace{1cm}
\vspace{.1cm}
\item From the representer theorem: $\thetab = \sumin \beta_i\,\yI{i} k\left(\xI{i}, \cdot \right)$, and thus:
\vspace{-2mm}
$$\thetab^\top \thetab = \beta^\top \bm{K} \beta = \bm{f}^\top \bm{K}^{-1} \bm{f}$$.
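
A small sketch evaluating this kernelized objective via the identity f^T K^{-1} f = beta^T K beta, with f = K beta (the optimization itself is omitted; names are illustrative):

```python
import numpy as np

def hinge_loss(y, f):
    """L(y, f(x)) = max(0, 1 - f(x) * y) for labels y in {-1, +1}."""
    return np.maximum(0.0, 1.0 - f * y)

def gp_svm_objective(beta, K, y, C=1.0):
    """0.5 * f^T K^{-1} f + C * sum_i L(y_i, f(x_i)), with f = K beta,
    so that f^T K^{-1} f = beta^T K beta and no inverse is needed."""
    f = K @ beta
    return 0.5 * beta @ K @ beta + C * np.sum(hinge_loss(y, f))
```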