\documentclass[12pt,dvipsnames,addpoints]{exam}
%packages
\usepackage{amsmath}
\usepackage{derivative}
\usepackage{tikz}
\usepackage{hyperref}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{geometry}
\usepackage{titlesec}
\usepackage{relsize}
\usepackage{pgfplots}
\pgfplotsset{compat=1.18}
\usepackage{xcolor}
\usepackage{fancyvrb}
\VerbatimFootnotes
%redefs
\renewcommand\vec{\mathbf}
% package options
\hypersetup{
  colorlinks = true,
  linkcolor = RedViolet,
  citecolor = gray
}
\boxedpoints
\pointsinrightmargin
\setlength{\rightpointsmargin}{1.5cm}
\noprintanswers
\geometry{
  top = 20mm,
  bottom = 30mm,
  right = 30mm,
  left = 30mm
}
%titling
\title{\vspace*{-2cm} 23 Problems}
\author{Aayush Bajaj}
\date{\today\vspace*{-0.5cm}}
%pagination
\titleformat{\section}{\normalfont\Large\bfseries}{{\color{RedViolet}\S}}{0.5em}{}
\titleformat{\subsection}{\normalfont\large\bfseries}{{\large \color{RedViolet}\S\S}}{0.5em}{}
\titleformat{\subsubsection}{\normalfont\bfseries}{{\color{RedViolet}\S\S\S}}{0.5em}{}
\cfoot{\thepage}

\begin{document}
\maketitle
\dotfill
\tableofcontents
\dotfill

\section*{Rules}
\begin{enumerate}
  \item This year, I shall not advocate aversion from the internet.
  \item If your work is unpleasant to read and / or difficult to mark, I reserve the right to discard it.
  \item The boxed numbers in the right margin are marks.
  \item Deadline: \textit{11:59PM}, 31st of January 2025.
  \item Submission: \LaTeX{} appraised, hand-written accepted. \textsc{filename must be your full name!}
\end{enumerate}
\dotfill

\begin{center}
  \begin{tikzpicture}
    \node[draw, rounded corners=8pt, fill=red!30, inner sep=10pt] (submit_button) at (0,0) {\textbf{Submit}};
    \node[draw, rounded corners=8pt, fill=green!30, inner sep=10pt] (sauce_button) at (0,2) {\textbf{Sauce Code}};
    \def\suburl{https://abaj.ai/projects/bday-problems/upload}
    \def\sucurl{https://abaj.ai/projects/bday-problems/23rd}
    \node[anchor=center] (sub_link) at (submit_button.center) {\href{\suburl}{\phantom{Submit}}};
    \node[anchor=center] (sac_link) at (sauce_button.center) {\href{\sucurl}{\phantom{Sauce Code}}};
  \end{tikzpicture}
\end{center}

\newpage
\section{Problems}
\subsection{Appetisers}
\begin{questions}
\question The Newton-Raphson method.
\begin{parts}
  \part[2] Estimate $\sqrt{2}$ using 2 iterations of the Newton-Raphson method. Let $x_0 = 1$.\\
  Hint: The equation $f(x) = x^2 - 2$ may be helpful.
  \part[1] What does this method do, and how can it be used to find the optima of a function?
  \part[1] List one advantage and one disadvantage of this method in practice.\reversemarginpar \marginpar{\tiny Half mark each}
  \\Hint: Generalise to the multivariate matrix case.
\end{parts}

\question[2] Let $n$ be the number of people in a group. How large must $n$ be for 2 people in this group to share the same birthday with at least 50\% probability?
\begin{parts}
  \part[1] How large must $n$ be for 2 people to share the same birthday with 100\% probability?
\end{parts}

\question What kind of matrices possess an inverse? If the following matrices do, find them:
\begin{parts}
  \part[1] \[\begin{bmatrix}5&4&3\\3&2&1\end{bmatrix}\]
  \part[1] \[\begin{bmatrix}0&2&4\\2&4&6\\4&6&8\end{bmatrix}\]
  \part[1] \[\begin{bmatrix}4&3&2&1\\0&3&2&1\\0&0&2&1\\0&0&0&1\end{bmatrix}\]
  Recall that $[\,A \mid I\,] \leadsto [\,I \mid A^{-1}\,]$.
\end{parts}

\question Let $\vec{x} = \begin{bmatrix}1&2&3\end{bmatrix}^T$ and $\vec{y} = \begin{bmatrix}-2&1&3\end{bmatrix}^T$.
Compute the \emph{distance} between these two vectors by taking the \textbf{inner product} to be:
\begin{parts}
  \part[1] the familiar dot product: $\langle\vec{x},\vec{y}\rangle = \vec{x}^T \vec{y}$
  \part[1] $\langle\vec{x},\vec{y}\rangle = \vec{x}^T \vec{A}\vec{y},\quad \vec{A} = \begin{bmatrix}2&1&0\\1&2&-1\\0&-1&2\end{bmatrix}$
\end{parts}
Hint: Recall that $\|\vec{x}\| \coloneqq \sqrt{\langle \vec{x},\vec{x}\rangle}$ and that the distance is $d(\vec{x},\vec{y}) \coloneqq \|\vec{x}-\vec{y}\|$.

\question Matrix Decompositions
\begin{parts}
  \part[1] The trace of a matrix is equal to the \fillin[sum] of its eigenvalues.
  \part[1] The determinant of a matrix is equal to the \fillin[product] of its eigenvalues.
  \part[1] Find the eigenvalues of this $\mathbb{R}^{2\times 2}$ matrix: \[\begin{bmatrix}3&3\\7&-1\end{bmatrix}\]
  \part[1] \emph{Qualitatively} describe the differences between eigendecompositions and Singular Value Decompositions\footnote{mathematics is sufficient, although not necessary here.}
\end{parts}

\question[2] A matrix $A\in\mathbb{R}^{n\times n}$ is \emph{positive semi-definite}, denoted $A\succeq 0$, if $A=A^T$ and $x^T Ax \geq 0 \,\,\forall x\in \mathbb{R}^n$. Show that $A = zz^T$ is positive semi-definite for an arbitrary $z\in\mathbb{R}^n$.\phantomsection\label{q:psd}

\question Here are two ordinary, linear, inhomogeneous differential equations:
\begin{parts}
  \part[2] Find the exact solution to: \[\odv{y}{x}+\cos(x)y=\cos(x),\quad y\left(\frac{\pi}{2}\right)=-1\]
  \part[2] Find the general solution to: \[y''+14y' + 49y = e^t\]
\end{parts}

\question[6] List out the first four terms of the power series expansions of:
\begin{parts}
  \part $\exp{z}$
  \part $\sin{z}$
  \part $\cos{z}$
  \part $\sinh{z}$
  \part $\cosh{z}$
  \part $\frac{1}{1-x}$
  \part $\frac{1}{1+x}$
\end{parts}

\question[3] Solve the following recurrence relation with initial condition\footnote{be grateful for the ease of this recurrence; solving real ones in DEs is painful}:
\[a_{n+1} = \frac{2a_n}{n+3},\qquad a_0=1\]

\question[3] Compute the following determinant --- \emph{efficiently}.\phantomsection\label{q:notmymatrix}
\[\begin{vmatrix}
2&0&1&2&0\\
2&-1&0&1&1\\
0&1&2&1&2\\
-2&0&2&-1&2\\
2&0&0&1&1
\end{vmatrix}\]

\question[2] Express the derivative of $\mathlarger{\sigma(z) = \frac{1}{1+e^{-z}}}$, $\sigma'(z)$, as a function of $\sigma(z)$.

\question[5] Describe the following Probability Distributions (in a sentence or two), and give an example experiment for each:
\begin{parts}
  \part Bernoulli
  \part Binomial
  \part Gaussian
  \part Pareto
  \part Geometric
  \part Poisson
  \part Uniform
\end{parts}

\question[2] ``The principal point of proof is to compel belief'' --- Daniel Velleman. Keeping this in mind, explain to me, either mathematically or otherwise, why \[\binom{n}{r} = \frac{n!}{r!(n-r)!}\]
\end{questions}

\subsection[Main Course]{Main Course: Least Squares}
The following section is devoted to deriving closed forms for the univariate and multivariate least squares models. We will then go on to see how to massage this linear model to fit obviously non-linear relationships (Figure \hyperref[fig:lwr]{1}, third panel) and non-obvious non-linear relationships (Figure \hyperref[fig:kernel]{2}).
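As a taste of what is to come, here is a minimal sketch (assuming \texttt{numpy} is available; it is an illustration only, not part of any question) of how fits like the three panels of Figure \hyperref[fig:lwr]{1} could be generated: the same least-squares machinery is reused each time, and only the features change.

\begin{Verbatim}[fontsize=\small]
import numpy as np

# the scatter points from Figure 1
x = np.array([1, 3, 5, 7, 2, 4, 6, 8, 9, 3], dtype=float)
y = np.array([2, 4, 6, 8, 9, 3, 7, 1, 5, 6], dtype=float)

for degree in (1, 2, 5):
    # np.polyfit solves a least squares problem over the
    # polynomial features 1, x, ..., x^degree
    coeffs = np.polyfit(x, y, deg=degree)
    print(degree, np.round(coeffs, 3))
\end{Verbatim}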
\begin{figure}[ht]
  \begin{center}
    \begin{tikzpicture}
      % First plot with least squares fit
      \begin{scope}[xshift=0cm]
        \begin{axis}[
            width=5cm, % width of the plot
            height=5cm, % height of the plot
            xmin=0, xmax=10,
            ymin=0, ymax=10,
            grid=major,
            xticklabels={},yticklabels={}
          ]
          \addplot[only marks, mark=*, blue] coordinates {
            (1,2) (3,4) (5,6) (7,8) (2,9) (4,3) (6,7) (8,1) (9,5) (3,6)
          };
          \addplot[domain=0:10, samples=100, thick, black] {0.56*x + 2.8}; % Least squares line approximation
        \end{axis}
      \end{scope}
      % Second plot with a quadratic (degree-2) fit
      \begin{scope}[xshift=6cm]
        \begin{axis}[
            width=5cm,
            height=5cm,
            xmin=0, xmax=10,
            ymin=0, ymax=10,
            grid=major,
            xticklabels={},yticklabels={}
          ]
          \addplot[only marks, mark=*, blue] coordinates {
            (1,2) (3,4) (5,6) (7,8) (2,9) (4,3) (6,7) (8,1) (9,5) (3,6)
          };
          \addplot[domain=0:10, samples=100, thick, black] {0.1*x^2 - 0.5*x + 4}; % Quadratic fit
        \end{axis}
      \end{scope}
      % Third plot with overfitting to fifth degree
      \begin{scope}[xshift=12cm]
        \begin{axis}[
            width=5cm,
            height=5cm,
            xmin=0, xmax=10,
            ymin=0, ymax=10,
            grid=major,
            xticklabels={},yticklabels={}
          ]
          \addplot[only marks, mark=*, blue] coordinates {
            (1,2) (3,4) (5,6) (7,8) (2,9) (4,3) (6,7) (8,1) (9,5) (3,6)
          };
          \addplot[domain=0:10, samples=100, thick, black] {-0.02*x^5 + 0.25*x^4 - 1.2*x^3 + 2.5*x^2 - 0.3*x + 5}; % Overfitting to fifth degree
        \end{axis}
      \end{scope}
    \end{tikzpicture}
  \end{center}
  \caption{Linear regression with polynomial features of increasing degree}
  \label{fig:lwr}
\end{figure}

\begin{figure}[ht]
  \begin{center}
    \begin{tikzpicture}
      \begin{axis}[
          width=8cm,
          height=8cm,
          axis equal,
          tick style={draw=none},
          xtick=\empty,
          ytick=\empty
        ]
        % Outer circle
        \foreach \angle in {0,5,...,360} {
          \pgfmathsetmacro{\x}{cos(\angle) + 0.08*rand}
          \pgfmathsetmacro{\y}{sin(\angle) + 0.08*rand}
          \addplot[only marks, mark=*, mark size=1pt, blue] coordinates {(\x, \y)};
          \pgfmathsetmacro{\x}{cos(\angle) + 0.08*rand}
          \pgfmathsetmacro{\y}{sin(\angle) + 0.08*rand}
          \addplot[only marks, mark=*, mark size=1pt, orange] coordinates {(\x, \y)};
        }
        % Inner circle
        \foreach \angle in {0,5,...,360} {
          \pgfmathsetmacro{\x}{0.4*cos(\angle) + 0.08*rand}
          \pgfmathsetmacro{\y}{0.4*sin(\angle) + 0.08*rand}
          \addplot[only marks, mark=*, mark size=1pt, orange] coordinates {(\x, \y)};
          \pgfmathsetmacro{\x}{0.4*cos(\angle) + 0.08*rand}
          \pgfmathsetmacro{\y}{0.4*sin(\angle) + 0.08*rand}
          \addplot[only marks, mark=*, mark size=1pt, blue] coordinates {(\x, \y)};
        }
      \end{axis}
    \end{tikzpicture}
  \end{center}
  \caption{Two concentric classes that are not linearly separable in the plane}
  \label{fig:kernel}
\end{figure}

\subsubsection{Matrix Calculus}
\begin{questions}
\setcounter{question}{13}
\question A \emph{univariate} linear regression model is a model that tries to predict an outcome based on just a single independent variable: $x$. The model takes in data in the form of tuples and constructs a prediction (first plot, Figure \hyperref[fig:lwr]{1}) that we can use either to interpolate or extrapolate values of the dependent variable. The question then becomes: what metric should we use to determine this line? In practice we construct a loss function $\mathcal{L}(\vec{w}) = \frac{1}{n}\sum_{i=1}^{n} (y_i - y_i^*)^2$.
This takes each data point we were given, $y_i$, and squares the difference between it and our linear model's prediction $y_i^* = w_0 + w_1 x_i$.\footnote{the \verb|*| indicates \emph{our} best estimate} Our loss function is then equivalently
\[\mathcal{L}(\vec{w}) = \frac{1}{n}\sum_{i=1}^n (y_i - (w_0 + w_1 x_i))^2\]
\begin{parts}
  \part[2] Obviously we want to minimise this loss function to achieve the best possible linear model\footnote{in the sense that the least squares estimate of the weight parameters will have the smallest variance amongst all linear unbiased estimates}. We will see why this \emph{Mean-Squared Error} (MSE) is a good choice in a later section, but for now show that the $w_0$ which minimises this loss function is equal to $\overline{y} - w_1 \overline{x}$. \emph{Recall that $\overline{a} = \frac{1}{n}\sum_{i=1}^{n}a_i$.}
  \part[2] Repeat the steps by taking the partial derivative w.r.t.\ $w_1$ and showing that the minimum occurs at $\mathlarger{\frac{\overline{xy}-w_0 \overline{x}}{\overline{x^2}}}$.
  \part[1] \reversemarginpar\marginpar{\tiny the hat above the $y$ signifies \emph{the} best estimate}Solve for the above weights simultaneously to rewrite the univariate regression line $\hat{y}(x) = w_0 + w_1 x$ in terms of the data only.
\end{parts}

\question We now extend this result to the multivariate case of $p$ features\footnote{this is how many independent variables we have} with $n$ feature vectors\footnote{this is how many data points we have}: $\vec{x_1}, \vec{x_2}, \ldots, \vec{x_n}$. So the $i^\text{th}$ feature vector $\vec{x_i}$ has $p$ entries:
\[\vec{x_i} = \begin{bmatrix}x_{i,0}\\x_{i,1}\\\vdots\\x_{i, p-1}\end{bmatrix},\]
where we adopt the convention $x_{i,0} = 1$ so that $w_0$ acts as the intercept. Our model still outputs a scalar $y$, but it now takes in these richer feature vectors and applies a weight $w_0, \ldots, w_{p-1}$ to each entry. It makes sense to stack these feature vectors and construct a design matrix \mbox{$X\in\mathbb{R}^{n\times p}$}:
\[\vec{X} = \begin{bmatrix}\vec{x_1}^T\\\vec{x_2}^T\\\vdots\\\vec{x_n}^T\end{bmatrix} = \begin{bmatrix}x_{1,0}&x_{1,1}&\dots&x_{1,p-1}\\x_{2,0}&x_{2,1}&\dots&x_{2,p-1}\\\vdots&\vdots&\ddots&\vdots\\x_{n,0}&x_{n,1}&\dots&x_{n,p-1}\end{bmatrix}\]
You should verify for yourself that the multivariate model now looks like:
\begin{equation}\label{eq:yi}\hat{y}_i = w_0 + w_1 x_{i,1} + w_2 x_{i,2} + \ldots + w_{p-1}x_{i,p-1}\end{equation}
And thus our previously manageable loss function transforms into
\begin{align}
  \mathcal{L}(\vec{w}) &= \frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)^2\\
  &= \frac{1}{n} \sum_{i=1}^n \left(y_i - (w_0 + w_1 x_{i,1} + w_2 x_{i,2} + \ldots + w_{p-1}x_{i,p-1})\right)^2
\end{align}
It should now be clear that taking partials with respect to each of the $p$ weights one at a time is tedious, so we instead leverage matrix notation and calculus:
\[\mathcal{L}(\vec{w}) = \frac{1}{n} \|\vec{y}-\vec{X}\vec{w}\|_2^2\]
where $\|\cdot\|_2$ denotes the L2 norm, which satisfies $\|\vec{x}\|^2_2 = \vec{x}^T \vec{x}$.
\begin{parts}
  \part[1] Begin by showing that $\nabla_\vec{x} \vec{b}^T \vec{x} = \vec{b}$, where $\vec{b}, \vec{x} \in \mathbb{R}^n$. \reversemarginpar\marginpar{\tiny\textsc{do not fear summation expansions!}}
  \part[1] Next show that $\nabla_\vec{x} \vec{x}^T \vec{A}\vec{x} = 2\vec{A}\vec{x}$, with symmetric $\vec{A}\in\mathbb{R}^{n\times n}$ (i.e.\ $\vec{A}=\vec{A}^T$) and $\vec{x}\in\mathbb{R}^n$.
  \part[2] Finally, by leveraging the above rules, find $\nabla_\vec{w} \mathcal{L}(\vec{w})$.
  \part[1] Set the above gradient to 0, and thus find the critical point.
\end{parts}
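If the two gradient identities above feel slippery, here is a minimal sketch (assuming \texttt{numpy}; the dimensions and seed are arbitrary) that checks them numerically with central finite differences --- it verifies the results without giving the summation-expansion derivations away.

\begin{Verbatim}[fontsize=\small]
import numpy as np

rng = np.random.default_rng(0)
n = 4
b = rng.standard_normal(n)
A = rng.standard_normal((n, n))
A = A + A.T              # the 2Ax identity assumes a symmetric A
x = rng.standard_normal(n)

def num_grad(f, x, h=1e-6):
    # central finite differences, one coordinate at a time
    g = np.zeros_like(x)
    for i in range(len(x)):
        e = np.zeros_like(x)
        e[i] = h
        g[i] = (f(x + e) - f(x - e)) / (2 * h)
    return g

print(np.allclose(num_grad(lambda v: b @ v, x), b))              # grad of b^T x is b
print(np.allclose(num_grad(lambda v: v @ A @ v, x), 2 * A @ x))  # grad of x^T A x is 2Ax
\end{Verbatim}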
\question[1] Whilst a critical point is a necessary condition for a minimum, it is not sufficient. Given that any critical point of a convex function \emph{is} a minimum, show that the Hessian of $\mathcal{L}(\vec{w})$ is positive semi-definite.
\\Recall the definition from \hyperref[q:psd]{Question 6}.

\question[2] Wielding this Hessian, apply the Newton-Raphson method to this problem to find out how many iterations it takes to find the optimal solution. Take $\vec{w}_0$ as an arbitrary initial guess.
\end{questions}

\subsubsection{Statistics}
In this section we will justify the use of the Mean-Squared Error that we befriended in the previous section. We will show \emph{mathematically} that it is the natural loss function under the assumption that the noise in our samples is \emph{normally distributed}.
\begin{questions}
\setcounter{question}{17}
\question Given a likelihood function parametrised by $\theta$, $\mathcal{L}(\theta)$, we want to choose the parameter that gives us the highest possible likelihood of observing the data. In other words, we wish to find the \emph{maximum likelihood estimator} (MLE):
\[\hat{\theta}_{\text{MLE}} = \operatorname*{arg\,max}_{\theta\in\Theta} \mathcal{L}(\theta)\]
\begin{parts}
  \part[3] Assuming $X_1, \ldots, X_n \overset{\text{i.i.d.}}{\sim}$ Bernoulli($p$), compute $\hat{p}_{\text{MLE}}$. Recall that the Bernoulli Distribution is discrete and has probability mass function:
  \[\mathbb{P}(X=k) = p^k(1-p)^{1-k}, \qquad k= 0,1,\quad p\in[0,1]\]
  \part[4] Assume that $X_1, \ldots, X_n \overset{\text{i.i.d.}}{\sim} \mathcal{N}(\mu, \sigma^2)$. Compute $(\hat{\mu}_{\text{MLE}}, \hat{\sigma}^2_{\text{MLE}})$.
\end{parts}

\question[5] Having practised finding MLEs, let us now rewrite equation \eqref{eq:yi} as
\begin{align*}
  \hat{\vec{y}} &= \vec{X}\vec{w}\\
  \implies \hat{y}^{(i)} &= \vec{x^{(i)}}^T\vec{w}
\end{align*}
And now the punchline: Let us assume that each of these data points has a degree of noise in it. Furthermore, we will assume that this noise is normally distributed with zero mean and variance $\sigma^2$:
\[y^{(i)} = \vec{x^{(i)}}^T\vec{w} + \epsilon^{(i)},\qquad \epsilon^{(i)}\sim \mathcal{N}(0, \sigma^2)\]
Your task is to write down the log-likelihood and the maximum likelihood estimation objective, and then solve for the maximum likelihood estimator $\hat{w}_{\text{MLE}}$. Note: you may assume the errors are independent and identically distributed.
\end{questions}

\subsubsection{Non-linearities}
\begin{questions}
\setcounter{question}{19}
\question[2] In this question we will explore the capacity of a linear model to fit \textbf{polynomial} relationships with a trick known as \emph{Locally Weighted Regression}. Locally Weighted Regression (LWR) is a non-parametric regression technique where weights are assigned to the data points based on their ``closeness'' to the query point. The weighted linear regression model minimises a weighted version of the squared loss:
\[ \mathcal{L}(\vec{w}) = \sum_{i=1}^n w^{(i)}\left(y^{(i)} - \vec{x^{(i)}}^T\vec{w}\right)^2, \]
where the point weights $w^{(i)}$ (scalars, not to be confused with the parameter vector $\vec{w}$) are given by:
\[ w^{(i)} = \exp\left(-\frac{\|\vec{x}^{(i)} - \vec{x}\|^2}{2\tau^2}\right). \]
Here, \(\tau > 0\) is a bandwidth parameter controlling the locality of the regression.
\begin{parts}
  \part[1] Explain the difference between a parametric and a non-parametric model.
  \part[2] Derive the closed-form solution for the weight vector \(\vec{w}\) by solving the weighted least squares minimisation problem.
\begin{solution}
Closed-form solution for \(\vec{w}\): We minimise the weighted least squares objective:
\[ \mathcal{L}(\vec{w}) = \sum_{i=1}^n w^{(i)}\left(y^{(i)} - \vec{x^{(i)}}^T\vec{w}\right)^2. \]
In matrix form, with \(W = \text{diag}(w^{(1)}, \ldots, w^{(n)})\), expand and differentiate w.r.t.\ \(\vec{w}\):
\[ \mathcal{L}(\vec{w}) = \vec{y}^T W \vec{y} - 2 \vec{y}^T W X \vec{w} + \vec{w}^T X^T W X \vec{w}. \]
Set the gradient to 0:
\[ -2X^T W \vec{y} + 2X^T W X \vec{w} = 0 \implies \vec{w} = \left(X^T W X\right)^{-1} X^T W \vec{y}. \]
\end{solution}

  \part[2] For the dataset:
  \[ \{(x^{(i)}, y^{(i)})\} = \{(-1, 1), (0, 0), (1, 1)\}, \]
  use Locally Weighted Regression with \(\tau = 1\) to compute the predicted value \(\hat{y}\) at \(x = 0.5\).
\begin{solution}
Compute \(\hat{y}\) at \(x = 0.5\). Given data:
\[ \{(x^{(i)}, y^{(i)})\} = \{(-1, 1), (0, 0), (1, 1)\}, \quad \tau = 1, \quad \vec{x} = [1, x]. \]
1. \textbf{Weight calculation:}
\[ w^{(i)} = \exp\left(-\frac{(x^{(i)} - 0.5)^2}{2(1)^2}\right), \]
giving:
\[ w^{(1)} = \exp\left(-\frac{(-1 - 0.5)^2}{2}\right) = \exp(-1.125), \]
\[ w^{(2)} = \exp\left(-\frac{(0 - 0.5)^2}{2}\right) = \exp(-0.125), \]
\[ w^{(3)} = \exp\left(-\frac{(1 - 0.5)^2}{2}\right) = \exp(-0.125). \]
Let:
\[ W = \text{diag}([w^{(1)}, w^{(2)}, w^{(3)}]). \]
2. \textbf{Matrix setup:}
\[ X = \begin{bmatrix} 1 & -1 \\ 1 & 0 \\ 1 & 1 \end{bmatrix}, \quad \vec{y} = \begin{bmatrix} 1 \\ 0 \\ 1 \end{bmatrix}. \]
3. \textbf{Weighted least squares solution:} Compute:
\[ \vec{w} = \left(X^T W X\right)^{-1} X^T W \vec{y}. \]
4. \textbf{Predicted value:} Evaluate:
\[ \hat{y} = \vec{x} \cdot \vec{w}. \]
Plugging in the numbers yields:
\[ \hat{y} \approx 0.63. \]
\end{solution}
\end{parts}

\question[1] What is the $n$-dimensional analogue of a line called?

\question[3] We will now see how it is possible to construct a linear decision boundary for data which looks as tangled as \hyperref[fig:kernel]{Figure 2}. The concentric circles have been randomly coloured, but the point of the task is to colour them correctly.
\begin{solution}
\begin{center}
  \begin{tikzpicture}
    \begin{axis}[
        width=8cm,
        height=8cm,
        axis equal,
        xlabel={$x_1$},
        ylabel={$x_2$},
        title={Randomly Labelled Points},
        tick style={draw=none}, % Remove ticks
        xtick=\empty,
        ytick=\empty
      ]
      % Orange points
      \pgfplotsinvokeforeach{1,...,200}{
        \addplot[only marks, mark=*, mark size=1pt, orange] coordinates { ({rand*2-1}, {rand*2-1}) };
      }
      % Blue points
      \pgfplotsinvokeforeach{1,...,200}{
        \addplot[only marks, mark=*, mark size=1pt, blue] coordinates { ({rand*2-1}, {rand*2-1}) };
      }
    \end{axis}
  \end{tikzpicture}
\end{center}
\end{solution}
Your task is simple: submit a sketch or plot, in 2 or 3 dimensions, that separates this data. This task may require some research into kernel methods, which project the data into higher dimensions and allow you to linearly separate the data in at least two of these dimensions.

\question[2] Finally, because all of the above methods are prone to overfitting, find the closed-form solution of multivariate least squares subject to L2 regularisation, controlled by the hyperparameter $\lambda$:
\[w_{\text{ridge}} = \operatorname*{arg\,min}_{w} \|\vec{y}-\vec{X}\vec{w}\|^2_2 + \lambda\|\vec{w}\|^2_2\]
Note: Ridge regression is another term for L2-regularised least squares.
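As a companion to the figure below, here is a minimal sketch (assuming \texttt{scikit-learn} is installed; the noisy data is taken from the figure) showing the effect of the penalty on a degree-six polynomial fit. The penalty strength $\lambda$ is called \texttt{alpha} in scikit-learn; the closed form itself is left for you to derive.

\begin{Verbatim}[fontsize=\small]
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures

# noisy data from the figure below
x = np.array([-1.0, -0.8, -0.6, -0.4, -0.2, 0.0,
               0.2,  0.4,  0.6,  0.8,  1.0])
y = np.array([-1.0, -0.5,  1.0,  0.2, -0.8, 0.1,
               0.9, -0.5,  1.5, -0.7,  0.3])

# degree-six polynomial features of a single input
poly = PolynomialFeatures(degree=6, include_bias=False)
X = poly.fit_transform(x.reshape(-1, 1))

plain = LinearRegression().fit(X, y)   # unregularised: prone to overfitting
ridge = Ridge(alpha=1.0).fit(X, y)     # L2 penalty shrinks the coefficients

print(np.round(plain.coef_, 2))
print(np.round(ridge.coef_, 2))
\end{Verbatim}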
\enlargethispage{1cm}
\begin{figure}[ht]
  \centering
  \begin{tikzpicture}[scale=0.8]
    % First plot: overfitting
    \begin{scope}[xshift=0cm]
      \begin{axis}[
          width=7cm,
          height=6cm,
          title={Overfitting (no regularisation)},
          ymin=-2.5, ymax=2.5,
          xmin=-1.1, xmax=1.1,
          grid=major,
          legend pos=north west
        ]
        % Noisy data
        \addplot+[only marks, mark=o, color=blue] coordinates {
          (-1.0, -1.0) (-0.8, -0.5) (-0.6, 1.0) (-0.4, 0.2) (-0.2, -0.8) (0, 0.1)
          (0.2, 0.9) (0.4, -0.5) (0.6, 1.5) (0.8, -0.7) (1.0, 0.3)
        };
        % Overfit polynomial
        \addplot+[domain=-1:1, smooth, thick, color=red] { -0.4 + 2.8*x - 4.5*x^2 + 3.5*x^3 - 1.8*x^4 + 0.4*x^5 + 0.2*x^6 };
        \addlegendentry{\small Sixth-degree polynomial}
      \end{axis}
    \end{scope}
    % Second plot: regularised fit
    \begin{scope}[xshift=8cm]
      \begin{axis}[
          width=7cm,
          height=6cm,
          title={With L2 regularisation},
          ymin=-2.5, ymax=2.5,
          xmin=-1.1, xmax=1.1,
          grid=major,
          legend pos=north west
        ]
        % Noisy data
        \addplot+[only marks, mark=o, color=blue] coordinates {
          (-1.0, -1.0) (-0.8, -0.5) (-0.6, 1.0) (-0.4, 0.2) (-0.2, -0.8) (0, 0.1)
          (0.2, 0.9) (0.4, -0.5) (0.6, 1.5) (0.8, -0.7) (1.0, 0.3)
        };
        % Regularised polynomial
        \addplot+[domain=-1:1, smooth, thick, color=green] { -0.3 + 1.0*x - 1.2*x^2 + 0.6*x^3 - 0.3*x^4 + 0.1*x^6 };
        \addlegendentry{\small Regularised sixth-degree polynomial}
      \end{axis}
    \end{scope}
  \end{tikzpicture}
  \caption{Degree-six fits without and with L2 regularisation, for conceptual benefit}
\end{figure}
\end{questions}

\newpage
\section{Grade Table}
\hspace{-0.5cm}\multirowgradetable{2}[questions]

\section{References}
CS229 Problem Set 0; \emph{Mathematics for Machine Learning} (book); CS229 lecture notes; CS9417 lab code and tutorials. The differential equations questions were taken from a UNSW mathematics test. I created the matrix problems, excluding \hyperref[q:notmymatrix]{Q10}; the remaining problems are simply classical.

\end{document}