% vim: tw=50
% 04/03/2023 09AM

\subsection{The Linear Model}

Data are pairs $(x_1, Y_1), \ldots, (x_n, Y_n)$, where $Y_i \in \RR$ are the ``responses'' (random) and $x_i \in \RR^p$ are the ``predictors'' (fixed).

\begin{example*}
$Y_i$: number of insurance claims for client $i$. $x_i$: (age, number of claims in 2021, years with driver's license, $\ldots$).
\end{example*}

\noindent In a linear model, we assume
\[ Y_i = \cancel{\alpha} + \beta_1 x_{i1} + \beta_2 x_{i2} + \cdots + \beta_p x_{ip} + \eps_i \]
where
\begin{itemize}
\item $\alpha$ is an intercept.
\item $\beta_1, \ldots, \beta_p$ are coefficients.
\item $\eps_1, \ldots, \eps_n$ are random noise variables.
\end{itemize}

\begin{remark*}
We normally remove the intercept by including a dummy predictor which is equal to $1$ for all $i$, i.e.\ $x_{i1} = 1$ for all $i = 1, \ldots, n$.
\end{remark*}

\begin{remark*}
We can also model non-linear relationships between $Y_i$ and $x_i$ using a linear model, for example by using $x_i = (\text{age}, \text{age}^2, \log(\text{age}))$.
\end{remark*}

\begin{remark*}
$\beta_j$ is the effect on $Y_i$ of increasing $x_{ij}$ by one unit, whilst keeping all other predictors constant. Estimates of $\beta$ should not be interpreted causally, unless we have a randomised experiment.
\end{remark*}

\noindent Matrix formulation:
\[ Y = \begin{pmatrix} Y_1 \\ \vdots \\ Y_n \end{pmatrix} \qquad X = \ub{ \begin{pmatrix} x_{11} & x_{12} & \cdots & x_{1p} \\ x_{21} & x_{22} & \cdots & x_{2p} \\ \vdots & \vdots & \ddots & \vdots \\ x_{n1} & x_{n2} & \cdots & x_{np} \end{pmatrix} }_{\text{``design matrix''}} \]
\[ \beta = \begin{pmatrix} \beta_1 \\ \vdots \\ \beta_p \end{pmatrix} \qquad \eps = \begin{pmatrix} \eps_1 \\ \vdots \\ \eps_n \end{pmatrix} \]
\[ Y = X\beta + \eps \]
Moment assumptions on $\eps$:
\begin{enumerate}[(1)]
\item $\EE\eps = 0 \implies \EE Y = X\beta$.
\item $\Var\eps = \sigma^2 I \implies \Var(\eps_i) = \sigma^2$ for all $i$ (``homoscedasticity'') and $\Cov(\eps_i, \eps_j) = 0$ for all $i \neq j$.
\end{enumerate}
We'll assume throughout that $X \in \RR^{n \times p}$ has full rank $p$. In particular, $p \le n$ (at least as many samples as predictors).

\subsubsection*{Least squares estimator}
$\hat{\beta}$ minimises the residual sum of squares
\begin{align*}
S(\beta) &= \|Y - X\beta\|^2 \\
&= \sum_{i = 1}^n (Y_i - x_i^\top \beta)^2
\end{align*}
This is a quadratic (positive definite) polynomial in $\beta$, so $\hat{\beta}$ satisfies
\[ \nabla S(\beta)|_{\beta = \hat{\beta}} = 0 \]
\[ \implies \left. \pfrac{S(\beta)}{\beta_k} \right|_{\beta = \hat{\beta}} = -2 \sum_{i = 1}^n x_{ik} \left( Y_i - \sum_{j = 1}^p x_{ij} \hat{\beta}_j \right) = 0 \]
for each $k = 1, \ldots, p$. Equivalent matrix form:
\[ X^\top X \hat{\beta} = X^\top Y \]
As $X$ has rank $p$, the matrix $X^\top X \in \RR^{p \times p}$ is invertible, hence
\[ \hat{\beta} = (X^\top X)^{-1} X^\top Y \]
(linear in $Y$!). Check:
\begin{align*}
\EE\hat{\beta} &= \EE[(X^\top X)^{-1} X^\top Y] \\
&= (X^\top X)^{-1} X^\top \EE Y \\
&= \cancel{(X^\top X)^{-1}} \cancel{X^\top X} \beta \\
&= \beta
\end{align*}
Hence $\hat{\beta}$ is unbiased. We can also calculate:
\begin{align*}
\Var(\hat{\beta}) &= \Var((X^\top X)^{-1} X^\top Y) \\
&= (X^\top X)^{-1} X^\top \Var(Y) X(X^\top X)^{-1} \\
&= (X^\top X)^{-1} X^\top \sigma^2 I X(X^\top X)^{-1} \\
&= \sigma^2 (X^\top X)^{-1}
\end{align*}
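\noindent As a quick numerical sanity check (a minimal sketch, assuming NumPy is available; the simulated numbers and variable names below are illustrative only), we can simulate data from the model and verify that solving the normal equations agrees with a standard least squares routine:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, p, sigma = 100, 3, 0.5

# Design matrix with a dummy first column of ones (the intercept trick above)
X = np.column_stack([np.ones(n), rng.normal(size=(n, p - 1))])
beta = np.array([1.0, 2.0, -0.5])          # "true" coefficients, chosen arbitrarily
Y = X @ beta + sigma * rng.normal(size=n)  # Y = X beta + eps, eps ~ N(0, sigma^2 I)

# Least squares estimator: solve the normal equations X^T X beta_hat = X^T Y
beta_hat = np.linalg.solve(X.T @ X, X.T @ Y)

# Same estimator via a library least squares routine
beta_lstsq, *_ = np.linalg.lstsq(X, Y, rcond=None)

print(beta_hat)                            # close to the true beta
print(np.allclose(beta_hat, beta_lstsq))   # True: both minimise ||Y - X beta||^2
\end{verbatim}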
\begin{flashcard}[Gauss-Markov-thm]
\begin{theorem*}[Gauss-Markov]
Let $\beta^* = CY$ be any linear estimator of $\beta$ which is unbiased. Then \cloze{for any $t \in \RR^p$,
\[ \Var(t^\top \hat{\beta}) \le \Var(t^\top \beta^*) \]}
We say $\hat{\beta}$ is \cloze{the ``Best Linear Unbiased Estimator'' (BLUE).}
\end{theorem*}
\end{flashcard}

\begin{hiddenflashcard}[gauss-markov-proof]
Proof of Gauss-Markov Theorem? \\
\cloze{
\begin{proof}
First show that it suffices to prove that $\Var \beta^* - \Var \hat{\beta}$ is positive semi-definite. \\
Now let $\beta^* = CY$, $\hat{\beta} = (X^\top X)^{-1} X^\top Y$ and let $A = C - (X^\top X)^{-1} X^\top$. Since both estimators are unbiased,
\[ \EE AY = \EE \beta^* - \EE \hat{\beta} = \beta - \beta = 0 \]
But $\EE AY = A \EE Y = AX\beta = 0$ holds for all $\beta \in \RR^p$, so $AX = 0$. Then
\begin{align*}
\Var \beta^* &= \Var((A + (X^\top X)^{-1} X^\top)Y) \\
&= (A + (X^\top X)^{-1} X^\top) \Var Y (A + (X^\top X)^{-1} X^\top)^\top \\
&= \sigma^2 (AA^\top + (X^\top X)^{-1} + 0 + 0) \\
&= \sigma^2 AA^\top + \Var(\hat{\beta})
\end{align*}
so $\Var \beta^* - \Var \hat{\beta} = \sigma^2 AA^\top$, which is positive semi-definite, as desired.
\end{proof}
}
\end{hiddenflashcard}

\begin{remark*}
Think of $t \in \RR^p$ as the value of the predictors for a new sample. Then $t^\top \hat{\beta}$ and $t^\top \beta^*$ are estimators of the mean response. Both are unbiased, so the MSE of each is its variance. The theorem says that the variance (and hence the MSE) is smallest for the least squares estimator.
\end{remark*}

\begin{proof}
For any $t \in \RR^p$,
\[ \Var(t^\top \beta^*) - \Var(t^\top \hat{\beta}) = t^\top (\Var \beta^* - \Var \hat{\beta}) t \]
This is $\ge 0$ for all $t \in \RR^p$ if and only if the matrix $\Var\beta^* - \Var\hat{\beta}$ is positive semi-definite, so it suffices to prove the latter.
Recall $\beta^* = CY$ and $\hat{\beta} = (X^\top X)^{-1} X^\top Y$. Let $A = C - (X^\top X)^{-1} X^\top$. Note:
\[ \EE AY = \EE \beta^* - \EE \hat{\beta} = \beta - \beta = 0 \]
(since $\beta^*$ and $\hat{\beta}$ are unbiased). But also note
\[ \EE AY = A \EE Y = AX\beta = 0 \]
for all $\beta \in \RR^p$, so we must have $AX = 0$. Then
\begin{align*}
\Var\beta^* &= \Var((A + (X^\top X)^{-1}X^\top)Y) \\
&= (A + (X^\top X)^{-1} X^\top ) \Var Y (A + (X^\top X)^{-1} X^\top )^\top \\
&= \sigma^2 (AA^\top + (X^\top X)^{-1} + \cancel{AX(X^\top X)^{-1}} + \cancel{(X^\top X)^{-1} X^\top A^\top}) \\
&= \sigma^2 AA^\top + \Var(\hat{\beta}) \\
\implies \Var \beta^* - \Var \hat{\beta} &= \sigma^2 AA^\top
\end{align*}
and this is positive semi-definite, as desired.
\end{proof}

\myskip
\noindent Fitted values and residuals: the fitted values are
\[ \hat{Y} = X\hat{\beta} = \ub{X(X^\top X)^{-1} X^\top}_{P \text{ ``hat matrix''}} Y \]
and the residuals are $Y - \hat{Y} = (I - P)Y$.

\begin{proposition*}
$P$ is the orthogonal projection onto $\col(X)$.
\end{proposition*}

\begin{proof}
$P$ is clearly symmetric. Also,
\[ P^2 = X(X^\top X)^{-1} \cancel{X^\top X} \cancel{(X^\top X)^{-1}} X^\top = P \]
Therefore $P$ is an orthogonal projection onto $\col(P)$. We need to show $\col(P) = \col(X)$. For any $a$, $Pa = X[(X^\top X)^{-1} X^\top a] \in \col(X)$. Also, if $b = Xc$ is a vector in $\col(X)$, then
\[ b = Xc = X(X^\top X)^{-1} X^\top Xc = Pb \in \col(P) \qedhere \]
\end{proof}

\begin{corollary*}
Fitted values are projections of $Y$ onto $\col(X)$. Residuals are projections of $Y$ onto $\col(X)^\perp$.
\begin{center}
\includegraphics[width=0.6\linewidth]{images/3ac39c2aba7311ed.png}
\end{center}
\end{corollary*}
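\noindent To make the projection picture concrete, here is a minimal numerical sketch (again assuming NumPy, with arbitrary simulated data) checking the defining properties of $P$:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)
n, p = 50, 3
X = np.column_stack([np.ones(n), rng.normal(size=(n, p - 1))])
Y = X @ np.array([1.0, 2.0, -0.5]) + 0.5 * rng.normal(size=n)

P = X @ np.linalg.solve(X.T @ X, X.T)    # hat matrix P = X (X^T X)^{-1} X^T
Y_hat = P @ Y                            # fitted values: projection of Y onto col(X)
resid = Y - Y_hat                        # residuals: (I - P) Y

print(np.allclose(P, P.T))               # P is symmetric
print(np.allclose(P @ P, P))             # P is idempotent: P^2 = P
print(np.allclose(P @ X, X))             # P fixes col(X)
print(np.allclose(X.T @ resid, 0.0))     # residuals are orthogonal to col(X)
\end{verbatim}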
\subsubsection*{Normal assumptions}
We assume, in addition to $\EE\eps = 0$ and $\Var\eps = \sigma^2 I$, that $\eps$ is multivariate normal (MVN), i.e.
\[ \eps \sim \normaldist(0, \sigma^2 I_n) \]
$\sigma^2$ is usually unknown, so the parameters in the model are $(\beta, \sigma^2)$. We'll see that the MLE of $\beta$ is the least squares estimator $\hat{\beta}$.
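\noindent As a one-line preview of this claim (the details follow later): under the normal assumption the log-likelihood is
\[ \ell(\beta, \sigma^2) = -\frac{n}{2} \log(2\pi\sigma^2) - \frac{1}{2\sigma^2} \|Y - X\beta\|^2, \]
so for any fixed $\sigma^2$, maximising over $\beta$ is the same as minimising the residual sum of squares $S(\beta) = \|Y - X\beta\|^2$, whose minimiser is $\hat{\beta}$.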