% vim: tw=50
% 09/03/2023 11AM
\subsubsection*{Normal linear model} Take $Y = X\beta + \eps$, $\eps \sim \normaldist(0, \sigma^2 I)$. MLE: 2 parameters: $\beta \in \RR^p$, $\sigma^2 \in \RR_+$. Log-likelihood: \[ l(\beta, \sigma^2) = \text{const} - \frac{n}{2} \log \sigma^2 - \frac{1}{2\sigma^2} \|Y - X\beta\|^2 \] For any $\sigma^2 > 0$, we can see that $l(\beta, \sigma^2)$ is maximised as a function of $\beta$ at the minimiser of $\|Y - X\beta\|^2$, i.e. the least squares estimator $\hat{\beta}$. Now find: \[ \arg \max_{\sigma^2 \ge 0} l(\hat{\beta}, \sigma^2) \] \[ l(\hat{\beta}, \sigma^2) = \text{const} - \frac{n}{2} \log \sigma^2 - \frac{1}{2\sigma^2}\|Y - X\hat{\beta}\|^2 \] As $\sigma^2 \mapsto l(\hat{\beta}, \sigma^2)$ is concave, there is a unique maximiser where $\pfrac{l(\hat{\beta}, \sigma^2)}{\sigma^2} = 0$ \[ \implies \hat{\sigma}^2 = \frac{\|Y - X\hat{\beta}\|^2}{n} = \frac{\|(I - P)Y\|^2}{n} \] \begin{theorem*} \begin{enumerate}[(1)] \item $\hat{\beta} \sim \normaldist(\beta, \sigma^2 (X^\top X)^{-1})$ \item $\frac{\hat{\sigma}^2}{\sigma^2}n \sim \chi_{n - p}^2$ \item $\hat{\beta}$, $\hat{\sigma}^2$ are independent(!) \end{enumerate} \end{theorem*} \begin{proof} $\hat{\beta}$ is linear in $Y$, hence MVN. We already know $\EE\hat{\beta} = \beta$, $\Var \hat{\beta} = \sigma^2 (X^\top X)^{-1}$. This proves (1). For (2) note \begin{align*} \frac{n\hat{\sigma}^2}{\sigma^2} &= \frac{\|(I - P)Y\|^2}{\sigma^2} \\ &= \frac{\|(I - P)(X\beta + \eps)\|^2}{ \sigma^2} &&(I - P)X = 0 \\ &= \frac{\|(I - P)\eps\|^2}{\sigma^2} \\ &\sim \chi_{\rank(I - P)}^2 \end{align*} $\rank(I - P) = \Trace(I - P) = n - p$. ($X \in \RR^{n \times p}$ has full rank). \myskip For (3), note $\hat{\sigma}^2$ is a function of $(I - P)\eps$. We'll show that $\hat{\beta}$ is a function of $P\eps$, which implies $\hat{\sigma}^2 \ci \hat{\beta}$ since $P\eps \ci (I - P) \eps$.
\begin{align*} \hat{\beta} &= (X^\top X)^{-1} X^\top Y \\ &= (X^\top X)^{-1} X^\top (X\beta + \eps) \\ &= \beta + (X^\top X)^{-1} X^\top \eps \\ &= \beta + (X^\top X)^{-1} X^\top P\eps \end{align*} since $X^\top P = X^\top$. \end{proof} \begin{corollary*} $\hat{\sigma}^2$ is biased \[ \EE \frac{\hat{\sigma}^2 n}{\sigma^2} = n - p \implies \EE \hat{\sigma}^2 = \left( \frac{n - p}{n} \right) \sigma^2 \] \end{corollary*} \subsubsection*{Student's $t$-distribution} If $U \sim \normaldist(0, 1)$, $V \sim \chi_n^2$, $U\ci V$ then we say $T = \frac{U}{\sqrt{V / n}}$ has a $t_n$ distribution. \subsubsection*{The $F$ distribution} If $V \sim \chi_n^2$, $W \sim \chi_m^2$, $V \ci W$ then we say \[ F = \frac{V / n}{W / m} \] has an $F_{n, m}$ distribution. \subsubsection*{Confidence sets for $\beta$} Suppose we want a $100(1 - \alpha)\%$ confidence interval for one of the coefficients (WLOG take $\beta_1$). Note: \[ \frac{\hat{\beta}_1 - \beta_1}{\sqrt{\sigma^2(X^\top X)^{-1}_{11}}} \sim \normaldist(0, 1) \] because $\hat{\beta}_1 \sim \normaldist(\beta_1, \sigma^2 (X^\top X)^{-1}_{11})$. Also, \[ \frac{\hat{\sigma}^2}{\sigma^2}n \sim \chi_{n - p}^2 \] and these two statistics are independent. \[ \implies \frac{\frac{\hat{\beta}_1 - \beta_1}{\sqrt{\cancel{\sigma^2}(X^\top X)^{-1}_{11}}}} {\sqrt{\frac{\hat{\sigma}^2}{\cancel{\sigma^2}} \frac{n}{n - p}}} \sim \frac{\normaldist(0, 1)}{\sqrt{\chi_{n - p}^2 / (n - p)}} \sim t_{n - p} \] Now this only depends on $\beta_1$ and \emph{not} on $\sigma^2$, so we can use this as a pivot. \[ \PP_{\beta, \sigma^2} \left( -t_{n - p} \left( \frac{\alpha}{2} \right) \le \frac{\hat{\beta}_1 - \beta_1}{\sqrt{(X^\top X)^{-1}_{11}}} \sqrt{\frac{n - p}{n\hat{\sigma}^2}} \le t_{n - p} \left( \frac{\alpha}{2} \right) \right) = 1 - \alpha \] We use that the $t_n$ distribution is symmetric around $0$.
\begin{center} \includegraphics[width=0.6\linewidth] {images/50e4e9dcbe6f11ed.png} \end{center} Rearranging the inequalities, we get \[ \PP_{\beta, \sigma^2} \left( \hat{\beta}_1 - \ub{t_{n - p} \left( \frac{\alpha}{2} \right) \sqrt{\frac{(X^\top X)^{-1}_{11} \hat{\sigma}^2}{(n - p) / n}}}_{=M} \le \beta_1 \le \hat{\beta}_1 + M \right) = 1 - \alpha \] We conclude that \[ \left[ \hat{\beta}_1 \pm t_{n - p} \left( \frac{\alpha}{2} \right) \sqrt{ \frac{(X^\top X)^{-1}_{11} \hat{\sigma}^2}{(n - p) / n} } \right] \] is a $(1 - \alpha) \cdot 100\%$ confidence interval for $\beta_1$. \begin{remark*} This is \emph{not} asymptotic. \end{remark*} \noindent By the duality between tests of significance and confidence intervals, we can find a size $\alpha$ test for $H_0$: $\beta_1 = \beta^*$ vs $H_1$: $\beta_1 \neq \beta^*$. Simply reject $H_0$ if $\beta^*$ is not contained in the $100 \cdot (1 - \alpha)\%$ confidence interval for $\beta_1$. \subsubsection*{Confidence ellipsoids for $\beta$} Note $\hat{\beta} - \beta \sim \normaldist(0, \sigma^2 (X^\top X)^{-1})$. As $X$ has full rank, $X^\top X$ is positive definite. So it has eigendecomposition \[ (X^\top X) = UDU^\top \] where $D_{ii} > 0$ for $i = 1, \ldots, p$. Define \[ (X^\top X)^\alpha = UD^\alpha U^\top \] \[ D^\alpha = \begin{pmatrix} D_{11}^\alpha & \cdots & 0 \\ \vdots & \ddots & \vdots \\ 0 & \cdots & D_{pp}^\alpha \end{pmatrix} \] \[ (X^\top X)^{1/2} (\hat{\beta} - \beta) \sim \normaldist(0, \sigma^2 I) \] Hence \begin{align*} \ub{\frac{\|(X^\top X)^{1/2} (\hat{\beta} - \beta)\|^2}{\sigma^2}}_{= \frac{\|X(\hat{\beta} - \beta)\|^2}{\sigma^2}} &\sim \chi_p^2 \end{align*} This is a function of $\hat{\beta}$, so it's independent of \[ \frac{\hat{\sigma}^2 n}{\sigma^2} \sim \chi_{n - p}^2 \] \[ \implies \frac{\|X(\hat{\beta} - \beta)\|^2 / \cancel{\sigma^2} p}{\hat{\sigma}^2 n / \cancel{\sigma^2} (n - p)} \sim F_{p, n - p} \] This only depends on $\beta$, \emph{not} on $\sigma^2$, so it can be used as a pivot. 
For all $\beta, \sigma^2$: \[ \PP_{\sigma^2, \beta} \left( \frac{\|X(\hat{\beta} - \beta)\|^2 / p}{\hat{\sigma}^2 n / (n - p)} \le F_{p, n - p}(\alpha) \right) = 1 - \alpha \] \begin{center} \includegraphics[width=0.6\linewidth] {images/8a68fefebe7011ed.png} \end{center} So, we can say that the set \[ \left\{ \beta \in \RR^p : \frac{\|X(\hat{\beta} - \beta)\|^2 / p}{\hat{\sigma}^2 n / (n - p)} \le F_{p, n - p}(\alpha) \right\} \] is a $100(1 - \alpha)\%$ confidence set for $\beta$. \begin{center} \includegraphics[width=0.6\linewidth] {images/c270c4eebe7011ed.png} \end{center} Principal axes are given by eigenvectors of $(X^\top X)$. \myskip In the next section we'll talk about hypothesis tests for $H_0$: $\beta_1 = \cdots = \beta_p = 0$, $H_1$: $\beta \in \RR^p$.