% vim: tw=50
% 11/03/2023 09AM
\subsubsection*{The $F$-test}
$Y = X\beta + \eps$, $\eps \sim \normaldist(0, \sigma^2 I)$. $H_0$: $\beta_1 = \beta_2 = \cdots = \beta_{p_0} = 0$. $H_1$: $\beta \in \RR^p$. Let $X = (X_0, X_1)$ ($X_0$ is $n \times p_0$ and $X_1$ is $n \times (p - p_0)$) and
\[ \beta = \begin{pmatrix} \beta^0 \\ \beta^1 \end{pmatrix} \qquad \beta^0 = \begin{pmatrix} \beta_1 \\ \vdots \\ \beta_{p_0} \end{pmatrix} \qquad \beta^1 = \begin{pmatrix} \beta_{p_0 + 1} \\ \vdots \\ \beta_p \end{pmatrix} \]
The null hypothesis is $\beta^0 = 0$, under which we again have a normal linear model:
\[ Y = X_1 \beta^1 + \eps \]
Write $P = X(X^\top X)^{-1}X^\top$, $P_1 = X_1 (X_1^\top X_1)^{-1} X_1^\top$. As $X$ has full column rank, so does $X_1$, so both projections are well defined. Recall that the maximum log-likelihood in a linear model is
\begin{align*} \max_{\substack{\beta \in \RR^p \\ \sigma^2 > 0}} l(\beta, \sigma^2) &= l(\hat{\beta}, \hat{\sigma}^2) \\ &= -\frac{n}{2} \log \left( \frac{\|(I - P)Y\|^2}{n} \right) + \text{const} \end{align*}
The generalised log-likelihood ratio statistic is
\begin{align*} 2\log\Lambda &= 2\left(\max_{\substack{\beta \in \RR^p \\ \sigma^2 > 0}} l(\beta, \sigma^2) - \max_{\substack{\beta^0 = 0 \\ \beta^1 \in \RR^{p - p_0} \\ \sigma^2 > 0}} l(\beta, \sigma^2) \right) \\ &= n \left( -\log \left( \frac{\|(I - P)Y\|^2}{n} \right) + \log \left( \frac{\|(I - P_1)Y\|^2}{n} \right) \right) \end{align*}
This is a monotone increasing function of
\begin{align*} \frac{\|(I - P_1)Y\|^2}{\|(I - P)Y\|^2} &= \frac{\|(I - P + P - P_1)Y\|^2}{\|(I - P)Y\|^2} \\ &= \frac{\|(I - P)Y\|^2 + \|(P - P_1)Y\|^2 + 2Y^\top\cancel{(I - P)(P - P_1)}Y}{\|(I - P)Y\|^2} \end{align*}
(The cross term vanishes because the columns of $P - P_1$ are in $\col(X)$, which $I - P$ annihilates.) This in turn is monotone increasing in
\[ \frac{\|(P - P_1)Y\|^2 / p_0}{\|(I - P)Y\|^2 / (n - p)} \defeq F ,\]
the ``$F$ statistic''.
\begin{lemma*} $P - P_1$ is an orthogonal projection with rank $p_0$.
\end{lemma*}
\begin{proof}
$P - P_1$ is symmetric as both $P$ and $P_1$ are, and it is idempotent:
\[ (P - P_1) (P - P_1) = P + P_1 - 2\ub{PP_1}_{= P_1} = P - P_1 \]
(here $PP_1 = P_1$ since $\col(X_1) \subseteq \col(X)$). Its rank is
\begin{align*} \rank(P - P_1) &= \Trace(P - P_1) \\ &= \Trace(P) - \Trace(P_1) \\ &= p - (p - p_0) \\ &= p_0 \end{align*}
\end{proof}
\myskip To recap, the generalised LRT rejects $H_0$ when $F$ is large. What is the null distribution of $F$? Under $H_0$:
\begin{align*} (P - P_1)Y &= (P - P_1)(X\beta + \eps) \\ &= (P - P_1)(X_1 \beta^1 + \eps) \\ &= (P - P_1)\eps \end{align*}
Therefore, under $H_0$:
\[ F = \frac{\frac{1}{\sigma^2} \|(P - P_1) \eps\|^2 / p_0}{\frac{1}{\sigma^2}\|(I - P)\eps\|^2 / (n - p)} \]
with numerator $\sim \left( \frac{\chi_{p_0}^2}{p_0} \right)$ and denominator $\sim \left( \frac{\chi_{n - p}^2}{n - p} \right)$. Furthermore,
\[ \begin{pmatrix} (P - P_1)\eps \\ (I - P)\eps \end{pmatrix} \]
is MVN with $\Cov((P - P_1)\eps, (I - P)\eps) = \sigma^2 (P - P_1)(I - P) = 0$. Hence $(P - P_1) \eps \ci (I - P)\eps$, so the numerator and denominator of $F$ are independent. We conclude that
\[ F \sim F_{p_0, n - p} ,\]
so the test rejects $H_0$ with size $\alpha$ if
\[ F \ge F_{p_0, n - p}(\alpha) \]
Last time we derived a size $\alpha$ test for $H_0$: $\beta_1 = 0$ using the $100 \cdot (1 - \alpha)\%$ confidence interval for $\beta_1$. That test rejects $H_0$ when
\[ |\hat{\beta}_1| > t_{n - p} \left( \frac{\alpha}{2} \right) \sqrt{\frac{\hat{\sigma}^2 n(X^\top X)^{-1}_{11}}{n - p}} \]
\begin{lemma*} This test is equivalent to the $F$-test with $p_0 = 1$. \end{lemma*}
\begin{proof} Exercise. \end{proof}
\subsubsection*{Categorical predictors}
\begin{example*} $Y_i \in \RR$: clinical response, $z_i \in \{\text{control}, \text{treatment 1}, \text{treatment 2}\}$. \end{example*}
\noindent Let
\[ x_{i,j} = \mathbbm{1}_{\{z_i = j\}} = \mathbbm{1}_{\{\text{subject $i$ was in group $j$}\}} \]
so that $x_i \in \RR^3$ is a numerical encoding of the categorical $z_i$.
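This indicator encoding can be sketched numerically (a minimal illustration; the group labels and sample size are invented):

```python
import numpy as np

# Hypothetical group labels z_i for n = 6 subjects, three groups.
z = np.array(["control", "treatment 1", "treatment 2",
              "control", "treatment 1", "treatment 2"])
groups = ["control", "treatment 1", "treatment 2"]

# x_{i,j} = 1{z_i = j}: row i is the indicator vector x_i in R^3.
X_ind = np.column_stack([(z == g).astype(float) for g in groups])

# Each row has exactly one 1, so the three indicator columns
# sum to the all-ones vector.
print(X_ind)
```

Note that adding an intercept column of ones to `X_ind` produces a matrix whose columns are linearly dependent, which is exactly the rank problem discussed next.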
\[ Y_i = \alpha + \beta_1 x_{i, 1} + \beta_2 x_{i, 2} + \beta_3 x_{i, 3} + \eps_i \]
Problem:
\begin{center} \includegraphics[width=0.6\linewidth] {images/278d15aabff011ed.png} \end{center}
The design matrix has rank $3 < 4$, since the three indicator columns sum to the intercept column. Corner point constraint: call one of the groups the ``baseline'' and remove its indicator from the linear model. The interpretation of $\beta_j$ depends on the baseline: $\beta_j$ is the effect of being in group $j$ \emph{relative} to the baseline. However, $\col(X)$ and the matrix $P$ are insensitive to the choice of baseline, and therefore so are the fitted values
\[ \hat{Y} = PY .\]
This can be extended to a model with more than 1 categorical predictor, for example group and gender.
\myskip ANOVA: Analysis of Variance. The $F$-test for
\begin{itemize}
\item $H_0$: $\beta_j = 0$ for all $j$ (the categorical predictor has no effect), $\alpha \in \RR$.
\item $H_1$: $\begin{pmatrix} \alpha \\ \beta \end{pmatrix} \in \RR^3$.
\end{itemize}
In this case, we can write the $F$ statistic in a simpler way.
\begin{center} \includegraphics[width=0.6\linewidth] {images/84056cbebff111ed.png} \end{center}
$P_1$ is the projection onto constant vectors,
\[ P_1 = \frac{1}{n} \bf{1} \bf{1}^\top \]
\[ P = \text{projection onto vectors which are constant within each group} \]
\[ F = \frac{\|(P - P_1)Y\|^2 / p_0}{\|(I - P)Y\|^2 / (n - p)} \]
\[ P_1 Y = \begin{pmatrix} \ol{Y} \\ \ol{Y} \\ \vdots \\ \ol{Y} \end{pmatrix} \qquad \ol{Y} = \frac{1}{n} \sum_{i = 1}^n Y_i \]
\[ PY = \begin{pmatrix} \ol{Y}_1 \\ \ol{Y}_1 \\ \vdots \\ \ol{Y}_2 \\ \ol{Y}_2 \\ \vdots \\ \ol{Y}_3 \\ \ol{Y}_3 \end{pmatrix} \qquad \ol{Y}_j = \frac{\sum_{i = 1}^n Y_i \mathbbm{1}_{\{z_i = j\}}}{\sum_{i = 1}^n \mathbbm{1}_{\{z_i = j\}}} = \text{average response for group $j$} \]
Assume all groups have size $N$ (so $n = 3N$). Then
\[ F = \frac{\sum_{j = 1}^3 N(\ol{Y}_j - \ol{Y})^2 / 2}{\sum_{j = 1}^3 \sum_{i = 1}^N (Y_{ij} - \ol{Y}_j)^2 / (3N - 3)} \]
The numerator measures the variance between groups, the denominator the variance within groups.
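The equality between the projection form and the group-means form of $F$ can be checked numerically. A sketch with invented data, using groups of equal size $N$ with $p = 3$ fitted parameters and $p_0 = 2$ tested:

```python
import numpy as np

rng = np.random.default_rng(0)
N, J = 10, 3                       # N subjects per group, J = 3 groups
n, p, p0 = J * N, 3, 2             # corner point: alpha, beta_2, beta_3
Y = rng.normal(size=(N, J))        # Y[i, j]: response of subject i in group j

Ybar = Y.mean()                    # overall mean (entries of P_1 Y)
Ybar_j = Y.mean(axis=0)            # group means (entries of P Y)

# Group-means form: between-group over within-group variance.
F_groups = (N * ((Ybar_j - Ybar) ** 2).sum() / p0) / (
    ((Y - Ybar_j) ** 2).sum() / (n - p))

# Projection form: ||(P - P_1)Y||^2 / p0 over ||(I - P)Y||^2 / (n - p).
y = Y.T.reshape(-1)                # stack responses group by group
PY = np.repeat(Ybar_j, N)          # constant within each group
P1Y = np.full(n, Ybar)             # constant overall
F_proj = (((PY - P1Y) ** 2).sum() / p0) / (
    ((y - PY) ** 2).sum() / (n - p))
```

Both computations give the same $F$, which under $H_0$ would be compared against the $F_{2,\,3N-3}$ distribution.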