% vim: tw=50 % 23/02/2023 11AM \subsubsection*{Tests of Goodness of fit and Independence} It's common to write \[ 2\log \Lambda = 2 \sum_i o_i \log \left( \frac{o_i}{e_i} \right) \] where $o_i = N_i$ ``observed number of type $i$'' and $e_i = n \cdot \tilde{p}_i$ ``expected number of type $i$ under null''. \myskip Pearson's statistic: Let $\delta_i = o_i - e_i$. Then \begin{align*} 2\log\Lambda &= 2\sum_i (e_i + \delta_i) \ub{\log \left( 1 + \frac{\delta_i}{e_i} \right)}_{= \frac{\delta_i}{e_i} - \frac{\delta_i^2}{2e_i^2} + O \left( \frac{\delta_i^3}{e_i^3} \right)} \\ &\approx 2\sum_i \left( \ub{\cancel{\delta_i}}_{\sum_i \delta_i = \sum_i (o_i - e_i) = n - n = 0} + \frac{\delta_i^2}{e_i} - \frac{\delta_i^2}{2e_i} \right) \\ &= \sum \frac{\delta_i^2}{e_i} \\ &= \sum_i \frac{(o_i - e_i)^2}{e_i} \end{align*} This is called Pearson's statistic. This is also referred to a $\chi_{k - 1}^2$ distribution when $n$ is large. \begin{hiddenflashcard}[pearsons-statistic] Pearson's statistic? \\ \cloze{ \[ 2\log\Lambda = \sum_i \frac{(o_i - e_i)^2}{e_i} \] } \end{hiddenflashcard} \begin{example*} Mendel's data: \[ (n_1, n_2, n_3, n_4) = (315, 108, 102, 31) \] $2\log\Lambda \approx 0.618$, $\sum_i \frac{(o_i - e_i)^2}{e_i} \approx 0.604$. We refer each statistic to a $\chi_{k - 1}^2 = \chi_3^2$ distribution. \[ \chi_3^2(0.05) = 7.815 \] \begin{center} \includegraphics[width=0.2\linewidth] {images/41121334b36d11ed.png} \end{center} We don't reject $H_0$ at size $5\%$. The $p$-value is $\PP(\chi_3^2 > 0.6) \approx 0.96$. The data fit the null model almost too well. \end{example*} \subsubsection*{Goodness of fit test for composite null} $H_0$: $p_i = p_i(\theta)$ for some parameter $\theta$. $H_1$: $p$ can be any distribution on $\{1, \ldots, k\}$. \begin{example*} Individuals can have 3 genotypes. $H_0$: $p_1 = \theta^2$, $p_2 = 2\theta(1 - \theta)$, $p_3 = (1 - \theta)^2$, for some $\theta \in [0, 1]$. 
\begin{align*} 2\log\Lambda &= 2 \left( \sup_{p : \sum p_i = 1} l(p) - \sup_{\theta} l(p(\theta)) \right) \\ &= 2 (l(\hat{p}) - l(p(\hat{\theta}))) \end{align*} where $\hat{p}$ is the mle in the alternative $H_1$; $\hat{\theta}$ is the mle in null $H_0$. Last time we found $\hat{p}_i = \frac{N_i}{n}$. $\hat{\theta}$ would need to be computed for the null model in question. \begin{align*} 2\log\Lambda &= 2\sum_i N_i \log \left( \frac{N_i}{n p_i(\hat{\theta})} \right) \\ &= 2 \sum_i o_i \log \left( \frac{o_i}{e_i} \right) \end{align*} $o_i = N_i$ ``observed number of type $i$'', $e_i = n \cdot p_i(\hat{\theta})$ ``expected number of type $i$ under $H_0$''. We can define a Pearson statistic $\sum_i \frac{(o_i - e_i)^2}{e_i}$ using the same argument as before. \end{example*} \noindent Each statistic can be referred to a $\chi_d^2$ when $n$ is large by Wilks' theorem. \begin{align*} d &= \dim(\Theta_1) - \dim(\Theta_0) \\ &= (k - 1) - \dim(\Theta_0) \end{align*} \begin{example*} $l(\theta) = \sum_i N_i \log p_i(\theta) = 2N_1 \log\theta + N_2 \log (2\theta(1 - \theta)) + 2N_3 \log(1 - \theta)$. Maximising over $\theta \in [0, 1]$ gives $\hat{\theta} = \frac{2N_1 + N_2}{2n}$ (exercise). In this model $2\log\Lambda$ and $\sum_i \frac{(o_i - e_i)^2}{e_i}$ have a $\chi_d^2$ distribution with $d = (k - 1) - \dim(\Theta_0) = (k - 1) - 1 = k - 2 = 3 - 2 = 1$. \end{example*} \subsubsection*{Testing independence in contingency tables} $(X_1, Y_1), \ldots, (X_n, Y_n)$ are iid with $X_i$ taking values in $\{1, \ldots, r\}$, $Y_i$ taking values in $\{1, \ldots, c\}$. The entries in a contingency table are \[ N_{ij} = \# \{l : 1 \le l \le n, (X_l, Y_l) = (i, j)\} \] (\# samples of type $(i, j)$) \begin{example*} COVID-19 deaths. $X_i$: age of $i$-th death. $Y_i$: week on which it fell. Question: are deaths decreasing faster for older age groups that had been vaccinated? \end{example*} \subsubsection*{Probability Model} We'll assume $n$ is fixed. 
A sample $(X_l, Y_l)$ has probability $p_{ij}$ of falling in $(i, j)$ entry of table. \[ (N_{11}, \ldots, N_{1c}, N_{21}, \ldots, N_{2c}, \ldots, N_{rc}) \sim \Multinomial(n; p_{11}, \ldots, p_{1c}, \ldots, p_{rc}) \] \begin{remark*} Fixing $n$ may not be natural; we'll consider other models later. \end{remark*} \subsubsection*{Null hypothesis} Week of death is independent of age. $X_i$ independent of $Y_i$ for each sample. Let \[ p_{i+} = \sum_{j = 1}^c p_{ij} \qquad p_{+j} = \sum_{i = 1}^r p_{ij} \] $H_0$: $p_{ij} = p_{i+} p_{+j}$. ($\PP(X_l = i, Y_l = j) = \PP(X_l = i)\PP(Y_l = j)$). $H_1$: $(p_{ij})$ is unconstrained except for $p_{ij} \ge 0$, $\sum_{i, j} p_{ij} = 1$. The generalised LRT: \[ 2\log\Lambda = 2 \sum_{i, j} o_{ij} \log \left( \frac{o_{ij}}{e_{ij}} \right) \] $o_{ij} = N_{ij}$, $e_{ij} = n\hat{p}_{ij}$, where $\hat{p}$ is the mle under independence model $H_0$. Using Lagrange multipliers we can find \[ \hat{p}_{ij} = \hat{p}_{i+} \hat{p}_{+j} \] where \begin{align*} \hat{p}_{i+} &= \frac{N_{i+}}{n} & \hat{p}_{+j} &= \frac{N_{+j}}{n} \\ N_{i+} &= \sum_j N_{ij} & N_{+j} &= \sum_i N_{ij} \end{align*} \[ \implies 2\log\Lambda = 2 \sum_{i = 1}^r \sum_{j = 1}^c N_{ij} \log \left( \frac{N_{ij}}{n \cdot \hat{p}_{i+} \hat{p}_{+j}} \right) \approx \sum_{i, j} \frac{(o_{ij} - e_{ij})^2}{e_{ij}} \] \myskip Wilks' theorem: The asymptotic distribution of these statistics is $\chi_d^2$ with \begin{align*} d &= \dim(\Theta_1) - \dim(\Theta_0) \\ &= (rc - 1) - [(r - 1) + (c - 1)] \\ &= (r - 1)(c - 1) \end{align*} ($(r - 1)$ and $(c - 1)$ $\to$ degrees of freedom in $(p_{1+}, \ldots, p_{r+})$ and $(p_{+1}, \ldots, p_{+c})$)