% vim: tw=50 % 23/02/2023 11AM \subsubsection*{Tests of Goodness of fit and Independence} It's common to write \[ 2\log \Lambda = 2 \sum_i o_i \log \left( \frac{o_i}{e_i} \right) \] where $o_i = N_i$ ``observed number of type $i$'' and $e_i = n \cdot \tilde{p}_i$ ``expected number of type $i$ under null''. \myskip Pearson's statistic: Let $\delta_i = o_i - e_i$. Then \begin{align*} 2\log\Lambda &= 2\sum_i (e_i + \delta_i) \ub{\log \left( 1 + \frac{\delta_i}{e_i} \right)}_{= \frac{\delta_i}{e_i} - \frac{\delta_i^2}{2e_i^2} + O \left( \frac{\delta_i^3}{e_i^3} \right)} \\ &\approx 2\sum_i \left( \ub{\cancel{\delta_i}}_{\sum_i \delta_i = \sum_i (o_i - e_i) = n - n = 0} + \frac{\delta_i^2}{e_i} - \frac{\delta_i^2}{2e_i} \right) \\ &= \sum \frac{\delta_i^2}{e_i} \\ &= \sum_i \frac{(o_i - e_i)^2}{e_i} \end{align*} This is called Pearson's statistic. This is also referred to a $\chi_{k - 1}^2$ distribution when $n$ is large. \begin{hiddenflashcard}[pearsons-statistic] Pearson's statistic? \\ \cloze{ \[ 2\log\Lambda = \sum_i \frac{(o_i - e_i)^2}{e_i} \] } \end{hiddenflashcard} \begin{example*} Mendel's data: \[ (n_1, n_2, n_3, n_4) = (315, 108, 102, 31) \] $2\log\Lambda \approx 0.618$, $\sum_i \frac{(o_i - e_i)^2}{e_i} \approx 0.604$. We refer each statistic to a $\chi_{k - 1}^2 = \chi_3^2$ distribution. \[ \chi_3^2(0.05) = 7.815 \] \begin{center} \includegraphics[width=0.2\linewidth] {images/41121334b36d11ed.png} \end{center} We don't reject $H_0$ at size $5\%$. The $p$-value is $\PP(\chi_3^2 > 0.6) \approx 0.96$. The data fit the null model almost too well. \end{example*} \subsubsection*{Goodness of fit test for composite null} $H_0$: $p_i = p_i(\theta)$ for some parameter $\theta$. $H_1$: $p$ can be any distribution on $\{1, \ldots, k\}$. \begin{example*} Individuals can have 3 genotypes. $H_0$: $p_1 = \theta^2$, $p_2 = 2\theta(1 - \theta)$, $p_3 = (1 - \theta)^2$, for some $\theta \in [0, 1]$. 
\begin{align*} 2\log\Lambda &= 2 \left( \sup_{p : \sum p_i = 1} l(p) - \sup_{\theta} l(p(\theta)) \right) \\ &= 2 (l(\hat{p}) - l(p(\hat{\theta}))) \end{align*} where $\hat{p}$ is the mle in the alternative $H_1$; $\hat{\theta}$ is the mle in null $H_0$. Last time we found $\hat{p}_i = \frac{N_i}{n}$. $\hat{\theta}$ would need to be computed for the null model in question. \begin{align*} 2\log\Lambda &= 2\sum_i N_i \log \left( \frac{N_i}{n p_i(\hat{\theta})} \right) \\ &= 2 \sum_i o_i \log \left( \frac{o_i}{e_i} \right) \end{align*} $o_i = N_i$ ``observed number of type $i$'', $e_i = n \cdot p_i(\hat{\theta})$ ``expected number of type $i$ under $H_0$''. We can define a Pearson statistic $\sum_i \frac{(o_i - e_i)^2}{e_i}$ using the same argument as before. \end{example*} \noindent Each statistic can be referred to a $\chi_d^2$ when $n$ is large by Wilks' theorem. \begin{align*} d &= \dim(\Theta_1) - \dim(\Theta_0) \\ &= (k - 1) - \dim(\Theta_0) \end{align*} \begin{example*} $l(\theta) = \sum_i N_i \log p_i(\theta) = 2N_1 \log\theta + N_2 \log (2\theta(1 - \theta)) + 2N_3 \log(1 - \theta)$. Maximising over $\theta \in [0, 1]$ gives $\hat{\theta} = \frac{2N_1 + N_2}{2n}$ (exercise). In this model $2\log\Lambda$ and $\sum_i \frac{(o_i - e_i)^2}{e_i}$ have a $\chi_d^2$ distribution with $d = (k - 1) - \dim(\Theta_0) = (k - 1) - 1 = k - 2 = 3 - 2 = 1$. \end{example*} \subsubsection*{Testing independence in contingency tables} $(X_1, Y_1), \ldots, (X_n, Y_n)$ are iid with $X_i$ taking values in $\{1, \ldots, r\}$, $Y_i$ taking values in $\{1, \ldots, c\}$. The entries in a contingency table are \[ N_{ij} = \# \{l : 1 \le l \le n, (X_l, Y_l) = (i, j)\} \] (\# samples of type $(i, j)$) \begin{example*} COVID-19 deaths. $X_i$: age of $i$-th death. $Y_i$: week on which it fell. Question: are deaths decreasing faster for older age groups that had been vaccinated? \end{example*} \subsubsection*{Probability Model} We'll assume $n$ is fixed. 
A sample $(X_l, Y_l)$ has probability $p_{ij}$ of falling in $(i, j)$ entry of table. \[ (N_{11}, \ldots, N_{1c}, N_{21}, \ldots, N_{2c}, \ldots, N_{rc}) \sim \Multinomial(n; p_{11}, \ldots, p_{1c}, \ldots, p_{rc}) \] \begin{remark*} Fixing $n$ may not be natural; we'll consider other models later. \end{remark*} \subsubsection*{Null hypothesis} Week of death is independent of age. $X_i$ independent of $Y_i$ for each sample. Let \[ p_{i+} = \sum_{j = 1}^c p_{ij} \qquad p_{+j} = \sum_{i = 1}^r p_{ij} \] $H_0$: $p_{ij} = p_{i+} p_{+j}$. ($\PP(X_l = i, Y_l = j) = \PP(X_l = i)\PP(Y_l = j)$). $H_1$: $(p_{ij})$ is unconstrained except for $p_{ij} \ge 0$, $\sum_{i, j} p_{ij} = 1$. The generalised LRT: \[ 2\log\Lambda = 2 \sum_{i, j} o_{ij} \log \left( \frac{o_{ij}}{e_{ij}} \right) \] $o_{ij} = N_{ij}$, $e_{ij} = n\hat{p}_{ij}$, where $\hat{p}$ is the mle under independence model $H_0$. Using Lagrange multipliers we can find \[ \hat{p}_{ij} = \hat{p}_{i+} \hat{p}_{+j} \] where \begin{align*} \hat{p}_{i+} &= \frac{N_{i+}}{n} & \hat{p}_{+j} &= \frac{N_{+j}}{n} \\ N_{i+} &= \sum_j N_{ij} & N_{+j} &= \sum_i N_{ij} \end{align*} \[ \implies 2\log\Lambda = 2 \sum_{i = 1}^r \sum_{j = 1}^c N_{ij} \log \left( \frac{N_{ij}}{n \cdot \hat{p}_{i+} \hat{p}_{+j}} \right) \approx \sum_{i, j} \frac{(o_{ij} - e_{ij})^2}{e_{ij}} \] \myskip Wilks' theorem: The asymptotic distribution of these statistics is $\chi_d^2$ with \begin{align*} d &= \dim(\Theta_1) - \dim(\Theta_0) \\ &= (rc - 1) - [(r - 1) + (c - 1)] \\ &= (r - 1)(c - 1) \end{align*} ($(r - 1)$ and $(c - 1)$ $\to$ degrees of freedom in $(p_{1+}, \ldots, p_{r+})$ and $(p_{+1}, \ldots, p_{+c})$)