% vim: tw=50
% 09/03/2023 11AM
\subsubsection*{Normal linear model} Take $Y = X\beta + \eps$, $\eps \sim \normaldist(0, \sigma^2 I)$. MLE: 2 parameters: $\beta \in \RR^p$, $\sigma^2 \in \RR_+$. Log-likelihood: \[ l(\beta, \sigma^2) = \text{const} - \frac{n}{2} \log \sigma^2 - \frac{1}{2\sigma^2} \|Y - X\beta\|^2 \] For any $\sigma^2 > 0$, we can see that $l(\beta, \sigma^2)$ is maximised as a function of $\beta$ at the minimiser of $\|Y - X\beta\|^2$, i.e. the least squares estimator $\hat{\beta}$. Now find: \[ \arg \max_{\sigma^2 \ge 0} l(\hat{\beta}, \sigma^2) \] \[ l(\hat{\beta}, \sigma^2) = \text{const} - \frac{n}{2} \log \sigma^2 - \frac{1}{2\sigma^2}\|Y - X\hat{\beta}\|^2 \] As $\sigma^2 \mapsto l(\hat{\beta}, \sigma^2)$ is concave, there is a unique maximiser where $\pfrac{l(\hat{\beta}, \sigma^2)}{\sigma^2} = 0$ \[ \implies \hat{\sigma}^2 = \frac{\|Y - X\hat{\beta}\|^2}{n} = \frac{\|(I - P)Y\|^2}{n} \] \begin{theorem*} \begin{enumerate}[(1)] \item $\hat{\beta} \sim \normaldist(\beta, \sigma^2 (X^\top X)^{-1})$ \item $\frac{\hat{\sigma}^2}{\sigma^2}n \sim \chi_{n - p}^2$ \item $\hat{\beta}$, $\hat{\sigma}^2$ are independent(!) \end{enumerate} \end{theorem*} \begin{proof} $\hat{\beta}$ is linear in $Y$, hence MVN. We already know $\EE\hat{\beta} = \beta$, $\Var \hat{\beta} = \sigma^2 (X^\top X)^{-1}$. This proves (1). For (2) note \begin{align*} \frac{n\hat{\sigma}^2}{\sigma^2} &= \frac{\|(I - P)Y\|^2}{\sigma^2} \\ &= \frac{\|(I - P)(X\beta + \eps)\|^2}{ \sigma^2} &&(I - P)X = 0 \\ &= \frac{\|(I - P)\eps\|^2}{\sigma^2} \\ &\sim \chi_{\rank(I - P)}^2 \end{align*} $\rank(I - P) = \Trace(I - P) = n - p$. ($X \in \RR^{n \times p}$ has full rank). \myskip For (3), note $\hat{\sigma}^2$ is a function of $(I - P)\eps$. We'll show that $\hat{\beta}$ is a function of $P\eps$, which implies $\hat{\sigma}^2 \ci \hat{\beta}$ since $P\eps \ci (I - P) \eps$.
\begin{align*} \hat{\beta} &= (X^\top X)^{-1} X^\top Y \\ &= (X^\top X)^{-1} X^\top (X\beta + \eps) \\ &= \beta + (X^\top X)^{-1} X^\top \eps \\ &= \beta + (X^\top X)^{-1} X^\top P\eps \end{align*} since $X^\top P = X^\top$. \end{proof} \begin{corollary*} $\hat{\sigma}^2$ is biased \[ \EE \frac{\hat{\sigma}^2 n}{\sigma^2} = n - p \implies \EE \hat{\sigma}^2 = \left( \frac{n - p}{n} \right) \sigma^2 \] \end{corollary*} \subsubsection*{Student's $t$-distribution} If $U \sim \normaldist(0, 1)$, $V \sim \chi_n^2$, $U\ci V$ then we say $T = \frac{U}{\sqrt{V / n}}$ has a $t_n$ distribution. \subsubsection*{The $F$ distribution} If $V \sim \chi_n^2$, $W \sim \chi_m^2$, $V \ci W$ then we say \[ F = \frac{V / n}{W / m} \] has an $F_{n, m}$ distribution. \subsubsection*{Confidence sets for $\beta$} Suppose we want a $100(1 - \alpha)\%$ confidence interval for one of the coefficients (WLOG take $\beta_1$). Note: \[ \frac{\hat{\beta}_1 - \beta_1}{\sqrt{\sigma^2(X^\top X)^{-1}_{11}}} \sim \normaldist(0, 1) \] because $\hat{\beta}_1 \sim \normaldist(\beta_1, \sigma^2 (X^\top X)^{-1}_{11})$. Also, \[ \frac{\hat{\sigma}^2}{\sigma^2}n \sim \chi_{n - p}^2 \] and these two statistics are independent. \[ \implies \frac{\frac{\hat{\beta}_1 - \beta_1}{\sqrt{\cancel{\sigma^2}(X^\top X)^{-1}_{11}}}} {\sqrt{\frac{\hat{\sigma}^2}{\cancel{\sigma^2}} \frac{n}{n - p}}} \sim \frac{\normaldist(0, 1)}{\sqrt{\chi_{n - p}^2 / (n - p)}} \sim t_{n - p} \] Now this only depends on $\beta_1$ and \emph{not} on $\sigma^2$, so we can use this as a pivot. \[ \PP_{\beta, \sigma^2} \left( -t_{n - p} \left( \frac{\alpha}{2} \right) \le \frac{\hat{\beta}_1 - \beta_1}{\sqrt{(X^\top X)^{-1}_{11}}} \sqrt{\frac{n - p}{n\hat{\sigma}^2}} \le t_{n - p} \left( \frac{\alpha}{2} \right) \right) = 1 - \alpha \] We use that the $t_n$ distribution is symmetric around $0$.
\begin{center} \includegraphics[width=0.6\linewidth] {images/50e4e9dcbe6f11ed.png} \end{center} Rearranging the inequalities, we get \[ \PP_{\beta, \sigma^2} \left( \hat{\beta}_1 - \ub{t_{n - p} \left( \frac{\alpha}{2} \right) \sqrt{\frac{(X^\top X)^{-1}_{11} \hat{\sigma}^2}{(n - p) / n}}}_{=M} \le \beta_1 \le \hat{\beta}_1 + M \right) = 1 - \alpha \] We conclude that \[ \left[ \hat{\beta}_1 \pm t_{n - p} \left( \frac{\alpha}{2} \right) \sqrt{ \frac{(X^\top X)^{-1}_{11} \hat{\sigma}^2}{(n - p) / n} } \right] \] is a $(1 - \alpha) \cdot 100\%$ confidence interval for $\beta_1$. \begin{remark*} This is \emph{not} asymptotic. \end{remark*} \noindent By the duality between tests of significance and confidence intervals, we can find a size $\alpha$ test for $H_0$: $\beta_1 = \beta^*$ vs $H_1$: $\beta_1 \neq \beta^*$. Simply reject $H_0$ if $\beta^*$ is not contained in the $100 \cdot (1 - \alpha)\%$ confidence interval for $\beta_1$. \subsubsection*{Confidence ellipsoids for $\beta$} Note $\hat{\beta} - \beta \sim \normaldist(0, \sigma^2 (X^\top X)^{-1})$. As $X$ has full rank, $X^\top X$ is positive definite. So it has eigendecomposition \[ (X^\top X) = UDU^\top \] where $D_{ii} > 0$ for $i = 1, \ldots, p$. Define \[ (X^\top X)^\alpha = UD^\alpha U^\top \] \[ D^\alpha = \begin{pmatrix} D_{11}^\alpha & \cdots & 0 \\ \vdots & \ddots & \vdots \\ 0 & \cdots & D_{pp}^\alpha \end{pmatrix} \] \[ (X^\top X)^{1/2} (\hat{\beta} - \beta) \sim \normaldist(0, \sigma^2 I) \] Hence \begin{align*} \ub{\frac{\|(X^\top X)^{1/2} (\hat{\beta} - \beta)\|^2}{\sigma^2}}_{= \frac{\|X(\hat{\beta} - \beta)\|^2}{\sigma^2}} &\sim \chi_p^2 \end{align*} This is a function of $\hat{\beta}$, so it's independent of \[ \frac{\hat{\sigma}^2 n}{\sigma^2} \sim \chi_{n - p}^2 \] \[ \implies \frac{\|X(\hat{\beta} - \beta)\|^2 / \cancel{\sigma^2} p}{\hat{\sigma}^2 n / \cancel{\sigma^2} (n - p)} \sim F_{p, n - p} \] This only depends on $\beta$, \emph{not} on $\sigma^2$, so it can be used as a pivot. 
For all $\beta, \sigma^2$: \[ \PP_{\sigma^2, \beta} \left( \frac{\|X(\hat{\beta} - \beta)\|^2 / p}{\hat{\sigma}^2 n / (n - p)} \le F_{p, n - p}(\alpha) \right) = 1 - \alpha \] \begin{center} \includegraphics[width=0.6\linewidth] {images/8a68fefebe7011ed.png} \end{center} So, we can say that the set \[ \left\{ \beta \in \RR^p : \frac{\|X(\hat{\beta} - \beta)\|^2 / p}{\hat{\sigma}^2 n / (n - p)} \le F_{p, n - p}(\alpha) \right\} \] is a $100(1 - \alpha)\%$ confidence set for $\beta$. \begin{center} \includegraphics[width=0.6\linewidth] {images/c270c4eebe7011ed.png} \end{center} Principal axes are given by eigenvectors of $(X^\top X)$. \myskip In the next section we'll talk about hypothesis tests for $H_0$: $\beta_1 = \cdots = \beta_p = 0$, $H_1$: $\beta \in \RR^p$.