% vim: tw=50
% 04/03/2023 09AM

\subsection{The Linear Model}

Data are pairs $(x_1, Y_1), \ldots, (x_n, Y_n)$, where $Y_i \in \RR$ are the ``responses'' (random) and $x_i \in \RR^p$ are the ``predictors'' (fixed).

\begin{example*}
$Y_i$: number of insurance claims for client $i$. $x_i$: (age, number of claims in 2021, years with driver's license, $\ldots$).
\end{example*}

\noindent In a linear model, we assume
\[ Y_i = \cancel{\alpha} + \beta_1 x_{i1} + \beta_2 x_{i2} + \cdots + \beta_p x_{ip} + \eps_i \]
where
\begin{itemize}
\item $\alpha$ is an intercept.
\item $\beta_1, \ldots, \beta_p$ are coefficients.
\item $\eps_1, \ldots, \eps_n$ are random noise variables.
\end{itemize}

\begin{remark*}
We normally remove the intercept by including a dummy predictor which is equal to $1$ for all $i$, i.e.\ $x_{i1} = 1$ for all $i = 1, \ldots, n$.
\end{remark*}

\begin{remark*}
We can also model non-linear relationships between $Y_i$ and $x_i$ using a linear model, for example by using $x_i = (\text{age}, \text{age}^2, \log(\text{age}))$.
\end{remark*}

\begin{remark*}
$\beta_j$ is the effect on $Y_i$ of increasing $x_{ij}$ by one unit, whilst keeping all other predictors constant. Estimates of $\beta$ should not be interpreted causally, unless we have a randomised experiment.
\end{remark*}

\noindent Matrix formulation:
\[ Y = \begin{pmatrix} Y_1 \\ \vdots \\ Y_n \end{pmatrix} \qquad X = \ub{ \begin{pmatrix} x_{11} & x_{12} & \cdots & x_{1p} \\ x_{21} & x_{22} & \cdots & x_{2p} \\ \vdots & \vdots & \ddots & \vdots \\ x_{n1} & x_{n2} & \cdots & x_{np} \end{pmatrix} }_{\text{``design matrix''}} \]
\[ \beta = \begin{pmatrix} \beta_1 \\ \vdots \\ \beta_p \end{pmatrix} \qquad \eps = \begin{pmatrix} \eps_1 \\ \vdots \\ \eps_n \end{pmatrix} \]
\[ Y = X\beta + \eps \]
Moment assumptions on $\eps$:
\begin{enumerate}[(1)]
\item $\EE\eps = 0 \implies \EE Y = X\beta$.
\item $\Var\eps = \sigma^2 I \implies \Var(\eps_i) = \sigma^2$ for all $i$ (``homoscedasticity'') and $\Cov(\eps_i, \eps_j) = 0$ for all $i \neq j$.
\end{enumerate}
We'll assume throughout that $X \in \RR^{n \times p}$ has full rank $p$. In particular, $p \le n$ (at least as many samples as predictors).

\subsubsection*{Least squares estimator}
$\hat{\beta}$ minimises the residual sum of squares
\begin{align*}
S(\beta) &= \|Y - X\beta\|^2 \\
&= \sum_{i = 1}^n (Y_i - x_i^\top \beta)^2
\end{align*}
This is a quadratic (positive definite) polynomial in $\beta$, so $\hat{\beta}$ satisfies
\[ \nabla S(\beta)|_{\beta = \hat{\beta}} = 0 \]
\[ \implies \left. \pfrac{S(\beta)}{\beta_k} \right|_{\beta = \hat{\beta}} = -2 \sum_{i = 1}^n x_{ik} \left( Y_i - \sum_{j = 1}^p x_{ij} \hat{\beta}_j \right) = 0 \]
for each $k = 1, \ldots, p$. Equivalent matrix form:
\[ X^\top X \hat{\beta} = X^\top Y \]
As $X$ has rank $p$, the matrix $X^\top X \in \RR^{p \times p}$ is invertible, hence
\[ \hat{\beta} = (X^\top X)^{-1} X^\top Y \]
(linear in $Y$!). Check:
\begin{align*}
\EE\hat{\beta} &= \EE[(X^\top X)^{-1} X^\top Y] \\
&= (X^\top X)^{-1} X^\top \EE Y \\
&= \cancel{(X^\top X)^{-1}} \cancel{X^\top X} \beta \\
&= \beta
\end{align*}
Hence $\hat{\beta}$ is unbiased. We can also calculate:
\begin{align*}
\Var(\hat{\beta}) &= \Var((X^\top X)^{-1} X^\top Y) \\
&= (X^\top X)^{-1} X^\top \Var(Y) X(X^\top X)^{-1} \\
&= (X^\top X)^{-1} X^\top \sigma^2 I X(X^\top X)^{-1} \\
&= \sigma^2 (X^\top X)^{-1}
\end{align*}
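\noindent As a quick numerical sanity check (a minimal sketch, assuming NumPy is available; the simulated numbers and variable names below are illustrative only), we can simulate data from the model and verify that solving the normal equations agrees with a standard least squares routine:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, p, sigma = 100, 3, 0.5

# Design matrix with a dummy first column of ones (the intercept trick above)
X = np.column_stack([np.ones(n), rng.normal(size=(n, p - 1))])
beta = np.array([1.0, 2.0, -0.5])          # "true" coefficients, chosen arbitrarily
Y = X @ beta + sigma * rng.normal(size=n)  # Y = X beta + eps, eps ~ N(0, sigma^2 I)

# Least squares estimator: solve the normal equations X^T X beta_hat = X^T Y
beta_hat = np.linalg.solve(X.T @ X, X.T @ Y)

# Same estimator via a library least squares routine
beta_lstsq, *_ = np.linalg.lstsq(X, Y, rcond=None)

print(beta_hat)                            # close to the true beta
print(np.allclose(beta_hat, beta_lstsq))   # True: both minimise ||Y - X beta||^2
\end{verbatim}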
\begin{flashcard}[Gauss-Markov-thm]
\begin{theorem*}[Gauss-Markov]
Let $\beta^* = CY$ be any linear estimator of $\beta$ which is unbiased. Then \cloze{for any $t \in \RR^p$,
\[ \Var(t^\top \hat{\beta}) \le \Var(t^\top \beta^*) \]}
We say $\hat{\beta}$ is \cloze{the ``Best Linear Unbiased Estimator'' (BLUE).}
\end{theorem*}
\end{flashcard}

\begin{hiddenflashcard}[gauss-markov-proof]
Proof of Gauss-Markov Theorem? \\
\cloze{
\begin{proof}
First show that it suffices to prove that $\Var \beta^* - \Var \hat{\beta}$ is positive semi-definite. \\
Now let $\beta^* = CY$, $\hat{\beta} = (X^\top X)^{-1} X^\top Y$ and let $A = C - (X^\top X)^{-1} X^\top$. Since both estimators are unbiased,
\[ \EE AY = \EE \beta^* - \EE \hat{\beta} = \beta - \beta = 0 \]
But $\EE AY = A \EE Y = AX\beta = 0$ holds for all $\beta \in \RR^p$, so $AX = 0$. Then
\begin{align*}
\Var \beta^* &= \Var((A + (X^\top X)^{-1} X^\top)Y) \\
&= (A + (X^\top X)^{-1} X^\top) \Var Y (A + (X^\top X)^{-1} X^\top)^\top \\
&= \sigma^2 (AA^\top + (X^\top X)^{-1} + 0 + 0) \\
&= \sigma^2 AA^\top + \Var(\hat{\beta})
\end{align*}
so $\Var \beta^* - \Var \hat{\beta} = \sigma^2 AA^\top$, which is positive semi-definite, as desired.
\end{proof}
}
\end{hiddenflashcard}

\begin{remark*}
Think of $t \in \RR^p$ as the value of the predictors for a new sample. Then $t^\top \hat{\beta}$ and $t^\top \beta^*$ are estimators of the mean response. Both are unbiased, so the MSE of each is its variance. The theorem says that the variance (and hence the MSE) is smallest for the least squares estimator.
\end{remark*}

\begin{proof}
For any $t \in \RR^p$,
\[ \Var(t^\top \beta^*) - \Var(t^\top \hat{\beta}) = t^\top (\Var \beta^* - \Var \hat{\beta}) t \]
This is $\ge 0$ for all $t \in \RR^p$ if and only if the matrix $\Var\beta^* - \Var\hat{\beta}$ is positive semi-definite, so it suffices to prove the latter.
Recall $\beta^* = CY$ and $\hat{\beta} = (X^\top X)^{-1} X^\top Y$. Let $A = C - (X^\top X)^{-1} X^\top$. Note:
\[ \EE AY = \EE \beta^* - \EE \hat{\beta} = \beta - \beta = 0 \]
(since $\beta^*$ and $\hat{\beta}$ are unbiased). But also note
\[ \EE AY = A \EE Y = AX\beta = 0 \]
for all $\beta \in \RR^p$, so we must have $AX = 0$. Then
\begin{align*}
\Var\beta^* &= \Var((A + (X^\top X)^{-1}X^\top)Y) \\
&= (A + (X^\top X)^{-1} X^\top ) \Var Y (A + (X^\top X)^{-1} X^\top )^\top \\
&= \sigma^2 (AA^\top + (X^\top X)^{-1} + \cancel{AX(X^\top X)^{-1}} + \cancel{(X^\top X)^{-1} X^\top A^\top}) \\
&= \sigma^2 AA^\top + \Var(\hat{\beta}) \\
\implies \Var \beta^* - \Var \hat{\beta} &= \sigma^2 AA^\top
\end{align*}
and this is positive semi-definite, as desired.
\end{proof}

\myskip
\noindent Fitted values and residuals: the fitted values are
\[ \hat{Y} = X\hat{\beta} = \ub{X(X^\top X)^{-1} X^\top}_{P \text{ ``hat matrix''}} Y \]
and the residuals are $Y - \hat{Y} = (I - P)Y$.

\begin{proposition*}
$P$ is the orthogonal projection onto $\col(X)$.
\end{proposition*}

\begin{proof}
$P$ is clearly symmetric. Also,
\[ P^2 = X(X^\top X)^{-1} \cancel{X^\top X} \cancel{(X^\top X)^{-1}} X^\top = P \]
Therefore $P$ is an orthogonal projection onto $\col(P)$. We need to show $\col(P) = \col(X)$. For any $a$, $Pa = X[(X^\top X)^{-1} X^\top a] \in \col(X)$. Also, if $b = Xc$ is a vector in $\col(X)$, then
\[ b = Xc = X(X^\top X)^{-1} X^\top Xc = Pb \in \col(P) \qedhere \]
\end{proof}

\begin{corollary*}
Fitted values are projections of $Y$ onto $\col(X)$. Residuals are projections of $Y$ onto $\col(X)^\perp$.
\begin{center}
\includegraphics[width=0.6\linewidth]{images/3ac39c2aba7311ed.png}
\end{center}
\end{corollary*}
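\noindent To make the projection picture concrete, here is a minimal numerical sketch (again assuming NumPy, with arbitrary simulated data) checking the defining properties of $P$:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)
n, p = 50, 3
X = np.column_stack([np.ones(n), rng.normal(size=(n, p - 1))])
Y = X @ np.array([1.0, 2.0, -0.5]) + 0.5 * rng.normal(size=n)

P = X @ np.linalg.solve(X.T @ X, X.T)    # hat matrix P = X (X^T X)^{-1} X^T
Y_hat = P @ Y                            # fitted values: projection of Y onto col(X)
resid = Y - Y_hat                        # residuals: (I - P) Y

print(np.allclose(P, P.T))               # P is symmetric
print(np.allclose(P @ P, P))             # P is idempotent: P^2 = P
print(np.allclose(P @ X, X))             # P fixes col(X)
print(np.allclose(X.T @ resid, 0.0))     # residuals are orthogonal to col(X)
\end{verbatim}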
\subsubsection*{Normal assumptions}
We assume, in addition to $\EE\eps = 0$ and $\Var\eps = \sigma^2 I$, that $\eps$ is multivariate normal (MVN), i.e.
\[ \eps \sim \normaldist(0, \sigma^2 I_n) \]
$\sigma^2$ is usually unknown, so the parameters in the model are $(\beta, \sigma^2)$. We'll see that the MLE of $\beta$ is the least squares estimator $\hat{\beta}$.
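\noindent As a one-line preview of this claim (the details follow later): under the normal assumption the log-likelihood is
\[ \ell(\beta, \sigma^2) = -\frac{n}{2} \log(2\pi\sigma^2) - \frac{1}{2\sigma^2} \|Y - X\beta\|^2, \]
so for any fixed $\sigma^2$, maximising over $\beta$ is the same as minimising the residual sum of squares $S(\beta) = \|Y - X\beta\|^2$, whose minimiser is $\hat{\beta}$.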