% vim: tw=50
% 21/01/2023 09AM

\subsubsection*{Change of Variables (in 2D)}
Let $(x, y) \mapsto (u, v)$ be a differentiable bijection. Then
\[ f_{U, V}(u, v) = f_{X, Y}(x(u, v), y(u, v)) \cdot |\det J| \]
\[ J = \pfrac{(x, y)}{(u, v)} = \begin{bmatrix} \pfrac{x}{u} & \pfrac{x}{v} \\ \pfrac{y}{u} & \pfrac{y}{v} \end{bmatrix} \]

\subsubsection*{Important Distributions}
$X \sim \Negbin(k, p)$: in successive IID $\Ber(p)$ trials, $X$ is the time at which the $k$-th success occurs.

\myskip $X \sim \Poisson(\lambda)$ is the limit of a $\Bin(n, \lambda/n)$ as $n \to \infty$.

\begin{hiddenflashcard}[poisson-dist]
Poisson distribution $\Poisson(\lambda)$? \\
\[ f(x) = \cloze{e^{-\lambda} \frac{\lambda^x}{x!}} \]
\[ \EE[X] = \cloze{\lambda} \]
\[ \Var(X) = \cloze{\lambda} \]
\end{hiddenflashcard}

\begin{hiddenflashcard}[gamma-dist]
Gamma distribution $\Gamma(\cloze{\alpha, \lambda})$? \\
\[ f(x) = \cloze{\frac{\lambda^\alpha}{\Gamma(\alpha)} x^{\alpha - 1} e^{-\lambda x}} \]
\[ \EE[X] = \cloze{\frac{\alpha}{\lambda}} \]
\[ \Var(X) = \cloze{\frac{\alpha}{\lambda^2}} \]
\end{hiddenflashcard}

\begin{hiddenflashcard}[beta-dist]
Beta distribution $\Beta(\cloze{a, b})$? \\
\[ f(x) = \cloze{\frac{\Gamma(a + b)}{\Gamma(a) \Gamma(b)} x^{a - 1} (1 - x)^{b - 1}} \]
\[ \EE[X] = \cloze{\frac{a}{a + b}} \]
\[ \Var(X) = \cloze{\frac{ab}{(a + b)^2(a + b + 1)}} \]
\end{hiddenflashcard}

\myskip Suppose $X_i \sim \Gamma(\alpha_i, \lambda)$ for $i = 1, \ldots, n$ with $X_1, \ldots, X_n$ independent. What is the distribution of $S_n = X_1 + \cdots + X_n$?
\[ M_{S_n}(t) = \prod_{i = 1}^n M_{X_i}(t) = \left( \frac{\lambda}{\lambda - t} \right)^{\alpha_1 + \cdots + \alpha_n} \qquad (t < \lambda) \]
This is the MGF of a $\Gamma(\sum \alpha_i, \lambda)$. Hence $S_n \sim \Gamma(\sum \alpha_i, \lambda)$. Also, if $X \sim \Gamma(\alpha, \lambda)$, then for any $b \in (0, \infty)$, $bX \sim \Gamma(\alpha, \lambda/b)$.

\subsubsection*{Special cases}
$\Gamma(1, \lambda) = \Exp(\lambda)$, and $\Gamma(k/2, 1/2) = \chi_k^2$, ``Chi-squared with $k$ degrees of freedom'': the sum of $k$ independent squared $\normaldist(0, 1)$ random variables.

\subsection{Estimation}
Suppose we observe data $X_1, X_2, \ldots, X_n$ which are IID from some PDF (or PMF) $f_X(x \mid \theta)$, with $\theta$ unknown.

\begin{definition*}[Estimator]
An \emph{estimator} is a statistic, i.e.\ a function of the data, $T(X) = \hat{\theta}$, which we use to approximate the true parameter $\theta$. The distribution of $T(X)$ is called the \emph{sampling distribution}.
\end{definition*}

\begin{example*}
$X_1, \ldots, X_n \stackrel{\text{IID}}{\sim} \normaldist(\mu, 1)$.
\[ \hat{\mu} = T(X) = \frac{1}{n} \sum_{i = 1}^n X_i \]
The sampling distribution of $\hat{\mu}$ is $\normaldist \left( \mu, \frac{1}{n} \right)$.
\end{example*}

\begin{definition*}
The \emph{bias} of $\hat{\theta} = T(X)$ is
\[ \bias(\hat{\theta}) = \EE_\theta(\hat{\theta}) - \theta \]
\end{definition*}

\begin{note*}
In general, the bias is a function of $\theta$, even though the notation $\bias(\hat{\theta})$ does not make this explicit.
\end{note*}

\begin{definition*}
We say that $\hat{\theta}$ is \emph{unbiased} if $\bias(\hat{\theta}) = 0$ for all $\theta \in \Theta$.
\end{definition*}

\begin{example*}[Continuing from previous]
$\hat{\mu} = \frac{1}{n} \sum_{i = 1}^n X_i$ is unbiased because $\EE_\mu(\hat{\mu}) = \mu$ for all $\mu \in \RR$.
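In more detail, by linearity of expectation,
\[ \EE_\mu(\hat{\mu}) = \frac{1}{n} \sum_{i = 1}^n \EE_\mu(X_i) = \frac{1}{n} \cdot n\mu = \mu , \]
and since the $X_i$ are independent,
\[ \Var_\mu(\hat{\mu}) = \frac{1}{n^2} \sum_{i = 1}^n \Var_\mu(X_i) = \frac{1}{n} , \]
which matches the sampling distribution $\normaldist \left( \mu, \frac{1}{n} \right)$ stated above.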
\end{example*}

\begin{definition*}
The \emph{mean squared error} (mse) of $\hat{\theta}$ is
\[ \mse(\hat{\theta}) = \EE_\theta((\hat{\theta} - \theta)^2) \]
\end{definition*}

\begin{note*}
Like the bias, $\mse(\hat{\theta})$ is a function of $\theta$!
\end{note*}

\subsubsection*{Bias-variance decomposition}
\begin{align*}
\mse(\hat{\theta}) &= \EE_\theta [(\hat{\theta} - \theta)^2] \\
&= \EE_\theta[(\hat{\theta} - \EE_\theta \hat{\theta} + \EE_\theta \hat{\theta} - \theta)^2] \\
&= \Var_\theta(\hat{\theta}) + \bias^2(\hat{\theta}) + 2 \cancel{\EE_\theta[\hat{\theta} - \EE_\theta \hat{\theta}]} \, (\EE_\theta \hat{\theta} - \theta)
\end{align*}
The two remaining terms on the RHS are $\ge 0$.

\begin{hiddenflashcard}[bias-variance-decomposition]
Bias variance decomposition? \\
\[ \mse(\hat{\theta}) = \cloze{\EE_\theta[(\hat{\theta} - \EE_\theta \hat{\theta} + \EE_\theta \hat{\theta} - \theta)^2] = \Var_\theta(\hat{\theta}) + \bias^2(\hat{\theta})} \]
\end{hiddenflashcard}

\myskip There is a trade-off between bias and variance.

\begin{example*}
$X \sim \Bin(n, \theta)$. Suppose $n$ is known and we wish to estimate $\theta$. The standard estimator is $T_u = \frac{X}{n}$; then $\EE_\theta T_u = \frac{\EE_\theta X}{n} = \theta$ (for all $\theta$), so $T_u$ is unbiased.
\begin{align*}
\mse(T_u) &= \Var_\theta(T_u) \\
&= \frac{\Var_\theta X}{n^2} \\
&= \frac{n\theta(1 - \theta)}{n^2} \\
&= \frac{\theta(1 - \theta)}{n}
\end{align*}
Consider a second estimator
\[ T_B = \frac{X + 1}{n + 2} = \omega \frac{X}{n} + (1 - \omega) \half \]
with $\omega = \frac{n}{n + 2}$. If $X = 8$, $n = 10$ (8 successes in 10 trials), then $T_u = 0.8$, $T_B = \frac{9}{12} = 0.75$.
\begin{align*}
\bias(T_B) &= \EE_\theta T_B - \theta \\
&= \EE_\theta \left( \frac{X + 1}{n + 2} \right) - \theta \\
&= \frac{n}{n + 2}\theta + \frac{1}{n + 2} - \theta
\end{align*}
This is $\neq 0$ for all but one value of $\theta$ (namely $\theta = \half$). Hence $T_B$ is biased.
\[ \Var_\theta(T_B) = \frac{1}{(n + 2)^2}n\theta(1 - \theta) = \frac{\omega^2\theta(1 - \theta)}{n} \]
\begin{align*}
\mse(T_B) &= \Var_\theta(T_B) + \bias^2(T_B) \\
&= \omega^2 \frac{\theta(1 - \theta)}{n} + (1 - \omega)^2 \left( \half - \theta \right)^2
\end{align*}
\begin{center}
\includegraphics[width=0.6\linewidth] {images/ed14ef56997011ed.png}
\end{center}
\end{example*}

\noindent Message: our prior judgements about $\theta$ affect our choice of estimator (in the example above, if we knew the trials were coin flips, we would expect $\theta$ to be near $\half$, so we should use $T_B$).

\myskip Unbiasedness is not necessarily desirable. Consider this pathological example:

\begin{example*}
Suppose $X \sim \Poisson(\lambda)$. We wish to estimate $\theta = \PP(X = 0)^2 = e^{-2\lambda}$. For an estimator $T(X)$ to be unbiased we must have, for all $\lambda$,
\[ \EE_\lambda[T(X)] = \sum_{x = 0}^\infty T(x) \frac{e^{-\lambda} \lambda^x}{x!} = e^{-2\lambda} = \theta \]
\[ \iff \sum_{x = 0}^\infty T(x) \frac{\lambda^x}{x!} = e^{-\lambda} = \sum_{x = 0}^\infty (-1)^x \frac{\lambda^x}{x!} \]
For this to hold for all $\lambda \ge 0$, we need (comparing coefficients of the two power series)
\[ T(x) = (-1)^x \]
This estimator makes no sense!
\end{example*}
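\begin{note*}
A quick check that $T(x) = (-1)^x$ really is unbiased:
\[ \EE_\lambda[(-1)^X] = \sum_{x = 0}^\infty (-1)^x \frac{e^{-\lambda} \lambda^x}{x!} = e^{-\lambda} e^{-\lambda} = e^{-2\lambda} . \]
So it is the unique unbiased estimator here, yet it only ever takes the values $\pm 1$, while the quantity it estimates, $e^{-2\lambda}$, lies in $(0, 1]$.
\end{note*}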