% vim: tw=50
% 21/01/2023 09AM

\subsubsection*{Change of Variables (in 2D)}
Let $(x, y) \mapsto (u, v)$ be a differentiable bijection. Then
\[ f_{U, V}(u, v) = f_{X, Y}(x(u, v), y(u, v)) \cdot |\det J| \]
\[ J = \pfrac{(x, y)}{(u, v)} = \begin{bmatrix} \pfrac{x}{u} & \pfrac{x}{v} \\ \pfrac{y}{u} & \pfrac{y}{v} \end{bmatrix} \]

\subsubsection*{Important Distributions}
$X \sim \Negbin(k, p)$: in successive IID $\Ber(p)$ trials, $X$ is the time at which the $k$-th success occurs.

\myskip $X \sim \Poisson(\lambda)$ is the limit of a $\Bin(n, \lambda/n)$ as $n \to \infty$.

\begin{hiddenflashcard}[poisson-dist]
Poisson distribution $\Poisson(\lambda)$? \\
\[ f(x) = \cloze{e^{-\lambda} \frac{\lambda^x}{x!}} \]
\[ \EE[X] = \cloze{\lambda} \]
\[ \Var(X) = \cloze{\lambda} \]
\end{hiddenflashcard}

\begin{hiddenflashcard}[gamma-dist]
Gamma distribution $\Gamma(\cloze{\alpha, \lambda})$? \\
\[ f(x) = \cloze{\frac{\lambda^\alpha}{\Gamma(\alpha)} x^{\alpha - 1} e^{-\lambda x}} \]
\[ \EE[X] = \cloze{\frac{\alpha}{\lambda}} \]
\[ \Var(X) = \cloze{\frac{\alpha}{\lambda^2}} \]
\end{hiddenflashcard}

\begin{hiddenflashcard}[beta-dist]
Beta distribution $\Beta(\cloze{a, b})$? \\
\[ f(x) = \cloze{\frac{\Gamma(a + b)}{\Gamma(a) \Gamma(b)} x^{a - 1} (1 - x)^{b - 1}} \]
\[ \EE[X] = \cloze{\frac{a}{a + b}} \]
\[ \Var(X) = \cloze{\frac{ab}{(a + b)^2(a + b + 1)}} \]
\end{hiddenflashcard}

\myskip Suppose $X_i \sim \Gamma(\alpha_i, \lambda)$ for $i = 1, \ldots, n$ with $X_1, \ldots, X_n$ independent. What is the distribution of $S_n = X_1 + \cdots + X_n$?
\[ M_{S_n}(t) = \prod_{i = 1}^n M_{X_i}(t) = \left( \frac{\lambda}{\lambda - t} \right)^{\alpha_1 + \cdots + \alpha_n} \qquad (t < \lambda) \]
This is the MGF of a $\Gamma(\sum \alpha_i, \lambda)$. Hence $S_n \sim \Gamma(\sum \alpha_i, \lambda)$. Also, if $X \sim \Gamma(\alpha, \lambda)$, then for any $b \in (0, \infty)$, $bX \sim \Gamma(\alpha, \lambda/b)$.

\subsubsection*{Special cases}
$\Gamma(1, \lambda) = \Exp(\lambda)$, and $\Gamma(k/2, 1/2) = \chi_k^2$, ``Chi-squared with $k$ degrees of freedom'': the sum of $k$ independent squared $\normaldist(0, 1)$ random variables.

\subsection{Estimation}
Suppose we observe data $X_1, X_2, \ldots, X_n$ which are IID from some PDF (or PMF) $f_X(x \mid \theta)$, with $\theta$ unknown.

\begin{definition*}[Estimator]
An \emph{estimator} is a statistic, i.e.\ a function of the data, $T(X) = \hat{\theta}$, which we use to approximate the true parameter $\theta$. The distribution of $T(X)$ is called the \emph{sampling distribution}.
\end{definition*}

\begin{example*}
$X_1, \ldots, X_n \stackrel{\text{IID}}{\sim} \normaldist(\mu, 1)$.
\[ \hat{\mu} = T(X) = \frac{1}{n} \sum_{i = 1}^n X_i \]
The sampling distribution of $\hat{\mu}$ is $\normaldist \left( \mu, \frac{1}{n} \right)$.
\end{example*}

\begin{definition*}
The \emph{bias} of $\hat{\theta} = T(X)$ is
\[ \bias(\hat{\theta}) = \EE_\theta(\hat{\theta}) - \theta \]
\end{definition*}

\begin{note*}
In general, the bias is a function of $\theta$, even though the notation $\bias(\hat{\theta})$ does not make this explicit.
\end{note*}

\begin{definition*}
We say that $\hat{\theta}$ is \emph{unbiased} if $\bias(\hat{\theta}) = 0$ for all $\theta \in \Theta$.
\end{definition*}

\begin{example*}[Continuing from previous]
$\hat{\mu} = \frac{1}{n} \sum_{i = 1}^n X_i$ is unbiased because $\EE_\mu(\hat{\mu}) = \mu$ for all $\mu \in \RR$.
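In more detail, by linearity of expectation,
\[ \EE_\mu(\hat{\mu}) = \frac{1}{n} \sum_{i = 1}^n \EE_\mu(X_i) = \frac{1}{n} \cdot n\mu = \mu , \]
and since the $X_i$ are independent,
\[ \Var_\mu(\hat{\mu}) = \frac{1}{n^2} \sum_{i = 1}^n \Var_\mu(X_i) = \frac{1}{n} , \]
which matches the sampling distribution $\normaldist \left( \mu, \frac{1}{n} \right)$ stated above.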
\end{example*}

\begin{definition*}
The \emph{mean squared error} (mse) of $\hat{\theta}$ is
\[ \mse(\hat{\theta}) = \EE_\theta((\hat{\theta} - \theta)^2) \]
\end{definition*}

\begin{note*}
Like the bias, $\mse(\hat{\theta})$ is a function of $\theta$!
\end{note*}

\subsubsection*{Bias-variance decomposition}
\begin{align*}
\mse(\hat{\theta}) &= \EE_\theta [(\hat{\theta} - \theta)^2] \\
&= \EE_\theta[(\hat{\theta} - \EE_\theta \hat{\theta} + \EE_\theta \hat{\theta} - \theta)^2] \\
&= \Var_\theta(\hat{\theta}) + \bias^2(\hat{\theta}) + 2 \cancel{\EE_\theta[\hat{\theta} - \EE_\theta \hat{\theta}]} \, (\EE_\theta \hat{\theta} - \theta)
\end{align*}
The two remaining terms on the RHS are $\ge 0$.

\begin{hiddenflashcard}[bias-variance-decomposition]
Bias variance decomposition? \\
\[ \mse(\hat{\theta}) = \cloze{\EE_\theta[(\hat{\theta} - \EE_\theta \hat{\theta} + \EE_\theta \hat{\theta} - \theta)^2] = \Var_\theta(\hat{\theta}) + \bias^2(\hat{\theta})} \]
\end{hiddenflashcard}

\myskip There is a trade-off between bias and variance.

\begin{example*}
$X \sim \Bin(n, \theta)$. Suppose $n$ is known and we wish to estimate $\theta$. The standard estimator is $T_u = \frac{X}{n}$; then $\EE_\theta T_u = \frac{\EE_\theta X}{n} = \theta$ (for all $\theta$), so $T_u$ is unbiased.
\begin{align*}
\mse(T_u) &= \Var_\theta(T_u) \\
&= \frac{\Var_\theta X}{n^2} \\
&= \frac{n\theta(1 - \theta)}{n^2} \\
&= \frac{\theta(1 - \theta)}{n}
\end{align*}
Consider a second estimator
\[ T_B = \frac{X + 1}{n + 2} = \omega \frac{X}{n} + (1 - \omega) \half \]
with $\omega = \frac{n}{n + 2}$. If $X = 8$, $n = 10$ (8 successes in 10 trials), then $T_u = 0.8$, $T_B = \frac{9}{12} = 0.75$.
\begin{align*}
\bias(T_B) &= \EE_\theta T_B - \theta \\
&= \EE_\theta \left( \frac{X + 1}{n + 2} \right) - \theta \\
&= \frac{n}{n + 2}\theta + \frac{1}{n + 2} - \theta
\end{align*}
This is $\neq 0$ for all but one value of $\theta$ (namely $\theta = \half$). Hence $T_B$ is biased.
\[ \Var_\theta(T_B) = \frac{1}{(n + 2)^2}n\theta(1 - \theta) = \frac{\omega^2\theta(1 - \theta)}{n} \]
\begin{align*}
\mse(T_B) &= \Var_\theta(T_B) + \bias^2(T_B) \\
&= \omega^2 \frac{\theta(1 - \theta)}{n} + (1 - \omega)^2 \left( \half - \theta \right)^2
\end{align*}
\begin{center}
\includegraphics[width=0.6\linewidth] {images/ed14ef56997011ed.png}
\end{center}
\end{example*}

\noindent Message: our prior judgements about $\theta$ affect our choice of estimator (in the example above, if we knew the trials were coin flips, we would expect $\theta$ to be near $\half$, so we should use $T_B$).

\myskip Unbiasedness is not necessarily desirable. Consider this pathological example:

\begin{example*}
Suppose $X \sim \Poisson(\lambda)$. We wish to estimate $\theta = \PP(X = 0)^2 = e^{-2\lambda}$. For an estimator $T(X)$ to be unbiased we must have, for all $\lambda$,
\[ \EE_\lambda[T(X)] = \sum_{x = 0}^\infty T(x) \frac{e^{-\lambda} \lambda^x}{x!} = e^{-2\lambda} = \theta \]
\[ \iff \sum_{x = 0}^\infty T(x) \frac{\lambda^x}{x!} = e^{-\lambda} = \sum_{x = 0}^\infty (-1)^x \frac{\lambda^x}{x!} \]
For this to hold for all $\lambda \ge 0$, we need (comparing coefficients of the two power series)
\[ T(x) = (-1)^x \]
This estimator makes no sense!
\end{example*}
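\begin{note*}
A quick check that $T(x) = (-1)^x$ really is unbiased:
\[ \EE_\lambda[(-1)^X] = \sum_{x = 0}^\infty (-1)^x \frac{e^{-\lambda} \lambda^x}{x!} = e^{-\lambda} e^{-\lambda} = e^{-2\lambda} . \]
So it is the unique unbiased estimator here, yet it only ever takes the values $\pm 1$, while the quantity it estimates, $e^{-2\lambda}$, lies in $(0, 1]$.
\end{note*}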