% vim: tw=50
% 11/02/2023 09AM
\begin{example*}
$X_1, \ldots, X_n \iidsim \normaldist(\mu, 1)$. Prior: $\pi(\mu)$ is $\normaldist\left( 0, \frac{1}{\tau^2} \right)$.
\begin{align*}
\pi(\mu \mid x) &\propto f_X(x \mid \mu) \cdot \pi(\mu) \\
&\propto \exp \left[ -\half \sum_{i = 1}^n (x_i - \mu)^2 \right] \exp \left[ -\frac{\mu^2 \tau^2}{2} \right] \\
&\propto \exp \left[ -\frac{n + \tau^2}{2} \left\{ \mu - \frac{\sum x_i}{n + \tau^2} \right\}^2 \right]
\end{align*}
(completing the square in $\mu$). We recognise this as a
\[ \normaldist \left( \frac{\sum x_i}{n + \tau^2}, \frac{1}{n + \tau^2} \right) \]
distribution. The Bayes estimator is $\hat{\mu}^{(b)} = \frac{\sum x_i}{n + \tau^2}$ for both quadratic loss and absolute error loss (compare $\hat{\mu}^{\text{mle}} = \frac{\sum x_i}{n}$). A $95\%$ credible interval is
\[ \left( \hat{\mu}^{(b)} - \frac{1.96}{\sqrt{n + \tau^2}}, \hat{\mu}^{(b)} + \frac{1.96}{\sqrt{n + \tau^2}} \right) \]
This is close to a $95\%$ confidence interval when $n \gg \tau^2$.
\end{example*}

\begin{example*}
$X_1, \ldots, X_n \iidsim \Poisson(\lambda)$. Prior: $\pi(\lambda)$ is $\Exp(1)$, i.e.\ $\pi(\lambda) = e^{-\lambda}$, $\lambda > 0$.
\begin{align*}
\pi(\lambda \mid x) &\propto f_X(x \mid \lambda) \cdot \pi(\lambda) \\
&\propto \frac{e^{-n\lambda} \lambda^{\sum x_i}}{\cancel{\prod_i x_i!}} e^{-\lambda} &&\lambda > 0 \\
&= e^{-(n + 1)\lambda} \lambda^{\sum x_i} &&\lambda > 0
\end{align*}
This is a $\Gamma\left( 1 + \sum x_i, n + 1 \right)$ distribution. The Bayes estimator under quadratic loss is the posterior mean
\[ \hat{\lambda}^{(b)} = \frac{\sum x_i + 1}{n + 1} \stackrel{n \to \infty}{\longrightarrow} \frac{\sum x_i}{n} = \hat{\lambda}^{\text{mle}} \]
Under absolute error loss, the Bayes estimator $\tilde{\lambda}^{(b)}$ is the posterior median, i.e.\ it satisfies
\[ \int_0^{\tilde{\lambda}^{(b)}} \frac{(n + 1)^{\sum x_i + 1}}{(\sum x_i)!} \lambda^{\sum x_i} e^{-(n + 1)\lambda} \dd \lambda = \half \]
\end{example*}
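\begin{remark*}
As a quick numerical sanity check (the numbers here are illustrative, not from the lecture): take $n = 10$, $\sum x_i = 12$ and $\tau^2 = 1$ in the first example. Then the posterior is
\[ \normaldist\left( \frac{12}{11}, \frac{1}{11} \right), \]
so $\hat{\mu}^{(b)} = \frac{12}{11} \approx 1.09$ and the $95\%$ credible interval is $1.09 \pm \frac{1.96}{\sqrt{11}} \approx (0.50, 1.68)$, whereas $\hat{\mu}^{\text{mle}} = 1.2$ with $95\%$ confidence interval $1.2 \pm \frac{1.96}{\sqrt{10}} \approx (0.58, 1.82)$. The two intervals move closer together as $n$ grows relative to $\tau^2$.
\end{remark*}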
\subsubsection*{Simple Hypotheses}
A \emph{hypothesis} is some assumption about the distribution of the data $X$. Scientific questions are phrased as a choice between a \emph{null hypothesis} $H_0$ (base case, simple model, no effect) and an \emph{alternative hypothesis} $H_1$ (complex model, interesting case, positive or negative effect).

\myskip Examples and non-examples of simple hypotheses (no explanation yet):
\begin{enumerate}[(1)]
\item $X_1, \ldots, X_n \iidsim \Ber(\theta)$, $H_0$: $\theta = \half$ (fair coin), $H_1$: $\theta = \frac{3}{4}$. This is a valid pair.
\item As in the previous, but $H_0$: $\theta = \half$ and $H_1$: $\theta \neq \half$. This is not a valid pair.
\item $X_1, \ldots, X_n$ take values in $\NN_0$. $H_0$: $X_i \iidsim \Poisson(\lambda)$ for some $\lambda > 0$, $H_1$: $X_i \iidsim f_1$ for some other $f_1$. This is not a valid pair.
\item $X$ has pdf $f(\bullet \mid \theta)$, $\theta \in \Theta$. $H_0$: $\theta \in \Theta_0 \subset \Theta$, $H_1$: $\theta \not\in \Theta_0$. Here $H_0$ is simple if $\Theta_0 = \{\theta_0\}$.
\end{enumerate}

\begin{flashcard}[simple-hypothesis]
A hypothesis is said to be \emph{simple} if \cloze{it fully specifies the distribution of $X$. Otherwise we say it is \emph{composite}.}
\end{flashcard}

\myskip A test of $H_0$ is defined by a \emph{critical region} $C \subseteq \mathcal{X}$. When $X \in C$ we ``reject'' $H_0$, and when $X \not\in C$ we say we ``fail to reject'' or ``find no evidence against'' $H_0$.

\myskip
\begin{flashcard}[type-i-ii-errors]
\prompt{Definitions of Type I and Type II error? \\}%
\noindent Type I error: \cloze{we reject $H_0$ when $H_0$ is true.} \\
Type II error: \cloze{we fail to reject $H_0$ when $H_0$ is false.}
\end{flashcard}

\myskip When $H_0$ and $H_1$ are simple, we define
\[ \alpha = \PP_{H_0}(\text{$H_0$ is rejected}) = \PP_{H_0}(X \in C) \]
the ``probability of Type I error'', and
\[ \beta = \PP_{H_1}(\text{$H_0$ is not rejected}) = \PP_{H_1}(X \not\in C) \]
the ``probability of Type II error''.

\myskip The \emph{size} of the test is $\alpha$. The \emph{power} of the test is $1 - \beta$. There is a tradeoff between minimising size and maximising power. Usually we fix an acceptable size (say $\alpha = 1\%$), then pick the test of size $\alpha$ which maximises the power.

\begin{hiddenflashcard}[size-and-power]
\prompt{Definition of size and power? \\}
\cloze{
$\alpha$ is the probability of Type I error, $\beta$ is the probability of Type II error. Then $\alpha$ is the size, and $1 - \beta$ is the power. Alternatively,
\[ \text{size} = \PP_{H_0}(X \in C) \]
\[ \text{power} = \PP_{H_1}(X \in C) \]
}
\end{hiddenflashcard}

\subsubsection*{Neyman-Pearson Lemma}
Let $H_0, H_1$ be simple. Let $X$ have pdf $f_i$ under $H_i$, $i = 0, 1$. The likelihood ratio statistic is
\[ \Lambda_x(H_0, H_1) = \frac{f_1(x)}{f_0(x)} \]
A likelihood ratio test (LRT) rejects $H_0$ when
\[ X \in C = \{x : \Lambda_x(H_0, H_1) > k\} \]
for some threshold or ``critical value'' $k$.

\begin{flashcard}[neyman-pearson-lemma]
\begin{theorem*}[Neyman-Pearson Lemma]
\cloze{
Suppose that $f_0, f_1$ are \fcemph{non-zero on the same sets}. Suppose there exists $k$ such that the LRT with critical region
\[ C = \{x : \Lambda_x(H_0, H_1) > k\} \]
has size \fcemph{exactly} $\alpha$. Then, this is the test with the smallest $\beta$ (highest power) out of all tests of size $\le \alpha$.
}
\end{theorem*}
\end{flashcard}

\begin{remark*}
An LRT of size $\alpha$ need not exist (try to think of an example). Even then, there is a ``randomised LRT'' with size $\alpha$.
\end{remark*}

\begin{proof}
Let $\ol{C}$ be the complement of $C$. The LRT has
\begin{align*}
\alpha &= \PP_{H_0}(X \in C) &= \int_C f_0(x) \dd x \\
\beta &= \PP_{H_1}(X \not\in C) &= \int_{\ol{C}} f_1(x) \dd x
\end{align*}
Let $C^*$ be the critical region of another test with size $\alpha^*$, power $1 - \beta^*$, with $\alpha^* \le \alpha$. We want to prove that $\beta \le \beta^*$, i.e.\ $\beta - \beta^* \le 0$.
\begin{align*}
\beta - \beta^* &= \int_{\ol{C}} f_1(x) \dd x - \int_{\ol{C^*}} f_1(x) \dd x \\
&= \int_{\ol{C} \cap C^*} f_1(x) \dd x - \int_{\ol{C^*} \cap C} f_1(x) \dd x \\
&= \int_{\ol{C} \cap C^*} \ub{\frac{f_1(x)}{f_0(x)}}_{\le k \text{ on } \ol{C}} f_0(x) \dd x - \int_{\ol{C^*} \cap C} \ub{\frac{f_1(x)}{f_0(x)}}_{> k \text{ on } C} f_0(x) \dd x \\
&\le k \left[ \int_{\ol{C} \cap C^*} f_0(x) \dd x - \int_{\ol{C^*} \cap C} f_0(x) \dd x \right] \\
&= k \left[ \int_{C^*} f_0(x) \dd x - \int_C f_0(x) \dd x \right] \\
&= k(\alpha^* - \alpha) \\
&\le 0 \qedhere
\end{align*}
\end{proof}
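\begin{remark*}
A standard illustration of the lemma (not worked through in this lecture): let $X_1, \ldots, X_n \iidsim \normaldist(\mu, 1)$ and test $H_0 : \mu = 0$ against $H_1 : \mu = \mu_1$ for some fixed $\mu_1 > 0$. Then
\[ \Lambda_x(H_0, H_1) = \frac{f_1(x)}{f_0(x)} = \exp \left[ \mu_1 \sum x_i - \frac{n \mu_1^2}{2} \right], \]
which is increasing in $\sum x_i$, so the LRT rejects exactly when $\bar{x}$ is large. Under $H_0$ we have $\sqrt{n} \bar{X} \sim \normaldist(0, 1)$, so the critical region $C = \{x : \sqrt{n} \bar{x} > z_\alpha\}$ (writing $z_\alpha$ for the upper $\alpha$-point of $\normaldist(0, 1)$) has size exactly $\alpha$, and by the Neyman-Pearson Lemma it is the most powerful test among all tests of size $\le \alpha$.
\end{remark*}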