% vim: tw=50 % 26/01/2023 11AM \subsection{Sufficiency} $X_1, \ldots, X_n$ are IID random variables from a distribution with pdf (or pmf) $f_X(\bullet \mid \theta)$. Let $X = (X_1, \ldots, X_n)$. \myskip Question: Is there a statistic $T(X)$ which contains all information in $X$ needed to estimate $\theta$? \begin{flashcard}[sufficiency-defn] \begin{definition*}[Sufficiency] \cloze{ A statistic $T$ is \emph{sufficient} for $\theta$ if the \fcemph{conditional distribution} of $X$ given $T(X)$ \fcemph{does not depend} on $\theta$. } \end{definition*} \end{flashcard} \begin{remark*} $\theta$ and $T(X)$ could be vector-valued. \end{remark*} \begin{example*} $X_1, \ldots, X_n \iidsim \Ber(\theta)$ for $\theta \in [0, 1]$. \begin{align*} f_X(\bullet \mid \theta) &= \prod_{i = 1}^n \theta^{x_i} (1 - \theta)^{1 - x_i} \\ &= \theta^{\sum x_i} (1 - \theta)^{n - \sum x_i} \end{align*} \begin{note*} This only depends on $X$ through $T(X) = \sum_{i = 1}^n X_i$. \end{note*} For $x$ with $\sum x_i = t$, \begin{align*} f_{X \mid T = t}(x \mid T(x) = t) &= \frac{\PP_\theta(X = x, T(X) = t)}{\PP_\theta(T(X) = t)} \\ &= \frac{\PP_\theta(X = x)}{\PP_\theta(T(X) = t)} \\ &= \frac{\theta^{\sum x_i} (1 - \theta)^{n - \sum x_i}}{{n \choose t} \theta^t (1 - \theta)^{n - t}} \\ &= {n \choose t}^{-1} \end{align*} As this doesn't depend on $\theta$, $T(X)$ is sufficient for $\theta$. \end{example*} \begin{flashcard}[factorisation-criterion] \begin{theorem*}[Factorisation criterion] \cloze{ $T$ is sufficient for $\theta$ if and only if \[ f_X(x \mid \theta) = g(T(x), \theta) \cdot h(x) \] for suitable functions $g, h$. } \end{theorem*} \end{flashcard} \begin{proof} (Discrete case) \myskip Suppose $f_X(x \mid \theta) = g(T(x), \theta) h(x)$. 
If $T(x) = t$, then \begin{align*} f_{X \mid T = t}(x \mid T = t) &= \frac{\PP_\theta(X = x, \cancel{T(X) = t})}{\PP_\theta(T(X) = t)} \\ &= \frac{g(T(x), \theta) h(x)}{ \sum_{x' \colon T(x') = t} g(T(x'), \theta) h(x')} \\ &= \frac{\cancel{g(t, \theta)}}{\cancel{g(t, \theta)}} \frac{h(x)}{ \sum_{x' \colon T(x') = t} h(x')} \end{align*} As this doesn't depend on $\theta$, $T(X)$ is sufficient. \myskip Conversely, suppose $T(X)$ is sufficient, then \begin{align*} \PP_\theta(X = x) &= \PP_\theta(X = x, T(X) = t) \\ &= \ub{\PP_\theta(T(X) = t)}_{g(t, \theta)} \cdot \ub{\PP_\theta(X = x \mid T(X) = t)}_{h(x)} \end{align*} Then by sufficiency of $T$, $h(x)$ doesn't depend on $\theta$ (so it is a function of $x$). Thus the pmf of $X$, $f_X(\bullet \mid \theta)$ factorises as in the statement of the theorem. \end{proof} \begin{example*} $X_1, \ldots, X_n \iidsim \Ber(\theta)$. \[ f_X(x \mid \theta) = \theta^{\sum x_i} (1 - \theta)^{n - \sum x_i} \] Take $g(t, \theta) = \theta^t (1 - \theta)^{n - t}$, $h(x) = 1$. This immediately implies $T(X) = \sum x_i$ is sufficient. \end{example*} \begin{example*} $X_1, \ldots, X_n \iidsim \Unif([0, \theta])$, $\theta > 0$. Then \begin{align*} f_X(x \mid \theta) &= \prod_{i = 1}^n \frac{1}{\theta} \mathbbm{1}_{x_i \in [0, \theta]} \\ &= \ub{\frac{1}{\theta^n} \mathbbm{1}_{\{\max_i x_i \le \theta\}}}_{g(T(x), \theta)} \ub{\mathbbm{1}_{\{\min_i x_i \ge 0\}}}_{h(x)} \end{align*} $T(x) = \max_i x_i$. Then by factorisation lemma, $T(x) = \max_i x_i$ is sufficient for $\theta$. \end{example*} \subsubsection*{Minimal Sufficiency} Sufficient stats are \emph{not} unique. Indeed any 1-to-1 function of a sufficient statistic is also sufficient. Also $T(X) = X$ is always sufficient but not very useful. \begin{definition*} A sufficient statistic $T$ is \emph{minimal sufficient} if it is a function of any other sufficient statistic. That is, if $T'$ is also sufficient, then \[ T'(x) = T'(y) \implies T(x) = T(y) \] for all $x, y \in \mathcal{X}^n$. 
\end{definition*} \begin{remark*} Any two minimal sufficient statistics, $T, T'$ are ``in bijection with each other'': \[ T(x) = T(y) \iff T'(x) = T'(y) \] Useful condition to check minimal sufficiency. \end{remark*} \begin{flashcard}[minimal-sufficiency-thm] \begin{theorem*}[Minimal Sufficiency Theorem] \cloze{ Suppose that $T(X)$ is a statistic such that $f_X(x \mid \theta) / f_X(y \mid \theta)$ is constant as a function of $\theta$ if and only if $T(x) = T(y)$. Then $T$ is minimal sufficient. } \end{theorem*} \end{flashcard} \noindent Let $x \stackrel{1}{\sim} y$ if $\frac{f_X(x \mid \theta)}{f_X(y \mid \theta)}$ is constant in $\theta$. It's easy to check that $\stackrel{1}{\sim}$ is an equivalence relation. \myskip Similarly, for a given statistic $T$, $x \stackrel{2}{\sim} y$ if $T(x) = T(y)$ defines another equivalence relation. The condition of the theorem says $\stackrel{1}{\sim}$ and $\stackrel{2}{\sim}$ are the same. \begin{note*} We can always construct a statistic $T$ which is constant on the equivalence classes of $\stackrel{1}{\sim}$, which by the theorem is minimal sufficient. \end{note*} \begin{proof} For any value $t$ of $T$, let $z_t$ be a representative from the equivalence class \[ \{x \mid T(x) = t\} \] Then \begin{align*} f_X(x \mid \theta) &= \ub{f_X(z_{T(x)} \mid \theta)}_{g(T(x), \theta)} \ub{\frac{f_X(x \mid \theta)}{f_X(z_{T(x)} \mid \theta)}}_{h(x)} \end{align*} Where $h(x)$ does not depend on $\theta$ by the hypothesis, as $x \stackrel{1}{\sim} z_{T(x)}$. By factorisation criterion, $T$ is sufficient. \myskip To prove that $T$ is minimal, take any other sufficient statistic $S$. Want to prove that if $S(x) = S(y)$ then $T(x) = T(y)$. By factorisation criterion, there are functions $g_S, h_S$ such that \[ f_X(x \mid \theta) = g_S(S(x), \theta) h_S(x) \] Suppose $S(x) = S(y)$. 
Then \[ \frac{f_X(x \mid \theta)}{f_X(y \mid \theta)} = \frac{\cancel{g_S(S(x), \theta)} h_S(x)}{\cancel{g_S(S(y), \theta)} h_S(y)} \] which doesn't depend on $\theta$. Hence $x \stackrel{1}{\sim} y$. By hypothesis, $x \stackrel{2}{\sim} y$, hence $T(x) = T(y)$. \end{proof} \begin{remark*} Sometimes the range of $X$ depends on $\theta$ (for example $X_1, \ldots, X_n \iidsim \Unif([0, \theta])$). In this case we can interpret \begin{center} ``$\frac{f_X(x \mid \theta)}{f_X(y \mid \theta)}$ is constant in $\theta$'' \end{center} to mean that $f_X(x \mid \theta) = c(x, y) f_X(y \mid \theta)$ for some function $c$ which does not depend on $\theta$. \end{remark*}