%! TEX root = EMC.tex
% vim: tw=50
% 23/01/2025 09AM
\newpage
\section{The Khinchin (Shannon?) axioms for entropy}

\begin{note*}
In this course, ``random variable'' will mean ``discrete random variable'' (unless otherwise specified). All logarithms will be base $2$ (unless otherwise specified).
\end{note*}

\begin{fcdefnstar}[Entropy]
\glsnoundefn{ent}{entropy}{entropies}%
\glsnoundefn{cent}{entropy}{entropies}%
\glsnoundefn{normalisation}{normalisation}{NA}%
\glsnoundefn{invariance}{invariance}{NA}%
\glsnoundefn{extendability}{extendability}{NA}%
\glsnoundefn{maximality}{maximality}{NA}%
\glsnoundefn{continuity}{continuity}{NA}%
\glsnoundefn{additivity}{additivity}{NA}%
\glssymboldefn{H}%
The \emph{entropy} of a discrete random variable $X$ is a quantity $\mathbf{H}[X]$ that takes real values and has the following properties:
\begin{enumerate}[(i)]
\item \emph{Normalisation}: If $X$ is uniform on $\{0, 1\}$, then $\mathbf{H}[X] = 1$.
\item \emph{Invariance}: If $X$ takes values in $A$, $Y$ takes values in $B$, $f$ is a bijection from $A$ to $B$, and $\Pbb[X = a] = \Pbb[Y = f(a)]$ for every $a \in A$, then $\mathbf{H}[Y] = \mathbf{H}[X]$.
\item \emph{Extendability}: If $X$ takes values in a set $A$, $B$ is disjoint from $A$, $Y$ takes values in $A \cup B$, and $\Pbb[Y = a] = \Pbb[X = a]$ for all $a \in A$, then $\mathbf{H}[Y] = \mathbf{H}[X]$.
\item \emph{Maximality}: If $X$ takes values in a finite set $A$ and $Y$ is uniformly distributed on $A$, then $\mathbf{H}[X] \le \mathbf{H}[Y]$.
\item \emph{Continuity}: $\mathbf{H}$ depends continuously on $X$ with respect to total variation distance (where the distance between $X$ and $Y$ is $\sup_E |\Pbb[X \in E] - \Pbb[Y \in E]|$).
\end{enumerate}
For the last axiom we need a definition. Let $X$ and $Y$ be random variables. The \emph{conditional entropy} $\mathbf{H}[X \mid Y]$ of $X$ given $Y$ is
\[ \sum_y \Pbb[Y = y] \mathbf{H}[X \mid Y = y] ,\]
where $\mathbf{H}[X \mid Y = y]$ is the entropy of the conditional distribution of $X$ given the event $\{Y = y\}$.
\begin{enumerate}[(i)]
\setcounter{enumi}{5}
\item \emph{Additivity}: $\mathbf{H}[X, Y] = \mathbf{H}[Y] + \mathbf{H}[X \mid Y]$, where $\mathbf{H}[X, Y]$ denotes the entropy of the pair $(X, Y)$.
\end{enumerate}
\end{fcdefnstar}

\begin{fclemma}[]
\label{lemma:1.1}
% Lemma 1.1
Assuming:
  - $X$ and $Y$ are independent random variables
Then:
\[ \ent{X, Y} = \ent X + \ent Y .\]
\end{fclemma}

\begin{proof}
By definition, $\cent XY = \sum_y \Pbb[Y = y] \ent{X \mid Y = y}$. Since $X$ and $Y$ are independent, for each $y$ the conditional distribution of $X$ given $Y = y$ is the same as the distribution of $X$, so by \gls{invariance}
\[ \ent{X \mid Y = y} = \ent X \]
for all $y$. Hence $\cent XY = \ent X$, and the result follows by \gls{additivity}.
\end{proof}

\begin{corollary}[]
\label{coro:1.2}
% Corollary 1.2
If $X_1, \ldots, X_n$ are independent, then
\[ \ent{X_1, \ldots, X_n} = \ent{X_1} + \cdots + \ent{X_n} .\]
\end{corollary}

\begin{proof}
\cref{lemma:1.1} and induction on $n$.
\end{proof}

\begin{fclemma}[Chain rule]
\label{lemma:1.3}
% Lemma 1.3
Assuming:
  - $X_1, \ldots, X_n$ are random variables
Then:
\[ \ent{X_1, \ldots, X_n} = \ent{X_1} + \cent{X_2}{X_1} + \cent{X_3}{X_1, X_2} + \cdots + \cent{X_n}{X_1, \ldots, X_{n - 1}} .\]
\end{fclemma}

\begin{proof}
The case $n = 2$ is \gls{additivity}. In general,
\[ \ent{X_1, \ldots, X_n} = \ent{X_1, \ldots, X_{n - 1}} + \cent{X_n}{X_1, \ldots, X_{n - 1}} \]
by \gls{additivity} applied (via \gls{invariance}) to the pair $((X_1, \ldots, X_{n - 1}), X_n)$, so we are done by induction.
\end{proof}
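\begin{note*}
As a quick sanity check, consider the simplest dependent pair: let $X$ be uniform on $\{0, 1\}$ and let $Y = X$. Since $(X, Y) = g(X)$ for the map $g : x \mapsto (x, x)$, a bijection onto its image, \gls{invariance} gives $\ent{X, Y} = \ent X$, which is $1$ by \gls{normalisation}. \gls{additivity} (applied to the pair $(Y, X)$, whose entropy is also $1$ by the same argument) then forces
\[ \cent YX = \ent{Y, X} - \ent X = 0 ,\]
i.e.\ once $X$ is known, $Y$ carries no further information. Contrast this with \cref{lemma:1.1}: if $Y$ were instead an independent copy of $X$, we would get $\ent{X, Y} = 2$. The next lemma generalises this observation.
\end{note*}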
\begin{fclemma}[]
\label{lemma:1.4}
% Lemma 1.4
Assuming:
  - $Y = f(X)$ for some function $f$
Then:
\[ \ent{X, Y} = \ent X .\]
Also, for every random variable $Z$,
\[ \cent Z{X, Y} = \cent ZX .\]
\end{fclemma}

\begin{proof}
The map $g : x \mapsto (x, f(x))$ is a bijection onto its image, and $(X, Y) = g(X)$, so the first statement follows by \gls{invariance}. For the second statement:
\begin{align*}
\cent Z{X, Y} &= \ent{Z, X, Y} - \ent{X, Y} &&\text{(by \gls{additivity})} \\
&= \ent{Z, X} - \ent X &&\text{(by the first part, as $Y$ is also a function of $(Z, X)$)} \\
&= \cent ZX &&\text{(by \gls{additivity})} \qedhere
\end{align*}
\end{proof}

\begin{fclemma}[]
\label{lemma:1.5}
% Lemma 1.5
Assuming:
  - $X$ takes only one value
Then:
$\ent X = 0$.
\end{fclemma}

\begin{proof}
Since $X$ is constant, $X$ is independent of itself. Therefore, by \cref{lemma:1.1}, $\ent{X, X} = 2\ent X$. But by \gls{invariance} (via the bijection $x \mapsto (x, x)$ onto its image), $\ent{X, X} = \ent X$. So $\ent X = 0$.
\end{proof}

\begin{fcprop}[]
\label{prop:1.6}
% Proposition 1.6
Assuming:
  - $X$ is uniformly distributed on a set of size $2^n$
Then:
$\ent X = n$.
\end{fcprop}

\begin{proof}
Let $X_1, \ldots, X_n$ be independent random variables, each uniformly distributed on $\{0, 1\}$. By \cref{coro:1.2} and \gls{normalisation}, $\ent{X_1, \ldots, X_n} = n$. But $(X_1, \ldots, X_n)$ is uniformly distributed on $\{0, 1\}^n$, a set of size $2^n$, so the result follows by \gls{invariance}.
\end{proof}
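\begin{note*}
As an illustration of how these results interact, the case $n = 2$ of \cref{prop:1.6} can also be recovered directly. Let $X$ be uniform on $\{0, 1, 2, 3\}$ and let $Y = f(X)$, where $f(x) = 0$ for $x \in \{0, 1\}$ and $f(x) = 1$ otherwise. By \cref{lemma:1.4} and \gls{additivity},
\[ \ent X = \ent{X, Y} = \ent Y + \cent XY .\]
Now $Y$ is uniform on $\{0, 1\}$, so $\ent Y = 1$ by \gls{normalisation}; and conditioned on either value of $Y$, $X$ is uniform on a set of size $2$, so $\ent{X \mid Y = y} = 1$ for each $y$ by \gls{normalisation} and \gls{invariance}, whence $\cent XY = 1$. Hence $\ent X = 2$, as \cref{prop:1.6} predicts.
\end{note*}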