%! TEX root = EMC.tex
% vim: tw=50
% 23/01/2025 09AM
\newpage
\section{The Khinchin (Shannon?) axioms for entropy}

\begin{note*}
In this course, ``random variable'' will mean ``discrete random variable'' (unless otherwise specified). All logarithms will be base $2$ (unless otherwise specified).
\end{note*}

\begin{fcdefnstar}[Entropy]
\glsnoundefn{ent}{entropy}{entropies}%
\glsnoundefn{cent}{entropy}{entropies}%
\glsnoundefn{normalisation}{normalisation}{NA}%
\glsnoundefn{invariance}{invariance}{NA}%
\glsnoundefn{extendability}{extendability}{NA}%
\glsnoundefn{maximality}{maximality}{NA}%
\glsnoundefn{continuity}{continuity}{NA}%
\glsnoundefn{additivity}{additivity}{NA}%
\glssymboldefn{H}%
The \emph{entropy} of a discrete random variable $X$ is a quantity $\mathbf{H}[X]$ that takes real values and has the following properties:
\begin{enumerate}[(i)]
\item \emph{Normalisation}: If $X$ is uniform on $\{0, 1\}$, then $\mathbf{H}[X] = 1$.
\item \emph{Invariance}: If $X$ takes values in $A$, $Y$ takes values in $B$, $f$ is a bijection from $A$ to $B$, and $\Pbb[X = a] = \Pbb[Y = f(a)]$ for every $a \in A$, then $\mathbf{H}[Y] = \mathbf{H}[X]$.
\item \emph{Extendability}: If $X$ takes values in a set $A$, $B$ is disjoint from $A$, $Y$ takes values in $A \cup B$, and $\Pbb[Y = a] = \Pbb[X = a]$ for all $a \in A$, then $\mathbf{H}[Y] = \mathbf{H}[X]$.
\item \emph{Maximality}: If $X$ takes values in a finite set $A$ and $Y$ is uniformly distributed on $A$, then $\mathbf{H}[X] \le \mathbf{H}[Y]$.
\item \emph{Continuity}: $\mathbf{H}$ depends continuously on $X$ with respect to total variation distance (where the distance between $X$ and $Y$ is $\sup_E |\Pbb[X \in E] - \Pbb[Y \in E]|$).
\end{enumerate}
For the last axiom we need a definition. Let $X$ and $Y$ be random variables. The \emph{conditional entropy} $\mathbf{H}[X \mid Y]$ of $X$ given $Y$ is
\[ \sum_y \Pbb[Y = y] \mathbf{H}[X \mid Y = y] ,\]
where $\mathbf{H}[X \mid Y = y]$ is the entropy of the conditional distribution of $X$ given the event $\{Y = y\}$.
\begin{enumerate}[(i)]
\setcounter{enumi}{5}
\item \emph{Additivity}: $\mathbf{H}[X, Y] = \mathbf{H}[Y] + \mathbf{H}[X \mid Y]$, where $\mathbf{H}[X, Y]$ denotes the entropy of the pair $(X, Y)$.
\end{enumerate}
\end{fcdefnstar}

\begin{fclemma}[]
\label{lemma:1.1}
% Lemma 1.1
Assuming:
  - $X$ and $Y$ are independent random variables
Then:
\[ \ent{X, Y} = \ent X + \ent Y .\]
\end{fclemma}

\begin{proof}
By definition, $\cent XY = \sum_y \Pbb[Y = y] \ent{X \mid Y = y}$. Since $X$ and $Y$ are independent, for each $y$ the conditional distribution of $X$ given $Y = y$ is the same as the distribution of $X$, so by \gls{invariance}
\[ \ent{X \mid Y = y} = \ent X \]
for all $y$. Hence $\cent XY = \ent X$, and the result follows by \gls{additivity}.
\end{proof}

\begin{corollary}[]
\label{coro:1.2}
% Corollary 1.2
If $X_1, \ldots, X_n$ are independent, then
\[ \ent{X_1, \ldots, X_n} = \ent{X_1} + \cdots + \ent{X_n} .\]
\end{corollary}

\begin{proof}
\cref{lemma:1.1} and induction on $n$.
\end{proof}

\begin{fclemma}[Chain rule]
\label{lemma:1.3}
% Lemma 1.3
Assuming:
  - $X_1, \ldots, X_n$ are random variables
Then:
\[ \ent{X_1, \ldots, X_n} = \ent{X_1} + \cent{X_2}{X_1} + \cent{X_3}{X_1, X_2} + \cdots + \cent{X_n}{X_1, \ldots, X_{n - 1}} .\]
\end{fclemma}

\begin{proof}
The case $n = 2$ is \gls{additivity}. In general,
\[ \ent{X_1, \ldots, X_n} = \ent{X_1, \ldots, X_{n - 1}} + \cent{X_n}{X_1, \ldots, X_{n - 1}} \]
by \gls{additivity} applied (via \gls{invariance}) to the pair $((X_1, \ldots, X_{n - 1}), X_n)$, so we are done by induction.
\end{proof}
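\begin{note*}
As a quick sanity check, consider the simplest dependent pair: let $X$ be uniform on $\{0, 1\}$ and let $Y = X$. Since $(X, Y) = g(X)$ for the map $g : x \mapsto (x, x)$, a bijection onto its image, \gls{invariance} gives $\ent{X, Y} = \ent X$, which is $1$ by \gls{normalisation}. \gls{additivity} (applied to the pair $(Y, X)$, whose entropy is also $1$ by the same argument) then forces
\[ \cent YX = \ent{Y, X} - \ent X = 0 ,\]
i.e.\ once $X$ is known, $Y$ carries no further information. Contrast this with \cref{lemma:1.1}: if $Y$ were instead an independent copy of $X$, we would get $\ent{X, Y} = 2$. The next lemma generalises this observation.
\end{note*}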
\begin{fclemma}[]
\label{lemma:1.4}
% Lemma 1.4
Assuming:
  - $Y = f(X)$ for some function $f$
Then:
\[ \ent{X, Y} = \ent X .\]
Also, for every random variable $Z$,
\[ \cent Z{X, Y} = \cent ZX .\]
\end{fclemma}

\begin{proof}
The map $g : x \mapsto (x, f(x))$ is a bijection onto its image, and $(X, Y) = g(X)$, so the first statement follows by \gls{invariance}. For the second statement:
\begin{align*}
\cent Z{X, Y} &= \ent{Z, X, Y} - \ent{X, Y} &&\text{(by \gls{additivity})} \\
&= \ent{Z, X} - \ent X &&\text{(by the first part, as $Y$ is also a function of $(Z, X)$)} \\
&= \cent ZX &&\text{(by \gls{additivity})} \qedhere
\end{align*}
\end{proof}

\begin{fclemma}[]
\label{lemma:1.5}
% Lemma 1.5
Assuming:
  - $X$ takes only one value
Then:
$\ent X = 0$.
\end{fclemma}

\begin{proof}
Since $X$ is constant, $X$ is independent of itself. Therefore, by \cref{lemma:1.1}, $\ent{X, X} = 2\ent X$. But by \gls{invariance} (via the bijection $x \mapsto (x, x)$ onto its image), $\ent{X, X} = \ent X$. So $\ent X = 0$.
\end{proof}

\begin{fcprop}[]
\label{prop:1.6}
% Proposition 1.6
Assuming:
  - $X$ is uniformly distributed on a set of size $2^n$
Then:
$\ent X = n$.
\end{fcprop}

\begin{proof}
Let $X_1, \ldots, X_n$ be independent random variables, each uniformly distributed on $\{0, 1\}$. By \cref{coro:1.2} and \gls{normalisation}, $\ent{X_1, \ldots, X_n} = n$. But $(X_1, \ldots, X_n)$ is uniformly distributed on $\{0, 1\}^n$, a set of size $2^n$, so the result follows by \gls{invariance}.
\end{proof}
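\begin{note*}
As an illustration of how these results interact, the case $n = 2$ of \cref{prop:1.6} can also be recovered directly. Let $X$ be uniform on $\{0, 1, 2, 3\}$ and let $Y = f(X)$, where $f(x) = 0$ for $x \in \{0, 1\}$ and $f(x) = 1$ otherwise. By \cref{lemma:1.4} and \gls{additivity},
\[ \ent X = \ent{X, Y} = \ent Y + \cent XY .\]
Now $Y$ is uniform on $\{0, 1\}$, so $\ent Y = 1$ by \gls{normalisation}; and conditioned on either value of $Y$, $X$ is uniform on a set of size $2$, so $\ent{X \mid Y = y} = 1$ for each $y$ by \gls{normalisation} and \gls{invariance}, whence $\cent XY = 1$. Hence $\ent X = 2$, as \cref{prop:1.6} predicts.
\end{note*}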