% vim: tw=50
% 19/01/2023 11AM

\setcounter{section}{-1}
\section{Introduction}
Statistics: The science of making informed decisions. Can include:
\begin{itemize}
\item Design of experiments
\item Graphical exploration of data
\item \emph{Formal statistical inference} $\in$ Decision theory
\item Communication of results.
\end{itemize}
Let $X_1, X_2, \dots, X_n$ be independent observations from some distribution $f_X(\bullet \mid \theta)$, with parameter $\theta$. We wish to infer the value of $\theta$ from $X_1, \dots, X_n$:
\begin{itemize}
\item Estimating $\theta$
\item Quantifying uncertainty in an estimator
\item Testing a hypothesis about $\theta$.
\end{itemize}

\subsection{Probability Review}
Let $\Omega$ be the \emph{sample space} of outcomes in an experiment. A ``nice'' or measurable subset of $\Omega$ is called an \emph{event}; we denote the set of events by $\mathcal{F}$. A function $\PP : \mathcal{F} \to [0, 1]$ is called a \emph{probability measure} if:
\begin{itemize}
\item $\PP(\emptyset) = 0$
\item $\PP(\Omega) = 1$
\item $\PP\left( \bigcup_{i = 1}^\infty A_i \right) = \sum_{i = 1}^\infty \PP(A_i)$ if $(A_i)$ are disjoint.
\end{itemize}
A \emph{random variable} is a (measurable) function $X \colon \Omega \to \RR$. For example: tossing a coin twice, $\Omega = \{HH, HT, TH, TT\}$, and $X$: number of heads.
\[ X(HH) = 2 \qquad X(TH) = X(HT) = 1 \qquad X(TT) = 0 \]
The \emph{distribution function} of $X$ is
\[ F_X(x) = \PP(X \le x) \]
A \emph{discrete} random variable takes values in a countable set $\mathcal{X} \subseteq \RR$; its \emph{probability mass function} or pmf is $p_X(x) = \PP(X = x)$. We say $X$ has a \emph{continuous} distribution if it has a \emph{probability density function} or pdf satisfying
\[ \PP(X \in A) = \int_A f_X(x) \dd x \]
for any ``nice'' set $A$.

\myskip
The \emph{expectation} of $X$ is
\[ \EE X = \begin{cases}
\sum_{x \in \mathcal{X}} x p_X(x) & \text{if $X$ is discrete} \\
\int x f_X(x) \dd x & \text{if $X$ is continuous}
\end{cases} \]
If $g : \RR \to \RR$,
\[ \EE g(X) = \int g(x) f_X(x) \dd x \]
The \emph{variance} of $X$ is
\[ \Var(X) = \EE((X - \EE X)^2) \]
We say that $X_1, X_2, \dots, X_n$ are independent if for all $x_1, \dots, x_n$
\[ \PP(X_1 \le x_1, \ldots, X_n \le x_n) = \PP(X_1 \le x_1) \cdots \PP(X_n \le x_n) \]
If the variables have pdfs, then
\[ f_X(x) = \prod_{i = 1}^n f_{X_i}(x_i) \]
($x = (x_1, \dots, x_n)$, $X = (X_1, \dots, X_n)$).

\subsubsection*{Linear transformations}
If $a_1, \dots, a_n \in \RR$,
\[ \EE(a_1 X_1 + \cdots + a_n X_n) = a_1 \EE X_1 + \cdots + a_n \EE X_n \]
\[ \Var(a_1 X_1 + \cdots + a_n X_n) = \sum_{i, j} a_i a_j \Cov(X_i, X_j) \]
($\Cov(X_i, X_j) = \EE((X_i - \EE X_i)(X_j - \EE X_j))$). If $X = (X_1, \dots, X_n)^\top$,
\begin{align*}
\EE X &= (\EE X_1, \dots, \EE X_n)^\top \\
\EE (a^\top X) &= a^\top \EE X \\
\Var(a^\top X) &= a^\top \ub{\Var(X)}_{(\Var(X))_{ij} = \Cov(X_i, X_j)} a
\end{align*}

\subsubsection*{Moment generating functions}
\[ M_X(t) = \EE(e^{tX}) \]
This may only exist for $t$ in some neighbourhood of $0$.
\begin{itemize}
\item $\EE(X^n) = \frac{\dd^n}{\dd t^n} M_X(t) \Big|_{t = 0}$
\item $M_X = M_Y \implies F_X = F_Y$
\item Makes it easy to find the distribution of sums of IID variables.
\end{itemize}
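As a quick check of the first property (the exponential distribution is used here just for illustration): if $X$ has pdf $\lambda e^{-\lambda x}$ on $(0, \infty)$, then for $t < \lambda$,
\[ M_X(t) = \int_0^\infty e^{tx} \lambda e^{-\lambda x} \dd x = \frac{\lambda}{\lambda - t}, \]
so $\EE X = M_X'(0) = \frac{1}{\lambda}$, $\EE(X^2) = M_X''(0) = \frac{2}{\lambda^2}$, and hence $\Var(X) = \frac{1}{\lambda^2}$.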
\begin{example*}
Let $X_1, \dots, X_n$ be IID $\Poisson(\mu)$.
\begin{align*}
M_{X_1}(t) &= \EE e^{tX_1} \\
&= \sum_{x = 0}^\infty e^{tx} \cdot \frac{e^{-\mu} \mu^x}{x!} \\
&= e^{-\mu} \sum_{x = 0}^\infty \frac{(e^t \mu)^x}{x!} \\
&= e^{-\mu} e^{\mu e^t} \\
&= e^{-\mu(1 - e^t)}
\end{align*}
Let $S_n = X_1 + \cdots + X_n$. Then
\begin{align*}
M_{S_n}(t) &= \EE e^{t(X_1 + \cdots + X_n)} \\
&= \prod_{i = 1}^n \EE e^{tX_i} && \text{(independence)} \\
&= e^{-n\mu(1 - e^t)}
\end{align*}
Observe that this is the mgf of a $\Poisson(\mu n)$ distribution, so $S_n \sim \Poisson(\mu n)$.
\end{example*}

\subsubsection*{Limit Theorems}
Weak law of large numbers (WLLN): let $X_1, \dots, X_n$ be IID with $\EE X_1 = \mu$, and let
\[ \ol{X}_n = \frac{1}{n} \sum_{i = 1}^n X_i \]
be the ``sample mean''. Then for all $\eps > 0$,
\[ \PP(\ub{|\ol{X}_n - \mu| > \eps}_{\text{event that depends only on $X_1, \dots, X_n$}}) \to 0 \qquad \text{as $n \to \infty$} \]
Strong law of large numbers (SLLN):
\[ \PP(\ol{X}_n \stackrel{n \to \infty}{\longrightarrow} \mu) = 1 \]
(This event depends on the \emph{whole} sequence $X_1, X_2, \dots$, since $\ol{X}_n \to \mu \iff \forall \eps > 0 \, \exists N \, \forall n > N, \, |\ol{X}_n - \mu| < \eps$.)

\subsubsection*{Central Limit Theorem}
Let $Z_n = \frac{\sqrt{n}(\ol{X}_n - \mu)}{\sigma}$ where $\sigma^2 = \Var(X_i)$. Then $Z_n$ is approximately $\normaldist(0, 1)$ for large $n$:
\[ \PP(Z_n \le z) \to \Phi(z) \qquad \text{as $n \to \infty$} \qquad \forall z \in \RR \]
where $\Phi$ is the distribution function of a $\normaldist(0, 1)$ variable.

\subsubsection*{Conditioning}
Let $X$ and $Y$ be discrete random variables. Their joint pmf is
\[ p_{X,Y}(x, y) = \PP(X = x, Y = y) \]
The marginal pmf of $X$ is
\[ p_X(x) = \PP(X = x) = \sum_y p_{X,Y}(x, y) \]
The conditional pmf of $X$ given $Y = y$ is
\begin{align*}
p_{X \mid Y}(x \mid y) &= \PP(X = x \mid Y = y) \\
&= \frac{\PP(X = x, Y = y)}{\PP(Y = y)} \\
&= \frac{p_{X, Y}(x, y)}{p_Y(y)}
\end{align*}
(defined $= 0$ if $p_Y(y) = 0$). If $X, Y$ are continuous, the joint pdf $f_{X,Y}$ satisfies
\[ \PP(X \le x', Y \le y') = \int_{-\infty}^{x'} \int_{-\infty}^{y'} f_{X, Y}(x, y) \dd y \dd x \]
The marginal pdf of $Y$ is
\[ f_Y(y) = \int_{-\infty}^\infty f_{X, Y}(x, y) \dd x \]
The conditional pdf of $X$ given $Y$ is
\[ f_{X \mid Y}(x \mid y) = \frac{f_{X, Y}(x, y)}{f_Y(y)} \]
Conditional expectation:
\[ \EE(X \mid Y = y) = \begin{cases}
\sum_x x p_{X \mid Y}(x \mid y) & \text{if $X$ is discrete} \\
\int_{-\infty}^\infty x f_{X \mid Y}(x \mid y) \dd x & \text{if $X$ is continuous}
\end{cases} \]
($\EE(X \mid Y)$ is treated as a random variable, which is a function of $Y$).

\myskip
Tower property:
\[ \EE(\EE(X \mid Y)) = \EE X \]
Conditional variance formula:
\begin{align*}
\Var(X) &= \EE(X^2) - (\EE X)^2 \\
&= \EE(\EE(X^2 \mid Y)) - (\EE(\EE(X \mid Y)))^2 \\
&= \EE(\EE(X^2 \mid Y) - [\EE(X \mid Y)]^2) + \EE[(\EE(X \mid Y))^2] - (\EE[\EE(X \mid Y)])^2 \\
&= \EE \Var(X \mid Y) + \Var(\EE(X \mid Y))
\end{align*}
\begin{hiddenflashcard}[conditional-variance-formula]
Conditional variance formula? \\
\[ \cloze{\Var(X) = \EE \Var(X \mid Y) + \Var \EE(X \mid Y)} \]
\end{hiddenflashcard}
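\begin{example*}
An illustration of the tower property and the conditional variance formula (the distributions here are chosen just for illustration): suppose $N \sim \Poisson(\mu)$ and, given $N = n$, $X \sim \operatorname{Bin}(n, p)$. Then
\begin{align*}
\EE X &= \EE(\EE(X \mid N)) = \EE(Np) = p\mu \\
\Var(X) &= \EE \Var(X \mid N) + \Var(\EE(X \mid N)) \\
&= \EE(Np(1 - p)) + \Var(Np) \\
&= \mu p (1 - p) + p^2 \mu = \mu p
\end{align*}
\end{example*}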