% vim: tw=50
% 06/10/2022 10AM
\setcounter{section}{-1}
\section{Introduction}
\begin{definition*}[Markov Chains]
\emph{Markov chains} are random processes (sequences of random variables) that retain no memory of the past.
\[ \text{past} \,\, \underset{\text{present}}{\perp} \,\, \text{future} \]
\end{definition*}
\subsubsection*{History}
\begin{itemize}
\item Markov in 1906
\item Poisson process, branching processes existed before. \\ \emph{Motivation}: Extend the law of large numbers to the non-IID setting.
\item Kolmogorov in 1930: continuous time Markov processes.
\item Brownian motion: fundamental object in modern probability theory.
\end{itemize}
\subsubsection*{Why Study Markov Chains?}
They are the simplest mathematical models for random phenomena evolving in time.
\begin{itemize}
\item Simple: amenable to analysis -- tools from probability, analysis, combinatorics.
\item Applications: population growth, mathematical genetics, queuing networks, Monte Carlo simulation, \dots
\end{itemize}
\subsection{PageRank algorithm}
This is an example of a simple algorithm which was previously used by search engines such as Google.
\myskip
Model the web as a directed graph $G = (V, E)$. $V$ is the set of websites (the vertices), and $(i, j) \in E$ if and only if page $i$ contains a link to page $j$. Let $L(i)$ be the number of outgoing edges from $i$. Define
\[ \hat{p}_{ij} = \begin{cases} \frac{1}{L(i)} & \text{if $L(i) > 0$ and $(i, j) \in E$} \\ \frac{1}{n} & \text{if $L(i) = 0$} \\ 0 & \text{otherwise} \end{cases} \qquad (n = |V|) \]
Now also define, for $\alpha \in (0, 1)$,
\[ p_{ij} = \alpha \hat{p}_{ij} + (1 - \alpha) \frac{1}{n} \]
At each step, a random surfer tosses a coin: with probability $\alpha$ they follow a link according to $\hat{p}$, and with probability $1 - \alpha$ they jump to a page chosen uniformly at random. We want to find the invariant distribution:
\[ \pi = \pi p \]
where
\[ \pi_i = \text{proportion of time spent at state $i$ by the surfer} \]
Once we solve for this, if $\pi_i > \pi_j$ then $i$ is more important than $j$ and Google ranks it higher.
\newpage
\section{Markov Chains}
We will always denote the state space by $I$, and it will always be finite or countable. The probability space will always be $(\Omega, \mathcal{F}, \PP)$. We will now more formally define a Markov chain:
\begin{flashcard}
\begin{definition*}[Markov Chain]
A stochastic process $(X_n)_{n \ge 0}$ is called a \emph{Markov chain} (with values in $I$) if \cloze{$\forall n \ge 0, \forall x_0, \dots, x_{n + 1} \in I$},
\[ \cloze{\PP(\ub{X_{n + 1} = x_{n + 1}}_{\text{future}} \mid \ub{X_n = x_n}_{\text{present}}, \ub{\dots, X_0 = x_0}_{\text{past}}) = \PP(X_{n + 1} = x_{n + 1} \mid X_n = x_n)} \]
\end{definition*}
\end{flashcard}
\noindent If $\PP(X_{n + 1} = y \mid X_n = x)$ is independent of $n$ $\forall x, y$, then $X$ is called \emph{time-homogeneous} (this is what we will focus on in this course). Otherwise it is called \emph{time-inhomogeneous}.
\myskip
Define $P(x, y) = \PP(X_1 = y \mid X_0 = x)$ for $x, y \in I$. $P$ is called the transition matrix of the Markov chain. Since
\[ \sum_{y \in I} P(x, y) = \sum_{y \in I} \PP(X_1 = y \mid X_0 = x) = 1, \]
$P$ is called a \emph{stochastic matrix}.
\begin{flashcard}
\begin{definition*}
$(X_n)_{n \ge 0}$ with values in $I$ is called $\Markov(\lambda, P)$ if \cloze{$X_0 \sim \lambda$} and $(X_n)_{n \ge 0}$ is a Markov chain with transition matrix $P$, i.e.
\begin{enumerate}[(1)]
\item \cloze{$\PP(X_0 = x) = \lambda(x)$ for all $x \in I$}
\item \cloze{$\PP(X_{n + 1} = x_{n + 1} \mid X_n = x_n, \dots, X_0 = x_0) = P(x_n, x_{n + 1})$ for all $n, x_0, \dots, x_{n + 1}$}
\end{enumerate}
\end{definition*}
\end{flashcard}
\begin{notation*}
$P(x, y) = p_{xy} = p(x, y)$
\end{notation*}
\noindent Draw a diagram (directed graph), and put a directed edge between $x$ and $y$ ($x \to y$) if $P(x, y) > 0$, and write the probability on top of these arrows.
\begin{itemize}
\item \[ P = \begin{bmatrix} \alpha & 1 - \alpha \\ 1 - \beta & \beta \end{bmatrix} \qquad \alpha, \beta \in (0, 1) \]
\begin{center}
\includegraphics[width=0.6\linewidth] {images/9ace69ca455b11ed.png}
\end{center}
\item \[ P = \begin{bmatrix} \half & \half & 0 \\ 0 & \frac{1}{3} & \frac{2}{3} \\ 1 & 0 & 0 \end{bmatrix} \]
\begin{center}
\includegraphics[width=0.6\linewidth] {images/bddb4c6c455b11ed.png}
\end{center}
\end{itemize}
\begin{theorem*}
$X$ is $\Markov(\lambda, P)$ if and only if for all $n \ge 0$ and $x_0, \dots, x_n \in I$,
\[ \PP(X_0 = x_0, \dots, X_n = x_n) = \lambda(x_0) P(x_0, x_1) \cdots P(x_{n - 1}, x_n) \]
\end{theorem*}
\begin{proof}
\begin{enumerate}
\item[$\Rightarrow$]
\begin{align*}
\PP(X_n = x_n, \dots, X_0 = x_0) &= \PP(X_n = x_n \mid X_{n - 1} = x_{n - 1}, \dots, X_0 = x_0) \\
&\qquad \times \PP(X_{n - 1} = x_{n - 1}, \dots, X_0 = x_0) \\
&= P(x_{n - 1}, x_n) \PP(X_{n - 1} = x_{n - 1}, \dots, X_0 = x_0) \\
&= \cdots \\
&= \lambda(x_0) P(x_0, x_1) \cdots P(x_{n - 1}, x_n)
\end{align*}
\item[$\Leftarrow$] For $n = 0$, $\PP(X_0 = x_0) = \lambda(x_0)$. For $n \ge 1$,
\begin{align*}
\PP(X_n = x_n \mid X_{n - 1} = x_{n - 1}, \dots, X_0 = x_0) &= \frac{\PP(X_n = x_n, X_{n - 1} = x_{n - 1}, \dots, X_0 = x_0)}{\PP(X_{n - 1} = x_{n - 1}, \dots, X_0 = x_0)} \\
&= P(x_{n - 1}, x_n)
\end{align*}
\end{enumerate}
\end{proof}
\begin{definition*}
Let $i \in I$. The \emph{$\delta$-mass at $i$}, written $\delta_i$, is defined as
\[ \delta_{ij} = 1(i = j) = \begin{cases} 1 & \text{if $i = j$} \\ 0 & \text{otherwise} \end{cases} \]
\end{definition*}
\begin{flashcard}
\begin{definition*}
Let $X_1, \dots, X_n$ be discrete random variables with values in $I$. They are independent if \cloze{for all $x_1, \dots, x_n \in I$}
\[ \cloze{ \PP(X_1 = x_1, \dots, X_n = x_n) = \prod_{i = 1}^n \PP(X_i = x_i)} \]
\end{definition*}
\end{flashcard}
\noindent Let $(X_n)_{n \ge 0}$ be a sequence of random variables in $I$.
They are independent if for all $k$, for all $i_1 < i_2 < \cdots < i_k$ and for all $x_1, \dots, x_k$,
\[ \PP(X_{i_1} = x_1, \dots, X_{i_k} = x_k) = \prod_{j = 1}^k \PP(X_{i_j} = x_j) \]
Let $(X_n)_{n \ge 0}$ and $(Y_n)_{n \ge 0}$ be 2 sequences. $X \perp Y$ if for all $k, m \in \NN$, and for all $i_1 < \cdots < i_k$, $j_1 < \cdots < j_m$, $x_1, \dots, x_k, y_1, \dots, y_m$,
\begin{align*}
&\PP(X_{i_1} = x_1, \dots, X_{i_k} = x_k, Y_{j_1} = y_1, \dots, Y_{j_m} = y_m) \\
&\qquad = \PP(X_{i_1} = x_1, \dots, X_{i_k} = x_k) \times \PP(Y_{j_1} = y_1, \dots, Y_{j_m} = y_m)
\end{align*}