\section{General Probability Theory} \subsection{\texorpdfstring{$\sigma$}{sigma}-algebra} \begin{definition} \leavevmode \begin{itemize} \item The set of all possible outcomes of an experiment is called the \textbf{state space}, denoted by $\Omega$. \item The family of all possible subsets of $\Omega$, i.e.\ the power set, is denoted by $2^\Omega$. \end{itemize} \end{definition} \begin{definition}[$\sigma$-algebra] A family of subsets $\A \subset 2^\Omega$ is called a $\sigma$-algebra if it satisfies \begin{enumerate}[(i)] \item $\Omega \in \A$ (equivalently, $\emptyset \in \A$, by closure under complement). \item Closure under complement: If $A \in \A$ then $A^c \in \A$. \item Closure under countable unions and intersections: If a sequence of sets $(A_i)_{i \geq 1}$ belongs to $\A$, then $\bigcup_{i=1}^{\infty} A_i$ (or $\bigcap_{i=1}^{\infty} A_i$, by De Morgan's law\footnote{$\bigcap_{i=1}^{\infty} A_i = \left(\bigcup_{i=1}^{\infty} A_i^c\right)^c$.}) also belongs to $\A$. \end{enumerate} The smallest possible $\sigma$-algebra is the trivial $\sigma$-algebra: $\{\emptyset, \Omega\}$. The largest possible $\sigma$-algebra is $2^\Omega$. Between the two, we have e.g.\ for a 6-faced die, $\A = \{\{1\}, \{2, 3, 4, 5, 6\}, \emptyset, \Omega\}$. \end{definition} \begin{definition}[$\sigma$-algebra generated by families of sets] \leavevmode \begin{itemize} \item Let $\mathcal{C}$ be an arbitrary family of subsets of $\Omega$. We denote by $\sigma(\mathcal{C})$ the smallest $\sigma$-algebra which contains every set in $\mathcal{C}$ (i.e.\ $\mathcal{C} \subseteq \sigma(\mathcal{C})$) and call this the $\sigma$-algebra generated by $\mathcal{C}$.
\item (Borel $\sigma$-algebra) An important example is the Borel $\sigma$-algebra over any topological space $\Omega$, denoted by $\B(\Omega)$, which is the $\sigma$-algebra generated by the open sets of $\Omega$ (or, equivalently, by the closed sets\footnote{To show this, we just need to show that $\sigma(\mathcal{C}) \subseteq \B(\Omega)$ and $\B(\Omega) \subseteq \sigma(\mathcal{C})$. All closed sets are complements of open sets. Since $\B(\Omega)$ being a $\sigma$-algebra is closed under complement, it contains all the closed sets, i.e.\ $\mathcal{C} \subseteq \B(\Omega) \Rightarrow \sigma(\mathcal{C}) \subseteq \B(\Omega)$. A similar argument can be used to show that $\mathcal{O} \subseteq \sigma(\mathcal{C}) \Rightarrow \B(\Omega) = \sigma(\mathcal{O}) \subseteq \sigma(\mathcal{C})$.}). In other words, $\B(\Omega) := \sigma(\mathcal{O}(\Omega))$, where $\mathcal{O}(\cdot)$ denotes the collection of all open sets. \begin{itemize} \item Recall that an open set of $\R$ is a subset $E \subseteq \R$ such that for every $x \in E$ there exists $\epsilon > 0$ such that $B_\epsilon(x) = \{y \in \R : |x - y| < \epsilon\}$ is contained in $E$. \item A set $F \subseteq \R$ is said to be closed if $F^c$ is open. \item $\R$ and $\emptyset$ are simultaneously both open and closed sets. \end{itemize} \item ($\sigma$-algebra generated by a random variable) Given $X : (\Omega, \A) \to (\Psi, \G)$, the $\sigma$-algebra generated by $X$, denoted $\sigma(X)$ is the smallest $\sigma$-algebra on $\Omega$ such that $X$ is a random variable, that is, $X$ is measurable with respect to $\sigma(X)$ and $\G$. Equivalently, $\sigma(X) = X^{-1}(\G) = \{X^{-1}(S) \mid S \in \G\}$ (by Definition~1.3.1 and Theorem~1.3.2). \end{itemize} \end{definition} \begin{theorem} The Borel $\sigma$-algebra $\B(\R)$ is generated by intervals of the form $(-\infty, a]$ where $a \in \mathbb{Q}$ is a rational number. \end{theorem} \begin{proof} Let $\mathcal{O}$ denote the collection of all open intervals. 
Since every open set in $\R$ is an at most countable union of (disjoint) open intervals, we have $\sigma(\mathcal{O}) = \B(\R)$. Let $\mathcal{D}$ be the collection of all intervals of the form $(-\infty, a]$, $a \in \mathbb{Q}$. For any $a < b$, $a, b \in \mathbb{Q}$, \[ (a, b) = \bigcup_{n=1}^{\infty} (a, b - \tfrac{1}{n}] = \bigcup_{n=1}^{\infty} \left( (-\infty, b - \tfrac{1}{n}] \cap (-\infty, a]^c \right), \] which implies that $(a, b) \in \sigma(\mathcal{D})$, i.e.\ $\mathcal{O} \subseteq \sigma(\mathcal{D}) \Rightarrow \sigma(\mathcal{O}) \subseteq \sigma(\mathcal{D})$. However, every element of $\mathcal{D}$ is a closed set (complement of an open set) so $\mathcal{D} \subseteq \B(\R) \Rightarrow \sigma(\mathcal{D}) \subseteq \B(\R) = \sigma(\mathcal{O})$, completing the proof. \end{proof} \subsection{Probability Space} \begin{definition}[measurability] \leavevmode \begin{itemize} \item Given a state space $\Omega$ and a $\sigma$-algebra $\A \subseteq 2^\Omega$, the pair $(\Omega, \A)$ is called a \textbf{measurable space}. \item A subset $A \subseteq \Omega$ is said to be $\A$-measurable if $A \in \A$. We call $A$ an \textbf{event}. \end{itemize} \end{definition} \begin{definition}[probability measure] Let $\Omega$ be a nonempty set, and let $\A$ be a $\sigma$-algebra of subsets of $\Omega$. A probability measure $\P$ is a function that, to every set $A \in \A$, assigns a number in $[0, 1]$. We require: \begin{enumerate}[(i)] \item $\P(\Omega) = 1$. \item (countable additivity) whenever $A_1, A_2, \dots$ is a countable sequence of pairwise disjoint sets in $\A$, then \[ \P\left(\bigcup_{i=1}^{\infty} A_i\right) = \sum_{i=1}^{\infty} \P(A_i). \] \end{enumerate} The triple $(\Omega, \A, \P)$ is called a \textbf{probability space}.
\end{definition} \begin{theorem}[probability measure $\P$ is continuous along monotone sequences of events] If $\P : \A \to [0, 1]$ is a probability measure, then \begin{itemize} \item If $(A_i)_{i \geq 1} \in \A$ is increasing, i.e.\ $A_1 \subseteq A_2 \subseteq A_3 \subseteq \cdots$ with $\lim_{i \to \infty} A_i = \bigcup_{i=1}^{\infty} A_i = A$, denoted $A_i \uparrow A$, then \[ \lim_{i \to \infty} \P(A_i) = \P\left(\lim_{i \to \infty} A_i\right) = \P(A). \] \item If $(A_i)_{i \geq 1} \in \A$ is decreasing, i.e.\ $A_1 \supseteq A_2 \supseteq A_3 \supseteq \cdots$ with $\lim_{i \to \infty} A_i = \bigcap_{i=1}^{\infty} A_i = A$, denoted $A_i \downarrow A$, then \[ \lim_{i \to \infty} \P(A_i) = \P\left(\lim_{i \to \infty} A_i\right) = \P(A). \] \end{itemize} \end{theorem} Proof omitted (course notes pp.8--9). \subsection{Random Variables} Let $(\Omega, \A)$ and $(\Psi, \G)$ be two measurable spaces (Definition~1.2.1). \begin{definition}[measurability of a function] A function $X : \Omega \to \Psi$ is said to be \textbf{measurable} with respect to $\A$ and $\G$ if the pre-image $X^{-1}(G) = \{\omega : X(\omega) \in G\} \in \A$ for all $G \in \G$, or equivalently, $X^{-1}(\G) \subseteq \A$. \end{definition} \begin{definition}[random variable] A \textbf{random variable} is a real-valued function $X : \Omega \to \R$ that is measurable with respect to $\A$ and $\B(\R)$. That is, the pre-image $X^{-1}(B) = \{\omega : X(\omega) \in B\} \in \A$ for all Borel sets $B \in \B(\R)$. We sometimes simply say $X$ is $\A$-measurable. \end{definition} \begin{definition}[almost sure equality of two r.v.s] Two random variables $X$ and $Y$ are said to be \textbf{equal almost surely}, written $X \overset{\text{a.s.}}{=} Y$ if the set $\{\omega : X(\omega) \neq Y(\omega)\}$ is of probability zero. \end{definition} The theorem below says, to check if a function is measurable w.r.t.\ a $\sigma$-algebra, it suffices to check the pre-image of the generating family of the $\sigma$-algebra. 
\begin{theorem}[measurability of a function] Let $\mathcal{C}$ be a class of subsets of $\Psi$ such that $\sigma(\mathcal{C}) = \G$. In order for a function $X : \Omega \to \Psi$ to be measurable with respect to $\A$ and $\G$, it is necessary and sufficient that $X^{-1}(\mathcal{C}) \subseteq \A$. \end{theorem} \begin{proof} By Definition~1.3.1, we just need to show that $X^{-1}(\mathcal{C}) \subseteq \A$ iff $X^{-1}(\sigma(\mathcal{C})) \subseteq \A$. The backward direction is straightforward since $\mathcal{C} \subseteq \sigma(\mathcal{C})$. To show the forward direction, suppose $X^{-1}(\mathcal{C}) \subseteq \A$. Define \[ \mathcal{K} = \{B \in \sigma(\mathcal{C}) \mid X^{-1}(B) \in \A\}. \] It is easy to verify that $\mathcal{K}$ is a $\sigma$-algebra (by using results on pre-image of complement, union, and intersection). By assumption, $\mathcal{C} \subseteq \mathcal{K}$ and therefore $\sigma(\mathcal{C}) \subseteq \mathcal{K}$. The definition of $\mathcal{K}$ then implies $X^{-1}(\sigma(\mathcal{C})) \subseteq \A$. \end{proof} \begin{corollary} (from Theorem~1.3.1 and Theorem~1.1.1) A function $X : \Omega \to \R$ is a random variable if and only if $\{\omega : X(\omega) \leq a\} = X^{-1}((-\infty, a]) \in \A$ for all $a \in \mathbb{Q}$. \end{corollary} \textit{Example to Corollary~1.3.1.} Consider the probability space $([0, 1], \B[0, 1], \P)$ where $\P$ is the Lebesgue measure, i.e.\ $\P[a, b] = b - a$, $0 \leq a \leq b \leq 1$. Consider the identity function, i.e.\ $X(\omega) = \omega$. To show $X$ is a r.v., just note that \[ X^{-1}((-\infty, a]) = \{\omega : X(\omega) \leq a\} = [0, a] \in \B[0, 1], \] for all $a \in \mathbb{Q}$, $a \leq 1$. \begin{corollary} (from Theorem~1.3.1) Two random variables $X$ and $Y$ are independent iff $\P(X \leq a, Y \leq b) = \P(X \leq a)\P(Y \leq b)$ for all $a, b \in \mathbb{Q}$. \end{corollary} \begin{theorem}[pre-image of $\sigma$-algebra on domain is also $\sigma$-algebra] Let $X : \Omega \to \Psi$ be a mapping. 
Let $\G$ be a $\sigma$-algebra on $\Psi$. Then $X^{-1}(\G)$ is a $\sigma$-algebra on $\Omega$. \end{theorem} \begin{proof} We verify the axioms for a $\sigma$-algebra: \begin{enumerate}[(i)] \item $X^{-1}(\Psi) = \Omega \Rightarrow \Omega \in X^{-1}(\G)$ as $\Psi \in \G$. \item Let $S \in X^{-1}(\G)$. Then $S = X^{-1}(G)$ for some $G \in \G$. We know that the pre-image of set difference equals the set difference of the respective pre-images, so \[ \Omega \setminus S = X^{-1}(\Psi) \setminus X^{-1}(G) = X^{-1}(\underbrace{\Psi \setminus G}_{\in \G}) \Rightarrow \Omega \setminus S \in X^{-1}(\G). \] \item Let $(S_i)_{i \geq 1} \in X^{-1}(\G)$. Then $S_i = X^{-1}(G_i)$ for some $G_i \in \G$ for all $i$. It follows that \[ \bigcup_{i=1}^{\infty} S_i = \bigcup_{i=1}^{\infty} X^{-1}(G_i) = X^{-1}\left(\bigcup_{i=1}^{\infty} G_i\right) \in X^{-1}(\G), \] where we used the fact that the union of pre-images is the same as the pre-image of the union. \end{enumerate} \end{proof} \subsection{Expectation} \begin{definition}[characteristic function, mgf] \leavevmode \begin{itemize} \item The \textbf{characteristic function} of a r.v.\ $X$ is given by $\Phi_X(t) = \E[e^{itX}]$. \item The \textbf{moment generating function} of a r.v.\ $X$ is given by $M_X(t) = \E[e^{tX}]$. \end{itemize} \end{definition} \begin{definition}[expectation] Let $X$ be a r.v.\ on a probability space $(\Omega, \A, \P)$. The \textbf{expectation} of $X$ is \[ \E(X) = \int_\Omega X(\omega) \, d\P(\omega). \] When $X$ is a simple random variable, i.e.\ it takes on only finitely many values, it can be written as \[ X = \sum_{i=1}^{n} x_i \indic_{A_i} \iff X(\omega) = \sum_{i=1}^{n} x_i \indic_{A_i}(\omega), \quad \omega \in \Omega \] where $x_i \in \R$ and $A_i = \{\omega : X(\omega) = x_i\} \in \A$. Its expectation is given by \[ \E(X) = \sum_{i=1}^{n} x_i \P(A_i) = \sum_{i=1}^{n} x_i \P\{X = x_i\}. \] \end{definition} \begin{definition}[$L^1$] We say $X$ is \textbf{integrable} if $\E(X^+) < \infty$ and $\E(X^-) < \infty$, where $X^+ = \max(X, 0)$ and $X^- = \max(-X, 0)$; in this case $\E(X) = \E(X^+) - \E(X^-)$ is well defined and finite.
The space of integrable r.v.s is denoted by $L^1(\Omega, \P)$. \end{definition} \begin{theorem} The random variable $X$ is integrable if and only if $\E(|X|) < \infty$. \end{theorem} \begin{proof} We can write $X = X^+ - X^-$ where $X^+ = \max(X, 0)$ and $X^- = \max(-X, 0)$. Then $|X| = X^+ + X^-$. \end{proof} \begin{theorem}[Jensen's inequality] Let $g : \R \to \R$ be a convex function and $X$ a r.v.\ with $\E|X| < \infty$. Then \[ g(\E[X]) \leq \E[g(X)]. \] \end{theorem} \subsection{Conditional Expectation} \subsubsection{Conditional Expectation w.r.t.\ a Partition} Given a simple random variable $X$ taking values $x_1, \cdots, x_n$, \[ \mathcal{D}(X) = \{D_1^X, \cdots, D_n^X\} \quad \text{where } D_i^X = \{X = x_i\} \] is the partition of $\Omega$ associated with $X$. \begin{definition}[conditional expectation w.r.t.\ a partition] For any partition $\mathcal{D} = \{D_1, \cdots, D_n\}$ of $\Omega$, the conditional expectation of a simple random variable $X$ is given by \[ \E(X|\mathcal{D}) = \sum_{i=1}^{n} \E(X|D_i) \indic_{D_i} = \underbrace{\sum_{i=1}^{n} \sum_{j=1}^{m} x_j \P(D_j^X | D_i) \indic_{D_i}}_{\text{a simple } \sigma(\mathcal{D})\text{-measurable r.v.}}, \] where $\E(X|D_i) = \E(X \indic_{D_i}) \P(D_i)^{-1}$ is the usual conditional expectation, and $\mathcal{D}(X)$ partitions $\Omega$ w.r.t.\ $X$. If we have two simple random variables $X$ and $Y$, then \[ \E(X|Y) := \E[X|\mathcal{D}(Y)] = g(Y) \] for some measurable function $g$, which, clearly, is $\sigma(Y)$-measurable. \end{definition} \begin{result} \leavevmode \begin{itemize} \item Given a partition $\mathcal{D}$ of $\Omega$, $\P(A|\mathcal{D}) = \P(A|\sigma(\mathcal{D}))$ for all $A \in \A$. More generally, $\E(X|\mathcal{D}) = \E(X|\sigma(\mathcal{D}))$. \item For a simple random variable $X$, $\sigma(X) = \sigma(\mathcal{D}(X))$. \end{itemize} \end{result} \begin{proof} For (1), $\E(X|\mathcal{D})$ is clearly $\sigma(\mathcal{D})$-measurable. 
Now from the uniqueness of conditional expectation (Definition~1.5.2), it suffices to check that for all $A \in \sigma(\mathcal{D})$, \[ \E[\E(X|\mathcal{D}) \indic_A] = \E[\indic_A X]. \] We can see that this indeed holds as \begin{align*} \E[\indic_A X] &= \E[\E[\indic_A X | \mathcal{D}]] && \text{(tower property)} \\ &= \E[\indic_A \E[X|\mathcal{D}]] && \text{(taking out what is known)}. \end{align*} For (2), $X$ is $\sigma(\mathcal{D}(X))$-measurable, so $\sigma(X) \subseteq \sigma(\mathcal{D}(X))$. On the other hand, $\mathcal{D}(X) \subseteq \sigma(X) \Rightarrow \sigma(\mathcal{D}(X)) \subseteq \sigma(X)$, hence the equality. \end{proof} \subsubsection{General Conditional Expectation} \begin{definition}[conditional expectation w.r.t.\ a $\sigma$-algebra] Let $X$ be an integrable random variable on $(\Omega, \A, \P)$. Given an arbitrary sub-$\sigma$-algebra $\G \subseteq \A$, the \textbf{conditional expectation} of $X$ with respect to $\G$, denoted by $\E[X|\G]$, is the unique integrable random variable satisfying the following conditions: \begin{enumerate}[(i)] \item (measurability) $\E[X|\G]$ is $\G$-measurable, i.e.\ $\E[X|\G]^{-1}(\B(\R)) \subseteq \G$ or $\E[X|\G]^{-1}(B) \in \G$ for all $B \in \B(\R)$. \item (partial averaging) For any $A \in \G$, we have \[ \E[\indic_A X] = \E[\indic_A \E[X|\G]] \] or in integral form $\int_A \E[X|\G](\omega) \, d\P(\omega) = \int_A X(\omega) \, d\P(\omega)$. \end{enumerate} \end{definition} For the definition to make sense, we need to prove the existence and uniqueness of conditional expectation. \begin{theorem}[existence and uniqueness of conditional expectation] Let $\G$ be a sub-$\sigma$-algebra of $\A$. Then \begin{enumerate} \item (existence) there exists a conditional expectation $\E[X|\G]$ for any $X \in L^1(\Omega, \P)$. \item (uniqueness) any two conditional expectations of $X \in L^1(\Omega, \P)$ with respect to $\G$ are equal $\P$-a.s.
\end{enumerate} \end{theorem} \begin{proof} \textbf{Existence.} If $X \in L^1(\Omega, \P)$ then $X^\pm \in L^1(\Omega, \P)$, so without loss of generality we may assume that $X \geq 0$; we may further assume $\E[X] > 0$, since otherwise $X \overset{\text{a.s.}}{=} 0$ and $\E[X|\G] = 0$ works. Define a new probability measure $\Q$ on $(\Omega, \A)$ by setting for any $A \in \A$, \[ \Q(A) := \frac{\E[\indic_A X]}{\E[X]}. \] The probability $\Q$ is absolutely continuous with respect to $\P$ on $\A$ (i.e.\ for $A \in \A$, $\P(A) = 0 \Rightarrow \Q(A) = 0$). This implies that $\Q \ll \P$ on $\G \subseteq \A$ (Remark following Definition~7.1.1). Therefore from the Radon--Nikodym Theorem, there exists a positive $\G$-measurable Radon--Nikodym derivative $\eta = d\Q/d\P \in L^1(\Omega, \P)$ such that $\Q(A) = \E[\eta \indic_A]$ for all $A \in \G$. Hence, for any $A \in \G$, \begin{align*} \E[\indic_A X] &= \E[X] \Q(A) \\ &= \E[X] \E[\eta \indic_A] \\ &= \E[(\eta \E[X]) \indic_A], \end{align*} and since $\eta \E[X]$ is $\G$-measurable and integrable, it is a version of the conditional expectation $\E[X|\G]$. \textbf{Uniqueness.} Suppose $Y$ and $Y'$ are both $\G$-conditional expectations of $X$. Let $G = \{\omega : Y(\omega) > Y'(\omega)\}$ and suppose, for contradiction, that $\P(G) > 0$. Note that \[ G := \{Y - Y' > 0\} = \bigcup_{n=1}^{\infty} \{Y - Y' > \tfrac{1}{n}\} \] \[ G_n := \{Y - Y' > \tfrac{1}{n}\} = \bigcup_{j=1}^{n} \{Y - Y' > \tfrac{1}{j}\} \] By Theorem~1.2.1, $G_n \uparrow G \Rightarrow \P(G_n) \uparrow \P(G)$, so there exists $m \geq 1$ such that $\P(G_m) > 0$. Since $Y$ and $Y'$ are both $\G$-conditional expectations, and $G \in \G$ (as $Y$ and $Y'$ are $\G$-measurable), we have by (ii) of Definition~1.5.2 that for every $A \in \G$, in particular $G$, we have $\E[\indic_G Y] = \E[\indic_G Y'] \Rightarrow \E[\indic_G (Y - Y')] = 0$. But \begin{align*} \E\left[\indic_G (Y - Y')\right] &\geq \E\left[\indic_{G_m} (Y - Y')\right] && (\indic_G \geq \indic_{G_m}) \\ &\geq \frac{1}{m} \P[G_m] > 0 && (\indic_{G_m} = \indic_{\{Y - Y' > 1/m\}}) \end{align*} This is a contradiction and hence $\P(G) = 0$. By symmetry, $\P(Y' > Y) = 0$ as well, so $Y \overset{\text{a.s.}}{=} Y'$.
\end{proof} \subsubsection{Properties of Conditional Expectation} \begin{result} \leavevmode \begin{itemize} \item Consider a constant r.v.\ $X = c$, i.e.\ $X(\omega) = c$ for all $\omega \in \Omega$. Then $\sigma(X)$ is the trivial $\sigma$-algebra $\A_0 = \{\emptyset, \Omega\}$. \item $\E[\cdot|\A_0] = \E[\cdot]$. \end{itemize} \end{result} \begin{proof} For (1), the trivial $\sigma$-algebra is obviously the smallest. Further, $X$ is measurable w.r.t.\ $\A_0$ as \[ X^{-1}(B) = \begin{cases} \emptyset & \text{if } c \notin B \\ \Omega & \text{if } c \in B \end{cases} \] for all $B \in \B(\R)$. As $X^{-1}(\B(\R)) \subseteq \A_0$, by definition $\sigma(X) = \A_0$. For (2), let $X$ be any random variable on $(\Omega, \A, \P)$. The earlier result implies that $\E[X]$ is $\A_0$-measurable. Next, we check that \[ \begin{cases} \E(\indic_\emptyset X) = \E(\indic_\emptyset \E(X)) = 0 \\ \E(\indic_\Omega X) = \E(\indic_\Omega \E(X)) = \E(X) \end{cases} \] That is, $\E[\indic_A X] = \E[\indic_A \E[X]]$ for all $A \in \A_0$. The required equality follows immediately from the uniqueness of conditional expectation. \end{proof} \begin{theorem}[conditional expectation] Let $X$, $Y$ be two integrable random variables on $(\Omega, \A, \P)$, and $\G$, $\mathcal{H}$ be two sub $\sigma$-algebras of $\A$. Then \begin{enumerate} \item \textbf{(taking out what is known)} If $X$ is $\G$-measurable (or equivalently $\sigma(X) \subseteq \G$), then $\E[X|\G] = X$. \item \textbf{(linearity)} For $a, b \in \R$, $\E[aX + bY|\G] = a\E[X|\G] + b\E[Y|\G]$. \item \textbf{(tower property)} If $\mathcal{H} \subseteq \G$ then \[ \E[\E[X|\G]|\mathcal{H}] = \E[X|\mathcal{H}] \] in particular, by taking $\mathcal{H} = \{\emptyset, \Omega\}$ to be the trivial $\sigma$-algebra, we have \[ \E[\E[X|\G]] = \E(X). \] \item If $X$ is independent of $\G$ in the sense that for all $A \in \sigma(X)$ and $B \in \G$ we have $\P(A \cap B) = \P(A)\P(B)$, then \[ \E[X|\G] = \E[X]. 
\] \item If $X$ is $\G$-measurable and $Y$ is independent of $\G$ then for any Borel function $h : \R^2 \to \R$ we have \[ \E[h(X, Y)|\G] = H(X), \] where $H : \R \to \R$ is given by the formula $H(x) = \E[h(x, Y)]$. Consider, e.g.\ $X$ represents the present, $Y$ the future and $\G$ the information generated by past events. \item \textbf{(conditional Jensen's inequality)} Let $g : \R \to \R$ be a convex function and for any $\sigma$-algebra $\G \subseteq \A$, \[ g(\E[X|\G]) \leq \E[g(X)|\G]. \] \end{enumerate} \end{theorem} \begin{proof} (for 5) \begin{align*} \E[h(X, Y)|\G] &= \E[h(x, Y)|\G]\big|_{x=X} && \text{(taking out what is known)} \\ &= \E[h(x, Y)]\big|_{x=X} && \text{($Y$ is independent of $\G$)} \\ &=: H(x)\big|_{x=X} \end{align*} \end{proof} \begin{lemma}[Doob's measurability theorem] Let $X : \Omega \to \Psi$ be a mapping and $(\Psi, \G)$ a measurable space. A function $Y : \Omega \to \R$ is $\sigma(X)$-measurable if and only if there exists a $\G$-measurable function $h : \Psi \to \R$ s.t.\ $Y = h(X)$. \end{lemma} \begin{remark} Doob's measurability theorem tells us for a integrable random variable $Y$, $\E[Y|\sigma(X)]$ which by definition is $\sigma(X)$-measurable, must be a function of $X$. \end{remark} \subsubsection{Geometric Interpretation of Conditional Expectation} \begin{theorem} Let $X$ be a square integrable random variable, i.e.\ $\E\left[X^2\right] < \infty$. Then $\E[X|\G]$ is the orthogonal projection of $X$ on $L^2(\Omega, \P)$. This means that for every square integrable $\G$-measurable $Z$, \[ \E\left[(X - Z)^2\right] \geqslant \E\left[(X - \E[X|\G])^2\right] \] with equality if and only if $Z = \E[X|\G]$. \end{theorem} \begin{remark} One can show that $X - \E(X|\G)$ is orthogonal to $\E(X|\G)$. That is, \[ \E\left[(X - \E(X|\G)) \cdot \E(X|\G)\right] = 0. 
\] \end{remark} \begin{proof} (Sketch) We consider \[ \E\left[(X - Z)^2\right] = \E\left[(X - \E(X|\G) + \E(X|\G) - Z)^2\right] \] \[ = \E\left[(X - \E(X|\G))^2\right] + \E\left[\hblue{(\E(X|\G) - Z)^2}\right] + 2\underbrace{\E\left[(X - \E(X|\G))(\E(X|\G) - Z)\right]}_{(*)} \] But \begin{align*} (*) &= \E\left[\E\left[(X - \E(X|\G))(\E(X|\G) - Z) \middle| \G\right]\right] && \text{(tower property)} \\ &= \E\left[(\E(X|\G) - Z) \underbrace{\E\left[(X - \E(X|\G)) \middle| \G\right]}_{= \E(X|\G) - \E(X|\G) = 0}\right] = 0 && \text{(taking out what is known)} \end{align*} and therefore $\E\left[(X - Z)^2\right] \geqslant \E\left[(X - \E[X|\G])^2\right]$. \end{proof} \subsection{Visual Representation of Expectation} \[ \E(X) = \int_\Omega X(\omega) \, d\P(\omega) \] The lower Lebesgue sum (which partitions the range of $X$, rather than its domain as a Riemann sum would) is pictured below, with $A_k = \{\omega \in \Omega : y_k \leq X(\omega) < y_{k+1}\}$. \begin{center} \begin{tikzpicture}[scale=0.8] % Axes \draw[->] (0,0) -- (8,0) node[right] {$\omega$}; \draw[->] (0,0) -- (0,6) node[above] {$y$}; % Curve (representing X(omega)) \draw[thick] (0.5,1) .. controls (1.5,4.5) and (3,5.5) .. (4,5) .. controls (5,4.5) and (5.5,3) .. (6,2.5) .. controls (6.5,2) and (7,1.5) .. (7.5,1.2); % Horizontal dashed lines for y levels \draw[dashed] (0,0.8) -- (7.8,0.8); \draw[dashed] (0,2) -- (7.8,2); \draw[dashed] (0,3) -- (7.8,3); \draw[dashed] (0,4.5) -- (7.8,4.5); \draw[dashed] (0,5.2) -- (7.8,5.2); % Y-axis labels \node[left] at (0,0) {$y_0 = 0$}; \node[left] at (0,0.8) {$y_1$}; \node[left] at (0,2) {$y_2$}; \node[left] at (0,3) {$y_3$}; \node[left] at (0,4.5) {$y_s$}; \node[left] at (0,5.2) {$y$}; % Caption \node[below] at (4,-0.8) {Fig.\ 1.3.2.\ Lower Lebesgue sum.}; \end{tikzpicture} \end{center}