changeset 60:cd940f75aab6

Finish section 1, two more to go
author Jordi Gutiérrez Hermoso <jordigh@octave.org>
date Tue, 17 May 2016 20:54:41 -0400
parents 049258e4b72a
children 73b369370665
files talk/code/plots.py talk/talk.tex
diffstat 2 files changed, 174 insertions(+), 11 deletions(-) [+]
line wrap: on
line diff
--- a/talk/code/plots.py
+++ b/talk/code/plots.py
@@ -26,6 +26,7 @@
     # Do an adjusted boxplot
     if adjusted:
         mc = medcouple_1d(data)
+        print "MC = ", mc
         iqr = data_stats[0]['iqr']
         q1 =  data_stats[0]['q1']
         q3 =  data_stats[0]['q3']
--- a/talk/talk.tex
+++ b/talk/talk.tex
@@ -1,6 +1,8 @@
 %%% BEGIN BEAMER PREAMBLE %%%
-\documentclass[green]{beamer}
-\usepackage{bm, fourier, anyfontsize}
+\documentclass[blue]{beamer}
+\usepackage{bm, fourier, anyfontsize, xcolor}
+\newcommand{\MC}{\operatorname{MC}}
+\newcommand{\IQR}{\operatorname{IQR}}
 
 
 \mode<presentation>
@@ -47,24 +49,184 @@
 
 \section{Outliers and Boxplots}
 
+\begin{frame}{What is an outlier?}
+  \pause
+  \begin{center}
+    \pgfimage[height=2.5in]{img/normal-boxhistplot.pdf}
+  \end{center}
+  A simple answer: Tukey's boxplots
+\end{frame}
+
+\begin{frame}{Anatomy of a boxplot}
+  \begin{overlayarea}{\textwidth}{8cm}
+    \only<1>{\pgfimage[width=4in]{img/normal-points}}
+    \only<2>{\pgfimage[width=4in]{img/normal-boxplot}}
+    \only<3>{\pgfimage[width=4in]{img/normal-boxplot-bare/base}}
+    \only<4>{\pgfimage[width=4in]{img/normal-boxplot-bare/median}}
+    \only<5>{\pgfimage[width=4in]{img/normal-boxplot-bare/q1q3}}
+    \only<6>{\pgfimage[width=4in]{img/normal-boxplot-bare/IQR}}
+    \only<7>{\pgfimage[width=4in]{img/normal-boxplot-bare/whiskers}}
+    \only<8>{\pgfimage[width=4in]{img/normal-boxplot-bare/15}}
+    \only<9>{\pgfimage[width=4in]{img/normal-boxplot-bare/outliers}}
+  \end{overlayarea}
+\end{frame}
+
+\begin{frame}{Anatomy of a boxplot}
+  \begin{itemize}
+    \item Why 1.5?
+    \pause
+    \item Tukey responded: ``it's less than 2 and more than 1''
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Outliers}
+  \pause
+  \begin{center}
+    \pgfimage[height=2.5in]{img/normal-boxhistplot}
+  \end{center}
+  
+  The boxplot identifies $10$ outliers out of $1000$ points ($1\%$)
+\end{frame}
+
+\begin{frame}{Skew distributions}
+  Remember:
+  \begin{center}
+    \pgfimage[width=4in]{img/skew-distributions}
+  \end{center}
+\end{frame}
+
 \begin{frame}
-  What is an outlier?
+  For skew distributions\dots
+\end{frame}
+
+\begin{frame}
+  \begin{overlayarea}{\textwidth}{8cm}
+    \only<1>{
+      \pgfimage[height=3in]{img/geometric-boxhistplot}
+      
+      $433$ outliers out of $10\,000$ points ($4.3\%$)
+    }
+    \only<2>{
+      \pgfimage[height=3in]{img/boys-and-girls}
+      
+      $578$ and $644$ outliers for actors and actresses respectively
+      ($1.2\%$ and $3\%$)
+    }
+  \end{overlayarea}
 \end{frame}
 
 \begin{frame}
-  \begin{center}
-    \pgfimage[width=4.5in,height=3.5in]{img/boys-and-girls}
-  \end{center}
+  \begin{itemize}
+    \item Too many outliers\dots
+    \pause
+    \item Idea: adjust whisker lengths taking into account skewness:
+  \end{itemize}
+  \emph{M. Hubert; E. Vandervieren (2008). ``An adjusted boxplot for skewed
+    distributions''. Computational Statistics and Data Analysis 52
+    (12): 5186--5201. doi:10.1016/j.csda.2007.11.008.}
+\end{frame}
+
+\begin{frame}{Adjusted boxplot}
+  \begin{overlayarea}{\textwidth}{3cm}
+    \only<1>{
+      Recall normal whiskers:
+      % Trick to hide medcouple, use whiteout, so that the text gets
+      % positioned the same with or without it.
+      \begin{align*}
+        \text{lower} &= Q_1 - 1.5 \IQR\textcolor{white}{e^{a \MC}} \\
+        \text{higher} &= Q_3 + 1.5 \IQR\textcolor{white}{e^{b \MC}}
+      \end{align*}
+    }
+    \only<2>{
+      Instead, use adjusted whiskers:
+      \begin{align*}
+        \text{lower} &= Q_1 - 1.5 \IQR\textcolor{red}{e^{a \MC}}  \\
+        \text{higher} &= Q_3 + 1.5 \IQR\textcolor{red}{e^{b \MC}}
+      \end{align*}
+      \begin{itemize}
+        \item[$\MC$] -- the \emph{medcouple}, a measure of skewness
+        \item[$a, b$] -- parameters to fit across some sample distributions
+      \end{itemize}
+    }
+  \end{overlayarea}
+\end{frame}
+
+\begin{frame}{Adjusted boxplot}
+  For the whiskers, Hubert and Vandervieren recommend:
+  \[
+  \begin{cases}
+    [Q_1 - 1.5 \IQR e^{-4 \MC},  Q_3 + 1.5 \IQR e^{3 \MC}] &\text{if } \MC > 0 \\
+    [Q_1 - 1.5 \IQR e^{-3 \MC},  Q_3 + 1.5 \IQR e^{4 \MC}] &\text{if } \MC < 0
+  \end{cases}
+  \]
+  \pause
+  Of course, if $\MC = 0$ (no skewness) then no adjustment
+\end{frame}
+
+\begin{frame}
+  Let's see some adjusted boxplots\dots
+\end{frame}
+
+\begin{frame}
+  \begin{overlayarea}{\textwidth}{8cm}
+    \only<1>{
+      \pgfimage[height=3in]{img/geometric-boxhistplot}
+      
+      $433$ outliers out of $10\,000$ points ($4.3\%$)
+    }
+    \only<2>{
+      \pgfimage[height=3in]{img/geometric-boxhistplot-adjusted}
+      
+      \textcolor{red}{$25$ outliers} out of $10\,000$ points
+      (\textcolor{red}{$0.25\%$}) (\textcolor{blue}{$\MC = 0.25$})
+    }
+  \end{overlayarea}
+\end{frame}
+
+\begin{frame}
+  \begin{overlayarea}{\textwidth}{8cm}
+    \only<1>{
+      \pgfimage[height=3in]{img/normal-boxhistplot}
+      
+      $10$ outliers out of $1\,000$ points ($1\%$)
+    }
+    \only<2>{
+      \pgfimage[height=3in]{img/normal-boxhistplot-adjusted}
+      
+      \textcolor{red}{$10$ outliers} out of $1\,000$ points
+      (\textcolor{red}{$1\%$}) (\textcolor{blue}{$\MC = 0.0006$})
+    }
+  \end{overlayarea}
+\end{frame}
+
+\begin{frame}
+  \begin{overlayarea}{\textwidth}{8cm}
+    \only<1>{
+      \pgfimage[height=3in]{img/boys-and-girls}
+
+      $578$ and $644$ outliers for actors and actresses respectively
+      ($1.2\%$ and $3\%$)
+    }
+    \only<2>{
+      \pgfimage[height=3in]{img/boys-and-girls-adjusted}
+
+      \textcolor{red}{$346$} and \textcolor{red}{$657$} outliers for
+      actors and actresses respectively
+      (\textcolor{red}{$0.69\%$} and \textcolor{red}{$3\%$})
+      (\textcolor{blue}{$\MC = 0.12$} and \textcolor{blue}{$\MC = 0.231$})
+
+    }
+  \end{overlayarea}
 \end{frame}
 
 \section{The Medcouple}
 
 \begin{frame}
-  omg
+  
 \end{frame}
 
 
-\begin{frame}
+\begin{frame}{Computing the medcouple}
   \begin{center}
     \pgfimage[width=4in]{img/naive/x-orig.png}
   \end{center}
@@ -72,7 +234,7 @@
   Take some $X$ random numbers.
 \end{frame}
 
-\begin{frame}
+\begin{frame}{Computing the medcouple}
   \begin{center}
     \pgfimage[width=4in]{img/naive/x-sorted.png}
   \end{center}
@@ -80,7 +242,7 @@
   Sort them.
 \end{frame}
 
-\begin{frame}
+\begin{frame}{Computing the medcouple}
   \begin{center}
     \pgfimage[width=4in]{img/naive/sortx-red.png}
   \end{center}
@@ -88,7 +250,7 @@
   Pick the median.
 \end{frame}
 
-\begin{frame}
+\begin{frame}{Computing the medcouple}
   \begin{overlayarea}{\textwidth}{8cm}
     \only<1>{%
       \begin{center}