% hdat9400-smlxldata-20211019.tex

% HDAT9400 Data Management: S, M, L, XL Data
% (c) 2021 Malcolm Gillies <malcolm.gillies@unsw.edu.au>
% https://github.com/mbg-unsw/hdat9400
%
% This work is licensed under a
% Creative Commons Attribution-NonCommercial-ShareAlike 4.0
% International Licence
\documentclass[aspectratio=169,12pt,usepdftitle=false]{beamer} % XXXX fix AR here
\usepackage[latin1]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{textcomp}
\usefonttheme{serif}
% NOTE(review): these two lines were previously annotated "using default now",
% but the Berlin theme and beaver colour theme are what is actually loaded.
% beaver is presumably also the source of the `darkred' colour used by
% \setbeamercolor*{item} further down -- verify before removing it.
\usetheme{Berlin}  % presentation (layout) theme
\usecolortheme{beaver}  % colour theme
\usepackage[libertine]{libertine} % not using osf (old-style figures)
\usepackage[scale=0.9]{tgheros} % scale to match libertine
\usepackage[varqu,varl]{inconsolata}
\usepackage[libertine]{newtxmath}
\usepackage{graphicx}
\usepackage{tikz}
\usepackage{tikzpagenodes}
\usepackage{gitinfo2}
\usepackage{minted}
% Colour used for external (URL) hyperlinks.
\definecolor{links}{HTML}{2A1B81}
% PDF document properties and hyperlink colouring.
% The class is loaded with usepdftitle=false, so the PDF title is set here
% explicitly.  linkcolor is deliberately left empty -- presumably so that
% internal links (e.g. the beamer navigation) keep the colour of the
% surrounding text; confirm before changing.
\hypersetup{%
    pdftitle={HDAT9400 Data Management: S, M, L, XL Data},
    pdfauthor={(C) 2021 Malcolm Gillies. CC-BY-NC-SA},
    pdfkeywords={data management, time complexity, data processing, health data, big data},
    colorlinks,
    linkcolor=,
    urlcolor=links%
}

% Override gitinfo2's \gitMark: a tiny grey monospaced stamp of the form
% "<branch> @ <abbreviated hash> <author date>".
% The % at each line end keeps stray spaces out of the macro body.
\renewcommand{\gitMark}{%
    \color{gray}%
    \texttt{\tiny\gitBranch\,@\,\gitAbbrevHash\,\gitAuthorDate}%
}

% Page counter in the head/foot line: select beamer's predefined
% `totalpagenumber' variant of the template.
\setbeamertemplate{page number in head/foot}[totalpagenumber]
% Empty template => the navigation symbols are not drawn.
\setbeamertemplate{navigation symbols}{}
% List item markers in dark red (starred form: set the colour afresh
% rather than merging with the theme's existing item settings).
\setbeamercolor*{item}{fg=darkred}

% Title-page metadata (rendered by \titlepage in the first frame).
\title{HDAT9400 Data Management: S, M, L, XL Data}
\author{Malcolm Gillies}
% The institute line carries the link to the course repository.
\institute{\url{https://github.com/mbg-unsw/hdat9400}}
\date{19 October 2021}
% Default frame background: stamp \gitMark on every frame, rotated by
% 90 degrees, with its south-west anchor placed on the south-east corner of
% the marginpar area (a page node provided by tikzpagenodes).
% The zero shift and unit scale have no effect; they are kept as knobs for
% fine-tuning the position.  Frames that set their own background template
% inside a group replace this one for that frame only.
\usebackgroundtemplate{%
\begin{tikzpicture}[overlay,remember picture]
    \node[scale=1,rotate=90,anchor=south west]
        at ([shift={(0cm,0cm)}]current page marginpar area.south east)
        {\gitMark};
\end{tikzpicture}%
}

% Title-page graphic: the image ref/unsw-cbdrh-land, scaled to 0.33 and
% placed with its south-west corner on the south-west corner of the text
% area (a page node provided by tikzpagenodes).
% The picture is an overlay, so it occupies no space in the title layout.
% Both line ends inside the argument are guarded with % so that no spurious
% space token ends up in the \titlegraphic material (same convention as the
% background template).
\titlegraphic{%
\begin{tikzpicture}[overlay,remember picture]
	\node[anchor=south west,scale=0.33] at ([shift={(0cm,0cm)}]current page text area.south west)
	{\includegraphics{ref/unsw-cbdrh-land}};
\end{tikzpicture}%
}

\begin{document}

{
%\usebackgroundtemplate{}
\begin{frame}
\titlepage
\end{frame}
}

{
\usebackgroundtemplate{\includegraphics[width=\paperwidth]{ref/ijplein.jpg}}
\begin{frame}[plain]
\end{frame}
}

\begin{frame}{Lecture outline}
    \begin{itemize}
	\item Why does data size matter?
	\item How big are health data sets?
	\item How fast can we process data?
	\item Asymptotic algorithmic complexity
	\item Real world examples
    \end{itemize}
\end{frame}

\begin{frame}{About me}
    \begin{itemize}
        \item When I studied computer science (1990), a PC had
	    \begin{itemize}
		\item 4MB RAM (1/1000th of today's phones)
		\item 200MB Disk (1/1000th of today's phones)
		\item 33MHz Processor (1/100th of today's phones)
	    \end{itemize}
    \end{itemize}
\end{frame}

\begin{frame}{Health data I've worked with}
    \begin{itemize}
	\item NPS MedicineWise: GP electronic medical records (MedicineInsight)
	\item NSW Ministry of Health: hospital, ambulance, births
	\item CBDRH: Pharmaceutical Benefits Scheme (PBS)
	\item SAS, R, MS SQL Server, PostgreSQL, SQLite, DuckDB
    \end{itemize}
\end{frame}

\begin{frame}{Why does data size matter?}
    \begin{itemize}
        \item Time and space are finite
	\item We have budgets and deadlines
	\item Two times bigger can take more than twice the time
    \end{itemize}
\end{frame}

% XXXX would be nice to have an example of a problem that becomes
% intractable at larger n

\begin{frame}{What can we do about it?}
    \begin{itemize}
	\item \emph{Work smarter, not harder}
	\item Learn from the theory of algorithms
	\item Use efficient data-processing tools
    \end{itemize}
\end{frame}

{
\usebackgroundtemplate{\includegraphics[width=\paperwidth]{ref/kunsthal.jpg}}
\begin{frame}[plain]
\end{frame}
}

% XXXX consider decimal alignment for Gigabytes column

\begin{frame}{How big are health data sets?}
    \begin{tabular}{lrr}
	\textbf{Data} & \textbf{Records} & \textbf{Gigabytes} \\
	NSW congenital conditions (5 years)& 10\,000 & 0.001 \\
	NSW perinatal (20 years) & 1\,000\,000 & 1 \\
	NSW Admitted patients (20 years) & 100\,000\,000 & 15 \\
	AU Pharmaceutical benefits (20 years) & 1\,000\,000\,000 & 400 \\
	Health ``data lake'' & 1\,000\,000\,000\,000\,000 & 1\,000\,000 \\
    \end{tabular}\par
    \medskip
    20--200 variables per record
\end{frame}

\begin{frame}{What is ``big data''?}
\centering
\begin{minipage}{0.8\textwidth}
\textbf{Big data} is a field that treats ways to analyze,
systematically extract information from, or otherwise deal with data sets
that are too large or complex to be dealt with by traditional
data-processing application software.
\end{minipage}
\begin{flushright}--- Wikipedia\end{flushright}
\end{frame}

\begin{frame}{So\dots}
\centering
\includegraphics[width=1.0\textwidth]
	{ref/cmsdrh.pdf}
\end{frame}

\begin{frame}{Examples of different data processing technologies}
    \begin{tabular}{lrrl}
	\textbf{Method} & \textbf{Max size} & \textbf{Rec per sec} &
		\textbf{Notes} \\
	In memory [R] & GB & 1000M & Simple! \\
	Disk streaming [SAS] & TB & 1M & Slower \\
	Relational database [SQLite] & TB & 2M & Complicated \\
	Column-store database [DuckDB] & TB & 50M & Specialised \\
	NoSQL [Apache Spark] & PB & ???? & Don't ask \\
    \end{tabular}
\end{frame}

\begin{frame}{Starting simple: process all the data}
    \begin{itemize}
	\item Sometimes you need to look at every record aka \emph{full table scan}
	    \begin{itemize}
		\item e.g.\ What is the total length of stay of all NSW admissions?
	    \end{itemize}
	\item All else being equal, twice the data takes twice the time
	\item Most important distinction: scan in memory (RAM) or disk?
    \end{itemize}
\end{frame}

\begin{frame}{Making things more complicated}
    \begin{itemize}
	\item Sort all prescriptions by date of prescription
	\item Analyse all data from hospitals in Sydney
	\item For each antibiotic prescription, find the corresponding doctor visit
	\item Build a regression model for risk of low birth weight based on maternal characteristics
    \end{itemize}
\end{frame}

{
\usebackgroundtemplate{\includegraphics[width=\paperwidth]{ref/Seattle_Public_Library.jpg}}
\begin{frame}[plain]
\end{frame}
}

\begin{frame}{Time (and space) complexity}
    \begin{itemize}
	\item How does an algorithm ``scale'' with the amount of data?
	\item Ignore details of the implementation, specific computer etc
	\item Insight: how does the time/space cost change as $n\rightarrow\infty$ ?
	\item \emph{Asymptotic complexity} % explain leading term etc
    \end{itemize}
\end{frame}

\begin{frame}{$\mathcal{O}(n)$ vs $\mathcal{O}(n^2)$}
\centering
\includegraphics[height=0.8\textheight]
	{ref/big-o-graph.pdf}
\end{frame}

\begin{frame}{Optimal speed of common algorithms}
\centering
    \begin{tabular}{ll}
	Hash lookup & $\mathcal{O}(1)$ \\
	(Binary) search & $\mathcal{O}(\log{}n)$ \\
	Sort & $\mathcal{O}(n\log{}n)$ \\
	Matrix inversion & $\mathcal{O}(n^2\log{}n)$ \\
    \end{tabular}
\end{frame}

\begin{frame}{Experiment: sorting in SAS}
\centering
\includegraphics[height=0.8\textheight]
	{ref/sort-bench.pdf}
\end{frame}

\begin{frame}[fragile]{Speeding up WHERE using an index (1)}
\begin{minted}{sas}
/* test.data contains 10,000,000 records of 200 variables */
/* each variable contains a random number from 1 to 1000 */

proc sql;
    create index xi200 on test.data(xi200);
quit;

/* data.sas7bdat: 16GB */
/* data.sas7bndx: 80MB */
\end{minted}
\end{frame}

\begin{frame}[fragile]{Speeding up WHERE using an index (2)}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{minted}{sas}
data benchmark1;
    set test.data;
    keep total;
    total+xi1;
    * no index;
    where xi199=10;
run;
\end{minted}
\emph{real time 14.76 seconds}
\end{column}
\begin{column}{0.5\textwidth}
\begin{minted}{sas}
data benchmark2;
    set test.data;
    keep total;
    total+xi1;
    * index;
    where xi200=10;
run;
\end{minted}
\emph{real time 2.52 seconds}
\end{column}
\end{columns}
\end{frame}
{
\usebackgroundtemplate{\includegraphics[width=\paperwidth]{ref/rotterdam.jpg}}
\begin{frame}[plain]
\end{frame}
}

\begin{frame}{Real world example: Matching ED and in-patient episodes}
    \begin{itemize}
	\item 50M records in Admitted Patient Data Collection
	\item 50M records in Emergency Department Data Collection
	\item SAS merge on PPN (patient ID) and date
	\item Sort, sort and merge $\rightarrow$ \emph{hours}
	\item R: cannot load complete data set into memory
	    \begin{itemize}
		\item 1M records, subset of 16 variables $\rightarrow$ 2.5s
		\item need to process in chunks, similar total run time to SAS
	    \end{itemize}
    \end{itemize}
\end{frame}

\begin{frame}{Real world example: Modelling ICU glucose control (1)}
    \begin{itemize}
	\item Blood glucose levels in ICU monitored and insulin given
	\item What are risk factors for glucose >180 mg/dL?
	\item Covariates: severity score (APACHE), diabetes dx, age, gender, insulin
	\item 1M observations, 200K patients, 200 ICUs
	\item Generalised linear mixed model (3 level)
    \end{itemize}
\end{frame}

\begin{frame}{Real world example: Modelling ICU glucose control (2)}
\begin{columns}
    \begin{column}{0.65\textwidth}
    \centering
    \includegraphics[height=0.8\textheight]
	    {ref/oisin.pdf}
    \end{column}
    \begin{column}{0.35\textwidth}
	\begin{itemize}
	    \item MixedModels.jl
	    \item \emph{Full model too large for R lme4}
	\end{itemize}
    \end{column}
\end{columns}
\end{frame}

\begin{frame}{Real world example: NSW hospital readmission rates (1)}
    \begin{itemize}
	\item Bureau of Health Information
	\item Quarterly report on hospital performance
	\item Mixed models, SAS
	\item 32\,000 records, 177 hospitals
	\item Run time for the analysis: 1 minute
    \end{itemize}
\end{frame}

\begin{frame}{Real world example: NSW hospital readmission rates (2)}
% insert some output from the report e.g. caterpillar plot, funnel plot
\centering
\includegraphics[height=0.8\textheight]
	{ref/ami-rsrr.pdf}

%	\tiny 9 March 2020

\end{frame}

\begin{frame}{Summary}
    \begin{itemize}
	\item Computers are fast and most data isn't that big
	\item We have a bunch of excellent data processing tools
	\item If you do things the wrong way, time and space limits can bite
	\item Some tasks will always be ``slow'' asymptotically
	\item Think about the size of your data before you start
    \end{itemize}
\end{frame}

% -	Digression on compiled v interpreted?
%
% -	Algorithms, efficiency and optimisation
% o	SQL and query optimisation
%
% -	Statistical modelling with bigger data
%
% -	Final real-world examples
% o	EHealth Data Lake
% o	Loading the HIE
%
% -	Oscar's point: bigger isn't always better! Data reduction


% Breaks and diversions

% Points for discussion/questions

% -	Pop Quiz 1
% o	Which of these datasets is the biggest?

% -	Pop Quiz 2
% o	How long would it take to...

% -	Pop Quiz 3
% o	Which of these operations is fast? slow?
%	O(n log n) etc

% -	Pop Quiz 4
% o	Which of these cannot be computed within one lifetime?

% -	Potential diversion: SPDY

% -	Potential diversion: TPC and Sabre

\begin{frame}{Thanks}
    \begin{itemize}
        \item Sadaf Marashi-Pour (Bureau of Health Information)
	\item Sandy Sa (NSW Ministry of Health)
	\item Juan Quiroz Aguilera (CBDRH)
	\item Oisin Fitzgerald (CBDRH)
    \end{itemize}
\end{frame}

\begin{frame}{Further reading}
    \begin{itemize}
	\item \href{https://discrete.gr/complexity/}{A Gentle Introduction to Algorithm Complexity Analysis} by Dionysis Zindros
	\item \href{https://h2oai.github.io/db-benchmark/}{Database-like ops benchmark} by \texttt{data.table} developers
	\item \href{https://m-clark.github.io/posts/2019-10-20-big-mixed-models/}{Mixed Models for Big Data} by Michael Clark
    \end{itemize}
\end{frame}

{
\usebackgroundtemplate{\includegraphics[width=\paperwidth]{ref/rem.jpg}}
\begin{frame}[plain]
\end{frame}
}

\begin{frame}{Picture credits (Creative Commons)}
    \begin{itemize}
	\item Nieuwbouw IJplein \copyright{} Bart Molendijk / Anefo
	\item Kunsthal, Rotterdam \copyright{} Fred Romero
	\item Seattle Public Library \copyright{} \href{https://www.flickr.com/people/moody75/}{Moody 75}
	\item De Rotterdam \copyright{} Fred Romero
	\item Kunsthal, Rem Koolhaas portrait \copyright{} Jacco van Giessen
    \end{itemize}
\end{frame}

\end{document}