<%# index.html.erb %>

<!doctype html>

<html lang="en">	
    <head>
        <!-- Page skeleton adapted from the reveal.js beginner tutorial at http://htmlcheats.com/reveal-js/reveal-js-tutorial-reveal-js-for-beginners/ -->
        <meta charset="utf-8">
        <title>Big Data with Hadoop/Spark</title>
        <!-- reveal.js base stylesheet, followed by the presentation theme -->
        <link rel="stylesheet" href="css/reveal.min.css">
        <link rel="stylesheet" href="css/theme/default.css" id="theme">    

        <!-- Syntax highlighting theme (currently disabled) -->
        <!-- <link rel="stylesheet" href="lib/css/zenburn.css"> -->

        <!-- HTML5 element shim, loaded only by Internet Explorer versions older than 9 -->
        <!--[if lt IE 9]>
            <script src="lib/js/html5shiv.js"></script>
            <![endif]-->
    </head>

    <body>
        <!-- Wrap the entire slide show in a div using the "reveal" class. -->
        <div class="reveal">
            <!-- ARC logo: drawn as a 516x74 background image on an empty div that is
                 absolutely positioned 30px from the bottom and 50px from the left. -->
            <div id="arc-logo" style="background: url(images/arc-logo.png);
            background-size: 516px 74px;
            position: absolute;
            bottom: 30px;
            left: 50px;
            width: 516px;
            height: 74px;"></div>
            <!-- Wrap all slides in a single "slides" class -->
            <div class="slides">

              <section id="title">
                <!-- Title slide: talk title, subtitle, and a centered table of presenter names and e-mail addresses. -->
                <h1>Big Data Research</h1>
                <h3>Using Hadoop, Spark, and other tools</h3>
                <br />
                <center>
                <table>
                  <tr>
                    <td>Brock Palen</td>
                    <td style="text-align: right;"><a href="mailto:brockp@umich.edu">brockp@umich.edu</a></td>
                  </tr>
                  <tr>
                    <td>Alec Ten Harmsel</td>
                    <td style="text-align: right;"><a href="mailto:talec@umich.edu">talec@umich.edu</a></td>
                  </tr>
                </table>
                </center>
              </section>

              <section id="what-is-hadoop">
                <!-- Definition slide; the two highlighted phrases carry the "fragment highlight-green" classes. -->
                <h2>What is Hadoop?</h2>
                <p>
                  Apache Hadoop is a
                  <strong class="fragment highlight-green">framework</strong>
                  for
                  <strong class="fragment highlight-green">distributed processing</strong>
                  of large data sets across clusters of computers using simple programming
                  models. It is designed to scale up from single servers to thousands of
                  machines by co-locating data and computation on each machine.
                </p>
                <br>
                <br>
                <p>Fladoop is the name of the trial ARC-TS Hadoop cluster.</p>
              </section>

              <section id="links">
                <!-- Reference links followed by an embedded video. -->
                <h2>Links and Tutorials</h2>
                <a href="http://caen.github.io/hadoop">ARC Fladoop Documentation</a><br />
                <a href="http://fluxhpc.blogspot.com/2014/09/arc-fladoop-data-platform-hadoop.html">Hardware - Fladoop Architecture</a><br />
                <a href="https://www.udacity.com/course/ud617">Free UDACITY Hadoop/MapReduce Course Materials</a><br />
                <a href="http://www.cloudera.com/content/cloudera/en/products-and-services/cloudera-live.html">Cloudera Quickstart VM</a><br />
                <!-- The embed uses HTTPS so it is not blocked as mixed content when the deck
                     itself is served over HTTPS; the title names the frame for assistive technology. -->
                <iframe width="420" height="315" src="https://www.youtube.com/embed/bcjSe0xCHbE" title="Embedded YouTube video" frameborder="0" allowfullscreen></iframe>
              </section>

              <section id="avoid-map-reduce">
                <h2>Avoid Writing MapReduce Code</h2>
                <ul>
                  <li>Lowest level of control in Hadoop</li>
                  <li>Most difficult to work with</li>
                  <li>Requires Java knowledge and lots of boilerplate code</li>
                </ul>

                <!-- Speaker notes for this slide. They are nested inside the slide's section
                     because reveal.js only associates notes with the slide that contains them. -->
                <aside class="notes" data-markdown>
                  <p>
                  - Don't focus on MapReduce. 
                  - It is difficult and requires building a pair of Java classes.  Most users with big data will be better served by using applications on top of Hadoop. Think of this as writing your own program, vs. using an existing one (C-Code vs. R).
                  </p>
                </aside>
              </section>

              <section id="alternatives">
                <!-- Each list item links to the home page of a tool that can be used instead of hand-written MapReduce. -->
                <h2>Tools for Analysis</h2>
                <h3>Alternatives to Writing MapReduce Code</h3>
                <ul>
                  <li><a href="http://hive.apache.org/">Hive - SQL on Hadoop</a></li>
                  <li><a href="http://pig.apache.org/">Pig - SQL-like analysis language</a></li>
                  <li><a href="http://spark.apache.org/">Spark - A fast, general compute engine</a></li>
                  <li><a href="http://mahout.apache.org/">Mahout - A machine learning and data mining library</a></li>
                </ul>
              </section>

              <section id="hdfs">
                <h2>The Hadoop File System (HDFS)</h2>
                <!-- The alt text gives the image a text equivalent for screen readers and for the case where the file fails to load. -->
                <img src="images/fladoop-hdfs.png" alt="Fladoop HDFS diagram">
              </section>

              <section id="hdfs-about">
                <!-- Bullet summary of HDFS characteristics. -->
                <h2>About HDFS</h2>
                <ul>
                  <li>Distributed and fault tolerant</li>
                  <li>Requires use of HDFS-specific tools</li>
                  <li>Looks like POSIX filesystems</li>
                  <li>Optimized for read performance</li>
                </ul>
              </section>

              <section id="dfs">
                <h2>Common HDFS Commands</h2>
                <!-- The script is read from disk at render time and HTML-escaped, so angle
                     brackets and ampersands in the commands are displayed literally instead
                     of being parsed as markup. -->
                <pre>
<code data-trim>
<%= ERB::Util.html_escape(File.read('warez/hdfs.sh')) %>
</code>
                </pre>
              </section>

              <section id="hdfs-practice1">

                <!-- Exercise prompt. -->
                <section id="hdfs-practice1-q">
                  <h2>HDFS Practice</h2>
                  <br>
                  <p>
                    Create a directory in HDFS called <code>arcdata</code> and check its contents.
                  </p>
                </section>

                <!-- Exercise answer: the two commands that satisfy the prompt. -->
                <section id="hdfs-practice1-ans">
                  <pre>
<code data-trim>
hdfs dfs -mkdir arcdata
hdfs dfs -ls arcdata
</code>
                  </pre>
                </section>

              </section>

              <section id="hdfs-get-put">
                <h2>Moving to and from HDFS</h2>
                <h3>get / put / distcp</h3>
                <!-- The script is read from disk at render time and HTML-escaped, so angle
                     brackets and ampersands in the commands are displayed literally instead
                     of being parsed as markup. -->
                <pre>
<code data-trim>
<%= ERB::Util.html_escape(File.read('warez/hdfs_xfer.sh')) %>
</code>
                </pre>
              </section>

              <section id="hadoop-examples">
                <!-- States the task and dataset that the tool examples on the following slides refer to. -->
                <h2>Tool Examples</h2>
                <ul>
                  <li>Task: Find number of words per year that occur in only one volume</li>
                  <li>Dataset: Google NGrams (specifically 1-grams)</li>
                  <li>NGrams schema: word, year, occurrences, volumes</li>
                  <li>NGrams dataset is tab separated</li>
                </ul>
              </section>

              <section id="hive">
                <!-- Hive slides. Every file embedded below is HTML-escaped at render time so
                     that angle brackets and ampersands in the shell and SQL sources are shown
                     literally instead of being parsed as markup. -->

                <section id="hive-intro">
                  <h2>Hive</h2>
                  <ul>
                    <li>SQL layer on Hadoop</li>
                    <li>Supports most reading SQL queries</li>
                    <li>Supports few writing SQL queries</li>
                    <li>Fastest from idea to result</li>
                  </ul>
                </section>

                <section id="hive-start">
                  <h2>Starting Hive</h2>
                  <pre>
<code data-trim>
<%= ERB::Util.html_escape(File.read('warez/hive.sh')) %>
</code>
                  </pre>
                </section>

                <section id="hive-create-table">
                  <h2>Creating a Table</h2>
                  <br />
                  <pre>
<code>
<%= ERB::Util.html_escape(File.read('warez/ngrams_create.sql')) %>
</code>
                  </pre>
                </section>

                <section id="hive-simple">
                  <h2>Querying the Data</h2>
                  <br />
                  <pre>
<code>
<%= ERB::Util.html_escape(File.read('warez/ngrams_query.sql')) %>
</code>
                  </pre>
                </section>

                <section id="hive-results">
                  <h2>Query Result</h2>
                  <br />
                  <pre>
<code>
snake 22
snaketail 10
</code>
                  </pre>
                </section>

              </section>

              <section id="pig">
                <!-- Pig slides. Every file embedded below is HTML-escaped at render time so
                     that angle brackets and ampersands in the sources are shown literally
                     instead of being parsed as markup. -->

                <section id="pig-intro">
                  <h2>Pig</h2>
                  <br />
                  <ul>
                    <li>Similar to but more in-depth than SQL</li>
                    <li>Better than SQL for complex transformations</li>
                    <li>Slower than Hive</li>
                  </ul>
                </section>

                <section id="pig-shell">
                  <h2>Running Pig code</h2>
                  <br />
                  <pre>
<code>
<%= ERB::Util.html_escape(File.read('warez/pig.sh')) %>
</code>
                  </pre>
                </section>

                <!-- The script is shown without its empty lines and without lines that start with "#". -->
                <section id="pig-code">
                  <h2>Pig Code</h2>
                  <br />
                  <pre>
<code>
<%=
  ERB::Util.html_escape(
    File.read('warez/ngrams.pig')
        .split("\n")
        .reject { |line| line.start_with?('#') || line.empty? }
        .join("\n")
  )
%>
</code>
                  </pre>
                </section>

                <section id="pig-results">
                  <h2>Pig Results</h2>
                  <br />
                  <pre>
<code>
(snake,22)
(snaketail,10)
</code>
                  </pre>
                </section>

              </section>

              <section id="python">
                <!-- Hadoop Streaming slides. Every file embedded below is HTML-escaped at
                     render time so that angle brackets and ampersands in the sources are
                     shown literally instead of being parsed as markup. -->

                <section id="python-intro">
                  <h2>Hadoop Streaming: Python</h2>
                  <br />
                  <ul>
                    <li>Map and reduce in Python</li>
                    <li>Code reads from STDIN and writes to STDOUT</li>
                    <li>Slow due to interpreter start-up overhead</li>
                    <li>Slow due to short task lives - JIT does not help</li>
                  </ul>
                </section>

                <section id="python-mapper">
                  <h2>Mapper</h2>
                  <br />
                  <pre>
<code class="python">
<%= ERB::Util.html_escape(File.read('warez/map.py')) %>
</code>
                  </pre>
                </section>

                <section id="python-mapper-explain">
                  <h2>Mapper Continued</h2>
                  <br />
                  <ol>
                    <li>Iterate over STDIN with fileinput.</li>
                    <li>Split each line.</li>
                    <li>Check if the ngram is only in one volume.</li>
                    <li>If so, print out the year and a '1' as a tab-separated key-value pair.</li>
                  </ol>
                </section>

                <section id="python-reducer">
                  <h2>Reducer</h2>
                  <br />
                  <pre>
<code class="python">
<%= ERB::Util.html_escape(File.read('warez/reduce.py')) %>
</code>
                  </pre>
                </section>

                <section id="python-reducer-explain">
                  <h2>Reducer Continued</h2>
                  <br />
                  <ol>
                    <li>Sum the amount of ngrams that only appear in one volume by year</li>
                    <li>Print the sorted result</li>
                  </ol>
                </section>

                <section id="python-running">
                  <h2>Submitting Python</h2>
                  <br />
                  <pre>
<code>
<%= ERB::Util.html_escape(File.read('warez/python.sh')) %>
</code>
                  </pre>
                </section>

                <section id="python-results">
                  <pre>
<code>
snake   22
snaketail       10
</code>
                  </pre>
                </section>

              </section>

                  <section id="spark">
                    <!-- Spark slides. Every file embedded below is HTML-escaped at render time
                         so that angle brackets and ampersands in the sources are shown
                         literally instead of being parsed as markup. -->

                    <section id="spark-intro">
                      <h2>Spark</h2>
                      <br />
                      <ul>
                        <li>Spark supports Scala, Java, and Python</li>
                        <li>Packaging Scala and Java for job submission is complicated</li>
                        <li>Spark examples will be shown in Python</li>
                        <li>PySpark, the Python API, is slower than the Scala/Java API</li>
                      </ul>
                    </section>

                    <section id="pyspark-shell">
                      <h2>PySpark Shell and Submission</h2>
                      <pre>
<code>
<%= ERB::Util.html_escape(File.read('warez/pyspark.sh')) %>
</code>
                      </pre>
                    </section>

                    <!-- The script is shown without its empty lines and without lines that start with "#". -->
                    <section id="pyspark-code">
                      <h2>PySpark code</h2>
                      <br />
                      <pre>
<code>
<%=
  ERB::Util.html_escape(
    File.read('warez/ngrams-spark.py')
        .split("\n")
        .reject { |line| line.start_with?('#') || line.empty? }
        .join("\n")
  )
%>
</code>
                      </pre>
                    </section>

                  </section>

                  <section id="choosing">
                    <!-- Rules of thumb for picking among the tools shown earlier in the deck. -->
                    <h2>Choosing a tool and language</h2>
                    <br>
                    <ul>
                      <li>
                        If you can use a tool you're already familiar with, do
                      </li>
                      <li>
                        Working on structured data? Use Hive or Pig
                      </li>
                      <li>
                        There is probably already an Apache tool to do what you want
                      </li>
                    </ul>
                  </section>

                  <section id="testing-locally">
                    <h2>Testing Locally</h2>
                    <br />
                    <!-- The link text names its target so it still makes sense when read out of context, for example in a screen reader's list of links. -->
                    <p>
                    You can test both Pig and Spark locally if you have them
                    installed. In addition, there is a
                    <a href="https://github.com/trozamon/hadoop-pres/blob/master/Dockerfile">Dockerfile</a>
                    that provides a pre-built environment for testing Pig and
                    Spark if you are a Docker user.
                    </p>
                  </section>

                  <section id="contact">
                    <!-- Closing slide: support e-mail address, website, and Twitter account. -->
                    <h2>Contact Info</h2>
                    <ul>
                      <li><a href="mailto:hpc-support@umich.edu">hpc-support@umich.edu</a></li>
                      <li><a href="http://arc-ts.umich.edu/">http://arc-ts.umich.edu/</a></li>
                      <li><a href="https://twitter.com/arcts_um">@arcts_um</a></li>
                    </ul>
                  </section>

            </div>
        </div>
        <script src="lib/js/head.min.js"></script>
        <script src="js/reveal.min.js"></script>

        <script>
(function () {
    // True when the browser exposes classList on the document body.
    function hasClassList() {
        return !!document.body.classList;
    }

    // True when at least one element in the document carries a data-markdown attribute.
    function usesMarkdown() {
        return !!document.querySelector( '[data-markdown]' );
    }

    // This call is required even when no options are passed.
    Reveal.initialize({
        slideNumber: true,
        history: true,
        dependencies: [
            // Cross-browser shim that fully implements classList - https://github.com/eligrey/classList.js/
            { src: 'lib/js/classList.js', condition: function() { return !hasClassList(); } },

            // Interpret Markdown in <section> elements
            { src: 'plugin/markdown/marked.js', condition: usesMarkdown },
            { src: 'plugin/markdown/markdown.js', condition: usesMarkdown },

            // Syntax highlight for <code> elements
            { src: 'plugin/highlight/highlight.js', async: true, callback: function() { hljs.initHighlightingOnLoad(); } },

            // Zoom in and out with Alt+click
            { src: 'plugin/zoom-js/zoom.js', async: true, condition: hasClassList },

            // Speaker notes
            { src: 'plugin/notes/notes.js', async: true, condition: hasClassList },

            // Remote control your reveal.js presentation using a touch device
            { src: 'plugin/remotes/remotes.js', async: true, condition: hasClassList },

            // MathJax
            { src: 'plugin/math/math.js', async: true }
        ]
    });
})();

        </script>
    </body>
</html>