-
Notifications
You must be signed in to change notification settings - Fork 25
/
README.html
178 lines (115 loc) · 17 KB
/
README.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<title>README</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<style>
/*
This document has been created with Marked.app <http://markedapp.com>, Copyright 2011 Brett Terpstra
Please leave this notice in place, along with any additional credits below.
---------------------------------------------------------------
Title: GitHub
Author: Brett Terpstra
Description: Github README style. Includes theme for Pygmentized code blocks.
*/
html,body{color:black}*{margin:0;padding:0}body{font:13.34px helvetica,arial,freesans,clean,sans-serif;-webkit-font-smoothing:antialiased;line-height:1.4;padding:3px;background:#fff;border-radius:3px;-moz-border-radius:3px;-webkit-border-radius:3px}p{margin:1em 0}a{color:#4183c4;text-decoration:none}#wrapper{background-color:#fff;border:3px solid #eee!important;padding:0 30px;margin:15px}#wrapper{font-size:14px;line-height:1.6}#wrapper>*:first-child{margin-top:0!important}#wrapper>*:last-child{margin-bottom:0!important}h1,h2,h3,h4,h5,h6{margin:0;padding:0}h1{margin:15px 0;padding-bottom:2px;font-size:24px;border-bottom:1px solid #eee}h2{margin:20px 0 10px 0;font-size:18px}h3{margin:20px 0 10px 0;padding-bottom:2px;font-size:14px;border-bottom:1px solid #ddd}h4{font-size:14px;line-height:26px;padding:18px 0 4px;font-weight:bold;text-transform:uppercase}h5{font-size:13px;line-height:26px;padding:14px 0 0;font-weight:bold;text-transform:uppercase}h6{color:#666;font-size:14px;line-height:26px;padding:18px 0 0;font-weight:normal;font-variant:italic}hr{background:transparent 
url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAYAAAAECAYAAACtBE5DAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAAyJpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuMC1jMDYwIDYxLjEzNDc3NywgMjAxMC8wMi8xMi0xNzozMjowMCAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIiB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIgeG1sbnM6c3RSZWY9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZVJlZiMiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENTNSBNYWNpbnRvc2giIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6OENDRjNBN0E2NTZBMTFFMEI3QjRBODM4NzJDMjlGNDgiIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6OENDRjNBN0I2NTZBMTFFMEI3QjRBODM4NzJDMjlGNDgiPiA8eG1wTU06RGVyaXZlZEZyb20gc3RSZWY6aW5zdGFuY2VJRD0ieG1wLmlpZDo4Q0NGM0E3ODY1NkExMUUwQjdCNEE4Mzg3MkMyOUY0OCIgc3RSZWY6ZG9jdW1lbnRJRD0ieG1wLmRpZDo4Q0NGM0E3OTY1NkExMUUwQjdCNEE4Mzg3MkMyOUY0OCIvPiA8L3JkZjpEZXNjcmlwdGlvbj4gPC9yZGY6UkRGPiA8L3g6eG1wbWV0YT4gPD94cGFja2V0IGVuZD0iciI/PqqezsUAAAAfSURBVHjaYmRABcYwBiM2QSA4y4hNEKYDQxAEAAIMAHNGAzhkPOlYAAAAAElFTkSuQmCC) repeat-x 0 0;border:0 none;color:#ccc;height:4px;margin:20px 0;padding:0}#wrapper>h2:first-child,#wrapper>h1:first-child,#wrapper>h1:first-child+h2{border:0;margin:0;padding:0}#wrapper>h3:first-child,#wrapper>h4:first-child,#wrapper>h5:first-child,#wrapper>h6:first-child{margin:0;padding:0}h4+p,h5+p,h6+p{margin-top:0}li p.first{display:inline-block}ul,ol{margin:15px 0 15px 25px}ul li,ol li{margin-top:7px;margin-bottom:7px}ul li>*:last-child,ol li>*:last-child{margin-bottom:0}ul li>*:first-child,ol li>*:first-child{margin-top:0}#wrapper>ul,#wrapper>ol{margin-top:21px;margin-left:36px}dl{margin:0;padding:20px 0 0}dl dt{font-size:14px;font-weight:bold;line-height:normal;margin:0;padding:20px 0 0}dl 
dt:first-child{padding:0}dl dd{font-size:13px;margin:0;padding:3px 0 0}blockquote{margin:14px 0;border-left:4px solid #ddd;padding-left:11px;color:#555}table{border-collapse:collapse;margin:20px 0 0;padding:0}table tr{border-top:1px solid #ccc;background-color:#fff;margin:0;padding:0}table tr:nth-child(2n){background-color:#f8f8f8}table tr th,table tr td{border:1px solid #ccc;text-align:left;margin:0;padding:6px 13px}img{max-width:100%;height:auto}code,tt{margin:0 2px;padding:2px 5px;white-space:nowrap;border:1px solid #ccc;background-color:#f8f8f8;border-radius:3px;-moz-border-radius:3px;-webkit-border-radius:3px;font-size:12px}pre>code{margin:0;padding:0;white-space:pre;border:0;background:transparent;font-size:13px}.highlight pre,pre{background-color:#f8f8f8;border:1px solid #ccc;font-size:13px;line-height:19px;overflow:auto;padding:6px 10px;border-radius:3px;-moz-border-radius:3px;-webkit-border-radius:3px}#wrapper>pre,#wrapper>div.highlight{margin:10px 0 0}pre code,pre tt{background-color:transparent;border:0}#wrapper{background-color:#fff;border:1px solid #cacaca;padding:30px}.poetry pre{font-family:Georgia,Garamond,serif!important;font-style:italic;font-size:110%!important;line-height:1.6em;display:block;margin-left:1em}.poetry pre code{font-family:Georgia,Garamond,serif!important}sup,sub,a.footnote{font-size:1.4ex;height:0;line-height:1;vertical-align:super;position:relative}sub{vertical-align:sub;top:-1px}@media print{body{background:#fff}img,pre,blockquote,table,figure{page-break-inside:avoid}#wrapper{background:#fff;border:0}code{background-color:#fff;color:#444!important;padding:0 .2em;border:1px solid #dedede}pre code{background-color:#fff!important;overflow:visible}pre{background:#fff}}@media screen{body.inverted,.inverted #wrapper,.inverted hr .inverted p,.inverted td,.inverted li,.inverted h1,.inverted h2,.inverted h3,.inverted h4,.inverted h5,.inverted h6,.inverted th,.inverted .math,.inverted caption,.inverted dd,.inverted dt,.inverted 
blockquote{color:#eee!important;border-color:#555}.inverted td,.inverted th{background:#333}.inverted pre,.inverted code,.inverted tt{background:#444!important}.inverted h2{border-color:#555}.inverted hr{border-color:#777;border-width:1px!important}::selection{background:rgba(157,193,200,.5)}h1::selection{background-color:rgba(45,156,208,.3)}h2::selection{background-color:rgba(90,182,224,.3)}h3::selection,h4::selection,h5::selection,h6::selection,li::selection,ol::selection{background-color:rgba(133,201,232,.3)}code::selection{background-color:rgba(0,0,0,.7);color:#eee}code span::selection{background-color:rgba(0,0,0,.7)!important;color:#eee!important}a::selection{background-color:rgba(255,230,102,.2)}.inverted a::selection{background-color:rgba(255,230,102,.6)}td::selection,th::selection,caption::selection{background-color:rgba(180,237,95,.5)}.inverted{background:#0b2531}.inverted #wrapper,.inverted{background:rgba(37,42,42,1)}.inverted a{color:rgba(172,209,213,1)}}.highlight .c{color:#998;font-style:italic}.highlight .err{color:#a61717;background-color:#e3d2d2}.highlight .k{font-weight:bold}.highlight .o{font-weight:bold}.highlight .cm{color:#998;font-style:italic}.highlight .cp{color:#999;font-weight:bold}.highlight .c1{color:#998;font-style:italic}.highlight .cs{color:#999;font-weight:bold;font-style:italic}.highlight .gd{color:#000;background-color:#fdd}.highlight .gd .x{color:#000;background-color:#faa}.highlight .ge{font-style:italic}.highlight .gr{color:#a00}.highlight .gh{color:#999}.highlight .gi{color:#000;background-color:#dfd}.highlight .gi .x{color:#000;background-color:#afa}.highlight .go{color:#888}.highlight .gp{color:#555}.highlight .gs{font-weight:bold}.highlight .gu{color:#800080;font-weight:bold}.highlight .gt{color:#a00}.highlight .kc{font-weight:bold}.highlight .kd{font-weight:bold}.highlight .kn{font-weight:bold}.highlight .kp{font-weight:bold}.highlight .kr{font-weight:bold}.highlight .kt{color:#458;font-weight:bold}.highlight 
.m{color:#099}.highlight .s{color:#d14}.highlight .na{color:#008080}.highlight .nb{color:#0086b3}.highlight .nc{color:#458;font-weight:bold}.highlight .no{color:#008080}.highlight .ni{color:#800080}.highlight .ne{color:#900;font-weight:bold}.highlight .nf{color:#900;font-weight:bold}.highlight .nn{color:#555}.highlight .nt{color:#000080}.highlight .nv{color:#008080}.highlight .ow{font-weight:bold}.highlight .w{color:#bbb}.highlight .mf{color:#099}.highlight .mh{color:#099}.highlight .mi{color:#099}.highlight .mo{color:#099}.highlight .sb{color:#d14}.highlight .sc{color:#d14}.highlight .sd{color:#d14}.highlight .s2{color:#d14}.highlight .se{color:#d14}.highlight .sh{color:#d14}.highlight .si{color:#d14}.highlight .sx{color:#d14}.highlight .sr{color:#009926}.highlight .s1{color:#d14}.highlight .ss{color:#990073}.highlight .bp{color:#999}.highlight .vc{color:#008080}.highlight .vg{color:#008080}.highlight .vi{color:#008080}.highlight .il{color:#099}.highlight .gc{color:#999;background-color:#eaf2f5}.type-csharp .highlight .k{color:#00F}.type-csharp .highlight .kt{color:#00F}.type-csharp .highlight .nf{color:#000;font-weight:normal}.type-csharp .highlight .nc{color:#2b91af}.type-csharp .highlight .nn{color:#000}.type-csharp .highlight .s{color:#a31515}.type-csharp .highlight .sc{color:#a31515}
</style>
</head>
<body class="normal">
<div id="wrapper">
<h1 id="scaldingworkshopreadme">Scalding Workshop README</h1>
<p><em>Copyright (C) 2010-2014 Think Big Analytics, Inc. All Rights Reserved.</em></p>
<p><strong>StrangeLoop 2012</strong><br/>
<strong>Dean Wampler, Think Big Analytics</strong><br/>
<a href="mailto:dean@deanwampler.com">dean@deanwampler.com</a><br/>
<a href="https://twitter.com/deanwampler">@deanwampler</a><br/>
<a href="http://thinkbiganalytics.com">Hire Us!</a></p>
<h2 id="aboutthisworkshop">About this Workshop</h2>
<p>This workshop is a half-day tutorial on Scalding and its place in the Hadoop ecosystem. <a href="https://github.com/twitter/scalding">Scalding</a> is a Scala API developed at Twitter for distributed data programming that uses the <a href="http://www.cascading.org/">Cascading</a> Java API, which in turn sits on top of Hadoop’s Java API. However, Scalding, through Cascading, also offers a <em>local</em> mode that makes it easy to run jobs without using the Hadoop libraries, for simpler testing and learning. We’ll use this feature for most of this workshop.</p>
<h2 id="gettingstarted">Getting Started</h2>
<p>To keep the setup process as simple as possible, the workshop git repo contains a pre-built jar that bundles Scalding v0.7.3 for Scala v2.9.2 and other required jars, such as <code>Cascading</code>, <code>Hadoop</code> <em>core</em>, <code>Log4J</code>, etc. So, all you need to install is Java, Scala, Ruby, and this workshop.</p>
<p>It helps to pick a work directory where you will install some of the packages. In what follows, we’ll assume you’re using <code>$HOME/fun</code> on Linux, Mac OSX, or Cygwin for Windows with the <code>bash</code> shell (or a similar shell) or you are using <code>C:\fun</code> on Windows.</p>
<h3 id="git">Git</h3>
<p>You’ll need git to clone the workshop repository and optionally for other installs. See <a href="http://git-scm.com/book/en/Getting-Started-Installing-Git">here</a> for details. As an alternative, you can download a workshop release from its GitHub repo, rather than clone it.</p>
<h3 id="thisworkshop">This Workshop</h3>
<p>Download or clone this <a href="https://github.com/thinkbiganalytics/scalding-workshop">workshop from GitHub</a>.</p>
<p>To clone this workshop from GitHub using <code>bash</code>:</p>
<pre><code>cd $HOME/fun
git clone https://github.com/thinkbiganalytics/scalding-workshop
</code></pre>
<p>On Windows:</p>
<pre><code>cd C:\fun
git clone https://github.com/thinkbiganalytics/scalding-workshop
</code></pre>
<p>Or, simply <a href="https://github.com/ThinkBigAnalytics/scalding-workshop/downloads">download a release</a>.</p>
<h3 id="javav1.6orbetter">Java v1.6 or Better</h3>
<p>Install Java if necessary from <a href="http://www.java.com/en/download/help/download_options.xml">here</a>.</p>
<h3 id="scalav2.9.2">Scala v2.9.2</h3>
<p>Scalding uses Scala v2.9.2. Install it from <a href="http://www.scala-lang.org/downloads">here</a>.</p>
<h3 id="rubyv1.8.7orv1.9.x">Ruby v1.8.7 or v1.9.X</h3>
<p>Ruby is used as a platform-independent language for driver scripts by Scalding and we’ve followed the same convention. See <a href="http://ruby-lang.org">ruby-lang.org</a> for details on installing Ruby. Either version 1.8.7 or 1.9.X will work.</p>
<h2 id="sanitycheck">Sanity Check</h2>
<p>Once you’ve completed these steps, run the following commands as a sanity check to ensure that everything is set up properly. Using <code>bash</code>: </p>
<pre><code>cd $HOME/fun/scalding-workshop
./run.rb scripts/SanityCheck0.scala
</code></pre>
<p>On Windows:</p>
<pre><code>cd C:\fun\scalding-workshop
ruby run.rb scripts/SanityCheck0.scala
</code></pre>
<p>The commands should run without error. Note that it takes a moment to compile the Scala script and run to completion. The output is written to <code>output/SanityCheck0.txt</code>. What’s in that file?</p>
<h2 id="optionalinstalls">Optional Installs</h2>
<p>If you’re serious about using Scalding, you should clone and build the Scalding repo. We’ll talk briefly about it in the workshop, but it isn’t required.</p>
<h3 id="sbtv0.11">SBT v0.11</h3>
<p>SBT is the <em>de facto</em> build tool for Scala. You’ll need it to build Scalding. Follow these <a href="https://github.com/harrah/xsbt/wiki/Getting-Started-Setup">installation instructions</a>.</p>
<h3 id="scaldingfromgithub">Scalding from GitHub</h3>
<p>Clone <a href="https://github.com/twitter/scalding">Scalding from GitHub</a>. Using <code>bash</code>:</p>
<pre><code>cd $HOME/fun
git clone https://github.com/twitter/scalding.git
</code></pre>
<p>On Windows:</p>
<pre><code>cd C:\fun
git clone https://github.com/twitter/scalding.git
</code></pre>
<h3 id="buildscalding">Build Scalding</h3>
<p>Build Scalding according to its <a href="https://github.com/twitter/scalding/wiki/Getting-Started">Getting Started</a> page. Here is a synopsis of the steps. Using <code>bash</code>: </p>
<pre><code>cd $HOME/fun/scalding
sbt update
sbt assembly
</code></pre>
<p>On Windows:</p>
<pre><code>cd C:\fun\scalding
sbt update
sbt assembly
</code></pre>
<p>(The Getting Started page says to build the <code>test</code> target between <code>update</code> and <code>assembly</code>, but the latter builds <code>test</code> itself.)</p>
<h3 id="scaldingsanitycheck">Sanity Check</h3>
<p>Once you’ve built Scalding, run the following command as a sanity check to ensure everything is set up properly. Using <code>bash</code>: </p>
<pre><code>cd $HOME/fun/scalding
scripts/scald.rb --local tutorial/Tutorial0.scala
</code></pre>
<p>On Windows:</p>
<pre><code>cd C:\fun\scalding
ruby scripts\scald.rb --local tutorial/Tutorial0.scala
</code></pre>
<h2 id="nextsteps">Next Steps</h2>
<p>The Workshop/Tutorial proper is described in the companion <a href="https://github.com/thinkbiganalytics/scalding-workshop/blob/master/Workshop.html">Workshop document</a>.</p>
<h2 id="notesonreleases">Notes on Releases</h2>
<h3 id="v0.2.1">V0.2.1</h3>
<p>Added missing file to distribution. Refined the run scripts to work better with different Java versions.</p>
<h3 id="v0.2">V0.2</h3>
<p>Refined several exercises and fixed bugs. Added <code>Makefile</code> for building releases.</p>
<h3 id="v0.1">V0.1</h3>
<p>First release for StrangeLoop 2012 workshop.</p>
<h2 id="forfurtherinformation">For Further Information</h2>
<p>See the <a href="https://github.com/twitter/scalding">Scalding GitHub page</a> for more information about Scalding. The <a href="https://github.com/twitter/scalding/wiki">wiki</a> is very useful.</p>
<p><a href="mailto:dean@deanwampler.com">Dean Wampler</a> from <a href="http://thinkbiganalytics.com">Think Big Analytics</a> prepared this workshop. <a href="mailto:dean@deanwampler.com">Contact Dean</a> with questions about the workshop. For information about consulting and training on Scalding and other Hadoop-related topics, <a href="mailto:info@thinkbiganalytics.com">send us email</a>.</p>
<p>Some of the data used in these exercises was obtained from <a href="http://infochimps.com">InfoChimps</a>.</p>
<p><strong>Dean Wampler</strong><br/>
<a href="mailto:dean@deanwampler.com">dean@deanwampler.com</a><br/>
<a href="https://twitter.com/deanwampler">@deanwampler</a><br/></p>
</div>
</body>
</html>