% exampleTimelineReport.Rnw

\documentclass[a4paper]{article}
\SweaveOpts{echo=FALSE}
\usepackage{a4wide}
\usepackage{color}

<< echo = F >>=
# Setup chunk: load packages, fetch the user timeline and derive the columns
# used by the figures below.
# library() stops immediately when a package is missing, whereas require()
# only returns FALSE and lets the failure surface later.
library(twitteR)
source("TAGS-Based-Scripts/core-TAGSExplorer-API.R")

# Report parameters
username <- "bishopofdurham"  # account whose timeline is summarised
num <- 1500                   # number of tweets requested from the timeline
minsmax <- 60 * 24 * 100      # maximum tweet age to keep, in minutes (100 days)

# the most tweets we can bring back from a user timeline is the most recent 3200...
mht <- userTimeline(username, n = num)
tw.df <- twListToDF(mht)

# As I've done in previous scripts, pull out the names of folk who have been
# "old-fashioned RTd"...
library(stringr)

# Remove the first "@" from each element of x.
trim <- function(x) sub("@", "", x)

# rt:  screen name captured from a leading "RT @name", NA when the tweet does
#      not start that way. str_match() is vectorised, so column 2 of its
#      result holds the capture group for every tweet at once.
tw.df$rt <- trim(str_match(tw.df$text, "^RT (@[[:alnum:]_]*)")[, 2])
# rtt: "RT" for old-style retweets, "T" for everything else.
tw.df$rtt <- ifelse(is.na(tw.df$rt), "T", "RT")

# Keep only tweets younger than minsmax minutes. The age is computed with
# explicit units: a bare `Sys.time() - created` picks its units automatically
# (secs, mins, hours or days), so comparing it with a minute count is wrong
# whenever the automatic unit is not minutes.
tw.dfs <- subset(
  tw.df,
  subset = as.numeric(difftime(Sys.time(), created, units = "mins")) < minsmax
)

library(ggplot2)
@

\title{Example Document Summarising User Timeline Behaviour on Twitter}

\author{
Tony Hirst\thanks{@psychemedia}\\License: CC-BY
}

\date{\today}


\begin{document}
\SweaveOpts{concordance=TRUE}

\maketitle

\renewcommand{\topfraction}{0.85}
\renewcommand{\textfraction}{0.1}
\renewcommand{\floatpagefraction}{0.75}

\newpage

\section{Twitter User Timeline Summary}
This report has been automatically generated from user timeline data retrieved from the Twitter API.

\subsection{Replies}
A series of summary reports around reply and retweet behaviour observed on the sampled user timeline.


\begin{figure}[htbp]
\begin{center}
<<exampleReplyToUserAccession, fig = T, echo = F>>=
library(plyr)
# Order the replyToSN factor levels in the order in which they were first
# replied to: for each replyToSN group keep the row(s) carrying the earliest
# timestamp, then sort those rows by time.
tw.dfx <- ddply(tw.dfs, .variables = "replyToSN", .fun = function(x) {
  subset(x, created %in% min(created), select = c(replyToSN, created))
})
tw.dfxa <- arrange(tw.dfx, created)
# unique() guards against duplicated levels, which factor() rejects, when a
# group has several rows sharing its earliest timestamp.
tw.dfs$replyToSN <- factor(tw.dfs$replyToSN, levels = unique(tw.dfxa$replyToSN))

# and plot the result
p <- ggplot(tw.dfs) + geom_point(aes(x = created, y = replyToSN))
print(p)
@
\caption{Users Replied To Over Time}
\end{center}
\end{figure}

\begin{figure}[htbp]
\begin{center}
<<exampleOldStyleRTUserAccession, fig = T, echo = F>>=
# Order the rt factor levels in the order in which they were first retweeted:
# for each rt group keep the row(s) carrying the earliest timestamp, then sort
# those rows by time. The selection names rt (not replyToSN) so the frame
# holds exactly the columns this chunk uses.
tw.dfx <- ddply(tw.dfs, .variables = "rt", .fun = function(x) {
  subset(x, created %in% min(created), select = c(rt, created))
})
tw.dfxa <- arrange(tw.dfx, created)
# unique() guards against duplicated levels, which factor() rejects, when a
# group has several rows sharing its earliest timestamp.
tw.dfs$rt <- factor(tw.dfs$rt, levels = unique(tw.dfxa$rt))

# and plot the result
p <- ggplot(tw.dfs) + geom_point(aes(x = created, y = rt))
print(p)
@
\caption{Users Old-Style RT'd Over Time}
\end{center}
\end{figure}


\begin{figure}[htbp]
\begin{center}
<<exampleReplyAndRTUserAccession, fig = T, echo = F>>=

# Build the combined chart one layer at a time on an empty plot.
p <- ggplot()
# Red: replies, plotted against the screen name replied to.
p <- p + geom_point(
  data = tw.dfs[!is.na(tw.dfs$replyToSN), ],
  aes(x = created, y = replyToSN),
  col = "red"
)
# Blue: old-style RTs, plotted against the screen name retweeted.
p <- p + geom_point(
  data = tw.dfs[!is.na(tw.dfs$rt), ],
  aes(x = created, y = rt),
  col = "blue"
)
# Green: tweets that are neither a reply nor an old-style RT, plotted against
# the screenName column.
p <- p + geom_point(
  data = tw.dfs[is.na(tw.dfs$replyToSN) & is.na(tw.dfs$rt), ],
  aes(x = created, y = screenName),
  col = "green"
)

print(p)
@
\caption{Originated Tweets (green), Replies to User (red) and Old Style RTs (blue) of User over time}
\end{center}
\end{figure}


\begin{figure}[htbp]
\begin{center}
<<exampleReplyCounts, fig = T, echo = F>>=

# First we need to count how many replies a user gets...
# http://stackoverflow.com/a/3255448/454773
r_table <- table(tw.dfs$replyToSN)
# ..rank them from most to least replied to...
r_levels <- names(r_table)[order(-r_table)]
# ..and use this ordering to order the factor levels...
tw.dfs$replyToSN <- factor(tw.dfs$replyToSN, levels = r_levels)

# Then we can plot the chart. theme()/element_text() replace opts()/theme_text(),
# which are defunct in current ggplot2 releases.
p <- ggplot(subset(tw.dfs, subset = !is.na(replyToSN)), aes(x = replyToSN)) +
  geom_bar(aes(y = (..count..))) +
  theme(axis.text.x = element_text(angle = -90, size = 6))
print(p)
@
\caption{Plot of most heavily replied to users}
\end{center}
\end{figure}

\begin{figure}[htbp]
\begin{center}
<<exampleOldRTCounts, fig = T, echo = F>>=
# Count how many times each account was old-style retweeted...
r_table <- table(tw.dfs$rt)
# ..rank them from most to least retweeted...
r_levels <- names(r_table)[order(-r_table)]
# ..and use this ordering to order the factor levels...
tw.dfs$rt <- factor(tw.dfs$rt, levels = r_levels)

# Then we can plot the chart. theme()/element_text() replace opts()/theme_text(),
# which are defunct in current ggplot2 releases.
p <- ggplot(subset(tw.dfs, subset = !is.na(rt)), aes(x = rt)) +
  geom_bar(aes(y = (..count..))) +
  theme(axis.text.x = element_text(angle = -90, size = 6))

# Best effort: rendering is wrapped in try() so that a failure to draw this
# chart does not abort the rest of the report.
try(print(p), silent = TRUE)
@
\caption{Plot of most heavily old-style RT'd users}
\end{center}
\end{figure}

\begin{figure}[htbp]
\begin{center}
<<exampleTweetsByHourOfDay, fig = T, echo = F>>=
# Break every timestamp into calendar fields in one vectorised conversion
# instead of converting tweet by tweet.
created_lt <- as.POSIXlt(tw.dfs$created)
# label a tweet with the month number (POSIXlt months run 0-11)
tw.dfs$month <- created_lt$mon
# label a tweet with the hour (0-23)
tw.dfs$hour <- created_lt$hour
# label a tweet with a number corresponding to the day of the week
# (POSIXlt weekdays run 0-6, starting on Sunday)
tw.dfs$wday <- created_lt$wday

p <- ggplot(tw.dfs) + geom_jitter(aes(x = wday, y = hour))
print(p)
@
\caption{Plot of when tweeting occurred, by day of week and hour of day (random jitter is applied so that overlapping points remain visible)}
\end{center}
\end{figure}

\begin{figure}[htbp]
\begin{center}
<<exampleTweetsByDay, fig = T, echo = F>>=
# Count tweets per day of week using unit-wide bins. geom_histogram() is used
# because geom_bar() no longer supports a binwidth argument in current ggplot2.
p <- ggplot(tw.dfs, aes(x = wday)) +
  geom_histogram(aes(y = (..count..)), binwidth = 1)

print(p)
@
\caption{Count of tweets by day of week}
\end{center}
\end{figure}

\begin{figure}[htbp]
\begin{center}
<<exampleTweetsByHour, fig = T, echo = F>>=
# Count tweets per hour of day using unit-wide bins. geom_histogram() is used
# because geom_bar() no longer supports a binwidth argument in current ggplot2.
p <- ggplot(tw.dfs, aes(x = hour)) +
  geom_histogram(aes(y = (..count..)), binwidth = 1)
print(p)
@
\caption{Count of tweets by hour of day}
\end{center}
\end{figure}

\begin{figure}[htbp]
\begin{center}
<<exampleTweetsTimeseries, fig = T, echo = F>>=
library(xts)
# The xts function creates a timeline from a vector of values and a vector of
# timestamps. Knowing how many tweets we have, a vector containing that number
# of 1s gives one unit observation per tweet.
# (Named tweet_ts rather than ts so it does not shadow stats::ts.)
tweet_ts <- xts(rep(1, times = nrow(tw.dfs)), tw.dfs$created)

# We can now do some handy number crunching on the timeseries, such as applying
# a function to the values contained within day, week, month, quarter or year
# time bins. Summing the unit values in each daily bin gives a count of the
# number of tweets per day.
ts.sum <- apply.daily(tweet_ts, sum)
# also apply.weekly, apply.monthly, apply.quarterly, apply.yearly

# If for any reason we need to turn the timeseries into a dataframe, we can:
# http://stackoverflow.com/a/3387259/454773
ts.sum.df <- data.frame(date = index(ts.sum), coredata(ts.sum))

colnames(ts.sum.df) <- c("date", "sum")

# We can then use ggplot to plot the timeseries...
p <- ggplot(ts.sum.df) + geom_line(aes(x = date, y = sum))

print(p)
@
\caption{Tweeting activity: number of tweets per day over time}
\end{center}
\end{figure}


\begin{figure}[htbp]
\begin{center}
<<exampleCalendarHeatmap, fig = T, echo = F>>=

# Load the third-party script; calendarHeat() is called right after sourcing it.
source("thirdpartyScripts/calendarHeatmap.R")
# Feed the daily tweet counts to calendarHeat(), labelling the variable with
# the account name.
p <- calendarHeat(
  ts.sum.df$date,
  ts.sum.df$sum,
  varname = paste(username, "Twitter activity")
)
print(p)
@
\caption{Tweeting activity shown as a calendar heatmap}
\end{center}
\end{figure}


\begin{figure}[htbp]
\begin{center}
<<exampleWordcloud, fig = T, echo = F>>=
# Load the wordcloud helper script before calling RemoveAtPeople(),
# generateCorpus() and wordcloud.generate().
source("doodles/wordcloud.R")
# Run RemoveAtPeople() over each tweet's text and flatten the result.
tweets <- as.vector(
  sapply(tw.dfs$text, RemoveAtPeople)
)
# Build a corpus from the processed tweets and generate the wordcloud from it.
p <- wordcloud.generate(
  generateCorpus(tweets),
  3
)
print(p)
@
\caption{Example wordcloud generated from user timeline}
\end{center}
\end{figure}


\begin{figure}[htbp]
\begin{center}
<<exampleHashtagCount, fig = T, echo = F>>=
# hashtag processing via http://stackoverflow.com/a/9360445/454773
#
# Expand a data frame of tweets to one row per hashtag.
#
# tmp:     data frame with a `text` column.
# returns: the rows of tmp with an added character column `tag`. A row that
#          contains k hashtags appears k times (one tag per copy); rows
#          without any hashtag appear once with tag = NA. Tagged rows come
#          first, untagged rows last.
#
# Works for inputs where no row has a hashtag and where every row has one:
# both cases previously raised an error (an NA `if` condition, and a length-1
# assignment into a zero-row data frame, respectively).
hashtagAugment <- function(tmp) {
  text <- as.character(tmp$text)
  # One character vector of hashtags per row (character(0) when none).
  tags <- regmatches(text, gregexpr("#[a-zA-Z0-9]+", text))
  tag_counts <- vapply(tags, length, integer(1))

  # Repeat each row index once per hashtag found in that row.
  index <- rep.int(seq_len(nrow(tmp)), tag_counts)
  # drop = FALSE keeps a data frame even when tmp has a single column.
  tagged <- tmp[index, , drop = FALSE]
  # as.character() turns the NULL that unlist() yields for an empty list into
  # character(0), so the column is still created on zero rows.
  tagged$tag <- as.character(unlist(tags, use.names = FALSE))

  not_tagged <- tmp[tag_counts == 0L, , drop = FALSE]
  # rep() matches the row count exactly, including zero rows.
  not_tagged$tag <- rep(NA_character_, nrow(not_tagged))

  rbind(tagged, not_tagged)
}
tw.dfst <- hashtagAugment(tw.dfs)

# Drop untagged rows from the data itself rather than wrapping the aesthetic in
# na.omit(): an aesthetic shorter than the data it is mapped from is rejected
# by ggplot2, and that error would be swallowed by the try() below.
# theme()/element_text() replace opts()/theme_text(), which are defunct in
# current ggplot2 releases.
p <- ggplot(subset(tw.dfst, subset = !is.na(tag)), aes(x = tag)) +
  geom_bar(aes(y = (..count..))) +
  xlab(NULL) +
  theme(axis.text.x = element_text(angle = -90, size = 6))

# Best effort: rendering is wrapped in try() so that a failure to draw this
# chart does not abort the rest of the report.
try(print(p), silent = TRUE)
@
\caption{Example hashtag count}
\end{center}
\end{figure}

\begin{figure}[htbp]
\begin{center}
<<exampleHashtagOverTime, fig = T, echo = F>>=
# Order the tag factor levels in the order in which they were first used:
# for each tag group keep the row(s) carrying the earliest timestamp, then
# sort those rows by time.
tw.dfx <- ddply(tw.dfst, .variables = "tag", .fun = function(x) {
  subset(x, created %in% min(created), select = c(tag, created))
})
tw.dfxa <- arrange(tw.dfx, created)
# unique() guards against duplicated levels, which factor() rejects: a tweet
# that repeats a hashtag contributes several rows with the same tag and the
# same timestamp.
tw.dfst$tag <- factor(tw.dfst$tag, levels = unique(tw.dfxa$tag))

# and plot the result
p <- ggplot(tw.dfst) + geom_point(aes(x = created, y = tag))
# Best effort: rendering is wrapped in try() so that a failure to draw this
# chart does not abort the rest of the report.
try(print(p), silent = TRUE)
@
\caption{Hashtag Usage Over Time}
\end{center}
\end{figure}


\end{document}