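With almost 30k commits and a history spanning over ten years, Scala is a mature programming language. The dataset we will use, which has been previously mined and extracted from GitHub, consists of three files: pulls_2011-2013.csv and pulls_2014-2018.csv (the pull requests from each period) and pull_files.csv (the files touched by each pull request).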
import pandas as pd
# Load the three CSV extracts. Each statement sits on its own line: the
# original had all three assignments fused onto one line with no separator,
# which Python rejects as a syntax error.
# NOTE(review): later cells read the columns 'pid', 'user' and 'date' from the
# pulls frames and 'pid' and 'file' from pull_files -- confirm against the CSVs.
pulls_one = pd.read_csv('datasets/pulls_2011-2013.csv')
pulls_two = pd.read_csv('datasets/pulls_2014-2018.csv')
pull_files = pd.read_csv('datasets/pull_files.csv')
Preparing and cleaning the data
# Stack the two period extracts into one frame; ignore_index=True discards the
# per-file row labels and renumbers the combined rows from zero.
pulls = pd.concat(
    [pulls_one, pulls_two],
    ignore_index=True,
)

# Parse the raw 'date' column into timezone-aware UTC timestamps so the
# .dt accessors used further down work on it.
pulls['date'] = pd.to_datetime(
    pulls['date'],
    utc=True,
)
Merging the DataFrames
# Join every pull request to the files it touched, matching on the shared
# 'pid' key. This is a default (inner) merge, so rows without a counterpart
# in the other frame are dropped.
data = pd.merge(pulls, pull_files, on='pid')
%matplotlib inline
data['month'] = data['date'].dt.month
data['year'] = data['date'].dt.year
counts = data.groupby(['year', 'month'])['pid'].count()
counts.plot(kind='bar', figsize = (12,4))
%matplotlib inline
by_user = data.groupby('user')['pid'].count()
by_user.hist(bins=50)
# Take the ten rows with the largest 'pid' from the second extract.
# NOTE(review): this treats a higher pid as "more recent" -- confirm that pids
# are assigned chronologically.
last_10 = pulls_two.nlargest(n=10, columns='pid')

# Attach the files touched by each of those pull requests.
joined_pr = pd.merge(last_10, pull_files, on='pid')

# Collapse to the distinct file paths and display them.
files = set(joined_pr['file'])
files
# The file whose contributors we want to rank.
file = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'

# Keep only the merged rows that refer to that file.
touches_file = data['file'] == file
file_pr = data.loc[touches_file]

# Count the rows per user for this file.
author_counts = file_pr.groupby('user')['pid'].count()

# Show the three users with the highest counts.
ranked_authors = author_counts.sort_values(ascending=False)
ranked_authors.iloc[:3]
# The file whose most recent contributors we want to list.
file = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'

# Rows of pull_files that refer to this file.
touches_file = pull_files['file'] == file
file_pr = pull_files.loc[touches_file]

# Bring in the pull-request details (user, date) for those rows.
joined_pr = pd.merge(file_pr, pulls, on='pid')

# Users behind the ten rows with the latest 'date' values; using a set
# removes repeated names. The bare name on the last line displays the result.
latest_ten = joined_pr.nlargest(10, 'date')
users_last_10 = set(latest_ten['user'])
users_last_10
%matplotlib inline
authors = ['xeno-by', 'soc']
by_author = pulls[pulls['user'].isin(authors)]
counts = by_author.groupby(['user', by_author['date'].dt.year]).agg({'pid': 'count'}).reset_index()
counts_wide = counts.pivot_table(index='date', columns='user', values='pid', fill_value=0)
counts_wide.plot(kind='bar')
# The two developers to compare, and the file their work is measured on.
# These were originally two assignments fused onto a single line, which is a
# syntax error; each now has its own line.
authors = ['xeno-by', 'soc']
file = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'

# Merged (pull request x file) rows authored by either developer ...
by_author = data[data['user'].isin(authors)]

# ... narrowed to the rows that touch the file of interest.
by_file = by_author[by_author['file'] == file]

# Count the rows per (user, year). 'pid' is selected before counting so only
# that column is aggregated; the resulting Series is the same as counting
# every column and then picking 'pid'. The year Series keeps the name 'date',
# so reset_index() yields the columns 'user', 'date' and 'pid'.
grouped = (
    by_file.groupby(['user', by_file['date'].dt.year])['pid']
    .count()
    .reset_index()
)

# Reshape to one row per year and one column per user; a year in which a
# user has no matching rows becomes 0.
by_file_wide = grouped.pivot_table(index='date', columns='user', values='pid', fill_value=0)

# Bar chart with the years along the x-axis.
by_file_wide.plot(kind='bar')