# The GitHub History of the Scala Language
# Importing pandas
import pandas as pd
# Load the two pull-request tables (files named for 2011-2013 and 2014-2018)
# and the table listing the files attached to each pull request.
pulls_one = pd.read_csv('datasets/pulls_2011-2013.csv')
pulls_two = pd.read_csv('datasets/pulls_2014-2018.csv')
pull_files = pd.read_csv('datasets/pull_files.csv')
# Stack pulls_two underneath pulls_one. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat produces the same rows and, like
# append, keeps each frame's original index labels.
pulls = pd.concat([pulls_one, pulls_two])
print(pulls)
# Parse the 'date' column into timezone-aware (UTC) datetimes
pulls['date'] = pd.to_datetime(pulls['date'], utc=True)
# Join pulls with pull_files on the shared 'pid' column (inner join by
# default), giving one row for every matching pair of rows.
data = pulls.merge(pull_files, on='pid')
print(data)
[16]
# Derive calendar month and year columns from the parsed 'date' column
pr_dates = data['date'].dt
data['month'] = pr_dates.month
data['year'] = pr_dates.year
# Tally the non-null 'file' entries for every (month, year) combination.
# Each row of `data` is a pull-request/file pairing, so this counts rows of
# the merged table rather than distinct pull requests.
by_month_year = data.groupby(['month', 'year'])
counts = by_month_year['file'].count()
# Draw the tallies as a wide bar chart
counts.plot.bar(figsize=(12, 4))
[17]
# Preview the first five rows of the merged table
data.head(n=5)
[18]
# Number of non-null 'file' entries in the merged table for each user
by_user = data['file'].groupby(data['user']).count()
# Histogram showing how those per-user totals are distributed
by_user.plot.hist()
# Order every pull request from newest to oldest and keep the first ten
newest_first = pulls.sort_values('date', ascending=False)
last_10 = newest_first.iloc[:10]
# Attach the file rows that belong to those ten pull requests
joined_pr = last_10.merge(pull_files, on='pid')
# Distinct file paths appearing in the joined rows
files = joined_pr['file'].unique()
print(files)
# Also show the head of the pull-request table and the full file table
print(pulls.head())
print(pull_files)
# Path of the source file under investigation
file = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'
# Keep only the rows of the merged table that refer to that file
touches_file = data['file'] == file
file_pr = data[touches_file]
# Per-user count of non-null values in each remaining column
author_counts = file_pr.groupby('user').count()
# Show the three users with the largest 'file' counts (ascending order)
top_three = author_counts.sort_values('file').tail(n=3)
print(top_three)
# Repeat the same selection and per-user tally, printing it in full
p = data[touches_file]
g = p.groupby('user').count()
print(g)
[24]
# Path of the source file under investigation
file = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'
# Select the rows of the merged table that refer to the target file
file_pr = data[data['file'] == file]
# Merge the selection with the pulls DataFrame. Both frames carry 'user' and
# 'date', so the merged columns get the '_x' (pulls) and '_y' (file_pr)
# suffixes. merge()'s `sort` argument is only a boolean flag that orders by
# the join key, so chronological order is imposed explicitly afterwards.
joined_pr = pulls.merge(file_pr, on='pid').sort_values('date_x')
# Distinct users among the 10 most recent rows (latest dates are at the tail)
users_last_10 = joined_pr['user_x'].tail(n=10).unique()
# Display the result (shown when this is the last expression of a notebook cell)
users_last_10
%matplotlib inline
import seaborn as sns
# The developers we are interested in
author_names = ['xeno-by', 'soc']
# Rows of the merged table submitted by those developers. An explicit copy is
# taken so the column assignment below writes to an independent frame and not
# to a slice of `data` (which triggers SettingWithCopyWarning on pandas
# versions without copy-on-write).
authors = data[data['user'].isin(author_names)].copy()
authors['year'] = authors['date'].dt.year
# Count the non-null 'file' entries per (year, user) in long format
counts = authors.groupby(['year', 'user']).agg({'file': 'count'}).reset_index()
# Reshape to one row per year and one column per user; missing pairs become 0
counts_wide = counts.pivot_table(index='year', columns='user', values='file', fill_value=0)
print(counts_wide)
# Bar chart with the years on the x axis and one bar per user
counts_wide.plot(kind='bar')
[26]
# Developers and source file under comparison
authors = ['xeno-by', 'soc']
file = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'
# Rows of the merged table submitted by one of the listed developers
is_author = data['user'].isin(authors)
by_author = data.loc[is_author]
# Of those, the rows that refer to the file of interest
on_target_file = by_author['file'] == file
by_file = by_author.loc[on_target_file]
# Non-null 'pid' count per (user, year); the year Series keeps the name
# 'date', which becomes the column name after reset_index()
pr_year = by_file['date'].dt.year
grouped = by_file.groupby(['user', pr_year])['pid'].count().reset_index()
# One row per year, one column per user, zero where a user has no rows
by_file_wide = grouped.pivot_table(values='pid', index='date', columns='user', fill_value=0)
# Bar chart of the wide table
by_file_wide.plot.bar()
# Rows of the merged table submitted by either of the two users
authors = data[data['user'].isin(['xeno-by', 'soc'])]
print(authors)