The GitHub History of the Scala Language

# Importing pandas
import pandas as pd

# Load the pull-request records (stored in two CSVs, split by date range
# according to the file names) and the table of files touched per pull request.
pulls_one = pd.read_csv('datasets/pulls_2011-2013.csv')
pulls_two = pd.read_csv('datasets/pulls_2014-2018.csv')
pull_files = pd.read_csv('datasets/pull_files.csv')

# Stack the two pull-request tables, pulls_one rows first.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement and, like append, keeps each
# frame's original row labels.
pulls = pd.concat([pulls_one, pulls_two])
print(pulls)

# Parse the 'date' column into timezone-aware (UTC) timestamps
pulls['date'] = pd.to_datetime(pulls['date'], utc=True)

# Join each pull request to the files it changed, matching rows on 'pid'
data = pulls.merge(pull_files, on='pid')
print(data)

[16]


# Derive the calendar month and year of every row from its timestamp
data['month'], data['year'] = data['date'].dt.month, data['date'].dt.year

# Number of non-null 'file' entries for each (month, year) pair
counts = data.groupby(['month', 'year'])['file'].count()

# Draw the per-(month, year) counts as a wide bar chart
counts.plot.bar(figsize=(12, 4))

[17]

# Preview the first five rows of the merged table
data.head(n=5)

[18]


# Count the non-null 'file' entries contributed by each user
by_user = data['file'].groupby(data['user']).count()

# Show how those per-user counts are distributed
by_user.plot.hist()

# Order pull requests newest-first and keep the top ten
last_10 = pulls.sort_values('date', ascending=False).iloc[:10]
# Attach the files touched by each of those pull requests
joined_pr = pd.merge(last_10, pull_files, on='pid')
# Collapse to the distinct file paths
files = joined_pr['file'].unique()
# Show the distinct files
print(files)

# Inspect the first five pull-request rows
print(pulls.head(n=5))

# Inspect the pull-request/file table
print(pull_files)

# Path of the file under investigation
file = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'

# Keep only the rows of the merged table that refer to that file
file_pr = data.loc[data['file'] == file]

# Per-user count of non-null values in every remaining column
author_counts = file_pr.groupby('user').count()

# Sort ascending by the 'file' count and show the final three rows,
# i.e. the three users with the highest counts
print(author_counts.sort_values(by='file').iloc[-3:])

# Rows of the merged table that touch Calculate.scala
p = data.loc[data['file'] == 'src/compiler/scala/reflect/reify/phases/Calculate.scala']
# Per-user non-null counts for every column of that subset
g = p.groupby('user').count()
print(g)

[24]

# Path of the file under investigation
file = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'

# Select the pull requests that changed the target file
file_pr = data[data['file'] == file]

# Merge the obtained results with the pulls DataFrame. Columns present in
# both frames are suffixed, so the pulls-side values are 'user_x'/'date_x'.
# merge()'s `sort` argument is a boolean flag that orders the result by the
# join key ('pid'); it does not accept a column to sort by. Order the rows
# by the pull-request date explicitly so that tail() really yields the most
# recent ones (stable sort keeps the merge order among equal timestamps).
joined_pr = pulls.merge(file_pr, on='pid').sort_values('date_x', kind='stable')

# Find the users of the last 10 most recent pull requests
users_last_10 = joined_pr['user_x'].tail(n=10).unique()

# Display the result (notebook cell output)
users_last_10

%matplotlib inline
import seaborn as sns
# The developers we are interested in
author_names = ['xeno-by', 'soc']

# Get all the developers' pull requests.
# .copy() makes the selection an independent DataFrame, so adding a column
# below does not raise pandas' SettingWithCopyWarning (assignment on a
# filtered slice of `data`).
authors = data[data['user'].isin(author_names)].copy()
authors['year'] = authors['date'].dt.year

# Count the non-null 'file' entries per (year, user) in long format
counts = authors.groupby(['year', 'user']).agg({'file': 'count'}).reset_index()

# Reshape to one row per year and one column per user; absent pairs become 0
counts_wide = counts.pivot_table(index='year', columns='user', values='file', fill_value=0)
print(counts_wide)

# Plot the yearly counts side by side for the two developers
counts_wide.plot(kind='bar')

[26]

# Developers and file under investigation
authors = ['xeno-by', 'soc']
file = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'

# Rows of the merged table submitted by one of the selected developers
by_author = data.loc[data['user'].isin(authors)]

# Of those, the rows that refer to the target file
by_file = by_author.loc[by_author['file'] == file]

# Non-null 'pid' count per (user, year of the row's date), as a flat table
# with columns 'user', 'date' (the year) and 'pid'
grouped = by_file.groupby(['user', by_file['date'].dt.year])['pid'].count().reset_index()

# One row per year, one column per user; missing combinations filled with 0
by_file_wide = pd.pivot_table(grouped, index='date', columns='user', values='pid', fill_value=0)

# Bar chart of the yearly counts
by_file_wide.plot.bar()

# Rows of the merged table whose submitter is 'xeno-by' or 'soc'
authors = data[data['user'].isin(['xeno-by', 'soc'])]
print(authors)

‌
‌
‌