Skip to content
Portfolio Project 2 - NBA Scoring vs Draft Position
DataFrame as
df
variable
SELECT * FROM 'all_seasons.csv';
# List the names of the columns in the DataFrame and print them
column_names = df.columns.tolist()
print("Column Names:")
for column_name in column_names:
    print(column_name)

# Round the values in the columns at zero-based positions 6 and 7 to the
# nearest whole number (the original note describes these as height and weight).
# NOTE(review): this is a positional selection -- confirm that positions 6 and 7
# really are the height and weight columns; selecting by name would be safer.
df.iloc[:, 6:8] = df.iloc[:, 6:8].round()

# Print the DataFrame after rounding
print(df)

# New DataFrame where Games Played is at least 65 AND Points per game is at least 15.0
import pandas as pd

# NOTE(review): re-reading the CSV rebinds `df`, so any in-memory changes made
# to `df` earlier in the file (e.g. the rounding step) are not carried over.
df = pd.read_csv('all_seasons.csv')

# Keep rows with at least 65 games played and at least 15.0 points per game.
# .copy() makes data_gppt an independent DataFrame, so the column assignments
# below modify data_gppt itself rather than a slice of df (this avoids
# pandas' SettingWithCopyWarning and its ambiguous view-vs-copy behaviour).
data_gppt = df[(df['gp'] >= 65) & (df['pts'] >= 15.0)].copy()
data_gppt  # notebook cell output: display the filtered frame

# Convert string value in draft position (Undrafted) to 0 to make the column numerical
data_gppt['draft_number'] = data_gppt['draft_number'].replace('Undrafted', '0')
data_gppt['draft_number'] = pd.to_numeric(data_gppt['draft_number'])
# Create a k-means clustering model with 5 clusters and fit it on the two
# columns used as the graph axes (true shooting percentage and points per game)
import pandas as pd
from sklearn.cluster import KMeans

kmeans = KMeans(
    n_clusters=5,
    init='random',
    random_state=0,  # fixed seed so the clustering is reproducible
)
kmeans.fit(data_gppt[['ts_pct', 'pts']])

# Add a new 'Cluster' column to the DataFrame holding the k-means label of each row.
# NOTE: 'Cluster' is reassigned from draft_number further down in this file, so
# these k-means labels do not survive into the final DataFrame or the plot.
data_gppt['Cluster'] = kmeans.labels_
def assign_cluster(draft_number):
    """Return the draft-tier code for a numeric draft position.

    Codes:
        0 -- picks 1-5
        1 -- picks 6-14
        2 -- picks 15-30
        3 -- picks 31 and on
        4 -- undrafted (encoded as draft_number == 0)

    Any other value (negative, fractional between tiers, NaN) yields the
    string 'Unknown'.
    """
    if draft_number == 0:
        return 4
    if 1 <= draft_number <= 5:
        return 0
    if 6 <= draft_number <= 14:
        return 1
    if 15 <= draft_number <= 30:
        return 2
    # Inclusive bound: pick 31 is the first pick of the "31 and on" tier.
    # (A strict `> 31` would drop pick 31 into 'Unknown'.)
    if draft_number >= 31:
        return 3
    return 'Unknown'
# Fill the 'Cluster' column with the draft-tier code computed from each row's draft_number
draft_positions = data_gppt['draft_number']
data_gppt['Cluster'] = draft_positions.apply(assign_cluster)
def assign_cluster_name(cluster_label):
    """Return the display name for a draft-tier code (0-4).

    Any label that does not equal one of the codes 0-4 maps to 'Other'.
    """
    # Position in the tuple is the tier code.
    tier_names = (
        'Top 5 Pick (Picks 1-5)',
        'End of Lottery (Picks 6-14)',
        'End of First Rounder (Picks 15-30)',
        'Second Rounder (Picks 31 and On)',
        'Undrafted',
    )
    for tier_code, tier_name in enumerate(tier_names):
        if cluster_label == tier_code:
            return tier_name
    return 'Other'  # special name for out-of-range picks
# Translate each row's draft-tier code in 'Cluster' into its display name
tier_codes = data_gppt['Cluster']
data_gppt['Cluster Name'] = tier_codes.apply(assign_cluster_name)

# Show the DataFrame, now including the cluster labels and names
print(data_gppt)

import matplotlib.pyplot as plt
# Define the columns for the x and y axes
xAxis = 'pts'
yAxis = 'ts_pct'

# Create the scatter plot
plt.figure(figsize=(8, 6))

# One scatter series per draft-tier code stored in the 'Cluster' column.
# Legend labels use the same wording as the 'Cluster Name' values so the
# printed table and the chart describe each tier identically.
for cluster_label, color, label in [
    (0, 'red', 'Top 5 Pick (Picks 1-5)'),
    (1, 'blue', 'End of Lottery (Picks 6-14)'),
    (2, 'green', 'End of First Rounder (Picks 15-30)'),
    (3, 'yellow', 'Second Rounder (Picks 31 and On)'),
    (4, 'black', 'Undrafted'),
]:
    cluster_data = data_gppt[data_gppt['Cluster'] == cluster_label]
    plt.scatter(cluster_data[xAxis], cluster_data[yAxis], color=color, label=label)

# Add horizontal reference line at y=0.6
plt.axhline(y=0.6, color='grey', linestyle='--')
# Add vertical reference line at x=25
plt.axvline(x=25, color='grey', linestyle='--')

# Label the axes, add the legend and show the scatter plot
plt.xlabel('Points per Game')
plt.ylabel('True Shooting Percentage')
plt.legend()
plt.show()

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the data
df = pd.read_csv('all_seasons.csv')

# Keep rows with at least 65 games played and at least 15.0 points per game.
# .copy() gives an independent DataFrame so the column assignments below do
# not write into a slice of df (avoids SettingWithCopyWarning).
data_gppt = df[(df['gp'] >= 65) & (df['pts'] >= 15.0)].copy()

# Convert 'Undrafted' to 0 in the draft_number column so the column is numeric
data_gppt['draft_number'] = pd.to_numeric(
    data_gppt['draft_number'].replace('Undrafted', '0')
)

# Create bins for draft positions.
# pd.cut's default right-closed intervals are used, so the edges below mean:
#   (-1, 0]   -> 'Undrafted' (the 0 sentinel set above)
#   (0, 5]    -> 'Top 5'     (picks 1-5)
#   (5, 14]   -> '6-14'
#   (14, 30]  -> '15-30'
#   (30, inf) -> '31+'
# Left-closed bins starting at 0 would instead put the undrafted sentinel in
# 'Top 5' and shift every pick boundary by one.
bins = [-1, 0, 5, 14, 30, float('inf')]
labels = ['Undrafted', 'Top 5', '6-14', '15-30', '31+']
data_gppt['Draft Group'] = pd.cut(data_gppt['draft_number'], bins=bins, labels=labels)

# Left-to-right order of the boxes: best picks first, undrafted last
group_order = ['Top 5', '6-14', '15-30', '31+', 'Undrafted']

# Create the box plot
plt.figure(figsize=(10, 6))
sns.boxplot(x='Draft Group', y='pts', data=data_gppt, order=group_order)
plt.xlabel('Draft Position Group')
plt.ylabel('Points per Game')
plt.title('Distribution of Points per Game by Draft Position Group')
plt.show()