Portfolio Project 2 - NBA Scoring vs Draft Position

DataFrame as `df` variable (the result of the SQL query below is stored in `df`):

SELECT * FROM 'all_seasons.csv';

# Show every column name of the DataFrame, one per line, under a header
column_names = df.columns.tolist()
print("Column Names:")
for column_name in column_names:
    print(column_name)

# Round the two columns at 0-based positions 6 and 7 to the nearest whole number.
# NOTE(review): presumably these are height and weight — confirm against the
# actual column order of `df`, since the selection is purely positional.
height_weight = df.iloc[:, 6:8]
df.iloc[:, 6:8] = height_weight.round()

# Display the DataFrame with the rounded values
print(df)

# New DataFrame where Games Played is at least 65 AND Points per game is at least 15.0
import pandas as pd

df = pd.read_csv('all_seasons.csv')
# .copy() makes data_gppt an independent DataFrame, so the column assignments
# below modify it directly rather than a filtered slice of df (this avoids
# pandas' SettingWithCopyWarning and any ambiguity about what gets written).
data_gppt = df[(df['gp'] >= 65) & (df['pts'] >= 15.0)].copy()
# Notebook-style display of the filtered frame; has no effect when run as a script
data_gppt

# Convert the string value 'Undrafted' in draft position to 0 so the whole
# column can be parsed as numbers (0 is the sentinel for undrafted players)
data_gppt['draft_number'] = pd.to_numeric(
    data_gppt['draft_number'].replace('Undrafted', '0')
)

# Build a k-means model with 5 clusters (random initialisation, fixed seed) and
# fit it on the two columns used as the axes of the scatter plot
import pandas as pd
from sklearn.cluster import KMeans

cluster_features = ['ts_pct', 'pts']
kmeans = KMeans(n_clusters=5, init='random', random_state=0)
kmeans.fit(data_gppt[cluster_features])

# Add a new column to the DataFrame and define names for the clusters based on Draft Position
# NOTE(review): the k-means labels stored here are overwritten further down,
# where 'Cluster' is reassigned from draft_number via assign_cluster — so the
# fitted model does not influence the final grouping or the scatter plot.
data_gppt['Cluster'] = kmeans.labels_

def assign_cluster(draft_number):
    """Map a numeric draft position to a draft-tier cluster label.

    Args:
        draft_number: Draft pick as a number, where 0 is the sentinel used
            for undrafted players.

    Returns:
        0 for picks 1-5, 1 for picks 6-14, 2 for picks 15-30, 3 for picks 31
        and later, 4 for undrafted (0), and the string 'Unknown' for any
        other value (negative, NaN, or a fractional value between tiers).
    """
    if draft_number == 0:
        return 4
    if 1 <= draft_number <= 5:
        return 0
    if 6 <= draft_number <= 14:
        return 1
    if 15 <= draft_number <= 30:
        return 2
    # Inclusive bound: pick 31 is the first pick of the "31 and on" tier.
    if draft_number >= 31:
        return 3
    return 'Unknown'

# Derive the 'Cluster' column by running assign_cluster over every draft position
draft_positions = data_gppt['draft_number']
data_gppt['Cluster'] = draft_positions.apply(assign_cluster)

def assign_cluster_name(cluster_label):
    """Return the display name for a draft-tier cluster label.

    Args:
        cluster_label: Cluster label; values equal to 0 through 4 have names.

    Returns:
        The tier name for labels 0-4, or 'Other' for any other value.
    """
    # Position in this tuple is the cluster label the name belongs to.
    tier_names = (
        'Top 5 Pick (Picks 1-5)',
        'End of Lottery (Picks 6-14)',
        'End of First Rounder (Picks 15-30)',
        'Second Rounder (Picks 31 and On)',
        'Undrafted',
    )
    for label, tier_name in enumerate(tier_names):
        if cluster_label == label:
            return tier_name
    # Anything that is not a known label gets a catch-all name
    return 'Other'
    
# Translate each cluster label into its draft-position tier name
cluster_labels = data_gppt['Cluster']
data_gppt['Cluster Name'] = cluster_labels.apply(assign_cluster_name)

# Show the DataFrame, now including the cluster labels and names
print(data_gppt)

import matplotlib.pyplot as plt

# Columns plotted on the x and y axes
xAxis = 'pts'
yAxis = 'ts_pct'

# Create the scatter plot
plt.figure(figsize=(8, 6))

# One scatter series per draft tier. The legend labels are kept identical to
# the names produced by assign_cluster_name so the plot and the
# 'Cluster Name' column describe each tier the same way.
for cluster_label, color, label in [
    (0, 'red', 'Top 5 Pick (Picks 1-5)'),
    (1, 'blue', 'End of Lottery (Picks 6-14)'),
    (2, 'green', 'End of First Rounder (Picks 15-30)'),
    (3, 'yellow', 'Second Rounder (Picks 31 and On)'),
    (4, 'black', 'Undrafted'),
]:
    cluster_data = data_gppt[data_gppt['Cluster'] == cluster_label]
    plt.scatter(cluster_data[xAxis], cluster_data[yAxis], color=color, label=label)

# Reference lines splitting the plot into quadrants:
# horizontal at a true shooting percentage of 0.6 ...
plt.axhline(y=0.6, color='grey', linestyle='--')

# ... and vertical at 25 points per game
plt.axvline(x=25, color='grey', linestyle='--')

# Label the axes, add a title and legend, then render the scatter plot
plt.xlabel('Points per Game')
plt.ylabel('True Shooting Percentage')
plt.title('True Shooting Percentage vs Points per Game by Draft Position')
plt.legend()
plt.show()

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv('all_seasons.csv')

# Keep seasons with at least 65 games played and at least 15.0 points per game.
# .copy() makes data_gppt independent of df so the column assignments below
# write to it directly (avoids pandas' SettingWithCopyWarning).
data_gppt = df[(df['gp'] >= 65) & (df['pts'] >= 15.0)].copy()

# Convert 'Undrafted' to 0 in the draft_number column, then parse it as numbers
data_gppt['draft_number'] = pd.to_numeric(
    data_gppt['draft_number'].replace('Undrafted', '0')
)

# Create bins for draft positions. Edges are right-inclusive (pd.cut default):
#   (-1, 0]   -> Undrafted (the 0 sentinel set above)
#   (0, 5]    -> picks 1-5
#   (5, 14]   -> picks 6-14
#   (14, 30]  -> picks 15-30
#   (30, inf] -> pick 31 and later
# Giving the sentinel its own bin keeps undrafted players out of 'Top 5', and
# the inclusive right edges keep picks 5, 14 and 30 in their intended groups.
bins = [-1, 0, 5, 14, 30, float('inf')]
labels = ['Undrafted', 'Top 5', '6-14', '15-30', '31+']
data_gppt['Draft Group'] = pd.cut(data_gppt['draft_number'], bins=bins, labels=labels)

# Show drafted groups from earliest to latest pick, with 'Undrafted' last
group_order = labels[1:] + labels[:1]

# Create the box plot
plt.figure(figsize=(10, 6))
sns.boxplot(x='Draft Group', y='pts', data=data_gppt, order=group_order)
plt.xlabel('Draft Position Group')
plt.ylabel('Points per Game')
plt.title('Distribution of Points per Game by Draft Position Group')
plt.show()