# Preprocessing for Machine Learning

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the volunteer opportunities dataset from its CSV file
volunteer = pd.read_csv('datasets/volunteer_opportunities.csv')

# Inspect the first rows of the data
print(volunteer.head())

# Summarise the columns. DataFrame.info() writes its report to stdout
# itself and returns None, so it is called directly; wrapping it in
# print() would emit a stray "None" line after the report.
volunteer.info()

# Descriptive statistics of the dataset
print(volunteer.describe())

# Count the missing values in the locality column
print(volunteer['locality'].isna().sum())

# Remove the Latitude and Longitude columns; the result is a new
# DataFrame and `volunteer` itself is left untouched
volunteer_cols = volunteer.drop(columns=["Latitude", "Longitude"])

# Keep only the rows that actually have a category_desc value
has_category = volunteer_cols["category_desc"].notna()
volunteer_subset = volunteer_cols[has_category]

# Report (rows, columns) of the filtered data
print(volunteer_subset.shape)

# Show the first five entries of the hits column before converting it
hits_column = volunteer["hits"]
print(hits_column.iloc[:5])

# Store hits back on the DataFrame as integers
volunteer["hits"] = hits_column.astype(int)

# Show the dtype of every column after the conversion
print(volunteer.dtypes)

# Show how many rows fall into each category_desc value
category_counts = volunteer["category_desc"].value_counts()
print(category_counts)

# Code after preprocessing: build a stratified train/test split for
# predicting category_desc.
#
# Rows without a category_desc cannot be stratified on (scikit-learn rejects
# missing values in the stratification labels), so they are excluded before
# the features and labels are built. All columns of `volunteer` are kept.
volunteer_labelled = volunteer.dropna(subset=["category_desc"])

# Feature set: every column except category_desc
X = volunteer_labelled.drop("category_desc", axis=1)

# Labels, kept as a one-column DataFrame so the column can be selected by name
y = volunteer_labelled[["category_desc"]]

# Stratify on y so the split follows the class proportions of the labels;
# the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Print the category_desc counts from y_train
print(y_train['category_desc'].value_counts())