Skip to content

Make Predictions with Linear Regression

This recipe shows how to perform linear regression on your data. You can either experiment with the provided Boston housing data (source) or load your own data and adjust input_cols and output_col accordingly. The regression itself uses the LinearRegression estimator from the scikit-learn package.

# Load packages
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "darkgrid" theme with a light grey (".9") axes background
sns.set_style("darkgrid", {"axes.facecolor": ".9"})
# Load data from the csv file
df = pd.read_csv("housing_data.csv")
# Preview the first rows.
# NOTE(review): this is a bare expression, so it is only rendered when it is
# the last statement of a notebook cell; in a plain script it shows nothing.
df.head()
# Understand the variables
# Raise the display limit to 100 characters per column -- presumably so the
# variable explanations read below are not cut off (confirm against the CSV).
pd.options.display.max_colwidth = 100
# Read the explanation table, using the first CSV column as the index.
# The result is not assigned; like df.head() it relies on notebook cell output.
pd.read_csv('variable_explanation.csv', index_col=0)
# Choose the feature columns (X) and the target column (y).
# Adapt both lists when working with your own data.
input_cols = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM',
    'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT',
]
output_col = ['PRICE']
# output_col is a list, so y is a one-column DataFrame rather than a Series
X, y = df[input_cols], df[output_col]

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=44,
)
# Plot the target against every feature, spread over two figures so that
# the individual panels stay readable
half = len(input_cols) // 2
first_features = input_cols[:half]
remaining_features = input_cols[half:]

fig1 = sns.pairplot(df, x_vars=first_features, y_vars=output_col)

fig2 = sns.pairplot(df, x_vars=remaining_features, y_vars=output_col)
# Function to flatten 2D lists so it can be used by plotly
def flatten(l):
    """Return one flat list holding the items of every sub-sequence in ``l``.

    Items keep their original order and only a single level of nesting
    is removed.
    """
    flat = []
    for row in l:
        flat.extend(row)
    return flat

# Create the linear regressor and fit it on the training split
# (fit() returns the fitted estimator itself, so the calls can be chained)
lin_reg = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows, then flatten both the 2D predictions and the
# 2D target values into plain lists
test_predictions = lin_reg.predict(X_test)
predicted = flatten(test_predictions)
expected = flatten(y_test.values)
%matplotlib inline
# Import plotting package
import plotly.express as px

# Put data to plot in dataframe
df_plot = pd.DataFrame({'expected': expected, 'predicted': predicted})

# Make scatter plot from data
fig = px.scatter(
    df_plot,
    x='expected',
    y='predicted',
    title='Predicted vs. Actual Values')

# Endpoints of the "perfect model" diagonal (predicted == expected).
# They are taken from the plotted values instead of a fixed 0-50 range, so the
# line also covers the points when a data set with a different scale is used.
line_start = float(df_plot.min().min())
line_end = float(df_plot.max().max())

# Add straight line indicating perfect model
fig.add_shape(type="line",
    x0=line_start, y0=line_start, x1=line_end, y1=line_end,
    line=dict(
        color="Red",
        width=4,
        dash="dot",
    )
)

# Show figure
fig.show()
# Root mean square error: square root of the mean squared difference between
# the predictions and the true values
residuals = np.array(predicted) - np.array(expected)
error = np.sqrt(np.mean(residuals ** 2))
print(f"RMS: {error:.4f} ")

# Coefficient of determination (R^2) of the predictions on the test split
r2 = r2_score(expected, predicted)
print(f"R2: {round(r2,4)}")