# Skip to content
# Regression analysis (one explanatory variable and a categorical variable)
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols

# Read dataframe taiwan from CSV file
# Load the dataset from 'taiwan_real_estate.csv' (path is relative to the
# current working directory).
df = pd.read_csv('taiwan_real_estate.csv')
# Preview the first rows. `head` has to be called: printing the bare
# attribute shows the bound-method object rather than the leading rows.
print(df.head())

# Visualize dataset
# `price_twd_msq` against `dist_to_mrt_m`: blue scatter points with a red
# fitted regression line, drawn on a freshly created figure.
fig, ax = plt.subplots()
sns.regplot(
    data=df,
    x='dist_to_mrt_m',
    y='price_twd_msq',
    color='blue',
    line_kws={'color': 'red'},
)
ax.set_title('Price vs Distance to MRT')
# `price_twd_msq` against `n_convenience`: green scatter points with a black
# fitted regression line, drawn on its own figure.
fig, ax = plt.subplots()
sns.regplot(
    x='n_convenience',
    y='price_twd_msq',
    data=df,
    color='green',
    line_kws={"color": "black"},
    ax=ax,  # draw explicitly on the axes created above
)
# The title names the two variables that are actually plotted.
ax.set_title('Price vs Number of Convenience Stores')

# Re-imports carried over from the original notebook cell; both names are
# already in scope from the imports at the top of the file.
import matplotlib.pyplot as plt
import seaborn as sns
# Histogram of `house_age_years` on its own figure.
fig, ax = plt.subplots()
sns.histplot(x='house_age_years', data=df, color='purple', ax=ax)
# The title must be set before plt.show(); a title assigned after the figure
# has been displayed is not part of what was shown.
ax.set_title('Distribution of House Age')
# Display every figure created so far.
plt.show()

# Creating the model using ols (1 dependent and 1 independent variable)
# Fit an ordinary-least-squares model with `price_twd_msq` as the response
# and `n_convenience` as the single explanatory variable.
model1 = ols('price_twd_msq ~ n_convenience', data=df).fit()
# Print the fitted coefficients of the model that was just estimated.
print(model1.params)

# test model with same dataset for n_convenience vs price
# In-sample predictions: the fitted values of model1 for the rows it was
# estimated on.
predicted_price1 = model1.fittedvalues
# Keep the predictions next to the observed prices in the same dataframe.
df['new_price'] = predicted_price1
# Show predicted and observed price side by side.
print(df.loc[:, ['new_price', 'price_twd_msq']])

# plot new vs actual price
# Draw predicted and observed price against `n_convenience` as two labelled
# lines; both calls target the current axes, so the lines share one plot.
sns.lineplot(x='n_convenience', y='new_price', data=df, label='New Price')
sns.lineplot(x='n_convenience', y='price_twd_msq', data=df, label='actual Price')
# Without an explicit show() this figure is never displayed when the file is
# run as a script.
plt.show()

# Quantify model fit
# Goodness-of-fit figures for model1: collect them first, then report them.
resid1 = model1.resid            # residuals of the fitted model
resid1_error = np.std(resid1)    # standard deviation of the residuals (not their mean)
r_squared = model1.rsquared      # R-squared reported by the fitted results
mse = model1.mse_resid           # residual mean squared error reported by the fitted results

print(f'resid1_error: {resid1_error}')
print(f'R-squared: {r_squared}')
print(f'Mean Squared Error (MSE): {mse}')