Skip to content
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols

Read the Taiwan real-estate dataframe from a CSV file

# Load the dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv('taiwan_real_estate.csv')
# Preview the first rows. head is a method and must be *called*: printing the
# bare attribute emits the bound-method repr rather than a short preview.
print(df.head())

Visualize dataset


# Open a new figure with a single set of axes for this chart
fig, ax = plt.subplots()

# Blue scatter of price_twd_msq against dist_to_mrt_m, with the regression
# line that seaborn fits drawn in red
sns.regplot(
    data=df,
    x='dist_to_mrt_m',
    y='price_twd_msq',
    color='blue',
    line_kws=dict(color='red'),
)

# Label the chart
ax.set_title('Price vs Distance to MRT')

# Create a figure and axis for the n_convenience chart
fig, ax = plt.subplots()

# Green scatter of price_twd_msq against n_convenience with a black
# regression line; ax is passed explicitly so the plot cannot land on
# some other "current" axes.
sns.regplot(x='n_convenience', y='price_twd_msq', data=df, color='green',
            line_kws={"color": "black"}, ax=ax)

# The title must describe what is plotted here (price against n_convenience),
# not distance to MRT.
ax.set_title('Price vs Number of Convenience Stores')
import matplotlib.pyplot as plt
import seaborn as sns

# Create a figure and axis for the histogram
fig, ax = plt.subplots()

# Histogram of the house_age_years column, drawn on the axes created above
sns.histplot(x='house_age_years', data=df, color='purple', ax=ax)

# Set the title BEFORE plt.show(): a title applied after the figure has been
# shown does not appear in the displayed output. The text also has to describe
# this chart (a house-age histogram), not the MRT-distance scatter.
ax.set_title('Distribution of House Age (years)')

# Render every figure opened so far
plt.show()

Creating the model using ols (1 dependent and 1 independent variable)

# Fit an ordinary-least-squares model with price_twd_msq as the response and
# n_convenience as the single explanatory variable
model1 = ols('price_twd_msq ~ n_convenience', data=df).fit()
# Show the fitted coefficients. This must reference model1: the name
# N_CvsPrice is never defined and raised a NameError here.
print(model1.params)

# In-sample predictions: the fitted values for the same rows used to fit
predicted_price1 = model1.fittedvalues
# Store the predictions next to the observed prices (assignment aligns on index)
df['new_price'] = predicted_price1
print(df[['new_price', 'price_twd_msq']])

# Compare predicted and observed price across n_convenience. seaborn's lineplot
# aggregates rows that share an x value, so the observed-price line is a
# per-x summary rather than every individual point.
sns.lineplot(x='n_convenience', y='new_price', data=df, label='New Price')
sns.lineplot(x='n_convenience', y='price_twd_msq', data=df, label='actual Price')
# Display the comparison chart when run as a script, as the earlier section does
plt.show()

Quantify model fit

# --- Compute the fit statistics for model1 ---------------------------------
# Residuals of the fitted model
resid1 = model1.resid
# Spread of the residuals: their standard deviation, using NumPy's defaults
resid1_error = np.std(resid1)
# Coefficient of determination reported by the fitted model
r_squared = model1.rsquared
# Mean squared error of the residuals as reported by statsmodels
mse = model1.mse_resid

# --- Report them -----------------------------------------------------------
print(f'resid1_error: {resid1_error}')
print(f'R-squared: {r_squared}')
print(f'Mean Squared Error (MSE): {mse}')