# Sample Practical Exam: Coffee Shops

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the coffee-shop dataset and take a first look at its structure.
coffee = pd.read_csv('coffee.csv')
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would only add a stray "None" line.
coffee.info()

# Frequency of every level in each of the listed categorical columns.
cat = ['Region', 'Place type', 'Price', 'Delivery option', 'Dine in option', 'Takeout option']
for column in cat:
    print(coffee[column].value_counts())

# Outside a notebook a bare expression is evaluated and then discarded, so the
# summary statistics and the per-column missing-value counts are printed
# explicitly.
print(coffee.describe())
print(coffee.isna().sum())

# Numeric columns: a missing rating becomes 0, a missing review count becomes
# the median of the observed review counts (Series.median skips NaN).
coffee['Rating'] = coffee['Rating'].fillna(0)
median = coffee['Reviews'].median()
coffee['Reviews'] = coffee['Reviews'].fillna(median)
coffee.info()

# Service-option flags: a missing value is read as "not offered" (False),
# after which the column is stored as a real boolean.
for option in ('Dine in option', 'Takeout option'):
    coffee[option] = coffee[option].fillna(False).astype('bool')
coffee.info()

# Graph 1: number of rows at each rating value.
rating_ax = sns.countplot(x='Rating', data=coffee, color='gray')
rating_ax.set_title('Graph 1 - The count of rating')
plt.show()

# Graph 2.1: histogram of the raw review counts.
sns.histplot(data=coffee, x='Reviews', color='gray')
# Title typo fixed ("distributipn" -> "distribution"); this text is shown on
# the rendered figure.
plt.title('Graph 2.1 - The distribution of number of reviews')
plt.show()

# Graph 2.2: the same histogram on a natural-log scale.
# NOTE(review): np.log gives -inf for a review count of 0 -- confirm the data
# contains no zero counts.
log_reviews = np.log(coffee['Reviews'])
log_ax = sns.histplot(x=log_reviews, color='gray')
log_ax.set_title('Graph 2.2 - The distribution of number of reviews (log transformation)')
plt.show()

# Graph 3.1: review counts grouped by rating, using every row.
box_all = sns.boxplot(x='Rating', y='Reviews', data=coffee, color='gray')
box_all.set_title('Graph 3.1 - Relationship between Rating and Reviews')
plt.show()

# Flag the row(s) holding the maximum review count and build a copy of the
# data without them; `coffee` itself is left untouched here.
outlier = coffee['Reviews'].eq(coffee['Reviews'].max())
coffee_no_outlier = coffee.loc[~outlier]

# Graph 3.2: the same box plot drawn from the filtered data.
box_trimmed = sns.boxplot(x='Rating', y='Reviews', data=coffee_no_outlier, color='gray')
box_trimmed.set_title('Graph 3.2 - Relationship between Rating and Reviews after removing outlier')
plt.show()

# Prepare the modelling frame: remove the 'Place name' column, then remove
# the row(s) whose review count equals the column maximum.
coffee = coffee.drop(columns=['Place name'])
outlier = coffee['Reviews'].eq(coffee['Reviews'].max())
coffee = coffee.loc[~outlier]

from sklearn import preprocessing
# Predictors: every column except 'Reviews', with the listed columns expanded
# into one indicator column per level.
encoded_columns = [
    'Place type',
    'Price',
    'Region',
    'Delivery option',
    'Dine in option',
    'Takeout option',
]
features = coffee.drop(columns=['Reviews'])
X = pd.get_dummies(data=features, columns=encoded_columns)

# Target: natural log of the review count.
y = np.log(coffee['Reviews'])

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

‌
‌
‌