# Import libraries and methods/functions
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Start your code here!
# Load the two datasets into DataFrames
df1 = pd.read_csv('telecom_demographics.csv')
df2 = pd.read_csv('telecom_usage.csv')
# Merge the DataFrames on the shared customer_id key
churn_df = df1.merge(df2, how='inner', on='customer_id')
churn_df.head()
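# (Optional sanity check) compare row counts before and after the merge; with an inner join
# on customer_id, rows are kept only for customers present in both files (assumption about the data).
print(df1.shape, df2.shape, churn_df.shape)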
# Churn rate: proportion of each class in the `churn` column
churn_rate = churn_df['churn'].value_counts() / len(churn_df)
print("Churn Rate:\n", churn_rate)
churn_df.info()
# One-hot encode the categorical features with pd.get_dummies
cat_var = ['telecom_partner', 'gender', 'state', 'city', 'registration_event']
churn_df = pd.get_dummies(churn_df, columns=cat_var)
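# (Optional) high-cardinality columns such as city and registration_event can add many dummy
# columns; printing the shape after encoding is a quick way to see how wide the data became.
print("Shape after encoding:", churn_df.shape)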
# Scale the features with StandardScaler
scaler = StandardScaler()
# Separate the feature matrix and target; drop the identifier and label columns before scaling
features = churn_df.drop(['customer_id', 'churn'], axis=1)
features_scaled = scaler.fit_transform(features)
target = churn_df['churn']
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)
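# Note: the scaler above is fit on the full feature matrix before the split, so test-set
# statistics leak into the scaling. A leakage-free variant is sketched below; the *_alt
# names are illustrative only and are not used by the rest of the workflow.
X_train_alt, X_test_alt, y_train_alt, y_test_alt = train_test_split(
    features, target, test_size=0.2, random_state=42)
scaler_alt = StandardScaler().fit(X_train_alt)
X_train_alt = scaler_alt.transform(X_train_alt)
X_test_alt = scaler_alt.transform(X_test_alt)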
## Initialize and fit the Logistic Regression model
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)
## Initialize and fit the Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
# Logistic Regression evaluation
print(confusion_matrix(y_test, logreg_pred))
print(classification_report(y_test, logreg_pred))
# Random Forest evaluation
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))
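# Supplementary check (not part of the original output): compute each model's accuracy
# explicitly to back the comparison below; accuracy_score is standard scikit-learn.
from sklearn.metrics import accuracy_score
print("Logistic Regression accuracy:", accuracy_score(y_test, logreg_pred))
print("Random Forest accuracy:", accuracy_score(y_test, rf_pred))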
## Which accuracy score is higher? Logistic Regression or RandomForest?
higher_accuracy = "RandomForest"