# Import libraries and methods/functions
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Start your code here!
# Read the two customer tables from the working directory
telecom_demographics = pd.read_csv('telecom_demographics.csv')
telecom_usage = pd.read_csv('telecom_usage.csv')
# Join the tables on their shared 'customer_id' key. With pandas' default
# (inner) join, only customers present in both files are kept.
churn_df = telecom_demographics.merge(telecom_usage, on='customer_id')
# Mean of the 'churn' column; this is the share of churned customers
# provided the column is coded 0/1 -- TODO confirm against the data.
churn_rate = churn_df['churn'].mean()
print("Churn Rate:", churn_rate)
# Names of every object-dtype (text) column in the merged frame
categorical_variables = (
    churn_df.select_dtypes(include=['object']).columns.tolist()
)
print("Categorical Variables:", categorical_variables)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Separate features and target variable
features = churn_df.drop(columns=['customer_id', 'churn'])
target = churn_df['churn']
# One-hot encode the categorical columns. The list was derived from the full
# merged frame, so keep only the names that survived the drop above;
# get_dummies raises KeyError if asked to encode a column that is missing.
encode_columns = [col for col in categorical_variables if col in features.columns]
features_encoded = pd.get_dummies(features, columns=encode_columns)
# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the mean/variance applied to the training rows
# (data leakage), which makes the test accuracy optimistically biased.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features_encoded, target, test_size=0.2, random_state=42
)
# Learn the scaling statistics from the training partition only, then apply
# the same transformation to the held-out partition.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, kept for any later code that expects this name; it now
# uses the train-only statistics learned above.
features_scaled = scaler.transform(features_encoded)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Logistic Regression: fit on the training split, then predict the test split.
# (fit() returns the estimator itself, so it can be chained on the constructor.)
logreg = LogisticRegression(random_state=42).fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)
# Random Forest: same procedure, same seed for reproducibility
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)
from sklearn.metrics import accuracy_score
# Score each model's test-set predictions against the true labels
logreg_accuracy = accuracy_score(y_test, logreg_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)
# Record the name of the better model; Logistic Regression is chosen only
# when it is strictly more accurate, so a tie resolves to Random Forest.
if logreg_accuracy > rf_accuracy:
    higher_accuracy = "LogisticRegression"
else:
    higher_accuracy = "RandomForest"
print("Accuracy of Logistic Regression:", logreg_accuracy)
print("Accuracy of Random Forest Classifier:", rf_accuracy)
print("Model with higher accuracy:", higher_accuracy)