Advanced models and libraries
The problem:
Bob has launched a new mobile phone brand, aiming to compete with major industry players such as Apple and Samsung. However, determining the appropriate pricing for his mobile phones poses a challenge. In the highly competitive mobile phone market, making assumptions about pricing strategies can be risky. To address this issue, Bob has gathered sales data from various mobile phone manufacturers.
He seeks to uncover a correlation between the characteristics of a mobile phone (e.g., RAM, internal memory, etc.) and its selling price. Unfortunately, Bob lacks expertise in Machine Learning and requires assistance to tackle this problem.
The task at hand is not to predict the exact selling price of the mobile phones. Instead, the objective is to classify the mobile phones into different price ranges, which will indicate their relative price level. This classification approach will help Bob understand how the features of a mobile phone can influence its position in the market in terms of pricing.
The dataset contains the following features for each mobile phone:
battery_power (battery capacity)
blue (Bluetooth support)
clock_speed (processor clock speed)
dual_sim (dual SIM support)
fc (front camera megapixels)
four_g (4G support)
int_memory (internal memory in gigabytes)
m_dep (mobile depth in cm)
mobile_wt (mobile weight in grams)
n_cores (number of processor cores)
pc (primary camera megapixels)
px_height (pixel resolution height)
px_width (pixel resolution width)
ram (random access memory in megabytes)
sc_h (screen height in cm)
sc_w (screen width in cm)
talk_time (longest time a single battery charge lasts during constant talking)
three_g (3G support)
touch_screen (touch screen support)
wifi (Wi-Fi support)
price_range (the price category of the mobile, i.e., the target to predict)
Here are a few points to think about; a short R sketch addressing them follows the list:
Handling missing data: Examine the dataset for any missing values.
Outlier management: Reflect on your strategy for dealing with outliers identified within the dataset. Would eliminating them be appropriate, and if so, what rationale supports this decision?
Price range distribution: Analyze how mobile phone price ranges are distributed within the dataset. What does this distribution tell you about the dataset's composition?
Feature correlation with price range: Explore how features such as RAM, battery power, pixel height, and pixel width are related to mobile phone price ranges. What does the correlation between these features and the price range suggest?
Feature relationships exploration: Examine the relationships between certain features (e.g., the presence of 4G and 3G, dual SIM functionality, and battery power). Are there any associations?
Innovative feature creation: Think about the creation of new features, such as a total talk time per battery power or a weight-to-size ratio, that could enhance the prediction of mobile phone price ranges. How might these new features improve predictive accuracy?
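A minimal sketch of how these checks might look in R, assuming the column names from the dataset description above; the engineered columns talk_per_battery and px_area are illustrative names, not part of the original data:
library(tidyverse)
df <- read_csv("train.csv")
# Missing data: count NA values per column
colSums(is.na(df))
# Outliers: flag battery_power values outside 1.5 * IQR as one possible rule
q <- quantile(df$battery_power, c(0.25, 0.75))
iqr <- q[2] - q[1]
sum(df$battery_power < q[1] - 1.5 * iqr | df$battery_power > q[2] + 1.5 * iqr)
# Price range distribution: a balanced target makes accuracy a reasonable metric
table(df$price_range)
# Innovative features (hypothetical): talk time per unit of battery, and pixel area
df <- df %>%
  mutate(talk_per_battery = talk_time / battery_power,
         px_area = px_height * px_width)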
# Load necessary libraries
library(caret)
library(readr)
library(tidyverse)
# read the CSV file via the readr package
df <- read_csv("train.csv")
# check the data types
str(df)
# check the summary for basic statistics
summary(df)
# example of a bar plot
ggplot(df, aes(blue)) +
  geom_bar()
install.packages("corrr")
library(corrr)
install.packages("ggcorrplot")
library(ggcorrplot)
# Step 1: scale the data frame (correlations are unaffected by scaling, but it keeps the workflow consistent)
data_scale <- scale(df)
# Step 2: compute the correlation matrix
corr_matrix <- cor(data_scale)
# Step 3: plot the lower triangle of the correlation matrix
ggcorrplot(corr_matrix, type = "lower")
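The corrr package loaded above also offers a tidy way to zoom in on correlations with the target alone; a minimal sketch, run before price_range is converted to a factor:
correlate(df) %>%          # tidy correlation data frame
  focus(price_range) %>%   # keep only correlations with price_range
  arrange(desc(abs(price_range)))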
# Exploratory Data Analysis after the correlation matrix
# a scatter plot is not a good option here: price_range takes only four values, so points overplot
ggplot(df, aes(x = ram, y = price_range)) +
  geom_point()
# convert price_range to a factor
df$price_range <- as.factor(df$price_range)
str(df)
# with a categorical target, a boxplot is a better option
ggplot(df, aes(x = ram, y = price_range, color = price_range)) +
  geom_boxplot()
# the same view for battery power, colored by price range
ggplot(df, aes(x = battery_power, y = price_range, color = price_range)) +
  geom_boxplot()
# two bar plots, one over the other: front camera (red) vs primary camera (blue) megapixels
ggplot(df, aes(fc)) +
  geom_bar(fill = "red", alpha = 0.5) +
  geom_bar(aes(pc), fill = "blue", alpha = 0.4) +
  labs(title = "FC vs PC graph")
# price_range back to numeric for the density plots
# note: as.numeric() on a factor returns the level codes (1-4), not the original labels (0-3),
# so convert via as.character() first
df$price_range <- as.numeric(as.character(df$price_range))
# density plots
ggplot(df, aes(x = ram, y = price_range)) +
  geom_density_2d() +
  ylim(0, 5) +
  xlim(-100, 4500)
ggplot(df, aes(x = battery_power, y = price_range)) +
  geom_density_2d() +
  ylim(0, 5) +
  xlim(0, 2500)
# KNN
# caret library (already loaded above)
library(caret)
#str(df)
# convert price_range back to a factor for classification
df$price_range <- as.factor(df$price_range)
set.seed(123) # for reproducibility
################################################################################
# Split the dataset with the caret library
myIndex <- createDataPartition(df$price_range, p=0.8, list=FALSE)
trainSet <- df[myIndex, ]
testSet <- df[-myIndex, ]
################################################################################
set.seed(123) # for reproducibility
################################################################################
trControl <- trainControl(method = "cv", number = 10) # first parameter: 10-fold cross-validation
metric <- "Accuracy" # second parameter: evaluation metric
tuneGrid <- expand.grid(.k = seq(2, 20, by = 2)) # third parameter: candidate k values (even numbers from 2 to 20)
#tuneGrid <- expand.grid(.k=c(2:20)) # alternative: every k from 2 to 20
fit.knn <- train(price_range ~ ., data = trainSet, method = "knn",
                 trControl = trControl, metric = metric, tuneGrid = tuneGrid)
################################################################################
print(fit.knn)
plot(fit.knn)
################################################################################
# Make predictions and compute the confusion matrix
predictions <- predict(fit.knn, newdata = testSet)
confusionMatrix(predictions, testSet$price_range)
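For a quicker summary than the full confusion matrix, caret's postResample() reports test-set accuracy and kappa in one call; a small sketch:
postResample(pred = predictions, obs = testSet$price_range)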
################################################################################
Naive Bayes
library(caret)
library(readr)
library(tidyverse)
library(klaR)
df_new <- read_csv("train.csv")
df_new$price_range <- as.factor(df_new$price_range)
set.seed(123) # for reproducibility
myIndex <- createDataPartition(df_new$price_range, p=0.8, list=FALSE)
trainSet2 <- df_new[myIndex, ]
testSet2 <- df_new[-myIndex, ]
set.seed(123) # reset the seed for reproducibility
trControl <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
suppressWarnings({ # handling warnings
  fit.nb <- train(price_range ~ ., data = trainSet2, method = "nb",
                  trControl = trControl, preProcess = "scale")
}) # closing for handling warnings
print(fit.nb)
plot(fit.nb)
suppressWarnings({ # handling warnings
  predictions2 <- predict(fit.nb, newdata = testSet2)
}) # closing for handling warnings
confusionMatrix(predictions2, testSet2$price_range)
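The result of confusionMatrix() can also be stored as an object to pull out individual statistics, for example:
cm_nb <- confusionMatrix(predictions2, testSet2$price_range)
cm_nb$overall["Accuracy"]      # overall accuracy on the test set
cm_nb$byClass[, "Sensitivity"] # recall for each price range class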
EXTRA: Support Vector Machine
# Load necessary libraries
# kernlab is used for support vector machines
install.packages("kernlab")
library(caret) # caret is used for creating predictive models
library(readr) # readr is used for reading CSV files
library(tidyverse) # tidyverse is a collection of R packages for data science
library(kernlab) # kernlab is used for kernel-based machine learning methods
# Load and prepare the dataset
dataset <- read_csv("train.csv") # Read the dataset from a CSV file
dataset$price_range <- as.factor(dataset$price_range) # Convert price_range to a factor for classification
# Set a seed for reproducibility
set.seed(123)
# Split the dataset into training and testing sets
validationIndex <- createDataPartition(dataset$price_range, p=0.8, list=FALSE) # 80% training, 20% testing
train <- dataset[validationIndex,] # Training set
test <- dataset[-validationIndex,] # Testing set
# Setup training control
trainControl <- trainControl(method="cv", number=10) # 10-fold cross-validation
# Metric to evaluate the models
metric <- "Accuracy" # Use accuracy as the performance metric
# Define a tuning grid for the SVM model
# Here, we're using an RBF kernel, so we need to tune the cost parameter (C) and the sigma parameter
tuneGrid <- expand.grid(C=c(0.01, 0.1, 1, 10, 100), sigma=c(0.001, 0.01, 0.1, 1)) # Parameter grid
# Train the SVM model
suppressWarnings({ # handling warnings
  fit.svm <- train(price_range ~ ., data = train, method = "svmRadial", # use radial basis function kernel
                   metric = metric, trControl = trainControl, tuneGrid = tuneGrid) # train the model
}) # closing for handling warnings
# Print the model summary
print(fit.svm) # Print the model's details
# Print the best tuning parameters
print(fit.svm$bestTune) # Print the best tuning parameters found
suppressWarnings({ # handling warnings
  predictions3 <- predict(fit.svm, newdata = test)
  confusionMatrix(predictions3, test$price_range)
}) # closing for handling warnings
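With all three fitted models in the workspace, caret's resamples() makes it easy to compare their cross-validation results side by side; a minimal sketch, assuming fit.knn, fit.nb, and fit.svm were all trained as above with the same 10-fold setup:
results <- resamples(list(KNN = fit.knn, NB = fit.nb, SVM = fit.svm))
summary(results) # accuracy and kappa distributions per model
dotplot(results) # visual comparison of the resampling results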