There are various methods to handle imbalanced classes. In this
article, we will cover the three most commonly used techniques:
Oversampling, Undersampling and SMOTE.
Importing data and required libraries
This is an HR Analytics data consisting of employee data with
variables such as Gender, Experience, Education, Training hours
etc.
library(ROSE)
library(randomForest)
library(caret)
library(e1071)
library(DMwR)
# Read the employee data set. stringsAsFactors = TRUE converts the
# character columns to factors, which randomForest requires for
# classification. Use TRUE rather than the reassignable shortcut T.
data <- read.csv("empdata.csv", header = TRUE, stringsAsFactors = TRUE)
head(data)
enrollee_id city city_development_index gender relevent_experience
1 8949 city_103 0.920 Male Has relevent experience
2 29725 city_40 0.776 Male No relevent experience
3 11561 city_21 0.624 No relevent experience
4 33241 city_115 0.789 No relevent experience
5 666 city_162 0.767 Male Has relevent experience
6 21651 city_176 0.764 Has relevent experience
enrolled_university education_level major_discipline experience company_size
1 no_enrollment Graduate STEM >20
2 no_enrollment Graduate STEM 15 50-99
3 Full time course Graduate STEM 5
4 Graduate Business Degree <1
5 no_enrollment Masters STEM >20 50-99
6 Part time course Graduate STEM 11
company_type last_new_job training_hours target
1 1 36 1
2 Pvt Ltd >4 47 0
3 never 83 0
4 Pvt Ltd never 52 1
5 Funded Startup 4 8 0
6 1 24 1
# Drop the identifier columns, which carry no predictive signal, and
# convert the response to a factor so randomForest fits a classifier
# rather than a regression.
data <- data[, !(names(data) %in% c("enrollee_id", "city"))]
data$target <- factor(data$target)
Checking Class Imbalance
# Visualise the class proportions; the gap between the two bars shows
# how imbalanced the target variable is.
class_props <- prop.table(table(data$target))
barplot(class_props,
        col = rainbow(2),
        ylim = c(0, 0.7),
        main = "Class Distribution")

Running a Random Forest model and checking the ROC curve
and AUC
# Split the data 70/30 into train and test sets.
# set.seed() makes the random split (and the results below)
# reproducible across runs — without it, readers cannot reproduce
# the AUC values reported in this article.
set.seed(123)
split <- sample(2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[split == 1, ]
test <- data[split == 2, ]

# Baseline: fit a random forest on the imbalanced training data.
rf <- randomForest(target ~ ., data = train)

# type = "vote" returns per-class vote proportions; column 2 is the
# score for the positive class, which feeds the ROC curve.
predrf <- predict(rf, test, type = "vote", norm.votes = TRUE)
library(ROCR)
pred <- prediction(predrf[, 2], test$target)
perf <- performance(pred, "tpr", "fpr")
plot(perf)
abline(0, 1)  # diagonal reference line = no-skill classifier

# Area under the ROC curve for the baseline (imbalanced) model.
auc <- performance(pred, measure = "auc")
auc@y.values
[[1]]
[1] 0.796736
Method 1- Oversampling
Oversampling balances the data by randomly duplicating observations
from the minority class until both classes are equally represented.
# Balance the training data by randomly duplicating minority-class rows
# (method = "over" keeps every majority row). The seed argument makes
# the resampling reproducible.
overtrain <- ovun.sample(target ~ ., data = train,
                         method = "over", seed = 123)$data
# Confirm the classes are now balanced.
barplot(prop.table(table(overtrain$target)),
        col = rainbow(2),
        ylim = c(0, 0.7),
        main = "Class Distribution")

Running the model after oversampling
# Refit the random forest on the oversampled training data and evaluate
# it on the untouched test set. ROCR is already loaded earlier in the
# script, so the redundant library() call has been removed.
rfover <- randomForest(target ~ ., data = overtrain)
predrfover <- predict(rfover, test, type = "vote", norm.votes = TRUE)
pred <- prediction(predrfover[, 2], test$target)
perf <- performance(pred, "tpr", "fpr")
plot(perf)
abline(0, 1)  # diagonal reference line = no-skill classifier

# Area under the ROC curve after oversampling.
auc <- performance(pred, measure = "auc")
auc@y.values
[[1]]
[1] 0.7958179
Method 2- Undersampling
Undersampling method randomly chooses observations from majority
class which are eliminated until the data set gets balanced.
# Balance the training data by randomly discarding majority-class rows
# (method = "under"). The seed argument makes the resampling
# reproducible.
undertrain <- ovun.sample(target ~ ., data = train,
                          method = "under", seed = 123)$data
# Confirm the classes are now balanced.
barplot(prop.table(table(undertrain$target)),
        col = rainbow(2),
        ylim = c(0, 0.7),
        main = "Class Distribution")

Running the model after undersampling
# Refit the random forest on the undersampled training data and
# evaluate it on the untouched test set. ROCR is already loaded earlier
# in the script, so the redundant library() call has been removed.
rfunder <- randomForest(target ~ ., data = undertrain)
predrfunder <- predict(rfunder, test, type = "vote", norm.votes = TRUE)
pred <- prediction(predrfunder[, 2], test$target)
perf <- performance(pred, "tpr", "fpr")
plot(perf)
abline(0, 1)  # diagonal reference line = no-skill classifier

# Area under the ROC curve after undersampling.
auc <- performance(pred, measure = "auc")
auc@y.values
[[1]]
[1] 0.7966998
Method 3- SMOTE
SMOTE stands for Synthetic Minority Oversampling Technique. It down
samples the majority class and synthesizes new minority instances by
interpolating between existing ones.
# Build a SMOTE-balanced training set: synthetic minority examples plus
# a downsampled majority class, using the package's default settings.
smote <- SMOTE(target ~ ., data = train)
# Confirm the class distribution after SMOTE.
class_props <- prop.table(table(smote$target))
barplot(class_props,
        col = rainbow(2),
        ylim = c(0, 0.7),
        main = "Class Distribution")

Running the model after SMOTE
# Refit the random forest on the SMOTE-balanced training data and
# evaluate it on the untouched test set. ROCR is already loaded earlier
# in the script, so the redundant library() call has been removed.
rfsmote <- randomForest(target ~ ., data = smote)
predrfsmote <- predict(rfsmote, test, type = "vote", norm.votes = TRUE)
pred <- prediction(predrfsmote[, 2], test$target)
perf <- performance(pred, "tpr", "fpr")
plot(perf)
abline(0, 1)  # diagonal reference line = no-skill classifier

# Area under the ROC curve after SMOTE.
auc <- performance(pred, measure = "auc")
auc@y.values
[[1]]
[1] 0.7695837