library(ROSE)
library(randomForest)
library(caret)
library(e1071)
library(DMwR)
# Load the employee data set. The first row holds the column names and text
# columns are read as factors so they can be used in the model formulas below.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
data <- read.csv("empdata.csv", header = TRUE, stringsAsFactors = TRUE)
# Preview the first rows.
head(data)
#> enrollee_id city city_development_index gender relevent_experience
#> 1 8949 city_103 0.920 Male Has relevent experience
#> 2 29725 city_40 0.776 Male No relevent experience
#> 3 11561 city_21 0.624 No relevent experience
#> 4 33241 city_115 0.789 No relevent experience
#> 5 666 city_162 0.767 Male Has relevent experience
#> 6 21651 city_176 0.764 Has relevent experience
#> enrolled_university education_level major_discipline experience company_size
#> 1 no_enrollment Graduate STEM >20
#> 2 no_enrollment Graduate STEM 15 50-99
#> 3 Full time course Graduate STEM 5
#> 4 Graduate Business Degree <1
#> 5 no_enrollment Masters STEM >20 50-99
#> 6 Part time course Graduate STEM 11
#> company_type last_new_job training_hours target
#> 1 1 36 1
#> 2 Pvt Ltd >4 47 0
#> 3 never 83 0
#> 4 Pvt Ltd never 52 1
#> 5 Funded Startup 4 8 0
#> 6 1 24 1
# Drop the row identifier and the city code so they are not used as predictors.
data <- subset(data, select = -c(enrollee_id, city))
# Treat the outcome as a class label (factor) rather than a number.
data$target <- as.factor(data$target)

# Share of each class in the full data set. Proportions lie in [0, 1], so the
# axis spans that whole range; a tighter cap would not fit a dominant class.
barplot(
  prop.table(table(data$target)),
  col = rainbow(2),
  ylim = c(0, 1),
  main = "Class Distribution"
)

# Fix the RNG state so the random train/test assignment can be reproduced.
set.seed(123)
# Label every row 1 (probability 0.7, training) or 2 (probability 0.3, test).
split <- sample(2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[split == 1, ]
test <- data[split == 2, ]
# Baseline: random forest fitted on the training set as-is (no resampling).
# Forest construction is randomized, so seed it to make the AUC repeatable.
set.seed(123)
rf <- randomForest(target ~ ., data = train)
# Normalized per-class vote fractions for every test row.
predrf <- predict(rf, test, type = "vote", norm.votes = TRUE)

library(ROCR)
# Score each test row with the vote share in the second column of the matrix.
pred <- prediction(predrf[, 2], test$target)
# ROC curve: true positive rate against false positive rate.
perf <- performance(pred, "tpr", "fpr")
plot(perf)
abline(0, 1)  # diagonal reference line
# Area under the ROC curve.
auc <- performance(pred, "auc")
auc@y.values
#> [[1]]
#> [1] 0.796736
# Random oversampling of the training set with ROSE::ovun.sample. The `seed`
# argument makes the resampled data reproducible.
overtrain <- ovun.sample(
  target ~ .,
  data = train,
  method = "over",
  seed = 123
)$data

# Class shares after oversampling.
barplot(
  prop.table(table(overtrain$target)),
  col = rainbow(2),
  ylim = c(0, 0.7),
  main = "Class Distribution"
)

# Forest construction is randomized, so seed it to make the AUC repeatable.
set.seed(123)
rfover <- randomForest(target ~ ., data = overtrain)
# Normalized per-class vote fractions on the untouched test set.
predrfover <- predict(rfover, test, type = "vote", norm.votes = TRUE)

library(ROCR)
# Score each test row with the vote share in the second column of the matrix.
pred <- prediction(predrfover[, 2], test$target)
# ROC curve: true positive rate against false positive rate.
perf <- performance(pred, "tpr", "fpr")
plot(perf)
abline(0, 1)  # diagonal reference line
# Area under the ROC curve.
auc <- performance(pred, "auc")
auc@y.values
#> [[1]]
#> [1] 0.7958179
# Random undersampling of the training set with ROSE::ovun.sample. The `seed`
# argument makes the resampled data reproducible.
undertrain <- ovun.sample(
  target ~ .,
  data = train,
  method = "under",
  seed = 123
)$data

# Class shares after undersampling.
barplot(
  prop.table(table(undertrain$target)),
  col = rainbow(2),
  ylim = c(0, 0.7),
  main = "Class Distribution"
)

# Forest construction is randomized, so seed it to make the AUC repeatable.
set.seed(123)
rfunder <- randomForest(target ~ ., data = undertrain)
# Normalized per-class vote fractions on the untouched test set.
predrfunder <- predict(rfunder, test, type = "vote", norm.votes = TRUE)

library(ROCR)
# Score each test row with the vote share in the second column of the matrix.
pred <- prediction(predrfunder[, 2], test$target)
# ROC curve: true positive rate against false positive rate.
perf <- performance(pred, "tpr", "fpr")
plot(perf)
abline(0, 1)  # diagonal reference line
# Area under the ROC curve.
auc <- performance(pred, "auc")
auc@y.values
#> [[1]]
#> [1] 0.7966998
# Rebalance the training set with DMwR::SMOTE using its default settings.
# SMOTE draws random samples, so fix the RNG state first.
set.seed(123)
smote <- SMOTE(target ~ ., train)

# Class shares after SMOTE.
barplot(
  prop.table(table(smote$target)),
  col = rainbow(2),
  ylim = c(0, 0.7),
  main = "Class Distribution"
)

# Forest construction is randomized as well; it continues from the seeded RNG
# stream above, so the whole section is repeatable.
rfsmote <- randomForest(target ~ ., data = smote)
# Normalized per-class vote fractions on the untouched test set.
predrfsmote <- predict(rfsmote, test, type = "vote", norm.votes = TRUE)

library(ROCR)
# Score each test row with the vote share in the second column of the matrix.
pred <- prediction(predrfsmote[, 2], test$target)
# ROC curve: true positive rate against false positive rate.
perf <- performance(pred, "tpr", "fpr")
plot(perf)
abline(0, 1)  # diagonal reference line
# Area under the ROC curve.
auc <- performance(pred, "auc")
auc@y.values
#> [[1]]
#> [1] 0.7695837