Libraries Used
library(tidyverse)
library(caret)
Data Import and Cleaing
data <- read.csv("Datasets/CaseStudy2-data.csv", header = TRUE)
#Remove columns that aren't useful
data <- subset(data, select = -c(ID,EmployeeCount,EmployeeNumber,Over18,StandardHours))
#Change all character variables to factors
data[sapply(data, is.character)] <- lapply(data[sapply(data, is.character)],
as.factor)
#Create binary var for attrition
data$AttritionCoded <- with(data, ifelse(Attrition == "Yes", 1, 0))
Creating Train and Test Sets
split <- 0.75
trainIndex <- sample(seq(1,870,1),870*split)
dataTrain <- data[trainIndex,]
dataTest <- data[-trainIndex,]
Here we are building a kNN model to predict attrition. We will use 10-fold cross validation and upsample the data to create a balanced dataset.
ctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
verboseIter = FALSE,
sampling = "up")
model <- train(Attrition ~ Age+
EducationField+EnvironmentSatisfaction+
Gender+HourlyRate+JobLevel+
JobRole+JobSatisfaction+JobInvolvement+
MonthlyIncome+MonthlyRate+NumCompaniesWorked+
OverTime+RelationshipSatisfaction+WorkLifeBalance+
YearsSinceLastPromotion+YearsWithCurrManager,
data = dataTrain,
method = "knn",
trControl = ctrl,
preProcess = c("center","scale"))
predict <- predict(model,newdata = dataTest)
confusionMatrix(table(predict,dataTest$Attrition))
## Confusion Matrix and Statistics
##
##
## predict No Yes
## No 133 16
## Yes 48 21
##
## Accuracy : 0.7064
## 95% CI : (0.6411, 0.766)
## No Information Rate : 0.8303
## P-Value [Acc > NIR] : 0.9999980
##
## Kappa : 0.225
##
## Mcnemar's Test P-Value : 0.0001066
##
## Sensitivity : 0.7348
## Specificity : 0.5676
## Pos Pred Value : 0.8926
## Neg Pred Value : 0.3043
## Prevalence : 0.8303
## Detection Rate : 0.6101
## Detection Prevalence : 0.6835
## Balanced Accuracy : 0.6512
##
## 'Positive' Class : No
##
Attrition Rate by Job Role
#create dataset
jobdat <- data %>% group_by(JobRole) %>%
summarize(percAttr = sum(AttritionCoded)/length(AttritionCoded)*100)
#plot data
jobdat %>% ggplot(aes(reorder(JobRole, percAttr), percAttr)) +
geom_col(fill = "#303c94") + coord_flip() + labs(y = "% Attrition", x= "Job Role")

Monthly Income Regression
fit3 <- lm(MonthlyIncome ~ JobLevel+TotalWorkingYears+
JobRole+JobRole*JobLevel, data = data)
summary(fit3)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + TotalWorkingYears + JobRole +
## JobRole * JobLevel, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3500.9 -629.2 -99.6 597.1 4333.0
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -608.154 456.272 -1.333
## JobLevel 2985.140 186.435 16.012
## TotalWorkingYears 50.750 7.505 6.762
## JobRoleHuman Resources 364.079 646.258 0.563
## JobRoleLaboratory Technician 1759.720 508.739 3.459
## JobRoleManager 3446.214 1050.768 3.280
## JobRoleManufacturing Director -907.251 622.758 -1.457
## JobRoleResearch Director 3735.449 807.041 4.629
## JobRoleResearch Scientist 920.748 513.398 1.793
## JobRoleSales Executive -806.340 549.594 -1.467
## JobRoleSales Representative 939.944 786.148 1.196
## JobLevel:JobRoleHuman Resources -351.768 384.389 -0.915
## JobLevel:JobRoleLaboratory Technician -1667.611 244.362 -6.824
## JobLevel:JobRoleManager 39.383 280.964 0.140
## JobLevel:JobRoleManufacturing Director 426.238 245.000 1.740
## JobLevel:JobRoleResearch Director -8.118 245.656 -0.033
## JobLevel:JobRoleResearch Scientist -832.004 259.891 -3.201
## JobLevel:JobRoleSales Executive 330.355 220.463 1.498
## JobLevel:JobRoleSales Representative -1001.410 618.828 -1.618
## Pr(>|t|)
## (Intercept) 0.182929
## JobLevel < 2e-16 ***
## TotalWorkingYears 2.53e-11 ***
## JobRoleHuman Resources 0.573335
## JobRoleLaboratory Technician 0.000569 ***
## JobRoleManager 0.001081 **
## JobRoleManufacturing Director 0.145533
## JobRoleResearch Director 4.26e-06 ***
## JobRoleResearch Scientist 0.073258 .
## JobRoleSales Executive 0.142703
## JobRoleSales Representative 0.232173
## JobLevel:JobRoleHuman Resources 0.360380
## JobLevel:JobRoleLaboratory Technician 1.68e-11 ***
## JobLevel:JobRoleManager 0.888557
## JobLevel:JobRoleManufacturing Director 0.082265 .
## JobLevel:JobRoleResearch Director 0.973647
## JobLevel:JobRoleResearch Scientist 0.001419 **
## JobLevel:JobRoleSales Executive 0.134385
## JobLevel:JobRoleSales Representative 0.105982
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 996.4 on 851 degrees of freedom
## Multiple R-squared: 0.954, Adjusted R-squared: 0.953
## F-statistic: 980.7 on 18 and 851 DF, p-value: < 2.2e-16
Job Satisfaction by Attrition T-Test
#job satisfaction by attrition, ttest
x <- data %>% filter(Attrition == "Yes")
x <- x[,"JobSatisfaction"]
y <- data %>% filter(Attrition == "No")
y <- y[,"JobSatisfaction"]
t.test(x,y)
##
## Welch Two Sample t-test
##
## data: x and y
## t = -3.2202, df = 197.97, p-value = 0.001497
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.5255243 -0.1263348
## sample estimates:
## mean of x mean of y
## 2.435714 2.761644
Monthly Income by Attrition T-Test
#montlyincome by attrition, ttest
a <- data %>% filter(Attrition == "Yes")
a <- a[,"MonthlyIncome"]
b <- data %>% filter(Attrition == "No")
b <- b[,"MonthlyIncome"]
t.test(a,b)
##
## Welch Two Sample t-test
##
## data: a and b
## t = -5.3249, df = 228.45, p-value = 2.412e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2654.047 -1220.382
## sample estimates:
## mean of x mean of y
## 4764.786 6702.000
YouTube Presentation: https://youtu.be/b8qVZO8sTnY RShiny App: https://josh-turk.shinyapps.io/AttritionProject/