—– Executive Summary —–
This is a case study for DDSAnalytics that leverages data science for talent management. Talent management is the iterative process of developing and retaining employees and may include workforce planning, employee development through training and mentoring, increasing employee satisfaction and reducing voluntary employee attrition or churn.
The first part of this analysis focuses on attrition and utilizes a machine algorithm called Boruta to efficiently identify both statistically and highly probable statistically important factors that lead to attrition. After careful testing and modeling, it was determined that the three most influential factors contributing to attrition are overtime, monthly income and stock options.
The second part of this analysis focuses on job role trends. Based on the data, the employees seem to be on average all aobut the same when it comes to environment and job satisfaction, job involvement and work life balance with scores ranging from 2.5-3.0 on a 4 point scale. Sales Reps tend to be the youngest and have the lowest average scores in terms of satisfaction while managers tend to be the oldest and most satisfied with their jobs. Overall the company is very generous with annual pay raises with an overall average of 15.2%!
Finally, we looked at estimating monthly salaries and determined the most influential factors are total working years, education level, job level and job role.
The video presentation can be viewed here: https://www.screencast.com/t/leqSdPsrP -Justin Ehly
Online Research to help with variable selection Based on some SME online research, let’s start with 1. Poor Training - According to go2HR.com, 40% of employees who receive poor job training leave their positions within the first year. 2. Poor Management increases turnover - survey conducted by FurstPerson illustrates as much, highlighting a correlation between a director’s tenure and employee attrition. 3. Lack of Growth Opportunities 4. Inaccurate Job descriptions during hiring process (leads to disappointed employees) Resource: https://www.furstperson.com/blog/causes-employee-attrition-cost-employee-attrition
Another study shows 1. management 2. Alignment & Involvement 3. Employee Enablement 4. Collaboration & Teamwork 5. Feedback & Recognition 6. Investment in People 7. Comp & Benefits 8. Company Confidence 9. Social Connection 10. Company Performance Resource: https://blog.betterworks.com/people-analytics-reveals-top-reasons-for-attrition-and-it-isnt-compensation/
##### Libraries #####
library(readr)
library(tibble)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(magrittr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(utils)
library(stats)
library(e1071)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tidyr 1.1.2 v stringr 1.4.0
## v purrr 0.3.4 v forcats 0.5.0
## v dplyr 1.0.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(klaR)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:plotly':
##
## select
library(mlbench)
library(rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:stats':
##
## filter
library(fmsb)
## Registered S3 methods overwritten by 'fmsb':
## method from
## print.roc pROC
## plot.roc pROC
library(class)
library(stringr)
library(Boruta)
library(dplyr)
#####################################
## ##
########### Grab the Data ###########
## ##
#####################################
dds <- read.csv("CaseStudy2-data.csv")
#view(names(dds))
##### Look for any NAs in the data set ####
sapply(dds, function(x) sum(is.na(x)))
## ID Age Attrition
## 0 0 0
## BusinessTravel DailyRate Department
## 0 0 0
## DistanceFromHome Education EducationField
## 0 0 0
## EmployeeCount EmployeeNumber EnvironmentSatisfaction
## 0 0 0
## Gender HourlyRate JobInvolvement
## 0 0 0
## JobLevel JobRole JobSatisfaction
## 0 0 0
## MaritalStatus MonthlyIncome MonthlyRate
## 0 0 0
## NumCompaniesWorked Over18 OverTime
## 0 0 0
## PercentSalaryHike PerformanceRating RelationshipSatisfaction
## 0 0 0
## StandardHours StockOptionLevel TotalWorkingYears
## 0 0 0
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 0 0 0
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 0 0 0
# Confirmed no NA's in the data
summary(dds)
## ID Age Attrition BusinessTravel
## Min. : 1.0 Min. :18.00 Length:870 Length:870
## 1st Qu.:218.2 1st Qu.:30.00 Class :character Class :character
## Median :435.5 Median :35.00 Mode :character Mode :character
## Mean :435.5 Mean :36.83
## 3rd Qu.:652.8 3rd Qu.:43.00
## Max. :870.0 Max. :60.00
## DailyRate Department DistanceFromHome Education
## Min. : 103.0 Length:870 Min. : 1.000 Min. :1.000
## 1st Qu.: 472.5 Class :character 1st Qu.: 2.000 1st Qu.:2.000
## Median : 817.5 Mode :character Median : 7.000 Median :3.000
## Mean : 815.2 Mean : 9.339 Mean :2.901
## 3rd Qu.:1165.8 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :1499.0 Max. :29.000 Max. :5.000
## EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction
## Length:870 Min. :1 Min. : 1.0 Min. :1.000
## Class :character 1st Qu.:1 1st Qu.: 477.2 1st Qu.:2.000
## Mode :character Median :1 Median :1039.0 Median :3.000
## Mean :1 Mean :1029.8 Mean :2.701
## 3rd Qu.:1 3rd Qu.:1561.5 3rd Qu.:4.000
## Max. :1 Max. :2064.0 Max. :4.000
## Gender HourlyRate JobInvolvement JobLevel
## Length:870 Min. : 30.00 Min. :1.000 Min. :1.000
## Class :character 1st Qu.: 48.00 1st Qu.:2.000 1st Qu.:1.000
## Mode :character Median : 66.00 Median :3.000 Median :2.000
## Mean : 65.61 Mean :2.723 Mean :2.039
## 3rd Qu.: 83.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :100.00 Max. :4.000 Max. :5.000
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## Length:870 Min. :1.000 Length:870 Min. : 1081
## Class :character 1st Qu.:2.000 Class :character 1st Qu.: 2840
## Mode :character Median :3.000 Mode :character Median : 4946
## Mean :2.709 Mean : 6390
## 3rd Qu.:4.000 3rd Qu.: 8182
## Max. :4.000 Max. :19999
## MonthlyRate NumCompaniesWorked Over18 OverTime
## Min. : 2094 Min. :0.000 Length:870 Length:870
## 1st Qu.: 8092 1st Qu.:1.000 Class :character Class :character
## Median :14074 Median :2.000 Mode :character Mode :character
## Mean :14326 Mean :2.728
## 3rd Qu.:20456 3rd Qu.:4.000
## Max. :26997 Max. :9.000
## PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours
## Min. :11.0 Min. :3.000 Min. :1.000 Min. :80
## 1st Qu.:12.0 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80
## Median :14.0 Median :3.000 Median :3.000 Median :80
## Mean :15.2 Mean :3.152 Mean :2.707 Mean :80
## 3rd Qu.:18.0 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80
## Max. :25.0 Max. :4.000 Max. :4.000 Max. :80
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
## Min. :0.0000 Min. : 0.00 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000
## Median :1.0000 Median :10.00 Median :3.000 Median :3.000
## Mean :0.7839 Mean :11.05 Mean :2.832 Mean :2.782
## 3rd Qu.:1.0000 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :3.0000 Max. :40.00 Max. :6.000 Max. :4.000
## YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 0.000
## Median : 5.000 Median : 3.000 Median : 1.000
## Mean : 6.962 Mean : 4.205 Mean : 2.169
## 3rd Qu.:10.000 3rd Qu.: 7.000 3rd Qu.: 3.000
## Max. :40.000 Max. :18.000 Max. :15.000
## YearsWithCurrManager
## Min. : 0.00
## 1st Qu.: 2.00
## Median : 3.00
## Mean : 4.14
## 3rd Qu.: 7.00
## Max. :17.00
# Get a list of all variable names and unique values
dds_info <- capture.output(str(dds))
#view(dds_info)
write(dds_info, "dds_info.csv")
#########################################################
# #
################# Categorical Variables #################
# #
#########################################################
# Let's separate the categorical from the numeric variables and get rid of unnecessary variables
# I used excel for this part
# Categorical Variables
ddsDF_cat <- dds[,c(3,4,6,8,9,12,13,15:19,24,26,27,29,31,32)]
str(ddsDF_cat)
## 'data.frame': 870 obs. of 18 variables:
## $ Attrition : chr "No" "No" "No" "No" ...
## $ BusinessTravel : chr "Travel_Rarely" "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" ...
## $ Department : chr "Sales" "Research & Development" "Research & Development" "Sales" ...
## $ Education : int 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : chr "Life Sciences" "Medical" "Life Sciences" "Marketing" ...
## $ EnvironmentSatisfaction : int 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ JobInvolvement : int 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : int 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : chr "Sales Executive" "Research Director" "Manufacturing Director" "Sales Executive" ...
## $ JobSatisfaction : int 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : chr "Divorced" "Single" "Single" "Married" ...
## $ OverTime : chr "No" "No" "No" "No" ...
## $ PerformanceRating : int 3 3 3 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 3 1 3 3 3 3 1 3 4 2 ...
## $ StockOptionLevel : int 1 0 0 2 0 2 0 3 1 1 ...
## $ TrainingTimesLastYear : int 3 2 2 3 2 4 5 5 2 3 ...
## $ WorkLifeBalance : int 2 4 3 3 3 2 2 3 3 2 ...
# let's turn all the variables into factors
ddsDF_cat <- ddsDF_cat %>% dplyr::mutate_if(is.character, as.factor)
ddsDF_cat <- ddsDF_cat %>% dplyr::mutate_if(is.integer, as.factor)
ddsDF_cat$JobRole <- case_when(
ddsDF_cat$JobRole == "Healthcare Representative" ~ "Healthcare Rep",
ddsDF_cat$JobRole == "Human Resources" ~ "HR",
ddsDF_cat$JobRole == "Laboratory Technician" ~ "Lab Tech",
ddsDF_cat$JobRole == "Manufacturing Director" ~ "Manufact Director",
ddsDF_cat$JobRole == "Research Director" ~ "Res. Director",
ddsDF_cat$JobRole == "Research Scientist" ~ "Res. Scientist",
ddsDF_cat$JobRole == "Sales Executive" ~ "Sales Exec",
ddsDF_cat$JobRole == "Sales Representative" ~ "Sales Rep",
ddsDF_cat$JobRole == "Manager" ~ "Manager")
str(ddsDF_cat)
## 'data.frame': 870 obs. of 18 variables:
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
## $ Education : Factor w/ 5 levels "1","2","3","4",..: 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
## $ EnvironmentSatisfaction : Factor w/ 4 levels "1","2","3","4": 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
## $ JobInvolvement : Factor w/ 4 levels "1","2","3","4": 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : Factor w/ 5 levels "1","2","3","4",..: 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : chr "Sales Exec" "Res. Director" "Manufact Director" "Sales Exec" ...
## $ JobSatisfaction : Factor w/ 4 levels "1","2","3","4": 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
## $ PerformanceRating : Factor w/ 2 levels "3","4": 1 1 1 1 1 2 1 1 1 1 ...
## $ RelationshipSatisfaction: Factor w/ 4 levels "1","2","3","4": 3 1 3 3 3 3 1 3 4 2 ...
## $ StockOptionLevel : Factor w/ 4 levels "0","1","2","3": 2 1 1 3 1 3 1 4 2 2 ...
## $ TrainingTimesLastYear : Factor w/ 7 levels "0","1","2","3",..: 4 3 3 4 3 5 6 6 3 4 ...
## $ WorkLifeBalance : Factor w/ 4 levels "1","2","3","4": 2 4 3 3 3 2 2 3 3 2 ...
###############################################
# #
############# Numeric Variables ###############
# #
###############################################
# numeric variables
ddsDF_num <- dds[,c(3,2,5,7,14,20,21,22,25,30,33:36)]
str(ddsDF_num)
## 'data.frame': 870 obs. of 14 variables:
## $ Attrition : chr "No" "No" "No" "No" ...
## $ Age : int 32 40 35 32 24 27 41 37 34 34 ...
## $ DailyRate : int 117 1308 200 801 567 294 1283 309 1333 653 ...
## $ DistanceFromHome : int 13 14 18 1 2 10 5 10 10 10 ...
## $ HourlyRate : int 73 44 60 48 32 32 90 88 87 92 ...
## $ MonthlyIncome : int 4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
## $ MonthlyRate : int 9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
## $ NumCompaniesWorked : int 2 1 2 1 1 1 2 2 1 1 ...
## $ PercentSalaryHike : int 11 14 11 19 13 21 12 14 19 14 ...
## $ TotalWorkingYears : int 8 21 10 14 6 9 7 8 1 8 ...
## $ YearsAtCompany : int 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : int 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion: int 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : int 3 9 2 7 3 7 3 0 0 7 ...
ddsDF_num[,c(2:14)] <- sapply(ddsDF_num[,c(2:14)], as.numeric)
ddsDF_num$Attrition <- as.factor(ddsDF_num$Attrition)
str(ddsDF_num)
## 'data.frame': 870 obs. of 14 variables:
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : num 32 40 35 32 24 27 41 37 34 34 ...
## $ DailyRate : num 117 1308 200 801 567 ...
## $ DistanceFromHome : num 13 14 18 1 2 10 5 10 10 10 ...
## $ HourlyRate : num 73 44 60 48 32 32 90 88 87 92 ...
## $ MonthlyIncome : num 4403 19626 9362 10422 3760 ...
## $ MonthlyRate : num 9250 17544 19944 24032 17218 ...
## $ NumCompaniesWorked : num 2 1 2 1 1 1 2 2 1 1 ...
## $ PercentSalaryHike : num 11 14 11 19 13 21 12 14 19 14 ...
## $ TotalWorkingYears : num 8 21 10 14 6 9 7 8 1 8 ...
## $ YearsAtCompany : num 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : num 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion: num 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : num 3 9 2 7 3 7 3 0 0 7 ...
########################
## ##
### Start EDA ###
## ##
########################
################# Boruta for Attrition ##############
# Note about Boruta: Boruta is an all relevant feature selection wrapper algorithm, capable of working with any classification method that output variable importance measure (VIM); by default, Boruta uses Random Forest. The method performs a top-down search for relevant features by comparing original attributes' importance with importance achievable at random, estimated using their permuted copies, and progressively eliminating irrelevant features to stabilise that test.
ddsBinded <- cbind(ddsDF_cat, ddsDF_num)
str(ddsBinded)
## 'data.frame': 870 obs. of 32 variables:
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
## $ Education : Factor w/ 5 levels "1","2","3","4",..: 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
## $ EnvironmentSatisfaction : Factor w/ 4 levels "1","2","3","4": 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
## $ JobInvolvement : Factor w/ 4 levels "1","2","3","4": 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : Factor w/ 5 levels "1","2","3","4",..: 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : chr "Sales Exec" "Res. Director" "Manufact Director" "Sales Exec" ...
## $ JobSatisfaction : Factor w/ 4 levels "1","2","3","4": 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
## $ PerformanceRating : Factor w/ 2 levels "3","4": 1 1 1 1 1 2 1 1 1 1 ...
## $ RelationshipSatisfaction: Factor w/ 4 levels "1","2","3","4": 3 1 3 3 3 3 1 3 4 2 ...
## $ StockOptionLevel : Factor w/ 4 levels "0","1","2","3": 2 1 1 3 1 3 1 4 2 2 ...
## $ TrainingTimesLastYear : Factor w/ 7 levels "0","1","2","3",..: 4 3 3 4 3 5 6 6 3 4 ...
## $ WorkLifeBalance : Factor w/ 4 levels "1","2","3","4": 2 4 3 3 3 2 2 3 3 2 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : num 32 40 35 32 24 27 41 37 34 34 ...
## $ DailyRate : num 117 1308 200 801 567 ...
## $ DistanceFromHome : num 13 14 18 1 2 10 5 10 10 10 ...
## $ HourlyRate : num 73 44 60 48 32 32 90 88 87 92 ...
## $ MonthlyIncome : num 4403 19626 9362 10422 3760 ...
## $ MonthlyRate : num 9250 17544 19944 24032 17218 ...
## $ NumCompaniesWorked : num 2 1 2 1 1 1 2 2 1 1 ...
## $ PercentSalaryHike : num 11 14 11 19 13 21 12 14 19 14 ...
## $ TotalWorkingYears : num 8 21 10 14 6 9 7 8 1 8 ...
## $ YearsAtCompany : num 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : num 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : num 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : num 3 9 2 7 3 7 3 0 0 7 ...
names(ddsBinded)
## [1] "Attrition" "BusinessTravel"
## [3] "Department" "Education"
## [5] "EducationField" "EnvironmentSatisfaction"
## [7] "Gender" "JobInvolvement"
## [9] "JobLevel" "JobRole"
## [11] "JobSatisfaction" "MaritalStatus"
## [13] "OverTime" "PerformanceRating"
## [15] "RelationshipSatisfaction" "StockOptionLevel"
## [17] "TrainingTimesLastYear" "WorkLifeBalance"
## [19] "Attrition" "Age"
## [21] "DailyRate" "DistanceFromHome"
## [23] "HourlyRate" "MonthlyIncome"
## [25] "MonthlyRate" "NumCompaniesWorked"
## [27] "PercentSalaryHike" "TotalWorkingYears"
## [29] "YearsAtCompany" "YearsInCurrentRole"
## [31] "YearsSinceLastPromotion" "YearsWithCurrManager"
ddsBinded <- ddsBinded[,-19]
boruta_output <- Boruta(Attrition~., data=ddsBinded, doTrace=2) # Boruta search
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## 11. run of importance source...
## 12. run of importance source...
## After 12 iterations, +1.7 secs:
## confirmed 12 attributes: Age, JobInvolvement, JobLevel, JobRole, MaritalStatus and 7 more;
## rejected 6 attributes: BusinessTravel, Gender, HourlyRate, MonthlyRate, PerformanceRating and 1 more;
## still have 12 attributes left.
## 13. run of importance source...
## 14. run of importance source...
## 15. run of importance source...
## 16. run of importance source...
## After 16 iterations, +2.2 secs:
## rejected 2 attributes: DailyRate, DistanceFromHome;
## still have 10 attributes left.
## 17. run of importance source...
## 18. run of importance source...
## 19. run of importance source...
## 20. run of importance source...
## After 20 iterations, +2.6 secs:
## rejected 1 attribute: Education;
## still have 9 attributes left.
## 21. run of importance source...
## 22. run of importance source...
## 23. run of importance source...
## After 23 iterations, +2.9 secs:
## rejected 1 attribute: YearsSinceLastPromotion;
## still have 8 attributes left.
## 24. run of importance source...
## 25. run of importance source...
## 26. run of importance source...
## After 26 iterations, +3.2 secs:
## confirmed 1 attribute: WorkLifeBalance;
## still have 7 attributes left.
## 27. run of importance source...
## 28. run of importance source...
## 29. run of importance source...
## After 29 iterations, +3.6 secs:
## rejected 2 attributes: EducationField, RelationshipSatisfaction;
## still have 5 attributes left.
## 30. run of importance source...
## 31. run of importance source...
## 32. run of importance source...
## After 32 iterations, +3.8 secs:
## rejected 1 attribute: PercentSalaryHike;
## still have 4 attributes left.
## 33. run of importance source...
## 34. run of importance source...
## 35. run of importance source...
## 36. run of importance source...
## 37. run of importance source...
## 38. run of importance source...
## 39. run of importance source...
## 40. run of importance source...
## 41. run of importance source...
## 42. run of importance source...
## 43. run of importance source...
## 44. run of importance source...
## 45. run of importance source...
## 46. run of importance source...
## 47. run of importance source...
## 48. run of importance source...
## 49. run of importance source...
## 50. run of importance source...
## 51. run of importance source...
## 52. run of importance source...
## 53. run of importance source...
## 54. run of importance source...
## 55. run of importance source...
## 56. run of importance source...
## 57. run of importance source...
## 58. run of importance source...
## 59. run of importance source...
## 60. run of importance source...
## 61. run of importance source...
## 62. run of importance source...
## 63. run of importance source...
## 64. run of importance source...
## 65. run of importance source...
## 66. run of importance source...
## 67. run of importance source...
## 68. run of importance source...
## 69. run of importance source...
## 70. run of importance source...
## 71. run of importance source...
## 72. run of importance source...
## 73. run of importance source...
## 74. run of importance source...
## After 74 iterations, +7.7 secs:
## confirmed 1 attribute: Department;
## still have 3 attributes left.
## 75. run of importance source...
## 76. run of importance source...
## 77. run of importance source...
## 78. run of importance source...
## 79. run of importance source...
## 80. run of importance source...
## 81. run of importance source...
## 82. run of importance source...
## 83. run of importance source...
## 84. run of importance source...
## 85. run of importance source...
## 86. run of importance source...
## 87. run of importance source...
## 88. run of importance source...
## 89. run of importance source...
## 90. run of importance source...
## 91. run of importance source...
## 92. run of importance source...
## 93. run of importance source...
## 94. run of importance source...
## 95. run of importance source...
## 96. run of importance source...
## 97. run of importance source...
## 98. run of importance source...
## 99. run of importance source...
boruta_signif <- names(boruta_output$finalDecision[boruta_output$finalDecision %in% c("Confirmed", "Tentative")]) #collect Confirmed and Tentative variables
#view(boruta_signif) #view sig var
# Results
# 1. Department
# 2. EnvironmentSatisfaction
# 3. JobInvolvement
# 4. JobLevel
# 5. JobRole
# 6. JobSatisfaction
# 7. MaritalStatus
# 8. OverTime
# 9. StockOptionLevel
# 10. WorkLifeBalance
# 11. Age
# 12 MonthlyIncome
# 13 NumCompaniesWorked
# 14 TotalWorkingYears
# 15 YearsAtCompany
# 16 YearsInCurrentRole
# 17 YearsSinceLastPromotion
# 18 YearsWithCurrManager
plot(boruta_output, cex.axis=.7, las=2, xlab="",
main="Variable Importance to Attrition (Boruta Method)") #plot results
# Top 3
# 1. Overtime
# 2. MonthlyIncome
# 3. StockOptionLevel
####### Additional EDA Based on Boruta Results and Personal Interest #####
fritos <- ddsBinded[,c("Attrition",
"Department",
"EnvironmentSatisfaction",
"JobInvolvement",
"JobLevel",
"JobRole",
"JobSatisfaction",
"MaritalStatus",
"OverTime",
"StockOptionLevel",
"WorkLifeBalance",
"Age",
"MonthlyIncome",
"NumCompaniesWorked",
"TotalWorkingYears",
"YearsAtCompany",
"YearsInCurrentRole",
"YearsSinceLastPromotion",
"YearsWithCurrManager")]
str(fritos)
## 'data.frame': 870 obs. of 19 variables:
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
## $ EnvironmentSatisfaction: Factor w/ 4 levels "1","2","3","4": 2 3 3 3 1 4 2 4 3 4 ...
## $ JobInvolvement : Factor w/ 4 levels "1","2","3","4": 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : Factor w/ 5 levels "1","2","3","4",..: 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : chr "Sales Exec" "Res. Director" "Manufact Director" "Sales Exec" ...
## $ JobSatisfaction : Factor w/ 4 levels "1","2","3","4": 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
## $ StockOptionLevel : Factor w/ 4 levels "0","1","2","3": 2 1 1 3 1 3 1 4 2 2 ...
## $ WorkLifeBalance : Factor w/ 4 levels "1","2","3","4": 2 4 3 3 3 2 2 3 3 2 ...
## $ Age : num 32 40 35 32 24 27 41 37 34 34 ...
## $ MonthlyIncome : num 4403 19626 9362 10422 3760 ...
## $ NumCompaniesWorked : num 2 1 2 1 1 1 2 2 1 1 ...
## $ TotalWorkingYears : num 8 21 10 14 6 9 7 8 1 8 ...
## $ YearsAtCompany : num 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : num 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion: num 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : num 3 9 2 7 3 7 3 0 0 7 ...
# Let's see how these 18 factors stack up together for significance
glmTest <- glm(Attrition~., data=fritos, family = binomial)
summary(glmTest)
##
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = fritos)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1993 -0.4391 -0.1961 -0.0541 3.5362
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.016e+01 7.420e+02 -0.014 0.989075
## DepartmentResearch & Development 1.390e+01 7.420e+02 0.019 0.985058
## DepartmentSales 1.398e+01 7.420e+02 0.019 0.984965
## EnvironmentSatisfaction2 -1.385e+00 3.966e-01 -3.492 0.000479 ***
## EnvironmentSatisfaction3 -1.096e+00 3.508e-01 -3.123 0.001789 **
## EnvironmentSatisfaction4 -1.135e+00 3.503e-01 -3.241 0.001190 **
## JobInvolvement2 -1.470e+00 4.743e-01 -3.099 0.001941 **
## JobInvolvement3 -2.043e+00 4.495e-01 -4.545 5.49e-06 ***
## JobInvolvement4 -2.051e+00 6.179e-01 -3.320 0.000901 ***
## JobLevel2 -8.189e-01 6.531e-01 -1.254 0.209888
## JobLevel3 7.751e-01 1.012e+00 0.766 0.443921
## JobLevel4 4.051e-01 1.691e+00 0.240 0.810714
## JobLevel5 4.246e+00 2.226e+00 1.907 0.056478 .
## JobRoleHR 1.422e+01 7.420e+02 0.019 0.984709
## JobRoleLab Tech 4.336e-02 7.550e-01 0.057 0.954200
## JobRoleManager -1.184e+00 1.463e+00 -0.809 0.418305
## JobRoleManufact Director -1.315e+00 8.847e-01 -1.486 0.137234
## JobRoleRes. Director -3.042e+00 1.571e+00 -1.936 0.052824 .
## JobRoleRes. Scientist -4.952e-01 7.654e-01 -0.647 0.517666
## JobRoleSales Exec 4.880e-01 1.431e+00 0.341 0.732986
## JobRoleSales Rep 1.113e+00 1.554e+00 0.717 0.473630
## JobSatisfaction2 -5.991e-01 3.777e-01 -1.586 0.112701
## JobSatisfaction3 -5.090e-01 3.348e-01 -1.520 0.128489
## JobSatisfaction4 -1.524e+00 3.714e-01 -4.102 4.09e-05 ***
## MaritalStatusMarried 1.219e+00 4.310e-01 2.828 0.004687 **
## MaritalStatusSingle 1.172e+00 5.597e-01 2.093 0.036308 *
## OverTimeYes 2.229e+00 2.754e-01 8.094 5.77e-16 ***
## StockOptionLevel1 -1.233e+00 4.033e-01 -3.056 0.002240 **
## StockOptionLevel2 -1.309e+00 7.150e-01 -1.830 0.067207 .
## StockOptionLevel3 3.554e-01 5.710e-01 0.622 0.533695
## WorkLifeBalance2 -1.397e+00 4.887e-01 -2.858 0.004262 **
## WorkLifeBalance3 -1.794e+00 4.601e-01 -3.899 9.64e-05 ***
## WorkLifeBalance4 -2.099e+00 5.983e-01 -3.508 0.000451 ***
## Age -3.178e-02 1.822e-02 -1.744 0.081110 .
## MonthlyIncome -1.295e-04 1.335e-04 -0.970 0.331800
## NumCompaniesWorked 1.801e-01 5.363e-02 3.357 0.000787 ***
## TotalWorkingYears -6.153e-02 4.267e-02 -1.442 0.149317
## YearsAtCompany 5.280e-02 5.670e-02 0.931 0.351733
## YearsInCurrentRole -1.040e-01 6.795e-02 -1.530 0.125941
## YearsSinceLastPromotion 2.540e-01 6.393e-02 3.973 7.10e-05 ***
## YearsWithCurrManager -1.555e-01 6.521e-02 -2.385 0.017085 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 767.67 on 869 degrees of freedom
## Residual deviance: 455.40 on 829 degrees of freedom
## AIC: 537.4
##
## Number of Fisher Scoring iterations: 15
car::vif(glmTest)
## GVIF Df GVIF^(1/(2*Df))
## Department 4.845337e+07 2 83.431681
## EnvironmentSatisfaction 1.324113e+00 3 1.047902
## JobInvolvement 1.262361e+00 3 1.039594
## JobLevel 1.255047e+02 4 1.829500
## JobRole 1.535226e+09 8 3.750902
## JobSatisfaction 1.338698e+00 3 1.049817
## MaritalStatus 2.839925e+00 2 1.298155
## OverTime 1.302421e+00 1 1.141236
## StockOptionLevel 2.966410e+00 3 1.198685
## WorkLifeBalance 1.369853e+00 3 1.053850
## Age 1.812045e+00 1 1.346122
## MonthlyIncome 1.910313e+01 1 4.370712
## NumCompaniesWorked 1.499120e+00 1 1.224386
## TotalWorkingYears 5.742731e+00 1 2.396400
## YearsAtCompany 7.225288e+00 1 2.687990
## YearsInCurrentRole 3.274701e+00 1 1.809613
## YearsSinceLastPromotion 3.153323e+00 1 1.775760
## YearsWithCurrManager 3.088774e+00 1 1.757491
plot(glmTest, which=4) # Cook's d plot
plot(glmTest, which=2) # Normal Q-Q Plot
############################################################
############################################################
## ##
## T-test and Chi-Sq Tests ##
## ##
############################################################
# t-Test Independent Numerical Variables vs Attrition
age <- t.test(ddsBinded$Age~ddsBinded$Attrition)
dayrate <- t.test(ddsBinded$DailyRate~ddsBinded$Attrition)
disthom <- t.test(ddsBinded$DistanceFromHome~ddsBinded$Attrition)
hrrt <- t.test(ddsBinded$HourlyRate~ddsBinded$Attrition)
moninc <- t.test(ddsBinded$MonthlyIncome~ddsBinded$Attrition)
numcowrk <- t.test(ddsBinded$NumCompaniesWorked~ddsBinded$Attrition)
percSal <- t.test(ddsBinded$PercentSalaryHike~ddsBinded$Attrition)
totwork <- t.test(ddsBinded$TotalWorkingYears~ddsBinded$Attrition)
yrsco <- t.test(ddsBinded$YearsAtCompany~ddsBinded$Attrition)
yrsinrole <- t.test(ddsBinded$YearsInCurrentRole~ddsBinded$Attrition)
tTestdf <- data.frame(Attrition = c("Age", "DayRate", "DistanceFromHome", "HourlyRate", "MonthlyIncome", "NumCompaniesWorked","PercentSalaryHike", "TotalWorkingYears", "YearsAtCompany","YearsInCurrentRole"),
statistic = c(age$statistic,dayrate$statistic,disthom$statistic,hrrt$statistic,moninc$statistic,
numcowrk$statistic,percSal$statistic,totwork$statistic,yrsco$statistic,yrsinrole$statistic),
parameter = c(age$parameter,dayrate$parameter,disthom$parameter,hrrt$parameter,moninc$parameter,
numcowrk$parameter,percSal$parameter,totwork$parameter,yrsco$parameter,yrsinrole$parameter),
pvalue = c(age$p.value,dayrate$p.value,disthom$p.value,hrrt$p.value,moninc$p.value,
numcowrk$p.value,percSal$p.value,totwork$p.value,yrsco$p.value,yrsinrole$p.value))
# Test Multiple variables against Attrition
features1 <- ddsBinded %>% select_if(is.numeric) %>%
pivot_longer(., cols=c(2:13), names_to = "variable", values_to="value") #Combine Numeric Variables into a stack
features1 <- cbind(ddsBinded$Attrition, features1) # add Attrition into the dataframe
names(features1)[1] <- "Attrition" #correct column name
str(features1)
## 'data.frame': 10440 obs. of 4 variables:
## $ Attrition: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : num 32 32 32 32 32 32 32 32 32 32 ...
## $ variable : chr "DailyRate" "DistanceFromHome" "HourlyRate" "MonthlyIncome" ...
## $ value : num 117 13 73 4403 9250 ...
stat.test <- features1 %>% group_by(variable) %>%
t_test(value~Attrition) %>%
adjust_pvalue(method = "bonferroni") %>%
add_significance()
# Chi-Square Tests for categorical variables vs Attrition
bt <- chisq.test(table(ddsBinded$Attrition, ddsBinded$BusinessTravel))
dep <- chisq.test(table(ddsBinded$Attrition,ddsBinded$Department))
edu <- chisq.test(table(ddsBinded$Attrition,ddsBinded$Education))
## Warning in chisq.test(table(ddsBinded$Attrition, ddsBinded$Education)): Chi-
## squared approximation may be incorrect
edufie <- chisq.test(table(ddsBinded$Attrition,ddsBinded$EducationField))
## Warning in chisq.test(table(ddsBinded$Attrition, ddsBinded$EducationField)):
## Chi-squared approximation may be incorrect
envsat <- chisq.test(table(ddsBinded$Attrition,ddsBinded$EnvironmentSatisfaction))
gender <- chisq.test(table(ddsBinded$Attrition,ddsBinded$Gender))
jobinv <- chisq.test(table(ddsBinded$Attrition,ddsBinded$JobInvolvement))
joblev <- chisq.test(table(ddsBinded$Attrition,ddsBinded$JobLevel))
jobro <- chisq.test(table(ddsBinded$Attrition,ddsBinded$JobRole))
## Warning in chisq.test(table(ddsBinded$Attrition, ddsBinded$JobRole)): Chi-
## squared approximation may be incorrect
jobsat <- chisq.test(table(ddsBinded$Attrition,ddsBinded$JobSatisfaction))
marstat <- chisq.test(table(ddsBinded$Attrition,ddsBinded$MaritalStatus))
OvrT <- chisq.test(table(ddsBinded$Attrition,ddsBinded$OverTime))
perrat <- chisq.test(table(ddsBinded$Attrition,ddsBinded$PerformanceRating))
relsat <- chisq.test(table(ddsBinded$Attrition,ddsBinded$RelationshipSatisfaction))
stoop <- chisq.test(table(ddsBinded$Attrition,ddsBinded$StockOptionLevel))
ttly <- chisq.test(table(ddsBinded$Attrition,ddsBinded$TrainingTimesLastYear))
## Warning in chisq.test(table(ddsBinded$Attrition,
## ddsBinded$TrainingTimesLastYear)): Chi-squared approximation may be incorrect
wlb <- chisq.test(table(ddsBinded$Attrition,ddsBinded$WorkLifeBalance))
dailyrate <- chisq.test(table(ddsBinded$Attrition,ddsBinded$DailyRate))
## Warning in chisq.test(table(ddsBinded$Attrition, ddsBinded$DailyRate)): Chi-
## squared approximation may be incorrect
moninc <- chisq.test(table(ddsBinded$Attrition, ddsBinded$MonthlyIncome))
## Warning in chisq.test(table(ddsBinded$Attrition, ddsBinded$MonthlyIncome)): Chi-
## squared approximation may be incorrect
namesDF <- names(ddsDF_cat)
namesDF <- namesDF[-1]
namesDF
## [1] "BusinessTravel" "Department"
## [3] "Education" "EducationField"
## [5] "EnvironmentSatisfaction" "Gender"
## [7] "JobInvolvement" "JobLevel"
## [9] "JobRole" "JobSatisfaction"
## [11] "MaritalStatus" "OverTime"
## [13] "PerformanceRating" "RelationshipSatisfaction"
## [15] "StockOptionLevel" "TrainingTimesLastYear"
## [17] "WorkLifeBalance"
# dataframe of ChiSquared Pvalues
ChiDF <- data.frame(pvalue = c(bt$p.value, dep$p.value, edu$p.value, edufie$p.value, envsat$p.value, gender$p.value,
jobinv$p.value, joblev$p.value, jobro$p.value, jobsat$p.value, marstat$p.value,
OvrT$p.value, perrat$p.value, relsat$p.value, stoop$p.value, ttly$p.value, wlb$p.value),
parameter = c(bt$parameter, dep$parameter, edu$parameter, edufie$parameter, envsat$parameter,
gender$parameter, jobinv$parameter, joblev$parameter, jobro$parameter, jobsat$parameter,
marstat$parameter, OvrT$parameter, perrat$parameter, relsat$parameter, stoop$parameter,
ttly$parameter, wlb$parameter),
statistic = c(bt$statistic, dep$statistic, edu$statistic, edufie$statistic, envsat$statistic,
gender$statistic, jobinv$statistic, joblev$statistic, jobro$statistic, jobsat$statistic,
marstat$statistic, OvrT$statistic, perrat$statistic, relsat$statistic, stoop$statistic, ttly$statistic,
wlb$statistic),
vsAttrition = namesDF)
ChiDF <- ChiDF[,c(4,3,2,1)]
ChiDF$pvalue <- as.numeric(ChiDF$pvalue)
ChiDF$vsAttrition <- as.character(ChiDF$vsAttrition)
ChiDF
## vsAttrition statistic parameter pvalue
## 1 BusinessTravel 5.9944524 2 4.992536e-02
## 2 Department 9.3290395 2 9.423773e-03
## 3 Education 2.6143467 4 6.242838e-01
## 4 EducationField 6.4114004 5 2.682198e-01
## 5 EnvironmentSatisfaction 11.2308474 3 1.054090e-02
## 6 Gender 0.4236332 1 5.151297e-01
## 7 JobInvolvement 41.4648341 3 5.211041e-09
## 8 JobLevel 41.5328455 4 2.084703e-08
## 9 JobRole 60.5429583 8 3.646836e-10
## 10 JobSatisfaction 11.1089264 3 1.115122e-02
## 11 MaritalStatus 34.4062337 2 3.378946e-08
## 12 OverTime 62.7616454 1 2.332981e-15
## 13 PerformanceRating 0.1047771 1 7.461706e-01
## 14 RelationshipSatisfaction 3.1252680 3 3.727117e-01
## 15 StockOptionLevel 56.2449858 3 3.724464e-12
## 16 TrainingTimesLastYear 10.1319456 6 1.192044e-01
## 17 WorkLifeBalance 14.3245375 3 2.495090e-03
#### More EDA ###
# Percentage of No vs Yes
Perc_Att <- dds %>% group_by(Attrition) %>% count()
str(Perc_Att)
## tibble [2 x 2] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ Attrition: chr [1:2] "No" "Yes"
## $ n : int [1:2] 730 140
## - attr(*, "groups")= tibble [2 x 2] (S3: tbl_df/tbl/data.frame)
## ..$ Attrition: chr [1:2] "No" "Yes"
## ..$ .rows : list<int> [1:2]
## .. ..$ : int 1
## .. ..$ : int 2
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
Perc_Att$Percent <- Perc_Att$n/length(dds$Age)
# 83.9% No Attrition 16.1% Yes Attrition
# Let's look at some summaries overall and then of people that left the company
ddsSummary <- data.frame(summary(dds))
ddsSummary <- na.omit(ddsSummary)
ddsSummary <- ddsSummary %>% separate(Freq, into = c("Variable", "Value"), sep = ":")
ddsSummary <- ddsSummary[,-1]
ddsSummary <- ddsSummary %>% unite(Var, Var2, Variable, sep = " ")
#view(ddsSummary)
# Find summaries of attribution = "yes"
ddsSumAtt <- dds %>% dplyr::filter(Attrition == "Yes") %>%
summary(dds)
ddsSumAtt <- as.data.frame(ddsSumAtt)
ddsSumAtt <- na.omit(ddsSumAtt)
ddsSumAtt <- ddsSumAtt %>% separate(Freq, into = c("Variable", "Value"), sep = ":")
ddsSumAtt <- ddsSumAtt[,-1]
ddsSumAtt <- ddsSumAtt %>% unite(Var, Var2, Variable, sep = " ")
# Find summaries of attribution = "no"
ddsSumAttNo <- dds %>% dplyr::filter(Attrition == "No") %>%
summary(dds)
ddsSumAttNo <- as.data.frame(ddsSumAttNo)
ddsSumAttNo <- na.omit(ddsSumAttNo)
ddsSumAttNo <- ddsSumAttNo %>% separate(Freq, into = c("Variable", "Value"), sep = ":")
ddsSumAttNo <- ddsSumAttNo[,-1]
ddsSumAttNo <- ddsSumAttNo %>% unite(Var, Var2, Variable, sep = " ")
# Create one master Yes vs No for numeric values for comparison
ddsSumMerge <- merge(ddsSumAtt, ddsSumAttNo, by = "Var")
ddsSumMerge <- rename(ddsSumMerge, "Att=Yes"="Value.x", "Att=No" = "Value.y")
ddsSumMerge <- ddsSumMerge[-c(1:6, 13:42, 49:54, 61:81, 88:90, 97:111,
130:132, 163:174),]
write.csv(ddsSumMerge, "SumMerge.csv")
ddsColMeansYes <- dds %>% dplyr::filter(Attrition == "Yes") %>%
dplyr::select(Age, DailyRate, HourlyRate,
MonthlyRate, DistanceFromHome,
EnvironmentSatisfaction,
JobInvolvement,
JobSatisfaction,
MonthlyIncome,
NumCompaniesWorked,
PerformanceRating,
RelationshipSatisfaction,
TotalWorkingYears,
TrainingTimesLastYear,
WorkLifeBalance,
YearsAtCompany,
YearsInCurrentRole,
YearsSinceLastPromotion,
YearsWithCurrManager) %>%
colMeans()
#view(ddsColMeansYes)
ddsColMeansYes <- as.data.frame((ddsColMeansYes))
ddsColMeansNo <- dds %>% dplyr::filter(Attrition == "No") %>%
dplyr::select(Age, DailyRate, HourlyRate,
MonthlyRate, DistanceFromHome,
EnvironmentSatisfaction,
JobInvolvement,
JobSatisfaction,
MonthlyIncome,
NumCompaniesWorked,
PerformanceRating,
RelationshipSatisfaction,
TotalWorkingYears,
TrainingTimesLastYear,
WorkLifeBalance,
YearsAtCompany,
YearsInCurrentRole,
YearsSinceLastPromotion,
YearsWithCurrManager) %>%
colMeans()
ddsColMeansNo <- as.data.frame((ddsColMeansNo))
ddsColMeans <- cbind(ddsColMeansYes, ddsColMeansNo)
ddsColMeans <- rename(ddsColMeans, "AttYes" = "(ddsColMeansYes)", "AttNo"="(ddsColMeansNo)")
ddsColMeans$Ratio <- (1 - ddsColMeans$AttYes / ddsColMeans$AttNo )
ddsColMeans$Diff <- ddsColMeans$AttNo - ddsColMeans$AttYes
ddsRN <- rownames(ddsColMeans)
ddsColMeans <- cbind(ddsRN, ddsColMeans)
ddsColMeans <- rename(ddsColMeans, "Attributes" = "ddsRN")
rownames(ddsColMeans) <- 1:nrow(ddsColMeans)
# Plot to show ratios between mean values of Att=Yes vs Att=No
ddsColMeans %>% ggplot(aes(x=Ratio, y=Attributes)) +
geom_point() +
geom_segment(aes(x=Ratio, xend=0, y=Attributes, yend=Attributes)) +
labs(title = "Yes to No Attrition Ratio by Employee Attribute",
subtitle = "Positive Means No is X% larger than Yes") +
geom_text(aes(label=formattable::percent(Ratio,1)), size = 3, nudge_y = .4) +
xlab("Ratio Yes:No") +
theme_classic()
# Plot to show differences in mean values of Att=Yes vs Att=No
ddsColMeans[-c(2,4,9),] %>%
ggplot(aes(x=Diff, y=Attributes)) +
geom_point() +
geom_segment(aes(x=Diff, xend=0, y=Attributes, yend=Attributes)) +
labs(title = "Yes to No Attrition Mean Different by Employee Attribute",
subtitle = "Positive Means No is larger than Yes") +
geom_text(aes(label=round(Diff, 2)), size = 3, nudge_y = .4) +
xlab("Difference = No - Yes") +
theme_classic()
# Plot to show differences in mean values of Att=Yes vs Att=No for Earnings
ddsColMeans[c(2,4,9),] %>%
ggplot(aes(x=Diff, y=Attributes)) +
geom_point() +
geom_segment(aes(x=Diff, xend=0, y=Attributes, yend=Attributes)) +
labs(title = "Yes to No Attrition Mean Differences by Employee Attribute",
subtitle = "Positive Means No is larger than Yes") +
geom_text(aes(label=scales::dollar(round(Diff, 2))), size = 3, nudge_y = .1) +
xlab("Difference = No - Yes") +
theme_classic()
##### Review the top 3 Factors that contribute to Attrition #####
############## Overtime #################################
# Plot Overtime by Percent vs Job Roles
ddsPlot <- ddsBinded %>% group_by(JobRole, OverTime) %>% #OT by JobRole
summarize(count = n()) %>% # count recordc by JobRoles
mutate(pct = count/sum(count)) # find percent total
## `summarise()` regrouping output by 'JobRole' (override with `.groups` argument)
# Overall OT for the company
OT <- ddsBinded %>% group_by(OverTime) %>% #OT for whole dataset
summarize(count = n()) %>% # count recordc by JobRoles
mutate(pct = count/sum(count)) # find percent total
## `summarise()` ungrouping output (override with `.groups` argument)
# plot the graph
ddsPlot %>%
ggplot(aes(x=JobRole, y=pct, fill = OverTime)) +
geom_bar(stat = "identity") +
labs(title = "Overtime by JobRole",
y="Percentage of Overtime",
x="JobRole") +
geom_text(aes(label=scales::percent(pct)), position = position_stack(vjust = .5)) +
scale_x_discrete(labels = function(x) str_wrap(x, width=10)) +
theme_bw() +
theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
OTValueYes <- ddsBinded %>% filter(OverTime == "Yes") %>%
aggregate(MonthlyIncome~JobRole, ., mean)
OTValueNo <- ddsBinded %>% filter(OverTime == "No") %>%
aggregate(MonthlyIncome~JobRole, ., mean)
aggOTV <- aggregate(MonthlyIncome~OverTime, ddsBinded, mean)
dff <- aggOTV$MonthlyIncome[1]-aggOTV$MonthlyIncome[2]
dff
## [1] 255.9768
OTV <- merge(x=OTValueYes, y=OTValueNo, by = "JobRole")
names(OTV)[2] <- "OT:Yes Monthly Income"
names(OTV)[3] <- "OT:No Monthly Income"
OTV
## JobRole OT:Yes Monthly Income OT:No Monthly Income
## 1 Healthcare Rep 7613.409 7362.796
## 2 HR 3581.333 3199.857
## 3 Lab Tech 3487.303 3148.858
## 4 Manager 16617.200 17338.610
## 5 Manufact Director 7876.609 7371.750
## 6 Res. Director 15632.312 15803.657
## 7 Res. Scientist 3342.862 3208.589
## 8 Sales Exec 6831.898 6916.837
## 9 Sales Rep 2369.000 2798.800
#################### Monthly Income ################
# Look at MonthlyIncome
# Overall MonthlyIncomes for the company
Monthly <- ddsBinded %>% group_by() %>% #OT for whole dataset
summarize(count = n()) %>% # count recordc by JobRoles
mutate(pct = count/sum(count)) # find percent total
# plot the graph
means <- aggregate(MonthlyIncome~JobRole, ddsBinded, mean)
ddsBinded %>%
ggplot(aes(x=JobRole,y=MonthlyIncome)) +
geom_boxplot(aes(color = JobRole), show.legend = FALSE) +
labs(title = "Monthly Incomes by JobRole",
y="Monthly Income",
x="JobRole") +
scale_y_continuous(labels=scales::dollar_format()) +
#geom_text(aes(label=scales::percent(pct)), position = position_stack(vjust = .5)) +
theme_classic() +
stat_summary(fun=mean, color="green", geom="point",
shape=18, size=3,show.legend = FALSE) +
geom_text(data = means, aes(label = scales::dollar(MonthlyIncome)),vjust = -1, size=3, color="darkgreen") +
scale_x_discrete(labels = function(x) str_wrap(x, width=10))
############### Explore Stock options ###################
# Stock Option Levels by Job Role
Opts <- ddsBinded %>% group_by(JobRole, StockOptionLevel) %>% #Stock option levels by JobRole
summarize(count = n()) %>% # count records by JobRoles
mutate(pct = count/sum(count)) # find percent total
## `summarise()` regrouping output by 'JobRole' (override with `.groups` argument)
# Overall Stock Options Levels for the company
Stocks <- ddsBinded %>% group_by(StockOptionLevel) %>% #stock options levels for whole dataset
summarize(count = n()) %>% # count records by JobRoles
mutate(pct = count/sum(count)) # find percent total
## `summarise()` ungrouping output (override with `.groups` argument)
# Plot Percentages
Opts %>%
ggplot(aes(x=JobRole, y=pct, fill = StockOptionLevel)) +
geom_bar(stat = "identity", position = position_stack(reverse = TRUE)) +
labs(title = "Stock Option Levels by JobRole",
y="Percentage of Stock Option Levels",
x="JobRole") +
geom_text(aes(label=scales::percent(pct, .1)), position = position_stack(vjust = .5, reverse = TRUE), size = 3) +
scale_x_discrete(labels = function(x) str_wrap(x, width=10)) +
scale_fill_discrete(guide=guide_legend(reverse = TRUE)) +
theme_bw() +
theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
######### Job Specific Trends ############
bbq <- ddsBinded
positions <- c(6,9,11,12,16,17,18)
bbq[,c(positions)] <- sapply(bbq[,c(positions)],as.numeric)
bbqPlot <- cbind(bbq$JobRole, bbq[,c(positions)])
names(bbqPlot)[1] <- "JobRole"
test = colMeans(bbqPlot[,c(2:8)])
# Plot each variable by JobRole
bbqPlot %>% pivot_longer(., cols=c(2:8), names_to = "variable", values_to="value") %>% #combines variables into one large variable
ggplot(aes(x=JobRole, y=value, fill=variable)) +
geom_bar(stat = "identity", position = "dodge", show.legend = TRUE) +
theme_bw() +
scale_x_discrete(labels = function(x) str_wrap(x, width=10)) +
coord_flip()
# Plot each JobRole by Variable
bbqPlot %>% pivot_longer(., cols=c(2:8), names_to = "variable", values_to="value") %>% #combines variables into one large variable
ggplot(aes(x=variable, y=value, fill=JobRole)) +
geom_bar(stat = "identity", position = "dodge", show.legend = TRUE) +
theme_bw() +
scale_x_discrete(labels = function(x) str_wrap(x, width=10)) +
coord_flip() +
labs(title = "Average Category Ratings by Job Role",
y = "Job Roles",
x = "Average of Ratings for Each Category")
### Plot the rest of the variables
bbqPlot2 <- bbq[,c(10, 21, 25:31)]
bbqPlot2 <- aggregate(.~JobRole, data=bbqPlot2, mean) #get colMeans for each variable by JobRole
# Plot each JobRole by Variable (age and monthly income should be run alone)
bbqPlot2 %>% pivot_longer(., cols=c(2:8), names_to = "variable", values_to="value") %>% #combines variables into one large variable
ggplot(aes(x=variable, y=value, fill=JobRole)) +
geom_bar(stat = "identity", position = "dodge", show.legend = TRUE) +
theme_bw() +
scale_x_discrete(labels = function(x) str_wrap(x, width=10)) +
coord_flip() +
labs(title = "Average Category Values by Job Role",
y = "Job Roles",
x = "Average of Value for Each Category")
##############################
# #
## Naive Bayes ##
### Model Fitting ###
## ##
##############################
fritoLay <- ddsBinded[,c("Attrition",
"Department",
"JobInvolvement",
"JobLevel",
"JobRole",
"JobSatisfaction",
"MaritalStatus",
"OverTime",
"StockOptionLevel",
"WorkLifeBalance",
"Age",
"MonthlyIncome",
"NumCompaniesWorked",
"TotalWorkingYears",
"YearsAtCompany",
"YearsInCurrentRole",
"YearsWithCurrManager")]
iterations = 200
masterAcc = matrix(nrow = iterations)
masterSen <- matrix(nrow = iterations)
masterSpec <- matrix(nrow = iterations)
splitPerc = .7 #Training / Test split Percentage
for(j in 1:iterations)
{ trainInd = createDataPartition(fritoLay$Attrition, p = splitPerc, list = FALSE)
train = fritoLay[trainInd,]
test = fritoLay[-trainInd,]
model = naiveBayes(train[,-1], train[,1], laplace = 0)
table(predict(model,test[,-1]), test[,1])
CM = confusionMatrix(table(predict(model,test[,-1]),test[,1]))
masterAcc[j] <- CM$overall[1]
masterSen[j] <- CM$byClass[1]
masterSpec[j] <- CM$byClass[2]
}
MeanAcc <- colMeans(masterAcc)
MeanSen <- colMeans(masterSen)
MeanSpec <- colMeans(masterSpec)
MeanAcc
## [1] 0.8069732
MeanSen
## [1] 0.8458447
MeanSpec
## [1] 0.6042857
CM
## Confusion Matrix and Statistics
##
##
## No Yes
## No 182 15
## Yes 37 27
##
## Accuracy : 0.8008
## 95% CI : (0.7471, 0.8475)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 0.958464
##
## Kappa : 0.3911
##
## Mcnemar's Test P-Value : 0.003589
##
## Sensitivity : 0.8311
## Specificity : 0.6429
## Pos Pred Value : 0.9239
## Neg Pred Value : 0.4219
## Prevalence : 0.8391
## Detection Rate : 0.6973
## Detection Prevalence : 0.7548
## Balanced Accuracy : 0.7370
##
## 'Positive' Class : No
##
#### BOOOM WE are in the money! ###
# Accuracy : 0.8352
# 95% CI : (0.7846, 0.8781)
# No Information Rate : 0.8391
# P-Value [Acc > NIR] : 0.6066
#
# Kappa : 0.4483
#
# Mcnemar's Test P-Value : 0.1273
#
# Sensitivity : 0.8767
# Specificity : 0.6190
# Pos Pred Value : 0.9231
# Neg Pred Value : 0.4906
# Prevalence : 0.8391
# Detection Rate : 0.7356
# Detection Prevalence : 0.7969
# Balanced Accuracy : 0.7479
#
##############################
## ##
## Prepare the Competition ##
## Set of Data for ##
## Attrition ##
## ##
##############################
compSet <- read.csv("CaseStudy2CompSet No Attrition.csv")
str(compSet)
## 'data.frame': 300 obs. of 35 variables:
## $ ID : int 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 ...
## $ Age : int 35 33 26 55 29 51 52 39 31 31 ...
## $ BusinessTravel : chr "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" ...
## $ DailyRate : int 750 147 1330 1311 1246 1456 585 1387 1062 534 ...
## $ Department : chr "Research & Development" "Human Resources" "Research & Development" "Research & Development" ...
## $ DistanceFromHome : int 28 2 21 2 19 1 29 10 24 20 ...
## $ Education : int 3 3 3 3 3 4 4 5 3 3 ...
## $ EducationField : chr "Life Sciences" "Human Resources" "Medical" "Life Sciences" ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1596 1207 1107 505 1497 145 2019 1618 1252 587 ...
## $ EnvironmentSatisfaction : int 2 2 1 3 3 1 1 2 3 1 ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ HourlyRate : int 46 99 37 97 77 30 40 76 96 66 ...
## $ JobInvolvement : int 4 3 3 3 2 2 3 3 2 3 ...
## $ JobLevel : int 2 1 1 4 2 3 1 2 2 3 ...
## $ JobRole : chr "Laboratory Technician" "Human Resources" "Laboratory Technician" "Manager" ...
## $ JobSatisfaction : int 3 3 3 4 3 1 4 1 1 3 ...
## $ MaritalStatus : chr "Married" "Married" "Divorced" "Single" ...
## $ MonthlyIncome : int 3407 3600 2377 16659 8620 7484 3482 5377 6812 9824 ...
## $ MonthlyRate : int 25348 8429 19373 23258 23757 25796 19788 3835 17198 22908 ...
## $ NumCompaniesWorked : int 1 1 1 2 1 3 2 2 1 3 ...
## $ Over18 : chr "Y" "Y" "Y" "Y" ...
## $ OverTime : chr "No" "No" "No" "Yes" ...
## $ PercentSalaryHike : int 17 13 20 13 14 20 15 13 19 12 ...
## $ PerformanceRating : int 3 3 4 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 4 4 3 3 3 3 2 4 2 1 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 2 1 1 0 2 0 2 3 0 0 ...
## $ TotalWorkingYears : int 10 5 1 30 10 23 16 10 10 12 ...
## $ TrainingTimesLastYear : int 3 2 0 2 3 1 3 3 2 2 ...
## $ WorkLifeBalance : int 2 3 2 3 3 2 2 3 3 3 ...
## $ YearsAtCompany : int 10 5 1 5 10 13 9 7 10 1 ...
## $ YearsInCurrentRole : int 9 4 1 4 7 12 8 7 9 0 ...
## $ YearsSinceLastPromotion : int 6 1 0 1 0 12 0 7 1 0 ...
## $ YearsWithCurrManager : int 8 4 0 2 4 8 0 7 8 0 ...
#view(names(compSet))
sapply(compSet, function(x) sum(is.na(x)))
## ID Age BusinessTravel
## 0 0 0
## DailyRate Department DistanceFromHome
## 0 0 0
## Education EducationField EmployeeCount
## 0 0 0
## EmployeeNumber EnvironmentSatisfaction Gender
## 0 0 0
## HourlyRate JobInvolvement JobLevel
## 0 0 0
## JobRole JobSatisfaction MaritalStatus
## 0 0 0
## MonthlyIncome MonthlyRate NumCompaniesWorked
## 0 0 0
## Over18 OverTime PercentSalaryHike
## 0 0 0
## PerformanceRating RelationshipSatisfaction StandardHours
## 0 0 0
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear
## 0 0 0
## WorkLifeBalance YearsAtCompany YearsInCurrentRole
## 0 0 0
## YearsSinceLastPromotion YearsWithCurrManager
## 0 0
# Confirmed no NA's in the data
# Categorical Variable Manipulation
comp_cat <- compSet[,c(3,5,7,8,11,12,14:18,23,25,26,28,30,31)]
str(comp_cat)
## 'data.frame': 300 obs. of 17 variables:
## $ BusinessTravel : chr "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" ...
## $ Department : chr "Research & Development" "Human Resources" "Research & Development" "Research & Development" ...
## $ Education : int 3 3 3 3 3 4 4 5 3 3 ...
## $ EducationField : chr "Life Sciences" "Human Resources" "Medical" "Life Sciences" ...
## $ EnvironmentSatisfaction : int 2 2 1 3 3 1 1 2 3 1 ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ JobInvolvement : int 4 3 3 3 2 2 3 3 2 3 ...
## $ JobLevel : int 2 1 1 4 2 3 1 2 2 3 ...
## $ JobRole : chr "Laboratory Technician" "Human Resources" "Laboratory Technician" "Manager" ...
## $ JobSatisfaction : int 3 3 3 4 3 1 4 1 1 3 ...
## $ MaritalStatus : chr "Married" "Married" "Divorced" "Single" ...
## $ OverTime : chr "No" "No" "No" "Yes" ...
## $ PerformanceRating : int 3 3 4 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 4 4 3 3 3 3 2 4 2 1 ...
## $ StockOptionLevel : int 2 1 1 0 2 0 2 3 0 0 ...
## $ TrainingTimesLastYear : int 3 2 0 2 3 1 3 3 2 2 ...
## $ WorkLifeBalance : int 2 3 2 3 3 2 2 3 3 3 ...
# let's turn all the variables into factors
comp_cat <- comp_cat %>% dplyr::mutate_if(is.character, as.factor)
comp_cat <- comp_cat %>% dplyr::mutate_if(is.integer, as.factor)
str(comp_cat)
## 'data.frame': 300 obs. of 17 variables:
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 3 3 3 2 1 3 3 2 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 2 1 2 2 3 2 3 2 2 2 ...
## $ Education : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 3 3 4 4 5 3 3 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 1 4 2 2 4 2 4 4 2 ...
## $ EnvironmentSatisfaction : Factor w/ 4 levels "1","2","3","4": 2 2 1 3 3 1 1 2 3 1 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 2 1 2 2 1 2 ...
## $ JobInvolvement : Factor w/ 4 levels "1","2","3","4": 4 3 3 3 2 2 3 3 2 3 ...
## $ JobLevel : Factor w/ 5 levels "1","2","3","4",..: 2 1 1 4 2 3 1 2 2 3 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 3 2 3 4 8 1 9 5 1 1 ...
## $ JobSatisfaction : Factor w/ 4 levels "1","2","3","4": 3 3 3 4 3 1 4 1 1 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 2 2 1 3 1 3 1 2 3 2 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ PerformanceRating : Factor w/ 2 levels "3","4": 1 1 2 1 1 2 1 1 1 1 ...
## $ RelationshipSatisfaction: Factor w/ 4 levels "1","2","3","4": 4 4 3 3 3 3 2 4 2 1 ...
## $ StockOptionLevel : Factor w/ 4 levels "0","1","2","3": 3 2 2 1 3 1 3 4 1 1 ...
## $ TrainingTimesLastYear : Factor w/ 7 levels "0","1","2","3",..: 4 3 1 3 4 2 4 4 3 3 ...
## $ WorkLifeBalance : Factor w/ 4 levels "1","2","3","4": 2 3 2 3 3 2 2 3 3 3 ...
# Numerical Variable Manupulation
comp_num <- compSet[,c(2,4,6,13,19,20,21,24,29,32:35)]
#view(names(comp_num))
str(comp_num)
## 'data.frame': 300 obs. of 13 variables:
## $ Age : int 35 33 26 55 29 51 52 39 31 31 ...
## $ DailyRate : int 750 147 1330 1311 1246 1456 585 1387 1062 534 ...
## $ DistanceFromHome : int 28 2 21 2 19 1 29 10 24 20 ...
## $ HourlyRate : int 46 99 37 97 77 30 40 76 96 66 ...
## $ MonthlyIncome : int 3407 3600 2377 16659 8620 7484 3482 5377 6812 9824 ...
## $ MonthlyRate : int 25348 8429 19373 23258 23757 25796 19788 3835 17198 22908 ...
## $ NumCompaniesWorked : int 1 1 1 2 1 3 2 2 1 3 ...
## $ PercentSalaryHike : int 17 13 20 13 14 20 15 13 19 12 ...
## $ TotalWorkingYears : int 10 5 1 30 10 23 16 10 10 12 ...
## $ YearsAtCompany : int 10 5 1 5 10 13 9 7 10 1 ...
## $ YearsInCurrentRole : int 9 4 1 4 7 12 8 7 9 0 ...
## $ YearsSinceLastPromotion: int 6 1 0 1 0 12 0 7 1 0 ...
## $ YearsWithCurrManager : int 8 4 0 2 4 8 0 7 8 0 ...
comp_num[,c(1:13)] <- sapply(comp_num[,c(1:13)], as.numeric)
str(comp_num)
## 'data.frame': 300 obs. of 13 variables:
## $ Age : num 35 33 26 55 29 51 52 39 31 31 ...
## $ DailyRate : num 750 147 1330 1311 1246 ...
## $ DistanceFromHome : num 28 2 21 2 19 1 29 10 24 20 ...
## $ HourlyRate : num 46 99 37 97 77 30 40 76 96 66 ...
## $ MonthlyIncome : num 3407 3600 2377 16659 8620 ...
## $ MonthlyRate : num 25348 8429 19373 23258 23757 ...
## $ NumCompaniesWorked : num 1 1 1 2 1 3 2 2 1 3 ...
## $ PercentSalaryHike : num 17 13 20 13 14 20 15 13 19 12 ...
## $ TotalWorkingYears : num 10 5 1 30 10 23 16 10 10 12 ...
## $ YearsAtCompany : num 10 5 1 5 10 13 9 7 10 1 ...
## $ YearsInCurrentRole : num 9 4 1 4 7 12 8 7 9 0 ...
## $ YearsSinceLastPromotion: num 6 1 0 1 0 12 0 7 1 0 ...
## $ YearsWithCurrManager : num 8 4 0 2 4 8 0 7 8 0 ...
# Recombine the variable sets
compSetFormatted <- cbind(comp_cat, comp_num)
str(compSetFormatted)
## 'data.frame': 300 obs. of 30 variables:
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 3 3 3 2 1 3 3 2 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 2 1 2 2 3 2 3 2 2 2 ...
## $ Education : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 3 3 4 4 5 3 3 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 1 4 2 2 4 2 4 4 2 ...
## $ EnvironmentSatisfaction : Factor w/ 4 levels "1","2","3","4": 2 2 1 3 3 1 1 2 3 1 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 2 1 2 2 1 2 ...
## $ JobInvolvement : Factor w/ 4 levels "1","2","3","4": 4 3 3 3 2 2 3 3 2 3 ...
## $ JobLevel : Factor w/ 5 levels "1","2","3","4",..: 2 1 1 4 2 3 1 2 2 3 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 3 2 3 4 8 1 9 5 1 1 ...
## $ JobSatisfaction : Factor w/ 4 levels "1","2","3","4": 3 3 3 4 3 1 4 1 1 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 2 2 1 3 1 3 1 2 3 2 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ PerformanceRating : Factor w/ 2 levels "3","4": 1 1 2 1 1 2 1 1 1 1 ...
## $ RelationshipSatisfaction: Factor w/ 4 levels "1","2","3","4": 4 4 3 3 3 3 2 4 2 1 ...
## $ StockOptionLevel : Factor w/ 4 levels "0","1","2","3": 3 2 2 1 3 1 3 4 1 1 ...
## $ TrainingTimesLastYear : Factor w/ 7 levels "0","1","2","3",..: 4 3 1 3 4 2 4 4 3 3 ...
## $ WorkLifeBalance : Factor w/ 4 levels "1","2","3","4": 2 3 2 3 3 2 2 3 3 3 ...
## $ Age : num 35 33 26 55 29 51 52 39 31 31 ...
## $ DailyRate : num 750 147 1330 1311 1246 ...
## $ DistanceFromHome : num 28 2 21 2 19 1 29 10 24 20 ...
## $ HourlyRate : num 46 99 37 97 77 30 40 76 96 66 ...
## $ MonthlyIncome : num 3407 3600 2377 16659 8620 ...
## $ MonthlyRate : num 25348 8429 19373 23258 23757 ...
## $ NumCompaniesWorked : num 1 1 1 2 1 3 2 2 1 3 ...
## $ PercentSalaryHike : num 17 13 20 13 14 20 15 13 19 12 ...
## $ TotalWorkingYears : num 10 5 1 30 10 23 16 10 10 12 ...
## $ YearsAtCompany : num 10 5 1 5 10 13 9 7 10 1 ...
## $ YearsInCurrentRole : num 9 4 1 4 7 12 8 7 9 0 ...
## $ YearsSinceLastPromotion : num 6 1 0 1 0 12 0 7 1 0 ...
## $ YearsWithCurrManager : num 8 4 0 2 4 8 0 7 8 0 ...
names(compSetFormatted)
## [1] "BusinessTravel" "Department"
## [3] "Education" "EducationField"
## [5] "EnvironmentSatisfaction" "Gender"
## [7] "JobInvolvement" "JobLevel"
## [9] "JobRole" "JobSatisfaction"
## [11] "MaritalStatus" "OverTime"
## [13] "PerformanceRating" "RelationshipSatisfaction"
## [15] "StockOptionLevel" "TrainingTimesLastYear"
## [17] "WorkLifeBalance" "Age"
## [19] "DailyRate" "DistanceFromHome"
## [21] "HourlyRate" "MonthlyIncome"
## [23] "MonthlyRate" "NumCompaniesWorked"
## [25] "PercentSalaryHike" "TotalWorkingYears"
## [27] "YearsAtCompany" "YearsInCurrentRole"
## [29] "YearsSinceLastPromotion" "YearsWithCurrManager"
# Reduce the Competition Set to the appropriate variables
compSetFinal <- compSetFormatted[,c("Department",
"JobInvolvement",
"JobLevel",
"JobRole",
"JobSatisfaction",
"MaritalStatus",
"OverTime",
"StockOptionLevel",
"WorkLifeBalance",
"Age",
"MonthlyIncome",
"NumCompaniesWorked",
"TotalWorkingYears",
"YearsAtCompany",
"YearsInCurrentRole",
"YearsWithCurrManager")]
str(compSetFinal)
## 'data.frame': 300 obs. of 16 variables:
## $ Department : Factor w/ 3 levels "Human Resources",..: 2 1 2 2 3 2 3 2 2 2 ...
## $ JobInvolvement : Factor w/ 4 levels "1","2","3","4": 4 3 3 3 2 2 3 3 2 3 ...
## $ JobLevel : Factor w/ 5 levels "1","2","3","4",..: 2 1 1 4 2 3 1 2 2 3 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 3 2 3 4 8 1 9 5 1 1 ...
## $ JobSatisfaction : Factor w/ 4 levels "1","2","3","4": 3 3 3 4 3 1 4 1 1 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 2 2 1 3 1 3 1 2 3 2 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ StockOptionLevel : Factor w/ 4 levels "0","1","2","3": 3 2 2 1 3 1 3 4 1 1 ...
## $ WorkLifeBalance : Factor w/ 4 levels "1","2","3","4": 2 3 2 3 3 2 2 3 3 3 ...
## $ Age : num 35 33 26 55 29 51 52 39 31 31 ...
## $ MonthlyIncome : num 3407 3600 2377 16659 8620 ...
## $ NumCompaniesWorked : num 1 1 1 2 1 3 2 2 1 3 ...
## $ TotalWorkingYears : num 10 5 1 30 10 23 16 10 10 12 ...
## $ YearsAtCompany : num 10 5 1 5 10 13 9 7 10 1 ...
## $ YearsInCurrentRole : num 9 4 1 4 7 12 8 7 9 0 ...
## $ YearsWithCurrManager: num 8 4 0 2 4 8 0 7 8 0 ...
##########################################
### ###
### Run the Model to Predict Attrition ###
### ###
##########################################
PredictAtt <- as.data.frame(predict(model, compSetFinal))
PredictAtt <- cbind(compSet$ID, PredictAtt)
colnames(PredictAtt) <- c("Model", "Attrition")
head(PredictAtt)
## Model Attrition
## 1 1171 No
## 2 1172 No
## 3 1173 No
## 4 1174 No
## 5 1175 No
## 6 1176 No
#write.csv(PredictAtt, "Case2PredictionsEhly Attrition.csv")
##############################################
## ##
## Boruta for Monthly Incomes (Salary) ##
## ##
##############################################
MonInc <- cbind(dds$ID, ddsDF_num, ddsDF_cat)
str(MonInc)
## 'data.frame': 870 obs. of 33 variables:
## $ dds$ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : num 32 40 35 32 24 27 41 37 34 34 ...
## $ DailyRate : num 117 1308 200 801 567 ...
## $ DistanceFromHome : num 13 14 18 1 2 10 5 10 10 10 ...
## $ HourlyRate : num 73 44 60 48 32 32 90 88 87 92 ...
## $ MonthlyIncome : num 4403 19626 9362 10422 3760 ...
## $ MonthlyRate : num 9250 17544 19944 24032 17218 ...
## $ NumCompaniesWorked : num 2 1 2 1 1 1 2 2 1 1 ...
## $ PercentSalaryHike : num 11 14 11 19 13 21 12 14 19 14 ...
## $ TotalWorkingYears : num 8 21 10 14 6 9 7 8 1 8 ...
## $ YearsAtCompany : num 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : num 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : num 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : num 3 9 2 7 3 7 3 0 0 7 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
## $ Education : Factor w/ 5 levels "1","2","3","4",..: 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
## $ EnvironmentSatisfaction : Factor w/ 4 levels "1","2","3","4": 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
## $ JobInvolvement : Factor w/ 4 levels "1","2","3","4": 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : Factor w/ 5 levels "1","2","3","4",..: 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : chr "Sales Exec" "Res. Director" "Manufact Director" "Sales Exec" ...
## $ JobSatisfaction : Factor w/ 4 levels "1","2","3","4": 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
## $ PerformanceRating : Factor w/ 2 levels "3","4": 1 1 1 1 1 2 1 1 1 1 ...
## $ RelationshipSatisfaction: Factor w/ 4 levels "1","2","3","4": 3 1 3 3 3 3 1 3 4 2 ...
## $ StockOptionLevel : Factor w/ 4 levels "0","1","2","3": 2 1 1 3 1 3 1 4 2 2 ...
## $ TrainingTimesLastYear : Factor w/ 7 levels "0","1","2","3",..: 4 3 3 4 3 5 6 6 3 4 ...
## $ WorkLifeBalance : Factor w/ 4 levels "1","2","3","4": 2 4 3 3 3 2 2 3 3 2 ...
names(MonInc)[1] <- "ID"
names(MonInc)
## [1] "ID" "Attrition"
## [3] "Age" "DailyRate"
## [5] "DistanceFromHome" "HourlyRate"
## [7] "MonthlyIncome" "MonthlyRate"
## [9] "NumCompaniesWorked" "PercentSalaryHike"
## [11] "TotalWorkingYears" "YearsAtCompany"
## [13] "YearsInCurrentRole" "YearsSinceLastPromotion"
## [15] "YearsWithCurrManager" "Attrition"
## [17] "BusinessTravel" "Department"
## [19] "Education" "EducationField"
## [21] "EnvironmentSatisfaction" "Gender"
## [23] "JobInvolvement" "JobLevel"
## [25] "JobRole" "JobSatisfaction"
## [27] "MaritalStatus" "OverTime"
## [29] "PerformanceRating" "RelationshipSatisfaction"
## [31] "StockOptionLevel" "TrainingTimesLastYear"
## [33] "WorkLifeBalance"
MonInc <- MonInc[,-2]
MonInc$ID <- as.numeric(MonInc$ID)
boruta_moninc <- Boruta(MonthlyIncome~., data=MonInc, doTrace=2) # Boruta search
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## 11. run of importance source...
## 12. run of importance source...
## After 12 iterations, +2.2 secs:
## confirmed 10 attributes: Age, Department, JobLevel, JobRole, NumCompaniesWorked and 5 more;
## rejected 15 attributes: DailyRate, EnvironmentSatisfaction, Gender, HourlyRate, ID and 10 more;
## still have 6 attributes left.
## 13. run of importance source...
## 14. run of importance source...
## 15. run of importance source...
## 16. run of importance source...
## After 16 iterations, +2.7 secs:
## rejected 2 attributes: DistanceFromHome, MonthlyRate;
## still have 4 attributes left.
## 17. run of importance source...
## 18. run of importance source...
## 19. run of importance source...
## 20. run of importance source...
## 21. run of importance source...
## 22. run of importance source...
## 23. run of importance source...
## 24. run of importance source...
## 25. run of importance source...
## 26. run of importance source...
## 27. run of importance source...
## 28. run of importance source...
## 29. run of importance source...
## 30. run of importance source...
## 31. run of importance source...
## 32. run of importance source...
## After 32 iterations, +4.3 secs:
## rejected 1 attribute: EducationField;
## still have 3 attributes left.
## 33. run of importance source...
## 34. run of importance source...
## 35. run of importance source...
## 36. run of importance source...
## 37. run of importance source...
## 38. run of importance source...
## After 38 iterations, +4.8 secs:
## confirmed 1 attribute: Attrition;
## still have 2 attributes left.
## 39. run of importance source...
## 40. run of importance source...
## 41. run of importance source...
## 42. run of importance source...
## 43. run of importance source...
## 44. run of importance source...
## 45. run of importance source...
## 46. run of importance source...
## 47. run of importance source...
## 48. run of importance source...
## 49. run of importance source...
## 50. run of importance source...
## 51. run of importance source...
## 52. run of importance source...
## 53. run of importance source...
## 54. run of importance source...
## 55. run of importance source...
## 56. run of importance source...
## 57. run of importance source...
## 58. run of importance source...
## 59. run of importance source...
## 60. run of importance source...
## 61. run of importance source...
## 62. run of importance source...
## 63. run of importance source...
## 64. run of importance source...
## 65. run of importance source...
## 66. run of importance source...
## 67. run of importance source...
## 68. run of importance source...
## 69. run of importance source...
## 70. run of importance source...
## 71. run of importance source...
## 72. run of importance source...
## 73. run of importance source...
## 74. run of importance source...
## 75. run of importance source...
## 76. run of importance source...
## 77. run of importance source...
## 78. run of importance source...
## 79. run of importance source...
## 80. run of importance source...
## 81. run of importance source...
## 82. run of importance source...
## After 82 iterations, +8.8 secs:
## confirmed 1 attribute: Education;
## still have 1 attribute left.
## 83. run of importance source...
## 84. run of importance source...
## 85. run of importance source...
## 86. run of importance source...
## 87. run of importance source...
## 88. run of importance source...
## 89. run of importance source...
## 90. run of importance source...
## 91. run of importance source...
## 92. run of importance source...
## 93. run of importance source...
## 94. run of importance source...
## 95. run of importance source...
## 96. run of importance source...
## 97. run of importance source...
## 98. run of importance source...
## 99. run of importance source...
boruta_sigMonInc <- names(boruta_moninc$finalDecision[boruta_moninc$finalDecision %in% c("Confirmed", "Tentative")]) #collect Confirmed and Tentative variables
print(boruta_sigMonInc) #view sig var
## [1] "Age" "NumCompaniesWorked"
## [3] "TotalWorkingYears" "YearsAtCompany"
## [5] "YearsInCurrentRole" "YearsSinceLastPromotion"
## [7] "YearsWithCurrManager" "Attrition"
## [9] "BusinessTravel" "Department"
## [11] "Education" "JobLevel"
## [13] "JobRole"
plot(boruta_moninc, cex.axis=.7, las=2, xlab="", main="Variable Importance to Monthly Income (Boruta Method)") #plot results
# Most Important Variables
# [1] "Age" "NumCompaniesWorked" "TotalWorkingYears" "YearsAtCompany"
# [5] "YearsInCurrentRole" "YearsSinceLastPromotion" "YearsWithCurrManager" "Attrition"
# [9] "BusinessTravel" "Department" "Education" "JobLevel"
# [13] "JobRole"
##############################################
## ##
### Build Model to Estimate Monthly Income ###
## ##
##############################################
# Build a new DF for estimating monthly inc
Monthlylm <- MonInc[,c("MonthlyIncome", "TotalWorkingYears", "Education", "JobLevel", "JobRole")]
# Create aliases for categorical variables
Monthlylm <- Monthlylm %>%
mutate(
### Education ###
Educ1_2 = case_when(Monthlylm$Education == 1 ~ 1,
Monthlylm$Education == 2 ~ 1, TRUE ~ 0),
Educ3_4 = case_when(Monthlylm$Education == 3 ~ 1,
Monthlylm$Education == 4 ~ 1, TRUE ~ 0),
# Educ5 is the reference
### JobLevel ###
JobLev1 = case_when(Monthlylm$JobLevel == 1 ~ 1, TRUE ~ 0),
JobLev2 = case_when(Monthlylm$JobLevel == 2 ~ 1, TRUE ~ 0),
JobLev3 = case_when(Monthlylm$JobLevel == 3 ~ 1, TRUE ~ 0),
JobLev4 = case_when(Monthlylm$JobLevel == 4 ~ 1, TRUE ~ 0),
#JobLev5 is the reference
### JobRole ###
JobRolSalExec = case_when(Monthlylm$JobRole == "Sales Exec" ~ 1, TRUE ~ 0),
JobRolSalRep = case_when(Monthlylm$JobRole == "Sales Rep" ~ 1, TRUE ~ 0),
JobRolResDir = case_when(Monthlylm$JobRole == "Res. Director" ~ 1, TRUE ~ 0),
JobRolResSci = case_when(Monthlylm$JobRole == "Res. Scientist" ~ 1, TRUE ~ 0),
JobRolHR = case_when(Monthlylm$JobRole == "HR" ~ 1, TRUE ~ 0),
JobRolMgr = case_when(Monthlylm$JobRole == "Manager" ~ 1, TRUE ~ 0),
JobRolLabTech = case_when(Monthlylm$JobRole == "Lab Tech" ~1, TRUE ~ 0)
# Reference: Healthcare Rep, Manufacturing Director
)
Monthlylm <- Monthlylm[,-c(3:5)] #"Education", "JobLevel", "JobRole"
#str(Monthlylm)
### Let's start with lm ###
lmMonthlyInc <- lm(MonthlyIncome ~ .,
data = Monthlylm)
summary(lmMonthlyInc)
##
## Call:
## lm(formula = MonthlyIncome ~ ., data = Monthlylm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3194.8 -626.1 -76.6 617.8 4267.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14309.720 380.728 37.585 < 2e-16 ***
## TotalWorkingYears 44.649 7.782 5.737 1.33e-08 ***
## Educ1_2 410.630 211.525 1.941 0.0526 .
## Educ3_4 411.387 206.437 1.993 0.0466 *
## JobLev1 -11033.297 332.044 -33.228 < 2e-16 ***
## JobLev2 -9321.003 282.346 -33.013 < 2e-16 ***
## JobLev3 -6089.982 256.194 -23.771 < 2e-16 ***
## JobLev4 -2713.308 220.353 -12.313 < 2e-16 ***
## JobRolSalExec -80.718 107.188 -0.753 0.4516
## JobRolSalRep -1329.502 203.719 -6.526 1.15e-10 ***
## JobRolResDir 3454.330 194.856 17.728 < 2e-16 ***
## JobRolResSci -1091.874 158.270 -6.899 1.02e-11 ***
## JobRolHR -1153.441 238.526 -4.836 1.57e-06 ***
## JobRolMgr 3264.085 221.779 14.718 < 2e-16 ***
## JobRolLabTech -1297.850 154.265 -8.413 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1006 on 855 degrees of freedom
## Multiple R-squared: 0.9529, Adjusted R-squared: 0.9521
## F-statistic: 1236 on 14 and 855 DF, p-value: < 2.2e-16
RSS <- c(crossprod(lmMonthlyInc$residuals))
MSE <- RSS/length(lmMonthlyInc$residuals)
RMSE <- sqrt(MSE) #RSS # 844,567,449 #MSE # 970,767.2
RMSE # 985.26
## [1] 997.1992
car::vif(lmMonthlyInc)
## TotalWorkingYears Educ1_2 Educ3_4 JobLev1
## 2.936321 8.396465 8.354874 22.291995
## JobLev2 JobLev3 JobLev4 JobRolSalExec
## 15.765774 7.263280 2.680633 1.748869
## JobRolSalRep JobRolResDir JobRolResSci JobRolHR
## 2.041377 1.801549 3.416232 1.471047
## JobRolMgr JobRolLabTech
## 2.333782 2.965584
########### Cross Validation #########
iterations = 100
RMSEmatrix = numeric(iterations)
RSqmatrix = numeric(iterations)
for(j in 1:iterations){
trainInd = createDataPartition(Monthlylm$MonthlyIncome, p = 0.70, list = FALSE)
train = Monthlylm[trainInd,]
test = Monthlylm[-trainInd,]
fit <- lm(MonthlyIncome ~ ., data = train)
summary(fit)
RSS <- c(crossprod(fit$residuals))
MSE <- RSS/length(fit$residuals)
RMSE <- sqrt(MSE)
preds <- predict(fit, newdata=test)
test$preds <- preds
MSPE <- data.frame(Observed = test$MonthlyIncome, Predicted = preds)
RMSEmatrix[j] = RMSE
RSqmatrix[j] = summary(fit)$r.squared
}
min(RMSEmatrix)
## [1] 946.2515
mean(RMSEmatrix)
## [1] 994.4075
max(RMSEmatrix)
## [1] 1035.028
min(RSqmatrix)
## [1] 0.9458459
mean(RSqmatrix)
## [1] 0.9532775
max(RSqmatrix)
## [1] 0.9607749
# Review Test Results
df <- cbind(test$MonthlyIncome, test$preds)
df <- as.data.frame(df)
df$residuals <- df$V1 - df$V2
df$SqRes <- df$residuals^2
df$SqRtRes <- sqrt(df$SqRes)
df$perc <- df$V2/df$V1
mean(df$perc) * 100
## [1] 103.9476
#write.csv(summary(fit)$coefficients[,1], "newLMmodel.csv")
#view(df)
# Everything looks good...going to go for it!
#######################################
## ##
##### Prepare the Competition Set #####
## ##
#######################################
IncomeCompSet <- read.csv("CaseStudy2CompSet No Salary.csv")
#view(names(IncomeCompSet))
# reduce competition set to just the core variables needed
Comp <- IncomeCompSet[,c("TotalWorkingYears", "Education", "JobLevel", "JobRole")]
# Create some variables for prediction
Comp <- Comp %>%
mutate(
### Education ###
Educ1_2 = case_when(Comp$Education == 1 ~ 1,
Comp$Education == 2 ~ 1, TRUE ~ 0),
Educ3_4 = case_when(Comp$Education == 3 ~ 1,
Comp$Education == 4 ~ 1, TRUE ~ 0),
# Education 5 is the Reference
### JobLevel ###
JobLev1 = case_when(Comp$JobLevel == 1 ~ 1, TRUE ~ 0),
JobLev2 = case_when(Comp$JobLevel == 2 ~ 1, TRUE ~ 0),
JobLev3 = case_when(Comp$JobLevel == 3 ~ 1, TRUE ~ 0),
JobLev4 = case_when(Comp$JobLevel == 4 ~ 1, TRUE ~ 0),
# Reference: JobLev5
### JobRole ###
JobRolSalExec = case_when(Comp$JobRole == "Sales Exec" ~ 1, TRUE ~ 0),
JobRolSalRep = case_when(Comp$JobRole == "Sales Rep" ~ 1, TRUE ~ 0),
JobRolResDir = case_when(Comp$JobRole == "Res. Director" ~ 1, TRUE ~ 0),
JobRolResSci = case_when(Comp$JobRole == "Res. Scientist" ~ 1, TRUE ~ 0),
JobRolHR = case_when(Comp$JobRole == "HR" ~ 1, TRUE ~ 0),
JobRolMgr = case_when(Comp$JobRole == "Manager" ~ 1, TRUE ~ 0),
JobRolLabTech = case_when(Comp$JobRole == "Lab Tech" ~1, TRUE ~ 0)
# Reference: Healthcare Rep, Manufacturing Director
)
Comp <- Comp[,-c(2:4)] # Remove "Education", "JobLevel", "JobRole"
Comp <- Comp %>% dplyr::mutate_if(is.integer, as.numeric)
############################
## ##
### Run the Prediction ###
## ##
############################
fit <- lm(MonthlyIncome ~ ., data = Monthlylm)
summary(fit)
##
## Call:
## lm(formula = MonthlyIncome ~ ., data = Monthlylm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3194.8 -626.1 -76.6 617.8 4267.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14309.720 380.728 37.585 < 2e-16 ***
## TotalWorkingYears 44.649 7.782 5.737 1.33e-08 ***
## Educ1_2 410.630 211.525 1.941 0.0526 .
## Educ3_4 411.387 206.437 1.993 0.0466 *
## JobLev1 -11033.297 332.044 -33.228 < 2e-16 ***
## JobLev2 -9321.003 282.346 -33.013 < 2e-16 ***
## JobLev3 -6089.982 256.194 -23.771 < 2e-16 ***
## JobLev4 -2713.308 220.353 -12.313 < 2e-16 ***
## JobRolSalExec -80.718 107.188 -0.753 0.4516
## JobRolSalRep -1329.502 203.719 -6.526 1.15e-10 ***
## JobRolResDir 3454.330 194.856 17.728 < 2e-16 ***
## JobRolResSci -1091.874 158.270 -6.899 1.02e-11 ***
## JobRolHR -1153.441 238.526 -4.836 1.57e-06 ***
## JobRolMgr 3264.085 221.779 14.718 < 2e-16 ***
## JobRolLabTech -1297.850 154.265 -8.413 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1006 on 855 degrees of freedom
## Multiple R-squared: 0.9529, Adjusted R-squared: 0.9521
## F-statistic: 1236 on 14 and 855 DF, p-value: < 2.2e-16
RSS <- c(crossprod(fit$residuals))
MSE <- RSS/length(fit$residuals)
RMSE <- sqrt(MSE)
RMSE
## [1] 997.1992
preds <- predict(fit, newdata = Comp)
preds <- as.data.frame(preds)
names(preds)[1] <- "MonthlyIncome"
head(preds)
## MonthlyIncome
## 1 5712.646
## 2 3910.297
## 3 13079.371
## 4 3911.054
## 5 4000.351
## 6 6248.432
PredEhly <- cbind(IncomeCompSet$ID, preds)
names(PredEhly)[1] <- "ID"
str(PredEhly)
## 'data.frame': 300 obs. of 2 variables:
## $ ID : int 871 872 873 874 875 876 877 878 879 880 ...
## $ MonthlyIncome: num 5713 3910 13079 3911 4000 ...
head(PredEhly)
## ID MonthlyIncome
## 1 871 5712.646
## 2 872 3910.297
## 3 873 13079.371
## 4 874 3911.054
## 5 875 4000.351
## 6 876 6248.432
#write.csv(PredEhly, "Case2PredictionsEHLY Salary.csv")