Prepared by Adam Simmons
RMIT Practical Data Science with Python
January 25, 2023
A hypothetical consultancy is concerned about a decline in work quality and a perceived high turnover of senior consultants. This notebook is an initial look at the employee data - cleaning it and drawing initial insights to inform next steps.
This notebook was my "lab" to explore the data, with selected plots refined for inclusion in a written report.
# setup
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# catch error in case module not installed
# To install missingno in Jupyter: "!pip install missingno"
try:
    import missingno as msno
    msno_check = True
except ModuleNotFoundError:
    print("You don't seem to have missingno installed.")
    print("- you'll only miss one chart used in the cleaning process, but it'll be fine.")
    msno_check = False
#to see all pandas dataframe columns (up to 25)
pd.set_option("display.max_columns", 25)
Initial import of the CSV data to explore the dataset for any potential issues - with the possibility that the import may need to be redone with any necessary amendments.
# load data
df = pd.read_csv("A1_HR_Employee_Data.csv")
# check dataframe dimensions
df.shape
(1482, 22)
# initial check of dataframe
df.head()
EmployeeID | Age | Resigned | BusinessTravel | BusinessUnit | EducationLevel | Gender | JobSatisfaction | MaritalStatus | MonthlyIncome | NumCompaniesWorked | OverTime | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7912 | 41 | Yes | Travel_Rarely | Sales | 2.0 | Female | 4.0 | Single | 5993.0 | 8 | Yes | 11 | 3 | 63.0 | 8 | 0 | 1.0 | 6 | 4 | 0 | 5 |
1 | 1520 | 49 | No | Travel_Frequently | Consultants | 1.0 | Male | 2.0 | Married | 5130.0 | 1 | No | 23 | 4 | 40.0 | 10 | 3 | 3.0 | 10 | 7 | 1 | 7 |
2 | 1488 | 37 | Yes | Travel_Rarely | Consultants | 2.0 | Male | 3.0 | Single | 2090.0 | 6 | Yes | 15 | 3 | 50.0 | 7 | 3 | 3.0 | 0 | 0 | 0 | 0 |
3 | 2535 | 33 | No | Travel_Frequently | Consultants | 4.0 | Female | 3.0 | Married | 2909.0 | 1 | Yes | 11 | 3 | 48.0 | 8 | 3 | 3.0 | 8 | 7 | 3 | 0 |
4 | 4582 | 27 | No | Travel_Rarely | Consultants | 1.0 | Male | 2.0 | Married | 3468.0 | 9 | No | 12 | 3 | 40.0 | 6 | 3 | 3.0 | 2 | 2 | 2 | 2 |
# list columns and datatypes
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482 entries, 0 to 1481
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   EmployeeID                1482 non-null   int64
 1   Age                       1482 non-null   object
 2   Resigned                  1480 non-null   object
 3   BusinessTravel            1482 non-null   object
 4   BusinessUnit              1482 non-null   object
 5   EducationLevel            1481 non-null   float64
 6   Gender                    1482 non-null   object
 7   JobSatisfaction           1481 non-null   float64
 8   MaritalStatus             1482 non-null   object
 9   MonthlyIncome             1479 non-null   float64
 10  NumCompaniesWorked        1482 non-null   int64
 11  OverTime                  1479 non-null   object
 12  PercentSalaryHike         1482 non-null   int64
 13  PerformanceRating         1482 non-null   int64
 14  AverageWeeklyHoursWorked  1482 non-null   float64
 15  TotalWorkingYears         1482 non-null   int64
 16  TrainingTimesLastYear     1482 non-null   int64
 17  WorkLifeBalance           1481 non-null   float64
 18  YearsAtCompany            1482 non-null   int64
 19  YearsInRole               1482 non-null   int64
 20  YearsSinceLastPromotion   1482 non-null   int64
 21  YearsWithCurrManager      1482 non-null   int64
dtypes: float64(5), int64(10), object(7)
memory usage: 254.8+ KB
# obtain a list for copy and paste into Word
# - used for making notes on issues with each column, and within this notebook
for col in df.columns:
    print(col)
EmployeeID
Age
Resigned
BusinessTravel
BusinessUnit
EducationLevel
Gender
JobSatisfaction
MaritalStatus
MonthlyIncome
NumCompaniesWorked
OverTime
PercentSalaryHike
PerformanceRating
AverageWeeklyHoursWorked
TotalWorkingYears
TrainingTimesLastYear
WorkLifeBalance
YearsAtCompany
YearsInRole
YearsSinceLastPromotion
YearsWithCurrManager
# print unique values for each column for visual inspection of values and to identify issues
for col in df.columns:
    print(col, ":", df[col].dtypes, "\n", df[col].unique(), "\n")
EmployeeID : int64 [7912 1520 1488 ... 9017 2477 3238]
Age : object ['41' '49' '37' '33' '27' '32' '59' '30' '38' '36' '35' '29' '31' '34' '28' '22' '53' '24' '21' '42' '44' '46' '39' '43' '50' '26' '48' '55' '45' '56' '23' '51' '40' '54' '58' '20' '25' '19' '57' '52' '47' '18' '60' '36a']
Resigned : object ['Yes' 'No' 'Y' nan 'NO' 'N' 'no']
BusinessTravel : object ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel' 'Travels_Rarely' 'TRAVEL_RARELY' 'rarely_travel']
BusinessUnit : object ['Sales' 'Consultants' 'Business Operations' 'Female']
EducationLevel : float64 [ 2.  1.  4.  3.  5. nan]
Gender : object ['Female' 'Male' ' Male' ' Female' 'Sales' 'MMale' 'M' 'male']
JobSatisfaction : float64 [ 4.  2.  3.  1. nan]
MaritalStatus : object ['Single' 'Married' 'Divorced' ' Divorced' ' Single' 'D']
MonthlyIncome : float64 [5993. 5130. 2090. ... 3935. 3700. 4802.]
NumCompaniesWorked : int64 [8 1 6 9 0 4 5 2 7 3]
OverTime : object ['Yes' 'No' nan]
PercentSalaryHike : int64 [11 23 15 12 13 20 22 21 17 14 16 18 19 24 25 6 10 0]
PerformanceRating : int64 [3 4 2]
AverageWeeklyHoursWorked : float64 [ 63.  40.  50.  48.  52.  51.  54.  47.  53.  46.  45.  56.  55.  49.  57.  66.  69.  67.  62.  70.  64.  71.  65. 400.]
TotalWorkingYears : int64 [ 8 10  7  6 12  1 17  5  3 31 13  0 26 24 22  9 19  2 23 14 15  4 29 28 21 25 20 11 16 37 38 30 40 18 36 34 32 33 35 27]
TrainingTimesLastYear : int64 [0 3 2 5 1 4 6]
WorkLifeBalance : float64 [ 1.  3.  2.  4. nan]
YearsAtCompany : int64 [ 6 10  0  8  2  7  1  9  5  4 25  3 12 14 22 15 27 21 17 11 13 37 16 20 40 24 33 19 36 18 29 31 32 34 26 30 23]
YearsInRole : int64 [ 4  7  0  2  5  9  8  3  6 13  1 15 14 16 11 10 12 18 17]
YearsSinceLastPromotion : int64 [ 0  1  3  2  7  4  8  6  5 15  9 13 12 10 11 14]
YearsWithCurrManager : int64 [ 5  7  0  2  6  8  3 11 17  1  4 12  9 10 15 13 16 14]
Inspect each column, looking for errors including typos, inconsistent labels, misplaced values, missing data and outliers.
#setting these up now, to be used to convert in batches after cleaning
string_columns = ["EmployeeID"
, "Resigned"
, "BusinessTravel"
, "BusinessUnit"
, "Gender"
, "MaritalStatus"
, "OverTime"
]
integer_columns = ["Age", "EducationLevel", "JobSatisfaction", "MonthlyIncome"
, "NumCompaniesWorked", "PercentSalaryHike", "PerformanceRating"
, "AverageWeeklyHoursWorked", "TotalWorkingYears", "TrainingTimesLastYear"
, "WorkLifeBalance", "YearsAtCompany", "YearsInRole", "YearsSinceLastPromotion"
, "YearsWithCurrManager"
]
category_subset = ["JobSatisfaction"
, "WorkLifeBalance"
, "EducationLevel"
, "PerformanceRating"
]
# remove alphabetical characters - there is a "36a" typo in the source data
# this was discovered later, but inserted here to help streamline other cleaning
df["Age"] = df["Age"].str.replace("[a-zA-Z]","", regex=True).astype("int")
# checking missing data - this will skip if the "missingno" library is not installed
if msno_check:
    msno.matrix(df)
else:
    print("To see this chart you will need to install the missingno library.")
# From the chart above, and from the dtypes output earlier, there is hardly any missing data,
# and what little there is sits in the last few rows of the file.
# QUESTION - is there something about the data collection for the
# last entries that needs to be addressed?
# print all rows with null data/missing values
df_null = df[df.isnull().any(axis=1)]
df_null
EmployeeID | Age | Resigned | BusinessTravel | BusinessUnit | EducationLevel | Gender | JobSatisfaction | MaritalStatus | MonthlyIncome | NumCompaniesWorked | OverTime | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1471 | 6264 | 21 | No | Travel_Rarely | Consultants | 2.0 | Male | 4.0 | Single | NaN | 1 | No | 12 | 3 | 40.0 | 0 | 5 | 3.0 | 0 | 0 | 0 | 0 |
1472 | 5560 | 23 | No | Travels_Rarely | Consultants | 2.0 | MMale | 4.0 | Single | NaN | 1 | No | 10 | 4 | 40.0 | 0 | 2 | 3.0 | 1 | 1 | 0 | 1 |
1473 | 1215 | 26 | NaN | Travel_Rarely | Business Operations | NaN | Female | NaN | Married | NaN | 1 | No | 22 | 4 | 40.0 | 8 | 2 | NaN | 8 | 7 | 5 | 3 |
1474 | 5988 | 26 | NaN | Travel_Rarely | Business Operations | 2.0 | Female | 3.0 | Married | 4023.0 | 1 | No | 18 | 4 | 40.0 | 7 | 2 | 2.0 | 7 | 3 | 0 | 2 |
1478 | 3190 | 33 | Yes | TRAVEL_RARELY | Consultants | 4.0 | Male | 3.0 | Single | 2686.0 | 1 | NaN | 13 | 3 | 40.0 | 10 | 2 | 2.0 | 10 | 8 | 8 | 10 |
1479 | 9017 | 38 | no | rarely_travel | Consultants | 2.0 | Female | 3.0 | Married | 3700.0 | 1 | NaN | 0 | 2 | 46.0 | 5 | 3 | 3.0 | 5 | 4 | 0 | 3 |
1480 | 2477 | 32 | No | Travel_Frequently | Sales | 4.0 | male | 4.0 | Single | 4802.0 | 8 | NaN | 13 | 3 | 50.0 | 4 | 2 | 4.0 | 1 | 1 | 0 | 1 |
# just a different output to look at, without need to scroll horizontally
print(df_null)
      EmployeeID  Age Resigned     BusinessTravel         BusinessUnit  \
1471        6264   21       No      Travel_Rarely          Consultants
1472        5560   23       No     Travels_Rarely          Consultants
1473        1215   26      NaN      Travel_Rarely  Business Operations
1474        5988   26      NaN      Travel_Rarely  Business Operations
1478        3190   33      Yes      TRAVEL_RARELY          Consultants
1479        9017   38       no      rarely_travel          Consultants
1480        2477   32       No  Travel_Frequently                Sales

      EducationLevel  Gender  JobSatisfaction MaritalStatus  MonthlyIncome  \
1471             2.0    Male              4.0        Single            NaN
1472             2.0   MMale              4.0        Single            NaN
1473             NaN  Female              NaN       Married            NaN
1474             2.0  Female              3.0       Married         4023.0
1478             4.0    Male              3.0        Single         2686.0
1479             2.0  Female              3.0       Married         3700.0
1480             4.0    male              4.0        Single         4802.0

      NumCompaniesWorked OverTime  PercentSalaryHike  PerformanceRating  \
1471                   1       No                 12                  3
1472                   1       No                 10                  4
1473                   1       No                 22                  4
1474                   1       No                 18                  4
1478                   1      NaN                 13                  3
1479                   1      NaN                  0                  2
1480                   8      NaN                 13                  3

      AverageWeeklyHoursWorked  TotalWorkingYears  TrainingTimesLastYear  \
1471                      40.0                  0                      5
1472                      40.0                  0                      2
1473                      40.0                  8                      2
1474                      40.0                  7                      2
1478                      40.0                 10                      2
1479                      46.0                  5                      3
1480                      50.0                  4                      2

      WorkLifeBalance  YearsAtCompany  YearsInRole  YearsSinceLastPromotion  \
1471              3.0               0            0                        0
1472              3.0               1            1                        0
1473              NaN               8            7                        5
1474              2.0               7            3                        0
1478              2.0              10            8                        8
1479              3.0               5            4                        0
1480              4.0               1            1                        0

      YearsWithCurrManager
1471                     0
1472                     1
1473                     3
1474                     2
1478                    10
1479                     3
1480                     1
# checking what % of entries have null/missing values
# (from df.info() above, there are not many)
df_len, df_null_len = len(df), len(df_null)
print (f"""CONSIDERING MISSING VALUES:
Number of rows: {df_len}
Number of rows with nulls/NaN: {df_null_len}
% of rows with null values: {round(df_null_len/df_len*100, 2)}%
""")
CONSIDERING MISSING VALUES:
Number of rows: 1482
Number of rows with nulls/NaN: 7
% of rows with null values: 0.47%
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482 entries, 0 to 1481
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   EmployeeID                1482 non-null   int64
 1   Age                       1482 non-null   int64
 2   Resigned                  1480 non-null   object
 3   BusinessTravel            1482 non-null   object
 4   BusinessUnit              1482 non-null   object
 5   EducationLevel            1481 non-null   float64
 6   Gender                    1482 non-null   object
 7   JobSatisfaction           1481 non-null   float64
 8   MaritalStatus             1482 non-null   object
 9   MonthlyIncome             1479 non-null   float64
 10  NumCompaniesWorked        1482 non-null   int64
 11  OverTime                  1479 non-null   object
 12  PercentSalaryHike         1482 non-null   int64
 13  PerformanceRating         1482 non-null   int64
 14  AverageWeeklyHoursWorked  1482 non-null   float64
 15  TotalWorkingYears         1482 non-null   int64
 16  TrainingTimesLastYear     1482 non-null   int64
 17  WorkLifeBalance           1481 non-null   float64
 18  YearsAtCompany            1482 non-null   int64
 19  YearsInRole               1482 non-null   int64
 20  YearsSinceLastPromotion   1482 non-null   int64
 21  YearsWithCurrManager      1482 non-null   int64
dtypes: float64(5), int64(11), object(6)
memory usage: 254.8+ KB
# I have settled on the median to fill NaN values
# - a number of categories are skewed, making the median more reliable as an indicator
# Filling NaN with column medians - and converting all numerical columns to integer
df[integer_columns] = df[integer_columns].fillna(df.median(axis=0, numeric_only=True)).astype("int")
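As a quick sanity check on the skewness claim behind choosing the median, a sketch (not in the original run; it uses the integer_columns list defined above):
# hedged sketch: strongly positive skew values support median over mean imputation
print(df[integer_columns].skew().round(2).sort_values(ascending=False))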
# for "Resigned" and "Overtime", there are only 2 and 3 missing values respectively
# I will fill NaN with no, assuming that ommission indicates "False" or 0.
# Statistically, they will have negligible impact.
df[["Resigned", "OverTime"]] = df[["Resigned", "OverTime"]].fillna("NO")
# check updated dataframe
df.loc[1471:1480]
EmployeeID | Age | Resigned | BusinessTravel | BusinessUnit | EducationLevel | Gender | JobSatisfaction | MaritalStatus | MonthlyIncome | NumCompaniesWorked | OverTime | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1471 | 6264 | 21 | No | Travel_Rarely | Consultants | 2 | Male | 4 | Single | 4900 | 1 | No | 12 | 3 | 40 | 0 | 5 | 3 | 0 | 0 | 0 | 0 |
1472 | 5560 | 23 | No | Travels_Rarely | Consultants | 2 | MMale | 4 | Single | 4900 | 1 | No | 10 | 4 | 40 | 0 | 2 | 3 | 1 | 1 | 0 | 1 |
1473 | 1215 | 26 | NO | Travel_Rarely | Business Operations | 3 | Female | 3 | Married | 4900 | 1 | No | 22 | 4 | 40 | 8 | 2 | 3 | 8 | 7 | 5 | 3 |
1474 | 5988 | 26 | NO | Travel_Rarely | Business Operations | 2 | Female | 3 | Married | 4023 | 1 | No | 18 | 4 | 40 | 7 | 2 | 2 | 7 | 3 | 0 | 2 |
1475 | 6388 | 26 | NO | Travel_Frequently | Consultants | 3 | Female | 3 | Married | 2544 | 1 | No | 18 | 3 | 45 | 7 | 3 | 3 | 7 | 7 | 7 | 7 |
1476 | 7794 | 36 | No | Travel_Rarely | Consultants | 4 | Female | 3 | Married | 4663 | 9 | No | 12 | 3 | 51 | 7 | 2 | 3 | 3 | 2 | 1 | 1 |
1477 | 6680 | 40 | N | Non-Travel | Consultants | 4 | M | 3 | D | 3935 | 3 | No | 11 | 3 | 40 | 11 | 2 | 4 | 8 | 6 | 1 | 6 |
1478 | 3190 | 33 | Yes | TRAVEL_RARELY | Consultants | 4 | Male | 3 | Single | 2686 | 1 | NO | 13 | 3 | 40 | 10 | 2 | 2 | 10 | 8 | 8 | 10 |
1479 | 9017 | 38 | no | rarely_travel | Consultants | 2 | Female | 3 | Married | 3700 | 1 | NO | 0 | 2 | 46 | 5 | 3 | 3 | 5 | 4 | 0 | 3 |
1480 | 2477 | 32 | No | Travel_Frequently | Sales | 4 | male | 4 | Single | 4802 | 8 | NO | 13 | 3 | 50 | 4 | 2 | 4 | 1 | 1 | 0 | 1 |
# double-check for nulls after filling - hoping for no rows returned
df[df.isnull().any(axis=1)]
EmployeeID | Age | Resigned | BusinessTravel | BusinessUnit | EducationLevel | Gender | JobSatisfaction | MaritalStatus | MonthlyIncome | NumCompaniesWorked | OverTime | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager |
---|
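A programmatic guard can back up the visual check; a minimal sketch:
# hedged sketch: raise immediately if any nulls somehow remain after filling
assert df.isnull().sum().sum() == 0, "unexpected nulls remain in dataframe"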
# Outliers - generate stats on all numeric columns, highlighting any extreme values
# Also used as a reference for inspecting distributions for skewness
df_describe = df[integer_columns].describe()
df_describe
Age | EducationLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.00000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 |
mean | 36.890013 | 2.913630 | 2.733468 | 6484.586370 | 2.695682 | 15.186910 | 3.153846 | 43.298246 | 11.24359 | 2.795547 | 2.761808 | 6.997976 | 4.230769 | 2.191633 | 4.119433 |
std | 9.143432 | 1.023342 | 1.100525 | 4693.846366 | 2.502362 | 3.686868 | 0.362789 | 10.733283 | 7.76821 | 1.288565 | 0.707306 | 6.112108 | 3.618830 | 3.225864 | 3.563247 |
min | 18.000000 | 1.000000 | 1.000000 | 1009.000000 | 0.000000 | 0.000000 | 2.000000 | 40.000000 | 0.00000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 30.000000 | 2.000000 | 2.000000 | 2914.750000 | 1.000000 | 12.000000 | 3.000000 | 40.000000 | 6.00000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
50% | 36.000000 | 3.000000 | 3.000000 | 4900.000000 | 2.000000 | 14.000000 | 3.000000 | 40.000000 | 10.00000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
75% | 43.000000 | 4.000000 | 4.000000 | 8307.750000 | 4.000000 | 18.000000 | 3.000000 | 47.000000 | 15.00000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
max | 60.000000 | 5.000000 | 4.000000 | 19999.000000 | 9.000000 | 25.000000 | 4.000000 | 400.000000 | 40.00000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
#quick stats
print (f"{round (df['Gender'].value_counts(normalize=True) * 100, 2)} ")
Male       59.51
Female     39.88
 Female     0.20
 Male       0.13
Sales       0.07
MMale       0.07
M           0.07
male        0.07
Name: Gender, dtype: float64
There are a few outliers to consider:
#let's find that 400hr/week employee!
df.loc[df["AverageWeeklyHoursWorked"] > 60].sort_values("AverageWeeklyHoursWorked", ascending=False)
EmployeeID | Age | Resigned | BusinessTravel | BusinessUnit | EducationLevel | Gender | JobSatisfaction | MaritalStatus | MonthlyIncome | NumCompaniesWorked | OverTime | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1481 | 3238 | 36 | No | Non-Travel | Consultants | 4 | Female | 3 | Married | 4802 | 2 | No | 14 | 3 | 400 | 15 | 3 | 3 | 13 | 10 | 10 | 5 |
725 | 9550 | 35 | Yes | Travel_Rarely | Consultants | 4 | Male | 2 | Divorced | 3743 | 1 | Yes | 24 | 4 | 71 | 5 | 2 | 1 | 4 | 2 | 0 | 2 |
1036 | 6655 | 31 | Yes | Travel_Frequently | Consultants | 3 | Male | 4 | Married | 3722 | 6 | Yes | 13 | 3 | 71 | 7 | 2 | 1 | 2 | 2 | 2 | 2 |
1171 | 9807 | 40 | Yes | Travel_Rarely | Consultants | 3 | Male | 1 | Single | 2166 | 3 | Yes | 14 | 3 | 70 | 10 | 3 | 1 | 4 | 2 | 0 | 3 |
669 | 2680 | 39 | Yes | Travel_Rarely | Consultants | 3 | Male | 1 | Married | 2404 | 7 | Yes | 21 | 4 | 70 | 8 | 2 | 1 | 2 | 2 | 2 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
656 | 5853 | 32 | Yes | Travel_Rarely | Consultants | 4 | Male | 4 | Single | 2795 | 1 | Yes | 24 | 4 | 63 | 1 | 2 | 1 | 1 | 0 | 0 | 1 |
463 | 8305 | 26 | Yes | Travel_Rarely | Consultants | 3 | Male | 4 | Single | 2340 | 1 | Yes | 18 | 3 | 63 | 1 | 3 | 1 | 1 | 0 | 0 | 0 |
0 | 7912 | 41 | Yes | Travel_Rarely | Sales | 2 | Female | 4 | Single | 5993 | 8 | Yes | 11 | 3 | 63 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
559 | 3749 | 38 | No | Travel_Rarely | Consultants | 5 | Male | 3 | Married | 3057 | 6 | Yes | 13 | 3 | 62 | 6 | 0 | 1 | 1 | 0 | 0 | 1 |
312 | 5291 | 31 | No | Travel_Rarely | Consultants | 4 | Male | 4 | Divorced | 2695 | 0 | Yes | 18 | 3 | 62 | 3 | 2 | 1 | 2 | 2 | 2 | 2 |
# getting number of rows/employees working more than 60hrs per week.
df.loc[df["AverageWeeklyHoursWorked"] > 60].sort_values("AverageWeeklyHoursWorked", ascending=False).shape
(24, 22)
# Looking at this, there are some hard workers (24 of them) up around 70 hrs/week, but only one incorrect value.
# Considering the other details, it seems to be just a typo - most likely 40 hrs/week.
#amend to 40 hrs
df.loc[1481, "AverageWeeklyHoursWorked"] = 40
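For a less manual approach, the conventional 1.5 x IQR rule would also have flagged this value; a hedged sketch, not part of the original workflow:
# hedged sketch: flag extreme AverageWeeklyHoursWorked values via the IQR rule
q1, q3 = df["AverageWeeklyHoursWorked"].quantile([0.25, 0.75])
iqr = q3 - q1
flagged = df[(df["AverageWeeklyHoursWorked"] < q1 - 1.5 * iqr) |
             (df["AverageWeeklyHoursWorked"] > q3 + 1.5 * iqr)]
print(len(flagged), "rows flagged as potential outliers")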
# make all text features upper case
df[string_columns] = df[string_columns].apply(lambda x: x.astype(str).str.upper())
# trim whitespace on text columns
for col in string_columns:
    df[col] = df[col].str.strip()
# check again for features requiring string cleaning
for col in df.columns:
    print(col, ":", df[col].dtypes, "\n", df[col].unique(), "\n")
EmployeeID : object ['7912' '1520' '1488' ... '9017' '2477' '3238']
Age : int64 [41 49 37 33 27 32 59 30 38 36 35 29 31 34 28 22 53 24 21 42 44 46 39 43 50 26 48 55 45 56 23 51 40 54 58 20 25 19 57 52 47 18 60]
Resigned : object ['YES' 'NO' 'Y' 'N']
BusinessTravel : object ['TRAVEL_RARELY' 'TRAVEL_FREQUENTLY' 'NON-TRAVEL' 'TRAVELS_RARELY' 'RARELY_TRAVEL']
BusinessUnit : object ['SALES' 'CONSULTANTS' 'BUSINESS OPERATIONS' 'FEMALE']
EducationLevel : int64 [2 1 4 3 5]
Gender : object ['FEMALE' 'MALE' 'SALES' 'MMALE' 'M']
JobSatisfaction : int64 [4 2 3 1]
MaritalStatus : object ['SINGLE' 'MARRIED' 'DIVORCED' 'D']
MonthlyIncome : int64 [5993 5130 2090 ... 3935 3700 4802]
NumCompaniesWorked : int64 [8 1 6 9 0 4 5 2 7 3]
OverTime : object ['YES' 'NO']
PercentSalaryHike : int64 [11 23 15 12 13 20 22 21 17 14 16 18 19 24 25 6 10 0]
PerformanceRating : int64 [3 4 2]
AverageWeeklyHoursWorked : int64 [63 40 50 48 52 51 54 47 53 46 45 56 55 49 57 66 69 67 62 70 64 71 65]
TotalWorkingYears : int64 [ 8 10  7  6 12  1 17  5  3 31 13  0 26 24 22  9 19  2 23 14 15  4 29 28 21 25 20 11 16 37 38 30 40 18 36 34 32 33 35 27]
TrainingTimesLastYear : int64 [0 3 2 5 1 4 6]
WorkLifeBalance : int64 [1 3 2 4]
YearsAtCompany : int64 [ 6 10  0  8  2  7  1  9  5  4 25  3 12 14 22 15 27 21 17 11 13 37 16 20 40 24 33 19 36 18 29 31 32 34 26 30 23]
YearsInRole : int64 [ 4  7  0  2  5  9  8  3  6 13  1 15 14 16 11 10 12 18 17]
YearsSinceLastPromotion : int64 [ 0  1  3  2  7  4  8  6  5 15  9 13 12 10 11 14]
YearsWithCurrManager : int64 [ 5  7  0  2  6  8  3 11 17  1  4 12  9 10 15 13 16 14]
# RESIGNED
# Standardise to "YES" or "NO" (values are already upper-cased, so only "YES"/"Y" need mapping;
# anything else, including former NaNs filled earlier, becomes "NO")
df["Resigned"] = df["Resigned"].apply(lambda x: "YES" if x in ["YES", "Y"] else "NO")
# Source: 'Travel_Rarely' 'Travel_Frequently' 'Non-Travel' 'Travels_Rarely' 'TRAVEL_RARELY' 'rarely_travel'
# Standardise to "NON", "RARELY", "FREQUENTLY"
df["BusinessTravel"] = df["BusinessTravel"].str.upper()\
    .str.replace("TRAVEL", "")\
    .str.replace("S", "")\
    .str.replace("_", "")\
    .str.replace("-", "")
# Source:'Sales', 'Consultants', 'Business Operations', 'Female'
# Female is an error or misplacement
df.loc[df["BusinessUnit"]=="FEMALE"]
EmployeeID | Age | Resigned | BusinessTravel | BusinessUnit | EducationLevel | Gender | JobSatisfaction | MaritalStatus | MonthlyIncome | NumCompaniesWorked | OverTime | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1470 | 9465 | 56 | YES | RARELY | FEMALE | 2 | SALES | 4 | SINGLE | 4993 | 7 | YES | 6 | 3 | 64 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
# need to swap the BusinessUnit and Gender values for this row
gender, businessunit = df.loc[1470, ["BusinessUnit", "Gender"]]
df.loc[1470, ["BusinessUnit", "Gender"]] = [businessunit, gender]
# double-check swap
df.loc[1470, ["BusinessUnit", "Gender"]]
BusinessUnit      SALES
Gender           FEMALE
Name: 1470, dtype: object
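Had more than one row been affected, a mask-based swap would generalise this fix; a sketch (a no-op here, since row 1470 is already corrected):
# hedged sketch: vectorised swap for any row whose BusinessUnit holds a gender label
mask = df["BusinessUnit"].isin(["MALE", "FEMALE"])
df.loc[mask, ["BusinessUnit", "Gender"]] = df.loc[mask, ["Gender", "BusinessUnit"]].values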
# source: 'FEMALE' 'MALE' 'SALES' 'MMALE' 'M'
df["Gender"] = df["Gender"].apply(lambda x: "MALE" if x in ["MALE", "MMALE", "M"] else "FEMALE")
# Source: 'SINGLE' 'MARRIED' 'DIVORCED' 'D'
df["MaritalStatus"] = df["MaritalStatus"].apply(lambda x: "DIVORCED" if x == "D" else x)
All cleaning done - one final check and then the data is ready for exploration.
# check again for features requiring string cleaning
for col in df.columns:
    print(col, ":", df[col].dtypes, "\n", df[col].unique(), "\n")
df.head()
EmployeeID : object ['7912' '1520' '1488' ... '9017' '2477' '3238']
Age : int64 [41 49 37 33 27 32 59 30 38 36 35 29 31 34 28 22 53 24 21 42 44 46 39 43 50 26 48 55 45 56 23 51 40 54 58 20 25 19 57 52 47 18 60]
Resigned : object ['YES' 'NO']
BusinessTravel : object ['RARELY' 'FREQUENTLY' 'NON']
BusinessUnit : object ['SALES' 'CONSULTANTS' 'BUSINESS OPERATIONS']
EducationLevel : int64 [2 1 4 3 5]
Gender : object ['FEMALE' 'MALE']
JobSatisfaction : int64 [4 2 3 1]
MaritalStatus : object ['SINGLE' 'MARRIED' 'DIVORCED']
MonthlyIncome : int64 [5993 5130 2090 ... 3935 3700 4802]
NumCompaniesWorked : int64 [8 1 6 9 0 4 5 2 7 3]
OverTime : object ['YES' 'NO']
PercentSalaryHike : int64 [11 23 15 12 13 20 22 21 17 14 16 18 19 24 25 6 10 0]
PerformanceRating : int64 [3 4 2]
AverageWeeklyHoursWorked : int64 [63 40 50 48 52 51 54 47 53 46 45 56 55 49 57 66 69 67 62 70 64 71 65]
TotalWorkingYears : int64 [ 8 10  7  6 12  1 17  5  3 31 13  0 26 24 22  9 19  2 23 14 15  4 29 28 21 25 20 11 16 37 38 30 40 18 36 34 32 33 35 27]
TrainingTimesLastYear : int64 [0 3 2 5 1 4 6]
WorkLifeBalance : int64 [1 3 2 4]
YearsAtCompany : int64 [ 6 10  0  8  2  7  1  9  5  4 25  3 12 14 22 15 27 21 17 11 13 37 16 20 40 24 33 19 36 18 29 31 32 34 26 30 23]
YearsInRole : int64 [ 4  7  0  2  5  9  8  3  6 13  1 15 14 16 11 10 12 18 17]
YearsSinceLastPromotion : int64 [ 0  1  3  2  7  4  8  6  5 15  9 13 12 10 11 14]
YearsWithCurrManager : int64 [ 5  7  0  2  6  8  3 11 17  1  4 12  9 10 15 13 16 14]
EmployeeID | Age | Resigned | BusinessTravel | BusinessUnit | EducationLevel | Gender | JobSatisfaction | MaritalStatus | MonthlyIncome | NumCompaniesWorked | OverTime | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7912 | 41 | YES | RARELY | SALES | 2 | FEMALE | 4 | SINGLE | 5993 | 8 | YES | 11 | 3 | 63 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
1 | 1520 | 49 | NO | FREQUENTLY | CONSULTANTS | 1 | MALE | 2 | MARRIED | 5130 | 1 | NO | 23 | 4 | 40 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
2 | 1488 | 37 | YES | RARELY | CONSULTANTS | 2 | MALE | 3 | SINGLE | 2090 | 6 | YES | 15 | 3 | 50 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
3 | 2535 | 33 | NO | FREQUENTLY | CONSULTANTS | 4 | FEMALE | 3 | MARRIED | 2909 | 1 | YES | 11 | 3 | 48 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
4 | 4582 | 27 | NO | RARELY | CONSULTANTS | 1 | MALE | 2 | MARRIED | 3468 | 9 | NO | 12 | 3 | 40 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
The final result is a data set with no missing values, consistent string fields with typos addressed, updated data types for each field, and the one extreme outlier corrected.
There were missing values in just a few of the last rows of the dataset. If the collection process for those rows is known, it would be worthwhile reviewing it.
For addressing typos/data entry errors, again a review of processes might help reduce or remove these.
The dataframe is now ready for Data Exploration. It will be saved to CSV file in the last cell of this notebook.
In this section, I have explored a range of plots and different aspects of the dataset.
After calculating some descriptive statistics, I started visualising the data with single variable plots - nominal, ordinal and interval/ratio - followed by multivariate plots, combining two or more variables.
I used this section as a "sketch-pad" from which I then chose the main plots for the report and appendices.
# Basic statistics on clean dataset
df_stats = df.describe().reset_index()
df_stats
index | Age | EducationLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | count | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.00000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 | 1482.000000 |
1 | mean | 36.890013 | 2.913630 | 2.733468 | 6484.586370 | 2.695682 | 15.186910 | 3.153846 | 43.055331 | 11.24359 | 2.795547 | 2.761808 | 6.997976 | 4.230769 | 2.191633 | 4.119433 |
2 | std | 9.143432 | 1.023342 | 1.100525 | 4693.846366 | 2.502362 | 3.686868 | 0.362789 | 5.407341 | 7.76821 | 1.288565 | 0.707306 | 6.112108 | 3.618830 | 3.225864 | 3.563247 |
3 | min | 18.000000 | 1.000000 | 1.000000 | 1009.000000 | 0.000000 | 0.000000 | 2.000000 | 40.000000 | 0.00000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
4 | 25% | 30.000000 | 2.000000 | 2.000000 | 2914.750000 | 1.000000 | 12.000000 | 3.000000 | 40.000000 | 6.00000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
5 | 50% | 36.000000 | 3.000000 | 3.000000 | 4900.000000 | 2.000000 | 14.000000 | 3.000000 | 40.000000 | 10.00000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
6 | 75% | 43.000000 | 4.000000 | 4.000000 | 8307.750000 | 4.000000 | 18.000000 | 3.000000 | 47.000000 | 15.00000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
7 | max | 60.000000 | 5.000000 | 4.000000 | 19999.000000 | 9.000000 | 25.000000 | 4.000000 | 71.000000 | 40.00000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
# Calculating variance for all numerical columns
variance = {"index": "var"}
for x in df[integer_columns]:
    variance[x] = df[x].var()
# frame.append() is deprecated in newer pandas, so pd.concat is used instead
df_stats = pd.concat([df_stats, pd.DataFrame([variance])], ignore_index=True).set_index("index")
for x in df_stats.columns:
    if x != "index":
        df_stats[x] = df_stats[x].apply(lambda x: round(x, 2))
df_stats.head(10)
Age | EducationLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||
count | 1482.00 | 1482.00 | 1482.00 | 1482.00 | 1482.00 | 1482.00 | 1482.00 | 1482.00 | 1482.00 | 1482.00 | 1482.00 | 1482.00 | 1482.00 | 1482.00 | 1482.00 |
mean | 36.89 | 2.91 | 2.73 | 6484.59 | 2.70 | 15.19 | 3.15 | 43.06 | 11.24 | 2.80 | 2.76 | 7.00 | 4.23 | 2.19 | 4.12 |
std | 9.14 | 1.02 | 1.10 | 4693.85 | 2.50 | 3.69 | 0.36 | 5.41 | 7.77 | 1.29 | 0.71 | 6.11 | 3.62 | 3.23 | 3.56 |
min | 18.00 | 1.00 | 1.00 | 1009.00 | 0.00 | 0.00 | 2.00 | 40.00 | 0.00 | 0.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 |
25% | 30.00 | 2.00 | 2.00 | 2914.75 | 1.00 | 12.00 | 3.00 | 40.00 | 6.00 | 2.00 | 2.00 | 3.00 | 2.00 | 0.00 | 2.00 |
50% | 36.00 | 3.00 | 3.00 | 4900.00 | 2.00 | 14.00 | 3.00 | 40.00 | 10.00 | 3.00 | 3.00 | 5.00 | 3.00 | 1.00 | 3.00 |
75% | 43.00 | 4.00 | 4.00 | 8307.75 | 4.00 | 18.00 | 3.00 | 47.00 | 15.00 | 3.00 | 3.00 | 9.00 | 7.00 | 3.00 | 7.00 |
max | 60.00 | 5.00 | 4.00 | 19999.00 | 9.00 | 25.00 | 4.00 | 71.00 | 40.00 | 6.00 | 4.00 | 40.00 | 18.00 | 15.00 | 17.00 |
var | 83.60 | 1.05 | 1.21 | 22032193.71 | 6.26 | 13.59 | 0.13 | 29.24 | 60.35 | 1.66 | 0.50 | 37.36 | 13.10 | 10.41 | 12.70 |
# double-checking variance by calculating the square of the standard deviation from describe()
# doing this more for practice and checking of methods - not critical for this analysis
df_variance_check = df_stats[df_stats.index == "std"].apply(lambda x: x ** 2)
df_variance_check.rename(index={"std": "var"})
Age | EducationLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||
var | 83.5396 | 1.0404 | 1.21 | 2.203223e+07 | 6.25 | 13.6161 | 0.1296 | 29.2681 | 60.3729 | 1.6641 | 0.5041 | 37.3321 | 13.1044 | 10.4329 | 12.6736 |
# selecting a clean style for plots
sns.set_style("white")
# setting colours
color="orange"
colors = ["brown", "darkorange", "orange", "navajowhite", "papayawhip"]
colors_r = list(reversed(colors))
colors_ext = ["brown", "darkorange", "orange", "gold", "navajowhite", "papayawhip"]
colors_ext_r = list(reversed(colors_ext))
# Histograms for each category (except EmployeeID)
# Note: for some columns with nominal, ordinal, Boolean data,
# it might not be quite correct to use histograms - but this is just a sketch pad
for x in df.columns[1:]:
    print(x)
    plt.hist(x=x, data=df, color=color)
    plt.show()
Age
Resigned
BusinessTravel
BusinessUnit
EducationLevel
Gender
JobSatisfaction
MaritalStatus
MonthlyIncome
NumCompaniesWorked
OverTime
PercentSalaryHike
PerformanceRating
AverageWeeklyHoursWorked
TotalWorkingYears
TrainingTimesLastYear
WorkLifeBalance
YearsAtCompany
YearsInRole
YearsSinceLastPromotion
YearsWithCurrManager
Most of these distributions are right-skewed, as might be expected.
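As a compact alternative to the loop above, pandas can draw all the numeric histograms in one grid; a sketch:
# hedged sketch: one call plots histograms for every column in integer_columns
df[integer_columns].hist(figsize=(16, 12), color=color)
plt.tight_layout()
plt.show()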
# pie plots for each ordinal category/rating
for x in category_subset:
    print(x)
    segments = df[x].value_counts().sort_index(ascending=True)
    print(segments)
    labels = segments.index
    print(labels, segments)
    plt.pie(segments
            , labels = labels
            , colors = colors_r
            , counterclock = True
            , startangle = 30
            , labeldistance = 1.2
            , pctdistance = 0.65
            , autopct = lambda p: f"{int(p)}%")
    plt.show()
JobSatisfaction
1    289
2    280
3    450
4    463
Name: JobSatisfaction, dtype: int64
Int64Index([1, 2, 3, 4], dtype='int64')

WorkLifeBalance
1     81
2    346
3    900
4    155
Name: WorkLifeBalance, dtype: int64
Int64Index([1, 2, 3, 4], dtype='int64')

EducationLevel
1    170
2    287
3    574
4    403
5     48
Name: EducationLevel, dtype: int64
Int64Index([1, 2, 3, 4, 5], dtype='int64')

PerformanceRating
2       1
3    1252
4     229
Name: PerformanceRating, dtype: int64
Int64Index([2, 3, 4], dtype='int64')
This assumes that, for some of the features, the scale runs from 1 (low) to 4 (high). This does need to be verified.
Things to explore (in no particular order):
# Looking at value counts for each string-based column (using bar graphs this time)
for x in string_columns[1:]:  # the slice already skips EmployeeID
    fig, ax = plt.subplots()
    print(x)
    height = df[x].value_counts().sort_index(ascending=True)
    labels = height.index
    graph = plt.bar(labels, height, color=color)
    ax.bar_label(graph)
    plt.show()
Resigned
BusinessTravel
BusinessUnit
Gender
MaritalStatus
OverTime
#looking at breakdown of resignees by Business Unit roles
resigned = df[df["Resigned"]=="YES"]
fig, ax = plt.subplots()
no_resignees = resigned["BusinessUnit"].value_counts().sort_index(ascending=True)
labels = no_resignees.index
graph = plt.bar(labels, no_resignees
, color = color
)
ax.bar_label(graph)
plt.show()
total_by_businessunit = df["BusinessUnit"].value_counts().sort_index(ascending=True)
role = total_by_businessunit.index
print("Number of Resignees by role:")
print(no_resignees, "\n")
print("Total employees by role:")
print(total_by_businessunit, "\n")
percent_resign_role = []
print("Percentage of Resignees by role")
for role, con, total in zip(role, no_resignees, total_by_businessunit):
    # calculate % resignees by role
    calc = round(con/total*100, 2)
    percent_resign_role.append(calc)
    print(role, calc)
Number of Resignees by role:
BUSINESS OPERATIONS     12
CONSULTANTS            134
SALES                   93
Name: BusinessUnit, dtype: int64

Total employees by role:
BUSINESS OPERATIONS     65
CONSULTANTS            969
SALES                  448
Name: BusinessUnit, dtype: int64

Percentage of Resignees by role
BUSINESS OPERATIONS 18.46
CONSULTANTS 13.83
SALES 20.76
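The same percentages can be cross-checked with a single groupby expression; a sketch:
# hedged sketch: resignation rate per business unit in one line
print((df["Resigned"].eq("YES").groupby(df["BusinessUnit"]).mean() * 100).round(2))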
# Create dataframe for roles
role_by_es = pd.DataFrame({"Resigned": no_resignees
                           , "Total": total_by_businessunit
                           }, index=total_by_businessunit.index).T
role_by_es_norm = role_by_es.copy()
# normalised version (lo/hi/span avoid shadowing the built-ins min, max and range)
for es in role_by_es_norm:
    lo = role_by_es_norm[es].min()
    hi = role_by_es_norm[es].max()
    span = hi - lo
    print(f"{es} - Max: {hi}, Min: {lo}, Range: {span}")
    role_by_es_norm[es] = role_by_es_norm[es].apply(lambda x: (x - lo)/span)
BUSINESS OPERATIONS - Max: 65, Min: 12, Range: 53
CONSULTANTS - Max: 969, Min: 134, Range: 835
SALES - Max: 448, Min: 93, Range: 355
role_by_es
BUSINESS OPERATIONS | CONSULTANTS | SALES | |
---|---|---|---|
Resigned | 12 | 134 | 93 |
Total | 65 | 969 | 448 |
# plot for role and employment status
ax = role_by_es.plot.bar()
# maybe overkill but took a while to work out how to do the 100% stacked bar chart
no_resignees = pd.DataFrame(no_resignees)
total_by_businessunit = pd.DataFrame(total_by_businessunit)
# generate percentages
role_by_es_test = role_by_es.apply(lambda x: x/x.max()*100).T
role_by_es_test
Resigned | Total | |
---|---|---|
BUSINESS OPERATIONS | 18.461538 | 100.0 |
CONSULTANTS | 13.828689 | 100.0 |
SALES | 20.758929 | 100.0 |
# 100% stacked bar plot for Business roles and Employment Status
# (Took a bit of mucking around)
labels = role_by_es_test.index
top = list(role_by_es_test["Resigned"])
bottom = list(role_by_es_test["Total"])
x = 0
for t, b in zip(top, bottom):
    bottom[x] = b - t
    ax.text(x, t, t, fontsize=10, ha='center', va='bottom')
    x += 1
    print(bottom)
fig, ax = plt.subplots()
ax.bar(labels, bottom, label="Total")
ax.bar(labels, top, label="Resigned", bottom=bottom)
plt.show()
[81.53846153846153, 100.0, 100.0]
[81.53846153846153, 86.17131062951496, 100.0]
[81.53846153846153, 86.17131062951496, 79.24107142857143]
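For reference, pandas can produce a similar 100% stacked view with much less bookkeeping; a sketch (it uses the role_by_es frame built above):
# hedged sketch: normalise each unit to 100% and let pandas stack the bars
stacked = role_by_es.T
stacked["Stayed"] = stacked["Total"] - stacked["Resigned"]
stacked[["Resigned", "Stayed"]].div(stacked["Total"], axis=0).mul(100).plot.bar(stacked=True)
plt.show()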
This section was a mix of data exploration for the report, using the visualisations to help direct it.
Some was about visual/coding exploration, looking for points of interest, and some was just trying stuff.
With the wide range of possible combinations/permutations of variables, not to mention filtering/slicing/transforming options, this was a good way to quickly "romp" through the data and then follow up the points of interest.
#just because you can.
sns.pairplot(df[integer_columns])
<seaborn.axisgrid.PairGrid at 0x7fd7a9018100>
# copied this from above, just to have as a reference in this section
"""string_columns = ["EmployeeID"
, "Resigned"
, "BusinessTravel"
, "BusinessUnit"
, "Gender"
, "MaritalStatus"
, "OverTime"
]
integer_columns = ["Age", "EducationLevel", "JobSatisfaction", "MonthlyIncome"
, "NumCompaniesWorked", "PercentSalaryHike", "PerformanceRating"
, "AverageWeeklyHoursWorked", "TotalWorkingYears", "TrainingTimesLastYear"
, "WorkLifeBalance", "YearsAtCompany", "YearsInRole", "YearsSinceLastPromotion"
, "YearsWithCurrManager"
]
"""
# A more select group to explore and actually be able to read the data
pairplot_columns = ["Resigned", "Gender", "OverTime", "Age", "EducationLevel"
, "JobSatisfaction", "MonthlyIncome", "WorkLifeBalance"
, "PerformanceRating"]
sns.pairplot(df[pairplot_columns], hue="MonthlyIncome")
<seaborn.axisgrid.PairGrid at 0x7fd79a076820>
sns.pairplot(df[pairplot_columns], hue="Resigned")
<seaborn.axisgrid.PairGrid at 0x7fd78b00db20>
sns.pairplot(df[pairplot_columns], hue="Resigned", kind="hist")
<seaborn.axisgrid.PairGrid at 0x7fd7da499ee0>
sns.pairplot(df, hue="Resigned"
, vars=["JobSatisfaction", "WorkLifeBalance", "MonthlyIncome"]
, corner=True
)
<seaborn.axisgrid.PairGrid at 0x7fd7ba4ed1f0>
sns.pairplot(df, hue="Resigned", kind="kde"
, vars=["JobSatisfaction", "WorkLifeBalance", "MonthlyIncome"]
, corner=True
)
<seaborn.axisgrid.PairGrid at 0x7fd79ab85a00>
sns.pairplot(df, hue="Resigned", kind="kde"
, vars=["JobSatisfaction", "WorkLifeBalance", "MonthlyIncome"]
, corner=True
, diag_kind = "auto")
<seaborn.axisgrid.PairGrid at 0x7fd79abaa430>
sns.pairplot(df, hue="MaritalStatus", kind="kde"
, vars=["JobSatisfaction", "WorkLifeBalance", "MonthlyIncome"]
, corner=True
, diag_kind = "auto"
)
<seaborn.axisgrid.PairGrid at 0x7fd78bcf2f70>
# looking at Divorcees
# (initially mistaken that Divorcees earned less money,
# but actually the plot above shows the density, not the value
# - need to be careful to understand these plots!)
df_divorced = df[df["MaritalStatus"] == "DIVORCED"]
sns.pairplot(df_divorced, hue="JobSatisfaction", kind="kde"
, vars=["Age", "WorkLifeBalance", "MonthlyIncome"]
, corner=True
, diag_kind = "auto"
)
<seaborn.axisgrid.PairGrid at 0x7fd7db3af7f0>
#and more features
df_divorced = df[df["MaritalStatus"] == "DIVORCED"]
sns.pairplot(df_divorced, hue="JobSatisfaction", kind="kde"
, vars=["Age", "WorkLifeBalance", "MonthlyIncome", "AverageWeeklyHoursWorked", "YearsAtCompany"]
, corner=True
, diag_kind = "auto"
)
<seaborn.axisgrid.PairGrid at 0x7fd7ba74da60>
g = sns.PairGrid(df
, hue = "Resigned"
, vars=["JobSatisfaction", "Age", "WorkLifeBalance", "MonthlyIncome", "YearsWithCurrManager"])
g.map(sns.scatterplot)
<seaborn.axisgrid.PairGrid at 0x7fd78c0823d0>
g = sns.PairGrid(df
, hue = "OverTime"
, vars=["JobSatisfaction", "Age", "WorkLifeBalance", "AverageWeeklyHoursWorked", "MonthlyIncome", "YearsAtCompany"])
g.map(sns.scatterplot)
#interesting cluster in the Age/AverageWeekly hours
<seaborn.axisgrid.PairGrid at 0x7fd7aba22f70>
g = sns.PairGrid(df
, hue = "Resigned"
, vars=["JobSatisfaction", "Age", "AverageWeeklyHoursWorked", "MonthlyIncome", "YearsAtCompany"])
g.map(sns.scatterplot)
#interesting cluster in the Age/AverageWeekly hours
<seaborn.axisgrid.PairGrid at 0x7fd78c4bedf0>
g = sns.PairGrid(df
, hue = "Resigned"
, vars=["Age", "AverageWeeklyHoursWorked", "MonthlyIncome", "YearsAtCompany"])
g.map(sns.scatterplot)
# interesting clustering in the AverageWeeklyHours column in relation to Age, Income and YearsAtCompany
# It seems AverageWeeklyHoursWorked is not highly correlated to YearsAtCompany, Age or MonthlyIncome
# BUT... look at the clusters based on hue of "Resigned"
# Looks like the attrition comes from those that are:
# - putting in the longer hours
# - generally with the company for less than 10 years (approx)
# - on lower incomes
# TO DO: further research required to check PerformanceRating
<seaborn.axisgrid.PairGrid at 0x7fd7bb0a2550>
g = sns.PairGrid(df
, hue = "JobSatisfaction"
, vars=["Age", "AverageWeeklyHoursWorked", "MonthlyIncome", "YearsAtCompany"])
g = g.add_legend(fontsize=14, bbox_to_anchor=(1,1))
g.map(sns.scatterplot)
<seaborn.axisgrid.PairGrid at 0x7fd7bae80160>
g = sns.PairGrid(df
, hue = "Age"
, vars=["Age", "AverageWeeklyHoursWorked", "MonthlyIncome", "YearsAtCompany"])
#g = g.add_legend(fontsize=14, bbox_to_anchor=(1,1))
g.map(sns.scatterplot)
<seaborn.axisgrid.PairGrid at 0x7fd7bbea83d0>
g = sns.PairGrid(df
, hue = "MonthlyIncome"
, vars=["Age", "AverageWeeklyHoursWorked", "MonthlyIncome", "YearsAtCompany"]
,) #palette = "Oranges")
#g = g.add_legend(fontsize=14, bbox_to_anchor=(1,1))
g.map(sns.scatterplot)
<seaborn.axisgrid.PairGrid at 0x7fd7acd63d30>
g = sns.PairGrid(df
, hue = "MonthlyIncome"
, vars=["Age", "JobSatisfaction", "AverageWeeklyHoursWorked", "MonthlyIncome", "YearsAtCompany"])
#g = g.add_legend(fontsize=14, bbox_to_anchor=(1,1))
g.map(sns.scatterplot)
<seaborn.axisgrid.PairGrid at 0x7fd79c977550>
g = sns.pairplot(df
, hue = "JobSatisfaction"
, vars = ["Age", "AverageWeeklyHoursWorked", "MonthlyIncome", "YearsAtCompany"]
, kind = "reg")
g.map(sns.scatterplot)
#no correlation between income and age or hours worked - more to do with years at the company
#QUESTION - what other features might be more connected to income?
<seaborn.axisgrid.PairGrid at 0x7fd7ad7d92e0>
g = sns.PairGrid(df
, hue = "Gender"
, vars=["Age", "AverageWeeklyHoursWorked", "MonthlyIncome", "YearsAtCompany"])
g.map(sns.scatterplot)
#interesting cluster in the Age/AverageWeekly hours
<seaborn.axisgrid.PairGrid at 0x7fd7cc2c8d30>
g = sns.PairGrid(df
, hue = "OverTime"
, vars=["Age", "AverageWeeklyHoursWorked", "MonthlyIncome", "YearsAtCompany"])
g.map(sns.scatterplot)
#those doing overtime
# one to explore further -
# curious about differences between the linear cluster (40 hrs) and the one roughly 45-55 hrs
<seaborn.axisgrid.PairGrid at 0x7fd78c464a30>
sns.pairplot(df, hue="MaritalStatus", kind="scatter"
, vars=["JobSatisfaction", "Age", "WorkLifeBalance", "MonthlyIncome"]
, corner=True
)
<seaborn.axisgrid.PairGrid at 0x7fd79d5a1700>
df_heatmap = df.pivot_table(index="JobSatisfaction", columns="WorkLifeBalance", values="MonthlyIncome")
sns.heatmap(df_heatmap, annot=True, fmt=".1f", cmap="YlOrBr")
<Axes: xlabel='WorkLifeBalance', ylabel='JobSatisfaction'>
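Note that pivot_table aggregates with the mean by default, so each cell above is the mean MonthlyIncome for that JobSatisfaction/WorkLifeBalance pair; spelling that out (a sketch):
# hedged sketch: identical result with the aggregation made explicit
df_heatmap = df.pivot_table(index="JobSatisfaction", columns="WorkLifeBalance",
                            values="MonthlyIncome", aggfunc="mean")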
category_subset
['JobSatisfaction', 'WorkLifeBalance', 'EducationLevel', 'PerformanceRating']
df_heatmap = df[category_subset].pivot_table(index="JobSatisfaction", columns="WorkLifeBalance")
sns.heatmap(df_heatmap, annot=True, fmt=".1f", cmap="YlOrBr")
<Axes: xlabel='None-WorkLifeBalance', ylabel='JobSatisfaction'>
category_subset_r = list(reversed(category_subset))
df_heatmap = df[category_subset].pivot_table(index="JobSatisfaction")
sns.heatmap(df_heatmap, annot=True, fmt=".1f", cmap="RdBu_r")
<Axes: ylabel='JobSatisfaction'>
category_subset_r = list(reversed(category_subset))
df_heatmap = df[category_subset].pivot_table(index="JobSatisfaction").sort_index(ascending=False)
sns.heatmap(df_heatmap, annot=True, fmt=".1f", cmap="YlOrBr")
<Axes: ylabel='JobSatisfaction'>
df_heatmap = df[category_subset].pivot_table(index="JobSatisfaction").sort_index(ascending=False)
#reorder columns
df_heatmap2 = df_heatmap[["WorkLifeBalance", "EducationLevel", "PerformanceRating"]]
sns.heatmap(df_heatmap2, annot=True, fmt=".1f", cmap="YlOrBr")
<Axes: ylabel='JobSatisfaction'>
df_heatmap = df[category_subset].pivot_table(index="WorkLifeBalance").sort_index(ascending=False)
#reorder columns
df_heatmap2 = df_heatmap[["JobSatisfaction", "EducationLevel", "PerformanceRating"]]
sns.heatmap(df_heatmap2, annot=True, fmt=".1f", cmap="YlOrBr")
<Axes: ylabel='WorkLifeBalance'>
df_heatmap = df[category_subset].pivot_table(index="EducationLevel").sort_index(ascending=False)
#reorder columns
df_heatmap2 = df_heatmap[["WorkLifeBalance", "JobSatisfaction", "PerformanceRating"]]
sns.heatmap(df_heatmap2, annot=True, fmt=".1f", cmap="YlOrBr")
<Axes: ylabel='EducationLevel'>
df_heatmap = df[category_subset].pivot_table(index="PerformanceRating").sort_index(ascending=False)
#reorder columns
df_heatmap2 = df_heatmap[["WorkLifeBalance", "JobSatisfaction", "EducationLevel"]]
sns.heatmap(df_heatmap2, annot=True, fmt=".1f", cmap="YlOrBr")
<Axes: ylabel='PerformanceRating'>
df_heatmap = df[category_subset].pivot_table(index="PerformanceRating").sort_index(ascending=False)
#reorder columns
df_heatmap_corr = df[category_subset].corr()
sns.heatmap(df_heatmap_corr, annot=True, fmt=".2f", cmap="Oranges")
<Axes: >
print(df.columns)
Index(['EmployeeID', 'Age', 'Resigned', 'BusinessTravel', 'BusinessUnit', 'EducationLevel', 'Gender', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'AverageWeeklyHoursWorked', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'], dtype='object')
# try complete dataset
plt.figure(figsize=(12, 12))
ax = sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="Oranges")
plt.suptitle("Employee Profile Heatmap", fontsize=16, fontweight="bold", y=0.93)
plt.subplots_adjust(left=.2, bottom=.2)
# plt.savefig("Corr_heatmap.jpg", dpi=300)
plt.show()
# Tried a couple of these, but wasn't getting much insight - possibly worth trying more combinations
df.plot(kind="hexbin", x="YearsAtCompany", y="AverageWeeklyHoursWorked", gridsize=15, figsize=(10,6))
<Axes: xlabel='YearsAtCompany', ylabel='AverageWeeklyHoursWorked'>
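If I were to try more: a hexbin via seaborn's jointplot adds marginal histograms, which might surface more structure. A sketch, not run in the original:
sns.jointplot(data=df, x="YearsAtCompany", y="AverageWeeklyHoursWorked", kind="hex", color="Orange")
plt.show()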
df.boxplot(column="YearsAtCompany", by="Resigned", figsize=(5,4))
<Axes: title={'center': 'YearsAtCompany'}, xlabel='Resigned'>
df.boxplot(column="YearsInRole", by="Resigned", figsize=(5,4))
<Axes: title={'center': 'YearsInRole'}, xlabel='Resigned'>
df.boxplot(column="MonthlyIncome", by="Resigned", figsize=(5,4))
<Axes: title={'center': 'MonthlyIncome'}, xlabel='Resigned'>
# This one stood out - the difference between the two boxes is striking
# explored further in the "official" set below
df.boxplot(column="AverageWeeklyHoursWorked", by="Resigned", figsize=(5,4))
<Axes: title={'center': 'AverageWeeklyHoursWorked'}, xlabel='Resigned'>
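To put numbers on that difference rather than eyeballing the boxes, a quick summary split by employment status:
df.groupby("Resigned")["AverageWeeklyHoursWorked"].describe()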
df["BusinessUnit"].value_counts()
CONSULTANTS 969 SALES 448 BUSINESS OPERATIONS 65 Name: BusinessUnit, dtype: int64
color = "Orange"
#create subsets for each business unit
df_consultants = df[df["BusinessUnit"] =="CONSULTANTS"]
df_sales = df[df["BusinessUnit"] =="SALES"]
df_operations = df[df["BusinessUnit"] =="BUSINESS OPERATIONS"]
df
EmployeeID | Age | Resigned | BusinessTravel | BusinessUnit | EducationLevel | Gender | JobSatisfaction | MaritalStatus | MonthlyIncome | NumCompaniesWorked | OverTime | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7912 | 41 | YES | RARELY | SALES | 2 | FEMALE | 4 | SINGLE | 5993 | 8 | YES | 11 | 3 | 63 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
1 | 1520 | 49 | NO | FREQUENTLY | CONSULTANTS | 1 | MALE | 2 | MARRIED | 5130 | 1 | NO | 23 | 4 | 40 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
2 | 1488 | 37 | YES | RARELY | CONSULTANTS | 2 | MALE | 3 | SINGLE | 2090 | 6 | YES | 15 | 3 | 50 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
3 | 2535 | 33 | NO | FREQUENTLY | CONSULTANTS | 4 | FEMALE | 3 | MARRIED | 2909 | 1 | YES | 11 | 3 | 48 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
4 | 4582 | 27 | NO | RARELY | CONSULTANTS | 1 | MALE | 2 | MARRIED | 3468 | 9 | NO | 12 | 3 | 40 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1477 | 6680 | 40 | NO | NON | CONSULTANTS | 4 | MALE | 3 | DIVORCED | 3935 | 3 | NO | 11 | 3 | 40 | 11 | 2 | 4 | 8 | 6 | 1 | 6 |
1478 | 3190 | 33 | YES | RARELY | CONSULTANTS | 4 | MALE | 3 | SINGLE | 2686 | 1 | NO | 13 | 3 | 40 | 10 | 2 | 2 | 10 | 8 | 8 | 10 |
1479 | 9017 | 38 | NO | RARELY | CONSULTANTS | 2 | FEMALE | 3 | MARRIED | 3700 | 1 | NO | 0 | 2 | 46 | 5 | 3 | 3 | 5 | 4 | 0 | 3 |
1480 | 2477 | 32 | NO | FREQUENTLY | SALES | 4 | MALE | 4 | SINGLE | 4802 | 8 | NO | 13 | 3 | 50 | 4 | 2 | 4 | 1 | 1 | 0 | 1 |
1481 | 3238 | 36 | NO | NON | CONSULTANTS | 4 | FEMALE | 3 | MARRIED | 4802 | 2 | NO | 14 | 3 | 40 | 15 | 3 | 3 | 13 | 10 | 10 | 5 |
df_employment_status = df["Resigned"].map({"YES": "Resigned", "NO": "Employed"})
plt.bar(df_employment_status.value_counts().index, df_employment_status.value_counts(), color=color)
plt.ylabel("No. of responses")
plt.suptitle("Fig. 1. Employment Status of survey respondents", fontsize=14, fontweight="bold")
plt.title("Currently Employed vs Resigned")
#plt.savefig("Fig. 1. Employment_Status.jpg", dpi=300)
plt.show()
df["Resigned"].value_counts()
NO 1243 YES 239 Name: Resigned, dtype: int64
df_employment_status = df_consultants["Resigned"].map({"YES": "Resigned", "NO": "Employed"})
plt.bar(df_employment_status.value_counts().index, df_employment_status.value_counts(), color=color)
plt.ylabel("No. of responses")
plt.suptitle("Employment Status of Consultants respondents", fontsize=14, fontweight="bold")
plt.title("Currently Employed vs Resigned")
# plt.savefig("Employment_Status_consultants.jpg", dpi=300)
plt.show()
df_consultants["Resigned"].value_counts()
NO 835 YES 134 Name: Resigned, dtype: int64
The key thing: the perceived turnover issue is backed up by the data - 239 of 1,482 employees (about 16%) have resigned.
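A minimal sketch computing the rates behind that claim (my addition):
overall_rate = (df["Resigned"] == "YES").mean() * 100
consultant_rate = (df_consultants["Resigned"] == "YES").mean() * 100
print(f"Overall resigned: {overall_rate:.1f}%, Consultants resigned: {consultant_rate:.1f}%")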
segments = df["JobSatisfaction"].value_counts().sort_index(ascending=True)
labels = list(segments.index)
labels[0], labels[3] = "1 (least)", "4 (most)"
plt.pie(segments
, labels = labels
, colors = colors_r
, counterclock = True
, startangle = 90
, labeldistance = 1.2
, pctdistance = 0.65
, autopct = lambda p: f"{int(p)}%"
, textprops={"fontsize": 14}
)
plt.suptitle("Fig. 2. Job Satisfaction of Employees", fontsize=14, fontweight="bold")
plt.title("Scale of 1-4")
#plt.savefig("Fig. 2. JobSatisfaction_pie.jpg", dpi=300)
plt.show()
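colors_r comes from earlier in the notebook; if re-running just these pie-chart cells, a stand-in along these lines should work (assumption: a four-step YlOrBr palette, reversed so the lowest rating is darkest):
colors_r = sns.color_palette("YlOrBr", n_colors=4)[::-1]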
segments = df_consultants["JobSatisfaction"].value_counts().sort_index(ascending=True)
labels = list(segments.index)
labels[0], labels[3] = "1 (least)", "4 (most)"
plt.pie(segments
, labels = labels
, colors = colors_r
, counterclock = True
, startangle = 90
, labeldistance = 1.2
, pctdistance = 0.65
, autopct = lambda p: f"{int(p)}%"
, textprops={"fontsize": 14}
)
plt.suptitle("Job Satisfaction of Consultants", fontsize=14, fontweight="bold")
plt.title("Scale of 1-4")
#plt.title("Title")
# plt.savefig("JobSatisfaction_pie.jpg", dpi=300)
plt.show()
plt.hist("MonthlyIncome", data=df, color="Orange")
plt.xlabel("$/month")
plt.ylabel("No. of employees")
plt.suptitle("Monthly Income Distribution", fontsize=14, fontweight="bold")
plt.tight_layout()
# plt.savefig("Monthly_Income_hist.jpg", dpi=300)
plt.show()
# displot is figure-level and opens its own figure, so a preceding plt.figure(figsize=(12,6))
# call gets ignored (hence the stray empty figure in the original output) - size via height/aspect instead
ax = sns.displot(data=df["MonthlyIncome"], kind="kde", color="Orange", height=6, aspect=2)
plt.xlabel("$/month")
plt.ylabel("Density")
plt.suptitle("Fig. 2. Monthly Income Distribution (KDE)", fontsize=14, fontweight="bold")
plt.tight_layout()
# plt.savefig("Fig. 3. - Monthly_Income_KDE.jpg", dpi=300)
plt.show()
# % of each business unit who resigned ("Total" column is a constant 100%)
role_by_es_test
Resigned | Total | |
---|---|---|
BUSINESS OPERATIONS | 18.461538 | 100.0 |
CONSULTANTS | 13.828689 | 100.0 |
SALES | 20.758929 | 100.0 |
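role_by_es_test was assembled further up in the notebook; a minimal reconstruction matching the table above (assuming "Resigned" is the % of each unit who resigned, with a constant "Total" of 100) could be:
role_by_es_test = pd.crosstab(df["BusinessUnit"], df["Resigned"], normalize="index") * 100
role_by_es_test = role_by_es_test.rename(columns={"YES": "Resigned"})[["Resigned"]]
role_by_es_test["Total"] = 100.0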
labels = role_by_es_test.index
top = list(role_by_es_test["Resigned"])
bottom = list(role_by_es_test["Total"])
# convert "Total" into the employed remainder so the two bars stack to 100%
bottom = [b - t for t, b in zip(top, bottom)]
print(bottom)
fig, ax = plt.subplots(figsize=(7,6))
ax.bar(labels, bottom, label="Total", color=color)
ax.bar(labels, top, label="Resigned", bottom=bottom)
for n, val in enumerate(role_by_es_test["Resigned"]):
ax.text(-0.12+(n), 91, str(round(val,1))+"%", fontsize=12, color = "white")
plt.ylabel("%")
plt.title("Fig. 6. Employment Status by Business Role (%)", fontsize=14, fontweight="bold")
plt.legend(bbox_to_anchor=(1.25,.8), loc = "upper right")
plt.tight_layout()
# plt.savefig("Fig. 6. Employment Status by Business Role.jpg", dpi=300)
plt.show()
[81.53846153846153, 86.17131062951496, 79.24107142857143]
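Worth noting: pandas can draw the same 100% stacked bars without the manual subtraction. A sketch using the same role_by_es_test:
stacked = pd.DataFrame({
    "Employed": 100 - role_by_es_test["Resigned"],
    "Resigned": role_by_es_test["Resigned"],
})
stacked.plot(kind="bar", stacked=True, figsize=(7, 6), color=[color, "#1f77b4"])
plt.ylabel("%")
plt.show()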
plt.hist("MonthlyIncome", data=df_consultants, color="Orange")
plt.xlabel("$/month")
plt.ylabel("No. of employees")
plt.suptitle("Monthly Income Distribution for Consultants", fontsize=14, fontweight="bold")
plt.tight_layout()
# plt.savefig("Monthly_Income_hist_Consultants.jpg", dpi=300)
plt.show()
sns.set_context(context="paper")
g = sns.PairGrid(df
, hue = "Resigned"
, vars=["Age", "AverageWeeklyHoursWorked", "MonthlyIncome", "YearsAtCompany"])
g.fig.subplots_adjust(top=.93)
g.fig.suptitle("Pairplots - Age, Average Weekly Hours, Income and Employment Status", fontsize=16, fontweight="bold")
g.map_lower(sns.kdeplot)
g.map_upper(sns.scatterplot)
g.map_diag(sns.histplot, bins= 5)
# g.savefig("Appendix 4. PairPlots_Age_WeeklyHours_MonthlyIncome_Years.jpg", dpi=300)
<seaborn.axisgrid.PairGrid at 0x7fd7bd056be0>
sns.set_context(context="paper")
g = sns.PairGrid(df_consultants
, hue = "Resigned"
, vars=["Age", "AverageWeeklyHoursWorked", "MonthlyIncome", "YearsAtCompany"])
g.fig.subplots_adjust(top=.93)
g.fig.suptitle("Pair Plots for Consultants: Age, Average Weekly Hours, Income and Employment Status", fontsize=16, fontweight="bold")
g.map_lower(sns.kdeplot)
g.map_upper(sns.scatterplot)
g.map_diag(sns.histplot, bins= 5)
# g.savefig("PairPlots_Age_WeeklyHours_MonthlyIncome_Years - Consultants.jpg", dpi=300)
<seaborn.axisgrid.PairGrid at 0x7fd7ae4f3190>
# plt.figure(figsize=(10,5))
g = sns.PairGrid(df
, hue = "Resigned"
, x_vars=["MonthlyIncome", "YearsAtCompany", "Age"]
, y_vars=["AverageWeeklyHoursWorked"]
)
g.fig.subplots_adjust(top=.85)
g.fig.suptitle("Fig. 7. - Employment Status insights", fontsize=12, fontweight="bold")
g.map(sns.scatterplot)
g.add_legend(title="Resigned?")
# TODO: relabel legend entries via {"YES": "Resigned", "NO": "Employed"}
# g.savefig("Fig. 7. Employment_status_insights.jpg", dpi=300)
<seaborn.axisgrid.PairGrid at 0x7fd7de926f70>
df_heatmap = df[category_subset].pivot_table(index="JobSatisfaction").sort_index(ascending=False)
#reorder columns so it reads visually from least correlating to most
df_heatmap2 = df_heatmap[["WorkLifeBalance", "EducationLevel", "PerformanceRating"]]
plt.figure(figsize=(7,5))
plt.suptitle("Fig. 4. Job Satisfaction in relation to: \nWork/Life Balance, Education & Performance"
, fontsize=12, fontweight="bold")
ax = sns.heatmap(df_heatmap2, annot=True, fmt=".1f", cmap="YlOrBr")
# plt.savefig("Fig. 4. - Job_WorkLife_Education_Performance_heatmap.jpg", dpi=300)
print(df.columns)
Index(['EmployeeID', 'Age', 'Resigned', 'BusinessTravel', 'BusinessUnit', 'EducationLevel', 'Gender', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'AverageWeeklyHoursWorked', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'], dtype='object')
select_columns=["Age", "MonthlyIncome", "WorkLifeBalance", "AverageWeeklyHoursWorked"
, "NumCompaniesWorked", "YearsAtCompany", "YearsInRole"
, "YearsSinceLastPromotion", "YearsWithCurrManager"]
# try complete dataset
plt.figure(figsize=(9, 9))
ax = sns.heatmap(df[select_columns].corr(), annot=True, fmt=".2f", cmap="YlOrBr")
plt.suptitle("Employee Profile Heatmap", fontsize=14, fontweight="bold", y=0.93)
plt.subplots_adjust(left=.2, bottom=.2)
# plt.savefig("Appendix - Employee Profile Heatmap.jpg", dpi=300)
plt.show()
df.boxplot(column="AverageWeeklyHoursWorked", by="Resigned", figsize=(5,4))
<Axes: title={'center': 'AverageWeeklyHoursWorked'}, xlabel='Resigned'>
# Just consultants (approx. 2/3 of the dataset) - not noticeably different from the whole population.
df_consultants.boxplot(column="AverageWeeklyHoursWorked", by="Resigned", figsize=(5,4))
<Axes: title={'center': 'AverageWeeklyHoursWorked'}, xlabel='Resigned'>
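Backing that up with numbers instead of eyeballing the boxes:
print(df.groupby("Resigned")["AverageWeeklyHoursWorked"].agg(["median", "mean"]))
print(df_consultants.groupby("Resigned")["AverageWeeklyHoursWorked"].agg(["median", "mean"]))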
plt.figure(figsize=(11, 6))
ax = sns.boxplot( data=df
, x="AverageWeeklyHoursWorked"
, y="Resigned"
, hue="JobSatisfaction"
, fliersize=3
, palette = "YlOrBr"
)
plt.suptitle("Fig. 5. Employment Status and Job Satisfaction vs Weekly Hours"
, fontsize=14, fontweight="bold")
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
# plt.savefig("Fig. 5. Employment and Job Satisfaction Boxplot.jpg", dpi=300)
# working out number of fields to work out matrix format of histogram/bar subplots
len(df.columns[1:])
21
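For reference, integer_columns used below was defined earlier in the notebook; if re-running from here, a stand-in derived from the dtypes should be equivalent (assumption: numeric columns get histograms, everything else gets bar charts):
integer_columns = df.select_dtypes(include="number").columns.drop("EmployeeID")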
# Appendix 2 - all features
# to iterate through subplot matrix
m_row,m_col = 0, 0
fig, ax = plt.subplots(7,3, figsize=(30,44))
for feature in df.columns[1:]:
#select correct plot depending on data type
if feature in integer_columns:
ax[m_row, m_col].hist(x=feature, data=df, color=color)
else:
ax[m_row, m_col].bar(df[feature].value_counts().index, df[feature].value_counts(), color=color)
#add title to subplot
ax[m_row, m_col].set_title(feature, fontsize = 20, fontweight="bold")
#shift to next column, or if necessary move to a new row in first column
m_col += 1
if m_col > 2:
m_row, m_col = m_row + 1, 0
#fig.subplots_adjust(top=0.95)
plt.suptitle("Histograms & Bar Charts for all Features", fontsize = 60, fontweight="bold")
#setting top of subplot area in order to have space for title to go above the subplots
plt.tight_layout(rect=[0,0, 1, 0.97])
# plt.savefig("Appendix 2. All Features.jpg", dpi=300)
plt.show()
# Calculating median weekly hours for each level of Job Satisfaction
#median = df[(df["Resigned"] == "YES") & (df["JobSatisfaction"] == 4)]["AverageWeeklyHoursWorked"].median()
#print (median)
for status in list(df["Resigned"].unique()):
print("Resigned:", status)
print("JobSatisfaction Level MEDIAN weekly hours")
for rating in list(df["JobSatisfaction"].unique()):
median = df[(df["Resigned"] == status) & (df["JobSatisfaction"] == rating)]["AverageWeeklyHoursWorked"].median()
print(f"{rating} {round(median, 2)}")
print()
Resigned: YES JobSatisfaction Level MEDIAN weekly hours 4 50.0 2 50.0 3 50.0 1 40.0 Resigned: NO JobSatisfaction Level MEDIAN weekly hours 4 40.0 2 40.0 3 40.0 1 40.0
# Calculating mean weekly hours for each level of Job Satisfaction
#median = df[(df["Resigned"] == "YES") & (df["JobSatisfaction"] == 4)]["AverageWeeklyHoursWorked"].median()
#print (median)
for status in list(df["Resigned"].unique()):
print("Resigned:", status)
print("JobSatisfaction Level MEAN weekly hours")
for rating in list(df["JobSatisfaction"].unique()):
mean = df[(df["Resigned"] == status) & (df["JobSatisfaction"] == rating)]["AverageWeeklyHoursWorked"].mean()
print(f"{rating} {round(mean, 2)}")
print()
Resigned: YES JobSatisfaction Level MEAN weekly hours 4 48.47 2 46.8 3 46.99 1 45.97 Resigned: NO JobSatisfaction Level MEAN weekly hours 4 42.86 2 41.73 3 42.13 1 42.13
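Both loops can be collapsed into a single pivot table that puts median and mean side by side - a one-liner sketch:
df.pivot_table(index="JobSatisfaction", columns="Resigned", values="AverageWeeklyHoursWorked", aggfunc=["median", "mean"]).round(2)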
# save cleaned CSV to file
# df.to_csv('df_cleaned.csv')
# save descriptive statistics to CSV
# df_stats.to_csv("df_statistics.csv")