This report aims to present the capabilities of the package smbinning.
The document is a part of the paper “Landscape of R packages for eXplainable Machine Learning”, S. Maksymiuk, A. Gosiewska, and P. Biecek (https://arxiv.org/abs/2009.13248). It contains a real-life use case based on the titanic_imputed data set, which is described in the section “Example gallery for XAI packages” of the article.
We did our best to show the entire range of the implemented explanations. Please note that the examples may be incomplete. If you think something is missing, feel free to make a pull request at the GitHub repository MI2DataLab/XAI-tools.
The list of use-cases for all packages included in the article is here.
Load the titanic_imputed data set.
# Load the titanic_imputed data set from the DALEX package and show its
# first six rows.
data("titanic_imputed", package = "DALEX")
head(titanic_imputed, n = 6L)
## gender age class embarked fare sibsp parch survived
## 1 male 42 3rd Southampton 7.11 0 0 0
## 2 male 13 3rd Southampton 20.05 0 2 0
## 3 male 16 3rd Southampton 20.05 1 1 0
## 4 female 39 3rd Southampton 20.05 1 1 1
## 5 female 16 3rd Southampton 7.13 0 0 1
## 6 male 25 3rd Southampton 7.13 0 0 1
# Attach smbinning, which supplies the smbinning.* functions used in this report.
library(smbinning)
# Fix the random number generator seed.
set.seed(123)
Fit a logistic regression to the titanic_imputed data, using predictors binned with smbinning.
# Build a binning object for every predictor against the target `survived`.
# smbinning() is used for the numeric columns and smbinning.factor() for the
# categorical ones.
gender <- smbinning.factor(titanic_imputed, x = "gender", y = "survived")
age <- smbinning(titanic_imputed, x = "age", y = "survived")
smclass <- smbinning.factor(titanic_imputed, x = "class", y = "survived")
embarked <- smbinning.factor(titanic_imputed, x = "embarked", y = "survived")
fare <- smbinning(titanic_imputed, x = "fare", y = "survived")
sibsp <- smbinning(titanic_imputed, x = "sibsp", y = "survived")
parch <- smbinning(titanic_imputed, x = "parch", y = "survived")

# Append the binned variables (prefixed with g1) to a copy of the data.
pop <- titanic_imputed
pop <- smbinning.factor.gen(pop, gender, "g1gender")
pop <- smbinning.gen(pop, age, "g1age")
pop <- smbinning.factor.gen(pop, smclass, "g1class")
pop <- smbinning.factor.gen(pop, embarked, "g1embarked")
pop <- smbinning.gen(pop, fare, "g1fare")
# The binning of sibsp reported "No significant splits", so no binned
# variable is generated for it and it is left out of the model.
# pop <- smbinning.gen(pop, sibsp, "g1sibsp")
pop <- smbinning.gen(pop, parch, "g1parch")

# Logistic regression on the binned predictors.
glm_model <- glm(
  survived ~ g1gender + g1age + g1class + g1embarked + g1fare + g1parch,
  data = pop,
  family = "binomial"
)

# Scale the fitted model and print the table of scaled weights and points.
smbscaled <- smbinning.scaling(glm_model)
smbscaled$logitscaled # Scaled model
## [[1]]
## Characteristic Attribute Coefficient Weight WeightScaled
## 1 (Intercept) 2.5469311 73.488898 0.00000
## 2 g1gender 01 = 'female' 0.0000000 0.000000 110.15029
## 3 g1gender 02 = 'male' -2.6096528 -75.298663 34.85163
## 4 g1age 01 <= 16 0.0000000 0.000000 110.15029
## 5 g1age 02 > 16 -0.9184289 -26.500255 83.65004
## 6 g1class 01 = '1st' 0.0000000 0.000000 110.15029
## 7 g1class 02 = '2nd' -0.7667706 -22.124324 88.02597
## 8 g1class 03 = '3rd' -1.7293549 -49.898635 60.25166
## 9 g1class 04 = 'deck crew' 1.4296702 41.251562 151.40186
## 10 g1class 05 = 'engineering crew' -0.5312537 -15.328742 94.82155
## 11 g1class 06 = 'restaurant staff' -2.6134967 -75.409576 34.74072
## 12 g1class 07 = 'victualling crew' -0.6785343 -19.578362 90.57193
## 13 g1embarked 01 = 'Belfast' 0.0000000 0.000000 110.15029
## 14 g1embarked 02 = 'Cherbourg' 0.8654760 24.972358 135.12265
## 15 g1embarked 03 = 'Queenstown' 0.4630442 13.360632 123.51093
## 16 g1embarked 04 = 'Southampton' 0.2758445 7.959191 118.10948
## 17 g1fare 01 <= 15.02 0.0000000 0.000000 110.15029
## 18 g1fare 02 <= 52 -0.1331362 -3.841498 106.30880
## 19 g1fare 03 > 52 0.1474763 4.255266 114.40556
## 20 g1parch 01 <= 0 0.0000000 0.000000 110.15029
## 21 g1parch 02 <= 1 0.3695985 10.664359 120.81465
## 22 g1parch 03 > 1 -0.4556803 -13.148154 97.00214
## Points
## 1 0
## 2 110
## 3 35
## 4 110
## 5 84
## 6 110
## 7 88
## 8 60
## 9 151
## 10 95
## 11 35
## 12 91
## 13 110
## 14 135
## 15 124
## 16 118
## 17 110
## 18 106
## 19 114
## 20 110
## 21 121
## 22 97
# Pair the model's predicted probabilities (computed on the data it was
# fitted to) with the observed outcome.
results <- data.frame(
  pred = predict(glm_model, type = "response"),
  survived = titanic_imputed$survived
)
# Plot AUC
smbinning.metrics(
  dataset = results,
  prediction = "pred",
  actualclass = "survived",
  report = 0,
  plot = "auc"
)
# Pair the model's predicted probabilities (computed on the data it was
# fitted to) with the observed outcome.
results <- data.frame(
  pred = predict(glm_model, type = "response"),
  survived = titanic_imputed$survived
)
# Plot KS
smbinning.metrics(
  dataset = results,
  prediction = "pred",
  actualclass = "survived",
  report = 0,
  plot = "ks"
)
# Pair the model's predicted probabilities (computed on the data it was
# fitted to) with the observed outcome.
results <- data.frame(
  pred = predict(glm_model, type = "response"),
  survived = titanic_imputed$survived
)
# Print the performance report and keep the returned metrics data frame
# (returndf = 1) for later plotting.
smbmetricsdf <- smbinning.metrics(
  dataset = results,
  prediction = "pred",
  actualclass = "survived",
  returndf = 1
)
##
## Overall Performance Metrics
## --------------------------------------------------
## KS : 0.4886 (Good)
## AUC : 0.8059 (Good)
##
## Classification Matrix
## --------------------------------------------------
## Cutoff (>=) : 0.2927 (Optimal)
## True Positives (TP) : 480
## False Positives (FP) : 279
## False Negatives (FN) : 231
## True Negatives (TN) : 1217
## Total Positives (P) : 711
## Total Negatives (N) : 1496
##
## Business/Performance Metrics
## --------------------------------------------------
## %Records>=Cutoff : 0.3439
## Good Rate : 0.6324 (Vs 0.3222 Overall)
## Bad Rate : 0.3676 (Vs 0.6778 Overall)
## Accuracy (ACC) : 0.7689
## Sensitivity (TPR) : 0.6751
## False Neg. Rate (FNR) : 0.3249
## False Pos. Rate (FPR) : 0.1865
## Specificity (TNR) : 0.8135
## Precision (PPV) : 0.6324
## False Discovery Rate : 0.3676
## False Omision Rate : 0.1595
## Inv. Precision (NPV) : 0.8405
##
## Note: 0 rows deleted due to missing data.
# Draw the "cmactual" plot from the metrics data frame computed above.
smbinning.metrics.plot(df = smbmetricsdf, plot = "cmactual")
# Record the R version and package versions used to produce this report.
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18362)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=Polish_Poland.1250 LC_CTYPE=Polish_Poland.1250
## [3] LC_MONETARY=Polish_Poland.1250 LC_NUMERIC=C
## [5] LC_TIME=Polish_Poland.1250
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] smbinning_0.9 Formula_1.2-3 partykit_1.2-7 mvtnorm_1.1-0 libcoin_1.0-5
## [6] sqldf_0.4-11 RSQLite_2.2.0 gsubfn_0.7 proto_1.0.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.4.6 knitr_1.28 magrittr_1.5 splines_3.6.1
## [5] bit_1.1-15.2 lattice_0.20-40 rlang_0.4.6 stringr_1.4.0
## [9] blob_1.2.1 tcltk_3.6.1 tools_3.6.1 xfun_0.12
## [13] DBI_1.1.0 htmltools_0.4.0 survival_3.1-11 yaml_2.2.1
## [17] bit64_0.9-7 digest_0.6.25 inum_1.0-1 Matrix_1.2-18
## [21] vctrs_0.3.1 rpart_4.1-15 memoise_1.1.0 evaluate_0.14
## [25] rmarkdown_2.1 stringi_1.4.6 compiler_3.6.1 chron_2.3-55
## [29] pkgconfig_2.0.3