This report aims to present the capabilities of the package smbinning.

The document is a part of the paper “Landscape of R packages for eXplainable Machine Learning” by S. Maksymiuk, A. Gosiewska, and P. Biecek (https://arxiv.org/abs/2009.13248). It contains a real-life use case built with the help of the titanic_imputed data set, described in the section “Example gallery for XAI packages” of the article.

We did our best to show the entire range of the implemented explanations. Please note that the examples may be incomplete. If you think something is missing, feel free to make a pull request at the GitHub repository MI2DataLab/XAI-tools.

The list of use-cases for all packages included in the article is here.

Load titanic_imputed data set.

# Load the titanic_imputed data set shipped with the DALEX package
# and preview its first six rows.
data("titanic_imputed", package = "DALEX")

head(titanic_imputed, n = 6L)
##   gender age class    embarked  fare sibsp parch survived
## 1   male  42   3rd Southampton  7.11     0     0        0
## 2   male  13   3rd Southampton 20.05     0     2        0
## 3   male  16   3rd Southampton 20.05     1     1        0
## 4 female  39   3rd Southampton 20.05     1     1        1
## 5 female  16   3rd Southampton  7.13     0     0        1
## 6   male  25   3rd Southampton  7.13     0     0        1
# Attach smbinning, which provides the binning, scaling and metrics
# functions used in the rest of this report.
library(smbinning)

# Fix the random seed so that repeated runs start from the same RNG state.
set.seed(123)

Fit a logistic regression model to the titanic_imputed data.

# Baseline logistic regression of survived on all raw predictors.
# Note: glm_model is reassigned further below with a model fitted on
# the binned variables.
glm_model <- glm(
  formula = survived ~ .,
  data = titanic_imputed,
  family = "binomial"
)

Model parts

Scaling coefficients of logistic regression

# Create one binning object per predictor, each against the binary
# target "survived". These objects are used afterwards to generate the
# binned variables.

# Categorical predictors
gender <- smbinning.factor(df = titanic_imputed, y = "survived", x = "gender")
smclass <- smbinning.factor(df = titanic_imputed, y = "survived", x = "class")
embarked <- smbinning.factor(df = titanic_imputed, y = "survived", x = "embarked")

# Numeric predictors
age <- smbinning(df = titanic_imputed, y = "survived", x = "age")
fare <- smbinning(df = titanic_imputed, y = "survived", x = "fare")
sibsp <- smbinning(df = titanic_imputed, y = "survived", x = "sibsp")
parch <- smbinning(df = titanic_imputed, y = "survived", x = "parch")

# Start from the original data and append one binned column per predictor.
pop <- titanic_imputed
pop <- smbinning.factor.gen(df = pop, ivout = gender, chrname = "g1gender")
pop <- smbinning.gen(df = pop, ivout = age, chrname = "g1age")
pop <- smbinning.factor.gen(df = pop, ivout = smclass, chrname = "g1class")
pop <- smbinning.factor.gen(df = pop, ivout = embarked, chrname = "g1embarked")
pop <- smbinning.gen(df = pop, ivout = fare, chrname = "g1fare")
# sibsp is skipped: its binning result was "No significant splits",
# so no binned variable is generated for it.
# pop <- smbinning.gen(df = pop, ivout = sibsp, chrname = "g1sibsp")
pop <- smbinning.gen(df = pop, ivout = parch, chrname = "g1parch")

# Refit the logistic regression using only the binned variables
# (this replaces the earlier glm_model).
glm_model <- glm(
  formula = survived ~ g1gender + g1age + g1class + g1embarked + g1fare +
    g1parch,
  data = pop,
  family = "binomial"
)

# Scale the coefficients of the binned logistic regression.
smbscaled <- smbinning.scaling(glm_model)

# Print the scaled model: coefficient, weight, scaled weight and points
# for every attribute of every characteristic.
smbscaled$logitscaled
## [[1]]
##    Characteristic               Attribute Coefficient     Weight WeightScaled
## 1     (Intercept)                           2.5469311  73.488898      0.00000
## 2        g1gender           01 = 'female'   0.0000000   0.000000    110.15029
## 3        g1gender             02 = 'male'  -2.6096528 -75.298663     34.85163
## 4           g1age                01 <= 16   0.0000000   0.000000    110.15029
## 5           g1age                 02 > 16  -0.9184289 -26.500255     83.65004
## 6         g1class              01 = '1st'   0.0000000   0.000000    110.15029
## 7         g1class              02 = '2nd'  -0.7667706 -22.124324     88.02597
## 8         g1class              03 = '3rd'  -1.7293549 -49.898635     60.25166
## 9         g1class        04 = 'deck crew'   1.4296702  41.251562    151.40186
## 10        g1class 05 = 'engineering crew'  -0.5312537 -15.328742     94.82155
## 11        g1class 06 = 'restaurant staff'  -2.6134967 -75.409576     34.74072
## 12        g1class 07 = 'victualling crew'  -0.6785343 -19.578362     90.57193
## 13     g1embarked          01 = 'Belfast'   0.0000000   0.000000    110.15029
## 14     g1embarked        02 = 'Cherbourg'   0.8654760  24.972358    135.12265
## 15     g1embarked       03 = 'Queenstown'   0.4630442  13.360632    123.51093
## 16     g1embarked      04 = 'Southampton'   0.2758445   7.959191    118.10948
## 17         g1fare             01 <= 15.02   0.0000000   0.000000    110.15029
## 18         g1fare                02 <= 52  -0.1331362  -3.841498    106.30880
## 19         g1fare                 03 > 52   0.1474763   4.255266    114.40556
## 20        g1parch                 01 <= 0   0.0000000   0.000000    110.15029
## 21        g1parch                 02 <= 1   0.3695985  10.664359    120.81465
## 22        g1parch                  03 > 1  -0.4556803 -13.148154     97.00214
##    Points
## 1       0
## 2     110
## 3      35
## 4     110
## 5      84
## 6     110
## 7      88
## 8      60
## 9     151
## 10     95
## 11     35
## 12     91
## 13    110
## 14    135
## 15    124
## 16    118
## 17    110
## 18    106
## 19    114
## 20    110
## 21    121
## 22     97

Model diagnostics

ROC

# Pair the fitted probabilities of glm_model with the observed outcome.
results <- data.frame(
  pred = predict(glm_model, type = "response"),
  survived = titanic_imputed[["survived"]]
)

# Plot the ROC curve / AUC; report = 0 suppresses the printed report.
smbinning.metrics(
  dataset = results,
  prediction = "pred",
  actualclass = "survived",
  report = 0,
  plot = "auc"
)

Cumulative distribution

# Pair the fitted probabilities of glm_model with the observed outcome.
results <- data.frame(
  pred = predict(glm_model, type = "response"),
  survived = titanic_imputed[["survived"]]
)

# Plot KS (cumulative distributions); report = 0 suppresses the printed report.
smbinning.metrics(
  dataset = results,
  prediction = "pred",
  actualclass = "survived",
  report = 0,
  plot = "ks"
)

Confusion matrix plot

# Pair the fitted probabilities of glm_model with the observed outcome.
results <- data.frame(
  pred = predict(glm_model, type = "response"),
  survived = titanic_imputed[["survived"]]
)

# Print the performance report and, via returndf = 1, keep the returned
# metrics data frame for plotting.
smbmetricsdf <- smbinning.metrics(
  dataset = results,
  prediction = "pred",
  actualclass = "survived",
  returndf = 1
)
## 
##   Overall Performance Metrics 
##   -------------------------------------------------- 
##                     KS : 0.4886 (Good)
##                    AUC : 0.8059 (Good)
## 
##   Classification Matrix 
##   -------------------------------------------------- 
##            Cutoff (>=) : 0.2927 (Optimal)
##    True Positives (TP) : 480
##   False Positives (FP) : 279
##   False Negatives (FN) : 231
##    True Negatives (TN) : 1217
##    Total Positives (P) : 711
##    Total Negatives (N) : 1496
## 
##   Business/Performance Metrics 
##   -------------------------------------------------- 
##       %Records>=Cutoff : 0.3439
##              Good Rate : 0.6324 (Vs 0.3222 Overall)
##               Bad Rate : 0.3676 (Vs 0.6778 Overall)
##         Accuracy (ACC) : 0.7689
##      Sensitivity (TPR) : 0.6751
##  False Neg. Rate (FNR) : 0.3249
##  False Pos. Rate (FPR) : 0.1865
##      Specificity (TNR) : 0.8135
##        Precision (PPV) : 0.6324
##   False Discovery Rate : 0.3676
##     False Omision Rate : 0.1595
##   Inv. Precision (NPV) : 0.8405
## 
##   Note: 0 rows deleted due to missing data.
# Draw the "cmactual" confusion-matrix plot from the metrics data frame.
smbinning.metrics.plot(df = smbmetricsdf, plot = "cmactual")

Session info

# Record the R version, platform, locale and package versions used to
# produce this report.
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18362)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Polish_Poland.1250  LC_CTYPE=Polish_Poland.1250   
## [3] LC_MONETARY=Polish_Poland.1250 LC_NUMERIC=C                  
## [5] LC_TIME=Polish_Poland.1250    
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] smbinning_0.9  Formula_1.2-3  partykit_1.2-7 mvtnorm_1.1-0  libcoin_1.0-5 
## [6] sqldf_0.4-11   RSQLite_2.2.0  gsubfn_0.7     proto_1.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.4.6    knitr_1.28      magrittr_1.5    splines_3.6.1  
##  [5] bit_1.1-15.2    lattice_0.20-40 rlang_0.4.6     stringr_1.4.0  
##  [9] blob_1.2.1      tcltk_3.6.1     tools_3.6.1     xfun_0.12      
## [13] DBI_1.1.0       htmltools_0.4.0 survival_3.1-11 yaml_2.2.1     
## [17] bit64_0.9-7     digest_0.6.25   inum_1.0-1      Matrix_1.2-18  
## [21] vctrs_0.3.1     rpart_4.1-15    memoise_1.1.0   evaluate_0.14  
## [25] rmarkdown_2.1   stringi_1.4.6   compiler_3.6.1  chron_2.3-55   
## [29] pkgconfig_2.0.3