3  Logistic Regression

3.1 Regresi Logistik Biner

3.1.1 Data

credit <- read.csv("Data/credit.csv")
head(credit[,1:5],10)
   creditability account.balance duration credit.amount saving.balance
1              1               1       18          1049              1
2              1               1        9          2799              1
3              1               2       12           841              2
4              1               1       12          2122              1
5              1               1       12          2171              1
6              1               1       10          2241              1
7              1               1        8          3398              1
8              1               1        6          1361              1
9              1               4       18          1098              1
10             1               2       24          3758              3
str(credit)
'data.frame':   1000 obs. of  14 variables:
 $ creditability   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ account.balance : int  1 1 2 1 1 1 1 1 4 2 ...
 $ duration        : int  18 9 12 12 12 10 8 6 18 24 ...
 $ credit.amount   : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
 $ saving.balance  : int  1 1 2 1 1 1 1 1 1 3 ...
 $ employment.year : int  2 3 4 3 3 2 4 2 1 1 ...
 $ installment.rate: int  4 2 2 3 4 1 1 2 4 1 ...
 $ marital.status  : int  2 3 2 3 3 3 3 3 2 2 ...
 $ duration.address: int  4 2 4 2 4 3 4 4 4 4 ...
 $ age             : int  21 36 23 39 38 48 39 40 65 23 ...
 $ dependents      : int  1 2 1 2 1 2 1 2 1 1 ...
 $ number.of.credit: int  1 2 1 2 2 2 2 1 2 1 ...
 $ occupation      : int  3 3 2 2 2 2 2 2 1 1 ...
 $ previous.credit : int  4 4 2 4 4 4 4 4 4 2 ...
library(dplyr)
credit <- credit %>% mutate(across(-c(duration,
                            credit.amount,
                            age),as.factor))
str(credit)
'data.frame':   1000 obs. of  14 variables:
 $ creditability   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
 $ account.balance : Factor w/ 4 levels "1","2","3","4": 1 1 2 1 1 1 1 1 4 2 ...
 $ duration        : int  18 9 12 12 12 10 8 6 18 24 ...
 $ credit.amount   : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
 $ saving.balance  : Factor w/ 5 levels "1","2","3","4",..: 1 1 2 1 1 1 1 1 1 3 ...
 $ employment.year : Factor w/ 5 levels "1","2","3","4",..: 2 3 4 3 3 2 4 2 1 1 ...
 $ installment.rate: Factor w/ 4 levels "1","2","3","4": 4 2 2 3 4 1 1 2 4 1 ...
 $ marital.status  : Factor w/ 4 levels "1","2","3","4": 2 3 2 3 3 3 3 3 2 2 ...
 $ duration.address: Factor w/ 4 levels "1","2","3","4": 4 2 4 2 4 3 4 4 4 4 ...
 $ age             : int  21 36 23 39 38 48 39 40 65 23 ...
 $ dependents      : Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 1 ...
 $ number.of.credit: Factor w/ 4 levels "1","2","3","4": 1 2 1 2 2 2 2 1 2 1 ...
 $ occupation      : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 2 2 2 2 1 1 ...
 $ previous.credit : Factor w/ 5 levels "0","1","2","3",..: 5 5 3 5 5 5 5 5 5 3 ...

3.1.2 Pemodelan

logreg1 <- glm(creditability~.,data=credit,family = "binomial")
summary(logreg1)

Call:
glm(formula = creditability ~ ., family = "binomial", data = credit)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        2.990e-01  8.942e-01   0.334 0.738097    
account.balance2   4.346e-01  2.013e-01   2.159 0.030852 *  
account.balance3   9.490e-01  3.602e-01   2.635 0.008421 ** 
account.balance4   1.804e+00  2.222e-01   8.119 4.69e-16 ***
duration          -2.705e-02  8.818e-03  -3.068 0.002156 ** 
credit.amount     -1.025e-04  4.161e-05  -2.465 0.013718 *  
saving.balance2    1.293e-01  2.701e-01   0.479 0.632222    
saving.balance3    4.144e-01  3.987e-01   1.039 0.298644    
saving.balance4    1.241e+00  5.032e-01   2.467 0.013629 *  
saving.balance5    8.811e-01  2.463e-01   3.577 0.000347 ***
employment.year2  -1.432e-01  4.109e-01  -0.348 0.727561    
employment.year3   2.530e-01  3.957e-01   0.639 0.522582    
employment.year4   7.646e-01  4.258e-01   1.796 0.072572 .  
employment.year5   2.386e-01  3.962e-01   0.602 0.547012    
installment.rate2 -2.841e-01  2.953e-01  -0.962 0.336089    
installment.rate3 -5.122e-01  3.217e-01  -1.592 0.111374    
installment.rate4 -9.279e-01  2.872e-01  -3.230 0.001236 ** 
marital.status2    1.744e-01  3.742e-01   0.466 0.641255    
marital.status3    7.482e-01  3.670e-01   2.039 0.041468 *  
marital.status4    5.577e-01  4.371e-01   1.276 0.201928    
duration.address2 -7.104e-01  2.832e-01  -2.509 0.012122 *  
duration.address3 -5.443e-01  3.163e-01  -1.721 0.085314 .  
duration.address4 -4.386e-01  2.762e-01  -1.588 0.112244    
age                1.125e-02  8.468e-03   1.329 0.183990    
dependents2       -2.607e-01  2.387e-01  -1.092 0.274669    
number.of.credit2 -4.177e-01  2.315e-01  -1.805 0.071133 .  
number.of.credit3 -4.131e-01  5.951e-01  -0.694 0.487625    
number.of.credit4 -4.589e-01  9.908e-01  -0.463 0.643240    
occupation2       -8.953e-02  6.276e-01  -0.143 0.886557    
occupation3       -1.487e-01  6.048e-01  -0.246 0.805804    
occupation4        1.276e-02  6.087e-01   0.021 0.983277    
previous.credit1  -3.136e-01  5.178e-01  -0.606 0.544686    
previous.credit2   6.063e-01  4.149e-01   1.461 0.143896    
previous.credit3   8.090e-01  4.531e-01   1.785 0.074205 .  
previous.credit4   1.511e+00  4.169e-01   3.625 0.000288 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1221.7  on 999  degrees of freedom
Residual deviance:  956.0  on 965  degrees of freedom
AIC: 1026

Number of Fisher Scoring iterations: 5

3.1.3 Odds Ratio

beta = round(coef(logreg1),2)
OR = round(exp(beta),2)
cbind(beta, OR)
                   beta   OR
(Intercept)        0.30 1.35
account.balance2   0.43 1.54
account.balance3   0.95 2.59
account.balance4   1.80 6.05
duration          -0.03 0.97
credit.amount      0.00 1.00
saving.balance2    0.13 1.14
saving.balance3    0.41 1.51
saving.balance4    1.24 3.46
saving.balance5    0.88 2.41
employment.year2  -0.14 0.87
employment.year3   0.25 1.28
employment.year4   0.76 2.14
employment.year5   0.24 1.27
installment.rate2 -0.28 0.76
installment.rate3 -0.51 0.60
installment.rate4 -0.93 0.39
marital.status2    0.17 1.19
marital.status3    0.75 2.12
marital.status4    0.56 1.75
duration.address2 -0.71 0.49
duration.address3 -0.54 0.58
duration.address4 -0.44 0.64
age                0.01 1.01
dependents2       -0.26 0.77
number.of.credit2 -0.42 0.66
number.of.credit3 -0.41 0.66
number.of.credit4 -0.46 0.63
occupation2       -0.09 0.91
occupation3       -0.15 0.86
occupation4        0.01 1.01
previous.credit1  -0.31 0.73
previous.credit2   0.61 1.84
previous.credit3   0.81 2.25
previous.credit4   1.51 4.53

3.1.4 Multikolineratitas

library(car)
vif(logreg1)
                     GVIF Df GVIF^(1/(2*Df))
account.balance  1.283532  3        1.042480
duration         1.828834  1        1.352344
credit.amount    2.284117  1        1.511330
saving.balance   1.286469  4        1.031989
employment.year  2.406179  4        1.116005
installment.rate 1.443706  3        1.063114
marital.status   1.439516  3        1.062599
duration.address 1.502426  3        1.070201
age              1.365556  1        1.168570
dependents       1.177252  1        1.085012
number.of.credit 2.060162  3        1.128020
occupation       1.893863  3        1.112307
previous.credit  2.136438  4        1.099541

3.1.5 Akurasi

pred_clas <- ifelse(logreg1$fitted.values > 0.5, 1, 0)
conf_matrix <- table(credit$creditability, pred_clas)
conf_matrix
   pred_clas
      0   1
  0 145 155
  1  68 632
paste0("Akurasi Model:")
[1] "Akurasi Model:"
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
accuracy
[1] 0.777

3.1.6 Kebaikan Model

#install.packages("performance")
library(performance)
#Outliers
performance::check_outliers(logreg1)
OK: No outliers detected.
- Based on the following method and threshold: cook (0.8).
- For variable: (Whole model)
#Metrik
performance(logreg1)
# Indices of model performance

AIC      |     AICc |      BIC | Tjur's R2 |  RMSE | Sigma | Log_loss
---------------------------------------------------------------------
1025.995 | 1028.609 | 1197.767 |     0.254 | 0.395 | 1.000 |    0.478

AIC      | Score_log | Score_spherical |   PCP
----------------------------------------------
1025.995 |      -Inf |           0.001 | 0.687
#Goodness Of Fit
performance_hosmer(logreg1)
# Hosmer-Lemeshow Goodness-of-Fit Test

  Chi-squared: 8.472
           df: 8    
      p-value: 0.389

3.2 Regresi Logistik Nominal atau Multinominal

3.2.1 Data

library(readxl)
students <- read_excel("Data/students.xlsx")
head(students,10)
# A tibble: 10 × 6
   gender ses    prog      read write  math
   <chr>  <chr>  <chr>    <dbl> <dbl> <dbl>
 1 female low    vocation    34    35    41
 2 male   middle general     34    33    41
 3 male   high   vocation    39    39    44
 4 male   low    vocation    37    37    42
 5 male   middle vocation    39    31    40
 6 female high   general     42    36    42
 7 male   middle vocation    31    36    46
 8 male   middle vocation    50    31    40
 9 female middle vocation    39    41    33
10 male   middle vocation    34    37    46
str(students)
tibble [200 × 6] (S3: tbl_df/tbl/data.frame)
 $ gender: chr [1:200] "female" "male" "male" "male" ...
 $ ses   : chr [1:200] "low" "middle" "high" "low" ...
 $ prog  : chr [1:200] "vocation" "general" "vocation" "vocation" ...
 $ read  : num [1:200] 34 34 39 37 39 42 31 50 39 34 ...
 $ write : num [1:200] 35 33 39 37 31 36 36 31 41 37 ...
 $ math  : num [1:200] 41 41 44 42 40 42 46 40 33 46 ...

3.2.2 Ubah jadi faktor

library(dplyr)
students <- students %>% mutate(across(-c(read,write,math),as.factor))
students$prog2 <- relevel(students$prog, ref = "academic")
str(students)
tibble [200 × 7] (S3: tbl_df/tbl/data.frame)
 $ gender: Factor w/ 2 levels "female","male": 1 2 2 2 2 1 2 2 1 2 ...
 $ ses   : Factor w/ 3 levels "high","low","middle": 2 3 1 2 3 1 3 3 3 3 ...
 $ prog  : Factor w/ 3 levels "academic","general",..: 3 2 3 3 3 2 3 3 3 3 ...
 $ read  : num [1:200] 34 34 39 37 39 42 31 50 39 34 ...
 $ write : num [1:200] 35 33 39 37 31 36 36 31 41 37 ...
 $ math  : num [1:200] 41 41 44 42 40 42 46 40 33 46 ...
 $ prog2 : Factor w/ 3 levels "academic","general",..: 3 2 3 3 3 2 3 3 3 3 ...
table(students$ses, students$prog)
        
         academic general vocation
  high         42       9        7
  low          19      16       12
  middle       44      20       31
table(students$gender, students$prog)
        
         academic general vocation
  female       58      24       27
  male         47      21       23

3.2.3 Pemodelan

#install.packages("nnet")
library(nnet)
logmultinom <- multinom(prog2 ~ ses + gender + write + read, data = students)
# weights:  21 (12 variable)
initial  value 219.722458 
iter  10 value 176.754587
final  value 174.725397 
converged
summary(logmultinom)
Call:
multinom(formula = prog2 ~ ses + gender + write + read, data = students)

Coefficients:
         (Intercept)    seslow sesmiddle gendermale       write        read
general     2.621831 1.0038426 0.5651588  0.1273914 -0.02860308 -0.04730781
vocation    6.505182 0.6239396 1.1539447 -0.3105237 -0.08243508 -0.07108839

Std. Errors:
         (Intercept)    seslow sesmiddle gendermale      write       read
general     1.434514 0.5323398 0.4713812  0.4137756 0.02686316 0.02480868
vocation    1.524572 0.6200276 0.5231819  0.4414783 0.02793343 0.02752520

Residual Deviance: 349.4508 
AIC: 373.4508 
z <- summary(logmultinom)$coefficients/summary(logmultinom)$standard.errors
# 2-tailed z test
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p
          (Intercept)     seslow  sesmiddle gendermale       write        read
general  6.759775e-02 0.05933302 0.23055043  0.7581770 0.286980200 0.056532815
vocation 1.982164e-05 0.31426675 0.02741006  0.4818237 0.003166173 0.009804037

3.2.4 Odds Ratio

exp(coef(logmultinom))
         (Intercept)   seslow sesmiddle gendermale     write      read
general      13.7609 2.728747  1.759727   1.135862 0.9718021 0.9537938
vocation    668.5973 1.866266  3.170676   0.733063 0.9208712 0.9313796

3.2.5 Multikolineratitas

library(car)
vif(logmultinom)
            GVIF Df GVIF^(1/(2*Df))
ses     6.640420  2        1.605273
gender  2.650955  1        1.628175
write  66.396002  1        8.148374
read   53.940932  1        7.344449

3.2.6 Akurasi

df <- students[,c("ses","gender","write","read")]
#install.packages("caret")
library(caret)
prediksi <- predict(logmultinom, df, type = "class")
confusionMatrix(as.factor(prediksi), 
                students$prog2)
Confusion Matrix and Statistics

          Reference
Prediction academic general vocation
  academic       90      25       21
  general         3       7        4
  vocation       12      13       25

Overall Statistics
                                         
               Accuracy : 0.61           
                 95% CI : (0.5387, 0.678)
    No Information Rate : 0.525          
    P-Value [Acc > NIR] : 0.009485       
                                         
                  Kappa : 0.3094         
                                         
 Mcnemar's Test P-Value : 1.959e-05      

Statistics by Class:

                     Class: academic Class: general Class: vocation
Sensitivity                   0.8571         0.1556          0.5000
Specificity                   0.5158         0.9548          0.8333
Pos Pred Value                0.6618         0.5000          0.5000
Neg Pred Value                0.7656         0.7957          0.8333
Prevalence                    0.5250         0.2250          0.2500
Detection Rate                0.4500         0.0350          0.1250
Detection Prevalence          0.6800         0.0700          0.2500
Balanced Accuracy             0.6865         0.5552          0.6667

3.2.7 Kebaikan Model

logmultinom0 <- multinom(prog2 ~ 1, data = students)
# weights:  6 (2 variable)
initial  value 219.722458 
final  value 204.096674 
converged
#install.packages("lmtest")
library(lmtest)
lrtest(logmultinom0,logmultinom)
Likelihood ratio test

Model 1: prog2 ~ 1
Model 2: prog2 ~ ses + gender + write + read
  #Df  LogLik Df  Chisq Pr(>Chisq)    
1   2 -204.10                         
2  12 -174.72 10 58.743  6.263e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.3 Regresi Logistik Ordinal

3.3.1 Data

crash <- read.csv("Data/crash.csv")
head(crash,10)
   Gender Location SeatBelt Respon
1  Female    Urban      Yes      1
2    Male    Urban      Yes      1
3    Male    Urban       No      1
4  Female    Urban       No      1
5    Male    Rural      Yes      1
6  Female    Rural      Yes      1
7    Male    Rural       No      1
8  Female    Rural       No      1
9  Female    Urban       No      3
10 Female    Rural       No      3
library(dplyr)
crash <- crash %>% mutate(across(-c(Respon),as.factor))
str(crash)
'data.frame':   80 obs. of  4 variables:
 $ Gender  : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 1 2 1 1 1 ...
 $ Location: Factor w/ 2 levels "Rural","Urban": 2 2 2 2 1 1 1 1 2 1 ...
 $ SeatBelt: Factor w/ 2 levels "No","Yes": 2 2 1 1 2 2 1 1 1 1 ...
 $ Respon  : int  1 1 1 1 1 1 1 1 3 3 ...
crash$Respon <- ordered(crash$Respon, levels=c("1","2","3","4","5"))
str(crash)
'data.frame':   80 obs. of  4 variables:
 $ Gender  : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 1 2 1 1 1 ...
 $ Location: Factor w/ 2 levels "Rural","Urban": 2 2 2 2 1 1 1 1 2 1 ...
 $ SeatBelt: Factor w/ 2 levels "No","Yes": 2 2 1 1 2 2 1 1 1 1 ...
 $ Respon  : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 1 1 1 1 1 1 1 1 3 3 ...

3.3.2 Pemodelan

#install.packages("MASS")
library(MASS)
orderlog <- polr(Respon~., method='logistic',data=crash)
summary(orderlog)
Call:
polr(formula = Respon ~ ., data = crash, method = "logistic")

Coefficients:
                 Value Std. Error t value
GenderMale    -0.05369     0.3974 -0.1351
LocationUrban  0.05661     0.3958  0.1430
SeatBeltYes   -0.31102     0.3974 -0.7827

Intercepts:
    Value   Std. Error t value
1|2 -1.5425  0.4450    -3.4664
2|3 -0.5523  0.4060    -1.3603
3|4  0.2649  0.3966     0.6678
4|5  1.2472  0.4264     2.9249

Residual Deviance: 256.8444 
AIC: 270.8444 

3.3.3 Odds Ratio

koefisien<-coef(summary(orderlog)) 
exp(koefisien[,1])
   GenderMale LocationUrban   SeatBeltYes           1|2           2|3 
    0.9477303     1.0582414     0.7327004     0.2138542     0.5756362 
          3|4           4|5 
    1.3032362     3.4805710 
# menghitung pvalue
p <- pnorm(abs(koefisien[,"t value"]), lower.tail = FALSE)*2
(ctabel<-cbind(round(koefisien,2), "pvalue"=round(p,3))) 
              Value Std. Error t value pvalue
GenderMale    -0.05       0.40   -0.14  0.893
LocationUrban  0.06       0.40    0.14  0.886
SeatBeltYes   -0.31       0.40   -0.78  0.434
1|2           -1.54       0.44   -3.47  0.001
2|3           -0.55       0.41   -1.36  0.174
3|4            0.26       0.40    0.67  0.504
4|5            1.25       0.43    2.92  0.003

3.3.4 Multikolineratitas

library(car)
vif(orderlog)
  Gender Location SeatBelt 
1.002035 1.001265 1.001814 

3.3.5 Akurasi

df <- crash[,1:3]
prediksi <- predict(orderlog, df, type = "class")
confusionMatrix(as.factor(prediksi), 
                crash$Respon)
Confusion Matrix and Statistics

          Reference
Prediction  1  2  3  4  5
         1 10  8  7  7  8
         2  0  0  0  0  0
         3  0  0  0  0  0
         4  0  0  0  0  0
         5  6  8  9  9  8

Overall Statistics
                                          
               Accuracy : 0.225           
                 95% CI : (0.1391, 0.3321)
    No Information Rate : 0.2             
    P-Value [Acc > NIR] : 0.3292          
                                          
                  Kappa : 0.0312          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
Sensitivity            0.6250      0.0      0.0      0.0      0.5
Specificity            0.5312      1.0      1.0      1.0      0.5
Pos Pred Value         0.2500      NaN      NaN      NaN      0.2
Neg Pred Value         0.8500      0.8      0.8      0.8      0.8
Prevalence             0.2000      0.2      0.2      0.2      0.2
Detection Rate         0.1250      0.0      0.0      0.0      0.1
Detection Prevalence   0.5000      0.0      0.0      0.0      0.5
Balanced Accuracy      0.5781      0.5      0.5      0.5      0.5

3.3.6 Kebaikan Model

orderlog0 <-polr(Respon~1, method = "logistic", data = crash)
#install.packages("lmtest")
library(lmtest)
lrtest(orderlog0,orderlog)
Likelihood ratio test

Model 1: Respon ~ 1
Model 2: Respon ~ Gender + Location + SeatBelt
  #Df  LogLik Df  Chisq Pr(>Chisq)
1   4 -128.75                     
2   7 -128.42  3 0.6657     0.8813