library(ggplot2)
library(dplyr)
library("ggfortify")

Exemple

Nous traitons un problème de défaut bancaire. Nous cherchons à déterminer quels clients seront en défaut sur leur dette de carte de crédit (ici = 1 si le client fait défaut sur sa dette). La variable est la variable réponse.

Nous disposons d’un échantillon de taille \(10000\) et 3 variables explicativesLes variables explicatives sont les suivantes :

library(ISLR)
data(Default); attach(Default)
glimpse(Default)
## Observations: 10,000
## Variables: 4
## $ default <fctr> No, No, No, No, No, No, No, No, No, No, No, No, No, N...
## $ student <fctr> No, Yes, No, No, No, Yes, No, Yes, No, No, Yes, Yes, ...
## $ balance <dbl> 729.5265, 817.1804, 1073.5492, 529.2506, 785.6559, 919...
## $ income  <dbl> 44361.625, 12106.135, 31767.139, 35704.494, 38463.496,...
#résumé des données (un peu de stats descriptive)
summary(Default)
##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554

Default.logit=glm(default~balance + income + student,family=binomial(link="logit"),data=Default)
summary(Default.logit)
## 
## Call:
## glm(formula = default ~ balance + income + student, family = binomial(link = "logit"), 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4691  -0.1418  -0.0557  -0.0203   3.7383  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.087e+01  4.923e-01 -22.080  < 2e-16 ***
## balance      5.737e-03  2.319e-04  24.738  < 2e-16 ***
## income       3.033e-06  8.203e-06   0.370  0.71152    
## studentYes  -6.468e-01  2.363e-01  -2.738  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 8

Selection de modèles par selection de variables

# Sélection par AIC et backward
step(Default.logit,direction="backward")
## Start:  AIC=1579.54
## default ~ balance + income + student
## 
##           Df Deviance    AIC
## - income   1   1571.7 1577.7
## <none>         1571.5 1579.5
## - student  1   1579.0 1585.0
## - balance  1   2907.5 2913.5
## 
## Step:  AIC=1577.68
## default ~ balance + student
## 
##           Df Deviance    AIC
## <none>         1571.7 1577.7
## - student  1   1596.5 1600.5
## - balance  1   2908.7 2912.7
## 
## Call:  glm(formula = default ~ balance + student, family = binomial(link = "logit"), 
##     data = Default)
## 
## Coefficients:
## (Intercept)      balance   studentYes  
##  -10.749496     0.005738    -0.714878  
## 
## Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
## Null Deviance:       2921 
## Residual Deviance: 1572  AIC: 1578
Default.logit.choisi=glm(default~balance + student,family=binomial(link="logit"),data=Default)
summary(Default.logit.choisi)
## 
## Call:
## glm(formula = default ~ balance + student, family = binomial(link = "logit"), 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4578  -0.1422  -0.0559  -0.0203   3.7435  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.075e+01  3.692e-01 -29.116  < 2e-16 ***
## balance      5.738e-03  2.318e-04  24.750  < 2e-16 ***
## studentYes  -7.149e-01  1.475e-01  -4.846 1.26e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.7  on 9997  degrees of freedom
## AIC: 1577.7
## 
## Number of Fisher Scoring iterations: 8
#score
score.glm  =  predict.glm(Default.logit.choisi, data=Default, type="response")
#Matrice de confusion (au seuil fixé 0.5)
pred.glm  = as.numeric(score.glm >=0.5)
pred.glm = factor(pred.glm, levels = c(0,1), labels = c("No","Yes"))
table(Default$default,pred.glm)
##      pred.glm
##         No  Yes
##   No  9628   39
##   Yes  228  105
mean(Default$default!=pred.glm)
## [1] 0.0267
#Proportion de vrais positifs (au seuil fixé 0.5)
sum( score.glm >= 0.5 & Default$default=="Yes")/sum(Default$default=="Yes")
## [1] 0.3153153
#Proportion de vrais negatifs
sum( score.glm < 0.5 & Default$default=="No")/sum(Default$default=="No")
## [1] 0.9959657
#Proportion de faux poistifs
sum( score.glm >= 0.5 & Default$default=="No")/sum(Default$default=="No")
## [1] 0.004034344
#Taux de vrais positifs (TPR) et taux faux positifs (FPR) pour difeferents valeurs de s

#seuil s
s.glm=seq(0,1,.01)
#des fois c'est mieux avec les quantiles car plus des precision dans les rangs des scores
#s.glm=quantile(score.glm,probs=seq(0,1,.01))

#Initialisation :
absc.glm=numeric(length(s.glm));ordo.glm=numeric(length(s.glm))

for (i in 1:length(s.glm)){
  
  ordo.glm[i] = sum( score.glm>=s.glm[i] & Default$default=="Yes")/sum(Default$default=="Yes")
  
  absc.glm[i] = sum( score.glm>=s.glm[i] & Default$default=="No")/sum(Default$default=="No")
}
ROC = data.frame(FPR=absc.glm, TPR=ordo.glm)
#Courbe Roc : taux de vrais positifs (TPR) en fonction du taux de faux positifs (FPR)
ggplot(ROC,aes(x=FPR,y=TPR)) + geom_path(aes(),color="red")  +
   geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1), linetype = 2, color = "black")