Regréssion linéaire et selection de modèles

Exercice : (Criminologues américains)

Data : données USCrime (voir énoncé du TP4 partie II)

Nous souhaitons expliquer le taux de de criminalité en fonction d’autres variables.

UScrime<-read.table("UScrime.txt",skip=37,header=T)
attach(UScrime)
str(UScrime)

## 'data.frame':    47 obs. of  13 variables:
##  $ R  : num  79.1 163.5 57.8 196.9 123.4 ...
##  $ Age: int  151 143 142 136 141 121 127 131 157 140 ...
##  $ Ed : int  91 113 89 121 121 110 111 109 90 118 ...
##  $ Ex0: int  58 103 45 149 109 118 82 115 65 71 ...
##  $ Ex1: int  56 95 44 141 101 115 79 109 62 68 ...
##  $ LF : int  510 583 533 577 591 547 519 542 553 632 ...
##  $ M  : int  950 1012 969 994 985 964 982 969 955 1029 ...
##  $ N  : int  33 13 18 157 18 25 4 50 39 7 ...
##  $ NW : int  301 102 219 80 30 44 139 179 286 15 ...
##  $ U1 : int  108 96 94 102 91 84 97 79 81 100 ...
##  $ U2 : int  41 36 33 39 20 29 38 35 28 24 ...
##  $ W  : int  394 557 318 673 578 689 620 472 421 526 ...
##  $ X  : int  261 194 250 167 174 126 168 206 239 174 ...

A vous de jouer !

Méthodes AIC, BIC

 selection=leaps::regsubsets(R~.,data=UScrime,
                             nvmax=12,method = "forward")
#summary(selection)
plot(selection,scale="bic")

#help(stepAIC)
reg<-lm(R~.,data=UScrime)
selection=MASS::stepAIC(reg,direction="backward")

## Start:  AIC=300.1
## R ~ Age + Ed + Ex0 + Ex1 + LF + M + N + NW + U1 + U2 + W + X
## 
##        Df Sum of Sq   RSS    AIC
## - LF    1       0.1 16028 298.10
## - NW    1      14.7 16043 298.14
## - N     1      36.8 16065 298.21
## - Ex1   1     147.5 16176 298.53
## - M     1     225.9 16254 298.76
## <none>              16028 300.10
## - W     1     715.3 16743 300.15
## - U1    1     763.5 16792 300.29
## - Ex0   1    1104.7 17133 301.23
## - U2    1    1970.2 17998 303.55
## - Age   1    2884.6 18913 305.88
## - Ed    1    3600.8 19629 307.63
## - X     1    5548.4 21577 312.07
## 
## Step:  AIC=298.1
## R ~ Age + Ed + Ex0 + Ex1 + M + N + NW + U1 + U2 + W + X
## 
##        Df Sum of Sq   RSS    AIC
## - NW    1      14.6 16043 296.14
## - N     1      37.6 16066 296.21
## - Ex1   1     162.6 16191 296.58
## - M     1     307.1 16335 296.99
## <none>              16028 298.10
## - W     1     719.3 16748 298.17
## - U1    1     850.8 16879 298.53
## - Ex0   1    1169.0 17197 299.41
## - U2    1    1975.6 18004 301.56
## - Age   1    2987.1 19015 304.13
## - Ed    1    4046.2 20074 306.68
## - X     1    5554.9 21583 310.09
## 
## Step:  AIC=296.14
## R ~ Age + Ed + Ex0 + Ex1 + M + N + U1 + U2 + W + X
## 
##        Df Sum of Sq   RSS    AIC
## - N     1      37.2 16080 294.25
## - Ex1   1     185.5 16228 294.69
## - M     1     332.9 16376 295.11
## <none>              16043 296.14
## - W     1     769.7 16813 296.35
## - U1    1     843.0 16886 296.55
## - Ex0   1    1199.0 17242 297.53
## - U2    1    1961.0 18004 299.56
## - Age   1    3295.0 19338 302.93
## - Ed    1    4159.2 20202 304.98
## - X     1    5822.2 21865 308.70
## 
## Step:  AIC=294.25
## R ~ Age + Ed + Ex0 + Ex1 + M + U1 + U2 + W + X
## 
##        Df Sum of Sq   RSS    AIC
## - Ex1   1     171.5 16252 292.75
## - M     1     563.4 16643 293.87
## <none>              16080 294.25
## - W     1     734.7 16815 294.35
## - U1    1     906.0 16986 294.83
## - Ex0   1    1162.0 17242 295.53
## - U2    1    1978.0 18058 297.71
## - Age   1    3354.5 19434 301.16
## - Ed    1    4139.1 20219 303.02
## - X     1    6094.8 22175 307.36
## 
## Step:  AIC=292.75
## R ~ Age + Ed + Ex0 + M + U1 + U2 + W + X
## 
##        Df Sum of Sq   RSS    AIC
## - M     1     691.0 16943 292.71
## <none>              16252 292.75
## - W     1     759.0 17011 292.90
## - U1    1     921.8 17173 293.35
## - U2    1    2018.1 18270 296.25
## - Age   1    3323.1 19575 299.50
## - Ed    1    4005.1 20257 301.11
## - X     1    6402.7 22654 306.36
## - Ex0   1   11818.8 28070 316.44
## 
## Step:  AIC=292.71
## R ~ Age + Ed + Ex0 + U1 + U2 + W + X
## 
##        Df Sum of Sq   RSS    AIC
## - U1    1     408.6 17351 291.83
## <none>              16943 292.71
## - W     1    1016.9 17959 293.45
## - U2    1    1548.6 18491 294.82
## - Age   1    4511.6 21454 301.81
## - Ed    1    6430.6 23373 305.83
## - X     1    8147.7 25090 309.16
## - Ex0   1   12019.6 28962 315.91
## 
## Step:  AIC=291.83
## R ~ Age + Ed + Ex0 + U2 + W + X
## 
##        Df Sum of Sq   RSS    AIC
## <none>              17351 291.83
## - W     1    1252.6 18604 293.11
## - U2    1    1628.7 18980 294.05
## - Age   1    4461.0 21812 300.58
## - Ed    1    6214.7 23566 304.22
## - X     1    8932.3 26283 309.35
## - Ex0   1   15596.5 32948 319.97

summary(selection)

## 
## Call:
## lm(formula = R ~ Age + Ed + Ex0 + U2 + W + X, data = UScrime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.306 -10.209  -1.313   9.919  54.544 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -618.5028   108.2456  -5.714 1.19e-06 ***
## Age            1.1252     0.3509   3.207 0.002640 ** 
## Ed             1.8179     0.4803   3.785 0.000505 ***
## Ex0            1.0507     0.1752   5.996 4.78e-07 ***
## U2             0.8282     0.4274   1.938 0.059743 .  
## W              0.1596     0.0939   1.699 0.097028 .  
## X              0.8236     0.1815   4.538 5.10e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.83 on 40 degrees of freedom
## Multiple R-squared:  0.7478, Adjusted R-squared:   0.71 
## F-statistic: 19.77 on 6 and 40 DF,  p-value: 1.441e-10

K-Fold Cross Validation (Validation croisée)

library(tidyverse)

## -- Attaching packages ----------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0

## -- Conflicts -------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

V <- 10
T <- 2
set.seed(42)
Folds <- caret::createMultiFolds(UScrime$R, k = V, times = T)

errCV <- matrix(nrow = 2, ncol = (T*V))
for (v in 1: (T*V)) {
  UScrimetrain <- UScrime[Folds[[v]],]
  UScrimetest <- UScrime[-Folds[[v]],]
  
  regtmp1 <- lm(R ~ Age + Ed + Ex0 + U2 + W + X, data = UScrimetrain)
  predtmp1 <- predict(regtmp1, newdata = UScrimetest)
  errCV[1,v] <- mean((UScrimetest$R-predtmp1)^2)
    
  regtmp2 <- lm(R ~ Age + Ed + Ex0 + U2 + X, data = UScrimetrain)
  predtmp2 <- predict(regtmp2, newdata = UScrimetest)
  errCV[2,v] <- mean((UScrimetest$R-predtmp2)^2)
}

errCV

##          [,1]     [,2]      [,3]     [,4]     [,5]     [,6]     [,7]
## [1,] 512.2084 1630.567 115.50292 72.20725 766.6511 519.7437 745.6542
## [2,] 654.5690 1742.317  92.91483 37.17599 539.4432 333.7470 864.1295
##          [,8]     [,9]    [,10]    [,11]     [,12]    [,13]    [,14]
## [1,] 464.9904 489.0253 216.9277  98.6023  954.2065 220.7154 193.5853
## [2,] 126.4217 237.9475 289.9710 145.8506 1126.1625 136.4183 207.1110
##         [,15]    [,16]    [,17]    [,18]    [,19]    [,20]
## [1,] 561.3703 790.7243 1063.628 250.5367 1263.181 363.5801
## [2,] 478.6685 182.0301 1046.730 240.1709 1414.138 486.2783

apply(sqrt(errCV),1,mean)

## [1] 22.14550 20.61174

set.seed(42)
trCtrl <- trainControl(method = "repeatedcv", number = V, repeats = T)
reg1 <- train(R ~ Age + Ed + Ex0 + U2 + W + X, data = UScrime, method = "lm",
              trControl = trCtrl)
reg1

## Linear Regression 
## 
## 47 samples
##  6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times) 
## Summary of sample sizes: 44, 42, 43, 41, 43, 42, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   22.83478  0.5759024  18.66797
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

reg2 <- train(R ~ Age + Ed + Ex0 + U2 + X, data = UScrime, method = "lm",
              trControl = trCtrl)
reg2

## Linear Regression 
## 
## 47 samples
##  5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times) 
## Summary of sample sizes: 42, 43, 43, 42, 42, 42, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   21.85104  0.7067288  17.09421
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Regréssion linéaire et selection de modèles

Ana Fermin

Méthodes AIC, BIC

K-Fold Cross Validation (Validation croisée)