Exercice : (Criminologues américains)
Nous souhaitons expliquer le taux de de criminalité en fonction d’autres variables.
UScrime<-read.table("UScrime.txt",skip=37,header=T)
attach(UScrime)
str(UScrime)
## 'data.frame': 47 obs. of 13 variables:
## $ R : num 79.1 163.5 57.8 196.9 123.4 ...
## $ Age: int 151 143 142 136 141 121 127 131 157 140 ...
## $ Ed : int 91 113 89 121 121 110 111 109 90 118 ...
## $ Ex0: int 58 103 45 149 109 118 82 115 65 71 ...
## $ Ex1: int 56 95 44 141 101 115 79 109 62 68 ...
## $ LF : int 510 583 533 577 591 547 519 542 553 632 ...
## $ M : int 950 1012 969 994 985 964 982 969 955 1029 ...
## $ N : int 33 13 18 157 18 25 4 50 39 7 ...
## $ NW : int 301 102 219 80 30 44 139 179 286 15 ...
## $ U1 : int 108 96 94 102 91 84 97 79 81 100 ...
## $ U2 : int 41 36 33 39 20 29 38 35 28 24 ...
## $ W : int 394 557 318 673 578 689 620 472 421 526 ...
## $ X : int 261 194 250 167 174 126 168 206 239 174 ...
A vous de jouer !
selection=leaps::regsubsets(R~.,data=UScrime,
nvmax=12,method = "forward")
#summary(selection)
plot(selection,scale="bic")
#help(stepAIC)
reg<-lm(R~.,data=UScrime)
selection=MASS::stepAIC(reg,direction="backward")
## Start: AIC=300.1
## R ~ Age + Ed + Ex0 + Ex1 + LF + M + N + NW + U1 + U2 + W + X
##
## Df Sum of Sq RSS AIC
## - LF 1 0.1 16028 298.10
## - NW 1 14.7 16043 298.14
## - N 1 36.8 16065 298.21
## - Ex1 1 147.5 16176 298.53
## - M 1 225.9 16254 298.76
## <none> 16028 300.10
## - W 1 715.3 16743 300.15
## - U1 1 763.5 16792 300.29
## - Ex0 1 1104.7 17133 301.23
## - U2 1 1970.2 17998 303.55
## - Age 1 2884.6 18913 305.88
## - Ed 1 3600.8 19629 307.63
## - X 1 5548.4 21577 312.07
##
## Step: AIC=298.1
## R ~ Age + Ed + Ex0 + Ex1 + M + N + NW + U1 + U2 + W + X
##
## Df Sum of Sq RSS AIC
## - NW 1 14.6 16043 296.14
## - N 1 37.6 16066 296.21
## - Ex1 1 162.6 16191 296.58
## - M 1 307.1 16335 296.99
## <none> 16028 298.10
## - W 1 719.3 16748 298.17
## - U1 1 850.8 16879 298.53
## - Ex0 1 1169.0 17197 299.41
## - U2 1 1975.6 18004 301.56
## - Age 1 2987.1 19015 304.13
## - Ed 1 4046.2 20074 306.68
## - X 1 5554.9 21583 310.09
##
## Step: AIC=296.14
## R ~ Age + Ed + Ex0 + Ex1 + M + N + U1 + U2 + W + X
##
## Df Sum of Sq RSS AIC
## - N 1 37.2 16080 294.25
## - Ex1 1 185.5 16228 294.69
## - M 1 332.9 16376 295.11
## <none> 16043 296.14
## - W 1 769.7 16813 296.35
## - U1 1 843.0 16886 296.55
## - Ex0 1 1199.0 17242 297.53
## - U2 1 1961.0 18004 299.56
## - Age 1 3295.0 19338 302.93
## - Ed 1 4159.2 20202 304.98
## - X 1 5822.2 21865 308.70
##
## Step: AIC=294.25
## R ~ Age + Ed + Ex0 + Ex1 + M + U1 + U2 + W + X
##
## Df Sum of Sq RSS AIC
## - Ex1 1 171.5 16252 292.75
## - M 1 563.4 16643 293.87
## <none> 16080 294.25
## - W 1 734.7 16815 294.35
## - U1 1 906.0 16986 294.83
## - Ex0 1 1162.0 17242 295.53
## - U2 1 1978.0 18058 297.71
## - Age 1 3354.5 19434 301.16
## - Ed 1 4139.1 20219 303.02
## - X 1 6094.8 22175 307.36
##
## Step: AIC=292.75
## R ~ Age + Ed + Ex0 + M + U1 + U2 + W + X
##
## Df Sum of Sq RSS AIC
## - M 1 691.0 16943 292.71
## <none> 16252 292.75
## - W 1 759.0 17011 292.90
## - U1 1 921.8 17173 293.35
## - U2 1 2018.1 18270 296.25
## - Age 1 3323.1 19575 299.50
## - Ed 1 4005.1 20257 301.11
## - X 1 6402.7 22654 306.36
## - Ex0 1 11818.8 28070 316.44
##
## Step: AIC=292.71
## R ~ Age + Ed + Ex0 + U1 + U2 + W + X
##
## Df Sum of Sq RSS AIC
## - U1 1 408.6 17351 291.83
## <none> 16943 292.71
## - W 1 1016.9 17959 293.45
## - U2 1 1548.6 18491 294.82
## - Age 1 4511.6 21454 301.81
## - Ed 1 6430.6 23373 305.83
## - X 1 8147.7 25090 309.16
## - Ex0 1 12019.6 28962 315.91
##
## Step: AIC=291.83
## R ~ Age + Ed + Ex0 + U2 + W + X
##
## Df Sum of Sq RSS AIC
## <none> 17351 291.83
## - W 1 1252.6 18604 293.11
## - U2 1 1628.7 18980 294.05
## - Age 1 4461.0 21812 300.58
## - Ed 1 6214.7 23566 304.22
## - X 1 8932.3 26283 309.35
## - Ex0 1 15596.5 32948 319.97
summary(selection)
##
## Call:
## lm(formula = R ~ Age + Ed + Ex0 + U2 + W + X, data = UScrime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.306 -10.209 -1.313 9.919 54.544
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -618.5028 108.2456 -5.714 1.19e-06 ***
## Age 1.1252 0.3509 3.207 0.002640 **
## Ed 1.8179 0.4803 3.785 0.000505 ***
## Ex0 1.0507 0.1752 5.996 4.78e-07 ***
## U2 0.8282 0.4274 1.938 0.059743 .
## W 0.1596 0.0939 1.699 0.097028 .
## X 0.8236 0.1815 4.538 5.10e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.83 on 40 degrees of freedom
## Multiple R-squared: 0.7478, Adjusted R-squared: 0.71
## F-statistic: 19.77 on 6 and 40 DF, p-value: 1.441e-10
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts -------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
V <- 10
T <- 2
set.seed(42)
Folds <- caret::createMultiFolds(UScrime$R, k = V, times = T)
errCV <- matrix(nrow = 2, ncol = (T*V))
for (v in 1: (T*V)) {
UScrimetrain <- UScrime[Folds[[v]],]
UScrimetest <- UScrime[-Folds[[v]],]
regtmp1 <- lm(R ~ Age + Ed + Ex0 + U2 + W + X, data = UScrimetrain)
predtmp1 <- predict(regtmp1, newdata = UScrimetest)
errCV[1,v] <- mean((UScrimetest$R-predtmp1)^2)
regtmp2 <- lm(R ~ Age + Ed + Ex0 + U2 + X, data = UScrimetrain)
predtmp2 <- predict(regtmp2, newdata = UScrimetest)
errCV[2,v] <- mean((UScrimetest$R-predtmp2)^2)
}
errCV
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 512.2084 1630.567 115.50292 72.20725 766.6511 519.7437 745.6542
## [2,] 654.5690 1742.317 92.91483 37.17599 539.4432 333.7470 864.1295
## [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 464.9904 489.0253 216.9277 98.6023 954.2065 220.7154 193.5853
## [2,] 126.4217 237.9475 289.9710 145.8506 1126.1625 136.4183 207.1110
## [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 561.3703 790.7243 1063.628 250.5367 1263.181 363.5801
## [2,] 478.6685 182.0301 1046.730 240.1709 1414.138 486.2783
apply(sqrt(errCV),1,mean)
## [1] 22.14550 20.61174
set.seed(42)
trCtrl <- trainControl(method = "repeatedcv", number = V, repeats = T)
reg1 <- train(R ~ Age + Ed + Ex0 + U2 + W + X, data = UScrime, method = "lm",
trControl = trCtrl)
reg1
## Linear Regression
##
## 47 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 44, 42, 43, 41, 43, 42, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 22.83478 0.5759024 18.66797
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
reg2 <- train(R ~ Age + Ed + Ex0 + U2 + X, data = UScrime, method = "lm",
trControl = trCtrl)
reg2
## Linear Regression
##
## 47 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 42, 43, 43, 42, 42, 42, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 21.85104 0.7067288 17.09421
##
## Tuning parameter 'intercept' was held constant at a value of TRUE