Lorsque le forestier value la vigueur d’une forret, il considère souvent la hauteur des arbres qui la compose. Plus les arbres sont hauts, plus la forret ou la plantation produit.
Cependant, mesurer la hauteur d’un arbre d’une vingtaine de mètres n’est pas simple. Il est alors nécessaire d’estimer la hauteur grâce à une mesure simple, la mesure de la circonférence à 1 mètre 30 du sol.
Nous cherchons à expliquer la hauteur de n = 1429 eucalyptus par leur circonférence.
library(dplyr)
eucalyptus <- read.table("eucalyptus.txt",sep=";", header=TRUE)
glimpse(eucalyptus)
## Observations: 1,429
## Variables: 5
## $ numero <int> 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17,...
## $ ht <dbl> 18.25, 19.75, 16.50, 18.25, 19.50, 16.25, 17.25, 19.00,...
## $ circ <int> 36, 42, 33, 39, 43, 34, 37, 41, 27, 30, 26, 35, 39, 31,...
## $ bloc <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ clone <fct> L2-123, L2-123, L2-123, L2-123, L2-123, L2-123, L2-123,...
summary(eucalyptus)
## numero ht circ bloc
## Min. : 1.0 Min. :11.25 Min. :26.00 Min. :1.000
## 1st Qu.: 449.0 1st Qu.:19.75 1st Qu.:42.00 1st Qu.:1.000
## Median : 893.0 Median :21.75 Median :48.00 Median :2.000
## Mean : 883.2 Mean :21.21 Mean :47.35 Mean :1.852
## 3rd Qu.:1318.0 3rd Qu.:23.00 3rd Qu.:54.00 3rd Qu.:2.000
## Max. :1737.0 Max. :27.75 Max. :77.00 Max. :3.000
##
## clone
## 18-220 : 27
## 1-105 : 26
## 18-181 : 26
## 18-194 : 26
## 18-196 : 26
## 18-199 : 26
## (Other):1272
eucalyptus <- mutate(eucalyptus, bloc = factor(bloc))
summary(eucalyptus)
## numero ht circ bloc clone
## Min. : 1.0 Min. :11.25 Min. :26.00 1:527 18-220 : 27
## 1st Qu.: 449.0 1st Qu.:19.75 1st Qu.:42.00 2:586 1-105 : 26
## Median : 893.0 Median :21.75 Median :48.00 3:316 18-181 : 26
## Mean : 883.2 Mean :21.21 Mean :47.35 18-194 : 26
## 3rd Qu.:1318.0 3rd Qu.:23.00 3rd Qu.:54.00 18-196 : 26
## Max. :1737.0 Max. :27.75 Max. :77.00 18-199 : 26
## (Other):1272
Nous cherchons à expliquer la hauteur de n = 1429 eucalyptus par leur circonférence.
library(ggplot2)
ggplot(data = eucalyptus, aes(x = circ, y = ht)) + geom_point()
peuc <- ggplot(data = eucalyptus, aes(x = circ, y = ht)) + geom_point()
mod_lin <- lm(ht~ circ, data = eucalyptus)
mod_lin$coefficients
## (Intercept) circ
## 9.0374757 0.2571379
pred_lin <- predict(mod_lin, eucalyptus)
pred_df <- eucalyptus %>% select(circ,ht) %>% mutate(pred_lin)
peuc + geom_line(data = pred_df, aes(y = pred_lin), color = "blue", lwd=1)
Une autre facon de faire ce graphique est la suivante
peuc + geom_smooth(method = "lm", se =FALSE)+ ggtitle("Linear")
mod_sqrt<- lm(ht ~ circ + sqrt(circ) ,data = eucalyptus)
pred_sqrt <- predict(mod_sqrt, eucalyptus)
pred_df <- pred_df %>% mutate(pred_sqrt)
peuc + geom_line(data = pred_df, aes(y = pred_lin), color = "blue", lwd=1) +
geom_line(data = pred_df, aes(y = pred_sqrt), color = "pink", lwd=1)
mod_poly2 <- lm(ht ~ poly(circ, 2),data = eucalyptus)
pred_poly2 <- predict(mod_poly2, eucalyptus)
pred_df <- pred_df %>% mutate(pred_poly2)
peuc + geom_line(data = pred_df, aes(y = pred_lin), color = "blue", lwd=1) +
geom_line(data = pred_df, aes(y = pred_sqrt), color = "pink", lwd=1) +
geom_line(data = pred_df, aes(y = pred_poly2), color = "red", lwd=1)
pred_df <- pred_df %>% mutate(pred_poly2)
summary(mod_lin)
##
## Call:
## lm(formula = ht ~ circ, data = eucalyptus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7659 -0.7802 0.0557 0.8271 3.6913
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.037476 0.179802 50.26 <2e-16 ***
## circ 0.257138 0.003738 68.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.199 on 1427 degrees of freedom
## Multiple R-squared: 0.7683, Adjusted R-squared: 0.7682
## F-statistic: 4732 on 1 and 1427 DF, p-value: < 2.2e-16
summary(mod_sqrt)
##
## Call:
## lm(formula = ht ~ circ + sqrt(circ), data = eucalyptus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1881 -0.6881 0.0427 0.7927 3.7481
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.35200 2.61444 -9.314 <2e-16 ***
## circ -0.48295 0.05793 -8.336 <2e-16 ***
## sqrt(circ) 9.98689 0.78033 12.798 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.136 on 1426 degrees of freedom
## Multiple R-squared: 0.7922, Adjusted R-squared: 0.7919
## F-statistic: 2718 on 2 and 1426 DF, p-value: < 2.2e-16
summary(mod_poly2)
##
## Call:
## lm(formula = ht ~ poly(circ, 2), data = eucalyptus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2140 -0.6947 0.0360 0.7732 3.6970
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.21239 0.03022 701.98 <2e-16 ***
## poly(circ, 2)1 82.49443 1.14230 72.22 <2e-16 ***
## poly(circ, 2)2 -13.83374 1.14230 -12.11 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.142 on 1426 degrees of freedom
## Multiple R-squared: 0.7899, Adjusted R-squared: 0.7896
## F-statistic: 2681 on 2 and 1426 DF, p-value: < 2.2e-16
library(caret)
ind_train <- createDataPartition(eucalyptus$ht,p = 0.8,list = FALSE)
eucatrain <- eucalyptus[ind_train,]
eucatest <- eucalyptus[-ind_train,]
mod_poly2<- lm(ht ~ poly(circ,2), data = eucatrain)
pred_poly2_train <- predict(mod_poly2, newdata = eucatrain)
pred_poly2_test <- predict(mod_poly2, newdata = eucatest)
err_test <- mean((eucatest$ht - pred_poly2_test)^2)
err_test
## [1] 1.273336
Sur vous données, quel modèle choisir ? modèle 1, modèle 2 ou modèle 3 ?
Un cadeaux pour vous car vous avez été sages aujourdh’ui :-)
Remarque : J’ai fini par fixer la graine à l’aide de set.seed (voir ci-dessous)….
V <- 5
set.seed(123)
Folds <- caret::createFolds(eucalyptus$ht, k = V)
#createFolds
errCV <- tibble(model = character(),
err = numeric(), Folds = numeric())
for (k in 1: V) {
ind_test <- Folds[[k]]
eucatrain <-eucalyptus[-ind_test,]
eucatest <- eucalyptus[ind_test,]
reg1 <- lm(ht ~ circ, data = eucatrain)
pred1 <- predict(reg1, newdata = eucatest)
errCV <- errCV %>%
add_row(model = "model1(lin)",
err =mean((eucatest$ht-pred1)^2),
Folds = k)
reg2 <- lm(ht ~ circ + sqrt(circ), data = eucatrain)
pred2 <- predict(reg2, newdata = eucatest)
errCV <- errCV %>%
add_row(model = "model2(sqrt)",
err =mean((eucatest$ht-pred2)^2),
Folds = k)
reg3 <- lm(ht ~ poly(circ,2), data = eucatrain)
pred2 <- predict(reg3, newdata = eucatest)
errCV <- errCV %>%
add_row(model = "model2(poly2)",
err =mean((eucatest$ht-pred2)^2),
Folds = k)
}
errCV %>% group_by(model) %>% summarise(mean(err))
## # A tibble: 3 x 2
## model `mean(err)`
## <chr> <dbl>
## 1 model1(lin) 1.44
## 2 model2(poly2) 1.31
## 3 model2(sqrt) 1.29
Dans cette partie on travaille sur la famille de polynomes. Sur vous données, quel modèle choisir ? Le but est de chercher un bon degré de polynome.
d <- 8
V <- 5
set.seed(123)
Folds <- createFolds(eucalyptus$ht, k = V)
errCV <- tibble(degree = numeric(),
err = numeric(), Folds = numeric())
for (j in 1: d ){
for (k in 1: V) {
ind_test <- Folds[[k]]
eucatrain <-eucalyptus[-ind_test,]
eucatest <- eucalyptus[ind_test,]
reg <- lm(ht ~ poly(circ,j), data = eucatrain)
pred <- predict(reg, newdata = eucatest)
errCV <- errCV %>%
add_row(degree = j,
err = mean((eucatest$ht-pred)^2),
Folds = k)
}
}
EVC <- errCV %>% group_by(degree) %>% summarise(error=mean(err))
EVC
## # A tibble: 8 x 2
## degree error
## <dbl> <dbl>
## 1 1 1.44
## 2 2 1.31
## 3 3 1.28
## 4 4 1.28
## 5 5 1.28
## 6 6 1.28
## 7 7 1.29
## 8 8 1.32
EVC %>% ggplot(aes(x=degree,y=error)) + geom_line()