library(earth.dof.patch)
library(earth)
#> Loading required package: Formula
#> Loading required package: plotmo
#> Loading required package: plotrix
Ozone Data
The Ozone data is shift with earth
package, called
ozone1
To check the performance in higher dimension, we can manually create 50 nuisance features.
head(ozone1)
#> O3 vh wind humidity temp ibh dpg ibt vis doy
#> 1 3 5710 4 28 40 2693 -25 87 250 33
#> 2 5 5700 3 37 45 590 -24 128 100 34
#> 3 5 5760 3 51 54 1450 25 139 60 35
#> 4 6 5720 4 69 35 1568 15 121 60 36
#> 5 4 5790 6 19 45 2631 -33 123 100 37
#> 6 4 5790 3 25 55 554 -28 182 250 38
mod = earth(O3 ~ ., data = ozone1)
summary(mod)
#> Call: earth(formula=O3~., data=ozone1)
#>
#> coefficients
#> (Intercept) 14.1595171
#> h(5860-vh) -0.0137728
#> h(wind-3) -0.3377222
#> h(54-humidity) -0.1349547
#> h(temp-58) 0.2791320
#> h(1105-ibh) -0.0033837
#> h(dpg-10) -0.0991581
#> h(ibt-120) 0.0326330
#> h(150-vis) 0.0231881
#> h(96-doy) -0.1105145
#> h(doy-96) 0.0406468
#> h(doy-158) -0.0836732
#>
#> Selected 12 of 20 terms, and 9 of 9 predictors
#> Termination condition: Reached nk 21
#> Importance: temp, humidity, dpg, doy, vh, ibh, vis, ibt, wind
#> Number of terms at each degree of interaction: 1 11 (additive model)
#> GCV 14.61004 RSS 4172.671 GRSq 0.7730502 RSq 0.8023874
then calculate the corrected penalty given the fitted earth object
mod
,
res = correct_df(mod)
refit the earth model with new penalty
mod1 = earth(O3 ~ ., data = ozone1, penalty = res$penalty)
summary(mod1)
#> Call: earth(formula=O3~., data=ozone1, penalty=res$penalty)
#>
#> coefficients
#> (Intercept) 15.5159893
#> h(5860-vh) -0.0157353
#> h(wind-3) -0.3192190
#> h(54-humidity) -0.1404223
#> h(temp-58) 0.3044700
#> h(1105-ibh) -0.0035028
#> h(dpg-10) -0.0891300
#> h(ibt-120) 0.0321264
#> h(150-vis) 0.0223795
#> h(96-doy) -0.1350538
#> h(doy-158) -0.0358712
#>
#> Selected 11 of 20 terms, and 9 of 9 predictors
#> Termination condition: Reached nk 21
#> Importance: temp, humidity, dpg, doy, vh, ibh, vis, ibt, wind
#> Number of terms at each degree of interaction: 1 10 (additive model)
#> GCV 15.12914 RSS 4234.168 GRSq 0.7649866 RSq 0.7994749
perform a 5-fold cross-validation
cv_compare = function(nfold = 5, degree = 2) {
n = nrow(ozone1)
folds = rep(1:nfold, each = n / nfold)
folds = folds[sample(1:n, n)] # re-order
errs = matrix(0, nrow = 2, ncol = nfold)
corrected_penalty = NULL
for (k in 1:nfold) {
train = ozone1[folds != k, ]
test = ozone1[folds == k, ]
mod = earth(O3 ~ ., data = train, degree = degree)
if (is.null(corrected_penalty)) {
dfs = correct_df(mod)
corrected_penalty = dfs$penalty
cat("corrected penalty = ", dfs$penalty, "\n")
}
mod1 = earth(O3 ~ ., data = train, degree = degree, penalty = corrected_penalty)
pred = predict(mod, newdata = test)
pred1 = predict(mod1, newdata = test)
errs[, k] = c(mean((test$O3 - pred)^2), mean((test$O3 - pred1)^2))
}
mu = rowMeans(errs)
se = apply(errs, 1, sd) / sqrt(nfold)
cat("CV error = \n")
print(mu)
cat("CV error se = \n")
print(se)
return(errs)
}
when we only use linear terms,
set.seed(1234)
errs.d1 = cv_compare(5, 1)
#> corrected penalty = 2.99357
#> CV error =
#> [1] 16.65751 16.61001
#> CV error se =
#> [1] 2.294709 2.266587
and if we allow second-degree interactions,
set.seed(1234)
errs.d2 = cv_compare(5, 2)
#> corrected penalty = 3.92098
#> CV error =
#> [1] 17.32851 17.20663
#> CV error se =
#> [1] 2.336629 2.423364