Application to Ozone Data • earth.dof.patch

library(earth.dof.patch)
library(earth)
#> Loading required package: Formula
#> Loading required package: plotmo
#> Loading required package: plotrix

Ozone Data

The Ozone data is shift with earth package, called ozone1

To check the performance in higher dimension, we can manually create 50 nuisance features.

set.seed(1234)
ozone1 = cbind(ozone1, matrix(rnorm(nrow(ozone1) * 50), ncol = 50))

head(ozone1)
#>   O3   vh wind humidity temp  ibh dpg ibt vis doy
#> 1  3 5710    4       28   40 2693 -25  87 250  33
#> 2  5 5700    3       37   45  590 -24 128 100  34
#> 3  5 5760    3       51   54 1450  25 139  60  35
#> 4  6 5720    4       69   35 1568  15 121  60  36
#> 5  4 5790    6       19   45 2631 -33 123 100  37
#> 6  4 5790    3       25   55  554 -28 182 250  38

mod = earth(O3 ~ ., data = ozone1)

summary(mod)
#> Call: earth(formula=O3~., data=ozone1)
#> 
#>                coefficients
#> (Intercept)      14.1595171
#> h(5860-vh)       -0.0137728
#> h(wind-3)        -0.3377222
#> h(54-humidity)   -0.1349547
#> h(temp-58)        0.2791320
#> h(1105-ibh)      -0.0033837
#> h(dpg-10)        -0.0991581
#> h(ibt-120)        0.0326330
#> h(150-vis)        0.0231881
#> h(96-doy)        -0.1105145
#> h(doy-96)         0.0406468
#> h(doy-158)       -0.0836732
#> 
#> Selected 12 of 20 terms, and 9 of 9 predictors
#> Termination condition: Reached nk 21
#> Importance: temp, humidity, dpg, doy, vh, ibh, vis, ibt, wind
#> Number of terms at each degree of interaction: 1 11 (additive model)
#> GCV 14.61004    RSS 4172.671    GRSq 0.7730502    RSq 0.8023874

then calculate the corrected penalty given the fitted earth object mod,

res = correct_df(mod)

refit the earth model with new penalty

mod1 = earth(O3 ~ ., data = ozone1, penalty = res$penalty)

summary(mod1)
#> Call: earth(formula=O3~., data=ozone1, penalty=res$penalty)
#> 
#>                coefficients
#> (Intercept)      15.5159893
#> h(5860-vh)       -0.0157353
#> h(wind-3)        -0.3192190
#> h(54-humidity)   -0.1404223
#> h(temp-58)        0.3044700
#> h(1105-ibh)      -0.0035028
#> h(dpg-10)        -0.0891300
#> h(ibt-120)        0.0321264
#> h(150-vis)        0.0223795
#> h(96-doy)        -0.1350538
#> h(doy-158)       -0.0358712
#> 
#> Selected 11 of 20 terms, and 9 of 9 predictors
#> Termination condition: Reached nk 21
#> Importance: temp, humidity, dpg, doy, vh, ibh, vis, ibt, wind
#> Number of terms at each degree of interaction: 1 10 (additive model)
#> GCV 15.12914    RSS 4234.168    GRSq 0.7649866    RSq 0.7994749

perform a 5-fold cross-validation

cv_compare = function(nfold = 5, degree = 2) {
  n = nrow(ozone1)
  folds = rep(1:nfold, each = n / nfold)
  folds = folds[sample(1:n, n)] # re-order
  errs = matrix(0, nrow = 2, ncol = nfold)
  corrected_penalty = NULL
  for (k in 1:nfold) {
    train = ozone1[folds != k, ]
    test = ozone1[folds == k, ]
    mod = earth(O3 ~ ., data = train, degree = degree)
    if (is.null(corrected_penalty)) {
      dfs = correct_df(mod)
      corrected_penalty = dfs$penalty
      cat("corrected penalty = ", dfs$penalty, "\n")
    }
    mod1 = earth(O3 ~ ., data = train, degree = degree, penalty = corrected_penalty)
    pred = predict(mod, newdata = test)
    pred1 = predict(mod1, newdata = test)
    errs[, k] = c(mean((test$O3 - pred)^2), mean((test$O3 - pred1)^2))
  }
  mu = rowMeans(errs)
  se = apply(errs, 1, sd) / sqrt(nfold)
  cat("CV error = \n")
  print(mu)
  cat("CV error se = \n")
  print(se)
  return(errs)
}

when we only use linear terms,

set.seed(1234)
errs.d1 = cv_compare(5, 1)
#> corrected penalty =  2.99357 
#> CV error = 
#> [1] 16.65751 16.61001
#> CV error se = 
#> [1] 2.294709 2.266587

and if we allow second-degree interactions,

set.seed(1234)
errs.d2 = cv_compare(5, 2)
#> corrected penalty =  3.92098 
#> CV error = 
#> [1] 17.32851 17.20663
#> CV error se = 
#> [1] 2.336629 2.423364