library(earth)
#> Loading required package: Formula
#> Loading required package: plotmo
#> Loading required package: plotrix
library(earth.dof.patch)
The spam
data set has been used frequently in The Elements
of Statistical Learning (ESL). It consists of information
from 4601 email messages, collected in a study to predict whether an email
was spam (i.e., junk email). The data can be accessed from https://archive.ics.uci.edu/dataset/94/spambase or https://github.com/szcf-weiya/ESL-CN/tree/master/data/Spam.
Here, we also include the data in our package, and one can simply load it as follows:
# Load the spam dataset bundled with this package and preview its first rows.
data("spam")
head(spam)
#> word_freq_make word_freq_address word_freq_all word_freq_3d word_freq_our
#> 1 0.00 0.64 0.64 0 0.32
#> 2 0.21 0.28 0.50 0 0.14
#> 3 0.06 0.00 0.71 0 1.23
#> 4 0.00 0.00 0.00 0 0.63
#> 5 0.00 0.00 0.00 0 0.63
#> 6 0.00 0.00 0.00 0 1.85
#> word_freq_over word_freq_remove word_freq_internet word_freq_order
#> 1 0.00 0.00 0.00 0.00
#> 2 0.28 0.21 0.07 0.00
#> 3 0.19 0.19 0.12 0.64
#> 4 0.00 0.31 0.63 0.31
#> 5 0.00 0.31 0.63 0.31
#> 6 0.00 0.00 1.85 0.00
#> word_freq_mail word_freq_receive word_freq_will word_freq_people
#> 1 0.00 0.00 0.64 0.00
#> 2 0.94 0.21 0.79 0.65
#> 3 0.25 0.38 0.45 0.12
#> 4 0.63 0.31 0.31 0.31
#> 5 0.63 0.31 0.31 0.31
#> 6 0.00 0.00 0.00 0.00
#> word_freq_report word_freq_addresses word_freq_free word_freq_business
#> 1 0.00 0.00 0.32 0.00
#> 2 0.21 0.14 0.14 0.07
#> 3 0.00 1.75 0.06 0.06
#> 4 0.00 0.00 0.31 0.00
#> 5 0.00 0.00 0.31 0.00
#> 6 0.00 0.00 0.00 0.00
#> word_freq_email word_freq_you word_freq_credit word_freq_your word_freq_font
#> 1 1.29 1.93 0.00 0.96 0
#> 2 0.28 3.47 0.00 1.59 0
#> 3 1.03 1.36 0.32 0.51 0
#> 4 0.00 3.18 0.00 0.31 0
#> 5 0.00 3.18 0.00 0.31 0
#> 6 0.00 0.00 0.00 0.00 0
#> word_freq_000 word_freq_money word_freq_hp word_freq_hpl word_freq_george
#> 1 0.00 0.00 0 0 0
#> 2 0.43 0.43 0 0 0
#> 3 1.16 0.06 0 0 0
#> 4 0.00 0.00 0 0 0
#> 5 0.00 0.00 0 0 0
#> 6 0.00 0.00 0 0 0
#> word_freq_650 word_freq_lab word_freq_labs word_freq_telnet word_freq_857
#> 1 0 0 0 0 0
#> 2 0 0 0 0 0
#> 3 0 0 0 0 0
#> 4 0 0 0 0 0
#> 5 0 0 0 0 0
#> 6 0 0 0 0 0
#> word_freq_data word_freq_415 word_freq_85 word_freq_technology word_freq_1999
#> 1 0 0 0 0 0.00
#> 2 0 0 0 0 0.07
#> 3 0 0 0 0 0.00
#> 4 0 0 0 0 0.00
#> 5 0 0 0 0 0.00
#> 6 0 0 0 0 0.00
#> word_freq_parts word_freq_pm word_freq_direct word_freq_cs word_freq_meeting
#> 1 0 0 0.00 0 0
#> 2 0 0 0.00 0 0
#> 3 0 0 0.06 0 0
#> 4 0 0 0.00 0 0
#> 5 0 0 0.00 0 0
#> 6 0 0 0.00 0 0
#> word_freq_original word_freq_project word_freq_re word_freq_edu
#> 1 0.00 0 0.00 0.00
#> 2 0.00 0 0.00 0.00
#> 3 0.12 0 0.06 0.06
#> 4 0.00 0 0.00 0.00
#> 5 0.00 0 0.00 0.00
#> 6 0.00 0 0.00 0.00
#> word_freq_table word_freq_conference char_freq_; char_freq_( char_freq_[
#> 1 0 0 0.00 0.000 0
#> 2 0 0 0.00 0.132 0
#> 3 0 0 0.01 0.143 0
#> 4 0 0 0.00 0.137 0
#> 5 0 0 0.00 0.135 0
#> 6 0 0 0.00 0.223 0
#> char_freq_! char_freq_$ char_freq_# capital_run_length_average
#> 1 0.778 0.000 0.000 3.756
#> 2 0.372 0.180 0.048 5.114
#> 3 0.276 0.184 0.010 9.821
#> 4 0.137 0.000 0.000 3.537
#> 5 0.135 0.000 0.000 3.537
#> 6 0.000 0.000 0.000 3.000
#> capital_run_length_longest capital_run_length_total class
#> 1 61 278 1
#> 2 101 1028 1
#> 3 485 2259 1
#> 4 40 191 1
#> 5 40 191 1
#> 6 15 54 1
# Train/test split indicator used on the ESL website.
# NOTE(review): here 0 selects the training rows and 1 the test rows --
# confirm this matches the ESL site's convention.
data("flag.esl.trainset")
spam.train <- spam[flag.esl.trainset == 0, ]
spam.test <- spam[flag.esl.trainset == 1, ]
where flag.esl.trainset
is the indicator distinguishing the training
set from the test set, as used on the ESL website.
Now we compare the performance of the original MARS (via
earth
) and our proposed MARS with corrected degrees of
freedom.
#' Compare the classification performance of the original MARS (via `earth`)
#' and MARS refit with the corrected degrees-of-freedom penalty.
#'
#' Fits `earth(class ~ ., data = spam.train, degree = degree)`, prints the
#' confusion table, its proportions, and the test error rate on `spam.test`,
#' then refits with the penalty returned by `correct_df()` and prints the
#' same summaries.
#' NOTE(review): relies on `spam.train` and `spam.test` existing in the
#' calling environment -- confirm they are defined before calling.
#'
#' @param degree Maximum interaction degree passed to `earth()` (default 1).
#' @return Invisibly `NULL`; called for its printed side effects.
compare_class_table <- function(degree = 1) {
  # Print a confusion table, its cell proportions, and the misclassification
  # rate (sum of off-diagonal cells over the total).
  report_confusion <- function(tbl) {
    cat("the confusion table is:\n")
    print(tbl)
    cat("the confusion table in proportion is:\n")
    print(tbl / sum(tbl))
    # Off-diagonal sum generalizes the original (tbl[2] + tbl[3]) indexing,
    # which assumed a full 2x2 table; the two agree whenever both prediction
    # levels are present.
    cat("the error rate is: ", (sum(tbl) - sum(diag(tbl))) / sum(tbl), "\n")
  }

  # Original MARS with the default GCV penalty.
  mod <- earth(class ~ ., data = spam.train, degree = degree)
  pred <- predict(mod, newdata = spam.test, type = "class")
  report_confusion(table(pred, truth = spam.test$class))

  cat("==============\n")

  # Refit with the corrected degrees-of-freedom penalty from correct_df().
  dfs <- correct_df(mod)
  cat("use corrected penalty = ", dfs$penalty, "\n")
  mod1 <- earth(class ~ ., data = spam.train, degree = degree,
                penalty = dfs$penalty)
  pred1 <- predict(mod1, newdata = spam.test, type = "class")
  report_confusion(table(pred1, truth = spam.test$class))

  invisible(NULL)
}
First, when we use only linear terms, the performance is:
compare_class_table(1)
#> the confusion table is:
#> truth
#> pred 0 1
#> 0 887 76
#> 1 54 519
#> the confusion table in proportion is:
#> truth
#> pred 0 1
#> 0 0.57747396 0.04947917
#> 1 0.03515625 0.33789062
#> the error rate is: 0.08463542
#> ==============
#> use corrected penalty = 6.252168
#> the confusion table is:
#> truth
#> pred1 0 1
#> 0 892 76
#> 1 49 519
#> the confusion table in proportion is:
#> truth
#> pred1 0 1
#> 0 0.58072917 0.04947917
#> 1 0.03190104 0.33789062
#> the error rate is: 0.08138021
Furthermore, when we allow second-degree interactions, the performance is:
compare_class_table(2)
#> the confusion table is:
#> truth
#> pred 0 1
#> 0 896 60
#> 1 45 535
#> the confusion table in proportion is:
#> truth
#> pred 0 1
#> 0 0.58333333 0.03906250
#> 1 0.02929688 0.34830729
#> the error rate is: 0.06835938
#> ==============
#> use corrected penalty = 7.935667
#> the confusion table is:
#> truth
#> pred1 0 1
#> 0 900 60
#> 1 41 535
#> the confusion table in proportion is:
#> truth
#> pred1 0 1
#> 0 0.58593750 0.03906250
#> 1 0.02669271 0.34830729
#> the error rate is: 0.06575521