library(earth)
#> Loading required package: Formula
#> Loading required package: plotmo
#> Loading required package: plotrix
library(earth.dof.patch)
The spam
data set has been used frequently in The Elements
of Statistical Learning (ESL). It consists of information
from 4601 email messages, collected in a study to predict whether an email
was spam (i.e., junk email). The data can be accessed from https://archive.ics.uci.edu/dataset/94/spambase or https://github.com/szcf-weiya/ESL-CN/tree/master/data/Spam.
Here, we also include the data in our package, and one can simply load it as follows:
# Load the spam dataset bundled with this package and preview its first rows.
data("spam")
head(spam)
#> word_freq_make word_freq_address word_freq_all word_freq_3d word_freq_our
#> 1 0.00 0.64 0.64 0 0.32
#> 2 0.21 0.28 0.50 0 0.14
#> 3 0.06 0.00 0.71 0 1.23
#> 4 0.00 0.00 0.00 0 0.63
#> 5 0.00 0.00 0.00 0 0.63
#> 6 0.00 0.00 0.00 0 1.85
#> word_freq_over word_freq_remove word_freq_internet word_freq_order
#> 1 0.00 0.00 0.00 0.00
#> 2 0.28 0.21 0.07 0.00
#> 3 0.19 0.19 0.12 0.64
#> 4 0.00 0.31 0.63 0.31
#> 5 0.00 0.31 0.63 0.31
#> 6 0.00 0.00 1.85 0.00
#> word_freq_mail word_freq_receive word_freq_will word_freq_people
#> 1 0.00 0.00 0.64 0.00
#> 2 0.94 0.21 0.79 0.65
#> 3 0.25 0.38 0.45 0.12
#> 4 0.63 0.31 0.31 0.31
#> 5 0.63 0.31 0.31 0.31
#> 6 0.00 0.00 0.00 0.00
#> word_freq_report word_freq_addresses word_freq_free word_freq_business
#> 1 0.00 0.00 0.32 0.00
#> 2 0.21 0.14 0.14 0.07
#> 3 0.00 1.75 0.06 0.06
#> 4 0.00 0.00 0.31 0.00
#> 5 0.00 0.00 0.31 0.00
#> 6 0.00 0.00 0.00 0.00
#> word_freq_email word_freq_you word_freq_credit word_freq_your word_freq_font
#> 1 1.29 1.93 0.00 0.96 0
#> 2 0.28 3.47 0.00 1.59 0
#> 3 1.03 1.36 0.32 0.51 0
#> 4 0.00 3.18 0.00 0.31 0
#> 5 0.00 3.18 0.00 0.31 0
#> 6 0.00 0.00 0.00 0.00 0
#> word_freq_000 word_freq_money word_freq_hp word_freq_hpl word_freq_george
#> 1 0.00 0.00 0 0 0
#> 2 0.43 0.43 0 0 0
#> 3 1.16 0.06 0 0 0
#> 4 0.00 0.00 0 0 0
#> 5 0.00 0.00 0 0 0
#> 6 0.00 0.00 0 0 0
#> word_freq_650 word_freq_lab word_freq_labs word_freq_telnet word_freq_857
#> 1 0 0 0 0 0
#> 2 0 0 0 0 0
#> 3 0 0 0 0 0
#> 4 0 0 0 0 0
#> 5 0 0 0 0 0
#> 6 0 0 0 0 0
#> word_freq_data word_freq_415 word_freq_85 word_freq_technology word_freq_1999
#> 1 0 0 0 0 0.00
#> 2 0 0 0 0 0.07
#> 3 0 0 0 0 0.00
#> 4 0 0 0 0 0.00
#> 5 0 0 0 0 0.00
#> 6 0 0 0 0 0.00
#> word_freq_parts word_freq_pm word_freq_direct word_freq_cs word_freq_meeting
#> 1 0 0 0.00 0 0
#> 2 0 0 0.00 0 0
#> 3 0 0 0.06 0 0
#> 4 0 0 0.00 0 0
#> 5 0 0 0.00 0 0
#> 6 0 0 0.00 0 0
#> word_freq_original word_freq_project word_freq_re word_freq_edu
#> 1 0.00 0 0.00 0.00
#> 2 0.00 0 0.00 0.00
#> 3 0.12 0 0.06 0.06
#> 4 0.00 0 0.00 0.00
#> 5 0.00 0 0.00 0.00
#> 6 0.00 0 0.00 0.00
#> word_freq_table word_freq_conference char_freq_; char_freq_( char_freq_[
#> 1 0 0 0.00 0.000 0
#> 2 0 0 0.00 0.132 0
#> 3 0 0 0.01 0.143 0
#> 4 0 0 0.00 0.137 0
#> 5 0 0 0.00 0.135 0
#> 6 0 0 0.00 0.223 0
#> char_freq_! char_freq_$ char_freq_# capital_run_length_average
#> 1 0.778 0.000 0.000 3.756
#> 2 0.372 0.180 0.048 5.114
#> 3 0.276 0.184 0.010 9.821
#> 4 0.137 0.000 0.000 3.537
#> 5 0.135 0.000 0.000 3.537
#> 6 0.000 0.000 0.000 3.000
#> capital_run_length_longest capital_run_length_total class
#> 1 61 278 1
#> 2 101 1028 1
#> 3 485 2259 1
#> 4 40 191 1
#> 5 40 191 1
#> 6 15 54 1
# Train/test split indicator used on the ESL website.
# NOTE(review): here 0 selects the training rows and 1 the test rows --
# confirm this matches the ESL site's convention.
data("flag.esl.trainset")
spam.train <- spam[flag.esl.trainset == 0, ]
spam.test <- spam[flag.esl.trainset == 1, ]
where flag.esl.trainset
is the indicator distinguishing the training
set from the test set, as used on the ESL website.
Now we compare the performance of the original MARS (via
earth
) and our proposed MARS with corrected degrees of
freedom.
#' Compare the classification performance of the original MARS (via `earth`)
#' and MARS refit with the corrected degrees-of-freedom penalty.
#'
#' Fits `earth(class ~ ., data = spam.train, degree = degree)`, prints the
#' confusion table, its proportions, and the test error rate on `spam.test`,
#' then refits with the penalty returned by `correct_df()` and prints the
#' same summaries.
#' NOTE(review): relies on `spam.train` and `spam.test` existing in the
#' calling environment -- confirm they are defined before calling.
#'
#' @param degree Maximum interaction degree passed to `earth()` (default 1).
#' @return Invisibly `NULL`; called for its printed side effects.
compare_class_table <- function(degree = 1) {
  # Print a confusion table, its cell proportions, and the misclassification
  # rate (sum of off-diagonal cells over the total).
  report_confusion <- function(tbl) {
    cat("the confusion table is:\n")
    print(tbl)
    cat("the confusion table in proportion is:\n")
    print(tbl / sum(tbl))
    # Off-diagonal sum generalizes the original (tbl[2] + tbl[3]) indexing,
    # which assumed a full 2x2 table; the two agree whenever both prediction
    # levels are present.
    cat("the error rate is: ", (sum(tbl) - sum(diag(tbl))) / sum(tbl), "\n")
  }

  # Original MARS with the default GCV penalty.
  mod <- earth(class ~ ., data = spam.train, degree = degree)
  pred <- predict(mod, newdata = spam.test, type = "class")
  report_confusion(table(pred, truth = spam.test$class))

  cat("==============\n")

  # Refit with the corrected degrees-of-freedom penalty from correct_df().
  dfs <- correct_df(mod)
  cat("use corrected penalty = ", dfs$penalty, "\n")
  mod1 <- earth(class ~ ., data = spam.train, degree = degree,
                penalty = dfs$penalty)
  pred1 <- predict(mod1, newdata = spam.test, type = "class")
  report_confusion(table(pred1, truth = spam.test$class))

  invisible(NULL)
}
First, when we use only linear terms, the performance is:
compare_class_table(1)
#> the confusion table is:
#> truth
#> pred 0 1
#> 0 887 76
#> 1 54 519
#> the confusion table in proportion is:
#> truth
#> pred 0 1
#> 0 0.57747396 0.04947917
#> 1 0.03515625 0.33789062
#> the error rate is: 0.08463542
#> ==============
#> use corrected penalty = 6.252168
#> the confusion table is:
#> truth
#> pred1 0 1
#> 0 892 76
#> 1 49 519
#> the confusion table in proportion is:
#> truth
#> pred1 0 1
#> 0 0.58072917 0.04947917
#> 1 0.03190104 0.33789062
#> the error rate is: 0.08138021
Furthermore, when we allow second-degree interactions, the performance is:
compare_class_table(2)
#> the confusion table is:
#> truth
#> pred 0 1
#> 0 896 60
#> 1 45 535
#> the confusion table in proportion is:
#> truth
#> pred 0 1
#> 0 0.58333333 0.03906250
#> 1 0.02929688 0.34830729
#> the error rate is: 0.06835938
#> ==============
#> use corrected penalty = 7.935667
#> the confusion table is:
#> truth
#> pred1 0 1
#> 0 900 60
#> 1 41 535
#> the confusion table in proportion is:
#> truth
#> pred1 0 1
#> 0 0.58593750 0.03906250
#> 1 0.02669271 0.34830729
#> the error rate is: 0.06575521