library(earth)
#> Loading required package: Formula
#> Loading required package: plotmo
#> Loading required package: plotrix
library(earth.dof.patch)

The spam data is used frequently in The Elements of Statistical Learning (ESL). It consists of information from 4601 email messages, collected in a study to predict whether an email is spam (i.e., junk email). The data is available from https://archive.ics.uci.edu/dataset/94/spambase or https://github.com/szcf-weiya/ESL-CN/tree/master/data/Spam.
We also include the data in this package, and it can be loaded as follows:
data("spam")
head(spam)
#> word_freq_make word_freq_address word_freq_all word_freq_3d word_freq_our
#> 1 0.00 0.64 0.64 0 0.32
#> 2 0.21 0.28 0.50 0 0.14
#> 3 0.06 0.00 0.71 0 1.23
#> 4 0.00 0.00 0.00 0 0.63
#> 5 0.00 0.00 0.00 0 0.63
#> 6 0.00 0.00 0.00 0 1.85
#> word_freq_over word_freq_remove word_freq_internet word_freq_order
#> 1 0.00 0.00 0.00 0.00
#> 2 0.28 0.21 0.07 0.00
#> 3 0.19 0.19 0.12 0.64
#> 4 0.00 0.31 0.63 0.31
#> 5 0.00 0.31 0.63 0.31
#> 6 0.00 0.00 1.85 0.00
#> word_freq_mail word_freq_receive word_freq_will word_freq_people
#> 1 0.00 0.00 0.64 0.00
#> 2 0.94 0.21 0.79 0.65
#> 3 0.25 0.38 0.45 0.12
#> 4 0.63 0.31 0.31 0.31
#> 5 0.63 0.31 0.31 0.31
#> 6 0.00 0.00 0.00 0.00
#> word_freq_report word_freq_addresses word_freq_free word_freq_business
#> 1 0.00 0.00 0.32 0.00
#> 2 0.21 0.14 0.14 0.07
#> 3 0.00 1.75 0.06 0.06
#> 4 0.00 0.00 0.31 0.00
#> 5 0.00 0.00 0.31 0.00
#> 6 0.00 0.00 0.00 0.00
#> word_freq_email word_freq_you word_freq_credit word_freq_your word_freq_font
#> 1 1.29 1.93 0.00 0.96 0
#> 2 0.28 3.47 0.00 1.59 0
#> 3 1.03 1.36 0.32 0.51 0
#> 4 0.00 3.18 0.00 0.31 0
#> 5 0.00 3.18 0.00 0.31 0
#> 6 0.00 0.00 0.00 0.00 0
#> word_freq_000 word_freq_money word_freq_hp word_freq_hpl word_freq_george
#> 1 0.00 0.00 0 0 0
#> 2 0.43 0.43 0 0 0
#> 3 1.16 0.06 0 0 0
#> 4 0.00 0.00 0 0 0
#> 5 0.00 0.00 0 0 0
#> 6 0.00 0.00 0 0 0
#> word_freq_650 word_freq_lab word_freq_labs word_freq_telnet word_freq_857
#> 1 0 0 0 0 0
#> 2 0 0 0 0 0
#> 3 0 0 0 0 0
#> 4 0 0 0 0 0
#> 5 0 0 0 0 0
#> 6 0 0 0 0 0
#> word_freq_data word_freq_415 word_freq_85 word_freq_technology word_freq_1999
#> 1 0 0 0 0 0.00
#> 2 0 0 0 0 0.07
#> 3 0 0 0 0 0.00
#> 4 0 0 0 0 0.00
#> 5 0 0 0 0 0.00
#> 6 0 0 0 0 0.00
#> word_freq_parts word_freq_pm word_freq_direct word_freq_cs word_freq_meeting
#> 1 0 0 0.00 0 0
#> 2 0 0 0.00 0 0
#> 3 0 0 0.06 0 0
#> 4 0 0 0.00 0 0
#> 5 0 0 0.00 0 0
#> 6 0 0 0.00 0 0
#> word_freq_original word_freq_project word_freq_re word_freq_edu
#> 1 0.00 0 0.00 0.00
#> 2 0.00 0 0.00 0.00
#> 3 0.12 0 0.06 0.06
#> 4 0.00 0 0.00 0.00
#> 5 0.00 0 0.00 0.00
#> 6 0.00 0 0.00 0.00
#> word_freq_table word_freq_conference char_freq_; char_freq_( char_freq_[
#> 1 0 0 0.00 0.000 0
#> 2 0 0 0.00 0.132 0
#> 3 0 0 0.01 0.143 0
#> 4 0 0 0.00 0.137 0
#> 5 0 0 0.00 0.135 0
#> 6 0 0 0.00 0.223 0
#> char_freq_! char_freq_$ char_freq_# capital_run_length_average
#> 1 0.778 0.000 0.000 3.756
#> 2 0.372 0.180 0.048 5.114
#> 3 0.276 0.184 0.010 9.821
#> 4 0.137 0.000 0.000 3.537
#> 5 0.135 0.000 0.000 3.537
#> 6 0.000 0.000 0.000 3.000
#> capital_run_length_longest capital_run_length_total class
#> 1 61 278 1
#> 2 101 1028 1
#> 3 485 2259 1
#> 4 40 191 1
#> 5 40 191 1
#> 6 15 54 1
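Each row is one message, and the last column class codes spam as 1. As a quick sanity check (a minimal sketch, assuming the data follows the UCI spambase layout of 57 predictors plus the class label):

dim(spam)          # expect 4601 rows and 58 columns (57 predictors + class)
table(spam$class)  # class balance: 0 = non-spam, 1 = spam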
data("flag.esl.trainset")
spam.train = spam[flag.esl.trainset == 0, ]
spam.test = spam[flag.esl.trainset == 1, ]where flag.esl.trainset is the indicator for training
set and test set used in the ESL site.
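As a quick check of the split (the ESL site uses 3065 training and 1536 test messages; the test count matches the confusion-table totals below):

table(flag.esl.trainset)   # 0: training (3065), 1: test (1536)
c(train = nrow(spam.train), test = nrow(spam.test))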
Now we compare the performance of the original MARS (via
earth) and our proposed MARS with corrected degrees of
freedom.
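As background, earth selects the model size by generalized cross-validation, GCV = (RSS/n) / (1 - enp/n)^2, where (per the earth documentation) the effective number of parameters is enp = (number of terms) + penalty * (number of knots), with penalty defaulting to 2 for additive models and 3 when interactions are allowed. correct_df() estimates this per-knot cost from the fitted model, so the refit below uses a data-driven penalty instead of the fixed default.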
compare_class_table = function(degree = 1) {
  # fit MARS with earth's default GCV penalty (2 if degree = 1, else 3)
  mod = earth(class ~ ., data = spam.train, degree = degree)
  pred = predict(mod, newdata = spam.test, type = "class")
  tbl = table(pred, truth = spam.test$class)
  cat("the confusion table is:\n")
  print(tbl)
  cat("the confusion table in proportion is:\n")
  print(tbl / sum(tbl))
  # tbl[2] and tbl[3] are the off-diagonal (misclassified) counts
  cat("the error rate is: ", (tbl[2] + tbl[3]) / sum(tbl), "\n")
  cat("==============\n")
  # refit with the corrected per-knot penalty from correct_df()
  dfs = correct_df(mod)
  cat("use corrected penalty = ", dfs$penalty, "\n")
  mod1 = earth(class ~ ., data = spam.train, degree = degree, penalty = dfs$penalty)
  pred1 = predict(mod1, newdata = spam.test, type = "class")
  tbl1 = table(pred1, truth = spam.test$class)
  cat("the confusion table is:\n")
  print(tbl1)
  cat("the confusion table in proportion is:\n")
  print(tbl1 / sum(tbl1))
  cat("the error rate is: ", (tbl1[2] + tbl1[3]) / sum(tbl1), "\n")
}

First, when we use only linear (additive) terms, the performance is:
compare_class_table(1)
#> the confusion table is:
#> truth
#> pred 0 1
#> 0 887 76
#> 1 54 519
#> the confusion table in proportion is:
#> truth
#> pred 0 1
#> 0 0.57747396 0.04947917
#> 1 0.03515625 0.33789062
#> the error rate is: 0.08463542
#> ==============
#> use corrected penalty = 6.252168
#> the confusion table is:
#> truth
#> pred1 0 1
#> 0 892 76
#> 1 49 519
#> the confusion table in proportion is:
#> truth
#> pred1 0 1
#> 0 0.58072917 0.04947917
#> 1 0.03190104 0.33789062
#> the error rate is: 0.08138021

Furthermore, when we allow second-degree interactions, the performance is:
compare_class_table(2)
#> the confusion table is:
#> truth
#> pred 0 1
#> 0 896 60
#> 1 45 535
#> the confusion table in proportion is:
#> truth
#> pred 0 1
#> 0 0.58333333 0.03906250
#> 1 0.02929688 0.34830729
#> the error rate is: 0.06835938
#> ==============
#> use corrected penalty = 7.935667
#> the confusion table is:
#> truth
#> pred1 0 1
#> 0 900 60
#> 1 41 535
#> the confusion table in proportion is:
#> truth
#> pred1 0 1
#> 0 0.58593750 0.03906250
#> 1 0.02669271 0.34830729
#> the error rate is: 0.06575521
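To summarize (error rates read off the output above; a minimal sketch):

errs = rbind(degree1 = c(earth = 0.08463542, corrected = 0.08138021),
             degree2 = c(earth = 0.06835938, corrected = 0.06575521))
errs

In both settings the corrected penalty yields a slightly lower test error rate than earth's default.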