決定木

「データからの知識発見」第10章 決定木

決定木

mvpartパッケージ

1
2
3
4
5
6
7
age<-c("Under40","Under40","Over40","Over40","Over40","Under40","Under40","Under40","Under40","Under40","Over40","Over40","Over40","Over40","Over40","Over40","Over40","Under40","Under40","Over40","Under40","Over40","Under40","Under40","Over40","Over40","Over40","Over40","Over40","Under40","Over40","Under40","Under40","Under40","Under40","Over40","Over40","Under40","Under40","Under40","Over40","Under40","Under40","Over40","Under40","Over40","Over40","Over40","Over40","Under40","Under40","Over40","Under40","Under40","Under40","Under40","Under40","Under40","Under40","Under40","Under40","Over40","Over40","Under40","Over40","Under40","Under40","Over40","Under40","Over40","Under40","Under40","Over40","Under40","Over40","Under40","Over40","Over40","Over40","Over40","Over40","Over40","Under40","Under40","Over40","Under40","Over40","Over40","Under40","Under40","Over40","Over40","Over40","Under40","Over40","Over40","Under40","Under40","Over40","Over40")
sex<-c("Man","Woman","Woman","Man","Woman","Man","Woman","Man","Woman","Man","Man","Woman","Woman","Man","Woman","Woman","Woman","Man","Man","Woman","Woman","Man","Man","Man","Woman","Man","Man","Woman","Woman","Man","Man","Man","Woman","Woman","Woman","Woman","Man","Man","Woman","Man","Woman","Man","Woman","Man","Woman","Woman","Woman","Woman","Woman","Man","Man","Man","Woman","Woman","Man","Woman","Woman","Woman","Woman","Woman","Man","Woman","Woman","Woman","Woman","Woman","Woman","Woman","Man","Man","Woman","Woman","Woman","Woman","Man","Man","Woman","Woman","Woman","Woman","Man","Woman","Man","Woman","Woman","Man","Man","Woman","Woman","Man","Woman","Woman","Man","Woman","Man","Woman","Woman","Woman","Man","Man")
trial<-c("New","New","Retry","New","New","New","New","New","Retry","New","Retry","New","New","New","New","New","Retry","New","New","New","Retry","New","New","Retry","New","New","New","New","New","New","New","Retry","New","New","New","New","Retry","New","Retry","Retry","New","Retry","New","New","New","New","New","New","Retry","New","Retry","New","New","New","New","Retry","Retry","Retry","New","Retry","New","Retry","New","New","New","Retry","New","New","Retry","New","New","Retry","New","New","Retry","New","New","Retry","New","Retry","New","Retry","New","New","Retry","New","New","Retry","New","New","New","Retry","New","New","New","Retry","New","Retry","New","New")
score<-c("Under70","Under70","Under70","Over70","Over70","Over70","Under70","Over70","Under70","Over70","Under70","Under70","Over70","Under70","Over70","Over70","Over70","Over70","Over70","Under70","Under70","Over70","Over70","Over70","Over70","Over70","Under70","Under70","Under70","Over70","Over70","Over70","Over70","Under70","Under70","Under70","Under70","Over70","Over70","Over70","Under70","Under70","Over70","Over70","Over70","Under70","Under70","Under70","Under70","Over70","Under70","Over70","Over70","Over70","Over70","Under70","Under70","Under70","Over70","Under70","Over70","Under70","Under70","Over70","Under70","Under70","Over70","Under70","Under70","Over70","Over70","Under70","Under70","Over70","Under70","Over70","Under70","Over70","Under70","Under70","Under70","Under70","Over70","Over70","Under70","Over70","Under70","Under70","Over70","Over70","Under70","Over70","Under70","Under70","Under70","Under70","Over70","Under70","Under70","Over70")
s1<-data.frame(age,sex,trial,score)
library(knitr)
kable(s1,align="c")
age sex trial score
Under40 Man New Under70
Under40 Woman New Under70
Over40 Woman Retry Under70
Over40 Man New Over70
Over40 Woman New Over70
Under40 Man New Over70
Under40 Woman New Under70
Under40 Man New Over70
Under40 Woman Retry Under70
Under40 Man New Over70
Over40 Man Retry Under70
Over40 Woman New Under70
Over40 Woman New Over70
Over40 Man New Under70
Over40 Woman New Over70
Over40 Woman New Over70
Over40 Woman Retry Over70
Under40 Man New Over70
Under40 Man New Over70
Over40 Woman New Under70
Under40 Woman Retry Under70
Over40 Man New Over70
Under40 Man New Over70
Under40 Man Retry Over70
Over40 Woman New Over70
Over40 Man New Over70
Over40 Man New Under70
Over40 Woman New Under70
Over40 Woman New Under70
Under40 Man New Over70
Over40 Man New Over70
Under40 Man Retry Over70
Under40 Woman New Over70
Under40 Woman New Under70
Under40 Woman New Under70
Over40 Woman New Under70
Over40 Man Retry Under70
Under40 Man New Over70
Under40 Woman Retry Over70
Under40 Man Retry Over70
Over40 Woman New Under70
Under40 Man Retry Under70
Under40 Woman New Over70
Over40 Man New Over70
Under40 Woman New Over70
Over40 Woman New Under70
Over40 Woman New Under70
Over40 Woman New Under70
Over40 Woman Retry Under70
Under40 Man New Over70
Under40 Man Retry Under70
Over40 Man New Over70
Under40 Woman New Over70
Under40 Woman New Over70
Under40 Man New Over70
Under40 Woman Retry Under70
Under40 Woman Retry Under70
Under40 Woman Retry Under70
Under40 Woman New Over70
Under40 Woman Retry Under70
Under40 Man New Over70
Over40 Woman Retry Under70
Over40 Woman New Under70
Under40 Woman New Over70
Over40 Woman New Under70
Under40 Woman Retry Under70
Under40 Woman New Over70
Over40 Woman New Under70
Under40 Man Retry Under70
Over40 Man New Over70
Under40 Woman New Over70
Under40 Woman Retry Under70
Over40 Woman New Under70
Under40 Woman New Over70
Over40 Man Retry Under70
Under40 Man New Over70
Over40 Woman New Under70
Over40 Woman Retry Over70
Over40 Woman New Under70
Over40 Woman Retry Under70
Over40 Man New Under70
Over40 Woman Retry Under70
Under40 Man New Over70
Under40 Woman New Over70
Over40 Woman Retry Under70
Under40 Man New Over70
Over40 Man New Under70
Over40 Woman Retry Under70
Under40 Woman New Over70
Under40 Man New Over70
Over40 Woman New Under70
Over40 Woman Retry Over70
Over40 Man New Under70
Under40 Woman New Under70
Over40 Man New Under70
Over40 Woman Retry Under70
Under40 Woman New Over70
Under40 Woman Retry Under70
Over40 Man New Under70
Over40 Man New Over70
1
2
3
library(mvpart)
s2<-rpart(score~.,data=s1,method="class")
s2

n= 100

node), split, n, loss, yval, (yprob)

  • denotes terminal node

    1) root 100 48 Under70 (0.4800000 0.5200000)
    2) trial=New 70 29 Over70 (0.5857143 0.4142857)
    4) age=Under40 34 6 Over70 (0.8235294 0.1764706)
    5) age=Over40 36 13 Under70 (0.3611111 0.6388889)
    10) sex=Man 15 7 Over70 (0.5333333 0.4666667)

    11) sex=Woman 21 5 Under70 (0.2380952 0.7619048)
    3) trial=Retry 30 7 Under70 (0.2333333 0.7666667)

1
2
plot(s2,margin=0.1)
text(s2,pretty=0,all=T,use.n=T,col="red",font =2,cex =0.8)

partykit パッケージのplot

1
2
library(partykit)
plot(as.party(s2))

(おまけ)決定木予測モデルの作成と評価

gclusパッケージのbankデータを使う
本物のスイス紙幣100枚と偽物のスイス紙幣100枚の6つの測定値
単位(mm) Length: Length of bill,Left: Width of left edge,Right: Width of right edge,Bottom: Bottom margin width,
(注)結果はデータの分かれ方によって異なる
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
library(mvpart)
library(gclus)
library(caret)
data(bank)
bank$Status <- factor(bank$Status, labels=c('真札','偽札'))
#データをモデル作成用と検定用に分ける
cdp<-createDataPartition(bank[,1],p=0.5)
cdp
#学習用データ
training<-bank[cdp$Resample1,]
training
#評価用データ
test<-bank[-cdp$Resample1,]
test
#テストデータで決定木
tree<- rpart(Status~.,data=training,method="class")
plot(tree,margin=0.15)
text(tree,all=T,use.n=T)
##### 対角線の長さが140.6以上なら真札、140.6未満なら偽札と予測
##### 検証する
predict<-predict(tree,test,type="class")
predict
test[,8]<-predict
test
table(test[,1],predict)
##### 1枚不正解
miss <- subset(test, subset=(test[,1]=="真札" & test[,8]=="偽札"))
miss