心理統計法('17) その12

放送大学「心理統計法(‘17)」第14章

授業ではrstanパッケージを使ってますが、MCMCpackパッケージを使ってみます。初学者ですので間違いが多々あると思います。

(Rスクリプトはここ)
早稲田大学文学部文学研究科 豊田研究室

(参考)
ベイジアンMCMCによる統計モデル
Using the ggmcmc package
Bayesian Inference With Stan ~番外編~
Exercise 2: Bayesian A/B testing using MCMC Metropolis-Hastings
政治学方法論 I

(データ)豊田 秀樹, 前田 忠彦, 柳井 晴夫の原因をさぐる統計学―共分散構造分析入門 (ブルーバックス)より
演習用公開データ

1
2
3
4
5
6
7
8
9
10
11
library(MCMCpack)
library(knitr)
#library(loo)
#
総熱量<-c(2336,2578,2431,1895,2352,2342,3349,2225,2869,1915,2135,3364,3398,3192,3264,3246,3457,3364,3107,2861,2848,3521,3201,3517,3190,2200,1940,1819,2570,2436,2938,2468,2551,2416,3366,3242,3409,3351,2862,3093,3079,3239,3260,2903,3177,3396,3256)
肉類<-c(40,168,205,98,178,140,672,175,187,99,88,486,443,291,485,311,490,486,285,179,192,447,161,753,575,120,79,55,144,151,720,291,55,146,487,229,474,450,166,459,208,406,327,189,416,529,683)
乳製品<-c(89,127,117,111,100,92,413,167,218,20,17,272,254,590,293,190,438,272,483,91,159,436,182,422,388,142,101,94,93,133,364,47,47,112,337,136,402,254,181,730,186,386,385,199,444,371,345)
酒類<-c(38,128,42,41,192,44,121,61,56,19,11,194,192,70,211,142,95,194,49,228,202,227,104,128,109,23,37,26,68,12,91,25,75,19,218,100,131,317,77,60,262,72,131,74,118,144,146)
大腸がん<-c(2.98,3.92,6.75,3.51,13.76,4.12,14.95,3.27,9.69,2.74,1.84,13.53,11.02,5.48,13.35,11.35,14.67,13.12,7.93,7.86,6.86,12.81,4.11,16.08,14.6,3.99,1.28,0.89,1.94,8.53,12.13,8.58,5.01,9.05,13.51,4.07,11.59,13.32,5.69,9.96,10.58,11.79,5.46,3.33,12.17,13.04,15.32)
#
can<-data.frame(総熱量,肉類,乳製品,酒類,大腸がん)

多変量散布図

1
2
3
4
#png("sinri14_1.png")
#par(family="TakaoExMincho")
pairs(can,pch=16,col=rgb(0,0,1,0.7))
#dev.off()

共分散行列

Rのcov関数で求める共分散は標本共分散。(n-1 で割る)
それを考慮して

1
kable(round(cov(can)*((nrow(can)-1)/nrow(can)),0) )
総熱量 肉類 乳製品 酒類 大腸がん
総熱量 250240 74906 57018 23564 1672
肉類 74906 37514 22301 6497 746
乳製品 57018 22301 25376 2747 417
酒類 23564 6497 2747 5518 197
大腸がん 1672 746 417 197 20

平均・標準偏差・相関行列

平均

1
round(apply(can,2,mean))

総熱量 肉類 乳製品 酒類 大腸がん
2871 307 243 109 9

標準偏差

1
round(apply(can,2,sd)*sqrt(((nrow(can)-1)/nrow(can))),0)

総熱量 肉類 乳製品 酒類 大腸がん
500 194 159 74 5

相関行列

1
kable(round(cor(can),3))
総熱量 肉類 乳製品 酒類 大腸がん
総熱量 1.000 0.773 0.716 0.634 0.739
肉類 0.773 1.000 0.723 0.452 0.853
乳製品 0.716 0.723 1.000 0.232 0.579
酒類 0.634 0.452 0.232 1.000 0.588
大腸がん 0.739 0.853 0.579 0.588 1.000

予測変数を「総熱量」「肉類」「乳製品」「酒類」とし重回帰分析を行う

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
fun <- function(beta, x1,x2,x3,x4 ,y ) {
# 切片a : beta[1] , 傾きb1,b2,b3,b4 : beta[2],beta[3],beta[4],beta[5] , sigmaE : beta[6]
ifelse (beta[6] < 0,l <- -Inf,
{
e <- y - (beta[1] + beta[2] * x1 + beta[3] * x2 +beta[4] * x3 +beta[5] * x4 )
l <- sum(log(dnorm(e, 0, beta[6])))
}
)
return(l)
}
#
chains <- 1:5
seeds <- seq(1,5,1)
burnin<-2000
mcmc_n<-200000
thin<- verbose <-10
#sink("./likelihood.txt")
blm <- lapply(chains ,
function(chain) {
MCMCmetrop1R(fun = fun ,
theta.init = c(0,0,0,0,0,1) ,
burnin = burnin, mcmc = mcmc_n ,thin=thin,
# verbose= verbose,
seed = seeds[chain], logfun=TRUE ,
y=can$大腸がん ,x1=can$総熱量 ,x2=can$肉類,x3=can$乳製品,x4=can$酒類 )})
# sink()
# system("gawk '/function value/' ./likelihood.txt | gawk 'gsub(\"function value =\",\"\")' > ./lik.txt")
# l<-scan("./lik.txt")
# ll<-matrix(l,ncol=length(chains),byrow = FALSE)
# head(ll) ; tail(ll)
# lll<-ll[-c(1 : burnin/thin),]
# ( waic1 <- waic(matrix(lll,ncol=1)) )
#
#waic 215.6
#
blm.mcmc <- mcmc.list(blm)
#
summary(blm.mcmc)
#
gelman.diag(blm.mcmc)
#
effectiveSize(blm.mcmc)
#
df <- data.frame(a=unlist(blm.mcmc[,1]),b1=unlist(blm.mcmc[,2]) ,b2=unlist(blm.mcmc[,3]),
b3=unlist(blm.mcmc[,4]),b4=unlist(blm.mcmc[,5]),sigmaE=unlist(blm.mcmc[,6]))
#
colnames(df)<-c("a","b1","b2","b3","b4","sigmaE")
#
ryou<-rbind(EAP=apply(df, 2, mean),post.sd=apply(df, 2, sd),
apply(df, 2, quantile, probs=c(0.025,0.05, 0.5,0.95, 0.975)))
colnames(ryou)<-c("a(切片)","総熱量(b1)","肉類(b2)","乳製品(b3)","酒類(b4)","sigmaE(誤差sd)")
kable(t(round(ryou,5)))
EAP post.sd 2.5% 5% 50% 95% 97.5%
a(切片) 0.55361 2.99219 -5.38977 -4.36036 0.54635 5.47844 6.45255
総熱量(b1) 0.00053 0.00143 -0.00229 -0.00181 0.00053 0.00288 0.00336
肉類(b2) 0.01735 0.00301 0.01146 0.01240 0.01735 0.02228 0.02325
乳製品(b3) -0.00153 0.00361 -0.00867 -0.00746 -0.00151 0.00434 0.00553
酒類(b4) 0.01383 0.00648 0.00107 0.00321 0.01382 0.02446 0.02662
sigmaE(誤差sd) 2.31097 0.26293 1.86597 1.92508 2.28665 2.78110 2.89779

 「乳製品」の偏回帰係数(b3)のEAPが負の値になっているが、

1
cor(can$大腸がん,can$乳製品)

0.5787253

予測変数の選択のためにヒストグラムを描いてみた

1
2
3
4
5
6
7
8
9
10
11
#png("sinri14_2.png")
#par(family="TakaoExMincho")
par(mfrow=c(2,2))
for(i in 2:5){
hist(df[,i],breaks=50,col="lightgreen",main="")
segments(quantile(df[,i], probs=0.05),0,quantile(df[,i], probs=0.05),par("usr"),col="blue",lwd=2)
segments(0 ,0 , 0 ,par("usr"),col="red",lty=2,lwd=2)
title(paste0("hist : ",colnames(ryou)[i]))
}
par(mfrow=c(1,1))
#dev.off()

青線(5%)赤破線(0)の位置関係を見る

予測変数を「肉類」と「酒類」とし重回帰分析を再度行う。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
fun <- function(beta, x1,x2,y ) {
# 切片a : beta[1] , 傾きb1,b2 : beta[2],beta[3] , sigmaE : beta[4]
ifelse (beta[4] < 0,l <- -Inf,
{
e <- y - (beta[1] + beta[2] * x1 + beta[3] * x2 )
l <- sum(log(dnorm(e, 0, beta[4])))
}
)
return(l)
}
#
chains <- 1:5
seeds <- seq(1,5,1)
burnin<-2000
mcmc_n<-200000
thin<- verbose <-10
#sink("./likelihood.txt")
blm <- lapply(chains ,
function(chain) {
MCMCmetrop1R(fun = fun ,
theta.init = c(0,0,0,1) ,
burnin = burnin, mcmc = mcmc_n ,thin=thin,
# verbose= verbose,
seed = seeds[chain], logfun=TRUE ,
y=can$大腸がん ,x1=can$肉類,x2=can$酒類 )})
# sink()
# system("gawk '/function value/' ./likelihood.txt | gawk 'gsub(\"function value =\",\"\")' > ./lik.txt")
# l<-scan("./lik.txt")
# ll<-matrix(l,ncol=length(chains),byrow = FALSE)
# head(ll) ; tail(ll)
# lll<-ll[-c(1 : burnin/thin),]
# ( waic2 <- waic(matrix(lll,ncol=1)) )
#
#waic 211.8
#
blm.mcmc <- mcmc.list(blm)
#
summary(blm.mcmc)
#
gelman.diag(blm.mcmc)
#
effectiveSize(blm.mcmc)
#
df <- data.frame(a=unlist(blm.mcmc[,1]),b2=unlist(blm.mcmc[,2]) ,b4=unlist(blm.mcmc[,3]),
sigmaE=unlist(blm.mcmc[,4]))
#
colnames(df)<-c("a","b2","b4","sigmaE")
#
ryou<-rbind(EAP=apply(df, 2, mean),post.sd=apply(df, 2, sd),
apply(df, 2, quantile, probs=c(0.025,0.05, 0.5,0.95, 0.975)))
colnames(ryou)<-c("a(切片)","肉類(b2)","酒類(b4)","sigmaE(誤差sd)")
kable(t(round(ryou,3)))
EAP post.sd 2.5% 5% 50% 95% 97.5%
a(切片) 1.566 0.685 0.217 0.440 1.566 2.692 2.917
肉類(b2) 0.017 0.002 0.013 0.014 0.017 0.020 0.021
酒類(b4) 0.016 0.005 0.006 0.007 0.016 0.024 0.025
sigmaE(誤差sd) 2.263 0.252 1.834 1.891 2.240 2.711 2.819

WAICを調べると、 215.6 > 211.8 で予測変数を「肉類」と「酒類」にした方を支持している。

sigmayhat^2=sigmay^2-sigmaE^2

予測値の事後分布

(13.12)式

1
2
3
4
5
6
7
yhat<-data.frame(df$a+df$b2*can$肉類[1]+df$b4*can$酒類[1])
for (i in 2:nrow(can)){
temp<-data.frame(df$a+df$b2*can$肉類[i]+df$b4*can$酒類[i])
yhat<-cbind(yhat,temp)
}
#colnames(yhat)<-c(paste0("yhat",(1:44)))
#head(yhat)

予測値の分散の事後分布

1
sigmayhat2<-apply(yhat,1,var)

決定係数

(13.16)式

1
2
3
4
eta2<-sigmayhat2/(sigmayhat2+df$sigmaE^2)
#
round(c(EAP=mean(eta2),post.sd=sd(eta2),
quantile(eta2, probs=c(0.025,0.05, 0.5,0.95, 0.975))),3)

EAP post.sd 2.5% 5% 50% 95% 97.5%
0.759 0.051 0.641 0.666 0.766 0.831 0.841

重相関係数(決定係数の平方根に一致)

(13.14)式

(14.8)式

1
2
3
4
eta<-sqrt(eta2)
#
round(c(EAP=mean(eta),post.sd=sd(eta),
quantile(eta, probs=c(0.025,0.05, 0.5,0.95, 0.975))),3)

EAP post.sd 2.5% 5% 50% 95% 97.5%
0.871 0.030 0.801 0.816 0.875 0.912 0.917

標準偏回帰係数

(14.7)式

肉類

1
2
3
4
b2star<-df$b2*sd(can$肉類)/sd(can$大腸がん)
#
round(c(EAP=mean(b2star),post.sd=sd(b2star),
quantile(b2star, probs=c(0.025,0.05, 0.5,0.95, 0.975))),3)

EAP post.sd 2.5% 5% 50% 95% 97.5%
0.738 0.083 0.575 0.602 0.737 0.874 0.902

酒類

1
2
3
4
b4star<-df$b4*sd(can$酒類)/sd(can$大腸がん)
#
round(c(EAP=mean(b4star),post.sd=sd(b4star),
quantile(b4star, probs=c(0.025,0.05, 0.5,0.95, 0.975))),3)

EAP post.sd 2.5% 5% 50% 95% 97.5%
0.256 0.082 0.093 0.120 0.256 0.390 0.418

回帰平面

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
library(scatterplot3d)
z <- can$大腸がん
x <- can$肉
y <- can$酒
### 予測値の点推定値
zhat<- mean(df$a) + mean(df$b2)*can$肉類 + mean(df$b4)*can$酒
#png("sinri14_3.png")
par(family="TakaoExMincho")
p3d<- scatterplot3d(x, y, z, type="h",angle=30,
scale.y=0.6, pch=16,color="red",xlab="肉類",ylab="酒類",zlab="大腸がん")
zhatlm <- lm(zhat ~ can$肉類 + can$酒類)
p3d$plane3d(zhatlm,draw_polygon=T,col=rgb(0,0,1,0.8))
pp <- recordPlot()
replayPlot(pp)
#dev.off()

残差プロット

残差の点推定値

1
2
3
4
5
6
7
8
9
eihat<-can$大腸がん - zhat
#
#png("sinri14_4.png")
par(family="TakaoExMincho")
plot(zhat,eihat,type="n",las=1,xlab="予測値",ylab="残差")
text(zhat, eihat,rownames(can),cex=0.8)
title("残差プロット(予測値と残差の散布図)")
abline(h=0,col="red",lwd=1)
#dev.off()

予測分布

(13.17)式

任意の点(テキストP.235)の事後分布・予測分布

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
niku<-c(0,300,0,150)
sake<-c(0,0,100,50)
#
yhat0<-data.frame(df$a+df$b2*niku[1]+df$b4*sake[1])
for (i in 2:4){
temp<-data.frame(df$a+df$b2*niku[i]+df$b4*sake[i])
yhat0<-cbind(yhat0,temp)
}
colnames(yhat0)<-c("d0_0","d300_0","d0_100","d150_50")
#
yaste0<-data.frame(rnorm(n = nrow(df), mean =df$a + df$b2*niku[1] + df$b4*sake[1], sd = df$sigmaE))
for (i in 2:4){
temp<-data.frame(rnorm(n = nrow(df), mean =df$a + df$b2*niku[i] + df$b4*sake[i], sd = df$sigmaE))
yaste0<-cbind(yaste0,temp)
}
colnames(yaste0)<-c("d0_0","d300_0","d0_100","d150_50")
#
ryou<-rbind(EAP=apply(yhat0, 2, mean),post.sd=apply(yhat0, 2, sd),
apply(yhat0, 2, quantile, probs=0.95),sd=apply(yaste0, 2, sd),apply(yaste0, 2, quantile, probs=0.95))
rownames(ryou)<-c("EAP","post.sd","事後分布95%点","sd","予測分布95%点")
colnames(ryou)<-c("肉類0___酒類0","肉類300_酒類0","肉類0___酒類100","肉類150_酒類50")
#
kable(t(round(ryou,1)))
EAP post.sd 事後分布95%点 sd 予測分布95%点
肉類0_酒類0 1.6 0.7 2.7 2.4 5.5
肉類300_酒類0 6.7 0.6 7.8 2.4 10.6
肉類0_酒類100 3.1 0.7 4.2 2.4 7.0
肉類150_酒類50 4.9 0.5 5.7 2.3 8.7