心理統計法('17) その2

放送大学「心理統計法(‘17)」第5章 第6章 その1

授業ではrstanパッケージを使ってますが、MCMCpackパッケージを使ってみます。初学者ですので間違いが多々あると思います。

(Rスクリプトはここ)
早稲田大学文学部文学研究科 豊田研究室

(参考)
ベイジアンMCMCによる統計モデル
Using the ggmcmc package
Bayesian Inference With Stan ~番外編~
Exercise 2: Bayesian A/B testing using MCMC Metropolis-Hastings
政治学方法論 I

標準偏差が共通するモデル(EQU)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
library(MCMCpack)
library(knitr)
library(loo)
#library(ggmcmc)
#表5.1の聴音条件のデータ入力
x1<-c(32.30,34.24,28.10,33.40,37.71,31.62,31.37,35.85,32.33,34.04,
34.96,31.43,35.28,30.19,35.09,33.38,31.49,28.44,32.12,31.81)
#表1.1の対照条件のデータ入力
x2<-c(31.43,31.09,33.38,30.49,29.62,35.40,32.58,28.96,29.43,28.52,
25.39,32.68,30.51,30.15,32.33,30.43,32.50,32.07,32.35,31.57)
######### burnin = 2000 , mcmc=200000 , thin=10
#
########## 標準偏差が共通するモデル(EQU) ##########
#
ttestfun <- function(beta, x , y) {
# beta[1] : mu1 , beta[2] : mu2 , beta[3] : sigma
ifelse( beta[3] < 0 , l<- -Inf ,
l <- sum(log(dnorm(x, mean=beta[1], sd=beta[3])) ) +
sum(log(dnorm(y, mean=beta[2], sd=beta[3]))) )
return(l)
}
#
chains <- 1:5
seeds <- seq(1,5,1)
init<-c(0,0,30)
#init <- c(mean(x1), mean(x2), max(sd(x1),sd(x2))*5 )
burnin<-2000
mcmc_n<-200000
thin<- verbose <-10
#sink("./likelihood.txt")
#
EQU <- lapply(chains ,
function(chain) {
MCMCmetrop1R(fun = ttestfun ,
theta.init = init ,
burnin = burnin, mcmc = mcmc_n ,thin=thin,
# verbose= verbose,
seed = seeds[chain],
x=x1, y=x2 )})
#
# sink()
# system("gawk '/function value/' ./likelihood.txt | gawk 'gsub(\"function value =\",\"\")' > ./equ_lik.txt")
# l<-scan("./equ_lik.txt")
# ll<-matrix(l,ncol=length(chains),byrow = FALSE)
# head(ll) ; tail(ll)
# lll<-ll[-c(1 : burnin/thin),]
# ( waic1 <- waic(matrix(lll,ncol=1)) )
#
EQU.mcmc <- mcmc.list(EQU)
#
summary(EQU.mcmc)

Iterations = 2001:201991
Thinning interval = 10
Number of chains = 5
Sample size per chain = 20000

  1. Empirical mean and standard deviation for each variable,
    plus standard error of the mean:

    Mean SD Naive SE Time-series SE
    [1,] 32.755 0.5304 0.0016773 0.002084
    [2,] 31.044 0.5299 0.0016756 0.002077
    [3,] 2.351 0.2812 0.0008892 0.001196

  2. Quantiles for each variable:

    2.5% 25% 50% 75% 97.5%
    [1,] 31.707 32.404 32.756 33.105 33.802
    [2,] 29.999 30.695 31.045 31.395 32.092
    [3,] 1.878 2.152 2.324 2.519 2.976

1
gelman.diag(EQU.mcmc)

Potential scale reduction factors:

Point est. Upper C.I.
[1,] 1 1
[2,] 1 1
[3,] 1 1

Multivariate psrf

1

1
effectiveSize(EQU.mcmc)

var1 var2 var3
64832.32 65093.17 55375.97

data.frame 作成、テキストにあわせた出力にするためにdata.frame ryou を作成

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
df<-data.frame(mu1=unlist(EQU.mcmc[,1]),mu2=unlist(EQU.mcmc[,2]),sigma=unlist(EQU.mcmc[,3]) )
#
ryou<-data.frame(c(round(mean(df[,1]),2) , round(sd(df[,1]),2) ,
+ round(quantile(df[,1],c(0.025,0.05,0.5,0.95,0.975)),2)))
for (i in 2:ncol(df) ){
+ ryou<-cbind(ryou,data.frame(c(round(mean(df[,i]),2) , round(sd(df[,i]),2) ,
+ round(quantile(df[,i],c(0.025,0.05,0.5,0.95,0.975)),2))))
+ }
names(ryou)<-c("mu1","mu2","sigma")
rownames(ryou)<-c("EAP","post.sd","2.5%","5%","50%","95%","97.5%")
mu1_mu2 <- df$mu1 - df$mu2
ryou<-cbind(ryou,data.frame(c(round(mean(mu1_mu2),2) , round(sd(mu1_mu2),2) ,
+ round(quantile(mu1_mu2,c(0.025,0.05,0.5,0.95,0.975)),2))))
names(ryou)<-c("mu1","mu2","sigma","mu1_mu2")
#kable(t(ryou))

確率:mu1>mu2 : mu2>mu1 ; mu1-mu2>1

1
2
meanDiff <- df$mu1 - df$mu2
( prob1 <- mean(meanDiff > 0) )

[1] 0.98817

1
2
meanDiff <- df$mu2 - df$mu1
( prob2 <- mean(meanDiff > 0) )

[1] 0.01183

1
2
meanDiff <- df$mu1 - df$mu2
( prob3 <- mean(meanDiff > 1) )

[1] 0.83177

1
2
3
4
5
6
#png("mcmchist6_1.png")
hist(df$mu1,breaks=100,col=rgb(1,0,0,0.3),main="母平均の事後分布( EQU )x1:red , x2:green",xlim=c(min(df$mu1,df$mu2),max(df$mu1,df$mu2)) )
hist(df$mu2,breaks=100,col=adjustcolor("green",0.3),add=T)
segments(mean(df$mu1),0,mean(df$mu1),par("usr")[4],lwd=2 ,col="blue")
segments(mean(df$mu2),0,mean(df$mu2),par("usr")[4],lwd=2 ,col="blue")
#dev.off()

1
2
3
4
5
#png("mcmchist6_2.png")
hist(mu1_mu2,breaks=100,col="lightgreen",main="母平均の差の事後分布( EQU )")
#abline(v=mean(mu1_mu2),lwd=2.5 ,col="red")
segments(mean(mu1_mu2),0,mean(mu1_mu2),par("usr")[4],lwd=2.5 ,col="red")
#dev.off()

事後予測分布

1
2
3
4
5
6
7
8
9
x1aste<-rnorm(n = nrow(df), mean =df$mu1, sd = df$sigma)
x2aste<-rnorm(n = nrow(df), mean =df$mu2, sd = df$sigma)
#
ryou<-cbind(ryou,data.frame(c(round(mean(x1aste),2) , round(sd(x1aste),2) ,
round(quantile(x1aste,c(0.025,0.05,0.5,0.95,0.975)),2))))
ryou<-cbind(ryou,data.frame(c(round(mean(x2aste),2) , round(sd(x2aste),2) ,
round(quantile(x2aste,c(0.025,0.05,0.5,0.95,0.975)),2))))
names(ryou)<-c("mu1","mu2","sigma","mu1_mu2","x1aste","x2aste")
kable(t(ryou))
EAP post.sd 2.5% 5% 50% 95% 97.5%
mu1 32.76 0.53 31.71 31.88 32.76 33.63 33.80
mu2 31.04 0.53 30.00 30.17 31.05 31.91 32.09
sigma 2.35 0.28 1.88 1.94 2.32 2.85 2.98
mu1_mu2 1.71 0.75 0.24 0.49 1.71 2.94 3.18
x1aste 32.76 2.42 27.97 28.79 32.76 36.71 37.53
x2aste 31.04 2.43 26.23 27.05 31.05 35.01 35.82

効果量

1
2
3
delta<- (df$mu1-df$mu2)/df$sigma
c(EAP=round(mean(delta),3) , post.sd=round(sd(delta),3) ,
round(quantile(delta,c(0.025,0.05,0.5,0.95,0.975)),3))

EAP post.sd 2.5% 5% 50% 95% 97.5%
0.738 0.326 0.096 0.200 0.738 1.278 1.373

1
2
delta02<-table(delta>0.2)
delta02[2]/(delta02[1]+delta02[2])

TRUE
0.95

非重複度

1
2
3
U3<-pnorm(df$mu1, mean =df$mu2, sd = df$sigma)
c(EAP=round(mean(U3),3) , post.sd=round(sd(U3),3) ,
round(quantile(U3,c(0.025,0.05,0.5,0.95,0.975)),3))

EAP post.sd 2.5% 5% 50% 95% 97.5%
0.759 0.098 0.538 0.579 0.770 0.899 0.915

1
2
U3_075<-table(U3 > 0.75)
U3_075[2]/(U3_075[1]+U3_075[2])

TRUE
0.57709

優越率

直接比較
1
2
yuetsu<-table(x1aste-x2aste > 0)
yuetsu[2]/(yuetsu[1]+yuetsu[2])

TRUE
0.69458

事後分布から
1
2
3
yuetsu<-pnorm(delta/sqrt(2) , mean =0, sd = 1)
c(EAP=round(mean(yuetsu),3) , post.sd=round(sd(yuetsu),3) ,
round(quantile(yuetsu,c(0.025,0.05,0.5,0.95,0.975)),3))

EAP post.sd 2.5% 5% 50% 95% 97.5%
0.694 0.079 0.527 0.556 0.699 0.817 0.834

1
2
yuetsu075<-table(yuetsu>0.75)
yuetsu075[2]/(yuetsu075[1]+yuetsu075[2])

TRUE
0.25565

閾上率

1
2
3
4
c<-1
##### 直接比較
ikizyo<-table(x1aste-x2aste > c)
ikizyo[2]/(ikizyo[1]+ikizyo[2])

TRUE
0.58576

1
2
3
4
5
c<-1
##### 事後分布から
ikizyo<-pnorm( (df$mu1-df$mu2-c)/(sqrt(2)*df$sigma) , mean =0, sd = 1)
c(EAP=round(mean(ikizyo),3) , post.sd=round(sd(ikizyo),3) ,
round(quantile(ikizyo,c(0.025,0.05,0.5,0.95,0.975)),3))

EAP post.sd 2.5% 5% 50% 95% 97.5%
0.584 0.086 0.411 0.439 0.586 0.721 0.744

1
2
ikizyo075<-table(ikizyo>0.75)
ikizyo075[2]/(ikizyo075[1]+ikizyo075[2])

TRUE
0.02037  

「MCMCpack」でWAICを求める

以下は「linux」で行った方法です

この部分のコメントアウト # を取り除く

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#sink("./likelihood.txt")
#
EQU <- lapply(chains ,
function(chain) {
MCMCmetrop1R(fun = ttestfun ,
theta.init = init ,
burnin = burnin, mcmc = mcmc_n ,thin=thin,
# verbose= verbose,
seed = seeds[chain],
x=x1, y=x2 )})
#
# sink()
# system("gawk '/function value/' ./likelihood.txt | gawk 'gsub(\"function value =\",\"\")' > ./equ_lik.txt")
# l<-scan("./equ_lik.txt")
# ll<-matrix(l,ncol=length(chains),byrow = FALSE)
# head(ll) ; tail(ll)
# lll<-ll[-c(1 : burnin/thin),]
# ( waic1 <- waic(matrix(lll,ncol=1)) )

WAICを求めるためには対数尤度が必要ですが、「MCMCpack」では対数尤度を返り値に加える方法がわかりません。(いまのところできない?)
しかし、パラメータ verbose=数値 を指定すれば、function value = -195.02600 と対数尤度が表示されます。
画面表示される代わりに
sink関数を使ってファイルに保存 
-> 不要な行等を削除(gawkを使いました)し、ファイルに保存 
-> scan関数でRに対数尤度を取り込み 
-> chain行のmatrixに変換 
-> burnin期間の行を削除 
-> 1列のmatrixに変換してloo::waic に入力

(注) thin=verbose , burnin=thin*整数 とすること

で、結果は 

182.7 となり、テキストの値とほぼ一致しました。