library(mclust) betadens <- function(x,a, b){ dens <- 1/beta(a, b) * x^(a-1) * (1-x)^(b-1) return(dens) } # betadens2 <- function(data.vec){ # dens <- (data.vec[1]*betadens(data.vec[2],data.vec[3],data.vec[4])) # return(dens) # } betadens2 <- function(x,ppi,param){ asum = 0 L=length(ppi) for (i in 1:L) asum = asum + ppi[i]*betadens(x,param[2*i-1],param[2*i]) dens = asum/2 return(dens) } mixsamp <- function(mle,ppi){ grid0 <- seq(0.0001,0.9999,by=0.00001) dens <- betadens2(grid0,ppi,mle) grid <- 2*grid0-1 return(list(grid,dens)) } ###################### Define the likelihood function ################### # targfunc <- function(param, L,z,y){ # partlike <- 0 # for (i in 1:L) # partlike <- partlike + sum(z[,i]*(-lbeta(param[2*i-1],param[2*i]) + param[2*i-1] * log(y) + param[2*i] * log(1-y))) # # partlike <- -partlike # } #################### EM algorithm ################### EMopti<-function(cordata, L) { cordata <- (1+cordata)/2 y <- sort(cordata)## y is the transformed corr. within [0,1] n <- length(y) ## n is the number of correlations. # ix <- sort(cordata,index.return=T)$ix ####################### Initializing parameters ######################### ppi <- rep(0,L) ## ppi is the mixing proportion z <- matrix(rep(0,n*L),nrow=n) ## z is the indicator vector tmp <- ceiling(n/L) for (i in 1:L) { z[(tmp*(i-1)+1):min(tmp*i,n),i] <-1 } ###################### Define the likelihood function ################### targfunc <- function(param){ partlike <- 0 for (i in 1:L) partlike <- partlike + sum(z[,i]*(-lbeta(param[2*i-1],param[2*i]) + param[2*i-1] * log(y) + param[2*i] * log(1-y))) partlike <- -partlike } lastlike <- -10^5 currlike <- 0 count <- 0 #mle <- rep(c(50,2),L) mle <- rep(1,2*L) while(abs(lastlike-currlike) > 0.1){ count <- count + 1 ## M-step using "nlm" function ## maxlike <- nlm(targfunc,mle,stepmax=10000,print.level=0)#,L=L,z=z,y=y) mle <- maxlike$estimate ## mle contains alpha's and beta's ##print(c("mle",mle)) lastlike <- currlike currlike <- maxlike$minimum ##print(c("difference in likelihood", abs(currlike-lastlike))) for(i in 1:length(mle)){ if(mle[i] <= 0) print("EROOR: one of the mle's are less than or equal to zero") quit } ppi <- apply(z,2,sum,na.rm=T)/n ##print(c("ppi",ppi)) ## E-step to update the z's ## for (i in 1:n){ for (j in 1:L){ z[i,j] <- ppi[j]*betadens(y[i],mle[2*j-1],mle[2*j]) } } z <- z/apply(z,1,sum) } print(c("EM has converged after", count, "times")) print(c('L=',L)) print(c('parameters:',mle)) #BIC = -2*log-likelihood + npar*log(nobs) bic <- 2*currlike + (2*L+L-1)*log(n) #2L for # of mle; L-1 for # of ppi entrop <- -sum(z*log(z)) icl.bic <- bic + 2*entrop print(c('BIC=',bic)) print(c('icl.bic=',icl.bic)) final.beta <- mixsamp(mle,ppi) return(list(data=final.beta,bic=bic, maxlike=maxlike, icl=icl.bic)) } main <- function(cordata,minL,maxL,datafile){ res.list = NULL for (L in minL:maxL){ print(c('L=',L)) res <- EMopti(cordata,L) res.list <- c(res.list,res) } ## bic.list = NULL; icl.list=NULL; ## for (i in 1:length(res.list)){ ## bic.list = c(bic.list,res.list$bic) ## icl.list = c(icl.list, res.list$icl) ## } #pdf("bic.pdf") #plot(cbind(minL:maxL,bic.list), xlab = 'L', ylab='BIC',type='o') #dev.off() #write.table(cbind(minL:maxL,bic.list),'bic.txt',col.names=c('L','BIC')) #save.image() return(res.list) } cordata <- as.matrix(read.table("/home/yuanj/personal/project/crossplatform/data/SScor.txt",header=F)) ##cordata <- as.matrix(read.table("/home/yuanj/personal/project/crossplatform/data/CorCoefOriginalData.txt",header=T)) pp <- function(datafile){ postscript(datafile) hist(cordata, freq=F, ylim=c(0,1.5), xlab="Correlation", main=NULL) lines(result$data[[1]], result$data[[2]]) dev.off() } result <- main(cordata, 1,1) pp("clust1.ps") result <- main(cordata, 2,2) pp("clust2.ps") result <- main(cordata, 3,3) pp("clust3.ps") result <- main(cordata, 4,4) pp("clust4.ps") result <- main(cordata, 5,5) pp("clust5.ps") ## postscript("betamix2.ps") # hist(2*y-1,nclass=100,probability=T, main=NULL, xlab="corr") # lines(final.beta$grid, final.beta$dens) # dev.off() # ##typsize=c(2,2,4,2) # cory <- 2*y-1 # ## z's are the posterior probabilities # postscript("postprob2.ps", horiz=F) # #par(mfrow=c(2,1)) # #hist(z[,2]-z[,1], breaks=c(-1,0,1), main=NULL, xlab="Differences of posterior probabilities z", labels=T, ylim=c(0, 12500)) # plot(cory, type="n", ylab="Correlation") # lines(cory[z[,2]<=z[,1]],col=2);ind<-c(1:length(cory));lines(ind[z[,2]>z[,1]],cory[z[,2]>z[,1]],col=3); # lines(c(max(ind[z[,2]<=z[,1]]),max(ind[z[,2]<=z[,1]])), c(-1,max(cory[z[,2]<=z[,1]]))) # text(max(ind[z[,2]<=z[,1]]) + 1500, min(cory[z[,2]<=z[,1]]), labels=c(max(ind[z[,2]<=z[,1]]) + 100)) # lines(c(-2000,max(ind[z[,2]<=z[,1]])), c(max(cory[z[,2]<=z[,1]]),max(cory[z[,2]<=z[,1]]))) # text(0, max(cory[z[,2]<=z[,1]]) + 0.07, labels=c(round(max(cory[z[,2]<=z[,1]]),3))) # dev.off() # cordata <- scan("correlations_jeff.txt") # cordata <- (1+cordata)/2 # new.z <- matrix(rep(0,n*L),nrow=n) # for(i in 1:n){ # new.z[ix[i],] <- z[i,] # } # write.table(cbind(cordata,new.z), "outdatalog.txt", sep=" ", quote=F, row.names=F, col.names=F)