gpcfa-examples • LAWBL

Note: the estimation process can be time consuming depending on the computing power. You can same some time by reducing the length of the chains.

Categorical Data with Missingness but no Local Dependence:

Load the package, obtain the data, loading pattern (qlam), and setup the design matrix Q.

library(LAWBL)
dat <- sim18ccfa40$dat
dim(dat)
R> [1] 1000   18
summary(dat) #10% missingness at random
R>        V1              V2              V3              V4       
R>  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
R>  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000  
R>  Median :2.000   Median :2.000   Median :2.000   Median :2.000  
R>  Mean   :2.507   Mean   :2.508   Mean   :2.498   Mean   :2.464  
R>  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000  
R>  Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
R>  NA's   :91      NA's   :92      NA's   :93      NA's   :104    
R>        V5              V6              V7              V8       
R>  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
R>  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000  
R>  Median :3.000   Median :3.000   Median :3.000   Median :3.000  
R>  Mean   :2.511   Mean   :2.506   Mean   :2.511   Mean   :2.537  
R>  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000  
R>  Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
R>  NA's   :105     NA's   :119     NA's   :115     NA's   :108    
R>        V9             V10             V11            V12             V13       
R>  Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.000  
R>  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.00   1st Qu.:2.000   1st Qu.:2.000  
R>  Median :3.000   Median :3.000   Median :3.00   Median :3.000   Median :2.000  
R>  Mean   :2.507   Mean   :2.518   Mean   :2.52   Mean   :2.524   Mean   :2.459  
R>  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.00   3rd Qu.:3.000   3rd Qu.:3.000  
R>  Max.   :4.000   Max.   :4.000   Max.   :4.00   Max.   :4.000   Max.   :4.000  
R>  NA's   :102     NA's   :99      NA's   :92     NA's   :100     NA's   :103    
R>       V14             V15             V16             V17             V18      
R>  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00  
R>  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.00  
R>  Median :3.000   Median :3.000   Median :3.000   Median :2.000   Median :2.00  
R>  Mean   :2.544   Mean   :2.501   Mean   :2.508   Mean   :2.485   Mean   :2.49  
R>  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.00  
R>  Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.00  
R>  NA's   :96      NA's   :114     NA's   :96      NA's   :84      NA's   :110
J <- ncol(dat) # no. of items
K <- 3 # no. of factors
qlam <- sim18ccfa40$qlam
qlam
R>       [,1] [,2] [,3]
R>  [1,]  0.7  0.0  0.0
R>  [2,]  0.7  0.0  0.0
R>  [3,]  0.7  0.0  0.0
R>  [4,]  0.7  0.0  0.0
R>  [5,]  0.7  0.3  0.0
R>  [6,]  0.7  0.3  0.0
R>  [7,]  0.0  0.7  0.0
R>  [8,]  0.0  0.7  0.0
R>  [9,]  0.0  0.7  0.0
R> [10,]  0.0  0.7  0.0
R> [11,]  0.0  0.7  0.3
R> [12,]  0.0  0.7  0.3
R> [13,]  0.0  0.0  0.7
R> [14,]  0.0  0.0  0.7
R> [15,]  0.0  0.0  0.7
R> [16,]  0.0  0.0  0.7
R> [17,]  0.3  0.0  0.7
R> [18,]  0.3  0.0  0.7

Q<-matrix(-1,J,K); # -1 for unspecified items
Q[1:2,1]<-Q[7:8,2]<-Q[13:14,3]<-1 # 1 for specified items
Q
R>       [,1] [,2] [,3]
R>  [1,]    1   -1   -1
R>  [2,]    1   -1   -1
R>  [3,]   -1   -1   -1
R>  [4,]   -1   -1   -1
R>  [5,]   -1   -1   -1
R>  [6,]   -1   -1   -1
R>  [7,]   -1    1   -1
R>  [8,]   -1    1   -1
R>  [9,]   -1   -1   -1
R> [10,]   -1   -1   -1
R> [11,]   -1   -1   -1
R> [12,]   -1   -1   -1
R> [13,]   -1   -1    1
R> [14,]   -1   -1    1
R> [15,]   -1   -1   -1
R> [16,]   -1   -1   -1
R> [17,]   -1   -1   -1
R> [18,]   -1   -1   -1

E-step: Estimate with the GPCFA-LI model (E-step) by setting LD=F. Only a few loadings need to be specified in Q (e.g., 2 per factor). Longer chain is suggested for more accurate and stable estimation.

m0 <- pcfa(dat = dat, Q = Q,LD = FALSE, cati = -1,burn = 4000, iter = 4000,verbose = TRUE)

# summarize basic information
summary(m0)

#summarize significant loadings in pattern/Q-matrix format
summary(m0, what = 'qlambda') 

#factorial eigenvalue
summary(m0,what='eigen') 

#thresholds for categorical items
summary(m0,what='thd')

#plotting factorial eigenvalue
plot_eigen(m0) # trace
plot_eigen(m0, what='density') #density
plot_eigen(m0, what='APSR') #adj, PSRF

C-step: Reconfigure the Q matrix for the C-step with one specified loading per item based on results from the E-step. Estimate with the GPCFA model by setting LD=TRUE (by default). Longer chain is suggested for more accurate and stable estimation.

Q<-matrix(-1,J,K);
tmp<-summary(m0, what="qlambda")
cind<-apply(tmp,1,which.max)
Q[cbind(c(1:J),cind)]<-1
#alternatively
#Q[1:6,1]<-Q[7:12,2]<-Q[13:18,3]<-1 # 1 for specified items

m1 <- pcfa(dat = dat, Q = Q, cati = -1,burn = 4000, iter = 4000,verbose = TRUE)
summary(m1)
summary(m1, what = 'qlambda')
summary(m1, what = 'offpsx') #summarize significant LD terms
summary(m1,what='eigen')
summary(m1,what='thd')

#plotting factorial eigenvalue
plot_eigen(m1) # trace
plot_eigen(m1, what='density') #density
plot_eigen(m1, what='APSR') #adj, PSRF

CFA-LD: One can also configure the Q matrix for a CCFA model with local dependence (i.e. without any unspecified loading) based on results from the C-step.

Q<-summary(m1, what="qlambda")
Q[Q!=0]<-1
Q

m2 <- pcfa(dat = dat, Q = Q, cati = -1,burn = 4000, iter = 4000,verbose = TRUE)
summary(m2)
summary(m2, what = 'qlambda') 
summary(m2, what = 'offpsx')
summary(m2,what='eigen')
summary(m2,what='thd')

plot_eigen(m2) # Eigens' traces are excellent without regularization of the loadings

Categorical Data with Missingness and Local Dependence:

Load the the data, loading pattern (qlam), and LD terms, and setup the design matrix Q.

dat <- sim18ccfa41$dat
summary(dat) #10% missingness at random
J <- ncol(dat) # no. of items
K <- 3 # no. of factors
sim18ccfa41$qlam
sim18ccfa41$LD # effect size = .3

Q<-matrix(-1,J,K); # -1 for unspecified items
Q[1:2,1]<-Q[7:8,2]<-Q[13:14,3]<-1 # 1 for specified items
Q

E-step: Estimate with the GPCFA-LI model (E-step) by setting LD=FALSE. Only a few loadings need to be specified in Q (e.g., 2 per factor). Some loading estimates are biased due to ignoring the LD. So do the eigenvalues.

m0 <- pcfa(dat = dat, Q = Q,LD = FALSE, cati = -1,burn = 4000, iter = 4000,verbose = TRUE)
summary(m0)
summary(m0, what = 'qlambda')
summary(m0,what='eigen')
summary(m0,what='thd')

plot_eigen(m0) # trace
plot_eigen(m0, what='APSR')

C-step: Reconfigure the Q matrix for the C-step with one specified loading per item based on results from the E-step. Estimate with the GPCFA model by setting LD=TRUE (by default). The estimates are more accurate, and the LD terms can be largely recovered.

Q<-matrix(-1,J,K);
tmp<-summary(m0, what="qlambda")
cind<-apply(tmp,1,which.max)
Q[cbind(c(1:J),cind)]<-1
Q

m1 <- pcfa(dat = dat, Q = Q, cati = -1,burn = 4000, iter = 4000,verbose = TRUE)
summary(m1)
summary(m1, what = 'qlambda')
summary(m1,what='eigen')
summary(m1, what = 'offpsx')
summary(m1,what='thd')

CFA-LD: Configure the Q matrix for a CCFA model with local dependence (i.e. without any unspecified loading) based on results from the C-step. Results are better than, but similar to the C-step.

Q<-summary(m1, what="qlambda")
Q[Q!=0]<-1
Q

m2 <- pcfa(dat = dat, Q = Q, cati = -1,burn = 4000, iter = 4000,verbose = TRUE)
summary(m2)
summary(m2, what = 'qlambda') 
summary(m2,what='eigen')
summary(m2, what = 'offpsx')
summary(m2,what='thd')

Mixed-type Data with Missingness and Local Dependence:

Load the the data, loading pattern (qlam), and LD terms, and setup the design matrix Q.

dat <- sim18mcfa41$dat
summary(dat) #10% missingness at random
J <- ncol(dat) # no. of items
K <- 3 # no. of factors
sim18mcfa41$qlam
sim18mcfa41$LD # effect size = .3

Q<-matrix(-1,J,K); # -1 for unspecified items
Q[1:2,1]<-Q[7:8,2]<-Q[13:14,3]<-1 # 1 for specified items
Q

E-step: Estimate with the GPCFA-LI model (E-step) by setting LD=FALSE. Only a few loadings need to be specified in Q (e.g., 2 per factor). The first 12 items are categorical and need to be specified with cati.

m0 <- pcfa(dat = dat, Q = Q,LD = FALSE, cati = c(1:12),burn = 4000, iter = 4000,verbose = TRUE)
summary(m0)
summary(m0, what = 'qlambda')
summary(m0,what='eigen')
summary(m0,what='thd') # only for 12 items 

plot_eigen(m0) # trace
plot_eigen(m0, what='density')
plot_eigen(m0, what='APSR')

C-step: Reconfigure the Q matrix for the C-step with one specified loading per item based on results from the E-step. Estimate with the GPCFA model by setting LD=TRUE (by default). The estimates are more accurate, and the LD terms can be largely recovered.

Q<-matrix(-1,J,K);
tmp<-summary(m0, what="qlambda")
cind<-apply(tmp,1,which.max)
Q[cbind(c(1:J),cind)]<-1
Q

m1 <- pcfa(dat = dat, Q = Q, cati = c(1:12),burn = 4000, iter = 4000,verbose = TRUE)
summary(m1)
summary(m1, what = 'qlambda')
summary(m1,what='eigen')
summary(m1, what = 'offpsx')
summary(m1,what='thd')

CFA-LD: Configure the Q matrix for a mix of CFA and CCFA model with local dependence (i.e. without any unspecified loading) based on results from the C-step. Results are better than, but similar to the C-step.

Q<-summary(m1, what="qlambda")
Q[Q!=0]<-1
Q

m2 <- pcfa(dat = dat, Q = Q, cati = c(1:12),burn = 4000, iter = 4000,verbose = TRUE)
summary(m2)
summary(m2, what = 'qlambda') 
summary(m2,what='eigen')
summary(m2, what = 'offpsx')
summary(m2,what='thd')