bbr rmsb models

Objectives

This short vignette is based on Section 6-12 of Prof Frank Harrell’s incredibly useful Biostatistics and Biomedical Research (bbr). Its goal are (i) to re-fit the brms Bernoulli models with Harrell’s rmsb package, (ii) run a visual posterior predictive check on the rmsb model and, (iii) visualize the prior and posterior distributions of the predictor effects. By showing how the visualizations may be created using functions that are not part of the rmsb package, it is my hope that more readers will consider fitting ordinal regression models using rmsb¹.

Dataset

## Creating the dataset
set.seed(10)
priority <- factor(c(rep('emergency', 25), rep('other', 111)), c('other', 'emergency'))
death <- c(rep(0, 19), rep(1, 6), rep(0, 100), rep(1, 11))
d  <- data.frame(priority, death)
randomUniform <- runif(length(priority))
d$random <- ifelse(priority == 'emergency', randomUniform < 1/3,
                   randomUniform > 1/3) * 1
dd <- datadist(d); options(datadist='dd')

d_describe <- describe(d, descript="Section 6-12 database")
html(d_describe, size=100, scroll=TRUE)

Section 6-12 database

3 Variables 136 Observations

priority

n	missing	distinct
136	0	2

 Value          other emergency
 Frequency        111        25
 Proportion     0.816     0.184

death

n	missing	distinct	Info	Sum	Mean	Gmd
136	0	2	0.328	17	0.125	0.2204

random

n	missing	distinct	Info	Sum	Mean	Gmd
136	0	2	0.727	80	0.5882	0.488

Prior Specification

`normsolve`

Modify bbr’s useful normsolve function which solves for the SD (of a normal distribution) such that the tail (area) probability beyond myvalue is myprob

normsolve <- function (myvalue, myprob = 0.025, mu=0, convert2 =c("none", "log", "logodds")){
  ## modified bbr's normsolve() 
  ## normsolve() solves for SD (of a normal distribution) such that the tail (area) probability beyond myvalue is myprob
  ## myprob = tail area probability
  ## convert2() converts mu and myvalue to the scale of the normal distribution 
  ## If mu and myvalue are OR values, use "log" to convert them to log(OR)
  ## If mu and myvalue are prob values, use "logodds" to convert them to log(odds)
  
  if (missing(convert2)) {convert2 <-  "none"}    
  if (convert2=="log" & mu==0) {
    cat ("Note: mu cannot be 0 so setting an arbitrary value of 1.0")
    mu = 1
  }
  mysd <- switch(convert2, 
                 "none" = (myvalue-mu)/qnorm(1-myprob),
                 "log" =  (log(myvalue)- log(mu))/qnorm(1-myprob),
                 "logodds" = (qlogis(myvalue)- qlogis(mu))/qnorm(1-myprob)   )
  return(mysd)
}

\(\beta_0\)

Specify a normal distribution on the log odds scale such that it corresponds to a mean proportion of 0.05 with a tail probability of 10% for proportions exceeding 0.20

(intercept_sd <- normsolve(mu = 0.05, 
                          myvalue = 0.20,
                          myprob  = 0.10,
                          convert2 = "logodds"))

## [1] 1.215827

\(\beta_{priority}\)

Specify a normal distribution on the log OR scale such that it corresponds to a mean OR of 1.0 with a 10% prob that OR>3

(priority_sd <- normsolve(mu = 1, 
                         myvalue = 3,
                         myprob  = 0.10,
                         convert2 = "log"))

## [1] 0.8572517

\(\beta_{random}\)

Specify a normal distribution on the log OR scale such that it corresponds to a mean OR of 1.0 with a 5% prob that OR>1.25

(random_sd <- normsolve(mu = 1, 
                         myvalue = 1.25,
                         myprob  = 0.05,
                         convert2 = "log"))

## [1] 0.1356616

Model Fitting

brms model

Fit a brms logistic regression model and measure model fitting time. Model fitting took ~160 secs on my 5-year old laptop.

p <- c(
  prior_string(glue('normal(-2.944, {intercept_sd})'),class='Intercept') ,
  prior_string(glue('normal(0, {priority_sd})'), class='b', coef='priorityemergency') ,
  prior_string(glue('normal(0, {random_sd})'), class='b', coef='random')
) 


if(rerun_analysis) {
tic()
mybrm <- brm(death ~ priority + random, 
             data=d, 
             prior=p, 
             family='bernoulli',
             seed=121, 
             refresh=FALSE)
toc()     ##165 sec elapsed
saveRDS( mybrm, file= here::here("PYH Bayesian/rmsb/mybrm.Rds"))  

} else {
  
  mybrm  <- readRDS(here::here("PYH Bayesian/rmsb/mybrm.Rds"))  
  
}

mybrm

##  Family: bernoulli 
##   Links: mu = logit 
## Formula: death ~ priority + random 
##    Data: d (Number of observations: 136) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Population-Level Effects: 
##                   Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept            -2.16      0.31    -2.80    -1.59 1.00     3364     2606
## priorityemergency     0.71      0.50    -0.29     1.68 1.00     3088     2598
## random               -0.06      0.13    -0.32     0.20 1.00     4326     3037
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

rmsb model

Fit a rmsb logistic regression model and measure model fitting time. rmsb took ~70 secs and both packages gave similar regression coefficients.

if(rerun_analysis) {
  tic()
  myblrm  <- blrm(death ~ priority + random, 
                  data=d,
                  keepsep=('priority|random'),
                  priorsd = c(priority_sd , random_sd))  
  toc() ##70.8 sec elapsed
  saveRDS( myblrm, file= here::here("PYH Bayesian/rmsb/myblrm.Rds"))  
  
  
} else {
  
  myblrm  <- readRDS(here::here("PYH Bayesian/rmsb/myblrm.Rds"))  
 
}

myblrm

## Bayesian Logistic Model
##  
##  Dirichlet Priors With Concentration Parameter 0.541 for Intercepts
##  
##  blrm(formula = death ~ priority + random, keepsep = ("priority|random"), 
##      data = d, priorsd = c(priority_sd, random_sd))
##  
##                   Mixed Calibration/             Discrimination                Rank Discrim.    
##               Discrimination Indexes                    Indexes                      Indexes    
##     Obs136    LOO log L-51.43+/-7.56    g  0.256 [0.029, 0.519]    C    0.598 [0.446, 0.659]    
##      0 119    LOO IC  102.86+/-15.13    gp  0.033 [0.002, 0.07]    Dxy 0.196 [-0.107, 0.319]    
##      1  17    Effective p1.78+/-0.36    EV     0.018 [0, 0.066]                                 
##  Draws4000    B 0.108 [0.105, 0.112]    v      0.111 [0, 0.349]                                 
##    Chains4                              vp     0.002 [0, 0.007]                                 
##     p    2                                                                                      
##  
##                     Mode Beta Mean Beta Median Beta S.E.   Lower   Upper  
##  Intercept          -2.0752   -2.0778   -2.0712     0.2975 -2.6583 -1.5102
##  priority=emergency  0.7144    0.6760    0.6895     0.4845 -0.3001  1.5920
##  random             -0.0585   -0.0573   -0.0566     0.1307 -0.3061  0.2126
##                     Pr(Beta>0) Symmetry
##  Intercept          0.0000     0.92    
##  priority=emergency 0.9190     0.95    
##  random             0.3385     0.94    
##  
##  The following parameters remained separate (where not orthogonalized) during model fitting so that prior distributions could be focused explicitly on them: priority=emergency, random

Model Checking

brms model

Run bayesplot::pp_check to examine if our Bayesian models capture the data-generating process

pp_check(mybrm, "bars", nsamples = 1000) + ## bayesplot::pp_check()
 facet_wrap (. ~"brms model") +
  theme_bw()

rmsb model

Define a pp_check method and apply it to our rmsb model

pp_check.blrm <- function(modblrm, 
                          type, 
                          ndraws = NULL,
                          group = NULL) {
  
  ## posterior predictive checks using \pkg{bayesplot} package
  ## codes adapted from brms::pp_check.R (all mistakes, however, are clearly mine)
  ## https://github.com/paul-buerkner/brms/blob/master/R/pp_check.R
  

  if (!any(class(modblrm) %in% Cs(blrm, rmsb))) {
    stop("rms object must be of class blrm", 
         call. = FALSE)
  }
  
  if (missing(type)) {
    type <- "dens_overlay"
  }
  
  # from brms::pp_check.r
  valid_types <- as.character(bayesplot::available_ppc(""))
  valid_types <- sub("^ppc_", "", valid_types)
  if (!type %in% valid_types) {
    stop("Type '", type, "' is not a valid ppc type. ", 
         "Valid types are:\n", paste(valid_types, collapse = " , "))
  }
  
  ppc_fun <- get(paste0("ppc_", type), asNamespace("bayesplot"))
  
  newdata = eval(modblrm$call$data)[all.vars(modblrm$sformula)] %>% data.frame
  if (anyNA(newdata)) {
    warning("NA responses in sample")              ## issue warnings about NAs
    newdata <- newdata[complete.cases(newdata), ]  ## remove NAs (if any)  
  }
  
  valid_vars  <- modblrm$Design$name 
  ## codes from pp_check.brms
  if ("group" %in% names(formals(ppc_fun))) {
    if (is.null(group)) {
      stop("Argument 'group' is required for ppc type '", type, "'.")
    }
    if (!group %in% valid_vars) {
      stop("Variable '", group, "' could not be found in the data.")
    }
  }
  
  ## Y and Yrep
  y <- newdata [ , all.vars(modblrm$sformula)[1] ] 
  y_length = length(unique(y))
  
  if(is.null(ndraws)) {ndraws = 20}  ## 20 draws to save time 
  
  if(y_length == 2) {
    
    ## binary response variable
    pred_binary <- predict(modblrm, newdata, fun=plogis, funint=FALSE, posterior.summary="all")
    yrep_alldraws <- apply(pred_binary, c(1,2), function (x) rbinom(1,1,x))
    yrep <- yrep_alldraws[c(1:ndraws), ]
    
  } else { 
    
    ## >=3 level ordinal/continuous response variable  
    pred_ordinal <- predict(modblrm, newdata, type="fitted.ind", posterior.summary = "all")
    yrep_alldraws <-   apply(pred_ordinal, c(1,2), function (x) {
      myvec = unlist(rmultinom(1,1,x))
      myvec_names = modblrm$ylevels   ## get unique levels of Y
      return(myvec_names[myvec==1])
    })
    yrep_alldraws <- apply(yrep_alldraws, c(1,2), as.numeric) 
    yrep <- yrep_alldraws[c(1:ndraws), ]
    
  }
  
  ## bayesplot::ppc_funs  
  ppc_args <- list(y, yrep)
  
  if (!is.null(group)) {
    ppc_args$group <- newdata[[group]]
  }
  
  do.call (ppc_fun, ppc_args)
  
}

pp_check(myblrm, "bars", ndraws = 1000) +
  facet_wrap (. ~"rmsb model") +
  theme_bw()

Visualization

Use the impressive tidybayes to visualize the posterior distribution of the exponentiated \(\beta_{priority}\) and superimpose over it its prior distribution. Shade the posterior density plot based on “watershed threshold” such as OR >1.0 or other values. (Bayesian methods do not place undue emphasis on “null” values.)

post_emergency <- myblrm %>% plot("priority=emergency") %>% pluck("data") 

ggplot(data = data.frame(x = c(0.01,10)), aes(x)) +       
  scale_x_log10('Odds Ratio', breaks=c(0.1, 0.2,0.33,1,3, 5, 10)) +  
  # posterior density plot
  stat_halfeye(
    data=data.frame(x=exp(post_emergency$draws)),
    normalize = "none", 
    slab_size = 0.4,slab_color = "black", alpha=0.5,
    aes(fill = stat(x > 0)),   ## 0 on the logged scale = OR of 1 
    color= NA) +
  ## prior distribution 
  ## lnorm dist: normal distribution on the log OR scale
  stat_dist_halfeye(
    orientation = "horizontal",
    linetype = 2,
    slab_size = 0.5,
    slab_fill = NA,
    slab_colour = "black",
    normalize = "none",
    alpha=0.5,
    color= NA,   
    aes(dist = "lnorm", arg1 = 0, arg2 =priority_sd))  +
  scale_fill_manual(labels = c("Other: Higher Odds", "Emergency: Higher odds"), 
                    values = c("blue", "skyblue")) +
  stat_interval(data=data.frame(x= exp(post_emergency$draws)), aes(y = 0.03), 
                point_interval= median_hdi, 
                .width = c(.95), color="black", show_point = TRUE, size=1, show.legend = FALSE) +
  scale_y_continuous('Probability density', expand = c(0, 0), breaks=seq(0, 3.4, 0.2))   + 
  coord_cartesian(xlim = c(0.1,11), ylim = c(0, 1.8)) +
  geom_vline(xintercept=1,linetype="solid") +
  ## labels to indicate prior and posterior distributions
  annotate("text", x = 0.2, y = 0.4, hjust = 0,
           label = "Prior",
           size = 5, color="#818283") +
  annotate("text", x = 3, y = 1.4, hjust = 0,
           label = "Posterior",
           size = 5, color="#818283") +
  theme_light(base_size = 13) +
  theme(
    legend.position=c(0.2,0.8),
    text = element_text(colour = "#2E3F4F"),
    plot.title = element_text(face = "bold"),
    plot.caption = element_text(face = "italic"),
    panel.grid.major = element_line(colour = "#E7E9E9"),
    panel.grid.minor = element_line(colour = "#F4F5F5"),
    strip.text = element_text(face = "bold", colour = "#2E3F4F"  ),
    strip.background = element_rect(colour = "#2E3F4F", fill = "grey90"),  
    legend.background = element_rect(fill = 'transparent'),
    legend.title=element_blank())

pp_checks (sidetrack)

Apply the pp_check function on rmsb ordinal models
Example taken from https://hbiostat.org/R/rmsb/blrm.html#bayesian-wilcoxon-test

d <- tibble(
  calpro = c(2500, 244, 2500, 726, 86, 2500, 61, 392, 2500, 114, 1226,
              2500, 168, 910, 627, 2500, 781, 57, 483, 30, 925, 1027,
              2500, 2500, 38, 18),
  endo = c(2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2,
            2, 2, 2, 2, 2, 1, 1)) %>% 
  mutate(endo = factor(endo, 1 : 2,
         c("No or Mild Activity", "Moderate or Severe Activity")))


bcalpro <- readRDS(here::here("./PYH Bayesian/bigfiles/cpro_blrm.Rds"))  

if(!exists("bcalpro")){
  bcalpro <- blrm(calpro ~ endo, data=d)
  saveRDS( bcalpro, file= here("./PYH Bayesian/bigfiles/cpro_blrm.Rds"))
}

pp_check(bcalpro)

pp_check(bcalpro, group = "endo",  type="ecdf_overlay_grouped")

Special thanks to Prof Harrell for his guidance and the opportunity to contribute a vignette. All mistakes in this vignette, however, are mine.↩︎