bbr rmsb models

Objective

This short vignette is based on Section 6-12 of Prof Frank Harrell’s incredibly useful Biostatistics and Biomedical Research (bbr). Its goal is to re-fit the brms Bernoulli models with Harrell’s rmsb package.

Dataset

## Creating the dataset
set.seed(10)
priority <- factor(c(rep('emergency', 25), rep('other', 111)), c('other', 'emergency'))
death <- c(rep(0, 19), rep(1, 6), rep(0, 100), rep(1, 11))
d  <- data.frame(priority, death)
randomUniform <- runif(length(priority))
d$random <- ifelse(priority == 'emergency', randomUniform < 1/3,
                   randomUniform > 1/3) * 1
dd <- datadist(d); options(datadist='dd')

d_describe <- describe(d, descript="Section 6-12 database")
html(d_describe, size=100, scroll=TRUE)
Section 6-12 database

3 Variables   136 Observations

priority
nmissingdistinct
13602
 Value          other emergency
 Frequency        111        25
 Proportion     0.816     0.184
 

death
nmissingdistinctInfoSumMeanGmd
136020.328170.1250.2204

random
nmissingdistinctInfoSumMeanGmd
136020.727800.58820.488

Prior Specification

normsolve

Modify bbr’s useful normsolve function which solves for the SD (of a normal distribution) such that the tail (area) probability beyond myvalue is myprob

normsolve <- function (myvalue, myprob = 0.025, mu=0, convert2 =c("none", "log", "logodds")){
  ## modified bbr's normsolve() 
  ## normsolve() solves for SD (of a normal distribution) such that the tail (area) probability beyond myvalue is myprob
  ## myprob = tail area probability
  ## convert2() converts mu and myvalue to the scale of the normal distribution 
  ## If mu and myvalue are OR values, use "log" to convert them to log(OR)
  ## If mu and myvalue are prob values, use "logodds" to convert them to log(odds)
  
  if (missing(convert2)) {convert2 <-  "none"}    
  if (convert2=="log" & mu==0) {
    cat ("Note: mu cannot be 0 so setting an arbitrary value of 1.0")
    mu = 1
  }
  mysd <- switch(convert2, 
                 "none" = (myvalue-mu)/qnorm(1-myprob),
                 "log" =  (log(myvalue)- log(mu))/qnorm(1-myprob),
                 "logodds" = (qlogis(myvalue)- qlogis(mu))/qnorm(1-myprob)   )
  return(mysd)
}

\(\beta_0\)

Specify a normal distribution on the log odds scale such that it corresponds to a mean proportion of 0.05 with a tail probability of 10% for proportions exceeding 0.20

(intercept_sd <- normsolve(mu = 0.05, 
                          myvalue = 0.20,
                          myprob  = 0.10,
                          convert2 = "logodds"))
## [1] 1.215827

\(\beta_{priority}\)

Specify a normal distribution on the log OR scale such that it corresponds to a mean OR of 1.0 with a 10% prob that OR>3

(priority_sd <- normsolve(mu = 1, 
                         myvalue = 3,
                         myprob  = 0.10,
                         convert2 = "log"))
## [1] 0.8572517

\(\beta_{random}\)

Specify a normal distribution on the log OR scale such that it corresponds to a mean OR of 1.0 with a 5% prob that OR>1.25

(random_sd <- normsolve(mu = 1, 
                         myvalue = 1.25,
                         myprob  = 0.05,
                         convert2 = "log"))
## [1] 0.1356616

Model Fitting

brms model

Fit a brms logistic regression model and measure model fitting time. Model fitting took ~160 secs on my 5-year old laptop.

p <- c(
  prior_string(glue('normal(-2.944, {intercept_sd})'),class='Intercept') ,
  prior_string(glue('normal(0, {priority_sd})'), class='b', coef='priorityemergency') ,
  prior_string(glue('normal(0, {random_sd})'), class='b', coef='random')
) 


if(rerun_analysis) {
tic()
mybrm <- brm(death ~ priority + random, 
             data=d, 
             prior=p, 
             family='bernoulli',
             seed=121, 
             refresh=FALSE)
toc()     ##165 sec elapsed
saveRDS( mybrm, file= here::here("PYH Bayesian/rmsb/mybrm.Rds"))  

} else {
  
  mybrm  <- readRDS(here::here("PYH Bayesian/rmsb/mybrm.Rds"))  
  
}

mybrm
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: death ~ priority + random 
##    Data: d (Number of observations: 136) 
## Samples: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup samples = 4000
## 
## Population-Level Effects: 
##                   Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept            -2.16      0.31    -2.80    -1.59 1.00     3364     2606
## priorityemergency     0.71      0.50    -0.29     1.68 1.00     3088     2598
## random               -0.06      0.13    -0.32     0.20 1.00     4325     3037
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

rmsb model

Fit a rmsb logistic regression model and measure model fitting time. rmsb took ~60 secs and both packages gave similar regression coefficients.

if(rerun_analysis) {
  tic()
  myblrm  <- blrm(death ~ priority + random, 
                  data=d,
                  keepsep=('priority|random'),
                  priorsd = c(priority_sd , random_sd))  
  toc() ##70.8 sec elapsed
  saveRDS( myblrm, file= here::here("PYH Bayesian/rmsb/myblrm.Rds"))  
  
  
} else {
  
  myblrm  <- readRDS(here::here("PYH Bayesian/rmsb/myblrm.Rds"))  
 
}

myblrm
## Bayesian Logistic Model
##  
##  Dirichlet Priors With Concentration Parameter 0.541 for Intercepts
##  
##  blrm(formula = death ~ priority + random, keepsep = ("priority|random"), 
##      data = d, priorsd = c(priority_sd, random_sd))
##  
##                   Mixed Calibration/             Discrimination                Rank Discrim.    
##               Discrimination Indexes                    Indexes                      Indexes    
##     Obs136    LOO log L-51.43+/-7.56    g  0.256 [0.029, 0.519]    C    0.598 [0.446, 0.659]    
##      0 119    LOO IC  102.86+/-15.13    gp  0.033 [0.002, 0.07]    Dxy 0.196 [-0.107, 0.319]    
##      1  17    Effective p1.78+/-0.36    EV     0.018 [0, 0.066]                                 
##  Draws4000    B 0.108 [0.105, 0.112]    v      0.111 [0, 0.349]                                 
##    Chains4                              vp     0.002 [0, 0.007]                                 
##     p    2                                                                                      
##  
##                     Mode Beta Mean Beta Median Beta S.E.   Lower   Upper  
##  Intercept          -2.0752   -2.0778   -2.0712     0.2975 -2.6583 -1.5102
##  priority=emergency  0.7144    0.6760    0.6895     0.4845 -0.3001  1.5920
##  random             -0.0585   -0.0573   -0.0566     0.1307 -0.3061  0.2126
##                     Pr(Beta>0) Symmetry
##  Intercept          0.0000     0.92    
##  priority=emergency 0.9190     0.95    
##  random             0.3385     0.94    
##  
##  The following parameters remained separate (where not orthogonalized) during model fitting so that prior distributions could be focused explicitly on them: priority=emergency, random

Model Checking

brms model

Run bayesplot::pp_check to examine if our Bayesian models capture the data-generating process

pp_check(mybrm, "bars", nsamples = 1000) + ## bayesplot::pp_check()
 facet_wrap (. ~"brms model") +
  theme_bw()

rmsb model

Write a function to mimick bayesplot::pp_check and apply it to our rmsb model

ppcheck_rmsb <- function (modblrm, nsamples = 1000){
  
  ## ppcheck examines if data-generating process is captured by blrm model
  ## this is done by comparing observed and simulated Y 
  ## mimicking bayesplot::pp_check  
  ## TODO: extend this to handle rmsb models fitted on a short (say, <10 levels) ordinal outcome 
  
  if (!any(class(modblrm) %in% Cs(blrm, rmsb))) {
    stop("rms object must be of class blrm or rmsb", 
         call. = FALSE)
  }
  
  ## Y observed
  Yobs <- 
    modblrm$freq %>% 
    enframe %>% 
    mutate(name = as.integer(name),
           yobs = "Y_obs")
  
  ## Y rep
  Yrep <-
    predict(modblrm, 
            eval(modblrm$call$data), 
            fun=plogis, 
            funint=FALSE, posterior.summary="all") %>% 
    as_tibble %>% 
    slice_sample(n = nsamples) %>%   
    mutate(draw = 1:n()) %>% 
    pivot_longer(-draw) %>%
    mutate(prediction = map_int(value, ~ rbinom(1, size = 1, prob = .x))) %>%   
    count(draw, prediction)  %>%   ## draw = 1 data-set
    complete(draw, prediction, fill = list(n = 0)) %>% 
    group_by(prediction) %>% 
    median_hdi(n) %>% 
    clean_names() %>% 
    mutate(
      yemp = "Y_rep",
      prediction = prediction)
  
  ## Generate the plot
  ppcheck_barplot <-
    ggplot(Yobs, aes(x = name, y = value)) +
    geom_col(aes(fill=yobs)) +
    scale_fill_manual(values = 'lightsteelblue') +
    geom_pointrange(size = 0.7, 
                    data = Yrep, aes(x=prediction, y = n, ymin = lower, ymax = upper, color = yemp))  + 
    scale_color_manual(values = 'black') +
    theme_classic() + 
    labs(fill = "", color = "", y = "Count", x = str_glue({modblrm$terms[[2]]})) + 
    theme_minimal()
  
  return(ppcheck_barplot) 
  
}

ppcheck_rmsb (myblrm) +
  facet_wrap (. ~"rmsb model") +
  theme_bw()

Visualization

Use the impressive tidybayes to visualize the posterior distribution of the exponentiated \(\beta_{emergency}\) and superimpose over it its prior distribution. Shade the posterior density plot based on “watershed threshold” such as OR >1.0 or other values. (Bayesian methods do not place undue emphasis on “null” values.)

post_emergency <- myblrm %>% plot("priority=emergency") %>% pluck("data") 

ggplot(data = data.frame(x = c(0.01,10)), aes(x)) +       
  scale_x_log10('Odds Ratio', breaks=c(0.1, 0.2,0.33,1,3, 5, 10)) +  
  # posterior density plot
  stat_halfeye(
    data=data.frame(x=exp(post_emergency$draws)),
    normalize = "none", 
    slab_size = 0.4,slab_color = "black", alpha=0.5,
    aes(fill = stat(x > 0)),   ## 0 on the logged scale = OR of 1 
    color= NA) +
  ## prior distribution 
  ## lnorm dist: normal distribution on the log OR scale
  stat_dist_halfeye(
    orientation = "horizontal",
    linetype = 2,
    slab_size = 0.5,
    slab_fill = NA,
    slab_colour = "black",
    normalize = "none",
    alpha=0.5,
    color= NA,   
    aes(dist = "lnorm", arg1 = 0, arg2 =priority_sd))  +
  scale_fill_manual(labels = c("Other: Higher Odds", "Emergency: Higher odds"), 
                    values = c("blue", "skyblue")) +
  stat_interval(data=data.frame(x= exp(post_emergency$draws)), aes(y = 0.03), 
                point_interval= median_hdi, 
                .width = c(.95), color="black", show_point = TRUE, size=1, show.legend = FALSE) +
  scale_y_continuous('Probability density', expand = c(0, 0), breaks=seq(0, 3.4, 0.2))   + 
  coord_cartesian(xlim = c(0.1,11), ylim = c(0, 1.8)) +
  geom_vline(xintercept=1,linetype="solid") +
  ## labels + arrows to indicate prior, posterior 
  annotate("text", x = 0.2, y = 0.4, hjust = 0,
           label = "Prior",
           size = 5, color="#818283") +
  annotate("text", x = 3, y = 1.4, hjust = 0,
           label = "Posterior",
           size = 5, color="#818283") +
  theme_light(base_size = 13) +
  theme(
    legend.position=c(0.2,0.8),
    text = element_text(colour = "#2E3F4F"),
    plot.title = element_text(face = "bold"),
    plot.caption = element_text(face = "italic"),
    panel.grid.major = element_line(colour = "#E7E9E9"),
    panel.grid.minor = element_line(colour = "#F4F5F5"),
    strip.text = element_text(face = "bold", colour = "#2E3F4F"  ),
    strip.background = element_rect(colour = "#2E3F4F", fill = "grey90"),  
    legend.background = element_rect(fill = 'transparent'),
    legend.title=element_blank())