bbr rmsb models

Objective

This short vignette is based on Section 6-12 of Prof Frank Harrell’s incredibly useful Biostatistics and Biomedical Research (bbr). Its goal is to re-fit the brms Bernoulli models with Harrell’s rmsb package.

Dataset

## Creating the dataset
set.seed(10)
priority <- factor(c(rep('emergency', 25), rep('other', 111)), c('other', 'emergency'))
death <- c(rep(0, 19), rep(1, 6), rep(0, 100), rep(1, 11))
d  <- data.frame(priority, death)
randomUniform <- runif(length(priority))
d$random <- ifelse(priority == 'emergency', randomUniform < 1/3,
                   randomUniform > 1/3) * 1
dd <- datadist(d); options(datadist='dd')

d_describe <- describe(d, descript="Section 6-12 database")
html(d_describe, size=100, scroll=TRUE)

Section 6-12 database

3 Variables 136 Observations

priority

n	missing	distinct
136	0	2

 Value          other emergency
 Frequency        111        25
 Proportion     0.816     0.184

death

n	missing	distinct	Info	Sum	Mean	Gmd
136	0	2	0.328	17	0.125	0.2204

random

n	missing	distinct	Info	Sum	Mean	Gmd
136	0	2	0.727	80	0.5882	0.488

Prior Specification

`normsolve`

Modify bbr’s useful normsolve function which solves for the SD (of a normal distribution) such that the tail (area) probability beyond myvalue is myprob

normsolve <- function (myvalue, myprob = 0.025, mu=0, convert2 =c("none", "log", "logodds")){
  ## modified bbr's normsolve() 
  ## normsolve() solves for SD (of a normal distribution) such that the tail (area) probability beyond myvalue is myprob
  ## myprob = tail area probability
  ## convert2() converts mu and myvalue to the scale of the normal distribution 
  ## If mu and myvalue are OR values, use "log" to convert them to log(OR)
  ## If mu and myvalue are prob values, use "logodds" to convert them to log(odds)
  
  if (missing(convert2)) {convert2 <-  "none"}    
  if (convert2=="log" & mu==0) {
    cat ("Note: mu cannot be 0 so setting an arbitrary value of 1.0")
    mu = 1
  }
  mysd <- switch(convert2, 
                 "none" = (myvalue-mu)/qnorm(1-myprob),
                 "log" =  (log(myvalue)- log(mu))/qnorm(1-myprob),
                 "logodds" = (qlogis(myvalue)- qlogis(mu))/qnorm(1-myprob)   )
  return(mysd)
}

\(\beta_0\)

Specify a normal distribution on the log odds scale such that it corresponds to a mean proportion of 0.05 with a tail probability of 10% for proportions exceeding 0.20

(intercept_sd <- normsolve(mu = 0.05, 
                          myvalue = 0.20,
                          myprob  = 0.10,
                          convert2 = "logodds"))

## [1] 1.215827

\(\beta_{priority}\)

Specify a normal distribution on the log OR scale such that it corresponds to a mean OR of 1.0 with a 10% prob that OR>3

(priority_sd <- normsolve(mu = 1, 
                         myvalue = 3,
                         myprob  = 0.10,
                         convert2 = "log"))

## [1] 0.8572517

\(\beta_{random}\)

Specify a normal distribution on the log OR scale such that it corresponds to a mean OR of 1.0 with a 5% prob that OR>1.25

(random_sd <- normsolve(mu = 1, 
                         myvalue = 1.25,
                         myprob  = 0.05,
                         convert2 = "log"))

## [1] 0.1356616

Model Fitting

brms model

Fit a brms logistic regression model and measure model fitting time. Model fitting took ~160 secs on my 5-year old laptop.

p <- c(
  prior_string(glue('normal(-2.944, {intercept_sd})'),class='Intercept') ,
  prior_string(glue('normal(0, {priority_sd})'), class='b', coef='priorityemergency') ,
  prior_string(glue('normal(0, {random_sd})'), class='b', coef='random')
) 


if(rerun_analysis) {
tic()
mybrm <- brm(death ~ priority + random, 
             data=d, 
             prior=p, 
             family='bernoulli',
             seed=121, 
             refresh=FALSE)
toc()     ##165 sec elapsed
saveRDS( mybrm, file= here::here("PYH Bayesian/rmsb/mybrm.Rds"))  

} else {
  
  mybrm  <- readRDS(here::here("PYH Bayesian/rmsb/mybrm.Rds"))  
  
}

mybrm

##  Family: bernoulli 
##   Links: mu = logit 
## Formula: death ~ priority + random 
##    Data: d (Number of observations: 136) 
## Samples: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup samples = 4000
## 
## Population-Level Effects: 
##                   Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept            -2.16      0.31    -2.80    -1.59 1.00     3364     2606
## priorityemergency     0.71      0.50    -0.29     1.68 1.00     3088     2598
## random               -0.06      0.13    -0.32     0.20 1.00     4325     3037
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

rmsb model

Fit a rmsb logistic regression model and measure model fitting time. rmsb took ~60 secs and both packages gave similar regression coefficients.

if(rerun_analysis) {
  tic()
  myblrm  <- blrm(death ~ priority + random, 
                  data=d,
                  keepsep=('priority|random'),
                  priorsd = c(priority_sd , random_sd))  
  toc() ##70.8 sec elapsed
  saveRDS( myblrm, file= here::here("PYH Bayesian/rmsb/myblrm.Rds"))  
  
  
} else {
  
  myblrm  <- readRDS(here::here("PYH Bayesian/rmsb/myblrm.Rds"))  
 
}

myblrm

## Bayesian Logistic Model
##  
##  Dirichlet Priors With Concentration Parameter 0.541 for Intercepts
##  
##  blrm(formula = death ~ priority + random, keepsep = ("priority|random"), 
##      data = d, priorsd = c(priority_sd, random_sd))
##  
##                   Mixed Calibration/             Discrimination                Rank Discrim.    
##               Discrimination Indexes                    Indexes                      Indexes    
##     Obs136    LOO log L-51.43+/-7.56    g  0.256 [0.029, 0.519]    C    0.598 [0.446, 0.659]    
##      0 119    LOO IC  102.86+/-15.13    gp  0.033 [0.002, 0.07]    Dxy 0.196 [-0.107, 0.319]    
##      1  17    Effective p1.78+/-0.36    EV     0.018 [0, 0.066]                                 
##  Draws4000    B 0.108 [0.105, 0.112]    v      0.111 [0, 0.349]                                 
##    Chains4                              vp     0.002 [0, 0.007]                                 
##     p    2                                                                                      
##  
##                     Mode Beta Mean Beta Median Beta S.E.   Lower   Upper  
##  Intercept          -2.0752   -2.0778   -2.0712     0.2975 -2.6583 -1.5102
##  priority=emergency  0.7144    0.6760    0.6895     0.4845 -0.3001  1.5920
##  random             -0.0585   -0.0573   -0.0566     0.1307 -0.3061  0.2126
##                     Pr(Beta>0) Symmetry
##  Intercept          0.0000     0.92    
##  priority=emergency 0.9190     0.95    
##  random             0.3385     0.94    
##  
##  The following parameters remained separate (where not orthogonalized) during model fitting so that prior distributions could be focused explicitly on them: priority=emergency, random

Model Checking

brms model

Run bayesplot::pp_check to examine if our Bayesian models capture the data-generating process

pp_check(mybrm, "bars", nsamples = 1000) + ## bayesplot::pp_check()
 facet_wrap (. ~"brms model") +
  theme_bw()

rmsb model

Write a function to mimick bayesplot::pp_check and apply it to our rmsb model

ppcheck_rmsb <- function (modblrm, nsamples = 1000){
  
  ## ppcheck examines if data-generating process is captured by blrm model
  ## this is done by comparing observed and simulated Y 
  ## mimicking bayesplot::pp_check  
  ## TODO: extend this to handle rmsb models fitted on a short (say, <10 levels) ordinal outcome 
  
  if (!any(class(modblrm) %in% Cs(blrm, rmsb))) {
    stop("rms object must be of class blrm or rmsb", 
         call. = FALSE)
  }
  
  ## Y observed
  Yobs <- 
    modblrm$freq %>% 
    enframe %>% 
    mutate(name = as.integer(name),
           yobs = "Y_obs")
  
  ## Y rep
  Yrep <-
    predict(modblrm, 
            eval(modblrm$call$data), 
            fun=plogis, 
            funint=FALSE, posterior.summary="all") %>% 
    as_tibble %>% 
    slice_sample(n = nsamples) %>%   
    mutate(draw = 1:n()) %>% 
    pivot_longer(-draw) %>%
    mutate(prediction = map_int(value, ~ rbinom(1, size = 1, prob = .x))) %>%   
    count(draw, prediction)  %>%   ## draw = 1 data-set
    complete(draw, prediction, fill = list(n = 0)) %>% 
    group_by(prediction) %>% 
    median_hdi(n) %>% 
    clean_names() %>% 
    mutate(
      yemp = "Y_rep",
      prediction = prediction)
  
  ## Generate the plot
  ppcheck_barplot <-
    ggplot(Yobs, aes(x = name, y = value)) +
    geom_col(aes(fill=yobs)) +
    scale_fill_manual(values = 'lightsteelblue') +
    geom_pointrange(size = 0.7, 
                    data = Yrep, aes(x=prediction, y = n, ymin = lower, ymax = upper, color = yemp))  + 
    scale_color_manual(values = 'black') +
    theme_classic() + 
    labs(fill = "", color = "", y = "Count", x = str_glue({modblrm$terms[[2]]})) + 
    theme_minimal()
  
  return(ppcheck_barplot) 
  
}

ppcheck_rmsb (myblrm) +
  facet_wrap (. ~"rmsb model") +
  theme_bw()

Visualization

Use the impressive tidybayes to visualize the posterior distribution of the exponentiated \(\beta_{emergency}\) and superimpose over it its prior distribution. Shade the posterior density plot based on “watershed threshold” such as OR >1.0 or other values. (Bayesian methods do not place undue emphasis on “null” values.)

post_emergency <- myblrm %>% plot("priority=emergency") %>% pluck("data") 

ggplot(data = data.frame(x = c(0.01,10)), aes(x)) +       
  scale_x_log10('Odds Ratio', breaks=c(0.1, 0.2,0.33,1,3, 5, 10)) +  
  # posterior density plot
  stat_halfeye(
    data=data.frame(x=exp(post_emergency$draws)),
    normalize = "none", 
    slab_size = 0.4,slab_color = "black", alpha=0.5,
    aes(fill = stat(x > 0)),   ## 0 on the logged scale = OR of 1 
    color= NA) +
  ## prior distribution 
  ## lnorm dist: normal distribution on the log OR scale
  stat_dist_halfeye(
    orientation = "horizontal",
    linetype = 2,
    slab_size = 0.5,
    slab_fill = NA,
    slab_colour = "black",
    normalize = "none",
    alpha=0.5,
    color= NA,   
    aes(dist = "lnorm", arg1 = 0, arg2 =priority_sd))  +
  scale_fill_manual(labels = c("Other: Higher Odds", "Emergency: Higher odds"), 
                    values = c("blue", "skyblue")) +
  stat_interval(data=data.frame(x= exp(post_emergency$draws)), aes(y = 0.03), 
                point_interval= median_hdi, 
                .width = c(.95), color="black", show_point = TRUE, size=1, show.legend = FALSE) +
  scale_y_continuous('Probability density', expand = c(0, 0), breaks=seq(0, 3.4, 0.2))   + 
  coord_cartesian(xlim = c(0.1,11), ylim = c(0, 1.8)) +
  geom_vline(xintercept=1,linetype="solid") +
  ## labels + arrows to indicate prior, posterior 
  annotate("text", x = 0.2, y = 0.4, hjust = 0,
           label = "Prior",
           size = 5, color="#818283") +
  annotate("text", x = 3, y = 1.4, hjust = 0,
           label = "Posterior",
           size = 5, color="#818283") +
  theme_light(base_size = 13) +
  theme(
    legend.position=c(0.2,0.8),
    text = element_text(colour = "#2E3F4F"),
    plot.title = element_text(face = "bold"),
    plot.caption = element_text(face = "italic"),
    panel.grid.major = element_line(colour = "#E7E9E9"),
    panel.grid.minor = element_line(colour = "#F4F5F5"),
    strip.text = element_text(face = "bold", colour = "#2E3F4F"  ),
    strip.background = element_rect(colour = "#2E3F4F", fill = "grey90"),  
    legend.background = element_rect(fill = 'transparent'),
    legend.title=element_blank())