# Function to start a verbatim quote if results='asis' in effect # Works for all output formats in R markdown squote <- function() { # start quote r <- knitr::opts_current$get('results') if(length(r) && r == 'asis') cat('\n```') invisible() } # Function to close the quote if needed equote <- function() { r <- knitr::opts_current$get('results') if(length(r) && r == 'asis') cat('```\n\n') invisible() } # Function to print an object or inline text or both, # verbatim quoting if needed (results='asis') in effect in chunk # Inline text is printed with cat() # Fractional numbers are rounded to the nearest dec digits for data frames pr <- function(x='', obj=NULL, inline=NULL, dec=3) { r <- knitr::opts_current$get('results') asis <- length(r) && r == 'asis' if(asis) cat('\n```') if(any(x != '') || length(inline)) cat('\n', x, if(any(x != '')) ' ', inline, '\n\n', sep='') if(length(obj)) { if(is.data.frame(obj)) for(i in 1 : length(obj)) { x <- obj[[i]] if(is.numeric(x)) obj[[i]] <- round(x, dec) } print(obj, quote=FALSE) } if(asis) cat('```\n\n') invisible() } # This function is now in Hmisc so just do: # pr <- markupSpecs$markdown$pr pst <- function(x) paste(x, collapse=', ') rn <- function(x) round(x, 2) relfreq <- function(x) table(x) / length(x) # Randomly tries different starting values given to intMarkovOrd # This is needed because intMarkovOrd can get lost if a single set of # starting values is used and these values are far from the optima. findstart <- function(ntry=50, seed=8, g, intercepts, extra, constraints=NULL, ftarget=NULL, absorb=4, initial=2, y=1:4, times) { set.seed(seed) m <- numeric(ntry) ints <- matrix(0, nrow=ntry, ncol=length(intercepts), dimnames=list(NULL, names(intercepts))) ex <- matrix(0, nrow=ntry, ncol=length(extra), dimnames=list(NULL, names(extra))) t <- as.numeric(row.names(target)) for(i in 1 : ntry) { ui <- runif(length(intercepts), -1, 1) # random increments to initial ints u <- runif(length(extra), -1, 1) # random increments to initial values ints[i, ] <- intercepts + ui ex[i, ] <- extra + u m[i] <- intMarkovOrd(y, times, initial=initial, absorb=absorb, intercepts=ints[i, ], g=g, target=target, t=t, ftarget=ftarget, extra=ex[i, ], onlycrit=TRUE, constraints=constraints) } pr('Minimum sum of absolute errors:', inline=min(m)) extra <- ex[which.min(m), ] pr('Extra achieving this minimum:', extra) squote() z <- intMarkovOrd(y, times, initial=initial, absorb=absorb, intercepts=intercepts, g=g, target=target, t=t, ftarget=ftarget, extra=extra, constraints=constraints, printsop=FALSE) equote() z } # Function to print or plot state occupancy probabilities for groups # 1 and 2. If a pdf report is being produced, prints and produces # regular ggplot2 graphic. If an html report is produced, does not # print, and produces a plotly interactive graphics # gtype='combined' to create pairs of bars for groups 1, 2 or # gtype='sep' to make separate panels for 1, 2 # Assumes that extra has been updated inside g or is passed # as extra=... to this function sop12 <- function(y, times, initial, absorb=NULL, intercepts, g, which=12, gtype=c('combined', 'sep'), pri=outfmt != 'html', ...) { gtype <- match.arg(gtype) # Convert from matrix to tall and thin data frame cnv <- function(z) data.frame(p = as.vector(z), time = as.numeric(rownames(z)[as.vector(row(z))]), y = y[as.vector(col(z))]) if(which != 2) { s1 <- soprobMarkovOrd(y, times, initial=initial, absorb=absorb, intercepts=intercepts, g=g, X=1, ...) if(pri) pr('Occupancy probabilities for group 1:', round(s1, 3)) z1 <- cnv(s1) if(which == 1) { z1$txt <- with(z1, paste0(y, '
t=', time, '
p=', round(p, 3))) g <- ggplot(z1, aes(x=factor(time), y=p, fill=factor(y), label=txt)) + geom_col() + guides(fill = guide_legend(title='')) + xlab('Time') + ylab('Probability') return(ggp(g)) # ggp defined in sim.Rmd } } if(which != 1) { s2 <- soprobMarkovOrd(y, times, initial=initial, absorb=absorb, intercepts=intercepts, g=g, X=2, ...) if(pri) pr('Occupancy probabilities for group 2:', round(s2, 3)) z2 <- cnv(s2) if(which == 2) { z2$txt <- with(z2, paste0(y, '
t=', time, '
p=', round(p, 3))) g <- ggplot(z2, aes(x=factor(time), y=p, fill=factor(y), label=txt)) + geom_col() + guides(fill = guide_legend(title='')) + xlab('Time') + ylab('Probability') + return(ggp(g)) # ggp defined in sim.Rmd } } # which=12: make combined plot for groups 1 and 2 z <- rbind(data.frame(z1, group=1), data.frame(z2, group=2)) z$group <- factor(z$group) z$txt <- with(z, paste0('Group ', group, '
State:', y, '
t=', time, '
p=', round(p, 3))) g <- if(gtype == 'sep') ggplot(z, aes(x=factor(time), y=p, fill=factor(y), label=txt)) + geom_col() + facet_wrap(~ group) + xlab('Time') else ggplot(z, aes(x=group, y=p, fill=factor(y), label=txt)) + geom_col() + facet_wrap(~ time, nrow=1) + xlab('') + theme(axis.text.x=element_blank()) g <- g + guides(fill = guide_legend(title='')) + ylab('Probability') ggp(g) } # Plot single result from soprobMarkovOrd, along the lines of sop12 # Also prints if using pdf output plotsop <- function(s, pri=outfmt != 'html') { # Convert from matrix to tall and thin data frame cnv <- function(z) data.frame(p = as.vector(z), time = as.numeric(rownames(z)[as.vector(row(z))]), y = dimnames(z)[[2]][as.vector(col(z))]) if(pri) pr('Occupancy probabilities for group 1:', round(s, 3)) z <- cnv(s) z$txt <- with(z, paste0('State:', y, '
t=', time, '
p=', round(p, 3))) g <- ggplot(z, aes(x=factor(time), y=p, fill=factor(y), label=txt)) + geom_col() + guides(fill = guide_legend(title='')) + xlab('Time') + ylab('Probability') ggp(g) } # Check non-proportional-odds induced intercepts to see if in descending order # Only need to vary time since it's the only thing we interact with y chkints <- function(times, intercepts, extra, more=NULL) { x <- NULL for(t in times) { talpha <- intercepts + as.vector(g(1, t=t, gap=2, X=1, extra=extra)) if(any(diff(talpha) > 0.)) x <- rbind(x, data.frame(t=t, alpha=paste(round(talpha, 3), collapse=' '), .more.=more)) } if(length(x)) { names(x) <- ifelse(names(x) == '.more.', names(more), names(x)) pr('Intercepts out of order at the following times due to magnitude of departure from PO:', x) } invisible(length(x) == 0) } # For a given set of objects defining the simulation model, run nsim # simulations for each odds ratio in ors and save the results in # file. Don't re-run the simulations unless one of the arguments to # dosim changed since the last time the file was stored, or # code for one of the two called Hmisc functions has changed. # This is accomplished by comparing hashes from the current call # and the previously stored fit if there was one. # The function assumes that the extra parameter values are stored # in the default value of g function arguments 'extra'. # The nsim simulations are split into batches, with 'cores' batches # run in parallel. The 'runParallel' function in # https://github.com/harrelfe/rscripts uses all cores but one. # # If estcut and/or vestcut are given, after initial results are # printed, any records with Markov point estimate greater in absolute value # than estcut or with variance estimate > vestcut will be set to missing # and basic calculations will be repeated. dosim <- function(y=1:4, times, g, absorb=4, initial, intercepts=intercepts, ors, formula, ppo=NULL, yprevfactor=TRUE, groupContrast=NULL, timecriterion=NULL, cscov=FALSE, coxzph=length(timecriterion) > 0, sstat=NULL, rdsample=NULL, looks=600, nsim=1000, seed=1, maxest=NULL, maxvest=NULL, file) { # See if previous fit had identical inputs and Hmisc code hashobj <- hashCheck(estSeqMarkovOrd, simMarkovOrd, y, times, g, absorb, initial, intercepts, ors, formula, ppo, yprevfactor, groupContrast, timecriterion, cscov, coxzph, sstat, rdsample, looks, nsim, seed, maxest, maxvest, file=file) hash <- hashobj$hash R <- hashobj$result if(! length(R)) { # Run the simulations since something changed or file didn't exist # f runs simulations for one core f <- function(reps, showprogress, core) { pfile <- showprogress(core=core, pr=FALSE) # just get file name r <- estSeqMarkovOrd(y, times=times, initial=initial, absorb=absorb, intercepts=intercepts, parameter=log(ors), looks=looks, g=g, formula=formula, ppo=ppo, cscov=cscov, yprevfactor=yprevfactor, groupContrast=groupContrast, timecriterion=timecriterion, sstat=sstat, nsim=reps, rdsample=rdsample, maxest=maxest, maxvest=maxvest, progress=TRUE, pfile=pfile) sim <- r ## Make simulation numbers distinct for later across-core combining sim$sim <- paste(core, sim$sim) attr(sim, 'etimefreq') <- attr(sim, 'sstat') <- attr(sim, 'lrmcoef') <- NULL R <- list(sim=sim) etimefreq <- attr(r, 'etimefreq') if(length(etimefreq)) R$etimefreq <- etimefreq sstat <- attr(r, 'sstat') if(length(sstat)) { dimnames(sstat)[[1]] <- paste(core, dimnames(sstat)[[1]]) R$sstat <- sstat } lrmcoef <- attr(r, 'lrmcoef') if(length(lrmcoef)) R$lrmcoef <- lrmcoef failures <- attr(r, 'failures') if(length(failures)) R$failures <- failures R } R <- runParallel(f, reps=nsim, seed=seed, along=c(etimefreq=1, sstat=1, lrmcoef=2, failures=1)) sim <- R$sim setDT(sim) sim[, OR := exp(parameter)] setkey(sim, OR) R$sim <- sim ## Condense failure frequencies fa <- R$failures if(length(fa)) { setDT(fa, key='failure') fa <- fa[, .(Freq=sum(Freq)), by=failure] fa <- fa[Freq > 0, ] R$failures <- fa } setattr(R, 'hash', hash) saveRDS(R, file, compress='xz') } fa <- R$failures if(length(fa) && nrow(fa) > 0) pr('Failed simulations', fa) sim <- R$sim cof <- R$lrmcoef cof <- round(apply(cof, c(1, 3), median, na.rm=TRUE), 4) rownames(cof) <- as.character(round(exp(as.numeric(row.names(cof))), 2)) pr(paste('Median of', nsim, 'logistic regression coefficients for each OR'), cof) w <- sim[, .(SD = round(sd (est, na.rm=TRUE), 3), MAD = round(mad(est, na.rm=TRUE), 3), SDest = round(sqrt(median(vest, na.rm=TRUE)), 3)), by=.(OR,look)] pr('Standard deviation of simulated group effects, scaled MAD, and square root of median of estimated variances', w) R } # Simulate a sample of size n from an ordinal Markov model # testsamp assumes that values of extra are stored as default argument values # for g. Returns the correlation matrix. testsamp <- function(n=10000, initial=2, absorb=4, intercepts=intercepts, y=1:4, times) { s <- simMarkovOrd(n=n, y, times, initial=initial, X=c(group=1), absorb=absorb, intercepts=intercepts, g=g, carry=TRUE) # Remember: vector extra is now predefined in g w <- with(s, tapply(y, time, relfreq)) pr(paste('State occupancy proportions from', n, 'samples'), round(do.call(rbind, w), 3)) setDT(s, key=c('id', 'time')) w <- dcast(s, id ~ time, value.var='y') r <- cor(as.matrix(w)[, -1]) p <- dim(r)[1] if(p < 11 && outfmt != 'html') { w <- round(r, 2) w[lower.tri(w)] <- NA w[] <- ifelse(is.na(w), '', format(w)) pr('Correlation matrix', w) } invisible(r) } # Examine and analyze the simulated regression coefficients, inluding # making various graphical displays examSim <- function(R, timedist=TRUE, ORvsHR=TRUE, hist=TRUE, scatter=TRUE, timeevent='Y=1', desc='') { sim <- R$sim setkey(sim, OR, look) cox <- 'lrchisq' %in% names(sim) ph <- 'phchisq' %in% names(sim) caps <- character(0); pobj <- list(); no <- 0 bar <- function(x) mean(x, na.rm=TRUE) nsim <- min(sim[, .(n=length(est)), by=.(OR, look)]$n) pr(paste('Number of successful simulations out of', nsim), sim[, .(n=sum(! is.na(est))), by=.(OR, look)]) ## Check: at last look should equal this: ## apply(R$lrmcoef[,,1], 1, function(x) sum(! is.na(x))) et <- R$etimefreq if(timedist && length(et)) { et <- apply(et, 2:5, sum) dn <- dimnames(et) et <- data.frame(n=as.vector(et), OR = exp(as.numeric(dn[[1]])[as.vector(slice.index(et, 1))]), group = paste('Group', dn[[2]][as.vector(slice.index(et, 2))]), event = dn[[3]][as.vector(slice.index(et, 3))], t = as.numeric(dn[[4]][as.vector(slice.index(et, 4))], levels=dn[[4]])) ang <- if(length(unique(et$t)) > 10) 90 else 0 et$txt <- with(et, paste0(event, '
Group:', group, '
OR=', OR, '
n=', n, '
t=', t)) gp <- ggplot(et, aes(x=factor(t), y=n, fill=event, label=txt)) + geom_col() + facet_grid(OR ~ group) + xlab('t') + theme(axis.text.x=element_text(angle=ang)) no <- no + 1 pobj[[no]] <- ggp(gp, tooltip='text') caps <- c(caps, paste('Time to ', timeevent, 'by group and OR, over simulations')) ORs <- sort(unique(et$OR)) m <- length(ORs) d <- NULL for(or in ORs) { ets <- subset(et, OR == or) km <- survfit(Surv(t, event=='event') ~ group, weights=n, data=ets) d <- rbind(d, data.frame(survfit2df(km), OR=or)) } d$txt <- with(d, paste0('Group:', group, '
OR:', OR, '
t:', time, '
At risk:', nrisk, '
', round(1. - surv, 3))) g <- ggplot(d, aes(x=time, y=1. - surv, color=group, group=group, label=txt)) + geom_step(direction='hv') + facet_wrap(~ OR) + xlab('Day') + ylab('Cumulative Incidence') no <- no + 1 pobj[[no]] <- ggp(g) caps <- c(caps, paste('Cumulative incidence of', timeevent)) # Old way using base graphics if(FALSE) { par(mar=c(2.5, 2.4, 1, 1), mgp=c(1.5, 0.5, 0), bty='l') ## When multiple graphs are produced in one chunk, knitr will ## not allow you to vary the plot sizes, so break into groups ## of 4 paneled plots ORgroups <- if(m > 8) list(ORs[1 : 4], ORs[5 : 8], ORs[-(1 : 8)]) else if(m > 4) list(ORs[1 : 4], ORs[-(1 : 4)]) else list(ORs) for(ors in ORgroups) { m <- length(ors) par(mfrow = c(1 + (m > 1), 1 + (m > 2))) for(or in ors) { ets <- subset(et, OR == or) km <- survfit(Surv(t, event=='event') ~ group, weights=n, data=ets) plot(km, fun='event', col=1:2, xlab='Day', ylab='Cumulative Incidence') if(or == ors[1]) title(sub='Black: group 1; Red: group 2', adj=0, cex.sub=0.7, line=1.5, col.sub='blue') title(sub=paste('OR:', or), line=1.5, cex.sub=0.7, adj=1, col.sub='blue') } } } if(FALSE) { pr('Frequency distribution of times to Y=1 by group') for(pm in dimnames(et)[[1]]) { pr('\nOR:', inline=exp(as.numeric(pm))) print(et[pm, ,]) } } } if(ORvsHR && 'loghr' %in% names(sim)) pr('Comparison of OR and HR', rn(sim[, .(HR=exp(- bar(loghr))), by=OR])) if(hist && cox && ph) { z <- rbind(data.frame(x=sim[, lrchisq], Test='Cox model group comparison'), data.frame(x=sim[, phchisq], Test='Cox Model PH test')) no <- no + 1 pobj[[no]] <- ggp( ggplot(z, aes(x=x)) + geom_histogram(bins=50) + facet_wrap(~ Test) + xlab(expression(chi^2))) caps <- c(caps, 'Histograms of Cox 2-sample and PH test statistics') } if(cox) pr(paste('Power of Cox test at last look for time until', timeevent), rn(sim[, .(power=bar(lrchisq > 3.8415)), by=OR])) if(ph) pr('Proportion of simulations with chi-sq for PH > 3.84', rn(sim[, .(rejectPH=bar(phchisq > 3.8415)), by=OR])) sim[, tchi := est * est / vest] pr('Power of Markov proportional odds model test', sim[, .(power=round(bar(tchi > 3.8415), 2)), by=.(OR, look)] ) if(cox) { rho <- sim[, spearman(tchi, lrchisq)] pr('Spearman rho correlation between Markov and Cox model chi-square:', inline=rn(rho)) } gr <- function(x, y, z) ggfreqScatter(x, y, ylab='PO Model Z Statistic', xlab='Cox Model Z Statistic', by=z, bins=25) + geom_abline(intercept=0, slope=1, col='red', alpha=0.3) + theme(legend.position='bottom', axis.text.x = element_text(angle = 90)) if(scatter && cox) { g <- with(sim, gr(pmin(10, sqrt(lrchisq)), pmin(10, sqrt(tchi)), exp(parameter))) no <- no + 1 pobj[[no]] <- ggp(g, remove='Freq: ') caps <- c(caps, 'Scatter plot of Cox and PO model Z statistics') } s <- R$sstat if(length(s)) { group <- s[, , , 'group'] y <- s[, , , 'y'] dn <- dimnames(y) ss <- data.table( sim = dn[[1]][as.vector(slice.index(y, 1))], OR = exp(as.numeric(dn[[2]])[as.vector(slice.index(y, 2))]), group = as.vector(group), y = as.vector(y), key = c('sim', 'OR', 'group')) tab <- with(ss, as.data.frame(table(group, y, OR))) tab$txt <- with(tab, paste0('Group:', group, '
y:', y, '
', Freq)) g <- ggplot(tab, aes(x=group, y=Freq, label=txt)) + geom_col() + facet_grid(OR ~ y) + xlab('Ventilator/ARDS - Free Days') + theme(axis.text.x=element_blank()) + guides(fill = guide_legend(title='')) + ylab('Frequency') no <- no + 1 pobj[[no]] <- ggp(g) caps <- c(caps, 'Distribution of ventilator/ARDS-free days by group and OR. Left bar of each pair is group 1, right bar is group 2. -1 represents death.') ## For each simulation and true OR compute the p-value from the ## Wilcoxon test, then compute the power of this test for vent-free days h <- function(group, y) wilcox.test(y[group == 1], y[group == 2])$p.value s <- ss[, .(p = h(group, y)), by=.(sim, OR)] pr('Power of Wilcoxon test on vent/ards-free days', s[, .(power=bar(p < 0.05)), by=OR] ) } # All the ggplot2 or plotly plots are saved in list pobj # Render all the plots, each with its own chunk and caption if(no > 0) { if(desc != '') caps <- paste0(caps, '. ', desc, '.') chunkname <- knitr::opts_current$get('label') if(! length(chunkname)) stop('no chunk name') cnames <- paste0(chunkname, 'examsim', letters[1 : no]) markupSpecs$html$mdchunk(robj=pobj, caption=caps, cnames=cnames) } return(invisible()) } # Function to create a data frame suitable for ggplot2 for plotting # Kaplan-Meier estimates from output of survfit (fit) # Put in a data frame suitable for ggplot2/plotly survfit2df <- function(fit) { n <- fit$strata sn <- sub('^.*=', '', names(n)) sl <- rep(sn, n) r <- function(x, prefix) { X <- split(x, sl) X <- lapply(X, prefix) unlist(X) } time <- r(fit$time, function(x) c(0., x)) surv <- r(fit$surv, function(x) c(1., x)) nrisk <- r(fit$n.risk, function(x) c(x[1], x)) group <- r(sl, function(x) c(x[1], x)) data.frame(group=group, time=time, surv=surv, nrisk=nrisk) }