# Function to start a verbatim quote if results='asis' in effect
# Works for all output formats in R markdown
squote <- function() { # start quote
r <- knitr::opts_current$get('results')
if(length(r) && r == 'asis') cat('\n```')
invisible()
}
# Function to close the quote if needed
equote <- function() {
r <- knitr::opts_current$get('results')
if(length(r) && r == 'asis') cat('```\n\n')
invisible()
}
# Function to print an object or inline text or both,
# verbatim quoting if needed (results='asis') in effect in chunk
# Inline text is printed with cat()
# Fractional numbers are rounded to the nearest dec digits for data frames
pr <- function(x='', obj=NULL, inline=NULL, dec=3) {
r <- knitr::opts_current$get('results')
asis <- length(r) && r == 'asis'
if(asis) cat('\n```')
if(any(x != '') || length(inline))
cat('\n', x, if(any(x != '')) ' ', inline, '\n\n', sep='')
if(length(obj)) {
if(is.data.frame(obj))
for(i in 1 : length(obj)) {
x <- obj[[i]]
if(is.numeric(x)) obj[[i]] <- round(x, dec)
}
print(obj, quote=FALSE)
}
if(asis) cat('```\n\n')
invisible()
}
# This function is now in Hmisc so just do:
# pr <- markupSpecs$markdown$pr
pst <- function(x) paste(x, collapse=', ')
rn <- function(x) round(x, 2)
relfreq <- function(x) table(x) / length(x)
# Randomly tries different starting values given to intMarkovOrd
# This is needed because intMarkovOrd can get lost if a single set of
# starting values is used and these values are far from the optima.
findstart <- function(ntry=50, seed=8, g, intercepts, extra, constraints=NULL,
ftarget=NULL, absorb=4, initial=2, y=1:4, times) {
set.seed(seed)
m <- numeric(ntry)
ints <- matrix(0, nrow=ntry, ncol=length(intercepts),
dimnames=list(NULL, names(intercepts)))
ex <- matrix(0, nrow=ntry, ncol=length(extra),
dimnames=list(NULL, names(extra)))
t <- as.numeric(row.names(target))
for(i in 1 : ntry) {
ui <- runif(length(intercepts), -1, 1) # random increments to initial ints
u <- runif(length(extra), -1, 1) # random increments to initial values
ints[i, ] <- intercepts + ui
ex[i, ] <- extra + u
m[i] <- intMarkovOrd(y, times, initial=initial, absorb=absorb,
intercepts=ints[i, ], g=g, target=target,
t=t, ftarget=ftarget, extra=ex[i, ],
onlycrit=TRUE, constraints=constraints)
}
pr('Minimum sum of absolute errors:', inline=min(m))
extra <- ex[which.min(m), ]
pr('Extra achieving this minimum:', extra)
squote()
z <- intMarkovOrd(y, times, initial=initial, absorb=absorb,
intercepts=intercepts, g=g, target=target,
t=t, ftarget=ftarget, extra=extra, constraints=constraints,
printsop=FALSE)
equote()
z
}
# Function to print or plot state occupancy probabilities for groups
# 1 and 2. If a pdf report is being produced, prints and produces
# regular ggplot2 graphic. If an html report is produced, does not
# print, and produces a plotly interactive graphics
# gtype='combined' to create pairs of bars for groups 1, 2 or
# gtype='sep' to make separate panels for 1, 2
# Assumes that extra has been updated inside g or is passed
# as extra=... to this function
sop12 <- function(y, times, initial, absorb=NULL, intercepts, g,
which=12, gtype=c('combined', 'sep'),
pri=outfmt != 'html', ...) {
gtype <- match.arg(gtype)
# Convert from matrix to tall and thin data frame
cnv <- function(z)
data.frame(p = as.vector(z),
time = as.numeric(rownames(z)[as.vector(row(z))]),
y = y[as.vector(col(z))])
if(which != 2) {
s1 <- soprobMarkovOrd(y, times, initial=initial, absorb=absorb,
intercepts=intercepts, g=g, X=1, ...)
if(pri)
pr('Occupancy probabilities for group 1:', round(s1, 3))
z1 <- cnv(s1)
if(which == 1) {
z1$txt <- with(z1, paste0(y, '
t=', time, '
p=', round(p, 3)))
g <- ggplot(z1, aes(x=factor(time), y=p, fill=factor(y), label=txt)) +
geom_col() + guides(fill = guide_legend(title='')) +
xlab('Time') + ylab('Probability')
return(ggp(g)) # ggp defined in sim.Rmd
}
}
if(which != 1) {
s2 <- soprobMarkovOrd(y, times, initial=initial, absorb=absorb,
intercepts=intercepts, g=g, X=2, ...)
if(pri) pr('Occupancy probabilities for group 2:', round(s2, 3))
z2 <- cnv(s2)
if(which == 2) {
z2$txt <- with(z2, paste0(y, '
t=', time, '
p=', round(p, 3)))
g <- ggplot(z2, aes(x=factor(time), y=p, fill=factor(y), label=txt)) +
geom_col() + guides(fill = guide_legend(title='')) +
xlab('Time') + ylab('Probability') +
return(ggp(g)) # ggp defined in sim.Rmd
}
}
# which=12: make combined plot for groups 1 and 2
z <- rbind(data.frame(z1, group=1), data.frame(z2, group=2))
z$group <- factor(z$group)
z$txt <- with(z, paste0('Group ', group, '
State:', y,
'
t=', time, '
p=', round(p, 3)))
g <- if(gtype == 'sep')
ggplot(z, aes(x=factor(time), y=p, fill=factor(y), label=txt)) +
geom_col() + facet_wrap(~ group) + xlab('Time')
else
ggplot(z, aes(x=group, y=p, fill=factor(y), label=txt)) +
geom_col() + facet_wrap(~ time, nrow=1) + xlab('') +
theme(axis.text.x=element_blank())
g <- g + guides(fill = guide_legend(title='')) + ylab('Probability')
ggp(g)
}
# Plot single result from soprobMarkovOrd, along the lines of sop12
# Also prints if using pdf output
plotsop <- function(s, pri=outfmt != 'html') {
# Convert from matrix to tall and thin data frame
cnv <- function(z)
data.frame(p = as.vector(z),
time = as.numeric(rownames(z)[as.vector(row(z))]),
y = dimnames(z)[[2]][as.vector(col(z))])
if(pri) pr('Occupancy probabilities for group 1:', round(s, 3))
z <- cnv(s)
z$txt <- with(z, paste0('State:', y, '
t=', time, '
p=', round(p, 3)))
g <- ggplot(z, aes(x=factor(time), y=p, fill=factor(y), label=txt)) +
geom_col() + guides(fill = guide_legend(title='')) +
xlab('Time') + ylab('Probability')
ggp(g)
}
# Check non-proportional-odds induced intercepts to see if in descending order
# Only need to vary time since it's the only thing we interact with y
chkints <- function(times, intercepts, extra, more=NULL) {
x <- NULL
for(t in times) {
talpha <- intercepts + as.vector(g(1, t=t, gap=2, X=1, extra=extra))
if(any(diff(talpha) > 0.))
x <- rbind(x, data.frame(t=t, alpha=paste(round(talpha, 3), collapse=' '), .more.=more))
}
if(length(x)) {
names(x) <- ifelse(names(x) == '.more.', names(more), names(x))
pr('Intercepts out of order at the following times due to magnitude of departure from PO:', x)
}
invisible(length(x) == 0)
}
# For a given set of objects defining the simulation model, run nsim
# simulations for each odds ratio in ors and save the results in
# file. Don't re-run the simulations unless one of the arguments to
# dosim changed since the last time the file was stored, or
# code for one of the two called Hmisc functions has changed.
# This is accomplished by comparing hashes from the current call
# and the previously stored fit if there was one.
# The function assumes that the extra parameter values are stored
# in the default value of g function arguments 'extra'.
# The nsim simulations are split into batches, with 'cores' batches
# run in parallel. The 'runParallel' function in
# https://github.com/harrelfe/rscripts uses all cores but one.
#
# If estcut and/or vestcut are given, after initial results are
# printed, any records with Markov point estimate greater in absolute value
# than estcut or with variance estimate > vestcut will be set to missing
# and basic calculations will be repeated.
dosim <- function(y=1:4, times,
g, absorb=4, initial,
intercepts=intercepts,
ors, formula, ppo=NULL,
yprevfactor=TRUE, groupContrast=NULL,
timecriterion=NULL, cscov=FALSE,
coxzph=length(timecriterion) > 0,
sstat=NULL,
rdsample=NULL,
looks=600, nsim=1000, seed=1,
maxest=NULL, maxvest=NULL, file) {
# See if previous fit had identical inputs and Hmisc code
hashobj <-
hashCheck(estSeqMarkovOrd, simMarkovOrd,
y, times, g, absorb, initial, intercepts, ors, formula, ppo,
yprevfactor, groupContrast, timecriterion, cscov,
coxzph, sstat, rdsample, looks, nsim, seed, maxest, maxvest, file=file)
hash <- hashobj$hash
R <- hashobj$result
if(! length(R)) {
# Run the simulations since something changed or file didn't exist
# f runs simulations for one core
f <- function(reps, showprogress, core) {
pfile <- showprogress(core=core, pr=FALSE) # just get file name
r <-
estSeqMarkovOrd(y, times=times, initial=initial, absorb=absorb,
intercepts=intercepts,
parameter=log(ors), looks=looks, g=g,
formula=formula, ppo=ppo, cscov=cscov,
yprevfactor=yprevfactor, groupContrast=groupContrast,
timecriterion=timecriterion, sstat=sstat, nsim=reps,
rdsample=rdsample,
maxest=maxest, maxvest=maxvest,
progress=TRUE, pfile=pfile)
sim <- r
## Make simulation numbers distinct for later across-core combining
sim$sim <- paste(core, sim$sim)
attr(sim, 'etimefreq') <- attr(sim, 'sstat') <-
attr(sim, 'lrmcoef') <- NULL
R <- list(sim=sim)
etimefreq <- attr(r, 'etimefreq')
if(length(etimefreq)) R$etimefreq <- etimefreq
sstat <- attr(r, 'sstat')
if(length(sstat)) {
dimnames(sstat)[[1]] <- paste(core, dimnames(sstat)[[1]])
R$sstat <- sstat
}
lrmcoef <- attr(r, 'lrmcoef')
if(length(lrmcoef)) R$lrmcoef <- lrmcoef
failures <- attr(r, 'failures')
if(length(failures)) R$failures <- failures
R
}
R <- runParallel(f, reps=nsim, seed=seed,
along=c(etimefreq=1, sstat=1, lrmcoef=2, failures=1))
sim <- R$sim
setDT(sim)
sim[, OR := exp(parameter)]
setkey(sim, OR)
R$sim <- sim
## Condense failure frequencies
fa <- R$failures
if(length(fa)) {
setDT(fa, key='failure')
fa <- fa[, .(Freq=sum(Freq)), by=failure]
fa <- fa[Freq > 0, ]
R$failures <- fa
}
setattr(R, 'hash', hash)
saveRDS(R, file, compress='xz')
}
fa <- R$failures
if(length(fa) && nrow(fa) > 0) pr('Failed simulations', fa)
sim <- R$sim
cof <- R$lrmcoef
cof <- round(apply(cof, c(1, 3), median, na.rm=TRUE), 4)
rownames(cof) <- as.character(round(exp(as.numeric(row.names(cof))), 2))
pr(paste('Median of', nsim,
'logistic regression coefficients for each OR'), cof)
w <- sim[, .(SD = round(sd (est, na.rm=TRUE), 3),
MAD = round(mad(est, na.rm=TRUE), 3),
SDest = round(sqrt(median(vest, na.rm=TRUE)), 3)), by=.(OR,look)]
pr('Standard deviation of simulated group effects, scaled MAD, and square root of median of estimated variances',
w)
R
}
# Simulate a sample of size n from an ordinal Markov model
# testsamp assumes that values of extra are stored as default argument values
# for g. Returns the correlation matrix.
testsamp <- function(n=10000, initial=2, absorb=4,
intercepts=intercepts, y=1:4, times) {
s <- simMarkovOrd(n=n, y, times, initial=initial, X=c(group=1),
absorb=absorb, intercepts=intercepts, g=g, carry=TRUE)
# Remember: vector extra is now predefined in g
w <- with(s, tapply(y, time, relfreq))
pr(paste('State occupancy proportions from', n, 'samples'),
round(do.call(rbind, w), 3))
setDT(s, key=c('id', 'time'))
w <- dcast(s, id ~ time, value.var='y')
r <- cor(as.matrix(w)[, -1])
p <- dim(r)[1]
if(p < 11 && outfmt != 'html') {
w <- round(r, 2)
w[lower.tri(w)] <- NA
w[] <- ifelse(is.na(w), '', format(w))
pr('Correlation matrix', w)
}
invisible(r)
}
# Examine and analyze the simulated regression coefficients, inluding
# making various graphical displays
examSim <- function(R, timedist=TRUE, ORvsHR=TRUE,
hist=TRUE, scatter=TRUE, timeevent='Y=1', desc='') {
sim <- R$sim
setkey(sim, OR, look)
cox <- 'lrchisq' %in% names(sim)
ph <- 'phchisq' %in% names(sim)
caps <- character(0); pobj <- list(); no <- 0
bar <- function(x) mean(x, na.rm=TRUE)
nsim <- min(sim[, .(n=length(est)), by=.(OR, look)]$n)
pr(paste('Number of successful simulations out of', nsim),
sim[, .(n=sum(! is.na(est))), by=.(OR, look)])
## Check: at last look should equal this:
## apply(R$lrmcoef[,,1], 1, function(x) sum(! is.na(x)))
et <- R$etimefreq
if(timedist && length(et)) {
et <- apply(et, 2:5, sum)
dn <- dimnames(et)
et <-
data.frame(n=as.vector(et),
OR = exp(as.numeric(dn[[1]])[as.vector(slice.index(et, 1))]),
group = paste('Group', dn[[2]][as.vector(slice.index(et, 2))]),
event = dn[[3]][as.vector(slice.index(et, 3))],
t = as.numeric(dn[[4]][as.vector(slice.index(et, 4))],
levels=dn[[4]]))
ang <- if(length(unique(et$t)) > 10) 90 else 0
et$txt <- with(et, paste0(event, '
Group:', group, '
OR=', OR,
'
n=', n, '
t=', t))
gp <- ggplot(et, aes(x=factor(t), y=n, fill=event, label=txt)) +
geom_col() + facet_grid(OR ~ group) + xlab('t') +
theme(axis.text.x=element_text(angle=ang))
no <- no + 1
pobj[[no]] <- ggp(gp, tooltip='text')
caps <- c(caps, paste('Time to ', timeevent,
'by group and OR, over simulations'))
ORs <- sort(unique(et$OR))
m <- length(ORs)
d <- NULL
for(or in ORs) {
ets <- subset(et, OR == or)
km <- survfit(Surv(t, event=='event') ~ group, weights=n, data=ets)
d <- rbind(d, data.frame(survfit2df(km), OR=or))
}
d$txt <- with(d, paste0('Group:', group,
'
OR:', OR,
'
t:', time,
'
At risk:', nrisk,
'
', round(1. - surv, 3)))
g <- ggplot(d, aes(x=time, y=1. - surv, color=group,
group=group, label=txt)) +
geom_step(direction='hv') +
facet_wrap(~ OR) + xlab('Day') + ylab('Cumulative Incidence')
no <- no + 1
pobj[[no]] <- ggp(g)
caps <- c(caps, paste('Cumulative incidence of', timeevent))
# Old way using base graphics
if(FALSE) {
par(mar=c(2.5, 2.4, 1, 1), mgp=c(1.5, 0.5, 0), bty='l')
## When multiple graphs are produced in one chunk, knitr will
## not allow you to vary the plot sizes, so break into groups
## of 4 paneled plots
ORgroups <- if(m > 8) list(ORs[1 : 4], ORs[5 : 8], ORs[-(1 : 8)])
else
if(m > 4) list(ORs[1 : 4], ORs[-(1 : 4)]) else list(ORs)
for(ors in ORgroups) {
m <- length(ors)
par(mfrow = c(1 + (m > 1), 1 + (m > 2)))
for(or in ors) {
ets <- subset(et, OR == or)
km <- survfit(Surv(t, event=='event') ~ group, weights=n, data=ets)
plot(km, fun='event', col=1:2, xlab='Day', ylab='Cumulative Incidence')
if(or == ors[1])
title(sub='Black: group 1; Red: group 2', adj=0, cex.sub=0.7,
line=1.5, col.sub='blue')
title(sub=paste('OR:', or), line=1.5, cex.sub=0.7, adj=1,
col.sub='blue')
}
}
}
if(FALSE) {
pr('Frequency distribution of times to Y=1 by group')
for(pm in dimnames(et)[[1]]) {
pr('\nOR:', inline=exp(as.numeric(pm)))
print(et[pm, ,])
}
}
}
if(ORvsHR && 'loghr' %in% names(sim))
pr('Comparison of OR and HR',
rn(sim[, .(HR=exp(- bar(loghr))), by=OR]))
if(hist && cox && ph) {
z <- rbind(data.frame(x=sim[, lrchisq], Test='Cox model group comparison'),
data.frame(x=sim[, phchisq], Test='Cox Model PH test'))
no <- no + 1
pobj[[no]] <- ggp(
ggplot(z, aes(x=x)) + geom_histogram(bins=50) + facet_wrap(~ Test) +
xlab(expression(chi^2)))
caps <- c(caps, 'Histograms of Cox 2-sample and PH test statistics')
}
if(cox)
pr(paste('Power of Cox test at last look for time until', timeevent),
rn(sim[, .(power=bar(lrchisq > 3.8415)), by=OR]))
if(ph)
pr('Proportion of simulations with chi-sq for PH > 3.84',
rn(sim[, .(rejectPH=bar(phchisq > 3.8415)), by=OR]))
sim[, tchi := est * est / vest]
pr('Power of Markov proportional odds model test',
sim[, .(power=round(bar(tchi > 3.8415), 2)), by=.(OR, look)] )
if(cox) {
rho <- sim[, spearman(tchi, lrchisq)]
pr('Spearman rho correlation between Markov and Cox model chi-square:',
inline=rn(rho))
}
gr <- function(x, y, z)
ggfreqScatter(x, y, ylab='PO Model Z Statistic',
xlab='Cox Model Z Statistic',
by=z, bins=25) +
geom_abline(intercept=0, slope=1, col='red', alpha=0.3) +
theme(legend.position='bottom', axis.text.x = element_text(angle = 90))
if(scatter && cox) {
g <- with(sim, gr(pmin(10, sqrt(lrchisq)), pmin(10, sqrt(tchi)),
exp(parameter)))
no <- no + 1
pobj[[no]] <- ggp(g, remove='Freq: ')
caps <- c(caps, 'Scatter plot of Cox and PO model Z statistics')
}
s <- R$sstat
if(length(s)) {
group <- s[, , , 'group']
y <- s[, , , 'y']
dn <- dimnames(y)
ss <- data.table(
sim = dn[[1]][as.vector(slice.index(y, 1))],
OR = exp(as.numeric(dn[[2]])[as.vector(slice.index(y, 2))]),
group = as.vector(group),
y = as.vector(y),
key = c('sim', 'OR', 'group'))
tab <- with(ss, as.data.frame(table(group, y, OR)))
tab$txt <- with(tab, paste0('Group:', group, '
y:', y, '
', Freq))
g <- ggplot(tab, aes(x=group, y=Freq, label=txt)) +
geom_col() + facet_grid(OR ~ y) +
xlab('Ventilator/ARDS - Free Days') +
theme(axis.text.x=element_blank()) +
guides(fill = guide_legend(title='')) + ylab('Frequency')
no <- no + 1
pobj[[no]] <- ggp(g)
caps <- c(caps, 'Distribution of ventilator/ARDS-free days by group and OR. Left bar of each pair is group 1, right bar is group 2. -1 represents death.')
## For each simulation and true OR compute the p-value from the
## Wilcoxon test, then compute the power of this test for vent-free days
h <- function(group, y) wilcox.test(y[group == 1], y[group == 2])$p.value
s <- ss[, .(p = h(group, y)), by=.(sim, OR)]
pr('Power of Wilcoxon test on vent/ards-free days',
s[, .(power=bar(p < 0.05)), by=OR] )
}
# All the ggplot2 or plotly plots are saved in list pobj
# Render all the plots, each with its own chunk and caption
if(no > 0) {
if(desc != '') caps <- paste0(caps, '. ', desc, '.')
chunkname <- knitr::opts_current$get('label')
if(! length(chunkname)) stop('no chunk name')
cnames <- paste0(chunkname, 'examsim', letters[1 : no])
markupSpecs$html$mdchunk(robj=pobj, caption=caps, cnames=cnames)
}
return(invisible())
}
# Function to create a data frame suitable for ggplot2 for plotting
# Kaplan-Meier estimates from output of survfit (fit)
# Put in a data frame suitable for ggplot2/plotly
survfit2df <- function(fit) {
n <- fit$strata
sn <- sub('^.*=', '', names(n))
sl <- rep(sn, n)
r <- function(x, prefix) {
X <- split(x, sl)
X <- lapply(X, prefix)
unlist(X)
}
time <- r(fit$time, function(x) c(0., x))
surv <- r(fit$surv, function(x) c(1., x))
nrisk <- r(fit$n.risk, function(x) c(x[1], x))
group <- r(sl, function(x) c(x[1], x))
data.frame(group=group, time=time, surv=surv, nrisk=nrisk)
}