# 1 Introduction

The purpose of this investigation is to study whether a first-order Markov binary logistic model can obtain valid estimates when there are missing measurements, with no adjustment other than interacting measurement or gap times with previous state.

require(rms)
require(data.table)
knitrSet(lang='markdown', w=7, h=7, dev='png', fig.path='png/irreg-')
options(prType='html')

# 2 Simulation Model

Simulate data for 50,000 patients assessed at times 1, 2, â€¦, m where m is a random number between 1 and 20 for each patient. Assume a first-order Markov model with autocorrelation but without an interaction between time and previous state. There is a linear (in the logit) time trend on the outcome.

# Simulate m records for one subject, with covariate value x
# id=numeric ID for subject, y0=initial state 0/1
# a=intercept, b1=x effect (log odds ratio), b2=previous y effect (log OR)
# b3=time effect on 1:m scale
sim <- function(m, id, x, y0, a=0, b1=0.5, b2=0.6, b3=0.05) {
yprev <- y0
y <- integer(m)
r <- qlogis(runif(m))
for(i in 1 : m) {
L <- a + b1 * x + b2 * yprev + b3 * i
yprev <- y[i] <- 1 * (r[i] <= L)
}
cbind(id=id, x=x, time=1 : m, yprev=c(y0, y[-m]), y=y)
}

# Create a dataset for 50,000 subjects, half for each of x=0,1 with
# number of measurements randomly varying from 1 to 20
# All subjects have initial state of 0

n  <- 50000
m  <- sample(1 : 20, n, replace=TRUE)
x  <- sample(0 : 1,  n, replace=TRUE)
y0 <- 0
w  <- matrix(NA, nrow=sum(m), ncol=5)
colnames(w) <- c('id', 'x', 'time', 'yprev', 'y')
ie <- 0
for(i in 1 : n) {
is <- ie + 1
ie <- is + m[i] - 1
y0 <- 0
w[is : ie, ] <- sim(m[i], i, x[i], y0)
}
w <- as.data.frame(w)
setDT(w, key=c('x', 'id'))

Compute state occupancy probabilities using simple proportions

prop <- w[, .(p = mean(y)), by=.(x, time)]
prop
    x time         p
1: 0    1 0.5099673
2: 0    2 0.5934273
3: 0    3 0.6185800
4: 0    4 0.6394838
5: 0    5 0.6512103
6: 0    6 0.6636740
7: 0    7 0.6795389
8: 0    8 0.6874275
9: 0    9 0.7054971
10: 0   10 0.7175125
11: 0   11 0.7272944
12: 0   12 0.7369718
13: 0   13 0.7305550
14: 0   14 0.7626888
15: 0   15 0.7643346
16: 0   16 0.7714015
17: 0   17 0.7929541
18: 0   18 0.7965608
19: 0   19 0.8040593
20: 0   20 0.8146688
21: 1    1 0.6341199
22: 1    2 0.7220043
23: 1    3 0.7499667
24: 1    4 0.7559482
25: 1    5 0.7719272
26: 1    6 0.7808622
27: 1    7 0.7898352
28: 1    8 0.7971238
29: 1    9 0.8009615
30: 1   10 0.8076699
31: 1   11 0.8204653
32: 1   12 0.8296998
33: 1   13 0.8391637
34: 1   14 0.8390962
35: 1   15 0.8511146
36: 1   16 0.8579080
37: 1   17 0.8580504
38: 1   18 0.8717813
39: 1   19 0.8814038
40: 1   20 0.8985849
x time         p
prop[, type:='Empirical']

# 3 Analysis of Regularly-Spaced Points

Fit the model to the complete dataset to make sure the above programming is right. The period of observation varies by patient but within that there are no missing times.

f <- lrm(y ~ x + yprev + time * yprev, data=w)
f

Logistic Regression Model

 lrm(formula = y ~ x + yprev + time * yprev, data = w)

Model Likelihood
Ratio Test
Discrimination
Indexes
Rank Discrim.
Indexes
Obsâ€ƒ526277 LR Ï‡2â€ƒ26482.02 R2â€ƒ0.071 Câ€ƒ0.642
0â€ƒ146306 d.f.â€ƒ4 gâ€ƒ0.584 Dxyâ€ƒ0.283
1â€ƒ379971 Pr(>Ï‡2)â€ƒ<0.0001 grâ€ƒ1.794 Î³â€ƒ0.290
max |âˆ‚log L/âˆ‚Î²|â€ƒ4Ã—10-10 gpâ€ƒ0.114 Ï„aâ€ƒ0.114
Brierâ€ƒ0.191
Î² S.E. Wald Z Pr(>|Z|)
Intercept Â -0.0023 Â 0.0080 -0.28 0.7775
x Â  0.5071 Â 0.0064 79.35 <0.0001
yprev Â  0.5982 Â 0.0112 53.21 <0.0001
time Â  0.0507 Â 0.0011 45.90 <0.0001
yprev Ã— time Â -0.0003 Â 0.0014 -0.23 0.8180

Use the fitted model to estimate the state occupancy probabilities for times 1, 2, â€¦, 20 for x=0,1

# Compute unconditional probabilities
# Output is a matrix with s rows corresponding to t=times(1), times(2), ...
# Each row has columns corresponding to the probabilities of being in each of the states
# times(1) must be 1
#
# fit is the model fit which must contain the variable named day and the
# previous state, which must be named by the value of pvarname
# data is a data frame or data table that contains covariate settings
# other than day and what's in pvarname.  data has one row.
# times is the vector of days for which to evaluate state occupancy probabilities
# ylevels are the ordered levels of the outcome variable

uprob <- function(fit, data, times, ylevels, pvarname='yprev', absorb=NULL) {
k <- length(ylevels)
s <- length(times)
P <- matrix(NA, nrow=s, ncol=k)
colnames(P) <- as.character(ylevels)
rownames(P) <- as.character(times)
# Never uncondition on initial state
data$time <- times[1] p <- predict(fit, data, type='fitted.ind') if(k == 2) p <- c(1. - p, p) # predict doesn't predict all levels if Y binary P[1, ] <- p # Matrix of conditional probabilities of Y conditioning on previous time Y # Columns = k conditional probabilities conditional on a single previous state # Rows = all possible previous states data <- as.list(data) data[[pvarname]] <- as.character(setdiff(ylevels, absorb)) # don't request estimates after absorbing state edata <- expand.grid(data) for(it in 2 : s) { edata$time <- times[it]
pp <- predict(fit, edata, type='fitted.ind')
if(k == 2) pp <- cbind(1. - pp, pp)
cp <- if(! length(absorb)) pp
else
if(ylevels[1] == absorb) rbind(c(1., rep(0., k - 1)), pp)
else
if(ylevels[k] == absorb) rbind(pp, c(rep(0., k - 1), 1.))
# Compute unconditional probabilities of being in all possible states at current time t
P[it, ] <- t(cp) %*% P[it - 1, ]
}
P
}

dat <- data.frame(yprev=0)
times <- 1 : 20
xlev  <- 0 : 1

U <- NULL
for(x in xlev) {   # separately for treatments
dat$x <- x u <- uprob(f, dat, times, 0:1) cat('\nx=', x, '\n\n') print(round(u, 3)) U <- rbind(U, data.frame(x, time=times, p=u[, 2])) }  x= 0 0 1 1 0.488 0.512 2 0.402 0.598 3 0.378 0.622 4 0.363 0.637 5 0.350 0.650 6 0.337 0.663 7 0.324 0.676 8 0.312 0.688 9 0.300 0.700 10 0.288 0.712 11 0.276 0.724 12 0.265 0.735 13 0.254 0.746 14 0.243 0.757 15 0.233 0.767 16 0.223 0.777 17 0.214 0.786 18 0.204 0.796 19 0.195 0.805 20 0.187 0.813 x= 1 0 1 1 0.365 0.635 2 0.275 0.725 3 0.255 0.745 4 0.243 0.757 5 0.233 0.767 6 0.223 0.777 7 0.213 0.787 8 0.204 0.796 9 0.195 0.805 10 0.186 0.814 11 0.178 0.822 12 0.170 0.830 13 0.162 0.838 14 0.155 0.845 15 0.148 0.852 16 0.141 0.859 17 0.135 0.865 18 0.129 0.871 19 0.123 0.877 20 0.117 0.883 setDT(U) U[, type := 'Model'] z <- rbind(prop, U) setkey(z, type, x, time) ggplot(z, aes(x=time, y=p, color=factor(x), linetype=type)) + geom_line() + xlab('t') + ylab('P(Y=1)') + labs(color='x', linetype='') # 4 Analysis of Irregularly-Spaced Points Take the large simulated dataset used above and randomly sample half of all the person-periods. Recompute the lagged outcome so that it only knows about the most recent outcome that was actually sampled. ws <- w[runif(nrow(w)) < 0.5, ] setkey(ws, x, time) ws[, yprev := shift(y), by=id] ws[is.na(yprev), yprev := 0] # all patients had initial state 0 ws[, tprev := shift(time), by=id] ws[is.na(tprev), tprev := 0] ws[, tgap := time - tprev] table(ws$tgap)

1      2      3      4      5      6      7      8      9     10     11
144082  65759  29666  13162   5855   2655   1188    502    201     84     49
12     13     14
15      3      2 
# View(ws[1:1000, ])
g <- lrm(y ~ x + yprev + (rcs(time, 5) + rcs(tgap, 5)) * yprev, data=ws)
g

Logistic Regression Model

 lrm(formula = y ~ x + yprev + (rcs(time, 5) + rcs(tgap, 5)) *
yprev, data = ws)

Model Likelihood
Ratio Test
Discrimination
Indexes
Rank Discrim.
Indexes
Obsâ€ƒ263223 LR Ï‡2â€ƒ11579.56 R2â€ƒ0.062 Câ€ƒ0.632
0â€ƒ73459 d.f.â€ƒ14 gâ€ƒ0.543 Dxyâ€ƒ0.265
1â€ƒ189764 Pr(>Ï‡2)â€ƒ<0.0001 grâ€ƒ1.722 Î³â€ƒ0.268
max |âˆ‚log L/âˆ‚Î²|â€ƒ9Ã—10-11 gpâ€ƒ0.106 Ï„aâ€ƒ0.107
Brierâ€ƒ0.192
Î² S.E. Wald Z Pr(>|Z|)
Intercept Â -0.3738 Â 0.0246 -15.19 <0.0001
x Â  0.5318 Â 0.0090 59.06 <0.0001
yprev Â  1.1107 Â 0.0701 15.84 <0.0001
time Â  0.0438 Â 0.0111 3.96 <0.0001
timeâ€™ Â  0.1694 Â 0.1065 1.59 0.1119
timeâ€™â€™ Â -0.5254 Â 0.3198 -1.64 0.1004
timeâ€™â€™â€™ Â  0.5791 Â 0.3785 1.53 0.1260
tgap Â  0.3527 Â 0.0162 21.77 <0.0001
tgapâ€™ Â -0.3870 Â 0.0239 -16.21 <0.0001
yprev Ã— time Â  0.0184 Â 0.0252 0.73 0.4647
yprev Ã— timeâ€™ Â -0.2064 Â 0.1846 -1.12 0.2636
yprev Ã— timeâ€™â€™ Â  0.5355 Â 0.5037 1.06 0.2877
yprev Ã— timeâ€™â€™â€™ Â -0.4275 Â 0.5258 -0.81 0.4162
yprev Ã— tgap Â -0.5429 Â 0.0216 -25.11 <0.0001
yprev Ã— tgapâ€™ Â  0.6199 Â 0.0327 18.96 <0.0001
anova(g)
 Ï‡2 d.f. P Wald Statistics for y x 3487.68 1 <0.0001 yprev (Factor+Higher Order Factors) 1794.23 7 <0.0001 â€ƒAll Interactions 810.67 6 <0.0001 time (Factor+Higher Order Factors) 2475.28 8 <0.0001 â€ƒAll Interactions 6.86 4 0.1434 â€ƒNonlinear (Factor+Higher Order Factors) 8.81 6 0.1847 tgap (Factor+Higher Order Factors) 798.88 4 <0.0001 â€ƒAll Interactions 766.31 2 <0.0001 â€ƒNonlinear (Factor+Higher Order Factors) 371.47 2 <0.0001 yprev Ã— time (Factor+Higher Order Factors) 6.86 4 0.1434 â€ƒNonlinear 4.03 3 0.2586 â€ƒNonlinear Interaction : f(A,B) vs.Â AB 4.03 3 0.2586 yprev Ã— tgap (Factor+Higher Order Factors) 766.31 2 <0.0001 â€ƒNonlinear 359.52 1 <0.0001 â€ƒNonlinear Interaction : f(A,B) vs.Â AB 359.52 1 <0.0001 TOTAL NONLINEAR 395.99 8 <0.0001 TOTAL INTERACTION 810.67 6 <0.0001 TOTAL NONLINEAR + INTERACTION 924.61 10 <0.0001 TOTAL 11015.88 14 <0.0001

Absolute time is important, as it should be, but sensibly does not have a strong interaction with previous state. The gap time is very important, is very nonlinear, and interacts extremely strongly with previous state. From the coefficients one can see that longer gap times discounts the effect of previous state. Letâ€™s plot the estimated effect of gap time, by previous state.

dd <- datadist(ws); options(datadist='dd')
ggplot(Predict(g, tgap, yprev))