# Marginal Structural Models - ltmleMSM()

## Marginal Structural Models (MSMs) - Multiple regimes with a single outcome

In this example there are 5 time points with a treatment and time varying covariate. At each, treatment can be 0 or 1. There is a single outcome Y.

L_0 L_1 A_1 L_2 A_2 … L_5 A_5 Y

There are 2^5 = 32 regimes of interest. Some of these may have limited support because there are few patients who follow a particular regime. We pool over all regimes using a working marginal structural model. In this example we want to know the effect of time on treatment on Y. We include time.on.treatment and time.on.treatment^2.

rexpit <- function(x) rbinom(n=length(x), size=1, prob=plogis(x))
n <- 5000
time.points <- 5
prev.L <- rnorm(n)
prev.A <- rep(0, n)
sum.A <- rep(0, n)
data <- data.frame(L_0 = prev.L)
for (t in 1:time.points) {
L <- 0.1 * prev.L + 0.3 * prev.A + rnorm(n)
A <- rexpit(L)

data1 <- data.frame(L, A)
names(data1) <- paste0(c("L_", "A_"), t)
data <- cbind(data, data1)

prev.A <- A
prev.L <- L

sum.A <- sum.A + A
}
data$Y <- rexpit(sum.A / time.points + L) head(data) #> L_0 L_1 A_1 L_2 A_2 L_3 A_3 L_4 A_4 #> 1 -0.7480303 -0.1865826 0 0.4469474 1 0.12149207 1 -1.76985617 0 #> 2 0.6557836 -0.4355446 1 0.8469899 0 -1.07773708 0 -0.98969343 0 #> 3 1.2270065 -0.9697197 0 -1.6375742 0 0.03251294 1 0.07680317 0 #> 4 -1.6460771 -1.7278791 0 -0.5303850 0 -1.25567621 0 0.04886727 1 #> 5 0.2744846 -1.1419435 0 0.9679383 1 -1.49024538 0 0.15774825 1 #> 6 -2.0300750 -0.1996350 0 0.8142871 1 0.82830073 0 -0.87578580 0 #> L_5 A_5 Y #> 1 0.7097768 1 1 #> 2 1.4101966 0 1 #> 3 -0.6878611 0 0 #> 4 -0.6242420 0 0 #> 5 1.5856941 0 1 #> 6 -0.2540231 0 0 regime.matrix <- as.matrix(expand.grid(rep(list(0:1), time.points))) dim(regime.matrix) #>  32 5 head(regime.matrix, 20) #> Var1 Var2 Var3 Var4 Var5 #> [1,] 0 0 0 0 0 #> [2,] 1 0 0 0 0 #> [3,] 0 1 0 0 0 #> [4,] 1 1 0 0 0 #> [5,] 0 0 1 0 0 #> [6,] 1 0 1 0 0 #> [7,] 0 1 1 0 0 #> [8,] 1 1 1 0 0 #> [9,] 0 0 0 1 0 #> [10,] 1 0 0 1 0 #> [11,] 0 1 0 1 0 #> [12,] 1 1 0 1 0 #> [13,] 0 0 1 1 0 #> [14,] 1 0 1 1 0 #> [15,] 0 1 1 1 0 #> [16,] 1 1 1 1 0 #> [17,] 0 0 0 0 1 #> [18,] 1 0 0 0 1 #> [19,] 0 1 0 0 1 #> [20,] 1 1 0 0 1 num.regimes <- 2^time.points regimes <- array(dim = c(n, time.points, num.regimes)) #n x numAnodes x numRegimes = n x time.points x 2^time.points summary.measures <- array(dim = c(num.regimes, 1, 1)) #numRegimes x num.summary.measures x num.final.Ynodes = 2^time.points x 1 x 1 for (i in 1:num.regimes) { regimes[, , i] <- matrix(regime.matrix[i, ], byrow = TRUE, nrow = n, ncol = time.points) summary.measures[i, 1, 1] <- sum(regime.matrix[i, ]) } colnames(summary.measures) <- "time.on.treatment" regimes[1:3, , 1:3] #> , , 1 #> #> [,1] [,2] [,3] [,4] [,5] #> [1,] 0 0 0 0 0 #> [2,] 0 0 0 0 0 #> [3,] 0 0 0 0 0 #> #> , , 2 #> #> [,1] [,2] [,3] [,4] [,5] #> [1,] 1 0 0 0 0 #> [2,] 1 0 0 0 0 #> [3,] 1 0 0 0 0 #> #> , , 3 #> #> [,1] [,2] [,3] [,4] [,5] #> [1,] 0 1 0 0 0 #> [2,] 0 1 0 0 0 #> [3,] 0 1 0 0 0 summary.measures #> , , 1 #> #> time.on.treatment #> [1,] 0 #> [2,] 1 #> [3,] 1 #> [4,] 2 #> [5,] 1 #> [6,] 2 #> [7,] 2 #> [8,] 3 #> [9,] 1 #> [10,] 2 #> [11,] 2 #> [12,] 3 #> [13,] 2 #> [14,] 3 #> [15,] 3 #> [16,] 4 #> [17,] 1 #> [18,] 2 #> [19,] 2 #> [20,] 3 #> [21,] 2 #> [22,] 3 #> [23,] 3 #> [24,] 4 #> [25,] 2 #> [26,] 3 #> [27,] 3 #> [28,] 4 #> [29,] 3 #> [30,] 4 #> [31,] 4 #> [32,] 5 For large numbers of regimes, variance.method = "ic" is much faster, but may give anticonservative confidence intervals. You may want to use variance.method = "ic" first to make sure the MSM coefficients look reasonable and then use variance.method = "tmle" (the default) for your final estimates. result1 <- ltmleMSM(data, Anodes = paste0("A_", 1:time.points), Lnodes = paste0("L_", 0:time.points), Ynodes = "Y", regimes = regimes, summary.measures = summary.measures, working.msm = "Y ~ time.on.treatment + I(time.on.treatment^2)", variance.method = "ic") #> Qform not specified, using defaults: #> formula for L_2: #> Q.kplus1 ~ L_0 + L_1 + A_1 #> formula for L_3: #> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 #> formula for L_4: #> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 #> formula for L_5: #> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 #> formula for Y: #> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5 + A_5 #> #> gform not specified, using defaults: #> formula for A_1: #> A_1 ~ L_0 + L_1 #> formula for A_2: #> A_2 ~ L_0 + L_1 + A_1 + L_2 #> formula for A_3: #> A_3 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 #> formula for A_4: #> A_4 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 #> formula for A_5: #> A_5 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5 #> #> Estimate of time to completion: 1 minute #> Warning in CheckForVarianceWarning(inputs, g.ratio): Variance estimate is #> based on influence curve only, which may be significantly anticonservative #> because your data appears to contain positivity violations. It is recommended #> to use variance.method='tmle' or variance.method='iptw' to obtain a more #> robust variance estimate (but run time may be significantly longer). See #> variance.method details in ?ltmle result2 <- ltmleMSM(data, Anodes = paste0("A_", 1:time.points), Lnodes = paste0("L_", 0:time.points), Ynodes = "Y", regimes = regimes, summary.measures = summary.measures, working.msm = "Y ~ time.on.treatment + I(time.on.treatment^2)") #> Qform not specified, using defaults: #> formula for L_2: #> Q.kplus1 ~ L_0 + L_1 + A_1 #> formula for L_3: #> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 #> formula for L_4: #> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 #> formula for L_5: #> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 #> formula for Y: #> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5 + A_5 #> #> gform not specified, using defaults: #> formula for A_1: #> A_1 ~ L_0 + L_1 #> formula for A_2: #> A_2 ~ L_0 + L_1 + A_1 + L_2 #> formula for A_3: #> A_3 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 #> formula for A_4: #> A_4 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 #> formula for A_5: #> A_5 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5 #> #> Estimate of time to completion: 4 to 16 minutes summary(result1) #> Estimator: tmle #> Estimate Std. Error CI 2.5% CI 97.5% p-value #> (Intercept) 0.11565 0.13593 -0.15078 0.382 0.395 #> time.on.treatment 0.08999 0.11442 -0.13428 0.314 0.432 #> I(time.on.treatment^2) 0.03304 0.02284 -0.01171 0.078 0.148 summary(result2) #> Estimator: tmle #> Estimate Std. Error CI 2.5% CI 97.5% p-value #> (Intercept) 0.11565 0.18547 -0.24786 0.479 0.533 #> time.on.treatment 0.08999 0.14958 -0.20318 0.383 0.547 #> I(time.on.treatment^2) 0.03304 0.02862 -0.02304 0.089 0.248 Suppose we are only interested in the effect of time on treatment on Y considering regimes that include at least 3 periods on treatment. at.least.3 <- summary.measures[, "time.on.treatment", 1] >= 3 regimes.3 <- regimes[, , at.least.3] summary.measures.3 <- summary.measures[at.least.3, , , drop = F] dim(regimes.3) #>  5000 5 16 summary.measures.3 #> , , 1 #> #> time.on.treatment #> [1,] 3 #> [2,] 3 #> [3,] 3 #> [4,] 3 #> [5,] 4 #> [6,] 3 #> [7,] 3 #> [8,] 3 #> [9,] 4 #> [10,] 3 #> [11,] 3 #> [12,] 4 #> [13,] 3 #> [14,] 4 #> [15,] 4 #> [16,] 5 result <- ltmleMSM(data, Anodes = paste0("A_", 1:time.points), Lnodes = paste0("L_", 0:time.points), Ynodes = "Y", regimes = regimes.3, summary.measures = summary.measures.3, working.msm = "Y ~ time.on.treatment + I(time.on.treatment^2)") #> Qform not specified, using defaults: #> formula for L_2: #> Q.kplus1 ~ L_0 + L_1 + A_1 #> formula for L_3: #> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 #> formula for L_4: #> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 #> formula for L_5: #> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 #> formula for Y: #> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5 + A_5 #> #> gform not specified, using defaults: #> formula for A_1: #> A_1 ~ L_0 + L_1 #> formula for A_2: #> A_2 ~ L_0 + L_1 + A_1 + L_2 #> formula for A_3: #> A_3 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 #> formula for A_4: #> A_4 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 #> formula for A_5: #> A_5 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5 #> #> Estimate of time to completion: 3 to 8 minutes summary(result) #> Estimator: tmle #> Estimate Std. Error CI 2.5% CI 97.5% p-value #> (Intercept) -0.47416 2.20574 -4.79733 3.849 0.830 #> time.on.treatment 0.51495 1.20690 -1.85053 2.880 0.670 #> I(time.on.treatment^2) -0.03434 0.16045 -0.34882 0.280 0.831 ## Marginal Structural Models (MSMs) - Switching time example - Multiple regimes and outcomes Given data over 3 time points where A switches to 1 once and then stays 1. We want to know how death varies as a function of gender, time and an indicator of whether a patient’s intended regime was to switch before time. Note that working.msm includes time and switch.time, which are columns of summary.measures; working.msm also includes male, which is ok because it is a baseline covariate (it comes before any A/C/L/Y nodes). data(sampleDataForLtmleMSM) head(sampleDataForLtmleMSM$data, 20)
#>    male age CD4_0 A0 Y1 CD4_1 A1 Y2 CD4_2 A2 Y3
#> 1     1  33   347  0  0   349  0  0   315  0  0
#> 2     0  18   277  0  0   302  0  0   300  0  0
#> 3     1  33   419  0  0   423  0  0   462  0  0
#> 4     1  35   318  1  0   358  1  0   413  1  0
#> 5     0  27   145  0  0   134  1  1    NA NA  1
#> 6     1  27   320  0  0   332  0  0   347  1  0
#> 7     1  35   220  0  0   241  0  0   216  1  0
#> 8     0  31   184  0  0   202  1  0   230  1  0
#> 9     0  35   289  0  0   302  0  0   290  0  0
#> 10    0  30   295  1  0   312  1  0   340  1  0
#> 11    0  31   300  1  0   394  1  0   467  1  0
#> 12    0  30   300  0  0   331  0  0   320  0  0
#> 13    0  25   276  0  1    NA NA  1    NA NA  1
#> 14    1  26   242  1  0   280  1  0   307  1  0
#> 15    0  21   238  1  0   345  1  0   379  1  0
#> 16    1  30   304  0  0   258  0  0   287  0  0
#> 17    0  30   271  1  0   297  1  0   324  1  0
#> 18    1  27   296  0  0   305  0  0   306  0  0
#> 19    1  33   217  1  0   242  1  0   267  1  0
#> 20    1  25   337  0  0   360  0  0   390  0  0
dim(sampleDataForLtmleMSM$regimes) #>  200 3 4 sampleDataForLtmleMSM$regimes[1:5, , ]
#> , , 1
#>
#>      [,1] [,2] [,3]
#> [1,]    1    1    1
#> [2,]    1    1    1
#> [3,]    1    1    1
#> [4,]    1    1    1
#> [5,]    1    1    1
#>
#> , , 2
#>
#>      [,1] [,2] [,3]
#> [1,]    0    1    1
#> [2,]    0    1    1
#> [3,]    0    1    1
#> [4,]    0    1    1
#> [5,]    0    1    1
#>
#> , , 3
#>
#>      [,1] [,2] [,3]
#> [1,]    0    0    1
#> [2,]    0    0    1
#> [3,]    0    0    1
#> [4,]    0    0    1
#> [5,]    0    0    1
#>
#> , , 4
#>
#>      [,1] [,2] [,3]
#> [1,]    0    0    0
#> [2,]    0    0    0
#> [3,]    0    0    0
#> [4,]    0    0    0
#> [5,]    0    0    0
sampleDataForLtmleMSM$summary.measures #> , , 1 #> #> switch.time time #> [1,] 0 1 #> [2,] 1 1 #> [3,] 2 1 #> [4,] 3 1 #> #> , , 2 #> #> switch.time time #> [1,] 0 2 #> [2,] 1 2 #> [3,] 2 2 #> [4,] 3 2 #> #> , , 3 #> #> switch.time time #> [1,] 0 3 #> [2,] 1 3 #> [3,] 2 3 #> [4,] 3 3 Anodes <- c("A0", "A1", "A2") Lnodes <- c("CD4_1", "CD4_2") Ynodes <- c("Y1", "Y2", "Y3") Here msm.weights is just an example. It could also be a 200x3x4 array or NULL (for no weights), or "empirical" (the default). msm.weights <- matrix(1:12, nrow=4, ncol=3)  result.regimes <- ltmleMSM(sampleDataForLtmleMSM$data, Anodes=Anodes,
Lnodes=Lnodes, Ynodes=Ynodes,
survivalOutcome=TRUE,
regimes=sampleDataForLtmleMSM$regimes, summary.measures=sampleDataForLtmleMSM$summary.measures,
final.Ynodes=Ynodes,
working.msm="Y ~ male + time + pmax(time - switch.time, 0)",
msm.weights=msm.weights, estimate.time=FALSE)
#> Qform not specified, using defaults:
#> formula for Y1:
#> Q.kplus1 ~ male + age + CD4_0 + A0
#> formula for Y2:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1
#> formula for Y3:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2 + A2
#>
#> gform not specified, using defaults:
#> formula for A0:
#> A0 ~ male + age + CD4_0
#> formula for A1:
#> A1 ~ male + age + CD4_0 + A0 + CD4_1
#> formula for A2:
#> A2 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2
#>
print(summary(result.regimes))
#> Estimator:  tmle
#>                             Estimate Std. Error CI 2.5% CI 97.5%  p-value
#> (Intercept)                  -3.4059     0.6545 -4.6886   -2.123 1.95e-07 ***
#> male                         -0.3802     0.5305 -1.4198    0.660  0.47359
#> time                          0.7241     0.2602  0.2141    1.234  0.00539 **
#> pmax(time - switch.time, 0)  -0.4717     0.2019 -0.8675   -0.076  0.01948 *
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

regimes can also be specified as a list of rule functions where each rule is a function applied to each row of data which returns a numeric vector of the same length as Anodes.

regimesList <- list(function(row) c(1,1,1),
function(row) c(0,1,1),
function(row) c(0,0,1),
function(row) c(0,0,0))
result.regList <- ltmleMSM(sampleDataForLtmleMSM$data, Anodes=Anodes, Lnodes=Lnodes, Ynodes=Ynodes, survivalOutcome=TRUE, regimes=regimesList, summary.measures=sampleDataForLtmleMSM$summary.measures,
final.Ynodes=Ynodes,
working.msm="Y ~ male + time + pmax(time - switch.time, 0)",
msm.weights=msm.weights, estimate.time=FALSE)
#> Qform not specified, using defaults:
#> formula for Y1:
#> Q.kplus1 ~ male + age + CD4_0 + A0
#> formula for Y2:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1
#> formula for Y3:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2 + A2
#>
#> gform not specified, using defaults:
#> formula for A0:
#> A0 ~ male + age + CD4_0
#> formula for A1:
#> A1 ~ male + age + CD4_0 + A0 + CD4_1
#> formula for A2:
#> A2 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2
#> 

This should be the same as result.regimes:

print(summary(result.regList))
#> Estimator:  tmle
#>                             Estimate Std. Error CI 2.5% CI 97.5%  p-value
#> (Intercept)                  -3.4059     0.6545 -4.6886   -2.123 1.95e-07 ***
#> male                         -0.3802     0.5305 -1.4198    0.660  0.47359
#> time                          0.7241     0.2602  0.2141    1.234  0.00539 **
#> pmax(time - switch.time, 0)  -0.4717     0.2019 -0.8675   -0.076  0.01948 *
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Suppose we are only interested in pooling over the result at Y1 and Y3.

result <- ltmleMSM(sampleDataForLtmleMSM$data, Anodes=Anodes, Lnodes=Lnodes, Ynodes=Ynodes, survivalOutcome=TRUE, regimes=sampleDataForLtmleMSM$regimes,
summary.measures=sampleDataForLtmleMSM$summary.measures[, , c(1, 3)], final.Ynodes=c("Y1", "Y3"), working.msm="Y ~ male + time + pmax(time - switch.time, 0)", estimate.time=FALSE) #> Qform not specified, using defaults: #> formula for Y1: #> Q.kplus1 ~ male + age + CD4_0 + A0 #> formula for Y2: #> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1 #> formula for Y3: #> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2 + A2 #> #> gform not specified, using defaults: #> formula for A0: #> A0 ~ male + age + CD4_0 #> formula for A1: #> A1 ~ male + age + CD4_0 + A0 + CD4_1 #> formula for A2: #> A2 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2 #> #> speedglm failed, using glm instead. If you see a lot of this message and you have large absolute values in data[, Lnodes], you may get a speed performance improvement by rescaling these values. summary(result) #> Estimator: tmle #> Estimate Std. Error CI 2.5% CI 97.5% p-value #> (Intercept) -3.8891 0.5533 -4.9736 -2.805 2.08e-12 *** #> male 0.2256 0.5152 -0.7842 1.235 0.661472 #> time 0.7826 0.2318 0.3282 1.237 0.000737 *** #> pmax(time - switch.time, 0) -0.4331 0.2023 -0.8296 -0.037 0.032262 * #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 We could be only interested in the result at Y3. time is now a constant in working.msm, so let’s remove it. result <- ltmleMSM(sampleDataForLtmleMSM$data, Anodes=Anodes,
Lnodes=Lnodes, Ynodes=Ynodes,
survivalOutcome=TRUE,
regimes=sampleDataForLtmleMSM$regimes, summary.measures=sampleDataForLtmleMSM$summary.measures[, , 3],
final.Ynodes="Y3",
working.msm="Y ~ male + switch.time",
estimate.time=FALSE)
#> Qform not specified, using defaults:
#> formula for Y1:
#> Q.kplus1 ~ male + age + CD4_0 + A0
#> formula for Y2:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1
#> formula for Y3:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2 + A2
#>
#> gform not specified, using defaults:
#> formula for A0:
#> A0 ~ male + age + CD4_0
#> formula for A1:
#> A1 ~ male + age + CD4_0 + A0 + CD4_1
#> formula for A2:
#> A2 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2
#>
#> speedglm failed, using glm instead. If you see a lot of this message and you have large absolute values in data[, Lnodes], you may get a speed performance improvement by rescaling these values.
summary(result)
#> Estimator:  tmle
#>             Estimate Std. Error  CI 2.5% CI 97.5%  p-value
#> (Intercept) -2.81063    0.46551 -3.72301   -1.898 1.56e-09 ***
#> male         0.07357    0.53775 -0.98039    1.128   0.8912
#> switch.time  0.45362    0.19436  0.07268    0.835   0.0196 *
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Marginal Structural Models (MSMs) - Dynamic Treatment

In this example there are two treatment nodes and one outcome: W A1 L A2 Y W is normally distributed and L is continuous in (0, 1). We are interested in treatments where A1 is set to either 0 or 1 and A2 is set dynamically. The treatment for A2 is indexed by theta between 0 and 1. If $$L > theta$$, set A2 to 1, otherwise set A2 to 0.
Here is a function that can be used to generate observed data (if abar = NULL) or generate counterfactual truth (if abar is a list with a1 and theta):

GenerateData <- function(n, abar = NULL) {
W <- rnorm(n)
if (is.null(abar)) {
A1 <- rexpit(W)
} else {
A1 <- abar$a1 } L <- plogis(rnorm(n) + 0.3 * W + 0.5 * A1) if (is.null(abar)) { A2 <- rexpit(-0.5 * W + A1 - 0.6 * L) } else { A2 <- as.integer(L > abar$theta)
}
Y <- rexpit(-2 + W + A1 + L + 2 * A2)
if (is.null(abar)) {
return(data.frame(W, A1, L, A2, Y))
} else {
return(mean(Y))
}
}

Set up regimes and summary.measures:

set.seed(11)
n <- 10000
data <- GenerateData(n)
regimes <- array(dim = c(n, 2, 10)) #n x num.Anodes x num.regimes
theta.set <- seq(0, 1, length.out = 5)
summary.measures <- array(theta.set, dim = c(10, 2, 1))
colnames(summary.measures) <- c("a1", "theta")
cnt <- 0
for (a1 in 0:1) {
for (theta.index in 1:5) {
cnt <- cnt + 1
regimes[, 1, cnt] <- a1
regimes[, 2, cnt] <- data\$L > theta.set[theta.index]
summary.measures[cnt, , 1] <- c(a1, theta.set[theta.index])
}
}
summary.measures
#> , , 1
#>
#>       a1 theta
#>  [1,]  0  0.00
#>  [2,]  0  0.25
#>  [3,]  0  0.50
#>  [4,]  0  0.75
#>  [5,]  0  1.00
#>  [6,]  1  0.00
#>  [7,]  1  0.25
#>  [8,]  1  0.50
#>  [9,]  1  0.75
#> [10,]  1  1.00
#>             W A1          L A2 Y
#> 1 -0.59103110  1 0.77137613  1 1
#> 2  0.02659437  0 0.47561600  1 0
#> 3 -1.51655310  0 0.08392365  1 0
regimes[1:3, , ]
#> , , 1
#>
#>      [,1] [,2]
#> [1,]    0    1
#> [2,]    0    1
#> [3,]    0    1
#>
#> , , 2
#>
#>      [,1] [,2]
#> [1,]    0    1
#> [2,]    0    1
#> [3,]    0    0
#>
#> , , 3
#>
#>      [,1] [,2]
#> [1,]    0    1
#> [2,]    0    0
#> [3,]    0    0
#>
#> , , 4
#>
#>      [,1] [,2]
#> [1,]    0    1
#> [2,]    0    0
#> [3,]    0    0
#>
#> , , 5
#>
#>      [,1] [,2]
#> [1,]    0    0
#> [2,]    0    0
#> [3,]    0    0
#>
#> , , 6
#>
#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    1    1
#> [3,]    1    1
#>
#> , , 7
#>
#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    1    1
#> [3,]    1    0
#>
#> , , 8
#>
#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    1    0
#> [3,]    1    0
#>
#> , , 9
#>
#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    1    0
#> [3,]    1    0
#>
#> , , 10
#>
#>      [,1] [,2]
#> [1,]    1    0
#> [2,]    1    0
#> [3,]    1    0
working.msm <- "Y ~ a1*theta"
summary(ltmleMSM(data, Anodes = c("A1", "A2"), Lnodes = "L", Ynodes = "Y",
regimes = regimes, summary.measures = summary.measures,
working.msm = working.msm))
#> Qform not specified, using defaults:
#> formula for L:
#> Q.kplus1 ~ W + A1
#> formula for Y:
#> Q.kplus1 ~ W + A1 + L + A2
#>
#> gform not specified, using defaults:
#> formula for A1:
#> A1 ~ W
#> formula for A2:
#> A2 ~ W + A1 + L
#>
#> Estimate of time to completion: 2 to 3 minutes
#> Estimator:  tmle
#>              Estimate Std. Error   CI 2.5% CI 97.5% p-value
#> (Intercept)  0.531670   0.050294  0.433095    0.630  <2e-16 ***
#> a1           1.039499   0.074409  0.893660    1.185  <2e-16 ***
#> theta       -1.817514   0.071766 -1.958172   -1.677  <2e-16 ***
#> a1:theta    -0.006772   0.110068 -0.222502    0.209   0.951
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Let’s compare to the true coefficients of the MSM. First we find the true value of $$E[Y_{a1, theta}]$$ for 5 values of theta.

truth <- rep(NA_real_, 10)
cnt <- 0
for (a1 in 0:1) {
for (theta.index in 1:5) {
cnt <- cnt + 1
truth[cnt] <- GenerateData(n = 1e6,
abar = list(a1 = a1, theta = theta.set[theta.index]))
}
}

Fit a working MSM to the true values of $$E[Y_{a1, theta}]$$.

m.true <- glm(working.msm,
data = data.frame(Y = truth, summary.measures[, , 1]),
family = "quasibinomial")
m.true
#>
#> Call:  glm(formula = working.msm, family = "quasibinomial", data = data.frame(Y = truth,
#>     summary.measures[, , 1]))
#>
#> Coefficients:
#> (Intercept)           a1        theta     a1:theta
#>     0.51497      0.95904     -1.76125     -0.02284
#>
#> Degrees of Freedom: 9 Total (i.e. Null);  6 Residual
#> Null Deviance:       1.341
#> Residual Deviance: 0.02342   AIC: NA

The estimated MSM coefficients are close to the true MSM coefficients.