\[\begin{align*} \mbox{logit}(P(T >t | x)) & = \log(G(t)) + x^T \beta \\ P(T >t | x) & = \frac{1}{1 + G(t) \exp( x^T \beta) } \end{align*}\]
Input are intervals given by \(]t_l,t_r]\) where \(t_r\) can be infinity for right-censored intervals. When the data is truly discrete, in contrast to grouping of continuous data, \(]0,1]\) will be an observation at \(1\), and \(]j,j+1]\) will be an observation at \(j+1\).
The likelihood is maximized: \[\begin{align*} \prod_i \left( P(T_i >t_{il} | x) - P(T_i> t_{ir}| x) \right). \end{align*}\]
This model is also called the cumulative odds model \[\begin{align*} P(T \leq t | x) & = \frac{ G(t) \exp( x^T \beta) }{1 + G(t) \exp( x^T \beta) }. \end{align*}\] and \(\beta\) says something about the OR of the probability of being before \(t\).
The baseline is parametrized as \[\begin{align*} G(t) & = \sum_{j \leq t} \exp( \alpha_j ) \end{align*}\]
An important consequence of the model is that for all cut-points \(t\) we have the same OR parameters for the OR of being earlier or later than \(t\).
First we look at some time to pregnancy data (simulated discrete survival data) that is right-censored, and set it up to fit the cumulative odds model by constructing the intervals appropriately:
## Load the mets package, which provides interval.logitsurv.discrete().
library(mets)
## Time-to-pregnancy data: simulated discrete survival data, right-censored.
data(ttpd)
## Tabulate the interval endpoints; time2 == Inf marks right-censored rows.
dtable(ttpd,~entry+time2)
#>
#> time2 1 2 3 4 5 6 Inf
#> entry
#> 0 316 0 0 0 0 0 0
#> 1 0 133 0 0 0 0 0
#> 2 0 0 150 0 0 0 0
#> 3 0 0 0 23 0 0 0
#> 4 0 0 0 0 90 0 0
#> 5 0 0 0 0 0 68 0
#> 6 0 0 0 0 0 0 220
## Fit the cumulative odds model to the interval-censored discrete data.
## (The assignment target had been garbled onto the following line.)
out <- interval.logitsurv.discrete(Interval(entry,time2)~X1+X2+X3+X4,ttpd)
summary(out)
#> Estimate Std.Err 2.5% 97.5% P-value
#> time1 -2.0064 0.1461 -2.29277 -1.7201 6.466e-43
#> time2 -2.1749 0.1543 -2.47725 -1.8725 3.869e-45
#> time3 -1.4581 0.1496 -1.75132 -1.1648 1.936e-22
#> time4 -2.9260 0.2436 -3.40344 -2.4486 3.078e-33
#> time5 -1.2051 0.1655 -1.52935 -0.8808 3.267e-13
#> time6 -0.9102 0.1790 -1.26103 -0.5594 3.671e-07
#> X1 0.9913 0.1171 0.76175 1.2208 2.557e-17
#> X2 0.6962 0.1156 0.46953 0.9228 1.739e-09
#> X3 0.3466 0.1150 0.12110 0.5721 2.590e-03
#> X4 0.3223 0.1147 0.09749 0.5470 4.952e-03
Now, using this discrete survival model, we simulate some new data from it:
## Fix the RNG so the simulation-based output below is reproducible.
set.seed(1000) # to control output in simulations for p-values below.
n <- 1000
## Four independent binary covariates.
Z <- matrix(rbinom(n*4,1,0.5),n,4)
## Simulate discrete survival times from the fitted cumulative odds model.
outsim <- simlogitSurvd(out$coef,Z)
## Build observation intervals ]left,right]; right-censored rows (status==0)
## get right = Inf.
outsim <- transform(outsim,left=time,right=time+1)
outsim <- dtransform(outsim,right=Inf,status==0)
## Refit the model on the simulated data; estimates should be close to out.
outss <- interval.logitsurv.discrete(Interval(left,right)~+X1+X2+X3+X4,outsim)
summary(outss)
#> Estimate Std.Err 2.5% 97.5% P-value
#> time1 -2.0814 0.1436 -2.36292 -1.7998 1.401e-47
#> time2 -2.2012 0.1497 -2.49451 -1.9079 5.663e-49
#> time3 -1.4943 0.1447 -1.77790 -1.2107 5.329e-25
#> time4 -3.1848 0.2594 -3.69322 -2.6764 1.200e-34
#> time5 -1.3539 0.1636 -1.67443 -1.0333 1.260e-16
#> time6 -0.6973 0.1671 -1.02478 -0.3699 2.997e-05
#> X1 0.9283 0.1159 0.70119 1.1553 1.125e-15
#> X2 0.7191 0.1152 0.49326 0.9449 4.338e-10
#> X3 0.4306 0.1143 0.20654 0.6546 1.654e-04
#> X4 0.2950 0.1140 0.07156 0.5184 9.659e-03
## Predicted survival probabilities with standard errors, then plot them.
pred <- predictlogitSurvd(out,se=TRUE)
plotSurvd(pred,se=TRUE)
Finally, we look at some data and compare with the icenReg package, which can also fit the proportional odds model for continuous or discrete data. We make the data fully interval-censored/discrete by letting even exact observations be observed only to lie in an interval.
We consider the interval-censored survival times for time from onset of diabetes to diabetic nephropathy, then modify it to observe only that the event times are in certain intervals.
## Flag guarding the optional icenReg comparison; set to 1 to run it.
test <- 0
if (test==1) {
  require(icenReg)
  data(IR_diabetes)
  IRdia <- IR_diabetes
  ## removing fully observed data in continuous version, here making it a discrete observation
  IRdia <- dtransform(IRdia,left=left-1,left==right)
  dtable(IRdia,~left+right,level=1)
  ## Group the observation times into intervals defined by the cut-points.
  ints <- with(IRdia,dInterval(left,right,cuts=c(0,5,10,20,30,40,Inf),show=TRUE) )
}
We note that the gender effect is equivalent for the two approaches.
if (test==1) {
  ## Carry the grouped interval endpoints over to the data set.
  ints$Ileft <- ints$left
  ints$Iright <- ints$right
  IRdia <- cbind(IRdia,data.frame(Ileft=ints$Ileft,Iright=ints$Iright))
  dtable(IRdia,~Ileft+Iright)
  #
  # Iright 1 2 3 4 5 Inf
  # Ileft
  # 0 10 1 34 25 4 0
  # 1 0 55 19 17 1 1
  # 2 0 0 393 16 4 0
  # 3 0 0 0 127 1 0
  # 4 0 0 0 0 21 0
  # 5 0 0 0 0 0 2
  ## Fit the cumulative odds model on the grouped intervals.
  outss <- interval.logitsurv.discrete(Interval(Ileft,Iright)~+gender,IRdia)
  # Estimate Std.Err 2.5% 97.5% P-value
  # time1 -3.934 0.3316 -4.5842 -3.28418 1.846e-32
  # time2 -2.042 0.1693 -2.3742 -1.71038 1.710e-33
  # time3 1.443 0.1481 1.1530 1.73340 1.911e-22
  # time4 3.545 0.2629 3.0295 4.06008 1.976e-41
  # time5 6.067 0.7757 4.5470 7.58784 5.217e-15
  # gendermale -0.385 0.1691 -0.7165 -0.05351 2.283e-02
  summary(outss)
  ## Profile log-likelihood of the mets fit; matches icenReg's final llk below.
  outss$ploglik
  # [1] -646.1946
  ## Semi-parametric proportional odds fit from icenReg for comparison.
  fit <- ic_sp(cbind(Ileft, Iright) ~ gender, data = IRdia, model = "po")
  #
  # Model: Proportional Odds
  # Dependency structure assumed: Independence
  # Baseline: semi-parametric
  # Call: ic_sp(formula = cbind(Ileft, Iright) ~ gender, data = IRdia,
  # model = "po")
  #
  # Estimate Exp(Est)
  # gendermale 0.385 1.47
  #
  # final llk = -646.1946
  # Iterations = 6
  # Bootstrap Samples = 0
  # WARNING: only 0 bootstrap samples used for standard errors.
  # Suggest using more bootstrap samples for inference
  summary(fit)
  ## sometimes NR-algorithm needs modifications of stepsize to run
  ## outss <- interval.logitsurv.discrete(Interval(Ileft,Iright)~+gender,IRdia,control=list(trace=TRUE,stepsize=1.0))
}
Also agrees with the cumulative link regression of the ordinal package, although the baseline is parametrized differently.
if (test==1) {
  ###
  ### data(ttpd)
  ### dtable(ttpd,~entry+time2)
  ### out <- interval.logitsurv.discrete(Interval(entry,time2)~X1+X2+X3+X4,ttpd)
  ### summary(out)
  # Estimate Std.Err 2.5% 97.5% P-value
  # time1 -2.0064 0.1461 -2.29277 -1.7201 6.466e-43
  # time2 -2.1749 0.1543 -2.47725 -1.8725 3.869e-45
  # time3 -1.4581 0.1496 -1.75132 -1.1648 1.936e-22
  # time4 -2.9260 0.2436 -3.40344 -2.4486 3.078e-33
  # time5 -1.2051 0.1655 -1.52935 -0.8808 3.267e-13
  # time6 -0.9102 0.1790 -1.26103 -0.5594 3.671e-07
  # X1 0.9913 0.1171 0.76175 1.2208 2.557e-17
  # X2 0.6962 0.1156 0.46953 0.9228 1.739e-09
  # X3 0.3466 0.1150 0.12110 0.5721 2.590e-03
  # X4 0.3223 0.1147 0.09749 0.5470 4.952e-03
  ## Profile log-likelihood of the mets fit; matches ordinal::clm's logLik below.
  out$ploglik
  # [1] -1676.456
  ### library(ordinal)
  ### ttpd <- dfactor(ttpd,fentry~entry)
  ### out1 <- clm(fentry~X1+X2+X3+X4,data=ttpd)
  ### summary(out1)
  # formula: fentry ~ X1 + X2 + X3 + X4
  # data: ttpd
  #
  # link threshold nobs logLik AIC niter max.grad cond.H
  # logit flexible 1000 -1676.46 3372.91 6(2) 1.17e-12 5.3e+02
  #
  # Coefficients:
  # Estimate Std. Error z value Pr(>|z|)
  # X1 -0.9913 0.1171 -8.465 < 2e-16 ***
  # X2 -0.6962 0.1156 -6.021 1.74e-09 ***
  # X3 -0.3466 0.1150 -3.013 0.00259 **
  # X4 -0.3223 0.1147 -2.810 0.00495 **
  # ---
  # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
  #
  # Threshold coefficients:
  # Estimate Std. Error z value
  # 0|1 -2.0064 0.1461 -13.733
  # 1|2 -1.3940 0.1396 -9.984
  # 2|3 -0.7324 0.1347 -5.435
  # 3|4 -0.6266 0.1343 -4.667
  # 4|5 -0.1814 0.1333 -1.361
  # 5|6 0.2123 0.1342 1.582
}
## Record the R session (platform and package versions) used for this output.
sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: x86_64-apple-darwin21.5.0 (64-bit)
#> Running under: macOS Monterey 12.5.1
#>
#> Matrix products: default
#> BLAS: /usr/local/Cellar/openblas/0.3.21/lib/libopenblasp-r0.3.21.dylib
#> LAPACK: /usr/local/Cellar/r/4.2.1_2/lib/R/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] mets_1.3.0 lava_1.7.0 timereg_2.0.2 survival_3.3-1
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.9 highr_0.9 bslib_0.3.1
#> [4] compiler_4.2.1 jquerylib_0.1.4 tools_4.2.1
#> [7] digest_0.6.29 jsonlite_1.8.0 evaluate_0.15
#> [10] lattice_0.20-45 ucminf_1.1-4 rlang_1.0.2
#> [13] Matrix_1.4-1 cli_3.3.0 yaml_2.3.5
#> [16] parallel_4.2.1 mvtnorm_1.1-3 xfun_0.30
#> [19] fastmap_1.1.0 stringr_1.4.0 knitr_1.37
#> [22] sass_0.4.0 globals_0.16.1 grid_4.2.1
#> [25] listenv_0.8.0 R6_2.5.1 future.apply_1.9.0
#> [28] parallelly_1.32.1 rmarkdown_2.13 magrittr_2.0.2
#> [31] codetools_0.2-18 htmltools_0.5.2 splines_4.2.1
#> [34] future_1.27.0 numDeriv_2016.8-1.1 stringi_1.7.6