Time-independent dataset, when the dataset can be saved and loaded as a whole: Fitting a Cox model for a time-independent dataset with 50 variables and 100,000 samples (detailed version of Key Example 1).
##### Generating a time-independent dataset #####
set.seed(1)
N <- 1e5 # number of samples
p.x <- 50 # number of covariates
# K and n are defined here but not referenced again in this section.
K <- 100
n <- N / K
cor <- 0.2
# 16 non-zero effects of decreasing size; the remaining coefficients are zero.
bb <- c(rep(0.4, 4), rep(0.2, 4), rep(0.1, 4), rep(0.05, 4))
beta0 <- c(1, bb, rep(0, p.x - length(bb)))
dat.mat0 <- as.data.frame(
  SIM.FUN(N, p.x = p.x, cor = cor, family = "Cox", beta0 = beta0)
)
# Stratum label: 20 consecutive blocks of N / 20 rows, labelled 1..20.
dat.mat0[, "strat"] <- rep(1:20, each = N / 20)

## A Cox model, without stratification of baseline hazard:
## Surv(u,delta)~V3+V4+.....+V52

# Example option 1: Using 1 core for computation, dividing the dataset to
# 10 chunks, using 2 iterations of one-step estimator for update
mod <- dcalasso(
  as.formula(paste0("Surv(u,delta)~", paste(paste0("V", 3:52), collapse = "+"))),
  family = "cox.ph",
  data = dat.mat0, K = 10, iter.os = 2
)
sum.mod <- summary(mod)
print(sum.mod, unpen = TRUE)
plot(mod)

# Performing model prediction
pred.link <- predict(mod, newdata = dat.mat0)
pred.term <- predict(mod, newdata = dat.mat0, type = "terms")
pred.response <- predict(mod, newdata = dat.mat0, type = "response")

# Example option 2: Same model as above: Using 2 cores for parallel
# computation, dividing the dataset to 10 chunks, using 4 iterations of
# one-step estimator for update
modp <- dcalasso(
  as.formula(paste0("Surv(u,delta)~", paste(paste0("V", 3:52), collapse = "+"))),
  family = "cox.ph",
  data = dat.mat0, K = 10, iter.os = 4, ncores = 2
)
sum.modp <- summary(modp)
print(sum.modp, unpen = TRUE)
plot(modp)

# Compare the divide and conquer method unpenalized model fit against a
# standard Cox fit using the whole dataset
std <- coxph(
  as.formula(paste0("Surv(u,delta)~", paste(paste0("V", 3:52), collapse = "+"))),
  data = dat.mat0
)
plot(mod$coefficients.unpen, std$coefficients)
plot(modp$coefficients.unpen, std$coefficients)

## A Cox model, with stratification of baseline hazard by "strat":
## Surv(u,delta)~strata(strat)+V3+V4+.....+V52

# Example option 1: Using 1 core for computation, dividing the dataset to
# 10 chunks, using 2 iterations of one-step estimator for update
mod <- dcalasso(
  as.formula(paste0("Surv(u,delta)~strata(strat)+", paste(paste0("V", 3:52), collapse = "+"))),
  family = "cox.ph",
  data = dat.mat0, K = 10, iter.os = 2
)
sum.mod <- summary(mod)
print(sum.mod, unpen = TRUE)
plot(mod)

# Example option 2: Same model as above: Using 2 cores for parallel
# computation, dividing the dataset to 10 chunks, using 2 iterations of
# one-step estimator for update
modp <- dcalasso(
  as.formula(paste0("Surv(u,delta)~strata(strat)+", paste(paste0("V", 3:52), collapse = "+"))),
  family = "cox.ph",
  data = dat.mat0, K = 10, iter.os = 2, ncores = 2
)
sum.modp <- summary(modp)
print(sum.modp, unpen = TRUE)
plot(modp)

# Compare the divide and conquer method unpenalized model fit against a
# standard Cox fit using the whole dataset
std <- coxph(
  as.formula(paste0("Surv(u,delta)~strata(strat)+", paste(paste0("V", 3:52), collapse = "+"))),
  data = dat.mat0
)
plot(mod$coefficients.unpen, std$coefficients)
plot(modp$coefficients.unpen, std$coefficients)
Time-independent dataset, when the dataset is saved in multiple files: Fitting a Cox model for a time-independent dataset with 50 variables and 100,000 samples.
##### Generating a time-independent dataset, saving the data into 10 separate files #####
set.seed(1)
N <- 1e5 # number of samples
p.x <- 50 # number of covariates
# K and n are defined here but not referenced again in this section.
K <- 100
n <- N / K
cor <- 0.2
# 16 non-zero effects of decreasing size; the remaining coefficients are zero.
bb <- c(rep(0.4, 4), rep(0.2, 4), rep(0.1, 4), rep(0.05, 4))
beta0 <- c(1, bb, rep(0, p.x - length(bb)))
dat.mat0 <- as.data.frame(
  SIM.FUN(N, p.x = p.x, cor = cor, family = "Cox", beta0 = beta0)
)
# Stratum label: 20 consecutive blocks of N / 20 rows, labelled 1..20.
dat.mat0[, "strat"] <- rep(1:20, each = N / 20)

# Output folder for the chunk files. Change this to a writable directory on
# your machine; the value must end with a path separator because file names
# are built with paste0().
dir <- "C:/"
# The 10-level factor is recycled over the row indices, so rows are dealt
# round-robin into 10 groups.
ll <- split(seq_len(N), factor(1:10))
for (kk in 1:10) {
  chunk <- dat.mat0[ll[[kk]], ]
  saveRDS(chunk, file = paste0(dir, "dataTI", kk, ".rds"))
}

## A Cox model, without stratification of baseline hazard:
## Surv(u,delta)~V3+V4+.....+V52

# Example option 1: Using 1 core for computation, data.rds specifies that the
# data are saved into 10 files [each time the RAM will only contain data from
# 1 file], using 2 iterations of one-step estimator for update
mod <- dcalasso(
  as.formula(paste0("Surv(u,delta)~", paste(paste0("V", 3:52), collapse = "+"))),
  family = "cox.ph",
  data.rds = paste0(dir, "dataTI", 1:10, ".rds"), iter.os = 2
)
sum.mod <- summary(mod)
print(sum.mod, unpen = TRUE)
plot(mod)

# Example option 2: Using 2 cores for parallel computation, data.rds specifies
# that the data are saved into 10 files [each time the RAM will only contain
# data from 1 file], using 2 iterations of one-step estimator for update
modp <- dcalasso(
  as.formula(paste0("Surv(u,delta)~", paste(paste0("V", 3:52), collapse = "+"))),
  family = "cox.ph",
  data.rds = paste0(dir, "dataTI", 1:10, ".rds"), iter.os = 2, ncores = 2
)
sum.modp <- summary(modp)
print(sum.modp, unpen = TRUE)
plot(modp)

# Compare the divide and conquer method unpenalized model fit against a
# standard Cox fit using the whole dataset
std <- coxph(
  as.formula(paste0("Surv(u,delta)~", paste(paste0("V", 3:52), collapse = "+"))),
  data = dat.mat0
)
plot(mod$coefficients.unpen, std$coefficients)
plot(modp$coefficients.unpen, std$coefficients)

## A Cox model, with stratification of baseline hazard by "strat":
## Surv(u,delta)~strata(strat)+V3+V4+.....+V52

# Example option 1: Using 1 core for computation, loading data from 10
# separate files (imply K=10) [each time the RAM will only contain data from
# 1 file], using 2 iterations of one-step estimator for update
mod <- dcalasso(
  as.formula(paste0("Surv(u,delta)~strata(strat)+", paste(paste0("V", 3:52), collapse = "+"))),
  family = "cox.ph",
  data.rds = paste0(dir, "dataTI", 1:10, ".rds"), iter.os = 2
)
sum.mod <- summary(mod)
print(sum.mod, unpen = TRUE)
plot(mod)

# Example option 2: Using 2 cores for parallel computation, data.rds specifies
# that the data are saved into 10 files [each time the RAM will only contain
# data from 1 file], using 2 iterations of one-step estimator for update
modp <- dcalasso(
  as.formula(paste0("Surv(u,delta)~strata(strat)+", paste(paste0("V", 3:52), collapse = "+"))),
  family = "cox.ph",
  data.rds = paste0(dir, "dataTI", 1:10, ".rds"), iter.os = 2, ncores = 2
)
sum.modp <- summary(modp)
print(sum.modp, unpen = TRUE)
plot(modp)

# Compare the divide and conquer method unpenalized model fit against a
# standard Cox fit using the whole dataset
std <- coxph(
  as.formula(paste0("Surv(u,delta)~strata(strat)+", paste(paste0("V", 3:52), collapse = "+"))),
  data = dat.mat0
)
plot(mod$coefficients.unpen, std$coefficients)
plot(modp$coefficients.unpen, std$coefficients)
Time-dependent dataset, when the dataset can be saved and loaded as a whole: Fitting a Cox model for a dataset with 50 time-dependent variables, 50 additional time-independent variables, and 100,000 samples (detailed version of Key Example 2).
########### Generating a time-dependent dataset ####################
set.seed(1)
n.subject <- 1e5
p.ti <- 50 # number of time-independent covariates
p.tv <- 50 # number of time-dependent covariates
# K, n and lambda.grid are defined here but not referenced again in this
# section.
K <- 20
n <- n.subject / K
cor <- 0.2
lambda.grid <- 10^seq(-10, 3, 0.01)
beta0.ti <- NULL
beta0.tv <- NULL
dat.mat0 <- as.data.frame(
  SIM.FUN.TVC(p.ti, p.tv, n.subject, cor, beta0.ti, beta0.tv)
)
# Stratum label: the last simulated column modulo n.subject / 20 (= 5000).
# NOTE(review): the last column is presumably a subject identifier, which
# would give up to 5000 strata rather than 20 - confirm this is intended.
dat.mat0[, "strat"] <- dat.mat0[, ncol(dat.mat0)] %% (n.subject / 20)
# Drop the column that was last before "strat" was appended.
dat.mat0 <- dat.mat0[, -(ncol(dat.mat0) - 1)]

## A time-dependent Cox model, without stratification of baseline hazard:
## Surv(t0,t1,status)~V4+V5+...+V103

# Example option 1: Using 1 core for computation, dividing the data into
# 10 chunks, using 2 iterations of one-step estimator for update
mod <- dcalasso(
  as.formula(paste0("Surv(t0,t1,status)~", paste(paste0("V", 4:103), collapse = "+"))),
  family = "cox.ph",
  data = dat.mat0, K = 10, iter.os = 2
)
sum.mod <- summary(mod)
print(sum.mod, unpen = TRUE)
plot(mod)

# Example option 2: Using 2 cores for parallel computation, dividing the data
# into 10 chunks, using 2 iterations of one-step estimator for update
modp <- dcalasso(
  as.formula(paste0("Surv(t0,t1,status)~", paste(paste0("V", 4:103), collapse = "+"))),
  family = "cox.ph",
  data = dat.mat0, K = 10, iter.os = 2, ncores = 2
)
sum.modp <- summary(modp)
print(sum.modp, unpen = TRUE)
plot(modp)

# Compare the divide and conquer method unpenalized model fit against a
# standard Cox fit using the whole dataset
std <- coxph(
  as.formula(paste0("Surv(t0,t1,status)~", paste(paste0("V", 4:103), collapse = "+"))),
  data = dat.mat0
)
plot(mod$coefficients.unpen, std$coefficients)
# This second plot compares the parallel fit with the single-core fit.
plot(modp$coefficients.unpen, mod$coefficients.unpen)

## A time-dependent Cox model, with stratification of baseline hazard:
## Surv(t0,t1,status)~strata(strat)+V4+V5+...+V103

# Example option 1: Using 1 core for computation, dividing the data into
# 10 chunks, using 4 iterations of one-step estimator for update
mod <- dcalasso(
  as.formula(paste0("Surv(t0,t1,status)~strata(strat)+", paste(paste0("V", 4:103), collapse = "+"))),
  family = "cox.ph",
  data = dat.mat0, K = 10, iter.os = 4
)
sum.mod <- summary(mod)
print(sum.mod, unpen = TRUE)
plot(mod)

# Example option 2: Using 2 cores for parallel computation, dividing the data
# into 10 chunks, using 4 iterations of one-step estimator for update
modp <- dcalasso(
  as.formula(paste0("Surv(t0,t1,status)~strata(strat)+", paste(paste0("V", 4:103), collapse = "+"))),
  family = "cox.ph",
  data = dat.mat0, K = 10, iter.os = 4, ncores = 2
)
sum.modp <- summary(modp)
print(sum.modp, unpen = TRUE)
plot(modp)

# Compare the divide and conquer method unpenalized model fit against a
# standard Cox fit using the whole dataset
std <- coxph(
  as.formula(paste0("Surv(t0,t1,status)~strata(strat)+", paste(paste0("V", 4:103), collapse = "+"))),
  data = dat.mat0
)
plot(mod$coefficients.unpen, std$coefficients)
# This second plot compares the parallel fit with the single-core fit.
plot(modp$coefficients.unpen, mod$coefficients.unpen)
Time-dependent dataset, when the dataset is saved in multiple files: Fitting a Cox model for a dataset with 50 time-dependent variables, 50 additional time-independent variables, and 100,000 samples.
########### Generating a time-dependent dataset, saving the dataset to 10 separate files ####################
set.seed(1)
n.subject <- 1e5
p.ti <- 50 # number of time-independent covariates
p.tv <- 50 # number of time-dependent covariates
# K, n and lambda.grid are defined here but not referenced again in this
# section.
K <- 20
n <- n.subject / K
cor <- 0.2
lambda.grid <- 10^seq(-10, 3, 0.01)
beta0.ti <- NULL
beta0.tv <- NULL
dat.mat0 <- as.data.frame(
  SIM.FUN.TVC(p.ti, p.tv, n.subject, cor, beta0.ti, beta0.tv)
)
# Stratum label: the last simulated column modulo n.subject / 20 (= 5000).
# NOTE(review): the last column is presumably a subject identifier, which
# would give up to 5000 strata rather than 20 - confirm this is intended.
dat.mat0[, "strat"] <- dat.mat0[, ncol(dat.mat0)] %% (n.subject / 20)
# Drop the column that was last before "strat" was appended.
dat.mat0 <- dat.mat0[, -(ncol(dat.mat0) - 1)]

# Output folder for the chunk files. It is defined here so this section runs
# on its own; without it, `dir` would resolve to the base R function dir()
# and paste0() would fail. Change it to a writable directory on your machine;
# the value must end with a path separator.
dir <- "C:/"
# The 10-level factor is recycled over the row indices, so rows are dealt
# round-robin into 10 groups.
ll <- split(seq_len(nrow(dat.mat0)), factor(1:10))
for (kk in 1:10) {
  chunk <- dat.mat0[ll[[kk]], ]
  saveRDS(chunk, file = paste0(dir, "dataTV", kk, ".rds"))
}

## A time-dependent Cox model, without stratification of baseline hazard:
## Surv(t0,t1,status)~V4+V5+...+V103

# Example option 1: Using 1 core for computation, loading dataset from
# 10 chunks [each time the RAM will only contain data from 1 file], using
# 2 iterations of one-step estimator for update
mod <- dcalasso(
  as.formula(paste0("Surv(t0,t1,status)~", paste(paste0("V", 4:103), collapse = "+"))),
  family = "cox.ph",
  data.rds = paste0(dir, "dataTV", 1:10, ".rds"), iter.os = 2
)
sum.mod <- summary(mod)
print(sum.mod, unpen = TRUE)
plot(mod)

# Example option 2: Using 2 cores for parallel computation, loading dataset
# from 10 chunks [each time the RAM will only contain data from 1 file],
# using 2 iterations of one-step estimator for update
modp <- dcalasso(
  as.formula(paste0("Surv(t0,t1,status)~", paste(paste0("V", 4:103), collapse = "+"))),
  family = "cox.ph",
  data.rds = paste0(dir, "dataTV", 1:10, ".rds"), iter.os = 2, ncores = 2
)
sum.modp <- summary(modp)
print(sum.modp, unpen = TRUE)
plot(modp)

# Compare the divide and conquer method unpenalized model fit against a
# standard Cox fit using the whole dataset
std <- coxph(
  as.formula(paste0("Surv(t0,t1,status)~", paste(paste0("V", 4:103), collapse = "+"))),
  data = dat.mat0
)
plot(mod$coefficients.unpen, std$coefficients)
# This second plot compares the parallel fit with the single-core fit.
plot(modp$coefficients.unpen, mod$coefficients.unpen)

## A time-dependent Cox model, with stratification of baseline hazard:
## Surv(t0,t1,status)~strata(strat)+V4+V5+...+V103

# Example option 1: Using 1 core for computation, loading dataset from
# 10 chunks [each time the RAM will only contain data from 1 file], using
# 4 iterations of one-step estimator for update
mod <- dcalasso(
  as.formula(paste0("Surv(t0,t1,status)~strata(strat)+", paste(paste0("V", 4:103), collapse = "+"))),
  family = "cox.ph",
  data.rds = paste0(dir, "dataTV", 1:10, ".rds"), iter.os = 4
)
sum.mod <- summary(mod)
print(sum.mod, unpen = TRUE)
plot(mod)

# Example option 2: Using 2 cores for parallel computation, loading dataset
# from 10 chunks [each time the RAM will only contain data from 1 file],
# using 4 iterations of one-step estimator for update
# (K = 10 is passed explicitly here, matching the number of files.)
modp <- dcalasso(
  as.formula(paste0("Surv(t0,t1,status)~strata(strat)+", paste(paste0("V", 4:103), collapse = "+"))),
  family = "cox.ph",
  data.rds = paste0(dir, "dataTV", 1:10, ".rds"), K = 10, iter.os = 4, ncores = 2
)
sum.modp <- summary(modp)
print(sum.modp, unpen = TRUE)
plot(modp)

# Compare the divide and conquer method unpenalized model fit against a
# standard Cox fit using the whole dataset
std <- coxph(
  as.formula(paste0("Surv(t0,t1,status)~strata(strat)+", paste(paste0("V", 4:103), collapse = "+"))),
  data = dat.mat0
)
plot(mod$coefficients.unpen, std$coefficients)
# This second plot compares the parallel fit with the single-core fit.
plot(modp$coefficients.unpen, mod$coefficients.unpen)