Penalised regression improves imputation of cell-type specific expression using RNA-seq data from mixed cell populations compared to domain-specific methods

Modified

23-Jan-25

[1] "/rds/project/rds-csoP2nj6Y6Y/wyl37/Rbookdown/sceQTLGen/CLUSTERdeconRes"

1 Main Figures

1.1 Figure1, multi-response LASSO/ridge

multi-response LASSO/ridge to predict sample-level cell-type expression

1.2 Figure 2, Data and study design

CLUSTER samples by cell type (row) and subject (column). Cells are coloured based on the availability of RNA (Y for yes, N for no), and the top panel annotations indicate the RNA sequencing batch (Batch).

Data analysis workflow. Transcripts per million (TPM) were calculated after excluding low-expressed genes. TPM from sorted cells (CD4, CD8, CD14, and CD19) from 80 training samples were used to generate custom signature genes using the CIBERSORTxFractions module. We deconvoluted the cell fractions from PBMC based on inbuilt and custom signatures using CIBERSORTx, using the custom signature genes with bMIND and cell-type specific genes using debCAM. Estimates of cell fractions were compared to the ground-truth cell fractions from flow cytometry, and we assessed fraction accuracy using Pearson correlation and RMSE (root mean square error). Next, we estimated sample-level cell-type gene expression based on inbuilt and custom signature matrices using the CIBERSORTx high resolution module. In parallel, we ran bMIND and swCAM, with the flow cytometry cell fractions, in a supervised mode for estimating cell-type expression. For each cell type, we trained a LASSO/RIDGE model on PBMC and sorted cells with 5-fold cross-validation and used this to predict cell-type gene expression in the test samples. We compared imputed cell-type expressions with the observed ones and evaluated and benchmarked the performance using Pearson correlation, RMSE and a novel measure, differential gene expression (DGE) recovery.

1.3 Figure 3, Prediction accuracy of cell fractions

Code

####################
## flow fractions ##
####################
fractflow <- file.path(
  data_dir,
  "CLUSTERData/flowFract_wide.rds"
) %>%
  readRDS()



## cibx-LM22
#############################
## fractions based on LM22 ##
#############################
lm22merged <- "data4repo/mergedClasses.csv" %>%
  file.path(here::here(), .) %>%
  scan(., what = "character", sep = "\t")

res_d <- file.path("CIBXres/LM22deconv", runid) %>%
  file.path(data_dir, .)
res_d
[1] "/rds/project/rds-csoP2nj6Y6Y/wyl37/Rbookdown/sceQTLGen/CLUSTERdeconRes/CIBXres/LM22deconv/cluster"

lm22mtx <- file.path(res_d, "sigMTX4dev.txt") %>%
  fread()

lm22res <- file.path(res_d, "CIBERSORTxGEP_NA_Fractions-Adjusted.txt") %>%
  fread()

stopifnot(names(lm22res)[-1][1:length(names(lm22mtx)[-1])] == names(lm22mtx)[-1])

## fractions in 22 cell types
lm22res.mtx <- lm22res[, names(lm22mtx)[-1], with = FALSE] %>% as.matrix()
rownames(lm22res.mtx) <- lm22res$Mixture

## fractions merged into 4 cell types based on lm22merged
lm22res.4ct <- matrix(NA, nrow = nrow(lm22res.mtx), ncol = length(sctorder))
rownames(lm22res.4ct) <- lm22res$Mixture

for (n in seq_along(sctorder)) {
  colidx <- which(lm22merged == sctorder[n])
  lm22res.4ct[, n] <- rowSums(lm22res.mtx[, colidx, drop = FALSE])
}

## merged class for 4 sorted cells
ctlm22pooled <- lapply(sctorder, function(ct) {
  colnames(lm22res.mtx)[lm22merged == ct]
})
names(ctlm22pooled) <- sctorder

## fractions of 4 cell types scaled to 1
lm22res.4ct.scaled <- apply(lm22res.4ct, 1, function(x) 1 / sum(x) * x) %>% t()
colnames(lm22res.4ct.scaled) <- sctorder
lm22res.4ct.scaled <- lm22res.4ct.scaled[rownames(fractflow), ]

##################
## sorted cells ##
##################
res_d <- file.path("CIBXres/deconv", runid) %>%
  file.path(data_dir, .)
res_d
[1] "/rds/project/rds-csoP2nj6Y6Y/wyl37/Rbookdown/sceQTLGen/CLUSTERdeconRes/CIBXres/deconv/cluster"

customfract <- file.path(res_d, "CIBERSORTxGEP_NA_Fractions.txt") %>%
  fread()
customres <- customfract[, ..sctorder] %>% as.matrix()
rownames(customres) <- customfract$Mixture
customres <- customres[rownames(fractflow), ]

###########
## bMIND ##
###########
res_d <- file.path("bMINDres", runid) %>%
  file.path(data_dir, .)
res_d
[1] "/rds/project/rds-csoP2nj6Y6Y/wyl37/Rbookdown/sceQTLGen/CLUSTERdeconRes/bMINDres/cluster"

bmindresFract <- readRDS(file.path(res_d, "bMIND_Fractres.rds"))[["frac"]]
bmindresFract <- bmindresFract[rownames(fractflow), ]

############
## debCAM ##
############
res_d <- file.path("swCAMres", runid) %>%
  file.path(data_dir, .)
res_d
[1] "/rds/project/rds-csoP2nj6Y6Y/wyl37/Rbookdown/sceQTLGen/CLUSTERdeconRes/swCAMres/cluster"

## 4 fold changes (FC) for marker selection
debCAMresFract <- readRDS(file.path(res_d, "debCAM_FractresPure80.rds"))
debCAMresFract %>% names() # OVER.FC_1, FC_2, FC_5, FC_10
[1] "OVE.FC_1"  "OVE.FC_2"  "OVE.FC_5"  "OVE.FC_10"

## go for fraction results based on FC10
debFract <- debCAMresFract$OVE.FC_10$Aest[rownames(fractflow), ]

#######################
## flow + pred fract ##
#######################
## true fraction longformat
flowfract.m <- copy(fractflow) %>%
  setDT(., keep.rownames = TRUE) %>%
  melt(
    data = .,
    id.vars = c("rn"),
    measure.vars = sctorder,
    variable.name = "celltype",
    variable.factor = FALSE,
    value.name = "flowfract"
  ) %>%
  setnames(., "rn", "sampleName")

## pred fraction in

fract_appro <- c("CIBX-inbuilt", "CIBX-custom", "bMIND-custom", "debCAM")

predfract <- list(
  lm22res.4ct.scaled,
  customres,
  bmindresFract,
  debFract
)
names(predfract) <- fract_appro

pltfract <- lapply(names(predfract), function(dat) {
  dsn <- as.data.frame(predfract[[dat]]) %>% setDT(., keep.rownames = TRUE)
  dsn.m <- melt(
    data = dsn,
    id.vars = c("rn"),
    measure.vars = sctorder,
    variable.name = "celltype",
    variable.factor = FALSE,
    value.name = "predfract",
  )
  dsn.m[, frt_type := dat]
  setnames(dsn.m, "rn", "sampleName")

  dsn.m[flowfract.m, flowfract := i.flowfract, on = c("sampleName", "celltype")]

  dsn.m
}) %>% rbindlist(., use.names = TRUE)

## limited to test samples
pltfract_test <- pltfract[sampleName %in% sampdata[samtype != "refsamp", sample], ]
pltfract_test[, ":="(
  frt_type = factor(frt_type, levels = fract_appro),
  celltype = factor(celltype, levels = sctorder)

)]

## r & rmse
cordsn <- pltfract_test[,
  {
    rhores <- cor(flowfract, predfract)
    rmse <- sqrt(mean((flowfract - predfract)^2))
    list(rhores, rmse)
  },
  by = c("frt_type", "celltype")
][, tlab := paste0("R=", round(rhores, 2), "\n", "RMSE=", round(rmse, 3))]

fig2plt <- ggpubr::ggscatter(
  data = pltfract_test,
  x = "flowfract", y = "predfract",
  fill = "celltype", shape = 21, size = 2,
  facet.by = c("frt_type", "celltype"),
  alpha = 1,
  palette = ctfrat.pal,
  position = "jitter",
  xlab = "Flow Fraction",
  ylab = "Predicted Fraction"
) +
  geom_abline(slope = 1, intercept = 0, linetype = 2, color = "blue") +
  facet_grid(frt_type ~ celltype, scales = "fixed") +
  ggpp::geom_text_npc(
    aes(
      npcx = 0.05, npcy = 0.97,
      label = tlab
    ),
    size = 3.5,
    data = cordsn
  ) +
  used_ggthemes(
    strip.text.y.right = element_text(angle = 0),
    legend.position = "none"
  )

Prediction accuracy of cell fractions by cell type (column) and approaches (row). Pearson correlation (R) and root mean square errors (RMSE) were calculated between estimated fractions (y-axis) and flow cytometry measures (x-axis). Each point is a testing sample and dashed blue lines indicate \(y = x\). CIBX-inbuilt: CIBERSORTx fraction deconvolution using the inbuilt signature matrix; CIBX-custom: CIBERSORTx fraction deconvolution using the custom signature matrix; bMIND-custom: bMIND fraction estimation using the custom signature matrix; debCAM-custom: debCAM fraction estimation using cell-type specific genes.

Code

fig2plt

1.4 Figure 4, Prediction accuracy of cell-type expression

Distributions of prediction accuracy by cell type and approach. Prediction accuracy was evaluated by calculating Pearson correlation and root mean square error (RMSE) between observed and predicted cell-type expression across genes for the same subjects by cell type and approach. The approaches used were: inbuilt - CIBERSORTx expression deconvolution with the inbuilt signature matrix, custom - CIBERSORTx expression deconvolution with a custom signature matrix derived from sorted cell-type expression in training samples, bMIND - bMIND expression deconvolution with flow fractions, swCAM - swCAM deconvolution with flow fractions, and LASSO/RIDGE - expression predicted from regularised multi-response Gaussian models.

Code

## expression from approaches
cdata_explst <- prep_dat(
  rid = "cluster",
  res_dir = data_dir,
  cluster = TRUE
)

## predicted accuracy

numCores <- length(expr_order)
cl <- makeCluster(numCores, type = "FORK")
registerDoParallel(cl)

pedaccdsn_persub <- foreach(i = expr_order, .combine = "rbind") %dopar% {
  run_fig3v(
    obvexpr = cdata_explst$oexpr, impvexpr = cdata_explst$iexpr,
    sce = i, ct = sctorder, persub = TRUE
  ) %>%
    rbindlist()
}

stopCluster(cl)


pedaccdsn_persub[, .N, by = c("approach", "celltype")]

pedaccdsn_persub[, ":="(
  approach = factor(approach, levels = expr_order),
  celltype = factor(celltype, levels = sctorder)
)][, measure_rmse := log(measure_rmse)]


measureii <- c("Correlation per subject", "log(RMSE) per subject")
names(measureii) <- c("measure_cor", "measure_rmse")

lapply(seq_along(measureii), function(idx) {
  bplt <- ggboxplot(
    data = pedaccdsn_persub,
    x = "approach",
    xlab = "",
    y = names(measureii)[idx],
    ylab = measureii[idx],
    fill = "approach",
    palette = appro.cols %>% as.vector(),
    facet.by = c("celltype")
  ) +
    facet_grid(~celltype, scales = "free")

  if (idx == 1) {
    bplt <- bplt +
      used_ggthemes(
        axis.text.x = element_blank(),
        legend.position = "none"
      )
  } else {
    bplt <- bplt +
      used_ggthemes(
        axis.text.x = element_blank(),
        legend.position = "none",
        strip.text = element_blank()
      )
  }

  bplt
})

Code

numCores <- length(expr_order)
cl <- makeCluster(numCores, type = "FORK")
registerDoParallel(cl)

pedaccdsn <- foreach(i = expr_order, .combine = "rbind") %dopar% {
  run_fig3v(
    obvexpr = cdata_explst$oexpr, impvexpr = cdata_explst$iexpr,
    sce = i, ct = sctorder, persub = FALSE
  ) %>%
    rbindlist()
}

stopCluster(cl)


pedaccdsn[, .N, by = c("approach", "celltype")]
    approach celltype     N
 1:  inbuilt      CD4  6261
 2:  inbuilt      CD8  8003
 3:  inbuilt     CD14  7522
 4:  inbuilt     CD19  5779
 5:   custom      CD4  7241
 6:   custom      CD8 11794
 7:   custom     CD14  7823
 8:   custom     CD19  5881
 9:    bMIND      CD4 18871
10:    bMIND      CD8 18871
11:    bMIND     CD14 18821
12:    bMIND     CD19 18864
13:    swCAM      CD4 18871
14:    swCAM      CD8 18870
15:    swCAM     CD14 18867
16:    swCAM     CD19 18871
17:    LASSO      CD4 18871
18:    LASSO      CD8 18871
19:    LASSO     CD14 18053
20:    LASSO     CD19 18854
21:    RIDGE      CD4 18871
22:    RIDGE      CD8 18871
23:    RIDGE     CD14 18555
24:    RIDGE     CD19 18747
    approach celltype     N

pedaccdsn[, ":="(
  approach = factor(approach, levels = expr_order),
  celltype = factor(celltype, levels = sctorder)
)][, measure_rmse := log(measure_rmse)]

Distributions of prediction accuracy by cell type and approach. Prediction accuracy was evaluated by calculating Pearson correlation and root mean square error (RMSE) between observed and predicted cell-type expression across testing samples for each gene by cell type and approach. Standardised RMSEs: RMSE/average observed expression. The approaches used were: inbuilt - CIBERSORTx expression deconvolution with the inbuilt signature matrix, custom - CIBERSORTx expression deconvolution with a custom signature matrix derived from sorted cell-type expression in training samples, bMIND - bMIND expression deconvolution with flow fractions, swCAM - swCAM deconvolution with flow fractions, and LASSO/RIDGE - expression predicted from regularised multi-response Gaussian models.

Code

measureii <- c("Correlation per gene", "log(standardised RMSE) per gene")
names(measureii) <- c("measure_cor", "measure_rmse")

lapply(seq_along(measureii), function(idx) {
  aplt <- ggboxplot(
    data = pedaccdsn,
    x = "approach",
    xlab = "",
    y = names(measureii)[idx],
    ylab = measureii[idx],
    fill = "approach",
    palette = appro.cols %>% as.vector(),
    facet.by = c("celltype")
  ) +
    scale_fill_manual(labels = appro_lab, values = appro.cols) +
    facet_grid(~celltype, scales = "free")

  if (idx == 1) {
    aplt <- aplt +
      used_ggthemes(
        axis.text.x = element_blank(),
        legend.position = "none",
        strip.text = element_blank()
      )
  } else {
    aplt <- aplt +
      used_ggthemes(
        axis.text.x = element_blank(),
        legend.position = "top",
        strip.text = element_blank()
      )
  }


  aplt
})

Code

fig4_f <- "fig4res.rds"

if (!file.exists(fig4_f)) {
  ## prepare data to run fig4
  myrunfig4 <- run_datprepfig4(
    obvexpr = cdata_explst$oexpr,
    impvexpr = cdata_explst$iexpr,
    tsampct = cdata_explst$trainsamp
  )

  names(myrunfig4)

  ## dge & roc
  fig4res <- run_fig4(myrunfig4)

  names(fig4res)

  saveRDS(fig4res, file = fig4_f)
}

if (file.exists(fig4_f)) {
  fig4res <- readRDS(fig4_f)
}

1.5 Figure 5 , Differential gene expression (DGE) recovery

Code

fig4_f <- "fig4rundata.rds"

if (!file.exists(fig4_f)) {
  ## main: different gene set; common: same genes per cell across methods
  ## impvexpr_all <- list(main = expr_ori, common = expr_scenarios)
  impvexpr_all <- list(
    main = cdata_explst$iexpr,
    common = cdata_explst$icexpr
  )

  ## TRUE for dichotomous; FALSE for continuous
  dcht_pheno <- c(TRUE, FALSE)
  names(dcht_pheno) <- c("dcht", "cntn")


  numCores <- length(expr_order)
  numCores

  cl <- makeCluster(numCores, type = "FORK")
  registerDoParallel(cl)

  aucall <- foreach(
    j = seq_along(impvexpr_all),
    .final = function(x) setNames(x, names(impvexpr_all))
  ) %:%
    foreach(
      i = dcht_pheno,
      .final = function(x) setNames(x, names(dcht_pheno))
    ) %dopar% {
      observy <- cdata_explst$oexpr
      train_samps_ct <- cdata_explst$trainsamp
      sampdata <- cdata_explst$phenodata

      dat <- run_datprepfig4(
        obvexpr = observy,
        impvexpr = impvexpr_all[[j]],
        tsampct = train_samps_ct,
        dich = i
      )
      ## expr ~ pheno
      res1 <- run_fig4(datprlst = dat, truesig = 0.05)

      ## expr~ sex + pheno
      res2 <- run_fig4(
        datprlst = dat, truesig = 0.05,
        covdata = sampdata, covid = "sample", covar = "sex"
      )

      list(nocov = res1, covin = res2)
    }

  stopCluster(cl)

  mychk <- unlist(aucall, recursive = FALSE) %>%
    unlist(., recursive = FALSE) %>%
    lapply(., "[[", "aucres") %>%
    lapply(., function(datin) {
      datin[, ":="(
        celltype = factor(celltype, levels = sctorder),
        approach = factor(approach, levels = expr_order))]
      datin
    }) %>%
    rbindlist(l = ., idcol = "scenarios") %>%
    .[, pheno := gsub("^main\\.|^common\\.", "", scenarios)] %>%
    .[, scenario := str_extract(scenarios, "main|common")]


  saveRDS(mychk, file = fig4_f)
} else {
  mychk <- readRDS(fig4_f)
}

Code

pheno.order <- CJ(c("dcht", "cntn"), c("nocov", "covin"), sorted = FALSE)[, paste(V1, V2, sep = ".")]
pheno.order
[1] "dcht.nocov" "dcht.covin" "cntn.nocov" "cntn.covin"

mychk[, pheno := factor(
  pheno,
  levels = pheno.order,
  labels = c("dichPheno", "dichPheno+sex", "contPheno", "contPheno+sex")
)]

ggboxplot(
  data = mychk[scenario == "main", ],
  x = "approach", y = "auc", ylab = "AUC",
  palette = appro.cols,
  alpha = 0.35, fill = "approach",
  add = c("jitter"),
  add.params = list(shape = 21, size = 2.5, alpha = 1)
) +
  scale_fill_manual(labels = appro_lab, values = appro.cols) +
  facet_grid(pheno ~ celltype, scales = "free") +
  labs(fill = NULL) +
  used_ggthemes(
    axis.text.x = element_blank(),
    legend.position = "top",
    strip.text.y.right = element_text(angle = 0),
    legend.text = element_text(size = rel(1))
  ) +
  rremove("xlab")

Differential gene expression (DGE) recovery. Area under curve (AUC) distributions by cell type (columns) and approach for each scenario (rows). dichPheno: simulated dichotomous phenotype; dichPheno+sex: simulated dichotomous phenotype and sex; contPheno: continous phenotype; contPheno+sex: continous phenotype and sex. Each point is a simulated phenotype, and there are ten simulated phenotypes. For each simulated phenotype, the receiver operating characteristic curve and AUC were estimated by FDR fixed at 0.05 in the observed data and varied FDRs from 0 to 1 by 0.05 in the imputed data. Box plots showed the AUC distributions, with horizontal lines from the bottom to the top for 25 %, 50 % and 75 % quantiles, respectively. inbuilt: CIBERSORTx with the inbuilt signature matrix; custom: CIBERSORTx with a custom signature matrix; bMIND: bMIND with flow fractions; swCAM: swCAM with flow fractions; LASSO/RIDGE: regularised multi-response Gaussian models.

Code

mychk_tbl <- copy(mychk)
mychk_tbl[, .N, by = c("scenarios", "celltype", "approach")]
             scenarios celltype approach  N
  1:   main.dcht.nocov      CD4  inbuilt 10
  2:   main.dcht.nocov      CD4   custom 10
  3:   main.dcht.nocov      CD4    bMIND 10
  4:   main.dcht.nocov      CD4    swCAM 10
  5:   main.dcht.nocov      CD4    LASSO 10
 ---                                       
188: common.cntn.covin     CD19   custom 10
189: common.cntn.covin     CD19    bMIND 10
190: common.cntn.covin     CD19    swCAM 10
191: common.cntn.covin     CD19    LASSO 10
192: common.cntn.covin     CD19    RIDGE 10

tbl_summary <- mychk_tbl[, list(
  Q50 = median(auc) %>% round(., 2),
  Q25 = quantile(auc, probs = 0.25) %>% round(., 2),
  Q75 = quantile(auc, probs = 0.75) %>% round(., 2)
),
by = c("scenarios", "celltype", "approach")
][
  , V1 := paste0(Q50, " (", Q25, "-", Q75, ")")
]

tbl_summary[, pheno := gsub("^main\\.|^common\\.", "", scenarios)]
tbl_summary[, scenario := str_extract(scenarios, "main|common")]

tbl_res <- dcast(
  data = tbl_summary, pheno + celltype ~ scenario + approach,
  value.var = "V1"
)

setDF(tbl_res, rownames = paste0(tbl_res$pheno, tbl_res$celltype))

pheno.order <- CJ(c("dcht", "cntn"), c("nocov", "covin"), sorted = FALSE)[, paste(V1, V2, sep = ".")]

cols.order <- CJ(c("main", "common"), V2 = expr_order, sorted = FALSE)[, paste(V1, V2, sep = "_")]

rows.order <- CJ(pheno.order, sctorder, sorted = FALSE)[, paste0(pheno.order, sctorder)]

tbl_out <- tbl_res[rows.order, c("celltype", grep("main", cols.order, value = TRUE))]
names(tbl_out) <- gsub("main_|common_", "", names(tbl_out))

kbl(
  x = tbl_out, row.names = FALSE,
  caption = "Median AUC (Q25-Q75) across 10 simulated phentoypes per cell by apparoch. **No. predicted genes vary with cell type and approaches.**"
) %>%
  kable_paper("striped", full_width = FALSE) %>%
  pack_rows("dichotomous pheno", 1, 4,
    label_row_css = "text-align: left;"
  ) %>%
  pack_rows("dichotomous pheno + sex", 5, 8) %>%
  pack_rows("continous pheno", 9, 12) %>%
  pack_rows("continous pheno + sex", 13, 16) %>%
  column_spec(column = 6:7, bold = T)

Median AUC (Q25-Q75) across 10 simulated phentoypes per cell by apparoch. **No. predicted genes vary with cell type and approaches.**
celltype	inbuilt	custom	bMIND	swCAM	LASSO	RIDGE
dichotomous pheno
CD4	0.72 (0.7-0.74)	0.77 (0.74-0.79)	0.74 (0.68-0.76)	0.72 (0.67-0.72)	0.84 (0.83-0.86)	0.85 (0.83-0.87)
CD8	0.63 (0.62-0.64)	0.7 (0.66-0.76)	0.71 (0.6-0.77)	0.69 (0.58-0.73)	0.84 (0.82-0.85)	0.85 (0.83-0.86)
CD14	0.66 (0.62-0.72)	0.71 (0.68-0.77)	0.69 (0.68-0.72)	0.64 (0.61-0.66)	0.86 (0.84-0.88)	0.87 (0.84-0.9)
CD19	0.62 (0.56-0.7)	0.73 (0.67-0.75)	0.77 (0.76-0.79)	0.72 (0.7-0.73)	0.86 (0.81-0.89)	0.87 (0.83-0.91)
dichotomous pheno + sex
CD4	0.71 (0.69-0.76)	0.75 (0.74-0.78)	0.72 (0.67-0.76)	0.71 (0.65-0.72)	0.84 (0.82-0.86)	0.84 (0.82-0.87)
CD8	0.63 (0.62-0.64)	0.7 (0.65-0.73)	0.71 (0.6-0.76)	0.68 (0.59-0.71)	0.83 (0.81-0.86)	0.83 (0.8-0.85)
CD14	0.66 (0.61-0.72)	0.72 (0.68-0.77)	0.68 (0.68-0.72)	0.64 (0.61-0.66)	0.86 (0.84-0.88)	0.87 (0.86-0.89)
CD19	0.63 (0.56-0.68)	0.73 (0.71-0.74)	0.76 (0.76-0.79)	0.72 (0.7-0.74)	0.86 (0.83-0.89)	0.87 (0.84-0.9)
continous pheno
CD4	0.73 (0.68-0.75)	0.74 (0.71-0.75)	0.7 (0.67-0.72)	0.67 (0.66-0.69)	0.82 (0.79-0.84)	0.82 (0.8-0.84)
CD8	0.66 (0.63-0.67)	0.7 (0.68-0.72)	0.72 (0.69-0.74)	0.68 (0.66-0.71)	0.83 (0.78-0.85)	0.82 (0.79-0.85)
CD14	0.7 (0.62-0.74)	0.72 (0.7-0.76)	0.71 (0.69-0.77)	0.66 (0.62-0.66)	0.82 (0.81-0.84)	0.84 (0.81-0.85)
CD19	0.63 (0.57-0.7)	0.68 (0.65-0.74)	0.72 (0.67-0.73)	0.66 (0.64-0.7)	0.76 (0.72-0.87)	0.78 (0.73-0.87)
continous pheno + sex
CD4	0.71 (0.65-0.75)	0.73 (0.7-0.75)	0.69 (0.66-0.72)	0.66 (0.65-0.7)	0.8 (0.78-0.84)	0.81 (0.79-0.84)
CD8	0.66 (0.63-0.68)	0.7 (0.69-0.72)	0.71 (0.69-0.73)	0.67 (0.64-0.69)	0.81 (0.77-0.84)	0.8 (0.78-0.85)
CD14	0.69 (0.6-0.74)	0.72 (0.69-0.74)	0.7 (0.65-0.77)	0.65 (0.61-0.66)	0.81 (0.78-0.84)	0.82 (0.8-0.85)
CD19	0.63 (0.58-0.73)	0.7 (0.67-0.74)	0.73 (0.68-0.73)	0.65 (0.64-0.7)	0.76 (0.72-0.86)	0.77 (0.72-0.87)

1.6 Fig6, simdata

Code

simdata_dir <- file.path(here::here(), "simDataRes")
simdata_dir
[1] "/rds/project/rds-csoP2nj6Y6Y/wyl37/Rbookdown/sceQTLGen/simDataRes"

sim_ids <- list.dirs(file.path(simdata_dir, "deconvInput"),
  full.names = FALSE, recursive = FALSE
)
sim_ids
 [1] "1Msim_Combatdat.160.1.100" "1Msim_Combatdat.160.1.25" 
 [3] "1Msim_Combatdat.160.1.50"  "1Msim_Combatdat.160.1.75" 
 [5] "1Msim_Combatdat.160.2.100" "1Msim_Combatdat.160.2.25" 
 [7] "1Msim_Combatdat.160.2.50"  "1Msim_Combatdat.160.2.75" 
 [9] "1Msim_Combatdat.160.3.100" "1Msim_Combatdat.160.3.25" 
[11] "1Msim_Combatdat.160.3.50"  "1Msim_Combatdat.160.3.75" 
[13] "1Msim_dat.160.1.100"       "1Msim_dat.160.1.25"       
[15] "1Msim_dat.160.1.50"        "1Msim_dat.160.1.75"       
[17] "1Msim_dat.160.2.100"       "1Msim_dat.160.2.25"       
[19] "1Msim_dat.160.2.50"        "1Msim_dat.160.2.75"       
[21] "1Msim_dat.160.3.100"       "1Msim_dat.160.3.25"       
[23] "1Msim_dat.160.3.50"        "1Msim_dat.160.3.75"       

names(sim_ids) <- sim_ids

sim_ids <- str_sort(sim_ids, numeric = TRUE)
sim_ids
 [1] "1Msim_Combatdat.160.1.25"  "1Msim_Combatdat.160.1.50" 
 [3] "1Msim_Combatdat.160.1.75"  "1Msim_Combatdat.160.1.100"
 [5] "1Msim_Combatdat.160.2.25"  "1Msim_Combatdat.160.2.50" 
 [7] "1Msim_Combatdat.160.2.75"  "1Msim_Combatdat.160.2.100"
 [9] "1Msim_Combatdat.160.3.25"  "1Msim_Combatdat.160.3.50" 
[11] "1Msim_Combatdat.160.3.75"  "1Msim_Combatdat.160.3.100"
[13] "1Msim_dat.160.1.25"        "1Msim_dat.160.1.50"       
[15] "1Msim_dat.160.1.75"        "1Msim_dat.160.1.100"      
[17] "1Msim_dat.160.2.25"        "1Msim_dat.160.2.50"       
[19] "1Msim_dat.160.2.75"        "1Msim_dat.160.2.100"      
[21] "1Msim_dat.160.3.25"        "1Msim_dat.160.3.50"       
[23] "1Msim_dat.160.3.75"        "1Msim_dat.160.3.100"      

simdata_dir
[1] "/rds/project/rds-csoP2nj6Y6Y/wyl37/Rbookdown/sceQTLGen/simDataRes"

noimpvgene <- mclapply(sim_ids, function(runid) {
  resin <- prep_dat(rid = runid, res_dir = simdata_dir)
  ## main
  mainres <- sapply(unlist(resin$iexpr, recursive = FALSE), nrow)
  ## common
  commres <- sapply(unlist(resin$icexpr, recursive = FALSE), nrow)

  data.table(
    runid = runid,
    appro = tstrsplit(names(mainres), "\\.", keep = 1L) %>% unlist(),
    celltype = tstrsplit(names(mainres), "\\.", keep = 2L) %>% unlist(),
    mainNo = mainres,
    commNo = commres[match(names(mainres), names(commres))]
  )
}, mc.cores = 6) %>% rbindlist()

Code

noimpvgene[, ":="(
  rungrp = str_remove(runid, pattern = "\\.[[:digit:]]+$"),
  trainPer = tstrsplit(runid, "\\.", keep = 4L) %>%
    unlist(),
  CombatYN = ifelse(str_detect(runid, "Combatdat"), "Y", "N")
)]

simnopredtbl <- dcast(data = noimpvgene, celltype + runid ~ appro, value.var = "mainNo")
simnopredtbl[, celltype := factor(celltype, sctorder)]
simnopredtbl[, runid := factor(runid, levels = unique(runid) %>% str_sort(., numeric = TRUE))]

setorder(simnopredtbl, celltype, runid)
setcolorder(simnopredtbl, neworder = c("runid", "celltype", expr_order))

r_idx <- apply(simnopredtbl[, ..expr_order], 1, function(x) any(x < 100)) %>% which()

kbl(x = simnopredtbl, caption = "No.predicted genes", booktabs = TRUE) %>%
  kable_paper("striped", full_width = F) %>%
  row_spec(row = r_idx, bold = TRUE)

No.predicted genes
runid	celltype	inbuilt	custom	bMIND	swCAM	LASSO	RIDGE
1Msim_Combatdat.160.1.25	CD4	3353	4776	11986	11987	11982	11646
1Msim_Combatdat.160.1.50	CD4	3272	4665	11993	11994	11913	11913
1Msim_Combatdat.160.1.75	CD4	3619	4924	11995	11996	11910	11994
1Msim_Combatdat.160.1.100	CD4	3519	4714	12005	12006	12004	11931
1Msim_Combatdat.160.2.25	CD4	2108	4347	11961	11975	11926	11972
1Msim_Combatdat.160.2.50	CD4	2853	4284	11977	11981	11979	11979
1Msim_Combatdat.160.2.75	CD4	2716	4356	11979	11981	11979	11914
1Msim_Combatdat.160.2.100	CD4	2703	4295	11973	11974	11972	11972
1Msim_Combatdat.160.3.25	CD4	3183	6160	11982	11983	11879	11707
1Msim_Combatdat.160.3.50	CD4	3502	5601	11971	11976	11942	11974
1Msim_Combatdat.160.3.75	CD4	3465	5535	11976	11981	11979	11935
1Msim_Combatdat.160.3.100	CD4	3698	5559	12461	12464	12461	12416
1Msim_dat.160.1.25	CD4	1128	4388	11977	11977	11917	11898
1Msim_dat.160.1.50	CD4	1053	4791	11981	11981	11955	11907
1Msim_dat.160.1.75	CD4	1215	4319	11992	11992	11990	11990
1Msim_dat.160.1.100	CD4	1212	4301	11996	11996	11994	11994
1Msim_dat.160.2.25	CD4	1085	4112	11983	11984	11886	11945
1Msim_dat.160.2.50	CD4	1512	4191	11998	11999	11997	11997
1Msim_dat.160.2.75	CD4	1384	4237	11997	11997	11995	11981
1Msim_dat.160.2.100	CD4	1373	4917	11986	11986	11974	11974
1Msim_dat.160.3.25	CD4	973	4691	11964	11966	11905	11855
1Msim_dat.160.3.50	CD4	1076	5242	11960	11964	11933	11928
1Msim_dat.160.3.75	CD4	1160	5215	11963	11964	11962	11949
1Msim_dat.160.3.100	CD4	2026	5225	12463	12463	12460	12460
1Msim_Combatdat.160.1.25	CD8	2595	4292	11975	11966	11924	11723
1Msim_Combatdat.160.1.50	CD8	2569	4295	11978	11966	11986	11986
1Msim_Combatdat.160.1.75	CD8	2737	4137	11986	11991	11939	11939
1Msim_Combatdat.160.1.100	CD8	3021	4235	11997	11997	11999	11999
1Msim_Combatdat.160.2.25	CD8	2638	5067	11941	11954	11886	11884
1Msim_Combatdat.160.2.50	CD8	3300	5010	11944	11978	11974	11974
1Msim_Combatdat.160.2.75	CD8	3244	4941	11953	11974	11974	11899
1Msim_Combatdat.160.2.100	CD8	3380	4850	11958	11970	11968	11968
1Msim_Combatdat.160.3.25	CD8	2626	6193	11958	11982	11976	11805
1Msim_Combatdat.160.3.50	CD8	2714	5679	11941	11975	11970	11970
1Msim_Combatdat.160.3.75	CD8	2821	5603	11943	11980	11975	11975
1Msim_Combatdat.160.3.100	CD8	2960	5563	12439	12461	12457	12457
1Msim_dat.160.1.25	CD8	2291	4780	11964	11977	11970	11970
1Msim_dat.160.1.50	CD8	2158	4107	11951	11981	11953	11974
1Msim_dat.160.1.75	CD8	2142	4735	11979	11992	11985	11985
1Msim_dat.160.1.100	CD8	2063	4695	11978	11995	11989	11972
1Msim_dat.160.2.25	CD8	1984	5045	11929	11984	11884	11855
1Msim_dat.160.2.50	CD8	1789	5240	11945	11999	11975	11975
1Msim_dat.160.2.75	CD8	1804	5091	11966	11995	11965	11971
1Msim_dat.160.2.100	CD8	1903	4493	11972	11986	11979	11964
1Msim_dat.160.3.25	CD8	1823	5582	11939	11966	11960	11855
1Msim_dat.160.3.50	CD8	1766	4154	11924	11961	11960	11960
1Msim_dat.160.3.75	CD8	1765	4215	11912	11964	11919	11960
1Msim_dat.160.3.100	CD8	1958	4293	12432	12461	12455	12455
1Msim_Combatdat.160.1.25	CD14	2730	6680	11956	11981	11985	11956
1Msim_Combatdat.160.1.50	CD14	3545	6799	11976	11993	11993	11993
1Msim_Combatdat.160.1.75	CD14	3604	6772	11981	11992	11990	11981
1Msim_Combatdat.160.1.100	CD14	3767	6838	11992	12000	12005	12005
1Msim_Combatdat.160.2.25	CD14	3098	6798	11969	11935	11923	11933
1Msim_Combatdat.160.2.50	CD14	3426	6805	11976	11959	11961	11904
1Msim_Combatdat.160.2.75	CD14	3376	6808	11975	11973	11980	11962
1Msim_Combatdat.160.2.100	CD14	3382	6909	11971	11972	11937	11919
1Msim_Combatdat.160.3.25	CD14	2845	4604	11977	11931	11924	11957
1Msim_Combatdat.160.3.50	CD14	2760	4937	11970	11926	11955	11965
1Msim_Combatdat.160.3.75	CD14	2845	4999	11976	11917	11980	11980
1Msim_Combatdat.160.3.100	CD14	3529	5265	12459	12433	12462	12462
1Msim_dat.160.1.25	CD14	3875	3467	11950	11977	11930	11976
1Msim_dat.160.1.50	CD14	4268	2135	11966	11981	11980	11980
1Msim_dat.160.1.75	CD14	4333	3987	11973	11992	11991	11979
1Msim_dat.160.1.100	CD14	4386	3989	11985	11996	11986	11995
1Msim_dat.160.2.25	CD14	3770	4915	11976	11980	11967	11983
1Msim_dat.160.2.50	CD14	4077	4553	11991	11999	11970	11979
1Msim_dat.160.2.75	CD14	4213	4750	11995	11996	11988	11989
1Msim_dat.160.2.100	CD14	4172	2656	11983	11986	11985	11985
1Msim_dat.160.3.25	CD14	4353	3073	11960	11966	11940	11941
1Msim_dat.160.3.50	CD14	4395	4008	11962	11964	11964	11948
1Msim_dat.160.3.75	CD14	4267	4040	11961	11964	11949	11944
1Msim_dat.160.3.100	CD14	4457	4207	12461	12459	12455	12455
1Msim_Combatdat.160.1.25	CD19	8	982	11924	10073	11922	11919
1Msim_Combatdat.160.1.50	CD19	55	968	11863	9961	11935	11906
1Msim_Combatdat.160.1.75	CD19	14	892	11863	9945	11946	11946
1Msim_Combatdat.160.1.100	CD19	51	927	11879	10128	11952	11952
1Msim_Combatdat.160.2.25	CD19	1	1059	11948	11381	11871	11893
1Msim_Combatdat.160.2.50	CD19	1	1095	11935	10023	11926	11904
1Msim_Combatdat.160.2.75	CD19	32	1018	11923	10167	11917	11927
1Msim_Combatdat.160.2.100	CD19	57	945	11895	8760	11910	11898
1Msim_Combatdat.160.3.25	CD19	6	86	11942	11653	11913	11932
1Msim_Combatdat.160.3.50	CD19	13	1298	11915	11364	11904	11928
1Msim_Combatdat.160.3.75	CD19	33	1342	11928	11311	11931	11901
1Msim_Combatdat.160.3.100	CD19	46	1476	12374	11824	12377	12377
1Msim_dat.160.1.25	CD19	1198	2589	11958	10689	11914	11906
1Msim_dat.160.1.50	CD19	1144	3409	11923	11183	11919	11914
1Msim_dat.160.1.75	CD19	1396	2252	11938	11159	11933	11930
1Msim_dat.160.1.100	CD19	1956	2323	11945	11578	11940	11940
1Msim_dat.160.2.25	CD19	1823	1475	11939	11887	11909	11924
1Msim_dat.160.2.50	CD19	2647	1478	11914	11992	11941	11934
1Msim_dat.160.2.75	CD19	2875	1391	11907	11961	11943	11943
1Msim_dat.160.2.100	CD19	2783	2870	11880	11973	11927	11923
1Msim_dat.160.3.25	CD19	1052	1821	11954	11370	11891	11906
1Msim_dat.160.3.50	CD19	787	2100	11939	10372	11908	11907
1Msim_dat.160.3.75	CD19	474	2085	11939	10721	11906	11913
1Msim_dat.160.3.100	CD19	2310	2124	12401	10990	12375	12366

Code

simres_f <- "simres_chunksize.rds"

if (!file.exists(simres_f)) {
  sim_pen_chunks <- lapply(seq_along(sim_ids), function(idx) {
    chunk_files <- file.path(
      simdata_dir,
      paste0("PenalisedInput/", sim_ids[idx])
    ) %>% list.files(
      pattern = "*_chunk[[:digit:]]+.rds",
      full.names = TRUE
    )

    chunk_celltypes <- str_extract(
      basename(chunk_files),
      "CD4|CD8|CD14|CD19"
    )

    chunk_s <- sapply(chunk_files, function(c_f) {
      readRDS(c_f)[["yyinput"]] %>% ncol()
    })

    data.table(celltype = chunk_celltypes, chunksize = chunk_s)
  }) %>%
    set_names(x = ., value = sim_ids) %>%
    rbindlist(., idcol = "simid")

  saveRDS(object = sim_pen_chunks, file = simres_f)
}

if (file.exists(simres_f)) {
  sim_pen_chunks <- readRDS(simres_f)
}


## no.chunks by cell type across pesudobulk data
sim_pen_chunks[,
  {
    Nchunks <- .N
    sizesmin <- min(chunksize)
    sizesmax <- max(chunksize)
    list(Nchunks, sizesmin, sizesmax)
  },
  by = c("celltype")
][match(sctorder, celltype), ]
   celltype Nchunks sizesmin sizesmax
1:      CD4    1358        4      500
2:      CD8    1306        8      498
3:     CD14    1533        3      499
4:     CD19    1180        2      497

Code

simdata_dir
[1] "/rds/project/rds-csoP2nj6Y6Y/wyl37/Rbookdown/sceQTLGen/simDataRes"

simres_f <- "simres_all.rds"

if (!file.exists(simres_f)) {
  simres_all <- lapply(sim_ids, function(runid) {
    message(runid)
    datin <- prep_dat(rid = runid, res_dir = simdata_dir)

    commonexpr <- datin$icexpr

    ## 0 genes common across methods
    if (any(sapply(datin$icexpr[[1]], nrow) == 0)) {
      k_ct <- which(sapply(datin$icexpr[[1]], nrow) != 0) %>%
        names(datin$icexpr[[1]])[.]

      commonexpr <- lapply(datin$icexpr, function(xin) {
        xin[k_ct]
      })
    }

    impvexpr_all <- list(
      main = datin$iexpr,
      common = commonexpr
    )

    observy <- datin$oexpr
    train_samps_ct <- datin$trainsamp
    sampdata <- datin$phenodata

    ## sce1: train+test in [1] & [2]
    ## sce2: test in [1] & [2]

    sce_list <- list(
      sce1 = list(tsin = train_samps_ct, testsamp = FALSE),
      sce2 = list(tsin = NULL, testsamp = FALSE)
    )

    numCores <- length(impvexpr_all) * length(sce_list)
    numCores

    cl <- makeCluster(numCores, type = "FORK")
    registerDoParallel(cl)

    aucall <- foreach(
      j = seq_along(impvexpr_all),
      .final = function(x) setNames(x, names(impvexpr_all))
    ) %:%
      foreach(
        i = seq_along(sce_list),
        .final = function(x) setNames(x, names(sce_list))
      ) %dopar% {
        tsin <- sce_list[[i]][["tsin"]]

        ttin <- sce_list[[i]][["testsamp"]]

        dat <- run_datprepfig4(
          obvexpr = observy,
          impvexpr = impvexpr_all[[j]],
          tsampct = tsin,
          phenodata = sampdata,
          dich = TRUE
        )
        ## expr ~ pheno
        res1 <- run_fig4(
          datprlst = dat,
          truesig = 0.05, testsamp = ttin
        )

        ## expr~ batch + pheno
        res2 <- run_fig4(
          datprlst = dat, truesig = 0.05,
          testsamp = ttin,
          covdata = sampdata, covid = "sample",
          covar = "batch"
        )

        list(nocov = res1, covin = res2)
      }

    stopCluster(cl)

    aucall
  })

  names(simres_all) <- sim_ids

  simresmychk <- unlist(simres_all, recursive = FALSE) %>%
    unlist(., recursive = FALSE) %>%
    unlist(., recursive = FALSE) %>%
    lapply(., "[[", "aucres") %>%
    rbindlist(l = ., idcol = "runidsce")

  saveRDS(simresmychk, file = simres_f)
}

if (file.exists(simres_f)) {
  simresmychk <- readRDS(simres_f)
}

simresmychk[, scenario := str_extract(runidsce, "main|common")]
simresmychk[, trainper := tstrsplit(simresmychk$runidsce, "\\.", keep = 4L) %>% unlist()]
simresmychk[, sce := str_extract(runidsce, "sce[1|2|3]")]
simresmychk[, pheno := str_extract(runidsce, "nocov|covin")]
simresmychk[, pheno := ifelse(grepl("Combatdat", runidsce), paste0("Cb", pheno), pheno)]

simresmychk <- simresmychk[pheno != "Cbcovin", ]

simresmychk[, ":="(
  celltype = factor(celltype, levels = sctorder),
  approach = factor(approach, levels = expr_order),
  pheno = factor(pheno,
    levels = c("nocov", "covin", "Cbnocov"),
    labels = c("cond", "batch+cond", "Combat-Seq\ncond")
  )
)]

simresmychk[, trainper := as.numeric(trainper)]

## simresmychk[, trainper := factor(trainper, levels = unique(simresmychk$trainper) %>% str_sort(., numeric = TRUE))]
## n_trainper <- nlevels(simresmychk$trainper)
## n_trainper
## shape_m <- seq(21, length.out = n_trainper) %>%
##     set_names(levels(simresmychk$trainper))

simresmychk[, approach := appro_lab[match(simresmychk$approach, names(appro_lab))]]
simresmychk[, approach := factor(approach, appro_lab)]
appro.cols.mod <- appro.cols
names(appro.cols.mod) <- appro_lab

## CIBX-inbuilt imputed < 60 genes for CD19 under Combat-Seq cond
## unstable AUCs, set the rest as NA
simresmychk[pheno == "Combat-Seq\ncond" & celltype == "CD19" & approach == "CIBX-inbuilt", ]
                                       runidsce fakedPCs     approach celltype
 1:    1Msim_Combatdat.160.1.25.main.sce1.nocov     cond CIBX-inbuilt     CD19
 2:    1Msim_Combatdat.160.1.25.main.sce2.nocov     cond CIBX-inbuilt     CD19
 3:    1Msim_Combatdat.160.1.50.main.sce1.nocov     cond CIBX-inbuilt     CD19
 4:    1Msim_Combatdat.160.1.50.main.sce2.nocov     cond CIBX-inbuilt     CD19
 5:    1Msim_Combatdat.160.1.75.main.sce1.nocov     cond CIBX-inbuilt     CD19
 6:    1Msim_Combatdat.160.1.75.main.sce2.nocov     cond CIBX-inbuilt     CD19
 7:   1Msim_Combatdat.160.1.100.main.sce1.nocov     cond CIBX-inbuilt     CD19
 8:   1Msim_Combatdat.160.1.100.main.sce2.nocov     cond CIBX-inbuilt     CD19
 9:    1Msim_Combatdat.160.2.25.main.sce1.nocov     cond CIBX-inbuilt     CD19
10:    1Msim_Combatdat.160.2.25.main.sce2.nocov     cond CIBX-inbuilt     CD19
11:    1Msim_Combatdat.160.2.50.main.sce1.nocov     cond CIBX-inbuilt     CD19
12:    1Msim_Combatdat.160.2.50.main.sce2.nocov     cond CIBX-inbuilt     CD19
13:    1Msim_Combatdat.160.2.75.main.sce1.nocov     cond CIBX-inbuilt     CD19
14:    1Msim_Combatdat.160.2.75.main.sce2.nocov     cond CIBX-inbuilt     CD19
15:   1Msim_Combatdat.160.2.100.main.sce1.nocov     cond CIBX-inbuilt     CD19
16:   1Msim_Combatdat.160.2.100.main.sce2.nocov     cond CIBX-inbuilt     CD19
17:    1Msim_Combatdat.160.3.25.main.sce1.nocov     cond CIBX-inbuilt     CD19
18:    1Msim_Combatdat.160.3.25.main.sce2.nocov     cond CIBX-inbuilt     CD19
19:    1Msim_Combatdat.160.3.50.main.sce1.nocov     cond CIBX-inbuilt     CD19
20:    1Msim_Combatdat.160.3.50.main.sce2.nocov     cond CIBX-inbuilt     CD19
21:  1Msim_Combatdat.160.3.50.common.sce1.nocov     cond CIBX-inbuilt     CD19
22:  1Msim_Combatdat.160.3.50.common.sce2.nocov     cond CIBX-inbuilt     CD19
23:    1Msim_Combatdat.160.3.75.main.sce1.nocov     cond CIBX-inbuilt     CD19
24:    1Msim_Combatdat.160.3.75.main.sce2.nocov     cond CIBX-inbuilt     CD19
25:  1Msim_Combatdat.160.3.75.common.sce1.nocov     cond CIBX-inbuilt     CD19
26:  1Msim_Combatdat.160.3.75.common.sce2.nocov     cond CIBX-inbuilt     CD19
27:   1Msim_Combatdat.160.3.100.main.sce1.nocov     cond CIBX-inbuilt     CD19
28:   1Msim_Combatdat.160.3.100.main.sce2.nocov     cond CIBX-inbuilt     CD19
29: 1Msim_Combatdat.160.3.100.common.sce1.nocov     cond CIBX-inbuilt     CD19
30: 1Msim_Combatdat.160.3.100.common.sce2.nocov     cond CIBX-inbuilt     CD19
                                       runidsce fakedPCs     approach celltype
          auc scenario trainper  sce            pheno
 1: 0.5481005     main       25 sce1 Combat-Seq\ncond
 2:        NA     main       25 sce2 Combat-Seq\ncond
 3: 0.5030582     main       50 sce1 Combat-Seq\ncond
 4: 0.4960178     main       50 sce2 Combat-Seq\ncond
 5: 0.5923738     main       75 sce1 Combat-Seq\ncond
 6: 0.9588608     main       75 sce2 Combat-Seq\ncond
 7: 0.6417989     main      100 sce1 Combat-Seq\ncond
 8: 0.5084310     main      100 sce2 Combat-Seq\ncond
 9:        NA     main       25 sce1 Combat-Seq\ncond
10:        NA     main       25 sce2 Combat-Seq\ncond
11:        NA     main       50 sce1 Combat-Seq\ncond
12:        NA     main       50 sce2 Combat-Seq\ncond
13: 0.6268437     main       75 sce1 Combat-Seq\ncond
14: 0.5205106     main       75 sce2 Combat-Seq\ncond
15: 0.5907763     main      100 sce1 Combat-Seq\ncond
16: 0.5075536     main      100 sce2 Combat-Seq\ncond
17:        NA     main       25 sce1 Combat-Seq\ncond
18:        NA     main       25 sce2 Combat-Seq\ncond
19:        NA     main       50 sce1 Combat-Seq\ncond
20:        NA     main       50 sce2 Combat-Seq\ncond
21:        NA   common       50 sce1 Combat-Seq\ncond
22:        NA   common       50 sce2 Combat-Seq\ncond
23: 0.4792505     main       75 sce1 Combat-Seq\ncond
24: 0.9679359     main       75 sce2 Combat-Seq\ncond
25:        NA   common       75 sce1 Combat-Seq\ncond
26:        NA   common       75 sce2 Combat-Seq\ncond
27: 0.5507876     main      100 sce1 Combat-Seq\ncond
28: 0.5317240     main      100 sce2 Combat-Seq\ncond
29:        NA   common      100 sce1 Combat-Seq\ncond
30:        NA   common      100 sce2 Combat-Seq\ncond
          auc scenario trainper  sce            pheno

simresmychk[pheno == "Combat-Seq\ncond" & celltype == "CD19" & approach == "CIBX-inbuilt", auc := NA]

Code

## main: varying gene sets
## common: gene sets common across approaches
## sce1: train+ test; sce2: test only
simres_pltlist <- lapply(unique(simresmychk$scenario), function(scein) {
  lapply(c("sce1", "sce2"), function(xin) {
    ggplot(
      data = simresmychk[scenario == scein & sce == xin, ],
      aes(x = trainper, y = auc, colour = approach)
    ) +
      geom_point( # aes(shape = celltype),
        alpha = 0.5, size = 1.5
      ) +
      geom_smooth(
        aes(
          group = approach,
          colour = approach,
          linetype = approach,
          fill = approach
        ),
        alpha = 0.35, se = FALSE, linewidth = 0.5
      ) +
      scale_colour_manual(values = appro.cols.mod) +
      scale_fill_manual(values = appro.cols.mod) +
      facet_grid(cols = vars(celltype), rows = vars(pheno)) +
      ylab("AUC") +
      xlab("Percentage of 80 train samples") +
      scale_x_continuous(
        breaks = seq(25, 100, by = 25),
        labels = scales::label_percent(scale = 1)
      ) +
      used_ggthemes(
        strip.text.y.right = element_text(angle = 0),
        legend.position = "top"
      )
  }) %>% set_names(., c("sce1", "sce2"))
}) %>% set_names(., unique(simresmychk$scenario))

simres_pltlist$main$sce1

Differential gene expression (DGE) recovery. Area under curve (AUC) distributions (y-axis) across percentages of train samples (x-axis) by cell type (columns) for each scenario (rows). cond: raw synthesised read counts were used for DGE analysis of in vitro stimulation with C. albicans after 3 hours (3hCA) vs untreated (UT). batch+cond: same as cond, and included batch (V2 & V3 chemistry) as a covariate in the 3hCA-UT DGE model. Combat-seq cond: batch-corrected read counts from Combat-seq were used in DGE analysis of 3hCA vs UT. Each point is a simulation data. train and test samples + varing gene sets

Code

openxlsx::write.xlsx(
  list(
    "Fig1A" = sampdata,
    "Fig2" = pltfract_test,
    "Fig2.anno" = cordsn,
    "Fig3AB" = pedaccdsn_persub,
    "Fig3CD" = pedaccdsn,
    "Fig4" = mychk[scenario == "main",
      setdiff(names(mychk), c("scenarios", "scenario")),
      with = FALSE
    ],
    "Fig5" = simresmychk[
      scenario == "main" & sce == "sce1",
      .(approach, celltype, auc, trainper, pheno)
    ]
  ),
  file = "SourceData_mainFigures.xlsx",
  overwrite = TRUE
)

figS6 <- mychk[scenario == "common",
  setdiff(names(mychk), c("scenarios", "scenario")),
  with = FALSE
]

figS8 <- simresmychk[
  scenario == "common" & sce == "sce1",
  .(approach, celltype, auc, trainper, pheno)
]

2 Supplementary material

2.1 SupFig1, Overlap of predicted genes by approach

Overlap of predicted genes by CIBERSORTx using inbuilt (inbuilt) and our custom signatures (custom), bMIND, swCAM, LASSO and RIDGE. Predicted genes were defined as those with variations in expression across subjects. For each panel (cell type), the right bar plot indicates the numbers of predicted genes (No.Pred.Genes) by approach, and the top bar plot demonstrates No.Pred.Genes common in different combinations of approaches (black dots), but not in the grey-dot approaches, if present.

Code

col_pals <- cols4all::c4a("miscs.okabe", n = 4)

expr_scenarios <- cdata_explst$iexpr
stopifnot(names(expr_scenarios) == expr_order)
names(expr_scenarios) <- appro_lab

m_list <- lapply(seq_along(sctorder), function(idx) {
  ct <- sctorder[idx]
  Ocom <- lapply(expr_scenarios, function(dat) {
    rownames(dat[[ct]])
  })
  m1_mod <- make_comb_mat(Ocom, mode = "distinct")
  m1 <- m1_mod
  m1
})

names(m_list) <- sctorder

figS1 <- lapply(seq_along(sctorder), function(idx) {
  ct <- sctorder[idx]
  Ocom <- lapply(expr_scenarios, function(dat) {
    rownames(dat[[ct]])
  })
  max.n <- sapply(Ocom, length) %>% max()

  for (n in seq_along(Ocom)) {
    length(Ocom[[n]]) <- max.n
  }

  do.call(cbind, Ocom)
})

names(figS1) <- paste0("figS1.", sctorder)

sapply(m_list, comb_size)

m_list <- normalize_comb_mat(m_list)
sapply(m_list, comb_size)

max_set_size <- max(sapply(m_list, set_size))
max_comb_size <- max(sapply(m_list, comb_size))

ht_list <- NULL
for (i in seq_along(m_list)) {
  ht_list <- ht_list %v%
    UpSet(m_list[[i]],
      row_names_side = "left",
      row_title = paste0(names(m_list)[i]),
      row_title_gp = gpar(fontface = 2),
      row_title_rot = 0,
      set_order = names(expr_scenarios),
      comb_order = NULL,
      top_annotation = upset_top_annotation(m_list[[i]],
        ylim = c(0, max_comb_size),
        gp = gpar(fill = col_pals[2]),
        annotation_name_rot = 90,
        add_numbers = T, numbers_rot = 0
      ),
      right_annotation = upset_right_annotation(m_list[[i]],
        ylim = c(0, max_set_size),
        gp = gpar(fill = col_pals[3]),
        add_numbers = T
      )
    )
}

ht_list

2.2 SupFig2, Correlations per subject stratified by same and different subjects

Distributions of Pearson correlations (y-axis) between observed and imputed expression across genes from the same/different subjects by cell type and approach. One estimate per subject. inbuilt: CIBERSORTx with the inbuilt signature matrix; custom: CIBERSORTx with a custom signature matrix; bMIND: bMIND with flow fractions; swCAM: swCAM with flow fractions; LASSO/RIDGE: regularised multi-response Gaussian

Code

numCores <- length(expr_order)
cl <- makeCluster(numCores, type = "FORK")
registerDoParallel(cl)

pedaccdsn_rotpersub <- foreach(j = seq_along(expr_order), .combine = "rbind") %:%

  foreach(i = seq(sctorder), .combine = "rbind") %dopar% {
    expr_scenarios <- cdata_explst$iexpr
    observy <- cdata_explst$oexpr

    sce <- expr_order[j]
    ctin <- sctorder[i]

    expr_in <- expr_scenarios[[sce]][[ctin]]
    observ_in <- observy[rownames(expr_in), colnames(expr_in)]

    ## differnt subjects
    rot <- colnames(observ_in)[c(2:ncol(observ_in), 1)]

    ## correlation across genes per subject
    resdsn <- cor(expr_in[, rot], observ_in, method = "pearson") %>% diag()

    ## rmse across genes per subject
    mse1 <- apply((observ_in - expr_in[, rot])^2, 2, mean) %>% sqrt()

    stopifnot(names(resdsn) == names(mse1))

    ## another baseline as observed expression in different subjects
    abaseline <- cor(observ_in, observ_in[, rot]) %>% diag()

    dsn1 <- data.table(
      approach = sce, celltype = ctin,
      measure_cor = resdsn,
      measure_rmse = mse1,
      avecor = abaseline
    )
    dsn1
  }

stopCluster(cl)

pedaccdsn_rotpersub[, .N, by = c("approach", "celltype")]
    approach celltype  N
 1:  inbuilt      CD4 71
 2:  inbuilt      CD8 65
 3:  inbuilt     CD14 52
 4:  inbuilt     CD19 57
 5:   custom      CD4 71
 6:   custom      CD8 65
 7:   custom     CD14 52
 8:   custom     CD19 57
 9:    bMIND      CD4 71
10:    bMIND      CD8 65
11:    bMIND     CD14 52
12:    bMIND     CD19 57
13:    swCAM      CD4 71
14:    swCAM      CD8 65
15:    swCAM     CD14 52
16:    swCAM     CD19 57
17:    LASSO      CD4 71
18:    LASSO      CD8 65
19:    LASSO     CD14 52
20:    LASSO     CD19 57
21:    RIDGE      CD4 71
22:    RIDGE      CD8 65
23:    RIDGE     CD14 52
24:    RIDGE     CD19 57
    approach celltype  N

pedaccdsn_rotpersub[, ":="(
  approach = factor(approach, levels = expr_order),
  celltype = factor(celltype, levels = sctorder)
)]


ped_a <- pedaccdsn_rotpersub[, .(approach, celltype, measure_cor)][
  , sce := "diff"
]
ped_b <- pedaccdsn_rotpersub[, .(approach, celltype, avecor)][
  , sce := "ObsDiff"
]
setnames(ped_b, old = "avecor", new = "measure_cor")

pedaccdsn_persub[, sce := "same"]

list(pedaccdsn_persub, ped_a, ped_b) %>%
  rbindlist(., fill = TRUE) %>%
  .[, sce := factor(sce, levels = c("same", "diff", "ObsDiff"))] %>%
  ggboxplot(
    data = ., x = "celltype", y = "measure_cor",
    fill = "sce",
    facet.by = "approach",
    panel.labs = list(approach = appro_lab),
    ylab = "Correlation",
    palette = "jco",
    ggtheme = used_ggthemes(
      legend.position = "top",
      legend.text = element_text(size = rel(1.2))
    )
  ) +
  labs(fill = "Subject") +
  rremove("xlab")


figS2 <- rbindlist(list(pedaccdsn_persub, ped_a, ped_b), fill = TRUE) %>%
  .[, sce := factor(sce, levels = c("same", "diff", "ObsDiff"))]

figS2[, (c("measure_rmse")) := NULL]

2.3 SupFig3, Common gene sets-CLUSTER data

Code

## expression from approaches
cdata_explst <- prep_dat(
  rid = "cluster",
  res_dir = data_dir,
  cluster = TRUE
)

gcommon <- lapply(sctorder, function(ctin) {
  lapply(cdata_explst$icexpr, "[[", ctin) %>%
    lapply(., rownames) %>%
    Reduce(f = intersect, x = .)
})
names(gcommon) <- sctorder

sapply(gcommon, length)
 CD4  CD8 CD14 CD19 
3837 6274 5185 2689

2.3.1 correlation & rmse per subject

Code

### predicted accuracy

observy <- cdata_explst$oexpr
expr_scenarios <- cdata_explst$icexpr

numCores <- length(expr_order)
cl <- makeCluster(numCores, type = "FORK")
registerDoParallel(cl)

pedaccdsn_persub <- foreach(i = expr_order, .combine = "rbind") %dopar% {
  run_fig3v(
    obvexpr = observy, impvexpr = expr_scenarios,
    sce = i, ct = sctorder, persub = TRUE
  ) %>%
    rbindlist()
}

stopCluster(cl)


pedaccdsn_persub[, .N, by = c("approach", "celltype")]
    approach celltype  N
 1:  inbuilt      CD4 71
 2:  inbuilt      CD8 65
 3:  inbuilt     CD14 52
 4:  inbuilt     CD19 57
 5:   custom      CD4 71
 6:   custom      CD8 65
 7:   custom     CD14 52
 8:   custom     CD19 57
 9:    bMIND      CD4 71
10:    bMIND      CD8 65
11:    bMIND     CD14 52
12:    bMIND     CD19 57
13:    swCAM      CD4 71
14:    swCAM      CD8 65
15:    swCAM     CD14 52
16:    swCAM     CD19 57
17:    LASSO      CD4 71
18:    LASSO      CD8 65
19:    LASSO     CD14 52
20:    LASSO     CD19 57
21:    RIDGE      CD4 71
22:    RIDGE      CD8 65
23:    RIDGE     CD14 52
24:    RIDGE     CD19 57
    approach celltype  N

pedaccdsn_persub[, ":="(
  approach = factor(approach, levels = expr_order),
  celltype = factor(celltype, levels = sctorder)
)][, measure_rmse := log(measure_rmse)]


measureii <- c("Correlation per subject", "log(RMSE) per subject")
names(measureii) <- c("measure_cor", "measure_rmse")

lapply(seq_along(measureii), function(idx) {
  bplt <- ggboxplot(
    data = pedaccdsn_persub,
    x = "approach",
    xlab = "",
    y = names(measureii)[idx],
    ylab = measureii[idx],
    fill = "approach",
    palette = appro.cols,
    facet.by = c("celltype")
  ) +
    facet_grid(~celltype, scales = "free")

  if (idx == 1) {
    bplt <- bplt +
      used_ggthemes(
        axis.text.x = element_blank(),
        legend.position = "none"
      )
  } else {
    bplt <- bplt +
      used_ggthemes(
        axis.text.x = element_blank(),
        legend.position = "none",
        strip.text = element_blank()
      )
  }

  bplt
})
[[1]]

[[2]]

Distributions of prediction accuracy by cell type and approach. Prediction accuracy was evaluated by calculating Pearson correlation and root mean square error (RMSE) between observed and predicted cell-type expression across genes for the same subjects by cell type and approach. For each cell type, genes common across approaches were used. No. common genes are 3837 for CD4, 6274 for CD8, 5185 for CD14, and 2689 for CD19, respectively. The approaches used were: inbuilt - CIBERSORTx expression deconvolution with the inbuilt signature matrix, custom - CIBERSORTx expression deconvolution with a custom signature matrix derived from sorted cell-type expression in training samples, bMIND - bMIND expression deconvolution with flow fractions, swCAM - swCAM deconvolution with flow fractions, and LASSO/RIDGE - expression predicted from regularised multi-response Gaussian models.

2.3.2 correlation & rmse per gene

Code

numCores <- length(expr_order)
cl <- makeCluster(numCores, type = "FORK")
registerDoParallel(cl)

pedaccdsn <- foreach(i = expr_order, .combine = "rbind") %dopar% {
  run_fig3v(
    obvexpr = observy, impvexpr = expr_scenarios,
    sce = i, ct = sctorder, persub = FALSE
  ) %>%
    rbindlist()
}


stopCluster(cl)

pedaccdsn[, .N, by = c("approach", "celltype")]
    approach celltype    N
 1:  inbuilt      CD4 3837
 2:  inbuilt      CD8 6274
 3:  inbuilt     CD14 5185
 4:  inbuilt     CD19 2689
 5:   custom      CD4 3837
 6:   custom      CD8 6274
 7:   custom     CD14 5185
 8:   custom     CD19 2689
 9:    bMIND      CD4 3837
10:    bMIND      CD8 6274
11:    bMIND     CD14 5185
12:    bMIND     CD19 2689
13:    swCAM      CD4 3837
14:    swCAM      CD8 6274
15:    swCAM     CD14 5185
16:    swCAM     CD19 2689
17:    LASSO      CD4 3837
18:    LASSO      CD8 6274
19:    LASSO     CD14 5185
20:    LASSO     CD19 2689
21:    RIDGE      CD4 3837
22:    RIDGE      CD8 6274
23:    RIDGE     CD14 5185
24:    RIDGE     CD19 2689
    approach celltype    N

pedaccdsn[, ":="(
  approach = factor(approach, levels = expr_order),
  celltype = factor(celltype, levels = sctorder)
)][, measure_rmse := log(measure_rmse)]

Code

measureii <- c("Correlation per gene", "log(standardised RMSE) per gene")
names(measureii) <- c("measure_cor", "measure_rmse")

lapply(seq_along(measureii), function(idx) {
  aplt <- ggboxplot(
    data = pedaccdsn,
    x = "approach",
    xlab = "",
    y = names(measureii)[idx],
    ylab = measureii[idx],
    fill = "approach",
    palette = appro.cols,
    facet.by = c("celltype")
  ) +
    scale_fill_manual(labels = appro_lab, values = appro.cols) +
    facet_grid(~celltype, scales = "free")

  if (idx == 1) {
    aplt <- aplt + used_ggthemes(
      axis.text.x = element_blank(),
      legend.position = "none",
      strip.text = element_blank()
    )
  } else {
    aplt <- aplt + used_ggthemes(
      axis.text.x = element_blank(),
      legend.position = "top",
      strip.text = element_blank()
    )
  }


  aplt
})
[[1]]

[[2]]

Distributions of prediction accuracy by cell type and approach. Prediction accuracy was evaluated by calculating Pearson correlation and root mean square error (RMSE) between observed and predicted cell-type expression across testing samples for each gene by cell type and approach. For each cell type, genes common across approaches were used. No. common genes are 3837 for CD4, 6274 for CD8, 5185 for CD14, and 2689 for CD19, respectively. Standardised RMSEs: RMSE/average observed expression. The approaches used were: inbuilt - CIBERSORTx expression deconvolution with the inbuilt signature matrix, custom - CIBERSORTx expression deconvolution with a custom signature matrix derived from sorted cell-type expression in training samples, bMIND - bMIND expression deconvolution with flow fractions, swCAM - swCAM deconvolution with flow fractions, and LASSO/RIDGE - expression predicted from regularised multi-response Gaussian models.

Code

figS3ab <- pedaccdsn_persub
figS3cd <- pedaccdsn

2.4 SupFig4,5-DGE recovery

Comparisons of log\(_2\) fold changes in genes between the observed (x-axis) and imputed (y-axis) data by method (column) and by recovery status (row). DGE analysis was carried out using limma based on one of the simulated phenotypes. An FDR of 0.05 was used in both observed and imputed data here, and CD4 DGE recovery is shown. DGE results in each method (column) are the same, with coloured points for genes falling into that category of recovery status (row) and grey points for genes not belonging to the same category. Recovery status: No, Yes-NS, and Yes-Sig. Yes-Sig (sensitivity; Sen): differentially expressed genes in the observed data were also called significance in the imputed data, and the orientations of the effect sizes are the same in both data. Yes-NS (specificity; Spe): genes are called non-significance (NS) in both data. No (error; Err): misclassified genes; Err is calculated as the percentage of misclassified genes to the total number of predicted genes.

Code

degres <- copy(fig4res$dge_res) %>% setDT()
degres[, celltype := factor(celltype, levels = sctorder)]
degres[, approach := factor(approach, levels = expr_order)]

## FDR in true
fdr.thr <- 0.05

## use truecol and callmade
degres[, truecol := "0No"][adj.P.Val < fdr.thr, truecol := "1Yes"]
degres[, callmade := "0Nonsig"][
  impvadjp < fdr.thr & sign(t) == sign(impvt),
  callmade := "1Yes"
]
degres[, recovery := "No"][truecol == "1Yes" & callmade == "1Yes", recovery := "Yes-Sig"][
  truecol == "0No" & callmade == "0Nonsig", recovery := "Yes-NS"
]

## numerator
annotdsn <- degres[, .N, by = c("recovery", "fakedpheno", "approach", "celltype")]

annotdsn1 <- degres[truecol == "0No",
  {
    N_NS_obs <- .N
    list(N_NS_obs)
  },
  by = c("fakedpheno", "approach", "celltype")
][
  degres[truecol == "1Yes",
    {
      N_SIG_obs <- .N
      list(N_SIG_obs)
    },
    by = c("fakedpheno", "approach", "celltype")
  ], N_SIG_obs := i.N_SIG_obs,
  on = c("fakedpheno", "approach", "celltype")
][is.na(N_SIG_obs), N_SIG_obs := 0][
  annotdsn[recovery == "Yes-NS", ], N_recNS := i.N,
  on = c("fakedpheno", "approach", "celltype")
][
  annotdsn[recovery == "Yes-Sig", ], N_recSIG := i.N,
  on = c("fakedpheno", "approach", "celltype")
][is.na(N_recSIG), N_recSIG := 0]


tidelabfun <- function(a, b, group) {
  ## a1 <- paste0(group, ":", a, "/", b)
  a1 <- paste0(group, ":")
  if (b != 0) {
    c <- round(a / b, 2)
    ## a1 <- paste0(a1, " (", c, ")")
    a1 <- paste0(a1, c)
  }
  a1
}

annotdsn1[, ":="(
  tlab = paste0(
    tidelabfun(N_recSIG, N_SIG_obs, "Sig"), "\n",
    tidelabfun(N_recNS, N_NS_obs, "NS "), "\n",
    tidelabfun(N_recNS + N_recSIG, N_SIG_obs + N_NS_obs, "ALL")
  )

),
by = c("fakedpheno", "approach", "celltype")
]

mypaldeg <- cols4all::c4a("brewer.dark2", n = 3)
names(mypaldeg) <- c("Yes-NS", "Yes-Sig", "No")

annotdsn1[, ":="(
  tlab = paste0(
    tidelabfun(N_recSIG, N_SIG_obs, "Sen"), "\n",
    tidelabfun(N_recNS, N_NS_obs, "Spe")
  )
),
by = c("fakedpheno", "approach", "celltype")
]

mypaldeg <- cols4all::c4a("brewer.dark2", n = 3)
names(mypaldeg) <- c("Yes-NS", "Yes-Sig", "No")

## PC1 and CD4 subset
pltdsn <- degres[fakedpheno == "PC1" & celltype == "CD4", ]
pltdsn[, recovery := factor(recovery, levels = c("Yes-Sig", "Yes-NS", "No"))]

## summary of Sen & Spe, and plot lab
annodf1 <- pltdsn[, .N, by = c("approach", "recovery")]
annodf2 <- pltdsn[truecol == "1Yes", .N, by = c("approach")][, recovery := "Yes-Sig"]
annodf3 <- pltdsn[truecol == "0No", .N, by = c("approach")][, recovery := "Yes-NS"]
annodf4 <- pltdsn[, .N, by = c("approach")][, recovery := "No"]
annodf1[annodf2, Nt := i.N, on = c("approach", "recovery")]
annodf1[annodf3, Nt := i.N, on = c("approach", "recovery")]
annodf1[annodf4, Nt := i.N, on = c("approach", "recovery")]
annodf1[!is.na(Nt), pper := round(N / Nt, 2)]

annodf1[recovery == "Yes-Sig", tlab := paste0("Sen:", pper)]
annodf1[recovery == "Yes-NS", tlab := paste0("Spe:", pper)]
annodf1[recovery == "No", tlab := paste0("Err:", pper)]

kklab <- levels(pltdsn$recovery)
names(kklab) <- kklab

figS4 <- copy(pltdsn)
figS4.anno <- copy(annodf1)

ggplot(data = pltdsn, aes(x = logFC, y = impvlogFC)) +
  geom_point(data = pltdsn %>% dplyr::select(-recovery), color = "grey", alpha = 0.85) +
  geom_point(aes(fill = recovery), shape = 21) +
  geom_hline(yintercept = 0, linetype = 2, color = "black") +
  geom_vline(xintercept = 0, linetype = 2, color = "black") +
  geom_abline(
    intercept = 0, slope = 1, linetype = 4,
    color = ggsci::pal_lancet(alpha = 1)(8)[4]
  ) +
  ggrepel::geom_text_repel(
    data = annodf1,
    aes(
      x = -Inf, y = Inf,
      label = tlab
    ),
    hjust = 0, size = 5,
    min.segment.length = Inf,
    parse = FALSE
  ) +
  scale_fill_manual(values = mypaldeg) +
  facet_grid(recovery ~ approach,
    labeller = as_labeller(c(appro_lab, kklab))
  ) +
  xlab("Fold changes in observed data + simulated phenotype") +
  ylab("Fold changes in imputed data") +
  labs(fill = "Recovery") +
  used_ggthemes(
    legend.position = "top",
    axis.title = element_text(size = rel(1.2)),
    legend.text = element_text(size = rel(1.2)),
    legend.title = element_text(
      size = rel(1.2),
      face = "bold"
    ),
    strip.text.y.right = element_text(angle = 0)
  )

Receiver operating characteristic (ROC) curves and estimated area under curve (AUC) by cell type and approach based on one simulated phenotype. FDR was fixed at 0.05 in the observed data and varied from 0 to 1 by 0.05 in the imputed data.

Code

###################################################################################
## summarise the ROC curves by approach based on the first simulated phenotypes  ##
###################################################################################
## pc1 as example
pc_examp <- "PC1"

rocresall <- fig4res$rocresall

myaucs <- fig4res$aucres[fakedPCs == "PC1", ]
myaucs[, auc := round(auc, 3)]
myaucs[, name := paste(celltype, approach, sep = ".")]


appro.cols.MOD <- appro.cols[match(myaucs$approach, names(appro.cols))]
names(appro.cols.MOD) <- myaucs$name

myaucs[, ":="(
  celltype = factor(celltype, levels = sctorder),
  approach = factor(approach,
    levels = expr_order
  )
)]

myrun <- lapply(sctorder, function(ctin) {
  aucpltdsn <- lapply(rocresall[[pc_examp]][[ctin]], "[[", "rocres")
  names(aucpltdsn) <- paste0(ctin, ".", names(aucpltdsn))
  aucpltdsn
})

myrun <- unlist(myrun, recursive = FALSE)

library(pROC)
g.list <- ggroc(myrun, legacy.axes = TRUE)

kklab <- sctorder
names(kklab) <- sctorder

g.list$data$celltype <- tstrsplit(g.list$data$name, "\\.", keep = 1L) %>%
  unlist() %>%
  factor(., levels = sctorder)
g.list$data$approach <- tstrsplit(g.list$data$name, "\\.", keep = 2L) %>%
  unlist() %>%
  factor(., levels = expr_order)

g.list +
  scale_color_manual(values = appro.cols.MOD) +
  facet_grid(celltype ~ approach,
    labeller = as_labeller(c(appro_lab, kklab))
  ) +
  geom_text(
    data = myaucs,
    aes(0.75, 0.25,
      label = auc
    ),
    size = 6
  ) +
  geom_abline(
    slope = 1, intercept = 0, linetype = 2,
    colour = cols4all::c4a_na("okabe")
  ) +
  ylab("Sensitivity") +
  xlab("1-Specificity") +
  used_ggthemes(
    legend.position = "none",
    strip.text.y.right = element_text(angle = 0)
  )

figS5 <- g.list$data
figS5.anno <- myaucs[, .(approach, celltype, auc, name)]

2.5 SupFig6, Train + Test samples, CLUSTER data

2.5.1 Common gene sets

Code

ggboxplot(
  data = mychk[scenario == "common", ],
  x = "approach", y = "auc", ylab = "AUC",
  palette = appro.cols,
  alpha = 0.35, fill = "approach",
  add = c("jitter"),
  add.params = list(shape = 21, size = 2.5, alpha = 1)
) +
  scale_fill_manual(labels = appro_lab, values = appro.cols) +
  facet_grid(pheno ~ celltype, scales = "free") +
  labs(fill = NULL) +
  used_ggthemes(
    axis.text.x = element_blank(),
    legend.position = "top",
    strip.text.y.right = element_text(angle = 0),
    legend.text = element_text(size = rel(1))
  ) +
  rremove("xlab")

Differential gene expression (DGE) recovery. Genes common across approaches were used, and No. common genes are 3837 for CD4, 6274 for CD8, 5185 for CD14, and 2689 for CD19. Area under curve (AUC) distributions by cell type (columns) and approach for each scenario (rows). **dichPheno: simulated dichotomous phenotype; dichPheno+sex: simulated dichotomous phenotype and sex; contPheno: continous phenotype; contPheno+sex: continous phenotype and sex**. Each point is a simulated phenotype, and there are ten simulated phenotypes. For each simulated phenotype, the receiver operating characteristic curve and AUC were estimated by FDR fixed at 0.05 in the observed data and varied FDRs from 0 to 1 by 0.05 in the imputed data. Box plots showed the AUC distributions, with horizontal lines from the bottom to the top for 25 %, 50 % and 75 % quantiles, respectively. inbuilt: CIBERSORTx with the inbuilt signature matrix; custom: CIBERSORTx with a custom signature matrix; bMIND: bMIND with flow fractions; swCAM: swCAM with flow fractions; LASSO/RIDGE: regularised multi-response Gaussian models.

Code

tbl_out <- tbl_res[rows.order, c("celltype", grep("common", cols.order, value = TRUE))]
names(tbl_out) <- gsub("main_|common_", "", names(tbl_out))

kbl(
  x = tbl_out, row.names = FALSE,
  caption = "Median AUC (Q25-Q75) across 10 simulated phentoypes per cell by apparoch. **Genes common across approaches per cell type**"
) %>%
  kable_paper("striped", full_width = FALSE) %>%
  column_spec(column = 6:7, bold = T) %>%
  pack_rows("dichotomous pheno", 1, 4,
    label_row_css = "text-align: left;"
  ) %>%
  pack_rows("dichotomous pheno + sex", 5, 8) %>%
  pack_rows("continous pheno", 9, 12) %>%
  pack_rows("continous pheno + sex", 13, 16)

Median AUC (Q25-Q75) across 10 simulated phentoypes per cell by apparoch. **Genes common across approaches per cell type**
celltype	inbuilt	custom	bMIND	swCAM	LASSO	RIDGE
dichotomous pheno
CD4	0.73 (0.71-0.76)	0.79 (0.78-0.83)	0.76 (0.75-0.79)	0.72 (0.7-0.73)	0.85 (0.84-0.87)	0.86 (0.84-0.88)
CD8	0.64 (0.62-0.66)	0.75 (0.74-0.79)	0.77 (0.73-0.83)	0.73 (0.7-0.79)	0.84 (0.82-0.89)	0.84 (0.82-0.88)
CD14	0.68 (0.64-0.72)	0.78 (0.75-0.82)	0.73 (0.71-0.81)	0.71 (0.7-0.78)	0.87 (0.86-0.89)	0.88 (0.86-0.89)
CD19	0.63 (0.59-0.69)	0.77 (0.72-0.83)	0.83 (0.79-0.87)	0.75 (0.71-0.8)	0.86 (0.8-0.91)	0.85 (0.83-0.91)
dichotomous pheno + sex
CD4	0.72 (0.7-0.76)	0.79 (0.78-0.81)	0.77 (0.75-0.79)	0.72 (0.7-0.73)	0.84 (0.83-0.85)	0.83 (0.82-0.87)
CD8	0.64 (0.62-0.66)	0.75 (0.72-0.78)	0.79 (0.73-0.8)	0.74 (0.69-0.77)	0.84 (0.82-0.89)	0.85 (0.81-0.89)
CD14	0.68 (0.62-0.72)	0.77 (0.76-0.82)	0.74 (0.72-0.8)	0.73 (0.69-0.77)	0.88 (0.86-0.89)	0.89 (0.87-0.89)
CD19	0.63 (0.59-0.69)	0.75 (0.72-0.82)	0.83 (0.81-0.87)	0.74 (0.7-0.83)	0.86 (0.84-0.92)	0.88 (0.83-0.92)
continous pheno
CD4	0.73 (0.68-0.74)	0.77 (0.75-0.8)	0.75 (0.72-0.77)	0.7 (0.68-0.71)	0.83 (0.8-0.85)	0.83 (0.81-0.85)
CD8	0.66 (0.64-0.68)	0.74 (0.73-0.76)	0.75 (0.73-0.79)	0.72 (0.72-0.73)	0.83 (0.79-0.85)	0.85 (0.8-0.85)
CD14	0.7 (0.62-0.75)	0.78 (0.75-0.81)	0.76 (0.71-0.8)	0.74 (0.69-0.78)	0.84 (0.81-0.87)	0.86 (0.83-0.87)
CD19	0.64 (0.6-0.7)	0.73 (0.68-0.8)	0.8 (0.75-0.83)	0.75 (0.64-0.8)	0.78 (0.68-0.89)	0.79 (0.74-0.9)
continous pheno + sex
CD4	0.72 (0.65-0.75)	0.76 (0.74-0.8)	0.73 (0.71-0.78)	0.7 (0.7-0.72)	0.82 (0.79-0.83)	0.83 (0.8-0.83)
CD8	0.66 (0.64-0.68)	0.74 (0.73-0.76)	0.74 (0.73-0.78)	0.72 (0.67-0.73)	0.8 (0.79-0.85)	0.81 (0.79-0.86)
CD14	0.7 (0.6-0.74)	0.78 (0.73-0.8)	0.75 (0.71-0.8)	0.73 (0.69-0.76)	0.84 (0.8-0.86)	0.86 (0.81-0.87)
CD19	0.67 (0.59-0.72)	0.73 (0.7-0.78)	0.8 (0.76-0.83)	0.73 (0.72-0.8)	0.82 (0.71-0.9)	0.84 (0.75-0.91)

2.6 SupFig7, slopes

Distributions of sensitivity (y-axis) & specificity (y-axis) by cell type and approach. An FDR < 0.05 were used in both imputed and observed data. Each point is a simulated phenotype.

Code

## sen and spe across phenotypes
mychks <- copy(annotdsn1)
mychks[, Sens := N_recSIG / N_SIG_obs][
  , Spes := N_recNS / N_NS_obs
]
specresplt <- ggboxplot(
  data = mychks, x = "approach",
  y = "Spes", ylab = "Specificity",
  palette = appro.cols,
  alpha = 0.35, fill = "approach",
  add.params = list(shape = 21, size = 3, alpha = 0.8),
  add = c("jitter")
) +
  scale_fill_manual(labels = appro_lab, values = appro.cols) +
  facet_grid(~celltype, scales = "free") +
  used_ggthemes() + rremove("xlab") +
  theme(
    axis.text.x = element_blank(),
    legend.text = element_text(size = rel(1.2))
  ) +
  labs(fill = NULL)

sensresplt <- ggboxplot(
  data = mychks, x = "approach",
  y = "Sens", ylab = "Sensitivity",
  palette = appro.cols,
  alpha = 0.35, fill = "approach",
  add.params = list(shape = 21, size = 3, alpha = 0.8),
  add = c("jitter")
) +
  scale_fill_manual(labels = appro_lab, values = appro.cols) +
  facet_grid(~celltype, scales = "free") +
  used_ggthemes(
    axis.text.x = element_blank(),
    legend.text = element_text(size = rel(1.2))
  ) +
  rremove("xlab") + labs(fill = NULL)

ggarrange(sensresplt, specresplt, nrow = 2, common.legend = TRUE)

figS7a <- copy(mychks)

Effect sizes (logFC), standard errors (S.E.) of effect sizes, z scores of effect size between observed (x-axis) and imputed (y-axis) data based on CD4 with one simulated phenotype

Code

y <- copy(degres)
head(y)
       logFC  AveExpr        t      P.Value    adj.P.Val        B     s2post
1: 0.3554621 7.260532 8.525827 1.339332e-14 8.385557e-11 22.64075 0.06458024
2: 0.3090823 5.954265 8.394649 2.884120e-14 9.028738e-11 21.90352 0.05036503
3: 0.2993696 5.147788 8.049545 2.126368e-13 4.437730e-10 19.98378 0.05138763
4: 0.3058392 4.130806 7.938247 4.023035e-13 6.297056e-10 19.37123 0.05514713
5: 0.2602279 4.532937 7.783485 9.707277e-13 1.078018e-09 18.52517 0.04152844
6: 0.3030935 5.021438 7.772500 1.033079e-12 1.078018e-09 18.46538 0.05649601
   Nsample Ncase Ncon    impvt impvlogFC        impvp     impvadjp impvNsample
1:     151    66   85 5.287726 0.2487747 4.205486e-07 0.0002633055         151
2:     151    66   85 5.197207 0.2747704 6.374286e-07 0.0003325784         151
3:     151    66   85 4.384273 0.3168297 2.153044e-05 0.0018722509         151
4:     151    66   85 5.493173 0.3488366 1.609410e-07 0.0001439502         151
5:     151    66   85 4.416395 0.2156537 1.888024e-05 0.0017860222         151
6:     151    66   85 4.447224 0.3540465 1.663371e-05 0.0017506914         151
   impvNcase impvNcon approach celltype fakedpheno    gene truecol callmade
1:        66       85  inbuilt      CD4        PC1    CD44    1Yes     1Yes
2:        66       85  inbuilt      CD4        PC1 C1orf43    1Yes     1Yes
3:        66       85  inbuilt      CD4        PC1     VCP    1Yes     1Yes
4:        66       85  inbuilt      CD4        PC1   P2RX4    1Yes     1Yes
5:        66       85  inbuilt      CD4        PC1   NAA20    1Yes     1Yes
6:        66       85  inbuilt      CD4        PC1   PSME3    1Yes     1Yes
   recovery
1:  Yes-Sig
2:  Yes-Sig
3:  Yes-Sig
4:  Yes-Sig
5:  Yes-Sig
6:  Yes-Sig

xx <- y[fakedpheno == "PC1" & celltype == "CD4"]

library(ggplot2)
library(cowplot)
theme_set(theme_cowplot())
xx[, class := paste(truecol, callmade, sep = "/")]
xx[, se := logFC / t]
xx[, impvse := impvlogFC / impvt]
xsum <- xx[, .(n = .N, medse = median(se), medimpvse = median(impvse)), by = c("class", "approach")]
xsum[, p := n / sum(n), by = "approach"]
xsum[order(approach, class)]
           class approach    n      medse  medimpvse           p
 1:  0No/0Nonsig  inbuilt 4187 0.05508787 0.05873184 0.668743012
 2:     0No/1Yes  inbuilt   27 0.06178480 0.05758701 0.004312410
 3: 1Yes/0Nonsig  inbuilt 1372 0.05195210 0.05968275 0.219134324
 4:    1Yes/1Yes  inbuilt  675 0.05002479 0.05390566 0.107810254
 5:  0No/0Nonsig   custom 4294 0.05983496 0.05406578 0.593012015
 6:     0No/1Yes   custom  122 0.06114711 0.05215159 0.016848502
 7: 1Yes/0Nonsig   custom 1011 0.05544873 0.05477584 0.139621599
 8:    1Yes/1Yes   custom 1814 0.05195309 0.04837343 0.250517884
 9:  0No/0Nonsig    bMIND 9844 0.05656370 0.07044463 0.521646972
10:     0No/1Yes    bMIND  253 0.05715402 0.06521827 0.013406815
11: 1Yes/0Nonsig    bMIND 3340 0.05657640 0.07587389 0.176991150
12:    1Yes/1Yes    bMIND 5434 0.05345500 0.06754613 0.287955063
13:  0No/0Nonsig    swCAM 9940 0.05655989 0.08271920 0.526734142
14:     0No/1Yes    swCAM  157 0.05816015 0.07310080 0.008319644
15: 1Yes/0Nonsig    swCAM 4477 0.05538832 0.10034547 0.237242330
16:    1Yes/1Yes    swCAM 4297 0.05354609 0.09459480 0.227703884
17:  0No/0Nonsig    LASSO 9286 0.05643993 0.04478155 0.492077791
18:     0No/1Yes    LASSO  811 0.05817531 0.04490354 0.042975995
19: 1Yes/0Nonsig    LASSO 1443 0.05524053 0.04429984 0.076466536
20:    1Yes/1Yes    LASSO 7331 0.05416448 0.04387849 0.388479678
21:  0No/0Nonsig    RIDGE 9196 0.05632371 0.04528211 0.487308569
22:     0No/1Yes    RIDGE  901 0.05933682 0.04601117 0.047745218
23: 1Yes/0Nonsig    RIDGE 1240 0.05582991 0.04509011 0.065709289
24:    1Yes/1Yes    RIDGE 7534 0.05411614 0.04438538 0.399236924
           class approach    n      medse  medimpvse           p

Distributions of r-squared (Rsq, y-axis) of imputed log\(_{2}\) fold changes (FC) regression on observed effect sizes by approach.
Distributions of slopes (y-axis) of imputed log\(_{2}\) fold changes (FC) regression on observed effect sizes by approach. Each point is a simulated phenotype, coloured by cell type.

Code

library(magrittr)
calc_rsq <- function(x, y) {
  summ <- lm(y ~ x) %>% summary()
  summ$r.squared
}
calc_slope <- function(x, y) {
  cf <- lm(y ~ x - 1) %>% coef()
}


ysum <- y[, .(logFC_rsq = calc_rsq(logFC, impvlogFC), logFC_slope = calc_slope(logFC, impvlogFC)),
  by = c("approach", "fakedpheno", "celltype")
]

## colors for cell type
ctfrat.pal <- friend11[seq(length(sctorder))]

## plot b
ysum[, approach := factor(approach,
  levels = expr_order,
  labels = appro_lab
)]

ggboxplot(
  data = ysum, x = "approach",
  y = "logFC_rsq",
  palette = ctfrat.pal,
  ylab = "Rsq between logFC\nin imputed and observed data",
  add = "jitter",
  add.params = list(
    fill = "celltype",
    alpha = 0.95, shape = 21,
    size = 3
  ),
  ggtheme = used_ggthemes(
    legend.text = element_text(size = rel(1.2))
  )
) +
  labs(fill = NULL) + rremove("xlab")


## plot c

ggboxplot(
  data = ysum, x = "approach",
  y = "logFC_slope",
  ylab = "Slope of between logFC \nin imputed data compared to observed data",
  palette = ctfrat.pal,
  add = "jitter",
  add.params = list(fill = "celltype", alpha = 0.95, shape = 21, size = 3),
  ggtheme = used_ggthemes(legend.text = element_text(size = rel(1.2)))
) +
  labs(fill = NULL) +
  rremove("xlab")

figS7bc <- copy(ysum)

## mean logFC slope across simulated phentoypes by approach and cell type
ysumres <- ysum[,
  {
    xx <- round(mean(logFC_slope), 2)
  },
  by = c("approach", "celltype")
]

## mean logFC slop across cell types by approach
ysumres[,
  {
    xxmin <- range(V1)[1]
    xxmax <- range(V1)[2]
    list(xxmin, xxmax)
  },
  by = "approach"
]
       approach xxmin xxmax
1: CIBX-inbuilt  0.60  0.70
2:  CIBX-custom  0.64  0.76
3:        bMIND  0.66  0.85
4:        swCAM  0.55  0.90
5:        LASSO  0.69  0.76
6:        ridge  0.68  0.76

2.7 SupFig8, AUCs in pseudo datasets based on common gene sets

Code

simres_pltlist$common$sce1

2.8 SupFig9, SupTab2, SupTab3

Code

## cluster data
proj_dir <- here::here()
proj_dir
[1] "/rds/project/rds-csoP2nj6Y6Y/wyl37/Rbookdown/sceQTLGen"

aa_files <- list.files(file.path(proj_dir, "CLUSTERdeconRes/workflowInfo"),
  "pipeline_trace*",
  full.names = TRUE
)

aadata <- lapply(aa_files, fread) %>% set_names(., basename(aa_files))

lapply(aadata, nrow)
$`pipeline_trace_2024-10-12_23-09-13.txt`
[1] 2

$`pipeline_trace_2024-10-12_23-34-29.txt`
[1] 622

$`pipeline_trace_2024-10-16_18-31-30.txt`
[1] 13

$`pipeline_trace_2024-10-16_18-39-21.txt`
[1] 5

## pipeline_trace_2024-10-12_23-34-29.txt: full run, but CIBX ran on 4 cpus
## pipeline_trace_2024-10-16_18-39-21.txt; rerun CIBX with 8 cpus

nextflowlog <- aadata[["pipeline_trace_2024-10-12_23-34-29.txt"]][!(name %in% aadata[["pipeline_trace_2024-10-16_18-39-21.txt"]]$name), ] %>%
  rbind(aadata[["pipeline_trace_2024-10-16_18-39-21.txt"]], .)

nextflowlog[, .N] # 622
[1] 622
nextflowlog <- nextflowlog[status != "FAILED", ]
nextflowlog[, .N] # 622
[1] 622

## check if realtime ==0
stopifnot(all(vtimesec(nextflowlog$realtime) != 0))

## memory units in the log
stringr::str_extract_all(nextflowlog$vmem, "[[:alpha:]]+$") |>
  unique() |>
  unlist()
[1] "MB" "GB"

## check if memory format not captured by default
nextflowlog[!stringr::str_detect(nextflowlog$vmem, "[[:alpha:]]+$")]
Empty data.table (0 rows and 17 cols): task_id,name,tag,status,exit,realtime...

stopifnot(all(!is.na(vmem.usage(nextflowlog$vmem))))

## unique process
uni_procs <- stringr::str_extract(nextflowlog$name, "[[:alnum:]]+") |> unique()
uni_procs
[1] "ZENODO"       "PREPCLUSTER"  "cibxrun"      "runSWCAM"     "runBMIND"    
[6] "runPenalised"

nextflowlog[grepl(paste0(uni_procs[1:2], collapse = "|"), name), .N] # 2
[1] 2

## exclude data prep steps
nextflowlog <- nextflowlog[!(tag %in% c("zenodo", "prepcluster")), ]
nextflowlog$trainper <- 100
nextflowlog$simdataid <- "cluster"


nextflowlog[grep("^cibxrun:LM22DECONVRUN", name), approach := "inbuilt"]
nextflowlog[grep("^cibxrun:DECONVRUN[[:space:]]", name), approach := "custom"]
nextflowlog[grep("^runBMIND:BMIND[[:space:]]", name), approach := "bMIND"]
nextflowlog[grep("^runSWCAM:SWCAM", name), approach := "swCAM"]
nextflowlog[grep("^runPenalised:PREPCHUNK", name), approach := "chunkprep"]
nextflowlog[grepl("^runPenalised", name) & grepl("lasso", tag), approach := "LASSO"]
nextflowlog[grepl("^runPenalised", name) & grepl("ridge", tag), approach := "RIDGE"]

nextflowlog[, table(approach)]
approach
    bMIND chunkprep    custom   inbuilt     LASSO     RIDGE     swCAM 
        1         4         1         1       304       304         2 

## na for fraction deconvolution or gene signature
nextflowlog[is.na(approach), name]
[1] "cibxrun:DECONVRUNGS (CIBXgs_cluster)"            
[2] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_cluster)"
[3] "runBMIND:BMINDFRACT (bmindrunfract_cluster)"     

expr_order %in% nextflowlog$approach
[1] TRUE TRUE TRUE TRUE TRUE TRUE

rundata4plt <- lapply(expr_order, function(appin) {
  nextflowlog[approach == appin, ][,
    {
      ncpu <- unique(cpus) %>% paste0(., collapse = "/")
      cputimes <- sum(vtimesec(realtime) / 60) %>% round(., 2)
      memt <- median(vmem.usage(vmem) / 1024) %>% round(., 2)
      list(ncpu, cputimes, memt)
    },
    by = c("simdataid", "trainper")
  ]
}) %>%
  set_names(., expr_order) %>%
  rbindlist(., idcol = "approach")

## per chunk cpu time & memory for LASSO and ridge

expr_order[5:6]
[1] "LASSO" "RIDGE"

cluster_perchunk <- lapply(expr_order[5:6], function(appin) {
  nextflowlog[approach == appin, ][,
    {
      ncpu <- unique(cpus) %>% paste0(., collapse = "/")
      cputimes <- vtimesec(realtime) / 60
      memt <- vmem.usage(vmem) / 1024
      nchunk <- .N

      Q50c <- median(cputimes) %>% round(., 2)

      Q2575c <- quantile(cputimes, c(0.25, 0.75)) %>%
        round(., 2) %>%
        paste0(., collapse = ",")

      Q50m <- median(memt) %>% round(., 2)

      Q2575m <- quantile(memt, c(0.25, 0.75)) %>%
        round(., 2) %>%
        paste0(., collapse = ",")

      list(ncpu,
        nchunk,
        cputimes_pchunk = paste0(Q50c, " (", Q2575c, ")"),
        memt_pchunk = paste0(Q50m, " (", Q2575m, ")")
      )
    },
    by = c("simdataid", "trainper")
  ]
}) %>%
  set_names(., expr_order[5:6]) %>%
  rbindlist(., idcol = "approach")

rundata4plt[cluster_perchunk, Nchunks := i.nchunk, on = "approach"]
rundata4plt[cluster_perchunk, ":="(
  cpu_perchunk = i.cputimes_pchunk,
  mem_perchunk = i.memt_pchunk
), on = "approach"]

rundata4plt[, trainper := as.numeric(trainper)]

rundata4plt[, approach := appro_lab[match(rundata4plt$approach, names(appro_lab))]]

rundata4plt[, approach := factor(approach, appro_lab)]

appro.cols.mod <- appro.cols
names(appro.cols.mod) <- appro_lab

##

## cpu time
cluster_cputime <- rundata4plt[, tstrsplit(cputimes, " ", keep = 1L) %>% unlist() %>% as.numeric()]
cluster_cputime
[1]    8.80    4.45   12.00 1713.13  624.22 2928.45
## relative to CIBX
(cluster_cputime / cluster_cputime[2]) %>% round(., 2)
[1]   1.98   1.00   2.70 384.97 140.27 658.08

## mem
cluster_mem <- rundata4plt[, tstrsplit(memt, " ", keep = 1L) %>% unlist() %>% as.numeric()]
cluster_mem
[1] 11.00  5.50  1.80  2.95 16.70 58.50
## relative to CIBX
(cluster_mem / cluster_mem[2]) %>% round(., 2)
[1]  2.00  1.00  0.33  0.54  3.04 10.64

Code

setdiff(names(rundata4plt), c("simdataid", "trainper")) %>%
  rundata4plt[, ., with = FALSE] %>%
  kable(., caption = "resource usage metrics for the CLUSTER data")

resource usage metrics for the CLUSTER data
approach	ncpu	cputimes	memt	Nchunks	cpu_perchunk	mem_perchunk
CIBX-inbuilt	8	8.80	11.00	NA	NA	NA
CIBX-custom	8	4.45	5.50	NA	NA	NA
bMIND	4	12.00	1.80	NA	NA	NA
swCAM	10/1	1713.13	2.95	NA	NA	NA
LASSO	1	624.22	16.70	304	1.84 (1.23,2.73)	16.7 (11.7,24.8)
ridge	1	2928.45	58.50	304	8.85 (6.09,12.78)	58.5 (39.18,81.22)

2.9 resource usage metrics for simdata

3 simulation data (N=160 samples) x Combat-seq (Yes/No) x training percentage (25, 50, 75, 100)

Code

## simData
proj_dir <- here::here()
proj_dir
[1] "/rds/project/rds-csoP2nj6Y6Y/wyl37/Rbookdown/sceQTLGen"

## runlog
nextflowlog <- file.path(
  proj_dir,
  "simDataRes/workflowInfo/pipeline_trace_2024-10-12_22-15-21.txt"
) %>% fread()

nextflowlog[, .N] # 11073
[1] 11073
nextflowlog <- nextflowlog[status != "FAILED", ]
nextflowlog[, .N] # 11073
[1] 11073

## check if realtime ==0
stopifnot(all(vtimesec(nextflowlog$realtime) != 0))

## memory units in the log
stringr::str_extract_all(nextflowlog$vmem, "[[:alpha:]]+$") |>
  unique() |>
  unlist()
[1] "GB" "MB"

## check if memory format not captured by default
nextflowlog[!stringr::str_detect(nextflowlog$vmem, "[[:alpha:]]+$")]
Empty data.table (0 rows and 17 cols): task_id,name,tag,status,exit,realtime...

stopifnot(all(!is.na(vmem.usage(nextflowlog$vmem))))

## unique process
uni_procs <- stringr::str_extract(nextflowlog$name, "[[:alnum:]]+") |> unique()
uni_procs
[1] "PREPSCEQTL"     "CBATSIMDATA"    "SIMDATA"        "SIMDECONVINPUT"
[5] "runSWCAM"       "cibxrun"        "runBMIND"       "runPenalised"  

nextflowlog[grepl(paste0(uni_procs[1:4], collapse = "|"), name), .N] # 31
[1] 31
## tag="-" for data prep steps
nextflowlog[tag == "-", name] %>% length()
[1] 31

## exclude data prep steps
nextflowlog <- nextflowlog[tag != "-", ]
nextflowlog$trainper <- str_extract(nextflowlog$tag, "[[:digit:]]+$")
nextflowlog$simdataid <- str_extract(nextflowlog$tag, "1Msim.+$")
stopifnot(all(!is.na(nextflowlog$simdataid)))

nextflowlog[grep("^cibxrun:LM22DECONVRUN", name), approach := "inbuilt"]
nextflowlog[grep("^cibxrun:DECONVRUN[[:space:]]", name), approach := "custom"]
nextflowlog[grep("^runBMIND:BMIND[[:space:]]", name), approach := "bMIND"]
nextflowlog[grep("^runSWCAM:SWCAM", name), approach := "swCAM"]
nextflowlog[grep("^runPenalised:PREPCHUNK", name), approach := "chunkprep"]
nextflowlog[grepl("^runPenalised", name) & grepl("lasso", tag), approach := "LASSO"]
nextflowlog[grepl("^runPenalised", name) & grepl("ridge", tag), approach := "RIDGE"]

nextflowlog[, table(approach)]
approach
    bMIND chunkprep    custom   inbuilt     LASSO     RIDGE     swCAM 
       24        96        24        24      5377      5377        48 

## na for fraction deconvolution or gene signature
nextflowlog[is.na(approach), name]
 [1] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_Combatdat.160.2.50)" 
 [2] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_dat.160.1.75)"       
 [3] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_Combatdat.160.3.100)"
 [4] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_Combatdat.160.1.75)" 
 [5] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_Combatdat.160.3.50)"             
 [6] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_dat.160.2.75)"       
 [7] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_Combatdat.160.2.100)"
 [8] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_dat.160.2.25)"       
 [9] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_Combatdat.160.3.100)"            
[10] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_dat.160.3.25)"       
[11] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_dat.160.2.50)"       
[12] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_dat.160.1.25)"       
[13] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_Combatdat.160.2.50)"             
[14] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_Combatdat.160.1.50)" 
[15] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_Combatdat.160.3.25)" 
[16] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_Combatdat.160.1.25)" 
[17] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_Combatdat.160.3.50)" 
[18] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_Combatdat.160.2.25)" 
[19] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_dat.160.3.75)"       
[20] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_dat.160.3.100)"      
[21] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_dat.160.1.100)"      
[22] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_Combatdat.160.3.50)"      
[23] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_Combatdat.160.1.100)"
[24] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_Combatdat.160.2.75)" 
[25] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_Combatdat.160.2.50)"      
[26] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_Combatdat.160.1.100)"            
[27] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_dat.160.2.25)"                   
[28] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_dat.160.3.50)"       
[29] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_dat.160.1.75)"                   
[30] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_dat.160.1.50)"       
[31] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_dat.160.3.25)"                   
[32] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_Combatdat.160.2.100)"            
[33] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_Combatdat.160.3.100)"     
[34] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_dat.160.2.100)"      
[35] "runSWCAM:DEBCAMFRACTS80 (debCAMfracts80_1Msim_Combatdat.160.3.75)" 
[36] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_dat.160.2.75)"                   
[37] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_Combatdat.160.1.100)"     
[38] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_Combatdat.160.1.25)"             
[39] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_Combatdat.160.2.25)"             
[40] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_Combatdat.160.3.25)"             
[41] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_dat.160.2.50)"                   
[42] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_dat.160.1.75)"            
[43] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_Combatdat.160.2.100)"     
[44] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_dat.160.3.100)"                  
[45] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_Combatdat.160.1.25)"      
[46] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_dat.160.3.25)"            
[47] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_dat.160.2.75)"            
[48] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_dat.160.2.25)"            
[49] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_Combatdat.160.3.25)"      
[50] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_dat.160.2.50)"            
[51] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_dat.160.3.50)"                   
[52] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_dat.160.1.50)"                   
[53] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_dat.160.1.25)"                   
[54] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_Combatdat.160.2.75)"             
[55] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_Combatdat.160.1.75)"             
[56] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_Combatdat.160.2.25)"      
[57] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_Combatdat.160.1.50)"             
[58] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_dat.160.3.100)"           
[59] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_dat.160.3.50)"            
[60] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_Combatdat.160.1.75)"      
[61] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_Combatdat.160.2.75)"      
[62] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_Combatdat.160.1.50)"      
[63] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_dat.160.2.100)"                  
[64] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_dat.160.1.50)"            
[65] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_dat.160.2.100)"           
[66] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_dat.160.1.100)"                  
[67] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_dat.160.1.100)"           
[68] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_dat.160.3.75)"                   
[69] "cibxrun:DECONVRUNGS (CIBXgs_1Msim_Combatdat.160.3.75)"             
[70] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_dat.160.3.75)"            
[71] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_dat.160.1.25)"            
[72] "runBMIND:BMINDFRACT (bmindrunfract_1Msim_Combatdat.160.3.75)"      

expr_order %in% nextflowlog$approach
[1] TRUE TRUE TRUE TRUE TRUE TRUE


rundata4plt <- lapply(expr_order, function(appin) {
  nextflowlog[approach == appin, ][,
    {
      ncpu <- unique(cpus) %>% paste0(., collapse = "/")
      cputimes <- sum(vtimesec(realtime) / 60) %>% round(., 2)
      memt <- median(vmem.usage(vmem) / 1024) %>% round(., 2)
      list(ncpu, cputimes, memt)
    },
    by = c("simdataid", "trainper")
  ]
}) %>%
  set_names(., expr_order) %>%
  rbindlist(., idcol = "approach")

rundata4plt[, trainper := as.numeric(trainper)]

rundata4plt[, approach := appro_lab[match(rundata4plt$approach, names(appro_lab))]]

rundata4plt[, approach := factor(approach, appro_lab)]

appro.cols.mod <- appro.cols
names(appro.cols.mod) <- appro_lab

Code

figS9 <- rundata4plt

re_ws <- c("cputimes", "memt")

lapply(re_ws, function(resin) {
  yyl <- ifelse(resin == "cputimes",
    "log(CPU realtime in mins)",
    "log(memory GB usage)"
  )

  ggplot(
    data = rundata4plt,
    aes(
      x = trainper, y = log(!!sym(resin)),
      colour = approach
    )
  ) +
    geom_point(alpha = 0.5, size = 1.5) +
    geom_smooth(
      aes(
        group = approach,
        colour = approach,
        linetype = approach,
        fill = approach
      ),
      alpha = 0.35, se = FALSE, linewidth = 0.5
    ) +
    scale_colour_manual(values = appro.cols.mod) +
    scale_fill_manual(values = appro.cols.mod) +
    scale_x_continuous(
      breaks = seq(25, 100, by = 25),
      labels = scales::label_percent(scale = 1)
    ) +
    ylab(yyl) +
    xlab("Percentage of 80 train samples") +
    used_ggthemes(
      legend.position = "top"
    )
}) %>%
  ggarrange(
    plotlist = ., common.legend = TRUE,
    labels = LETTERS
  )

Code

## per chunk cpu time & memory for LASSO and ridge
## across 24 simulation data

expr_order[5:6]
[1] "LASSO" "RIDGE"

cluster_perchunk <- lapply(expr_order[5:6], function(appin) {
  nextflowlog[approach == appin, ][
    , {
      ncpu <- unique(cpus) %>% paste0(., collapse = "/")
      cputimes <- vtimesec(realtime) / 60
      memt <- vmem.usage(vmem) / 1024
      nchunk <- .N

      Q50c <- median(cputimes) %>% round(., 2)

      Q2575c <- quantile(cputimes, c(0.25, 0.75)) %>%
        round(., 2) %>%
        paste0(., collapse = ",")

      Q50m <- median(memt) %>% round(., 2)

      Q2575m <- quantile(memt, c(0.25, 0.75)) %>%
        round(., 2) %>%
        paste0(., collapse = ",")

      list(ncpu,
        nchunk,
        cputimes_pchunk = paste0(Q50c, " (", Q2575c, ")"),
        memt_pchunk = paste0(Q50m, " (", Q2575m, ")")
      )
    } # , by = c("simdataid", "trainper")
  ]
}) %>%
  set_names(., expr_order[5:6]) %>%
  rbindlist(., idcol = "approach")


cluster_perchunk[, approach := appro_lab[match(cluster_perchunk$approach, names(appro_lab))]]

cluster_perchunk[, approach := factor(approach, appro_lab)]


rundata_sim <- rundata4plt[,
  {
    ncpus <- unique(ncpu)
    Nsimdata <- .N
    Q50c <- median(cputimes) %>% round(., 2)
    Q2575c <- quantile(cputimes, c(0.25, 0.75)) %>%
      round(., 2) %>%
      paste0(., collapse = ",")

    Q50m <- median(memt) %>% round(., 2)
    Q2575m <- quantile(memt, c(0.25, 0.75)) %>%
      round(., 2) %>%
      paste0(., collapse = ",")

    list(ncpus,
      Ndata = Nsimdata,
      CPUmins = paste0(Q50c, " (", Q2575c, ")"),
      memGB = paste0(Q50m, " (", Q2575m, ")")
    )
  },
  by = "approach"
]


rundata_sim[cluster_perchunk, Nchunks := i.nchunk, on = "approach"]
rundata_sim[cluster_perchunk, ":="(
  cpu_perchunk = i.cputimes_pchunk,
  mem_perchunk = i.memt_pchunk
), on = "approach"]


## cpu time
sim_cputime <- rundata_sim[, tstrsplit(CPUmins, " ", keep = 1L) %>% unlist() %>% as.numeric()]
sim_cputime
[1]    7.22    3.08    7.61 1103.68  592.13 1103.60
## relative to CIBX
(sim_cputime / sim_cputime[2]) %>% round(., 2)
[1]   2.34   1.00   2.47 358.34 192.25 358.31

## mem
sim_mem <- rundata_sim[, tstrsplit(memGB, " ", keep = 1L) %>% unlist() %>% as.numeric()]
sim_mem
[1]  6.45  6.00  1.30  1.92 10.40 33.60
## relative to CIBX
(sim_mem / sim_mem[2]) %>% round(., 2)
[1] 1.07 1.00 0.22 0.32 1.73 5.60

kable(rundata_sim, caption = "resource usage metrics for the simulation data")

resource usage metrics for the simulation data
approach	ncpus	Ndata	CPUmins	memGB	Nchunks	cpu_perchunk	mem_perchunk
CIBX-inbuilt	4	24	7.22 (5.64,8.12)	6.45 (6.25,6.6)	NA	NA	NA
CIBX-custom	4	24	3.08 (2.71,4.08)	6 (4.47,6.53)	NA	NA	NA
bMIND	4	24	7.61 (6.33,8.18)	1.3 (1.3,1.3)	NA	NA	NA
swCAM	10/1	24	1103.68 (799.36,1256.04)	1.92 (1.77,2.07)	NA	NA	NA
LASSO	1	24	592.13 (391.41,862.2)	10.4 (9.46,10.94)	5377	1.88 (0.54,3.85)	10.2 (4.4,15.9)
ridge	1	24	1103.6 (1017.01,1199.88)	33.6 (32.83,35.35)	5377	4.62 (2.1,7.25)	33.1 (15.3,52.2)

2.10 SupFig10, Inbuilt and custom signature Genes, and debCAM cell type-specific genes

Code

lm22mtx_f <- file.path("CIBXres/LM22deconv", runid) %>%
  file.path(data_dir, .) %>%
  file.path(., "sigMTX4dev.txt")

cumtx_f <- file.path("CIBXres/GS", runid) %>%
  file.path(data_dir, .) %>%
  file.path(., "CIBERSORTx_refexpr4CIBXgsClass.CIBERSORTx_refexpr4CIBXgs.bm.K999.txt")
## CIBX-inbuilt and custom signature genes
sigmtx_files <- c(lm22mtx_f, cumtx_f)

## signature matrcies, inbuilt and custom,
sigmtxs <- lapply(sigmtx_files, function(filein) {
  datin <- fread(filein)
  pltmat <- datin[, -1] %>% as.matrix()
  rownames(pltmat) <- datin[[1]]

  if (ncol(pltmat) != 4) {
    ourorder <- lm22merged
    names(ourorder) <- colnames(pltmat)
    othercls <- setdiff(unique(ourorder), sctorder)
    ourorder <- sort(factor(ourorder, levels = c(sctorder, othercls)))
    pltmat <- pltmat[, names(ourorder)]
  } else {
    pltmat <- pltmat[, sctorder]
    ourorder <- NULL
  }


  min_max <- pheatmap:::scale_mat(pltmat, "row") %>% range()

  col_fun <- circlize::colorRamp2(
    seq(from = min_max[1], to = min_max[2], length.out = 11),
    cols4all::c4a("brewer.rd_bu", n = 11, reverse = TRUE)
  )

  list(
    pltmat = pltmat,
    col_fun = col_fun,
    matorder = ourorder
  )
})

names(sigmtxs) <- c("inbuilt", "custom")

figS10ab <- lapply(sigmtxs, "[[", "pltmat")
names(figS10ab) <- paste0("figS10", letters[seq_along(figS10ab)])


## Lm22 merged text
ourorder <- sigmtxs$inbuilt$matorder
descri.text <- NULL
for (cttype in unique(ourorder)) {
  descri.text <- c(
    descri.text,
    sprintf(
      "%s: %s",
      cttype,
      names(ourorder)[ourorder == cttype] %>% combine_words()
    )
  )
}

descri.text
 [1] "CD4: T cells CD4 naive, T cells CD4 memory resting, T cells CD4 memory activated, T cells follicular helper, and T cells regulatory (Tregs)"
 [2] "CD8: T cells CD8"                                                                                                                           
 [3] "CD14: Monocytes, Macrophages M0, Macrophages M1, and Macrophages M2"                                                                        
 [4] "CD19: B cells naive and B cells memory"                                                                                                     
 [5] "PCs: Plasma cells"                                                                                                                          
 [6] "GammaT: T cells gamma delta"                                                                                                                
 [7] "NK: NK cells resting and NK cells activated"                                                                                                
 [8] "Dendritic: Dendritic cells resting and Dendritic cells activated"                                                                           
 [9] "Mast: Mast cells resting and Mast cells activated"                                                                                          
[10] "Eos: Eosinophils"                                                                                                                           
[11] "PMN: Neutrophils"                                                                                                                           

gpar(fontface = "bold")
$font
bold 
   2 
## https://github.com/wilkelab/gridtext/issues/24
update_gpar <- function(gp, gp_new) {
  names_new <- names(gp_new)
  names_old <- names(gp)
  gp[c(intersect(names_old, names_new), "fontface")] <- NULL
  gp_new["fontface"] <- NULL
  do.call(gpar, c(gp, gp_new))
}
gp <- gpar(fontface = "bold")
gpmod <- update_gpar(gp, gpar(fontsize = 10))


## heatmaps for inbuilt and custom
ht_sigmtx <- lapply(names(sigmtxs), function(nin) {
  datmtx <- sigmtxs[[nin]]$pltmat %>%
    pheatmap:::scale_mat(., "row")
  datcol <- sigmtxs[[nin]]$col_fun
  datsplit <- sigmtxs[[nin]]$matorder

  Heatmap(
    matrix = datmtx, col = datcol,
    cluster_columns = FALSE,
    show_row_names = FALSE,
    column_split = datsplit,
    column_gap = unit(1, "mm"),
    ## column_title_gp = gpar(fontface = "bold",
    ##                        fontsize=10),
    column_title_gp = gpmod,
    name = paste0(nin, "\nExpr")
  )
})

names(ht_sigmtx) <- names(sigmtxs)

Code

## observed expr
observ_expr <- file.path(
  data_dir,
  "CLUSTERData/ObservTPMExpr.rds"
) %>%
  readRDS()

########################################################
## sig genes in observ_expr, including debCAM markers ##
########################################################

## cell type train and testing samples
sampin4pca <- sampdata[celltype != "PBMC" & samtype == "nonrefsamp", sample]
## sampin4pca <- sampdata[celltype!="PBMC", sample]

pca1_4res <- list(
  inbuilt = rownames(sigmtxs$inbuilt$pltmat),
  custom = rownames(sigmtxs$custom$pltmat),
  debCAM = unlist(debCAMresFract$OVE.FC_10$markers, use.names = FALSE)
) %>% lapply(., function(sgin) {
  ## some LM22 genes are not found in observ_expr
  gin <- intersect(sgin, rownames(observ_expr))

  ## pca of signature genes in our observed expr
  mat <- log2(observ_expr[gin, sampin4pca] + 1)
  mat %>% dim()

  pca_res <- prcomp(t(mat))

  pcs <- pca_res$x[, seq(1, 4)] %>% as.data.frame()
  pcs$celltype <- sampdata$celltype[match(rownames(pcs), sampdata$sample)]
  pcs$celltype <- factor(pcs$celltype, levels = sctorder)
  pcs$Batch <- sampdata$batch[match(rownames(pcs), sampdata$sample)]
  pcs$Batch <- factor(pcs$Batch)

  pcaplts <- lapply(1:2, function(x) {
    x1 <- paste0("PC", 2 * x - 1)
    y1 <- paste0("PC", 2 * x)
    ggscatter(pcs,
      x = x1, y = y1,
      fill = "celltype",
      shape = "Batch",
      alpha = 0.9,
      ellipse = TRUE,
      ellipse.type = "convex",
      ellipse.alpha = 0.3,
      ellipse.border.remove = TRUE,
      palette = ctfrat.pal
    ) +
      scale_shape_manual(
        values = c(
          "1" = 21,
          "2" = 22,
          "3" = 23,
          "4" = 24
        ),
        limits = force
      ) +

      guides(fill = guide_legend(override.aes = list(shape = 21))) +

      used_ggthemes()
  }) %>% ggarrange(plotlist = ., common.legend = TRUE)

  list(
    gin = gin,
    pcaplts = pcaplts
  )
})


figS10def <- list(
  inbuilt = rownames(sigmtxs$inbuilt$pltmat),
  custom = rownames(sigmtxs$custom$pltmat),
  debCAM = unlist(debCAMresFract$OVE.FC_10$markers, use.names = FALSE)
) %>% lapply(., function(sgin) {
  ## some LM22 genes are not found in observ_expr
  gin <- intersect(sgin, rownames(observ_expr))

  ## pca of signature genes in our observed expr
  mat <- log2(observ_expr[gin, sampin4pca] + 1)
  mat %>% dim()

  pca_res <- prcomp(t(mat))

  pcs <- pca_res$x[, seq(1, 4)] %>% as.data.frame()
  pcs$celltype <- sampdata$celltype[match(rownames(pcs), sampdata$sample)]
  pcs$celltype <- factor(pcs$celltype, levels = sctorder)
  pcs$Batch <- sampdata$batch[match(rownames(pcs), sampdata$sample)]
  pcs$Batch <- factor(pcs$Batch)
  pcs
})


names(figS10def) <- paste0("figS10", c("d", "e", "f"))

Expression of inbuilt signature genes (N=547) in 22 leukocyte subsets, curated by the CIBERSORTx team from microarray gene expression. For each gene (row), expression is centred to the mean and scaled by the standard deviation of cell types (column). Columns are split by the collapsed classes. CD4: T cells CD4 naive, T cells CD4 memory resting, T cells CD4 memory activated, T cells follicular helper, and T cells regulatory (Tregs); CD8: T cells CD8; CD14: Monocytes, Macrophages M0, Macrophages M1, and Macrophages M2; CD19: B cells naive and B cells memory; PCs: Plasma cells; GammaT: T cells gamma delta; NK: NK cells resting and NK cells activated; Dendritic: Dendritic cells resting and Dendritic cells activated; Mast: Mast cells resting and Mast cells activated; Eos: EosinophilsPMN: Neutrophils

Code

## inbuilt signature genes
ht_sigmtx[[1]]

Expression of custom signature genes (N=1589), derived from our sorted-cell RNAseq expression in 80 training subjects using CIBERSORTx. Expression is centred and scaled across cell types (column) by gene (row).

Code

## custom signature genes

ht_sigmtx[[2]]

Numbers (No.) of debCAM cell-type specific genes by cell type (row) and selection criteria (column). debCAM, which does not have the signature matrix, selects the cell-type-specific genes, that are over-expressed (OVE) in one cell type compared to others combined. OVE fold change (FC) of 1, 2, 5 and 10 were used in debCAM on our sorted cell expression in 80 training subjects.

Code

## debCAM cell type-specific genes by fold changes
## supple for debCAM
## summary of no. markers by 4 FCs

debCAMmarkers <- lapply(debCAMresFract, function(dat) {
  sapply(dat$markers, function(x) {
    length(x)
  })
}) %>%
  do.call(rbind, .) %>%
  as.data.frame() %>%
  setDT(., keep.rownames = TRUE)

## no. markers by fc from debcam
debbarplt <- melt(debCAMmarkers, id.vars = "rn", measure.vars = sctorder, variable.name = "celltype", variable.factor = sctorder)[, rn := factor(rn, names(debCAMresFract))] %>%
  ggplot(data = ., aes(x = celltype, y = value)) +
  geom_bar(stat = "identity", fill = "#69b3a2", width = 0.8) +
  geom_text(aes(label = value, hjust = ifelse(value < 500, 0, 0.8)), size = 3.5) +
  coord_flip() +
  facet_wrap(~rn, ncol = 4) +
  ylab("No. cell-type specific genes") +
  xlab("cell type") +
  used_ggthemes()


debbarplt

figS10c <- melt(debCAMmarkers, id.vars = "rn", measure.vars = sctorder, variable.name = "celltype", variable.factor = sctorder)[, rn := factor(rn, names(debCAMresFract))]

The first four principal components from PCA analysis of inbuilt signature gene expression in test samples. Each dot is a sample, coloured by cell type and shaped by sequencing batch.

Code

## PCA analysis of cell type expression data in test samples
### inbuilt signature genes

pca1_4res$inbuilt$pcaplts

The first four principal components from PCA analysis of custom signature gene expression in test samples.

Code

### custum signature genes
pca1_4res$custom$pcaplts

The first four principal components from PCA analysis of debCAM cell-type specific gene expression in test samples.

Code

### debCAM cell-type specific genes

pca1_4res$debCAM$pcaplts

2.11 SupFig12, Combat-seq

The first four principal components from PCA analysis of log2(count-per-million) expression derived from raw read counts in RNAseq samples used in this work.

Code

res_d <- file.path(data_dir, "ZenodoCLUSTER")
rawreadtbl <- read.table(file.path(res_d, "RawReadCounts_Zenodo.csv"), row.names = 1, sep = ",", header = TRUE)
rawreadtbl <- rawreadtbl[, sampdata$sample]

adjreadtbl <- file.path(res_d, "BatchCorrectedReadCounts_Zenodo.csv") %>%
  read.table(file = ., row.names = 1, sep = ",", header = TRUE)
adjreadtbl <- adjreadtbl[, sampdata$sample]

geneMeta <- file.path(res_d, "GeneMetaInfo_Zenodo.csv") %>%
  read.table(
    file = ., row.names = 1, sep = ",",
    header = TRUE
  )


pca_res <- list(rawreadtbl, adjreadtbl) %>%
  lapply(., function(countmtx) {
    objinput <- DGEList(
      counts = countmtx,
      genes = geneMeta[match(rownames(countmtx), geneMeta$Geneid), ]
    )

    keep <- filterByExpr(objinput,
      group = sampdata$celltype[match(colnames(objinput), sampdata$sample)]
    )
    objinput <- objinput[keep, , keep.lib.sizes = FALSE]
    objinput <- calcNormFactors(objinput)
    matcpm <- cpm(objinput, log = TRUE)
    respca <- prcomp(x = t(matcpm))
    stopifnot(all(rownames(respca$x) == sampdata$sample))
    dsn4pca <- cbind(respca$x[, 1:4], sampdata[, .(celltype, batch)])
    dsn4pca$batch <- factor(dsn4pca$batch, levels = 1:4)
    dsn4pca$celltype <- factor(
      dsn4pca$celltype,
      levels = c("PBMC", sctorder)
    )
    dsn4pca
  })


figS12ab <- copy(pca_res)
names(figS12ab) <- paste0("figS12", letters[seq_along(pca_res)])

ctfrat.palpmbc <- c(ctfrat.pal, "#AAAA00")
names(ctfrat.palpmbc) <- c(sctorder, "PBMC")

pca_plts <- lapply(pca_res, function(datin) {
  pcaplts <- lapply(1:2, function(x) {
    x1 <- paste0("PC", 2 * x - 1)
    y1 <- paste0("PC", 2 * x)

    ggscatter(datin,
      x = x1, y = y1,
      fill = "celltype",
      shape = "batch",
      alpha = 0.9,
      palette = ctfrat.palpmbc
    ) +
      scale_shape_manual(
        values = c(
          "1" = 21,
          "2" = 22,
          "3" = 23,
          "4" = 24
        ),
        limits = force
      ) +
      guides(fill = guide_legend(override.aes = list(shape = 21))) +
      used_ggthemes()
  })
})

## raw
ggarrange(plotlist = pca_plts[[1]], common.legend = TRUE)

The first four principal components from PCA analysis of log2(count-per-million) expression derived from Combat-Seq batch-adjusted read counts in RNAseq samples used in this work

Code

## combat-seq

ggarrange(plotlist = pca_plts[[2]], common.legend = TRUE)

2.12 SupFig13, Batch composition of subjects by train and test set

Frequencies (A) and percentages (B) of subjects by batch and training/testing set.

Code

## Batch composition of subjects by train and test set
categoryplt <- function(dat, xgroup, fillvar, ...) {
  ## count
  adsn <- dat[!is.na(get(xgroup)), .N, by = c(xgroup, fillvar)][eval(parse(text = paste0("order(", xgroup, ",", fillvar, ")")))]

  ares <- adsn %>% ggbarplot(
    data = ., x = xgroup, y = "N",
    fill = fillvar,
    ylab = "Counts", ...
  )
  ## percentage
  bdsn <- dat[!is.na(get(xgroup)), .N, by = c(xgroup, fillvar)][eval(parse(text = paste0("order(", xgroup, ",", fillvar, ")")))]
  bdsn <- bdsn[,
    {
      an <- sum(N)
      props <- N / an
      labelt <- round(props, 2)
      list(eval(parse(text = fillvar)), N, an, props, labelt)
    },
    by = xgroup
  ]

  bres <- bdsn %>% ggbarplot(
    data = ., x = xgroup, y = "labelt", fill = "V1",
    ylab = "Percentages", ...
  )
  bres

  overp <- chisq.test(dat[[fillvar]], dat[[xgroup]])$p.value
  overp <- ifelse(overp < 10^-2,
    paste0("=", format(overp,
      nsmall = 2,
      digits = 3,
      scientific = TRUE
    )),
    paste0("=", round(overp, 2))
  )
  if (class(dat[[xgroup]]) == "character") {
    dat[[xgroup]] <- factor(dat[[xgroup]])
  }
  ## grp <- unique(dat[[xgroup]])
  grp <- levels(dat[[xgroup]])
  dddf <- data.table(
    group1 = grp[1], group2 = grp[length(grp)],
    p = paste0(" p ", overp)
  )
  bres <- bres +
    stat_pvalue_manual(dddf,
      y.position = 1.02,
      bracket.nudge.y = 0.02,
      label = "Chi-square test, {p}"
    ) +
    labs(fill = fillvar)

  list(ares, bres)
}

## subjects by train and test set
sampdata.sub <- unique(sampdata, by = "samp")
sampdata.sub[, Batch := factor(batch, levels = seq(1, 4))]
sampdata.sub[, sample_type := factor(samtype, levels = c("refsamp", "nonrefsamp"), labels = c("train", "Test"))]

categoryplt(
  dat = sampdata.sub,
  xgroup = "sample_type",
  fillvar = "Batch",
  xlab = FALSE,
  label = TRUE,
  lab.pos = "in",
  palette = "jama"
) %>%
  ggarrange(plotlist = ., common.legend = TRUE, labels = LETTERS)

figS13 <- sampdata.sub[, .(samp, sample_type, Batch)]

2.13 SupFig14, Correalations of fractions between inbuilt cell linages with true flow data

Code

## Fractions between 22 inbuilt cell lineages with flow data
####################
## flow fractions ##
####################
fractflow <- file.path(data_dir, "CLUSTERData/flowFract_wide.rds") %>%
  readRDS() %>%
  as.matrix()

## cibx-LM22
#############################
## fractions based on LM22 ##
#############################
lm22merged <- "data4repo/mergedClasses.csv" %>%
  file.path(here::here(), .) %>%
  scan(., what = "character", sep = "\t")

lm22mtx <- lm22mtx_f %>% fread()

res_d <- file.path("CIBXres/LM22deconv", runid) %>%
  file.path(data_dir, .)
res_d
[1] "/rds/project/rds-csoP2nj6Y6Y/wyl37/Rbookdown/sceQTLGen/CLUSTERdeconRes/CIBXres/LM22deconv/cluster"

lm22res <- file.path(res_d, "CIBERSORTxGEP_NA_Fractions-Adjusted.txt") %>%
  fread()

stopifnot(names(lm22res)[-1][1:length(names(lm22mtx)[-1])] == names(lm22mtx)[-1])

## fractions in 22 cell types
lm22res.mtx <- lm22res[, names(lm22mtx)[-1], with = FALSE] %>% as.matrix()
rownames(lm22res.mtx) <- lm22res$Mixture

ourorder <- lm22merged
names(ourorder) <- names(lm22mtx)[-1]
othercls <- setdiff(unique(ourorder), sctorder)
ourorder <- sort(factor(ourorder, levels = c(sctorder, othercls)))

## sample order matched to fractflow
lm22res.mtx <- lm22res.mtx[rownames(fractflow), names(ourorder)]

## test samples
nonrefsamps <- sampdata[samtype == "nonrefsamp" & celltype == "PBMC", sample]

## correlation
mychks_test <- cor(
  x = fractflow[nonrefsamps, ],
  y = lm22res.mtx[nonrefsamps, ]
)

annotcoldf <- data.frame(mergedClass = ourorder)
rownames(annotcoldf) <- names(ourorder)

## find column split
kkkres <- S4Vectors::Rle(ourorder)


annotcol_pal <- friend11
names(annotcol_pal) <- levels(ourorder)

annotcol_pal_dat <- list(mergedClass = annotcol_pal)


## heatmap correlation
library(circlize)
col_fun <- circlize::colorRamp2(
  c(1, 0, -1),
  colorspace::diverging_hcl(palette = "Red-Green", n = 3)
)

gp <- gpar(fontface = "bold")
update_gpar(gp, gpar(fontsize = 10))
$font
[1] 2

$fontsize
[1] 10

beforeCorht <- ComplexHeatmap::pheatmap(
  mat = mychks_test,
  name = "Cor",
  color = col_fun,
  cluster_rows = FALSE, cluster_cols = FALSE,
  display_numbers = TRUE, number_format = "%.2f",
  number_color = "black",
  annotation_col = annotcoldf,
  annotation_colors = annotcol_pal_dat,
  gaps_col = S4Vectors::end(kkkres),
  cellwidth = 25, cellheight = 25,
  row_names_side = "left"
)

figS14 <- mychks_test

Pearson correlations between inbuilt LM22 and ground-truth flow fractions. Columns are LM22 cell subsets split by their merged classes (top annotation). Rows are ground truth cell types. Each cell is coloured based on the strength of the correlation.

Code

beforeCorht

Code

ddd <- ls(pattern = "figS") %>% str_sort(., numeric = TRUE)
dddidx <- sapply(ddd, get) %>% sapply(., function(x) inherits(x, "list"))

cccin <- ddd[!dddidx]
ccc <- vector(mode = "list", length = length(cccin))
for (i in seq_along(cccin)) {
  ccc[[i]] <- get(cccin[i])
}
names(ccc) <- cccin

ddd[dddidx]
[1] "figS1"     "figS10ab"  "figS10def" "figS12ab" 

outlist <- c(
  figS1, ccc[1:12],
  figS10ab, ccc["figS10c"],
  figS12ab, ccc[14:15]
)

names(outlist) <- toupper(names(outlist)) %>%
  str_extract(., pattern = "S.+$") %>%
  paste0(., "_fig")

openxlsx::write.xlsx(outlist,
  file = "SourceData_SuppFigures.xlsx"
)

2.14 AUCs in Test samples, CLUSTER data

Code

rds_f <- "fig4rundatatestonly.rds"

if (!file.exists(rds_f)) {
  ## main: different gene set; common: same genes per cell across methods
  ## impvexpr_all <- list(main = expr_ori, common = expr_scenarios)
  impvexpr_all <- list(
    main = cdata_explst$iexpr,
    common = cdata_explst$icexpr
  )

  ## TRUE for dichotomous; FALSE for continuous
  dcht_pheno <- c(TRUE, FALSE)
  names(dcht_pheno) <- c("dcht", "cntn")

  numCores <- length(expr_order)
  numCores

  cl <- makeCluster(numCores, type = "FORK")
  registerDoParallel(cl)

  aucall <- foreach(
    j = seq_along(impvexpr_all),
    .final = function(x) setNames(x, names(impvexpr_all))
  ) %:%
    foreach(
      i = dcht_pheno,
      .final = function(x) setNames(x, names(dcht_pheno))
    ) %dopar% {
      observy <- cdata_explst$oexpr
      train_samps_ct <- cdata_explst$trainsamp
      sampdata <- cdata_explst$phenodata

      dat <- run_datprepfig4(
        obvexpr = observy,
        impvexpr = impvexpr_all[[j]],
        tsampct = NULL,
        dich = i
      )
      ## expr ~ pheno
      res1 <- run_fig4(datprlst = dat, truesig = 0.05)

      ## expr~ sex + pheno
      res2 <- run_fig4(
        datprlst = dat, truesig = 0.05,
        covdata = sampdata, covid = "sample", covar = "sex"
      )

      list(nocov = res1, covin = res2)
    }

  stopCluster(cl)

  mychktestonly <- unlist(aucall, recursive = FALSE) %>%
    unlist(., recursive = FALSE) %>%
    lapply(., "[[", "aucres") %>%
    lapply(., function(datin) {
      datin[, ":="(
        celltype = factor(celltype, levels = sctorder),
        approach = factor(approach, levels = expr_order))]
      datin
    }) %>%
    rbindlist(l = ., idcol = "scenarios") %>%
    .[, pheno := gsub("^main\\.|^common\\.", "", scenarios)] %>%
    .[, scenario := str_extract(scenarios, "main|common")]

  saveRDS(mychktestonly, file = rds_f)
} else {
  mychktestonly <- readRDS(rds_f)
}

2.14.1 Varying gene sets

Code

pheno.order <- CJ(c("dcht", "cntn"), c("nocov", "covin"), sorted = FALSE)[, paste(V1, V2, sep = ".")]
pheno.order
[1] "dcht.nocov" "dcht.covin" "cntn.nocov" "cntn.covin"
mychktestonly[, pheno := factor(pheno, levels = pheno.order, labels = c("dichPheno", "dichPheno+sex", "contPheno", "contPheno+sex"))]

## no. auc available
achk <- mychktestonly[scenario == "main" & !is.na(auc), .N, by = c("pheno", "celltype", "approach")][N != 10, ]
achk
            pheno celltype approach N
 1:     dichPheno      CD8  inbuilt 8
 2:     dichPheno      CD8   custom 8
 3:     dichPheno      CD8    bMIND 9
 4:     dichPheno      CD8    swCAM 9
 5:     dichPheno      CD8    LASSO 9
 6:     dichPheno      CD8    RIDGE 9
 7:     dichPheno     CD14  inbuilt 7
 8:     dichPheno     CD14   custom 8
 9:     dichPheno     CD14    bMIND 7
10:     dichPheno     CD14    swCAM 7
11:     dichPheno     CD14    LASSO 7
12:     dichPheno     CD14    RIDGE 7
13:     dichPheno     CD19  inbuilt 8
14:     dichPheno     CD19   custom 8
15:     dichPheno     CD19    bMIND 8
16:     dichPheno     CD19    swCAM 8
17:     dichPheno     CD19    LASSO 8
18:     dichPheno     CD19    RIDGE 8
19: dichPheno+sex      CD4  inbuilt 9
20: dichPheno+sex      CD4   custom 8
21: dichPheno+sex      CD8  inbuilt 7
22: dichPheno+sex      CD8   custom 7
23: dichPheno+sex      CD8    bMIND 8
24: dichPheno+sex      CD8    swCAM 8
25: dichPheno+sex      CD8    LASSO 8
26: dichPheno+sex      CD8    RIDGE 8
27: dichPheno+sex     CD14  inbuilt 8
28: dichPheno+sex     CD14   custom 8
29: dichPheno+sex     CD14    bMIND 7
30: dichPheno+sex     CD14    swCAM 7
31: dichPheno+sex     CD14    LASSO 7
32: dichPheno+sex     CD14    RIDGE 7
33: dichPheno+sex     CD19  inbuilt 8
34: dichPheno+sex     CD19   custom 8
35: dichPheno+sex     CD19    bMIND 9
36: dichPheno+sex     CD19    swCAM 9
37: dichPheno+sex     CD19    LASSO 9
38: dichPheno+sex     CD19    RIDGE 9
39:     contPheno      CD8  inbuilt 9
40:     contPheno     CD14  inbuilt 9
41:     contPheno     CD14   custom 9
42:     contPheno     CD14    bMIND 9
43:     contPheno     CD14    swCAM 9
44:     contPheno     CD14    LASSO 8
45:     contPheno     CD14    RIDGE 9
46: contPheno+sex     CD14  inbuilt 9
47: contPheno+sex     CD14   custom 9
48: contPheno+sex     CD14    LASSO 9
49: contPheno+sex     CD19  inbuilt 9
            pheno celltype approach N

achk_words <- "**Noted AUC was not calculated for some simulated phenotypes because all associations were true negative, and none were false negative. Scenario-celltype-approach with no. AUCs $<$ 10 are listed in the code chunk.**"


ggboxplot(
  data = mychktestonly[scenario == "main", ],
  x = "approach", y = "auc", ylab = "AUC",
  palette = appro.cols,
  alpha = 0.35, fill = "approach",
  add = c("jitter"),
  add.params = list(shape = 21, size = 2.5, alpha = 1)
) +
  scale_fill_manual(labels = appro_lab, values = appro.cols) +
  facet_grid(pheno ~ celltype, scales = "free") +
  labs(fill = NULL) +
  used_ggthemes(
    axis.text.x = element_blank(),
    legend.position = "top",
    strip.text.y.right = element_text(angle = 0),
    legend.text = element_text(size = rel(1))
  ) +
  rremove("xlab")

Code

mychktestonly_tbl <- copy(mychktestonly)
mychktestonly_tbl[, .N, by = c("scenarios", "celltype", "approach")]
             scenarios celltype approach  N
  1:   main.dcht.nocov      CD4  inbuilt 10
  2:   main.dcht.nocov      CD4   custom 10
  3:   main.dcht.nocov      CD4    bMIND 10
  4:   main.dcht.nocov      CD4    swCAM 10
  5:   main.dcht.nocov      CD4    LASSO 10
 ---                                       
188: common.cntn.covin     CD19   custom 10
189: common.cntn.covin     CD19    bMIND 10
190: common.cntn.covin     CD19    swCAM 10
191: common.cntn.covin     CD19    LASSO 10
192: common.cntn.covin     CD19    RIDGE 10

tbl_summary <- mychktestonly_tbl[, list(
  Q50 = median(auc, na.rm = TRUE) %>% round(., 2),
  Q25 = quantile(auc, probs = 0.25, na.rm = TRUE) %>% round(., 2),
  Q75 = quantile(auc, probs = 0.75, na.rm = TRUE) %>% round(., 2)
),
by = c("scenarios", "celltype", "approach")
][
  , V1 := paste0(Q50, " (", Q25, "-", Q75, ")")
]

tbl_summary[, pheno := gsub("^main\\.|^common\\.", "", scenarios)]
tbl_summary[, scenario := str_extract(scenarios, "main|common")]

tbl_res <- dcast(
  data = tbl_summary, pheno + celltype ~ scenario + approach,
  value.var = "V1"
)

setDF(tbl_res, rownames = paste0(tbl_res$pheno, tbl_res$celltype))

pheno.order <- CJ(c("dcht", "cntn"), c("nocov", "covin"), sorted = FALSE)[, paste(V1, V2, sep = ".")]

cols.order <- CJ(c("main", "common"), V2 = expr_order, sorted = FALSE)[, paste(V1, V2, sep = "_")]

rows.order <- CJ(pheno.order, sctorder, sorted = FALSE)[, paste0(pheno.order, sctorder)]

tbl_out <- tbl_res[rows.order, c("celltype", grep("main", cols.order, value = TRUE))]
names(tbl_out) <- gsub("main_|common_", "", names(tbl_out))

capin <- paste0(
  "Median AUC (Q25-Q75) across",
  ifelse(nrow(achk) != 0, "", uniqueN(mychktestonly$fakedPCs)),
  " simulated phentoypes per cell by apparoch. **Genes common across approaches per cell type**"
)


kbl(
  x = tbl_out, row.names = FALSE,
  caption = capin
) %>%
  kable_paper("striped", full_width = FALSE) %>%
  pack_rows("dichotomous pheno", 1, 4,
    label_row_css = "text-align: left;"
  ) %>%
  pack_rows("dichotomous pheno + sex", 5, 8) %>%
  pack_rows("continous pheno", 9, 12) %>%
  pack_rows("continous pheno + sex", 13, 16) %>%
  column_spec(column = 6:7, bold = T)

Median AUC (Q25-Q75) across simulated phentoypes per cell by apparoch. **Genes common across approaches per cell type**
celltype	inbuilt	custom	bMIND	swCAM	LASSO	RIDGE
dichotomous pheno
CD4	0.62 (0.6-0.65)	0.64 (0.6-0.66)	0.63 (0.62-0.66)	0.59 (0.56-0.61)	0.66 (0.6-0.7)	0.65 (0.57-0.71)
CD8	0.6 (0.56-0.64)	0.62 (0.58-0.68)	0.59 (0.58-0.63)	0.5 (0.49-0.52)	0.66 (0.62-0.74)	0.69 (0.65-0.73)
CD14	0.61 (0.56-0.67)	0.57 (0.51-0.65)	0.55 (0.5-0.62)	0.5 (0.5-0.51)	0.53 (0.51-0.63)	0.56 (0.56-0.64)
CD19	0.59 (0.56-0.62)	0.63 (0.59-0.67)	0.63 (0.58-0.68)	0.5 (0.5-0.54)	0.69 (0.59-0.75)	0.7 (0.59-0.76)
dichotomous pheno + sex
CD4	0.62 (0.59-0.62)	0.64 (0.63-0.66)	0.63 (0.61-0.66)	0.58 (0.53-0.61)	0.66 (0.59-0.69)	0.65 (0.57-0.7)
CD8	0.58 (0.5-0.6)	0.59 (0.55-0.61)	0.59 (0.53-0.61)	0.5 (0.49-0.51)	0.64 (0.62-0.67)	0.68 (0.61-0.7)
CD14	0.6 (0.52-0.65)	0.54 (0.51-0.6)	0.55 (0.5-0.61)	0.51 (0.5-0.51)	0.53 (0.52-0.63)	0.56 (0.56-0.64)
CD19	0.58 (0.5-0.6)	0.61 (0.53-0.63)	0.63 (0.56-0.67)	0.51 (0.5-0.53)	0.67 (0.57-0.77)	0.73 (0.59-0.78)
continous pheno
CD4	0.57 (0.55-0.6)	0.58 (0.56-0.63)	0.59 (0.55-0.63)	0.57 (0.54-0.61)	0.59 (0.53-0.64)	0.6 (0.54-0.64)
CD8	0.58 (0.56-0.63)	0.61 (0.58-0.66)	0.59 (0.58-0.64)	0.5 (0.49-0.55)	0.64 (0.61-0.67)	0.64 (0.61-0.68)
CD14	0.58 (0.52-0.59)	0.56 (0.53-0.57)	0.57 (0.51-0.6)	0.51 (0.5-0.52)	0.55 (0.49-0.59)	0.56 (0.54-0.63)
CD19	0.53 (0.51-0.59)	0.59 (0.48-0.61)	0.56 (0.54-0.63)	0.5 (0.5-0.52)	0.53 (0.5-0.59)	0.54 (0.49-0.6)
continous pheno + sex
CD4	0.57 (0.51-0.6)	0.58 (0.53-0.63)	0.59 (0.53-0.63)	0.56 (0.52-0.62)	0.59 (0.52-0.64)	0.6 (0.54-0.64)
CD8	0.57 (0.55-0.59)	0.59 (0.57-0.61)	0.58 (0.57-0.61)	0.5 (0.49-0.53)	0.61 (0.6-0.65)	0.62 (0.6-0.67)
CD14	0.58 (0.52-0.59)	0.56 (0.53-0.6)	0.56 (0.51-0.6)	0.5 (0.5-0.52)	0.55 (0.52-0.58)	0.58 (0.54-0.63)
CD19	0.53 (0.51-0.58)	0.54 (0.5-0.61)	0.57 (0.53-0.62)	0.5 (0.5-0.52)	0.55 (0.5-0.59)	0.55 (0.5-0.61)

2.14.2 Common gene sets

Code

achk1 <- mychktestonly[scenario == "common" & !is.na(auc), .N, by = c("pheno", "celltype", "approach")][N != 10, ]
achk1
            pheno celltype approach N
 1:     dichPheno      CD8  inbuilt 8
 2:     dichPheno      CD8   custom 8
 3:     dichPheno      CD8    bMIND 8
 4:     dichPheno      CD8    swCAM 8
 5:     dichPheno      CD8    LASSO 8
 6:     dichPheno      CD8    RIDGE 8
 7:     dichPheno     CD14  inbuilt 7
 8:     dichPheno     CD14   custom 7
 9:     dichPheno     CD14    bMIND 7
10:     dichPheno     CD14    swCAM 7
11:     dichPheno     CD14    LASSO 7
12:     dichPheno     CD14    RIDGE 7
13:     dichPheno     CD19  inbuilt 7
14:     dichPheno     CD19   custom 7
15:     dichPheno     CD19    bMIND 7
16:     dichPheno     CD19    swCAM 7
17:     dichPheno     CD19    LASSO 7
18:     dichPheno     CD19    RIDGE 7
19: dichPheno+sex      CD4  inbuilt 9
20: dichPheno+sex      CD4   custom 9
21: dichPheno+sex      CD4    bMIND 9
22: dichPheno+sex      CD4    swCAM 9
23: dichPheno+sex      CD4    LASSO 9
24: dichPheno+sex      CD4    RIDGE 9
25: dichPheno+sex      CD8  inbuilt 7
26: dichPheno+sex      CD8   custom 7
27: dichPheno+sex      CD8    bMIND 7
28: dichPheno+sex      CD8    swCAM 7
29: dichPheno+sex      CD8    LASSO 7
30: dichPheno+sex      CD8    RIDGE 7
31: dichPheno+sex     CD14  inbuilt 7
32: dichPheno+sex     CD14   custom 7
33: dichPheno+sex     CD14    bMIND 7
34: dichPheno+sex     CD14    swCAM 7
35: dichPheno+sex     CD14    LASSO 7
36: dichPheno+sex     CD14    RIDGE 7
37: dichPheno+sex     CD19  inbuilt 8
38: dichPheno+sex     CD19   custom 8
39: dichPheno+sex     CD19    bMIND 8
40: dichPheno+sex     CD19    swCAM 8
41: dichPheno+sex     CD19    LASSO 8
42: dichPheno+sex     CD19    RIDGE 8
43:     contPheno      CD8  inbuilt 9
44:     contPheno      CD8   custom 9
45:     contPheno      CD8    bMIND 9
46:     contPheno      CD8    swCAM 9
47:     contPheno      CD8    LASSO 9
48:     contPheno      CD8    RIDGE 9
49:     contPheno     CD14  inbuilt 9
50:     contPheno     CD14   custom 9
51:     contPheno     CD14    bMIND 9
52:     contPheno     CD14    swCAM 9
53:     contPheno     CD14    LASSO 9
54:     contPheno     CD14    RIDGE 9
55:     contPheno     CD19  inbuilt 9
56:     contPheno     CD19   custom 9
57:     contPheno     CD19    bMIND 9
58:     contPheno     CD19    swCAM 9
59:     contPheno     CD19    LASSO 9
60:     contPheno     CD19    RIDGE 9
61: contPheno+sex     CD14  inbuilt 9
62: contPheno+sex     CD14   custom 9
63: contPheno+sex     CD14    bMIND 9
64: contPheno+sex     CD14    swCAM 9
65: contPheno+sex     CD14    LASSO 9
66: contPheno+sex     CD14    RIDGE 9
67: contPheno+sex     CD19  inbuilt 9
68: contPheno+sex     CD19   custom 9
69: contPheno+sex     CD19    bMIND 9
70: contPheno+sex     CD19    swCAM 9
71: contPheno+sex     CD19    LASSO 9
72: contPheno+sex     CD19    RIDGE 9
            pheno celltype approach N

ggboxplot(
  data = mychktestonly[scenario == "common", ],
  x = "approach", y = "auc", ylab = "AUC",
  palette = appro.cols,
  alpha = 0.35, fill = "approach",
  add = c("jitter"),
  add.params = list(shape = 21, size = 2.5, alpha = 1)
) +
  scale_fill_manual(labels = appro_lab, values = appro.cols) +
  facet_grid(pheno ~ celltype, scales = "free") +
  labs(fill = NULL) +
  used_ggthemes(
    axis.text.x = element_blank(),
    legend.position = "top",
    strip.text.y.right = element_text(angle = 0),
    legend.text = element_text(size = rel(1))
  ) +
  rremove("xlab")

Differential gene expression (DGE) recovery. **Genes common across approaches were used**. No. common genes are 3837 for CD4, 6274 for CD8, 5185 for CD14, and 2689 for CD19 Area under curve (AUC) distributions by cell type (columns) and approach for each scenario (rows). **dichPheno: simulated dichotomous phenotype; dichPheno+sex: simulated dichotomous phenotype and sex; contPheno: continous phenotype; contPheno+sex: continous phenotype and sex**. Each point is a simulated phenotype, and ten phenotypes were simulated. For each simulated phenotype, the receiver operating characteristic curve and AUC were estimated by FDR fixed at 0.05 in the observed data and varied FDRs from 0 to 1 by 0.05 in the imputed data. Box plots showed the AUC distributions, with horizontal lines from the bottom to the top for 25 %, 50 % and 75 % quantiles, respectively. inbuilt: CIBERSORTx with the inbuilt signature matrix; custom: CIBERSORTx with a custom signature matrix; bMIND: bMIND with flow fractions; swCAM: swCAM with flow fractions; LASSO/RIDGE: regularised multi-response Gaussian models. Noted AUC was not calculated for some simulated phenotypes because all associations were true negative, and none were false negative. Scenario-celltype-approach with no. AUCs \(<\) 10 are listed in the code chunk.

Code

tbl_out <- tbl_res[rows.order, c("celltype", grep("common", cols.order, value = TRUE))]
names(tbl_out) <- gsub("main_|common_", "", names(tbl_out))

capin <- paste0(
  "Median AUC (Q25-Q75) across",
  ifelse(nrow(achk1) != 0, "", uniqueN(mychktestonly$fakedPCs)),
  " simulated phentoypes per cell by apparoch. **Genes common across approaches per cell type**"
)

kbl(
  x = tbl_out, row.names = FALSE,
  caption = capin
) %>%
  kable_paper("striped", full_width = FALSE) %>%
  column_spec(column = 6:7, bold = TRUE) %>%
  pack_rows("dichotomous pheno", 1, 4,
    label_row_css = "text-align: left;"
  ) %>%
  pack_rows("dichotomous pheno + sex", 5, 8) %>%
  pack_rows("continous pheno", 9, 12) %>%
  pack_rows("continous pheno + sex", 13, 16)

Median AUC (Q25-Q75) across simulated phentoypes per cell by apparoch. **Genes common across approaches per cell type**
celltype	inbuilt	custom	bMIND	swCAM	LASSO	RIDGE
dichotomous pheno
CD4	0.63 (0.6-0.64)	0.64 (0.62-0.65)	0.63 (0.61-0.68)	0.6 (0.54-0.65)	0.65 (0.62-0.66)	0.65 (0.54-0.71)
CD8	0.6 (0.57-0.64)	0.62 (0.59-0.69)	0.61 (0.56-0.71)	0.49 (0.49-0.5)	0.69 (0.64-0.74)	0.68 (0.61-0.72)
CD14	0.61 (0.49-0.66)	0.57 (0.53-0.67)	0.6 (0.57-0.64)	0.52 (0.49-0.58)	0.62 (0.56-0.63)	0.64 (0.58-0.69)
CD19	0.6 (0.55-0.64)	0.6 (0.55-0.65)	0.61 (0.56-0.69)	0.53 (0.51-0.58)	0.7 (0.49-0.83)	0.77 (0.51-0.82)
dichotomous pheno + sex
CD4	0.63 (0.62-0.65)	0.64 (0.62-0.64)	0.67 (0.63-0.68)	0.61 (0.53-0.66)	0.65 (0.63-0.67)	0.66 (0.59-0.69)
CD8	0.58 (0.5-0.6)	0.6 (0.55-0.61)	0.56 (0.52-0.61)	0.49 (0.49-0.5)	0.64 (0.6-0.7)	0.67 (0.6-0.73)
CD14	0.6 (0.51-0.65)	0.56 (0.52-0.65)	0.6 (0.58-0.63)	0.53 (0.52-0.56)	0.61 (0.54-0.63)	0.64 (0.59-0.67)
CD19	0.59 (0.51-0.62)	0.59 (0.52-0.65)	0.58 (0.53-0.69)	0.52 (0.5-0.57)	0.74 (0.54-0.8)	0.79 (0.51-0.82)
continous pheno
CD4	0.58 (0.55-0.62)	0.58 (0.56-0.62)	0.6 (0.55-0.64)	0.57 (0.54-0.63)	0.6 (0.53-0.63)	0.6 (0.52-0.63)
CD8	0.58 (0.56-0.64)	0.6 (0.59-0.67)	0.6 (0.59-0.69)	0.51 (0.5-0.56)	0.63 (0.59-0.71)	0.66 (0.61-0.74)
CD14	0.6 (0.52-0.6)	0.57 (0.52-0.61)	0.63 (0.56-0.66)	0.53 (0.51-0.55)	0.56 (0.51-0.6)	0.59 (0.55-0.63)
CD19	0.56 (0.54-0.63)	0.54 (0.5-0.64)	0.57 (0.56-0.68)	0.5 (0.5-0.53)	0.51 (0.5-0.73)	0.52 (0.51-0.78)
continous pheno + sex
CD4	0.57 (0.53-0.62)	0.58 (0.53-0.62)	0.59 (0.53-0.64)	0.57 (0.52-0.63)	0.6 (0.51-0.63)	0.6 (0.52-0.63)
CD8	0.57 (0.56-0.59)	0.6 (0.58-0.61)	0.59 (0.57-0.61)	0.5 (0.5-0.54)	0.62 (0.57-0.68)	0.64 (0.59-0.7)
CD14	0.59 (0.52-0.6)	0.6 (0.52-0.61)	0.61 (0.57-0.66)	0.52 (0.5-0.55)	0.56 (0.51-0.6)	0.58 (0.55-0.63)
CD19	0.56 (0.52-0.61)	0.58 (0.54-0.63)	0.54 (0.51-0.66)	0.51 (0.5-0.53)	0.58 (0.5-0.75)	0.58 (0.49-0.77)

2.15 AUCs in pseudo datasets

testing samples + varying gene sets

Code

simres_pltlist$main$sce2

2.16 AUCs in pseudo datasets

testing samples + common gene set

Code

simres_pltlist$common$sce2

Session info

R version 4.3.3 (2024-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Rocky Linux 8.10 (Green Obsidian)

Matrix products: default
BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so;  LAPACK version 3.9.0

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

time zone: GB
tzcode source: system (glibc)

attached base packages:
[1] grid      parallel  stats     graphics  grDevices datasets  utils    
[8] methods   base     

other attached packages:
 [1] circlize_0.4.16       cowplot_1.1.1         pROC_1.18.4          
 [4] kableExtra_1.3.4.9000 cols4all_0.7-1        ggpubr_0.6.0         
 [7] edgeR_3.42.4          limma_3.56.2          gridExtra_2.3        
[10] ggrepel_0.9.3         RColorBrewer_1.1-3    ggplot2_3.5.1        
[13] ComplexHeatmap_2.18.0 styler_1.10.2         knitr_1.44           
[16] stringr_1.5.0         magrittr_2.0.3        data.table_1.14.8    
[19] doParallel_1.0.17     iterators_1.0.14      foreach_1.5.2        

loaded via a namespace (and not attached):
  [1] rstudioapi_0.15.0       jsonlite_1.8.7          shape_1.4.6            
  [4] farver_2.1.1            rmarkdown_2.25          GlobalOptions_0.1.2    
  [7] vctrs_0.6.5             rstatix_0.7.2           webshot_0.5.5          
 [10] htmltools_0.5.6.1       polynom_1.4-1           curl_5.1.0             
 [13] broom_1.0.5             htmlwidgets_1.6.2       plyr_1.8.9             
 [16] mime_0.12               lifecycle_1.0.4         pkgconfig_2.0.3        
 [19] Matrix_1.6-5            R6_2.5.1                fastmap_1.1.1          
 [22] shiny_1.7.5             clue_0.3-65             digest_0.6.33          
 [25] colorspace_2.1-0        S4Vectors_0.38.2        rprojroot_2.0.3        
 [28] labeling_0.4.3          spacesXYZ_1.3-0         fansi_1.0.5            
 [31] httr_1.4.7              abind_1.4-5             mgcv_1.9-1             
 [34] compiler_4.3.3          here_1.0.1              fontquiver_0.2.1       
 [37] withr_2.5.1             backports_1.4.1         carData_3.0-5          
 [40] highr_0.10              Rttf2pt1_1.3.12         R.utils_2.12.2         
 [43] ggsignif_0.6.4          MASS_7.3-60.0.1         rjson_0.2.21           
 [46] ggpp_0.5.8-1            ggsci_3.0.0             gfonts_0.2.0           
 [49] tools_4.3.3             zip_2.3.0               httpuv_1.6.11          
 [52] extrafontdb_1.0         R.oo_1.25.0             glue_1.7.0             
 [55] nlme_3.1-164            R.cache_0.16.0          promises_1.2.1         
 [58] cluster_2.1.6           generics_0.1.3          gtable_0.3.4           
 [61] R.methodsS3_1.8.2       tidyr_1.3.0             xml2_1.3.5             
 [64] car_3.1-2               utf8_1.2.3              BiocGenerics_0.46.0    
 [67] pillar_1.9.0            later_1.3.1             splines_4.3.3          
 [70] dplyr_1.1.4             lattice_0.22-5          renv_1.0.10            
 [73] tidyselect_1.2.0        fontLiberation_0.1.0    locfit_1.5-9.8         
 [76] fontBitstreamVera_0.1.1 IRanges_2.34.1          svglite_2.1.2          
 [79] crul_1.4.0              stats4_4.3.3            xfun_0.40              
 [82] matrixStats_1.0.0       pheatmap_1.0.12         stringi_1.7.12         
 [85] yaml_2.3.7              evaluate_0.22           codetools_0.2-19       
 [88] httpcode_0.3.0          extrafont_0.19          gdtools_0.3.3          
 [91] tibble_3.2.1            BiocManager_1.30.22     cli_3.6.2              
 [94] xtable_1.8-4            systemfonts_1.0.5       munsell_0.5.0          
 [97] Rcpp_1.0.11             png_0.1-8               ellipsis_0.3.2         
[100] viridisLite_0.4.2       hrbrthemes_0.8.0        scales_1.3.0           
[103] openxlsx_4.2.5.2        purrr_1.0.2             crayon_1.5.2           
[106] GetoptLong_1.0.5        rlang_1.1.3             rvest_1.0.3