CASCADE 2.0 Analysis (Link Operator Mutations)
Performance of automatically parameterized models against the SINTEF dataset (Flobak et al. 2019)
HSA results
- HSA refers to the synergy method used in Drabme to assess the synergies from the gitsbe models
- We test performance using ROC and PR AUC for both the ensemble-wise and model-wise synergies from Drabme
- Calibrated models: fitted to steady state (\(50,100,150,200\) simulations)
- Random models: fitted to proliferation profile (\(150\) simulations)
- Gitsbe models have mutations on link operator only
Load results:
# 'ss' => calibrated models, 'prolif' => random models
# 'ew' => ensemble-wise, 'mw' => model-wise
## HSA results
= paste0("results/link-only/cascade_2.0_ss_50sim_fixpoints_hsa_ensemblewise_synergies.tab")
ss_hsa_ensemblewise_50sim_file = paste0("results/link-only/cascade_2.0_ss_50sim_fixpoints_hsa_modelwise_synergies.tab")
ss_hsa_modelwise_50sim_file = paste0("results/link-only/cascade_2.0_ss_100sim_fixpoints_hsa_ensemblewise_synergies.tab")
ss_hsa_ensemblewise_100sim_file = paste0("results/link-only/cascade_2.0_ss_100sim_fixpoints_hsa_modelwise_synergies.tab")
ss_hsa_modelwise_100sim_file = paste0("results/link-only/cascade_2.0_ss_150sim_fixpoints_hsa_ensemblewise_synergies.tab")
ss_hsa_ensemblewise_150sim_file = paste0("results/link-only/cascade_2.0_ss_150sim_fixpoints_hsa_modelwise_synergies.tab")
ss_hsa_modelwise_150sim_file = paste0("results/link-only/cascade_2.0_ss_200sim_fixpoints_hsa_ensemblewise_synergies.tab")
ss_hsa_ensemblewise_200sim_file = paste0("results/link-only/cascade_2.0_ss_200sim_fixpoints_hsa_modelwise_synergies.tab")
ss_hsa_modelwise_200sim_file = paste0("results/link-only/cascade_2.0_rand_150sim_fixpoints_hsa_ensemblewise_synergies.tab")
prolif_hsa_ensemblewise_file = paste0("results/link-only/cascade_2.0_rand_150sim_fixpoints_hsa_modelwise_synergies.tab")
prolif_hsa_modelwise_file
= emba::get_synergy_scores(ss_hsa_ensemblewise_50sim_file)
ss_hsa_ensemblewise_synergies_50sim = emba::get_synergy_scores(ss_hsa_modelwise_50sim_file, file_type = "modelwise")
ss_hsa_modelwise_synergies_50sim = emba::get_synergy_scores(ss_hsa_ensemblewise_100sim_file)
ss_hsa_ensemblewise_synergies_100sim = emba::get_synergy_scores(ss_hsa_modelwise_100sim_file, file_type = "modelwise")
ss_hsa_modelwise_synergies_100sim = emba::get_synergy_scores(ss_hsa_ensemblewise_150sim_file)
ss_hsa_ensemblewise_synergies_150sim = emba::get_synergy_scores(ss_hsa_modelwise_150sim_file, file_type = "modelwise")
ss_hsa_modelwise_synergies_150sim = emba::get_synergy_scores(ss_hsa_ensemblewise_200sim_file)
ss_hsa_ensemblewise_synergies_200sim = emba::get_synergy_scores(ss_hsa_modelwise_200sim_file, file_type = "modelwise")
ss_hsa_modelwise_synergies_200sim = emba::get_synergy_scores(prolif_hsa_ensemblewise_file)
prolif_hsa_ensemblewise_synergies_150sim = emba::get_synergy_scores(prolif_hsa_modelwise_file, file_type = "modelwise")
prolif_hsa_modelwise_synergies_150sim
# calculate probability of synergy in the modelwise results
ss_hsa_modelwise_synergies_50sim = ss_hsa_modelwise_synergies_50sim %>%
  mutate(synergy_prob_ss = synergies/(synergies + `non-synergies`))
ss_hsa_modelwise_synergies_100sim = ss_hsa_modelwise_synergies_100sim %>%
  mutate(synergy_prob_ss = synergies/(synergies + `non-synergies`))
ss_hsa_modelwise_synergies_150sim = ss_hsa_modelwise_synergies_150sim %>%
  mutate(synergy_prob_ss = synergies/(synergies + `non-synergies`))
ss_hsa_modelwise_synergies_200sim = ss_hsa_modelwise_synergies_200sim %>%
  mutate(synergy_prob_ss = synergies/(synergies + `non-synergies`))
prolif_hsa_modelwise_synergies_150sim = prolif_hsa_modelwise_synergies_150sim %>%
  mutate(synergy_prob_prolif = synergies/(synergies + `non-synergies`))

observed_synergies_file = 'data/observed_synergies_cascade_2.0'
observed_synergies = emba::get_observed_synergies(observed_synergies_file)
# 1 (positive/observed synergy) or 0 (negative/not observed) for all tested drug combinations
observed = sapply(prolif_hsa_modelwise_synergies_150sim$perturbation %in% observed_synergies, as.integer)
# 'ew' => ensemble-wise, 'mw' => model-wise
pred_ew_hsa = bind_cols(
  ss_hsa_ensemblewise_synergies_50sim %>% select(score) %>% rename(ss_score_50sim = score),
  ss_hsa_ensemblewise_synergies_100sim %>% select(score) %>% rename(ss_score_100sim = score),
  ss_hsa_ensemblewise_synergies_150sim %>% select(score) %>% rename(ss_score_150sim = score),
  ss_hsa_ensemblewise_synergies_200sim %>% select(score) %>% rename(ss_score_200sim = score),
  prolif_hsa_ensemblewise_synergies_150sim %>% select(score) %>% rename(prolif_score_150sim = score),
  as_tibble_col(observed, column_name = "observed"))

pred_mw_hsa = bind_cols(
  ss_hsa_modelwise_synergies_50sim %>% select(perturbation, synergy_prob_ss) %>% rename(synergy_prob_ss_50sim = synergy_prob_ss),
  ss_hsa_modelwise_synergies_100sim %>% select(synergy_prob_ss) %>% rename(synergy_prob_ss_100sim = synergy_prob_ss),
  ss_hsa_modelwise_synergies_150sim %>% select(synergy_prob_ss) %>% rename(synergy_prob_ss_150sim = synergy_prob_ss),
  ss_hsa_modelwise_synergies_200sim %>% select(synergy_prob_ss) %>% rename(synergy_prob_ss_200sim = synergy_prob_ss),
  prolif_hsa_modelwise_synergies_150sim %>% select(synergy_prob_prolif) %>% rename(synergy_prob_prolif_150sim = synergy_prob_prolif),
  as_tibble_col(observed, column_name = "observed"))
ROC curves
res_ss_ew_50sim = get_roc_stats(df = pred_ew_hsa, pred_col = "ss_score_50sim", label_col = "observed")
res_ss_ew_100sim = get_roc_stats(df = pred_ew_hsa, pred_col = "ss_score_100sim", label_col = "observed")
res_ss_ew_150sim = get_roc_stats(df = pred_ew_hsa, pred_col = "ss_score_150sim", label_col = "observed")
res_ss_ew_200sim = get_roc_stats(df = pred_ew_hsa, pred_col = "ss_score_200sim", label_col = "observed")
res_prolif_ew_150sim = get_roc_stats(df = pred_ew_hsa, pred_col = "prolif_score_150sim", label_col = "observed")

res_ss_mw_50sim = get_roc_stats(df = pred_mw_hsa, pred_col = "synergy_prob_ss_50sim", label_col = "observed", direction = ">")
res_ss_mw_100sim = get_roc_stats(df = pred_mw_hsa, pred_col = "synergy_prob_ss_100sim", label_col = "observed", direction = ">")
res_ss_mw_150sim = get_roc_stats(df = pred_mw_hsa, pred_col = "synergy_prob_ss_150sim", label_col = "observed", direction = ">")
res_ss_mw_200sim = get_roc_stats(df = pred_mw_hsa, pred_col = "synergy_prob_ss_200sim", label_col = "observed", direction = ">")
res_prolif_mw_150sim = get_roc_stats(df = pred_mw_hsa, pred_col = "synergy_prob_prolif_150sim", label_col = "observed", direction = ">")
# Plot ROCs
plot(x = res_ss_ew_50sim$roc_stats$FPR, y = res_ss_ew_50sim$roc_stats$TPR,
type = 'l', lwd = 3, col = my_palette[1], main = 'ROC curve, Ensemble-wise synergies (HSA)',
xlab = 'False Positive Rate (FPR)', ylab = 'True Positive Rate (TPR)')
lines(x = res_ss_ew_100sim$roc_stats$FPR, y = res_ss_ew_100sim$roc_stats$TPR,
lwd = 3, col = my_palette[2])
lines(x = res_ss_ew_150sim$roc_stats$FPR, y = res_ss_ew_150sim$roc_stats$TPR,
lwd = 3, col = my_palette[3])
lines(x = res_ss_ew_200sim$roc_stats$FPR, y = res_ss_ew_200sim$roc_stats$TPR,
lwd = 3, col = my_palette[4])
lines(x = res_prolif_ew_150sim$roc_stats$FPR, y = res_prolif_ew_150sim$roc_stats$TPR,
lwd = 3, col = my_palette[5])
legend('bottomright', title = 'AUC', col = my_palette[1:5], pch = 19,
legend = c(paste(round(res_ss_ew_50sim$AUC, digits = 3), "Calibrated (50 sim)"),
paste(round(res_ss_ew_100sim$AUC, digits = 3), "Calibrated (100 sim)"),
paste(round(res_ss_ew_150sim$AUC, digits = 3), "Calibrated (150 sim)"),
paste(round(res_ss_ew_200sim$AUC, digits = 3), "Calibrated (200 sim)"),
paste(round(res_prolif_ew_150sim$AUC, digits = 3), "Random (150 sim)")))
grid(lwd = 0.5)
abline(a = 0, b = 1, col = 'lightgrey', lty = 'dotdash', lwd = 1.2)
plot(x = res_ss_mw_50sim$roc_stats$FPR, y = res_ss_mw_50sim$roc_stats$TPR,
type = 'l', lwd = 3, col = my_palette[1], main = 'ROC curve, Model-wise synergies (HSA)',
xlab = 'False Positive Rate (FPR)', ylab = 'True Positive Rate (TPR)')
lines(x = res_ss_mw_100sim$roc_stats$FPR, y = res_ss_mw_100sim$roc_stats$TPR,
lwd = 3, col = my_palette[2])
lines(x = res_ss_mw_150sim$roc_stats$FPR, y = res_ss_mw_150sim$roc_stats$TPR,
lwd = 3, col = my_palette[3])
lines(x = res_ss_mw_200sim$roc_stats$FPR, y = res_ss_mw_200sim$roc_stats$TPR,
lwd = 3, col = my_palette[4])
lines(x = res_prolif_mw_150sim$roc_stats$FPR, y = res_prolif_mw_150sim$roc_stats$TPR,
lwd = 3, col = my_palette[5])
legend('bottomright', title = 'AUC', col = my_palette[1:5], pch = 19,
legend = c(paste(round(res_ss_mw_50sim$AUC, digits = 3), "Calibrated (50 sim)"),
paste(round(res_ss_mw_100sim$AUC, digits = 3), "Calibrated (100 sim)"),
paste(round(res_ss_mw_150sim$AUC, digits = 3), "Calibrated (150 sim)"),
paste(round(res_ss_mw_200sim$AUC, digits = 3), "Calibrated (200 sim)"),
paste(round(res_prolif_mw_150sim$AUC, digits = 3), "Random (150 sim)")))
grid(lwd = 0.5)
abline(a = 0, b = 1, col = 'lightgrey', lty = 'dotdash', lwd = 1.2)
PR curves
pr_ss_ew_hsa_50sim = pr.curve(scores.class0 = pred_ew_hsa %>% pull(ss_score_50sim) %>% (function(x) {-x}),
  weights.class0 = pred_ew_hsa %>% pull(observed), curve = TRUE, rand.compute = TRUE)
pr_ss_ew_hsa_100sim = pr.curve(scores.class0 = pred_ew_hsa %>% pull(ss_score_100sim) %>% (function(x) {-x}),
  weights.class0 = pred_ew_hsa %>% pull(observed), curve = TRUE)
pr_ss_ew_hsa_150sim = pr.curve(scores.class0 = pred_ew_hsa %>% pull(ss_score_150sim) %>% (function(x) {-x}),
  weights.class0 = pred_ew_hsa %>% pull(observed), curve = TRUE)
pr_ss_ew_hsa_200sim = pr.curve(scores.class0 = pred_ew_hsa %>% pull(ss_score_200sim) %>% (function(x) {-x}),
  weights.class0 = pred_ew_hsa %>% pull(observed), curve = TRUE)
pr_prolif_ew_hsa_150sim = pr.curve(scores.class0 = pred_ew_hsa %>% pull(prolif_score_150sim) %>% (function(x) {-x}),
  weights.class0 = pred_ew_hsa %>% pull(observed), curve = TRUE)

pr_ss_mw_hsa_50sim = pr.curve(scores.class0 = pred_mw_hsa %>% pull(synergy_prob_ss_50sim),
  weights.class0 = pred_mw_hsa %>% pull(observed), curve = TRUE, rand.compute = TRUE)
pr_ss_mw_hsa_100sim = pr.curve(scores.class0 = pred_mw_hsa %>% pull(synergy_prob_ss_100sim),
  weights.class0 = pred_mw_hsa %>% pull(observed), curve = TRUE)
pr_ss_mw_hsa_150sim = pr.curve(scores.class0 = pred_mw_hsa %>% pull(synergy_prob_ss_150sim),
  weights.class0 = pred_mw_hsa %>% pull(observed), curve = TRUE)
pr_ss_mw_hsa_200sim = pr.curve(scores.class0 = pred_mw_hsa %>% pull(synergy_prob_ss_200sim),
  weights.class0 = pred_mw_hsa %>% pull(observed), curve = TRUE)
pr_prolif_mw_hsa_150sim = pr.curve(scores.class0 = pred_mw_hsa %>% pull(synergy_prob_prolif_150sim),
  weights.class0 = pred_mw_hsa %>% pull(observed), curve = TRUE)
plot(pr_ss_ew_hsa_50sim, main = 'PR curve, Ensemble-wise synergies (HSA)',
auc.main = FALSE, color = my_palette[1], rand.plot = TRUE)
plot(pr_ss_ew_hsa_100sim, add = TRUE, color = my_palette[2])
plot(pr_ss_ew_hsa_150sim, add = TRUE, color = my_palette[3])
plot(pr_ss_ew_hsa_200sim, add = TRUE, color = my_palette[4])
plot(pr_prolif_ew_hsa_150sim, add = TRUE, color = my_palette[5])
legend('topright', title = 'AUC', col = my_palette[1:5], pch = 19,
legend = c(paste(round(pr_ss_ew_hsa_50sim$auc.davis.goadrich, digits = 3), "Calibrated (50 sim)"),
paste(round(pr_ss_ew_hsa_100sim$auc.davis.goadrich, digits = 3), "Calibrated (100 sim)"),
paste(round(pr_ss_ew_hsa_150sim$auc.davis.goadrich, digits = 3), "Calibrated (150 sim)"),
paste(round(pr_ss_ew_hsa_200sim$auc.davis.goadrich, digits = 3), "Calibrated (200 sim)"),
paste(round(pr_prolif_ew_hsa_150sim$auc.davis.goadrich, digits = 3), "Random (150 sim)")))
grid(lwd = 0.5)
plot(pr_ss_mw_hsa_50sim, main = 'PR curve, Model-wise synergies (HSA)',
auc.main = FALSE, color = my_palette[1], rand.plot = TRUE)
plot(pr_ss_mw_hsa_100sim, add = TRUE, color = my_palette[2])
plot(pr_ss_mw_hsa_150sim, add = TRUE, color = my_palette[3])
plot(pr_ss_mw_hsa_200sim, add = TRUE, color = my_palette[4])
plot(pr_prolif_mw_hsa_150sim, add = TRUE, color = my_palette[5])
legend('topright', title = 'AUC', col = my_palette[1:5], pch = 19,
legend = c(paste(round(pr_ss_mw_hsa_50sim$auc.davis.goadrich, digits = 3), "Calibrated (50 sim)"),
paste(round(pr_ss_mw_hsa_100sim$auc.davis.goadrich, digits = 3), "Calibrated (100 sim)"),
paste(round(pr_ss_mw_hsa_150sim$auc.davis.goadrich, digits = 3), "Calibrated (150 sim)"),
paste(round(pr_ss_mw_hsa_200sim$auc.davis.goadrich, digits = 3), "Calibrated (200 sim)"),
paste(round(pr_prolif_mw_hsa_150sim$auc.davis.goadrich, digits = 3), "Random (150 sim)")))
grid(lwd = 0.5)
- To minimize the resulting performance variance, \(150\) seems to be a good number of Gitsbe simulations to run for the CASCADE 2.0 network.
- The PR curves show that the performance of each individual predictor is poor compared to the baseline. Someone looking only at the ROC curves might reach a different conclusion.
- Random models perform almost equally well to calibrated models.
- The model-wise approach produces slightly better ROC results than the ensemble-wise approach.
AUC sensitivity
We investigate the same thing as described here. This is crucial since the PR performance is poor for the individual predictors, but a combined predictor might be able to counter this. We will combine the synergy scores from the random proliferative simulations with the results from the calibrated Gitsbe simulations (using the results from the \(150\) simulation runs).
# Ensemble-wise
betas = seq(from = -7.5, to = 7.5, by = 0.1)

prolif_roc = sapply(betas, function(beta) {
  pred_ew_hsa = pred_ew_hsa %>% mutate(combined_score = ss_score_150sim + beta * prolif_score_150sim)
  res = roc.curve(scores.class0 = pred_ew_hsa %>% pull(combined_score) %>% (function(x) {-x}),
    weights.class0 = pred_ew_hsa %>% pull(observed))
  auc_value = res$auc
})

prolif_pr = sapply(betas, function(beta) {
  pred_ew_hsa = pred_ew_hsa %>% mutate(combined_score = ss_score_150sim + beta * prolif_score_150sim)
  res = pr.curve(scores.class0 = pred_ew_hsa %>% pull(combined_score) %>% (function(x) {-x}),
    weights.class0 = pred_ew_hsa %>% pull(observed))
  auc_value = res$auc.davis.goadrich
})

df_ew = as_tibble(cbind(betas, prolif_roc, prolif_pr))
df_ew = df_ew %>% tidyr::pivot_longer(-betas, names_to = "type", values_to = "AUC")
ggline(data = df_ew, x = "betas", y = "AUC", numeric.x.axis = TRUE, color = "type",
plot_type = "l", xlab = TeX("$\\beta$"), ylab = "AUC (Area Under Curve)",
legend = "none", facet.by = "type", palette = my_palette,
panel.labs = list(type = c("Precision-Recall", "ROC")),
title = TeX("AUC sensitivity to $\\beta$ parameter (HSA, CASCADE 2.0)")) +
theme(plot.title = element_text(hjust = 0.5)) +
geom_vline(xintercept = 0) +
geom_vline(xintercept = -1, color = "black", size = 0.3, linetype = "dashed") +
geom_text(aes(x=-1.5, label="β = -1", y=0.25), colour="black", angle=90) +
grids()
# Model-wise
weights = seq(from = 0, to = 1, by = 0.05)

prolif_roc_mw = sapply(weights, function(w) {
  pred_mw_hsa = pred_mw_hsa %>%
    mutate(weighted_prob = (1 - w) * pred_mw_hsa$synergy_prob_ss_150sim + w * pred_mw_hsa$synergy_prob_prolif_150sim)
  res = roc.curve(scores.class0 = pred_mw_hsa %>% pull(weighted_prob),
    weights.class0 = pred_mw_hsa %>% pull(observed))
  auc_value = res$auc
})

prolif_pr_mw = sapply(weights, function(w) {
  pred_mw_hsa = pred_mw_hsa %>%
    mutate(weighted_prob = (1 - w) * pred_mw_hsa$synergy_prob_ss_150sim + w * pred_mw_hsa$synergy_prob_prolif_150sim)
  res = pr.curve(scores.class0 = pred_mw_hsa %>% pull(weighted_prob),
    weights.class0 = pred_mw_hsa %>% pull(observed))
  auc_value = res$auc.davis.goadrich
})

df_mw = as_tibble(cbind(weights, prolif_roc_mw, prolif_pr_mw))
df_mw = df_mw %>% tidyr::pivot_longer(-weights, names_to = "type", values_to = "AUC")
ggline(data = df_mw, x = "weights", y = "AUC", numeric.x.axis = TRUE, color = "type",
plot_type = "l", xlab = TeX("weight $w$"), ylab = "AUC (Area Under Curve)",
legend = "none", facet.by = "type", palette = my_palette,
panel.labs = list(type = c("Precision-Recall", "ROC")), title.position = "center",
title = TeX("AUC sensitivity to weighted average score (HSA, CASCADE 2.0)")) +
theme(plot.title = element_text(hjust = 0.5)) +
grids()
- No added benefit when using the model-wise approach.
- The proliferative models seem to add a small contribution to the calibrated models' performance (ensemble-wise results: the ROC AUC increases, while the PR AUC is essentially unchanged).
- The \(\beta_{best}\) that maximizes the ROC and PR AUC for the combination of random and calibrated models is \(\beta_{best}=-0.3\). For \(\beta=-1\) we do not observe a performance improvement in this case.
Logistic Regression Analysis
We tried fitting a model using logistic regression as a different approach to combining/augmenting the results from the calibrated simulations with the random proliferative ones (for the HSA-assessed ensemble-wise results, where there was a minimal benefit).
model = glm(formula = observed ~ ss_score_150sim + prolif_score_150sim - 1, data = pred_ew_hsa, family = binomial())
model_tidy = broom::tidy(model)
coef1 = model_tidy %>% filter(term == "ss_score_150sim") %>% pull(estimate)
coef2 = model_tidy %>% filter(term == "prolif_score_150sim") %>% pull(estimate)
pred_ew_hsa = pred_ew_hsa %>% mutate(glm = coef1 * ss_score_150sim + coef2 * prolif_score_150sim)
res_roc = get_roc_stats(df = pred_ew_hsa, pred_col = "glm", label_col = "observed")
res_pr = pr.curve(scores.class0 = pred_ew_hsa %>% pull(glm) %>% (function(x) {-x}), weights.class0 = pred_ew_hsa %>% pull(observed))
The fitted model with its coefficients is as follows (note that adding an intercept makes the ROC AUC considerably worse):
extract_eq(model, use_coefs = TRUE)
\[ \log\left[ \frac { \widehat{P( \operatorname{observed} = \operatorname{1} )} }{ 1 - \widehat{P( \operatorname{observed} = \operatorname{1} )} } \right] = -10.63(\operatorname{ss\_score\_150sim}) + 42.92(\operatorname{prolif\_score\_150sim}) \]
The ROC AUC produced with the logistic regression model is lower than that of the calibrated models alone (with \(150\) Gitsbe simulations): 0.5782313 (the PR AUC is also lower: 0.0527052).
Regularized Logistic Regression Analysis
Because the coefficient values found from the above approach are large, we try a regularized logistic regression approach using the glmnet R package (Friedman et al. 2021).
We cross-validate the \(\lambda\) parameter and try different \(\alpha \in [0,1]\) (\(\alpha=0\) means Ridge regression, \(\alpha=1\) means LASSO, in between means Elastic net), while either minimizing the misclassification error (type.measure = "class") or maximizing the ROC-AUC (type.measure = "auc").
For each respective \(\alpha\) we choose the \(\lambda_{min}\) as the one that minimizes the average CV error.
The intercept was again excluded as it resulted in worse AUC performance.
x = pred_ew_hsa %>% select(ss_score_150sim, prolif_score_150sim) %>% as.matrix()
y = pred_ew_hsa %>% pull(observed)

data_list = list()
index = 1
for (i in 0:10) { # from Ridge to LASSO
  a = i/10
  for (measure in c("auc", "class")) {
    set.seed(42) # for reproducibility
    cvfit = cv.glmnet(x, y, family = "binomial", type.measure = measure, intercept = FALSE, alpha = a)
    coef_mat = coef(cvfit, s = "lambda.min")
    pred_ew_hsa = pred_ew_hsa %>% mutate(glm_reg = coef_mat[1] + coef_mat[2] * ss_score_150sim + coef_mat[3] * prolif_score_150sim)
    res_roc = get_roc_stats(df = pred_ew_hsa, pred_col = "glm_reg", label_col = "observed")
    pr_roc = pr.curve(scores.class0 = pred_ew_hsa %>% pull(glm_reg) %>% (function(x) {-x}),
      weights.class0 = pred_ew_hsa %>% pull(observed))
    data_list[[index]] = as_tibble_row(list(alpha = a, measure = measure, ROC_AUC = res_roc$AUC, PR_AUC = pr_roc$auc.davis.goadrich))
    index = index + 1
  }
}
data = bind_rows(data_list)

# List the best results
data %>% arrange(desc(ROC_AUC)) %>% slice(1:4) %>% kable()
| alpha | measure | ROC_AUC | PR_AUC |
|-------|---------|---------|--------|
| 0.0 | auc | 0.6802721 | 0.0623343 |
| 0.0 | class | 0.6802721 | 0.0623343 |
| 0.1 | auc | 0.5770975 | 0.0527103 |
| 0.2 | auc | 0.5770975 | 0.0527103 |
The best ROC AUC produced with a regularized logistic regression model is also lower than the one using calibrated models alone (with \(150\) Gitsbe simulations).
Note that we get warnings when using glmnet because of the small number of observations for the positive class (observed synergies).
The resulting coefficients vary, but tend to be either all too small or larger on the random proliferative model predictor.
MAMSE ROC Analysis
Using the MAMSE R package (Plante 2017), we try another method to combine the predictor values from the calibrated and the random proliferative models.
The resulting ROC curve gets a little distorted and its AUC is not statistically better than that of the reference sample population (i.e. the calibrated Gitsbe models with \(150\) simulations):
# healthy => non-synergy, diseased => synergy
healthy = list()
healthy[[1]] = pred_ew_hsa %>% filter(observed == 0) %>% pull(ss_score_150sim)
healthy[[2]] = pred_ew_hsa %>% filter(observed == 0) %>% pull(prolif_score_150sim)

diseased = list()
diseased[[1]] = pred_ew_hsa %>% filter(observed == 1) %>% pull(ss_score_150sim)
diseased[[2]] = pred_ew_hsa %>% filter(observed == 1) %>% pull(prolif_score_150sim)

plot(roc(healthy = healthy, diseased = diseased, smalldiseased = TRUE, AUC = TRUE,
  wh = NULL, wd = NULL, FPR = NULL, method = "np"))
Bliss results
- Bliss refers to the synergy method used in Drabme to assess the synergies from the gitsbe models
- We test performance using ROC and PR AUC for both the ensemble-wise and model-wise synergies from Drabme
- Calibrated models: fitted to steady state (\(50,100,150,200\) simulations)
- Random models: fitted to proliferation profile (\(150\) simulations)
- Gitsbe models have mutations on link operator only
Load results:
# 'ss' => calibrated models, 'prolif' => random proliferative models
## Bliss results
= paste0("results/link-only/cascade_2.0_ss_50sim_fixpoints_bliss_ensemblewise_synergies.tab")
ss_bliss_ensemblewise_50sim_file = paste0("results/link-only/cascade_2.0_ss_50sim_fixpoints_bliss_modelwise_synergies.tab")
ss_bliss_modelwise_50sim_file = paste0("results/link-only/cascade_2.0_ss_100sim_fixpoints_bliss_ensemblewise_synergies.tab")
ss_bliss_ensemblewise_100sim_file = paste0("results/link-only/cascade_2.0_ss_100sim_fixpoints_bliss_modelwise_synergies.tab")
ss_bliss_modelwise_100sim_file = paste0("results/link-only/cascade_2.0_ss_150sim_fixpoints_bliss_ensemblewise_synergies.tab")
ss_bliss_ensemblewise_150sim_file = paste0("results/link-only/cascade_2.0_ss_150sim_fixpoints_bliss_modelwise_synergies.tab")
ss_bliss_modelwise_150sim_file = paste0("results/link-only/cascade_2.0_ss_200sim_fixpoints_bliss_ensemblewise_synergies.tab")
ss_bliss_ensemblewise_200sim_file = paste0("results/link-only/cascade_2.0_ss_200sim_fixpoints_bliss_modelwise_synergies.tab")
ss_bliss_modelwise_200sim_file = paste0("results/link-only/cascade_2.0_rand_150sim_fixpoints_bliss_ensemblewise_synergies.tab")
prolif_bliss_ensemblewise_150sim_file = paste0("results/link-only/cascade_2.0_rand_150sim_fixpoints_bliss_modelwise_synergies.tab")
prolif_bliss_modelwise_150sim_file
= emba::get_synergy_scores(ss_bliss_ensemblewise_50sim_file)
ss_bliss_ensemblewise_synergies_50sim = emba::get_synergy_scores(ss_bliss_modelwise_50sim_file, file_type = "modelwise")
ss_bliss_modelwise_synergies_50sim = emba::get_synergy_scores(ss_bliss_ensemblewise_100sim_file)
ss_bliss_ensemblewise_synergies_100sim = emba::get_synergy_scores(ss_bliss_modelwise_100sim_file, file_type = "modelwise")
ss_bliss_modelwise_synergies_100sim = emba::get_synergy_scores(ss_bliss_ensemblewise_150sim_file)
ss_bliss_ensemblewise_synergies_150sim = emba::get_synergy_scores(ss_bliss_modelwise_150sim_file, file_type = "modelwise")
ss_bliss_modelwise_synergies_150sim = emba::get_synergy_scores(ss_bliss_ensemblewise_200sim_file)
ss_bliss_ensemblewise_synergies_200sim = emba::get_synergy_scores(ss_bliss_modelwise_200sim_file, file_type = "modelwise")
ss_bliss_modelwise_synergies_200sim = emba::get_synergy_scores(prolif_bliss_ensemblewise_150sim_file)
prolif_bliss_ensemblewise_synergies_150sim = emba::get_synergy_scores(prolif_bliss_modelwise_150sim_file, file_type = "modelwise")
prolif_bliss_modelwise_synergies_150sim
# calculate probability of synergy in the modelwise results
ss_bliss_modelwise_synergies_50sim = ss_bliss_modelwise_synergies_50sim %>%
  mutate(synergy_prob_ss = synergies/(synergies + `non-synergies`))
ss_bliss_modelwise_synergies_100sim = ss_bliss_modelwise_synergies_100sim %>%
  mutate(synergy_prob_ss = synergies/(synergies + `non-synergies`))
ss_bliss_modelwise_synergies_150sim = ss_bliss_modelwise_synergies_150sim %>%
  mutate(synergy_prob_ss = synergies/(synergies + `non-synergies`))
ss_bliss_modelwise_synergies_200sim = ss_bliss_modelwise_synergies_200sim %>%
  mutate(synergy_prob_ss = synergies/(synergies + `non-synergies`))
prolif_bliss_modelwise_synergies_150sim = prolif_bliss_modelwise_synergies_150sim %>%
  mutate(synergy_prob_prolif = synergies/(synergies + `non-synergies`))
# tidy data
pred_ew_bliss = bind_cols(
  ss_bliss_ensemblewise_synergies_50sim %>% select(perturbation, score) %>% rename(ss_score_50sim = score),
  ss_bliss_ensemblewise_synergies_100sim %>% select(score) %>% rename(ss_score_100sim = score),
  ss_bliss_ensemblewise_synergies_150sim %>% select(score) %>% rename(ss_score_150sim = score),
  ss_bliss_ensemblewise_synergies_200sim %>% select(score) %>% rename(ss_score_200sim = score),
  prolif_bliss_ensemblewise_synergies_150sim %>% select(score) %>% rename(prolif_score_150sim = score),
  as_tibble_col(observed, column_name = "observed"))

pred_mw_bliss = bind_cols(
  ss_bliss_modelwise_synergies_50sim %>% select(perturbation, synergy_prob_ss) %>% rename(synergy_prob_ss_50sim = synergy_prob_ss),
  ss_bliss_modelwise_synergies_100sim %>% select(synergy_prob_ss) %>% rename(synergy_prob_ss_100sim = synergy_prob_ss),
  ss_bliss_modelwise_synergies_150sim %>% select(synergy_prob_ss) %>% rename(synergy_prob_ss_150sim = synergy_prob_ss),
  ss_bliss_modelwise_synergies_200sim %>% select(synergy_prob_ss) %>% rename(synergy_prob_ss_200sim = synergy_prob_ss),
  prolif_bliss_modelwise_synergies_150sim %>% select(synergy_prob_prolif) %>% rename(synergy_prob_prolif_150sim = synergy_prob_prolif),
  as_tibble_col(observed, column_name = "observed"))
ROC curves
# 'ew' => ensemble-wise, 'mw' => model-wise
res_ss_ew_50sim = get_roc_stats(df = pred_ew_bliss, pred_col = "ss_score_50sim", label_col = "observed")
res_ss_ew_100sim = get_roc_stats(df = pred_ew_bliss, pred_col = "ss_score_100sim", label_col = "observed")
res_ss_ew_150sim = get_roc_stats(df = pred_ew_bliss, pred_col = "ss_score_150sim", label_col = "observed")
res_ss_ew_200sim = get_roc_stats(df = pred_ew_bliss, pred_col = "ss_score_200sim", label_col = "observed")
res_prolif_ew_150sim = get_roc_stats(df = pred_ew_bliss, pred_col = "prolif_score_150sim", label_col = "observed")

res_ss_mw_50sim = get_roc_stats(df = pred_mw_bliss, pred_col = "synergy_prob_ss_50sim", label_col = "observed", direction = ">")
res_ss_mw_100sim = get_roc_stats(df = pred_mw_bliss, pred_col = "synergy_prob_ss_100sim", label_col = "observed", direction = ">")
res_ss_mw_150sim = get_roc_stats(df = pred_mw_bliss, pred_col = "synergy_prob_ss_150sim", label_col = "observed", direction = ">")
res_ss_mw_200sim = get_roc_stats(df = pred_mw_bliss, pred_col = "synergy_prob_ss_200sim", label_col = "observed", direction = ">")
res_prolif_mw_150sim = get_roc_stats(df = pred_mw_bliss, pred_col = "synergy_prob_prolif_150sim", label_col = "observed", direction = ">")
# Plot ROCs
plot(x = res_ss_ew_50sim$roc_stats$FPR, y = res_ss_ew_50sim$roc_stats$TPR,
type = 'l', lwd = 3, col = my_palette[1], main = 'ROC curve, Ensemble-wise synergies (Bliss)',
xlab = 'False Positive Rate (FPR)', ylab = 'True Positive Rate (TPR)')
lines(x = res_ss_ew_100sim$roc_stats$FPR, y = res_ss_ew_100sim$roc_stats$TPR,
lwd = 3, col = my_palette[2])
lines(x = res_ss_ew_150sim$roc_stats$FPR, y = res_ss_ew_150sim$roc_stats$TPR,
lwd = 3, col = my_palette[3])
lines(x = res_ss_ew_200sim$roc_stats$FPR, y = res_ss_ew_200sim$roc_stats$TPR,
lwd = 3, col = my_palette[4])
lines(x = res_prolif_ew_150sim$roc_stats$FPR, y = res_prolif_ew_150sim$roc_stats$TPR,
lwd = 3, col = my_palette[5])
legend('topleft', title = 'AUC', col = my_palette[1:5], pch = 19,
legend = c(paste(round(res_ss_ew_50sim$AUC, digits = 2), "Calibrated (50 sim)"),
paste(round(res_ss_ew_100sim$AUC, digits = 2), "Calibrated (100 sim)"),
paste(round(res_ss_ew_150sim$AUC, digits = 2), "Calibrated (150 sim)"),
paste(round(res_ss_ew_200sim$AUC, digits = 2), "Calibrated (200 sim)"),
paste(round(res_prolif_ew_150sim$AUC, digits = 2), "Random (150 sim)")))
grid(lwd = 0.5)
abline(a = 0, b = 1, col = 'lightgrey', lty = 'dotdash', lwd = 1.2)
plot(x = res_ss_mw_50sim$roc_stats$FPR, y = res_ss_mw_50sim$roc_stats$TPR,
type = 'l', lwd = 3, col = my_palette[1], main = 'ROC curve, Model-wise synergies (Bliss)',
xlab = 'False Positive Rate (FPR)', ylab = 'True Positive Rate (TPR)')
lines(x = res_ss_mw_100sim$roc_stats$FPR, y = res_ss_mw_100sim$roc_stats$TPR,
lwd = 3, col = my_palette[2])
lines(x = res_ss_mw_150sim$roc_stats$FPR, y = res_ss_mw_150sim$roc_stats$TPR,
lwd = 3, col = my_palette[3])
lines(x = res_ss_mw_200sim$roc_stats$FPR, y = res_ss_mw_200sim$roc_stats$TPR,
lwd = 3, col = my_palette[4])
lines(x = res_prolif_mw_150sim$roc_stats$FPR, y = res_prolif_mw_150sim$roc_stats$TPR,
lwd = 3, col = my_palette[5])
legend('bottomright', title = 'AUC', col = my_palette[1:5], pch = 19, cex = 0.9,
legend = c(paste(round(res_ss_mw_50sim$AUC, digits = 2), "Calibrated (50 sim)"),
paste(round(res_ss_mw_100sim$AUC, digits = 2), "Calibrated (100 sim)"),
paste(round(res_ss_mw_150sim$AUC, digits = 2), "Calibrated (150 sim)"),
paste(round(res_ss_mw_200sim$AUC, digits = 2), "Calibrated (200 sim)"),
paste(round(res_prolif_mw_150sim$AUC, digits = 2), "Random (150 sim)")))
grid(lwd = 0.5)
abline(a = 0, b = 1, col = 'lightgrey', lty = 'dotdash', lwd = 1.2)
PR curves
pr_ss_ew_bliss_50sim = pr.curve(scores.class0 = pred_ew_bliss %>% pull(ss_score_50sim) %>% (function(x) {-x}),
  weights.class0 = pred_ew_bliss %>% pull(observed), curve = TRUE, rand.compute = TRUE)
pr_ss_ew_bliss_100sim = pr.curve(scores.class0 = pred_ew_bliss %>% pull(ss_score_100sim) %>% (function(x) {-x}),
  weights.class0 = pred_ew_bliss %>% pull(observed), curve = TRUE)
pr_ss_ew_bliss_150sim = pr.curve(scores.class0 = pred_ew_bliss %>% pull(ss_score_150sim) %>% (function(x) {-x}),
  weights.class0 = pred_ew_bliss %>% pull(observed), curve = TRUE)
pr_ss_ew_bliss_200sim = pr.curve(scores.class0 = pred_ew_bliss %>% pull(ss_score_200sim) %>% (function(x) {-x}),
  weights.class0 = pred_ew_bliss %>% pull(observed), curve = TRUE)
pr_prolif_ew_bliss_150sim = pr.curve(scores.class0 = pred_ew_bliss %>% pull(prolif_score_150sim) %>% (function(x) {-x}),
  weights.class0 = pred_ew_bliss %>% pull(observed), curve = TRUE)

pr_ss_mw_bliss_50sim = pr.curve(scores.class0 = pred_mw_bliss %>% pull(synergy_prob_ss_50sim),
  weights.class0 = pred_mw_bliss %>% pull(observed), curve = TRUE, rand.compute = TRUE)
pr_ss_mw_bliss_100sim = pr.curve(scores.class0 = pred_mw_bliss %>% pull(synergy_prob_ss_100sim),
  weights.class0 = pred_mw_bliss %>% pull(observed), curve = TRUE)
pr_ss_mw_bliss_150sim = pr.curve(scores.class0 = pred_mw_bliss %>% pull(synergy_prob_ss_150sim),
  weights.class0 = pred_mw_bliss %>% pull(observed), curve = TRUE)
pr_ss_mw_bliss_200sim = pr.curve(scores.class0 = pred_mw_bliss %>% pull(synergy_prob_ss_200sim),
  weights.class0 = pred_mw_bliss %>% pull(observed), curve = TRUE)
pr_prolif_mw_bliss_150sim = pr.curve(scores.class0 = pred_mw_bliss %>% pull(synergy_prob_prolif_150sim),
  weights.class0 = pred_mw_bliss %>% pull(observed), curve = TRUE)
plot(pr_ss_ew_bliss_50sim, main = 'PR curve, Ensemble-wise synergies (Bliss)',
auc.main = FALSE, color = my_palette[1], rand.plot = TRUE)
plot(pr_ss_ew_bliss_100sim, add = TRUE, color = my_palette[2])
plot(pr_ss_ew_bliss_150sim, add = TRUE, color = my_palette[3])
plot(pr_ss_ew_bliss_200sim, add = TRUE, color = my_palette[4])
plot(pr_prolif_ew_bliss_150sim, add = TRUE, color = my_palette[5])
legend('topright', title = 'AUC', col = my_palette[1:5], pch = 19,
legend = c(paste(round(pr_ss_ew_bliss_50sim$auc.davis.goadrich, digits = 3), "Calibrated (50 sim)"),
paste(round(pr_ss_ew_bliss_100sim$auc.davis.goadrich, digits = 3), "Calibrated (100 sim)"),
paste(round(pr_ss_ew_bliss_150sim$auc.davis.goadrich, digits = 3), "Calibrated (150 sim)"),
paste(round(pr_ss_ew_bliss_200sim$auc.davis.goadrich, digits = 3), "Calibrated (200 sim)"),
paste(round(pr_prolif_ew_bliss_150sim$auc.davis.goadrich, digits = 3), "Random (150 sim)")))
grid(lwd = 0.5)
plot(pr_ss_mw_bliss_50sim, main = 'PR curve, Model-wise synergies (Bliss)',
auc.main = FALSE, color = my_palette[1], rand.plot = TRUE)
plot(pr_ss_mw_bliss_100sim, add = TRUE, color = my_palette[2])
plot(pr_ss_mw_bliss_150sim, add = TRUE, color = my_palette[3])
plot(pr_ss_mw_bliss_200sim, add = TRUE, color = my_palette[4])
plot(pr_prolif_mw_bliss_150sim, add = TRUE, color = my_palette[5])
legend('topright', title = 'AUC', col = my_palette[1:5], pch = 19,
legend = c(paste(round(pr_ss_mw_bliss_50sim$auc.davis.goadrich, digits = 3), "Calibrated (50 sim)"),
paste(round(pr_ss_mw_bliss_100sim$auc.davis.goadrich, digits = 3), "Calibrated (100 sim)"),
paste(round(pr_ss_mw_bliss_150sim$auc.davis.goadrich, digits = 3), "Calibrated (150 sim)"),
paste(round(pr_ss_mw_bliss_200sim$auc.davis.goadrich, digits = 3), "Calibrated (200 sim)"),
paste(round(pr_prolif_mw_bliss_150sim$auc.davis.goadrich, digits = 3), "Random (150 sim)")))
grid(lwd = 0.5)
- To minimize the resulting performance variance, \(150\) seems to be a good number of Gitsbe simulations to run for the CASCADE 2.0 network.
- The individual predictor model-wise results show good performance when looking at the ROC curves.
- The individual predictor ensemble-wise results show that both random and calibrated models perform poorly.
- The PR curves show that the performance of all individual predictors is poor compared to the baseline.
Bootstrap Random Model AUC
In the previous ROC and PR curves we found a very low ensemble-wise random (proliferative) model performance, indicated by the low numbers of ROC and PR AUC. We want to assess the statistical significance of this result, by bootstrapping many model samples from a pool of random models and evaluating the performance of these ensembles.
For more details on how to generate the bootstrapped model ensembles and tidy up the result data, see section Random Model Bootstrap.
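The sketch below illustrates the idea only (the actual implementation is in that section): assuming a hypothetical model_scores matrix of per-model synergy scores (models as rows, perturbations as columns) and the observed vector from above, each bootstrapped ensemble gets an ensemble-wise score by averaging and is evaluated with the same PRROC calls used throughout this analysis:
# Hypothetical sketch: draw n_boot ensembles of n_models random models each,
# average their scores per perturbation and compute ROC and PR AUC per ensemble
boot_aucs = function(model_scores, observed, n_boot = 1000, n_models = 50) {
  data_list = list()
  for (i in 1:n_boot) {
    rows = sample(x = nrow(model_scores), size = n_models, replace = TRUE)
    ens_score = colMeans(model_scores[rows, , drop = FALSE])
    # lower (more negative) ensemble-wise scores denote synergy, hence the minus
    roc_res = roc.curve(scores.class0 = -ens_score, weights.class0 = observed)
    pr_res = pr.curve(scores.class0 = -ens_score, weights.class0 = observed)
    data_list[[i]] = as_tibble_row(list(roc_auc = roc_res$auc, pr_auc = pr_res$auc.davis.goadrich))
  }
  bind_rows(data_list)
}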
As we can see below, the random model performance is indeed very close to the median of the bootstrapped AUCs:
= readRDS(file = "data/bootstrap_rand_res.rds")
rand_res
ggboxplot(data = rand_res, y = "roc_auc", title = "Bootstrap Random Models (ROC)",
xlab = "", ylab = "ROC AUC", fill = "gray") +
theme(plot.title = element_text(hjust = 0.5)) +
rremove("x.text") +
rremove("x.ticks")
ggboxplot(data = rand_res, y = "pr_auc", title = "Bootstrap Random Models (Precision-Recall)",
xlab = "", ylab = "PR AUC", fill = "gray") +
theme(plot.title = element_text(hjust = 0.5)) +
rremove("x.text") +
rremove("x.ticks")
AUC sensitivity
We investigate the same thing as described here. This is crucial since the PR performance is poor for the individual predictors and the ensemble-wise predictors had really low ROC AUCs, but a combined predictor might be able to counter this. We will combine the synergy scores from the random proliferative simulations with the results from the calibrated Gitsbe simulations (number of simulations: \(150\)).
# Ensemble-wise
betas = seq(from = -5, to = 5, by = 0.1)

prolif_roc = sapply(betas, function(beta) {
  pred_ew_bliss = pred_ew_bliss %>% mutate(combined_score = ss_score_150sim + beta * prolif_score_150sim)
  res = roc.curve(scores.class0 = pred_ew_bliss %>% pull(combined_score) %>% (function(x) {-x}),
    weights.class0 = pred_ew_bliss %>% pull(observed))
  auc_value = res$auc
})

prolif_pr = sapply(betas, function(beta) {
  pred_ew_bliss = pred_ew_bliss %>% mutate(combined_score = ss_score_150sim + beta * prolif_score_150sim)
  res = pr.curve(scores.class0 = pred_ew_bliss %>% pull(combined_score) %>% (function(x) {-x}),
    weights.class0 = pred_ew_bliss %>% pull(observed))
  auc_value = res$auc.davis.goadrich
})

df_ew = as_tibble(cbind(betas, prolif_roc, prolif_pr))
df_ew = df_ew %>% tidyr::pivot_longer(-betas, names_to = "type", values_to = "AUC")
ggline(data = df_ew, x = "betas", y = "AUC", numeric.x.axis = TRUE, color = "type",
plot_type = "l", xlab = TeX("$\\beta$"), ylab = "AUC (Area Under Curve)",
legend = "none", facet.by = "type", palette = my_palette, ylim = c(0,0.85),
panel.labs = list(type = c("Precision-Recall", "ROC")),
title = TeX("AUC sensitivity to $\\beta$ parameter")) +
theme(plot.title = element_text(hjust = 0.5)) +
geom_vline(xintercept = 0) +
geom_vline(xintercept = -1.6, color = "black", size = 0.3, linetype = "dashed") +
geom_vline(xintercept = -1, color = "black", size = 0.3, linetype = "dashed") +
geom_text(aes(x=-2, y=0.4, label="β = -1.6"), angle=90, colour="black") +
geom_text(aes(x=-0.75, y=0.38, label="β = -1"), angle=90, colour="black") +
grids()
# Model-wise
weights = seq(from = 0, to = 1, by = 0.05)

prolif_roc_mw = sapply(weights, function(w) {
  pred_mw_bliss = pred_mw_bliss %>%
    mutate(weighted_prob = (1 - w) * pred_mw_bliss$synergy_prob_ss_150sim + w * pred_mw_bliss$synergy_prob_prolif_150sim)
  res = roc.curve(scores.class0 = pred_mw_bliss %>% pull(weighted_prob),
    weights.class0 = pred_mw_bliss %>% pull(observed))
  auc_value = res$auc
})

prolif_pr_mw = sapply(weights, function(w) {
  pred_mw_bliss = pred_mw_bliss %>%
    mutate(weighted_prob = (1 - w) * pred_mw_bliss$synergy_prob_ss_150sim + w * pred_mw_bliss$synergy_prob_prolif_150sim)
  res = pr.curve(scores.class0 = pred_mw_bliss %>% pull(weighted_prob),
    weights.class0 = pred_mw_bliss %>% pull(observed))
  auc_value = res$auc.davis.goadrich
})

df_mw = as_tibble(cbind(weights, prolif_roc_mw, prolif_pr_mw))
df_mw = df_mw %>% tidyr::pivot_longer(-weights, names_to = "type", values_to = "AUC")
ggline(data = df_mw, x = "weights", y = "AUC", numeric.x.axis = TRUE, color = "type",
plot_type = "l", xlab = TeX("weight $w$"), ylab = "AUC (Area Under Curve)",
legend = "none", facet.by = "type", palette = my_palette,
panel.labs = list(type = c("Precision-Recall", "ROC")), title.position = "center",
title = TeX("AUC sensitivity to weighted average score (Bliss, CASCADE 2.0)")) +
theme(plot.title = element_text(hjust = 0.5)) +
grids()
- No added benefit when using the model-wise approach.
- The random proliferative models can be used to normalize the predictions of the calibrated models and thus bring a significant contribution to the calibrated models' performance (both the ROC AUC and the PR AUC increase).
- The \(\beta_{best}\) that maximizes the ROC and PR AUC for the combination of proliferative and calibrated models is \(\beta_{best}=-1.6\). For \(\beta=-1\) we still see a significant performance improvement.
Best ROC and PRC
For the Bliss ensemble-wise results we demonstrated above that a value of \(\beta_{best}=-1.6\) can result in a significant performance gain for the combined predictor (\(calibrated + \beta \times random\)) using the results from the \(150\) simulation runs (the results for \(\beta=-1\) were still better than the single predictors). Here, we present the ROC and PR curves for the calibrated (normalized to the random model) predictions compared to the random proliferative model results.
Only for the next two Figures, Calibrated stands for the combined predictor results, i.e. \(calibrated + \beta \times random, \beta=-1\).
best_beta1 = -1.6
best_beta2 = -1
pred_ew_bliss = pred_ew_bliss %>%
  mutate(best_score1 = ss_score_150sim + best_beta1 * prolif_score_150sim,
         best_score2 = ss_score_150sim + best_beta2 * prolif_score_150sim)
roc_best_res1 = get_roc_stats(df = pred_ew_bliss, pred_col = "best_score1", label_col = "observed")
roc_best_res2 = get_roc_stats(df = pred_ew_bliss, pred_col = "best_score2", label_col = "observed")
pr_best_res1 = pr.curve(scores.class0 = pred_ew_bliss %>% pull(best_score1) %>% (function(x) {-x}),
  weights.class0 = pred_ew_bliss %>% pull(observed), curve = TRUE, rand.compute = TRUE)
pr_best_res2 = pr.curve(scores.class0 = pred_ew_bliss %>% pull(best_score2) %>% (function(x) {-x}),
  weights.class0 = pred_ew_bliss %>% pull(observed), curve = TRUE, rand.compute = TRUE)
# Plot best ROCs
plot(x = roc_best_res2$roc_stats$FPR, y = roc_best_res2$roc_stats$TPR,
type = 'l', lwd = 3, col = my_palette[1],
main = ('ROC curve, Ensemble-wise synergies (Bliss)'),
xlab = 'False Positive Rate (FPR)', ylab = 'True Positive Rate (TPR)')
lines(x = res_prolif_ew_150sim$roc_stats$FPR, y = res_prolif_ew_150sim$roc_stats$TPR,
lwd = 3, col = my_palette[2])
#lines(x = roc_best_res1$roc_stats$FPR, y = roc_best_res1$roc_stats$TPR,
# lwd = 2, col = my_palette[3])
legend('topleft', title = 'AUC', col = my_palette[1:4], pch = 19, cex = 1.3,
legend = c(paste(round(roc_best_res2$AUC, digits = 2), 'Calibrated'),
paste(round(res_prolif_ew_150sim$AUC, digits = 2), 'Random')))
grid(lwd = 0.5)
abline(a = 0, b = 1, col = 'lightgrey', lty = 'dotdash', lwd = 1.2)
# Plot best PRCs
plot(pr_best_res2, main = 'PR curve, Ensemble-wise synergies (Bliss)',
auc.main = FALSE, color = my_palette[1], rand.plot = TRUE, lwd = 3)
plot(pr_prolif_ew_bliss_150sim, add = TRUE, color = my_palette[2], lwd = 3)
#plot(pr_best_res1, add = TRUE, color = my_palette[3], lwd = 2)
legend('topright', title = 'AUC', col = my_palette[1:2], pch = 19, cex = 1.5,
legend = c(paste(round(pr_best_res2$auc.davis.goadrich, digits = 2), 'Calibrated'),
paste(round(pr_prolif_ew_bliss_150sim$auc.davis.goadrich, digits = 2), 'Random')))
grid(lwd = 0.5)
The ROC ensemble-wise statistics data for the combined predictor (\(\beta=-1\), the Calibrated in the above plot) are as follows:
DT::datatable(data = roc_best_res2$roc_stats, options =
  list(pageLength = 5, lengthMenu = c(5, 20, 40), searching = FALSE)) %>%
  formatRound(c(1,6,7,8,9), digits = 5)
# TP synergies at a specified threshold (0 below)
# pred_ew_bliss %>%
# select(perturbation, observed, best_score2) %>%
# filter(best_score2 < 0, observed == 1)
# investigate the average threshold as a synergy classification index
# thres = roc_best_res2$roc_stats %>% pull(threshold)
# thres = thres[is.finite(thres)] # remove Inf's
# roc_best_res2$roc_stats %>%
# filter(threshold < mean(thres)) %>%
# slice(n()) %>% kable()
If we add the predictions of the non-normalized calibrated data to the above Figures (again using the results from the \(150\) simulations), we have:
# best_beta2 = -1
# Plot best ROCs
plot(x = roc_best_res2$roc_stats$FPR, y = roc_best_res2$roc_stats$TPR,
type = 'l', lwd = 3, col = my_palette[1],
main = ('ROC curve, Ensemble-wise synergies (Bliss)'),
xlab = 'False Positive Rate (FPR)', ylab = 'True Positive Rate (TPR)')
lines(x = res_prolif_ew_150sim$roc_stats$FPR, y = res_prolif_ew_150sim$roc_stats$TPR,
lwd = 3, col = my_palette[2])
lines(x = res_ss_ew_150sim$roc_stats$FPR, y = res_ss_ew_150sim$roc_stats$TPR,
lwd = 3, col = my_palette[3])
legend('topleft', title = 'AUC', col = my_palette[c(3,1,2)], pch = 19, cex = 1,
legend = c(paste(round(res_ss_ew_150sim$AUC, digits = 2), 'Calibrated (non-normalized)'),
paste(round(roc_best_res2$AUC, digits = 2), 'Calibrated'),
paste(round(res_prolif_ew_150sim$AUC, digits = 2), 'Random')))
grid(lwd = 0.5)
abline(a = 0, b = 1, col = 'lightgrey', lty = 'dotdash', lwd = 1.2)
# Plot best PRCs
plot(pr_best_res2, main = 'PR curve, Ensemble-wise synergies (Bliss)',
auc.main = FALSE, color = my_palette[1], rand.plot = TRUE, lwd = 3)
plot(pr_prolif_ew_bliss_150sim, add = TRUE, color = my_palette[2], lwd = 3)
plot(pr_ss_ew_bliss_150sim, add = TRUE, color = my_palette[3], lwd = 2)
legend('topright', title = 'AUC', col = my_palette[c(3,1,2)], pch = 19, cex = 1,
legend = c(paste(round(pr_ss_ew_bliss_150sim$auc.davis.goadrich, digits = 2), 'Calibrated (non-normalized)'),
paste(round(pr_best_res2$auc.davis.goadrich, digits = 2), 'Calibrated'),
paste(round(pr_prolif_ew_bliss_150sim$auc.davis.goadrich, digits = 2), 'Random')))
grid(lwd = 0.5)
Correlation
We test for correlation between some of the results shown in the ROC curves: ensemble-wise vs model-wise, random models vs calibrated models, and HSA vs Bliss synergy assessment (the calibrated and proliferative models are from the \(150\) simulation results). P-values are represented at \(3\) significance levels: \(0.05, 0.01, 0.001\) (*, **, ***) and the correlation coefficient is calculated using Kendall's tau statistic.
synergy_scores = bind_cols(
  pred_ew_hsa %>% select(ss_score_150sim, prolif_score_150sim) %>% rename(cal_ew_hsa = ss_score_150sim, random_ew_hsa = prolif_score_150sim),
  pred_ew_bliss %>% select(ss_score_150sim, prolif_score_150sim) %>% rename(cal_ew_bliss = ss_score_150sim, random_ew_bliss = prolif_score_150sim),
  pred_mw_hsa %>% select(synergy_prob_ss_150sim, synergy_prob_prolif_150sim) %>%
    rename(cal_mw_hsa = synergy_prob_ss_150sim, random_mw_hsa = synergy_prob_prolif_150sim),
  pred_mw_bliss %>% select(synergy_prob_ss_150sim, synergy_prob_prolif_150sim) %>%
    rename(cal_mw_bliss = synergy_prob_ss_150sim, random_mw_bliss = synergy_prob_prolif_150sim)
)

M = cor(synergy_scores, method = "kendall")
res = cor.mtest(synergy_scores, method = "kendall")
corrplot(corr = M, type = "upper", p.mat = res$p, sig.level = c(.001, .01, .05),
  pch.cex = 1, pch.col = "white", insig = "label_sig", tl.col = "black", tl.srt = 45)
- The Bliss ensemble-wise results don't correlate at all with the model-wise results (top-right part of the correlation plot), while the HSA ensemble-wise results do so to some degree.
- Between the ensemble-wise results there is no strong correlation (top-left), while between the model-wise results (bottom-right) there is strong correlation.
Fitness Evolution
Results are from the run with \(200\) Gitsbe simulations, fitting to steady state (calibrated models).
= "results/link-only/cascade_2.0_ss_200sim_fixpoints_hsa_summary.txt"
fitness_summary_file = read_summary_file(file_name = fitness_summary_file)
fit_res
# rows = simulations, columns = generations
# value in (sim,gen) cell = average fitness of models in that particular (sim,gen) combination
avg_fit_link = do.call(dplyr::bind_rows, sapply(fit_res, colMeans))
colnames(avg_fit_link) = 1:ncol(avg_fit_link)
avg_fit_long_link = avg_fit_link %>% pivot_longer(cols = everything()) %>% mutate(name = as.integer(name))
ggline(data = avg_fit_long_link, x = "name", y = "value", color = my_palette[2],
add = "mean_sd", add.params = list(color = "black"),
main = "Fitness Evolution across Generations",
xlab = "Generations", ylab = "Fitness") +
theme(plot.title = element_text(hjust = 0.5)) + grids()
Fitness vs Ensemble Performance
We check for correlation between the calibrated models' fitness to the AGS steady state and their ensemble performance, after normalization to the random model predictions.
The main idea here is that we generate different training data samples, in which the boolean steady state nodes have their values flipped (so they are only partially correct) and we fit models to these (\(20\) simulations => \(60\) models per training data sample, \(205\) training data samples in total). These calibrated model ensembles can then be tested for their prediction performance. We then use the ensemble-wise random proliferative model predictions (\(150\) simulations) to normalize (\(\beta=-1\)) the calibrated model predictions and compute the ROC AUC and PR AUC for each model ensemble.
See how to generate the appropriate data, run the simulations and tidy up the results in the section Fitness vs Performance Methods.
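As a rough illustration of the flipping step only (the actual implementation lives in that methods section; steady_state here is a hypothetical named vector of 0/1 node activity values):
# Hypothetical sketch: flip a given percentage of the 0/1 steady state values
# to produce a partially-incorrect training data sample
flip_steady_state = function(steady_state, per_flipped = 0.1) {
  n_flips = round(per_flipped * length(steady_state))
  flip_idx = sample(x = length(steady_state), size = n_flips)
  steady_state[flip_idx] = 1 - steady_state[flip_idx] # 0 => 1, 1 => 0
  steady_state
}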
Load the already-stored result:
= readRDS(file = "data/res_fit_aucs.rds") res
We check if our data is normally distributed using the Shapiro-Wilk normality test:
shapiro.test(x = res$roc_auc)
Shapiro-Wilk normality test
data: res$roc_auc
W = 0.92436, p-value = 8.883e-09
shapiro.test(x = res$pr_auc)
Shapiro-Wilk normality test
data: res$pr_auc
W = 0.94464, p-value = 4.475e-07
shapiro.test(x = res$avg_fit)
Shapiro-Wilk normality test
data: res$avg_fit
W = 0.89506, p-value = 8.472e-11
We observe from the low p-values that the data is not normally distributed. Thus, we are going to use a non-parametric correlation metric, namely the Kendall rank-based test (and its respective coefficient, \(\tau\)), to check for correlation between the ensemble model performance (ROC AUC, PR AUC) and the fitness to the AGS steady state:
= ggpubr::ggscatter(data = res, x = "avg_fit", y = "roc_auc", color = "per_flipped_data",
p xlab = "Average Fitness per Model Ensemble",
title = "Fitness to AGS Steady State vs Performance (ROC)",
ylab = "ROC AUC", add = "reg.line", conf.int = TRUE,
add.params = list(color = "blue", fill = "lightgray"),
cor.coef = TRUE, cor.coeff.args = list(method = "kendall", label.y.npc = "bottom", size = 6, cor.coef.name = "tau")) +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_distiller(n.breaks = 5, labels = scales::label_percent(accuracy = 1L), palette = 'RdYlBu', guide = guide_colourbar(title = '%Data Flipped'))
::ggpar(p, legend = "right", font.legend = 14) ggpubr
= ggpubr::ggscatter(data = res, x = "avg_fit", y = "pr_auc", color = "per_flipped_data",
p xlab = "Average Fitness per Model Ensemble",
title = "Fitness to AGS Steady State vs Performance (Precision-Recall)",
add.params = list(color = "blue", fill = "lightgray"),
ylab = "PR AUC", add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.coeff.args = list(method = "kendall", size = 6, cor.coef.name = "tau")) +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_distiller(n.breaks = 5, labels = scales::label_percent(accuracy = 1L), palette = 'RdYlBu', guide = guide_colourbar(title = '%Data Flipped'))
::ggpar(p, legend = "right", font.legend = 14) ggpubr
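For reference, the same Kendall correlations annotated in the figures above can also be computed directly from the loaded res tibble:
# Kendall rank correlation (tau and p-value), computed outside the plotting calls
cor.test(x = res$avg_fit, y = res$roc_auc, method = "kendall")
cor.test(x = res$avg_fit, y = res$pr_auc, method = "kendall")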
- We observe that some correlation exists between the normalized ensemble model performance and the models' fitness to the training steady state data.
- The performance as measured by the ROC AUC is less sensitive to changes in the training data, while the correlation is stronger for the PR AUC, which is a more informative measure for our imbalanced dataset (Saito and Rehmsmeier 2015).
Scrambled Topologies Investigation
We create several scrambled topologies from the CASCADE 2.0 one, in order to assess the tolerance of the curated network topology to random edge changes with regards to model ensemble performance using link-operator mutations. For more details see the same investigation done for CASCADE 1.0.
For each of the \(4\) different types of scrambling, we make \(10\) random topologies for each expected similarity score between the randomized and the curated topology, ranging from \(0\) to \(0.99\) similarity with a total of \(23\) steps, thus \(10\times23=230\) random topologies per different type of scrambling. See more details on how to generate these topologies in the script gen_scrambled_topologies_cascade2.R.
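For intuition only, a minimal sketch of how such a similarity score could be computed (the actual logic is in the script above; scrambled and curated are hypothetical data frames with source, effect and target columns, one edge per row):
# Hypothetical sketch: similarity = fraction of curated edges kept intact
topo_similarity = function(scrambled, curated) {
  edges_scrambled = paste(scrambled$source, scrambled$effect, scrambled$target)
  edges_curated = paste(curated$source, curated$effect, curated$target)
  sum(edges_scrambled %in% edges_curated) / length(edges_curated)
}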
To get the drug combination predictions for each scrambled topology, we executed the druglogics-synergy module with the default configuration (\(50\) simulations per topology, for both calibrated to steady state and random proliferative models, using the Bliss synergy assessment method in Drabme).
Note that because of the CASCADE 2.0 network size and the amount of simulations for this analysis, we had to switch to the faster BNReduction attractor tool for the calculation of fixpoints (Veliz-Cuba et al. 2014).
See the discussion here for limitations of the bnet_reduction_reduced attractor tool option in our configuration file. To execute the simulations with the scrambled topologies, use the run_druglogics_synergy_scrambled_topo_cascade2.sh script.
We calculate the normalized predictor performance (\(calibrated - random\)) for each topology-specific simulation and tidy up the result data in get_syn_res_scrambled_topo_cascade2.R.
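Schematically, this normalization is the \(\beta=-1\) combination from the AUC sensitivity analysis above; a hedged sketch for one topology (calibrated and random are hypothetical per-perturbation ensemble-wise score vectors):
# Hypothetical sketch: normalized score and its ROC/PR AUC for one topology
normalized_score = calibrated - random
roc_auc = roc.curve(scores.class0 = -normalized_score, weights.class0 = observed)$auc
pr_auc = pr.curve(scores.class0 = -normalized_score, weights.class0 = observed)$auc.davis.goadrich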
Next, we load the scrambled topologies simulation results and also add the ROC and PR AUC results of the link-operator bootstrapped model analysis (see section below). The results are split into multiple groups, based on their similarity score (percentage of common edges with the curated CASCADE 2.0 topology).
Note that the topology scrambling type is set to none for the bootstrap results that used the original/curated CASCADE 2.0 topology.
# results from the scrambled topology simulations
scrambled_topo_res = readRDS(file = 'data/scrambled_topo_res_cascade2.rds')

# results from bootstrap parameterization analysis
boot_res = readRDS(file = "data/res_param_boot_aucs.rds")

# the un-scrambled topology results have a similarity score equal to 1,
# and 'none' scrambling whatsoever as `scramble_type`
lo_boot_res = boot_res %>%
  tibble::add_column(sim = 1, scramble_type = 'none', .before = 1) %>%
  filter(param == 'link-only') %>% # keep only the link-operator results
  select(-one_of("param")) # remove unwanted column
scrambled_topo_res = dplyr::bind_rows(scrambled_topo_res, lo_boot_res)
# group results by similarity score
scrambled_topo_res = scrambled_topo_res %>% mutate(grp = factor(x =
  case_when(sim >= 0 & sim < 0.25 ~ '0 - 0.25',
    sim >= 0.25 & sim < 0.5 ~ '0.25 - 0.5',
    sim >= 0.5 & sim < 0.75 ~ '0.5 - 0.75',
    sim >= 0.75 & sim < 0.85 ~ '0.75 - 0.85',
    sim >= 0.85 & sim < 0.95 ~ '0.85 - 0.95',
    sim >= 0.95 & sim < 1 ~ '0.95 - 1',
    sim == 1 ~ 'Curated'),
  levels = c('0 - 0.25', '0.25 - 0.5', '0.5 - 0.75',
    '0.75 - 0.85', '0.85 - 0.95', '0.95 - 1', 'Curated')))
Interestingly, there were some scrambled topologies which didn't produce even \(1\) boolean model with a stable state when using the genetic algorithm of Gitsbe (so no predictions could be made for these topologies):
ordered_types = c('none', 'source', 'target', 'sign', 'all')

scrambled_topo_res %>%
  mutate(scramble_type =
    replace(x = scramble_type, list = scramble_type == 'effect', values = 'sign')) %>%
  group_by(scramble_type) %>%
  summarise(percent = sum(is.na(roc_auc))/n(), .groups = 'drop') %>%
  mutate(scramble_type = factor(scramble_type, levels = ordered_types)) %>%
  ggplot(aes(x = scramble_type, y = percent, fill = scramble_type)) +
  geom_col() +
  geom_text(aes(label = scales::percent(percent, accuracy = 1)), vjust = -0.5, size = 8) +
  scale_y_continuous(labels = scales::percent, limits = c(0,0.3)) +
  scale_fill_brewer(palette = "Set1") +
  guides(fill = guide_legend(title = latex2exp::TeX("Scramble Type"))) +
  labs(x = "", title = "Topologies with zero-stable-state boolean models", y = "") +
  theme_classic(base_size = 14) +
  theme(axis.text.x = element_text(size = 18))
So, tweaking either the source or the target nodes of each edge in the curated topology resulted in \(13\%\) of the produced topologies having a network configuration that does not allow any stable attractor in the link-operator parameterization space explored by the Gitsbe algorithm.
As with the CASCADE 1.0 results, when tweaking only the effect (activation vs inhibition) we always get topologies that can be translated to boolean models with a stable state attractor. Tweaking source node, target node and interaction sign all together results in the highest number of topologies with boolean models that lack stable behavior.
Source Scrambling
# ROC results
scrambled_topo_res %>%
  filter(scramble_type == 'source' | scramble_type == 'none', !is.na(roc_auc)) %>%
ggplot(aes(x = grp, y = roc_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,1)) +
labs(x = 'Similarity Score to CASCADE 2.0 Topology', y = 'ROC AUC',
title = "Source Scrambling vs Performance (ROC)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = 0.5, linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.7, y = 0.45, label = "Random (AUC = 0.5)")) +
theme(plot.title = element_text(hjust = 0.5))
# PR results
scrambled_topo_res %>%
  filter(scramble_type == 'source' | scramble_type == 'none', !is.na(pr_auc)) %>%
ggplot(aes(x = grp, y = pr_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,0.6)) +
labs(x = 'Similarity Score to CASCADE 2.0 Topology', y = 'PR AUC',
title = "Source Scrambling vs Performance (Precision-Recall)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = sum(observed)/length(observed), linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.7, y = 0.01, label = "Random (AUC = 0.04)")) +
theme(plot.title = element_text(hjust = 0.5))
Target Scrambling
# ROC results
scrambled_topo_res %>%
  filter(scramble_type == 'target' | scramble_type == 'none', !is.na(roc_auc)) %>%
ggplot(aes(x = grp, y = roc_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,1)) +
labs(x = 'Similarity Score to CASCADE 2.0 Topology', y = 'ROC AUC',
title = "Target Scrambling vs Performance (ROC)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = 0.5, linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.7, y = 0.45, label = "Random (AUC = 0.5)")) +
theme(plot.title = element_text(hjust = 0.5))
# PR results
scrambled_topo_res %>%
  filter(scramble_type == 'target' | scramble_type == 'none', !is.na(pr_auc)) %>%
ggplot(aes(x = grp, y = pr_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,0.6)) +
labs(x = 'Similarity Score to CASCADE 2.0 Topology', y = 'PR AUC',
title = "Target Scrambling vs Performance (Precision-Recall)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = sum(observed)/length(observed), linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.7, y = 0.01, label = "Random (AUC = 0.04)")) +
theme(plot.title = element_text(hjust = 0.5))
Sign Inversion
# ROC results
scrambled_topo_res %>%
  filter(scramble_type == 'effect' | scramble_type == 'none', !is.na(roc_auc)) %>%
ggplot(aes(x = grp, y = roc_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,1)) +
labs(x = 'Similarity Score to CASCADE 2.0 Topology', y = 'ROC AUC',
title = "Sign Inversion vs Performance (ROC)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = 0.5, linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.7, y = 0.45, label = "Random (AUC = 0.5)")) +
theme(plot.title = element_text(hjust = 0.5))
# PR results
scrambled_topo_res %>%
  filter(scramble_type == 'effect' | scramble_type == 'none', !is.na(pr_auc)) %>%
ggplot(aes(x = grp, y = pr_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,0.6)) +
labs(x = 'Similarity Score to CASCADE 2.0 Topology', y = 'PR AUC',
title = "Sign Inversion vs Performance (Precision-Recall)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = sum(observed)/length(observed), linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.5, y = 0.01, label = "Random (AUC = 0.04)")) +
theme(plot.title = element_text(hjust = 0.5))
Source, Target Scrambling and Sign Inversion
# ROC results
scrambled_topo_res %>%
  filter(scramble_type == 'all' | scramble_type == 'none', !is.na(roc_auc)) %>%
ggplot(aes(x = grp, y = roc_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,1)) +
labs(x = 'Similarity Score to CASCADE 2.0 Topology', y = 'ROC AUC',
title = "All types of Scrambling vs Performance (ROC)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = 0.5, linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.5, y = 0.45, label = "Random (AUC = 0.5)")) +
theme(plot.title = element_text(hjust = 0.5))
# PR results
scrambled_topo_res %>%
  filter(scramble_type == 'all' | scramble_type == 'none', !is.na(pr_auc)) %>%
ggplot(aes(x = grp, y = pr_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,0.6)) +
labs(x = 'Similarity Score to CASCADE 2.0 Topology', y = 'PR AUC',
title = "All types of Scrambling vs Performance (Precision-Recall)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = sum(observed)/length(observed), linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.5, y = 0.01, label = "Random (AUC = 0.04)")) +
theme(plot.title = element_text(hjust = 0.5))
We observe that even a small perturbation (scrambling) of the curated topology, of any type, produces results close to random prediction and significantly lower than the prediction results obtained with the curated CASCADE 2.0 topology.
Note that even with completely scrambled topologies we sometimes get high ROC and PR AUC values (this is true for all types of scrambling except the sign inversion), but these topologies (and the boolean ensemble model performance resulting from them) represent statistical outliers (they are the exception rather than the rule).