CASCADE 1.0 Analysis
Performance of automatically parameterized models against published data in (Flobak et al. 2015)
HSA results
- HSA refers to the synergy method used in Drabme to assess the synergies from the Gitsbe models
- We test performance using ROC and PR AUC for both the ensemble-wise and model-wise synergies from Drabme
- Calibrated models: fitted to steady state (\(50\) simulations)
- Random models: fitted to proliferation profile (\(50\) simulations)
- Gitsbe models have mutations on link operator only
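The code chunks in this section assume that a few libraries are attached; a minimal setup sketch (inferred from the functions called below, not shown in the original chunks):

# Packages assumed attached throughout (some are also called via ::)
library(dplyr)        # mutate, bind_cols, rename, select, pull
library(tidyr)        # pivot_longer
library(tibble)       # as_tibble, as_tibble_col
library(emba)         # get_synergy_scores, get_observed_synergies
library(usefun)       # get_roc_stats
library(PRROC)        # roc.curve, pr.curve
library(DT)           # datatable, formatRound
library(corrplot)     # corrplot, cor.mtest
library(ggpubr)       # ggline, ggscatter, ggpar, grids
library(latex2exp)    # TeX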
Load results:
# 'ss' => calibrated models, 'prolif' => proliferative random models
# 'ew' => ensemble-wise, 'mw' => model-wise

## HSA results
ss_hsa_ew_file = paste0("results/link-only/cascade_1.0_ss_50sim_fixpoints_hsa_ensemblewise_synergies.tab")
ss_hsa_mw_file = paste0("results/link-only/cascade_1.0_ss_50sim_fixpoints_hsa_modelwise_synergies.tab")
prolif_hsa_ew_file = paste0("results/link-only/cascade_1.0_rand_50sim_fixpoints_hsa_ensemblewise_synergies.tab")
prolif_hsa_mw_file = paste0("results/link-only/cascade_1.0_rand_50sim_fixpoints_hsa_modelwise_synergies.tab")

ss_hsa_ensemblewise_synergies = emba::get_synergy_scores(ss_hsa_ew_file)
ss_hsa_modelwise_synergies = emba::get_synergy_scores(ss_hsa_mw_file, file_type = "modelwise")
prolif_hsa_ensemblewise_synergies = emba::get_synergy_scores(prolif_hsa_ew_file)
prolif_hsa_modelwise_synergies = emba::get_synergy_scores(prolif_hsa_mw_file, file_type = "modelwise")
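For orientation, the structure of these tibbles is inferred from how their columns are used below; a quick peek:

# Inferred structure (from the column names used in the chunks below):
# ensemble-wise: tibble(perturbation, score)
# model-wise:    tibble(perturbation, synergies, `non-synergies`)
ss_hsa_ensemblewise_synergies %>% slice(1:3) # peek at the first rows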
# calculate probability of synergy in the modelwise results
ss_hsa_modelwise_synergies = ss_hsa_modelwise_synergies %>%
  mutate(synergy_prob_ss = synergies/(synergies + `non-synergies`))
prolif_hsa_modelwise_synergies = prolif_hsa_modelwise_synergies %>%
  mutate(synergy_prob_prolif = synergies/(synergies + `non-synergies`))

observed_synergies_file = 'data/observed_synergies_cascade_1.0'
observed_synergies = emba::get_observed_synergies(observed_synergies_file)
# 1 (positive/observed synergy) or 0 (negative/not observed) for all tested drug combinations
observed = sapply(prolif_hsa_modelwise_synergies$perturbation %in% observed_synergies, as.integer)
# 'ew' => ensemble-wise, 'mw' => model-wise
pred_ew_hsa = bind_cols(ss_hsa_ensemblewise_synergies %>% rename(ss_score = score),
  prolif_hsa_ensemblewise_synergies %>% select(score) %>% rename(prolif_score = score),
  as_tibble_col(observed, column_name = "observed"))

pred_mw_hsa = bind_cols(
  ss_hsa_modelwise_synergies %>% select(perturbation, synergy_prob_ss),
  prolif_hsa_modelwise_synergies %>% select(synergy_prob_prolif),
  as_tibble_col(observed, column_name = "observed"))
ROC curves
res_ss_ew = get_roc_stats(df = pred_ew_hsa, pred_col = "ss_score", label_col = "observed")
res_prolif_ew = get_roc_stats(df = pred_ew_hsa, pred_col = "prolif_score", label_col = "observed")

res_ss_mw = get_roc_stats(df = pred_mw_hsa, pred_col = "synergy_prob_ss", label_col = "observed", direction = ">")
res_prolif_mw = get_roc_stats(df = pred_mw_hsa, pred_col = "synergy_prob_prolif", label_col = "observed", direction = ">")
# Plot ROCs
my_palette = RColorBrewer::brewer.pal(n = 9, name = "Set1")
plot(x = res_ss_ew$roc_stats$FPR, y = res_ss_ew$roc_stats$TPR,
type = 'l', lwd = 3, col = my_palette[1], main = 'ROC curve, Ensemble-wise synergies (HSA)',
xlab = 'False Positive Rate (FPR)', ylab = 'True Positive Rate (TPR)')
lines(x = res_prolif_ew$roc_stats$FPR, y = res_prolif_ew$roc_stats$TPR,
lwd = 3, col = my_palette[2])
legend('bottomright', title = 'AUC', col = my_palette[1:2], pch = 19,
legend = c(paste(round(res_ss_ew$AUC, digits = 2), "Calibrated"),
paste(round(res_prolif_ew$AUC, digits = 2), "Random")), cex = 1.3)
grid(lwd = 0.5)
abline(a = 0, b = 1, col = 'lightgrey', lty = 'dotdash', lwd = 1.2)
plot(x = res_ss_mw$roc_stats$FPR, y = res_ss_mw$roc_stats$TPR,
type = 'l', lwd = 3, col = my_palette[1], main = 'ROC curve, Model-wise synergies (HSA)',
xlab = 'False Positive Rate (FPR)', ylab = 'True Positive Rate (TPR)')
lines(x = res_prolif_mw$roc_stats$FPR, y = res_prolif_mw$roc_stats$TPR,
lwd = 3, col = my_palette[2])
legend('bottomright', title = 'AUC', col = my_palette[1:2], pch = 19,
legend = c(paste(round(res_ss_mw$AUC, digits = 2), "Calibrated"),
paste(round(res_prolif_mw$AUC, digits = 2), "Random")), cex = 1.3)
grid(lwd = 0.5)
abline(a = 0, b = 1, col = 'lightgrey', lty = 'dotdash', lwd = 1.2)
PR curves
pr_ss_ew_hsa = pr.curve(scores.class0 = pred_ew_hsa %>% pull(ss_score) %>% (function(x) {-x}),
  weights.class0 = pred_ew_hsa %>% pull(observed), curve = TRUE, rand.compute = TRUE)
pr_prolif_ew_hsa = pr.curve(scores.class0 = pred_ew_hsa %>% pull(prolif_score) %>% (function(x) {-x}),
  weights.class0 = pred_ew_hsa %>% pull(observed), curve = TRUE)
pr_ss_mw_hsa = pr.curve(scores.class0 = pred_mw_hsa %>% pull(synergy_prob_ss),
  weights.class0 = pred_mw_hsa %>% pull(observed), curve = TRUE, rand.compute = TRUE)
pr_prolif_mw_hsa = pr.curve(scores.class0 = pred_mw_hsa %>% pull(synergy_prob_prolif),
  weights.class0 = pred_mw_hsa %>% pull(observed), curve = TRUE)
plot(pr_ss_ew_hsa, main = 'PR curve, Ensemble-wise synergies (HSA)',
auc.main = FALSE, color = my_palette[1], rand.plot = TRUE)
plot(pr_prolif_ew_hsa, add = TRUE, color = my_palette[2])
legend('topright', title = 'AUC', col = my_palette[1:2], pch = 19,
legend = c(paste(round(pr_ss_ew_hsa$auc.davis.goadrich, digits = 2), "Calibrated"),
paste(round(pr_prolif_ew_hsa$auc.davis.goadrich, digits = 2), "Random")))
grid(lwd = 0.5)
plot(pr_ss_mw_hsa, main = 'PR curve, Model-wise synergies (HSA)',
auc.main = FALSE, color = my_palette[1], rand.plot = TRUE)
plot(pr_prolif_mw_hsa, add = TRUE, color = my_palette[2])
legend('left', title = 'AUC', col = my_palette[1:2], pch = 19,
legend = c(paste(round(pr_ss_mw_hsa$auc.davis.goadrich, digits = 2), "Calibrated"),
paste(round(pr_prolif_mw_hsa$auc.davis.goadrich, digits = 2), "Random")))
grid(lwd = 0.5)
Calibrated models perform much better than the random ones.
AUC sensitivity
- Investigate combining the synergy results of calibrated and proliferative (random) models (a toy example follows the list)
- How much information from the proliferative (random) models can be used to augment the calibrated results?
- Ensemble-wise scenario: \(score = calibrated + \beta \times random\)
  - \(\beta \rightarrow +\infty\): mostly proliferative (random) model predictions
  - \(\beta \rightarrow -\infty\): mostly reverse proliferative (random) model predictions
  - \(\beta \simeq -1\): calibrated model predictions are normalized against the proliferative (random) model predictions
- Model-wise scenario: \(score = (1-w) \times prob_{cal} + w \times prob_{rand}, w \in [0,1]\)
  - \(w=0\): only calibrated model predictions
  - \(w=1\): only proliferative (random) model predictions
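As a toy illustration of the ensemble-wise normalization (made-up scores, not from the analysis): with \(\beta=-1\) the combined score is simply the difference of the two ensemble scores, and lower (more negative) values indicate stronger synergy, matching the score negation used in the PR curves above.

# Toy example (hypothetical scores): ensemble-wise normalization at beta = -1
toy = tibble::tibble(perturbation = c("A-B", "A-C"),
  ss_score = c(-0.20, -0.05), prolif_score = c(-0.12, -0.06))
beta = -1
toy %>% dplyr::mutate(combined_score = ss_score + beta * prolif_score)
# combined_score = ss_score - prolif_score => -0.08 and 0.01 here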
# Ensemble-wise
betas = seq(from = -12, to = 12, by = 0.1)

prolif_roc = sapply(betas, function(beta) {
  pred_ew_hsa = pred_ew_hsa %>% mutate(combined_score = ss_score + beta * prolif_score)
  res = roc.curve(scores.class0 = pred_ew_hsa %>% pull(combined_score) %>% (function(x) {-x}),
    weights.class0 = pred_ew_hsa %>% pull(observed))
  auc_value = res$auc
})
prolif_pr = sapply(betas, function(beta) {
  pred_ew_hsa = pred_ew_hsa %>% mutate(combined_score = ss_score + beta * prolif_score)
  res = pr.curve(scores.class0 = pred_ew_hsa %>% pull(combined_score) %>% (function(x) {-x}),
    weights.class0 = pred_ew_hsa %>% pull(observed))
  auc_value = res$auc.davis.goadrich
})

df_ew = as_tibble(cbind(betas, prolif_roc, prolif_pr))
df_ew = df_ew %>% tidyr::pivot_longer(-betas, names_to = "type", values_to = "AUC")
ggline(data = df_ew, x = "betas", y = "AUC", numeric.x.axis = TRUE, color = "type",
plot_type = "l", xlab = TeX("$\\beta$"), ylab = "AUC (Area Under Curve)",
legend = "none", facet.by = "type", palette = my_palette,
panel.labs = list(type = c("Precision-Recall", "ROC")),
title = TeX("AUC sensitivity to $\\beta$ parameter (HSA, CASCADE 1.0)")) +
theme(plot.title = element_text(hjust = 0.5)) +
geom_vline(xintercept = 0) +
geom_vline(xintercept = -1, color = "black", size = 0.3, linetype = "dashed") +
geom_text(aes(x=-2, label="β = -1", y=0.25), colour="black", angle=90) +
grids()
# Model-wise
weights = seq(from = 0, to = 1, by = 0.05)

prolif_roc_mw = sapply(weights, function(w) {
  pred_mw_hsa = pred_mw_hsa %>%
    mutate(weighted_prob = (1 - w) * pred_mw_hsa$synergy_prob_ss + w * pred_mw_hsa$synergy_prob_prolif)
  res = roc.curve(scores.class0 = pred_mw_hsa %>% pull(weighted_prob),
    weights.class0 = pred_mw_hsa %>% pull(observed))
  auc_value = res$auc
})
prolif_pr_mw = sapply(weights, function(w) {
  pred_mw_hsa = pred_mw_hsa %>%
    mutate(weighted_prob = (1 - w) * pred_mw_hsa$synergy_prob_ss + w * pred_mw_hsa$synergy_prob_prolif)
  res = pr.curve(scores.class0 = pred_mw_hsa %>% pull(weighted_prob),
    weights.class0 = pred_mw_hsa %>% pull(observed))
  auc_value = res$auc.davis.goadrich
})

df_mw = as_tibble(cbind(weights, prolif_roc_mw, prolif_pr_mw))
df_mw = df_mw %>% tidyr::pivot_longer(-weights, names_to = "type", values_to = "AUC")
ggline(data = df_mw, x = "weights", y = "AUC", numeric.x.axis = TRUE, color = "type",
plot_type = "l", xlab = TeX("weight $w$"), ylab = "AUC (Area Under Curve)",
legend = "none", facet.by = "type", palette = my_palette,
panel.labs = list(type = c("Precision-Recall", "ROC")), title.position = "center",
title = TeX("AUC sensitivity to weighted average score (HSA, CASCADE 1.0)")) +
theme(plot.title = element_text(hjust = 0.5)) +
grids()
- There are \(\beta\) values that can boost the predictive performance of the calibrated models (ensemble-wise), but no \(w\) weight does so in the model-wise case.
- \(\beta=-1\) seems to be a common value that maximizes both the ROC-AUC and the PR-AUC.
- The PR-AUC is more sensitive than the ROC-AUC, making it a better indicator of performance.
Bliss results
- Bliss refers to the synergy method used in Drabme to assess the synergies from the Gitsbe models
- We test performance using ROC and PR AUC for both the ensemble-wise and model-wise synergies from Drabme
- Calibrated models: fitted to steady state (\(50\) simulations)
- Random models: fitted to proliferation profile (\(50\) simulations)
- Gitsbe models have mutations on link operator only
Load results:
# 'ss' => calibrated models, 'prolif' => random models

## Bliss results
ss_bliss_ensemblewise_file = paste0("results/link-only/cascade_1.0_ss_50sim_fixpoints_bliss_ensemblewise_synergies.tab")
ss_bliss_modelwise_file = paste0("results/link-only/cascade_1.0_ss_50sim_fixpoints_bliss_modelwise_synergies.tab")
prolif_bliss_ensemblewise_file = paste0("results/link-only/cascade_1.0_rand_50sim_fixpoints_bliss_ensemblewise_synergies.tab")
prolif_bliss_modelwise_file = paste0("results/link-only/cascade_1.0_rand_50sim_fixpoints_bliss_modelwise_synergies.tab")

ss_bliss_ensemblewise_synergies = emba::get_synergy_scores(ss_bliss_ensemblewise_file)
ss_bliss_modelwise_synergies = emba::get_synergy_scores(ss_bliss_modelwise_file, file_type = "modelwise")
prolif_bliss_ensemblewise_synergies = emba::get_synergy_scores(prolif_bliss_ensemblewise_file)
prolif_bliss_modelwise_synergies = emba::get_synergy_scores(prolif_bliss_modelwise_file, file_type = "modelwise")
# calculate probability of synergy in the modelwise results
ss_bliss_modelwise_synergies = ss_bliss_modelwise_synergies %>%
  mutate(synergy_prob_ss = synergies/(synergies + `non-synergies`))
prolif_bliss_modelwise_synergies = prolif_bliss_modelwise_synergies %>%
  mutate(synergy_prob_prolif = synergies/(synergies + `non-synergies`))

# 'ew' => ensemble-wise, 'mw' => model-wise
pred_ew_bliss = bind_cols(ss_bliss_ensemblewise_synergies %>% rename(ss_score = score),
  prolif_bliss_ensemblewise_synergies %>% select(score) %>% rename(prolif_score = score),
  as_tibble_col(observed, column_name = "observed"))

pred_mw_bliss = bind_cols(
  ss_bliss_modelwise_synergies %>% select(perturbation, synergy_prob_ss),
  prolif_bliss_modelwise_synergies %>% select(synergy_prob_prolif),
  as_tibble_col(observed, column_name = "observed"))
ROC curves
res_ss_ew = get_roc_stats(df = pred_ew_bliss, pred_col = "ss_score", label_col = "observed")
res_prolif_ew = get_roc_stats(df = pred_ew_bliss, pred_col = "prolif_score", label_col = "observed")

res_ss_mw = get_roc_stats(df = pred_mw_bliss, pred_col = "synergy_prob_ss", label_col = "observed", direction = ">")
res_prolif_mw = get_roc_stats(df = pred_mw_bliss, pred_col = "synergy_prob_prolif", label_col = "observed", direction = ">")
# Plot ROCs
plot(x = res_ss_ew$roc_stats$FPR, y = res_ss_ew$roc_stats$TPR,
type = 'l', lwd = 3, col = my_palette[1], main = 'ROC curve, Ensemble-wise synergies (Bliss)',
xlab = 'False Positive Rate (FPR)', ylab = 'True Positive Rate (TPR)')
lines(x = res_prolif_ew$roc_stats$FPR, y = res_prolif_ew$roc_stats$TPR,
lwd = 3, col = my_palette[2])
legend('bottomright', title = 'AUC', col = my_palette[1:2], pch = 19,
legend = c(paste(round(res_ss_ew$AUC, digits = 2), "Calibrated"),
paste(round(res_prolif_ew$AUC, digits = 2), "Random")), cex = 1.3)
grid(lwd = 0.5)
abline(a = 0, b = 1, col = 'lightgrey', lty = 'dotdash', lwd = 1.2)
plot(x = res_ss_mw$roc_stats$FPR, y = res_ss_mw$roc_stats$TPR,
type = 'l', lwd = 3, col = my_palette[1], main = 'ROC curve, Model-wise synergies (Bliss)',
xlab = 'False Positive Rate (FPR)', ylab = 'True Positive Rate (TPR)')
lines(x = res_prolif_mw$roc_stats$FPR, y = res_prolif_mw$roc_stats$TPR,
lwd = 3, col = my_palette[2])
legend('bottomright', title = 'AUC', col = my_palette[1:2], pch = 19,
legend = c(paste(round(res_ss_mw$AUC, digits = 2), "Calibrated"),
paste(round(res_prolif_mw$AUC, digits = 2), "Random")), cex = 1.3)
grid(lwd = 0.5)
abline(a = 0, b = 1, col = 'lightgrey', lty = 'dotdash', lwd = 1.2)
The ROC statistics data for the calibrated models are as follows:
DT::datatable(data = res_ss_ew$roc_stats, options =
  list(pageLength = 5, lengthMenu = c(5, 10, 16), searching = FALSE)) %>%
  formatRound(c(1,6,7,8,9), digits = 5)
# investigate the average threshold as a synergy classification index
# thres = res_ss_ew$roc_stats %>% pull(threshold)
# thres = thres[is.finite(thres)] # remove Inf's
# res_ss_ew$roc_stats %>%
# filter(threshold < mean(thres)) %>%
# slice(n()) %>% kable()
PR curves
pr_ss_ew_bliss = pr.curve(scores.class0 = pred_ew_bliss %>% pull(ss_score) %>% (function(x) {-x}),
  weights.class0 = pred_ew_bliss %>% pull(observed), curve = TRUE, rand.compute = TRUE)
pr_prolif_ew_bliss = pr.curve(scores.class0 = pred_ew_bliss %>% pull(prolif_score) %>% (function(x) {-x}),
  weights.class0 = pred_ew_bliss %>% pull(observed), curve = TRUE)
pr_ss_mw_bliss = pr.curve(scores.class0 = pred_mw_bliss %>% pull(synergy_prob_ss),
  weights.class0 = pred_mw_bliss %>% pull(observed), curve = TRUE, rand.compute = TRUE)
pr_prolif_mw_bliss = pr.curve(scores.class0 = pred_mw_bliss %>% pull(synergy_prob_prolif),
  weights.class0 = pred_mw_bliss %>% pull(observed), curve = TRUE)
plot(pr_ss_ew_bliss, main = 'PR curve, Ensemble-wise synergies (Bliss)',
auc.main = FALSE, color = my_palette[1], rand.plot = TRUE)
plot(pr_prolif_ew_bliss, add = TRUE, color = my_palette[2])
legend(x = 0, y = 0.9, title = 'AUC', col = my_palette[1:2], pch = 19, cex = 1.3,
legend = c(paste(round(pr_ss_ew_bliss$auc.davis.goadrich, digits = 2), "Calibrated"),
paste(round(pr_prolif_ew_bliss$auc.davis.goadrich, digits = 2), "Random")))
grid(lwd = 0.5)
plot(pr_ss_mw_bliss, main = 'PR curve, Model-wise synergies (Bliss)',
auc.main = FALSE, color = my_palette[1], rand.plot = TRUE)
plot(pr_prolif_mw_bliss, add = TRUE, color = my_palette[2])
legend(x = 0, y = 0.9, title = 'AUC', col = my_palette[1:2], pch = 19, cex = 1.3,
legend = c(paste(round(pr_ss_mw_bliss$auc.davis.goadrich, digits = 2), "Calibrated"),
paste(round(pr_prolif_mw_bliss$auc.davis.goadrich, digits = 2), "Random")))
grid(lwd = 0.5)
Calibrated models perform much better than the random ones.
AUC sensitivity
We investigate the same thing as described here.
# Ensemble-wise
betas = seq(from = -12, to = 12, by = 0.1)

prolif_roc = sapply(betas, function(beta) {
  pred_ew_bliss = pred_ew_bliss %>% mutate(combined_score = ss_score + beta * prolif_score)
  res = roc.curve(scores.class0 = pred_ew_bliss %>% pull(combined_score) %>% (function(x) {-x}),
    weights.class0 = pred_ew_bliss %>% pull(observed))
  auc_value = res$auc
})
prolif_pr = sapply(betas, function(beta) {
  pred_ew_bliss = pred_ew_bliss %>% mutate(combined_score = ss_score + beta * prolif_score)
  res = pr.curve(scores.class0 = pred_ew_bliss %>% pull(combined_score) %>% (function(x) {-x}),
    weights.class0 = pred_ew_bliss %>% pull(observed))
  auc_value = res$auc.davis.goadrich
})

df_ew = as_tibble(cbind(betas, prolif_roc, prolif_pr))
df_ew = df_ew %>% tidyr::pivot_longer(-betas, names_to = "type", values_to = "AUC")
ggline(data = df_ew, x = "betas", y = "AUC", numeric.x.axis = TRUE, color = "type",
plot_type = "l", xlab = TeX("$\\beta$"), ylab = "AUC (Area Under Curve)",
legend = "none", facet.by = "type", palette = my_palette,
panel.labs = list(type = c("Precision-Recall", "ROC")),
title = TeX("AUC sensitivity to $\\beta$ parameter (Bliss, CASCADE 1.0)")) +
theme(plot.title = element_text(hjust = 0.5)) +
geom_vline(xintercept = 0) +
geom_vline(xintercept = -1, color = "black", size = 0.3, linetype = "dashed") +
geom_text(aes(x=-2, label="β = -1", y=0.25), colour="black", angle=90) +
grids()
# Model-wise
weights = seq(from = 0, to = 1, by = 0.05)

prolif_roc_mw = sapply(weights, function(w) {
  pred_mw_bliss = pred_mw_bliss %>%
    mutate(weighted_prob = (1 - w) * pred_mw_bliss$synergy_prob_ss + w * pred_mw_bliss$synergy_prob_prolif)
  res = roc.curve(scores.class0 = pred_mw_bliss %>% pull(weighted_prob),
    weights.class0 = pred_mw_bliss %>% pull(observed))
  auc_value = res$auc
})
prolif_pr_mw = sapply(weights, function(w) {
  pred_mw_bliss = pred_mw_bliss %>%
    mutate(weighted_prob = (1 - w) * pred_mw_bliss$synergy_prob_ss + w * pred_mw_bliss$synergy_prob_prolif)
  res = pr.curve(scores.class0 = pred_mw_bliss %>% pull(weighted_prob),
    weights.class0 = pred_mw_bliss %>% pull(observed))
  auc_value = res$auc.davis.goadrich
})

df_mw = as_tibble(cbind(weights, prolif_roc_mw, prolif_pr_mw))
df_mw = df_mw %>% tidyr::pivot_longer(-weights, names_to = "type", values_to = "AUC")
ggline(data = df_mw, x = "weights", y = "AUC", numeric.x.axis = TRUE, color = "type",
plot_type = "l", xlab = TeX("weight $w$"), ylab = "AUC (Area Under Curve)",
legend = "none", facet.by = "type", palette = my_palette,
panel.labs = list(type = c("Precision-Recall", "ROC")), title.position = "center",
title = TeX("AUC sensitivity to weighted average score (Bliss, CASCADE 1.0)")) +
theme(plot.title = element_text(hjust = 0.5)) +
grids()
- There are \(\beta\) values that can boost the predictive performance of the calibrated models (ensemble-wise), but no \(w\) weight does so in the model-wise case.
- The PR-AUC is more sensitive than the ROC-AUC, making it a better indicator of performance.
- A value very close to \(\beta=-1\) seems to be the one that maximizes both the ROC-AUC and the PR-AUC.
Best ROC and PRC
The ROC ensemble-wise statistics data for the combined predictor \(calibrated + \beta \times random, \beta=-1\) (using the results from the \(50\) simulation runs) are:
beta = -1
pred_ew_bliss = pred_ew_bliss %>% mutate(combined_score = ss_score + beta * prolif_score)
res_comb_pred = usefun::get_roc_stats(df = pred_ew_bliss, pred_col = "combined_score", label_col = "observed")

DT::datatable(data = res_comb_pred$roc_stats, options =
  list(pageLength = 5, lengthMenu = c(5, 10, 16), searching = FALSE)) %>%
  DT::formatRound(c(1,6,7,8,9), digits = 5)

# All observed synergies are in top 6
# pred_ew_bliss %>% arrange(combined_score)
Only for the next 2 Figures, Calibrated stands for the combined predictor results, i.e. \(calibrated + \beta \times random, \beta=-1\).
plot(x = res_comb_pred$roc_stats$FPR, y = res_comb_pred$roc_stats$TPR,
type = 'l', lwd = 3, col = my_palette[1], main = 'ROC curve, Ensemble-wise synergies (Bliss)',
xlab = 'False Positive Rate (FPR)', ylab = 'True Positive Rate (TPR)')
lines(x = res_prolif_ew$roc_stats$FPR, y = res_prolif_ew$roc_stats$TPR,
lwd = 3, col = my_palette[2])
legend('bottomright', title = 'AUC', col = my_palette[1:2], pch = 19,
legend = c(paste(round(res_comb_pred$AUC, digits = 2), "Calibrated"),
paste(round(res_prolif_ew$AUC, digits = 2), "Random")), cex = 1.3)
grid(lwd = 0.5)
abline(a = 0, b = 1, col = 'lightgrey', lty = 'dotdash', lwd = 1.2)
res_comb_pred_pr = pr.curve(scores.class0 = pred_ew_bliss %>% pull(combined_score) %>% (function(x) {-x}),
  weights.class0 = pred_ew_bliss %>% pull(observed), curve = TRUE, rand.compute = TRUE)
plot(res_comb_pred_pr, main = 'PR curve, Ensemble-wise synergies (Bliss)',
auc.main = FALSE, color = my_palette[1], rand.plot = TRUE)
plot(pr_prolif_ew_bliss, add = TRUE, color = my_palette[2])
legend(x = 0, y = 0.9, title = 'AUC', col = my_palette[1:2], pch = 19, cex = 1.3,
legend = c(paste(round(res_comb_pred_pr$auc.davis.goadrich, digits = 2), "Calibrated"),
paste(round(pr_prolif_ew_bliss$auc.davis.goadrich, digits = 2), "Random")))
grid(lwd = 0.5)
If we add the predictions of the non-normalized calibrated data to the above Figures, we have:
plot(x = res_comb_pred$roc_stats$FPR, y = res_comb_pred$roc_stats$TPR,
type = 'l', lwd = 3, col = my_palette[1], main = 'ROC curve, Ensemble-wise synergies (Bliss)',
xlab = 'False Positive Rate (FPR)', ylab = 'True Positive Rate (TPR)')
lines(x = res_prolif_ew$roc_stats$FPR, y = res_prolif_ew$roc_stats$TPR,
lwd = 3, col = my_palette[2])
lines(x = res_ss_ew$roc_stats$FPR, y = res_ss_ew$roc_stats$TPR,
lwd = 3, col = my_palette[3])
legend('bottomright', title = 'AUC', col = my_palette[c(3,1,2)], pch = 19,
legend = c(paste(round(res_ss_ew$AUC, digits = 2), "Calibrated (non-normalized)"),
paste(round(res_comb_pred$AUC, digits = 2), "Calibrated"),
paste(round(res_prolif_ew$AUC, digits = 2), "Random")), cex = 1)
grid(lwd = 0.5)
abline(a = 0, b = 1, col = 'lightgrey', lty = 'dotdash', lwd = 1.2)
res_comb_pred_pr = pr.curve(scores.class0 = pred_ew_bliss %>% pull(combined_score) %>% (function(x) {-x}),
  weights.class0 = pred_ew_bliss %>% pull(observed), curve = TRUE, rand.compute = TRUE)
plot(res_comb_pred_pr, main = 'PR curve, Ensemble-wise synergies (Bliss)',
auc.main = FALSE, color = my_palette[1], rand.plot = TRUE)
plot(pr_prolif_ew_bliss, add = TRUE, color = my_palette[2])
plot(pr_ss_ew_bliss, add = TRUE, color = my_palette[3])
legend(x = 0, y = 0.9, title = 'AUC', col = my_palette[c(3,1,2)], pch = 19, cex = 0.9,
legend = c(paste(round(pr_ss_ew_bliss$auc.davis.goadrich, digits = 2), "Calibrated (non-normalized)"),
paste(round(res_comb_pred_pr$auc.davis.goadrich, digits = 2), "Calibrated"),
paste(round(pr_prolif_ew_bliss$auc.davis.goadrich, digits = 2), "Random")))
grid(lwd = 0.5)
Correlation
We test for correlation between all the synergy predictor results shown in the previous curves: ensemble-wise vs model-wise, random proliferative models vs calibrated models, and HSA vs Bliss synergy assessment. P-values are represented at three significance levels: \(0.05, 0.01, 0.001\) (*, **, ***) and the correlation coefficient is calculated using Kendall’s tau statistic.
synergy_scores = bind_cols(
  pred_ew_hsa %>% select(ss_score, prolif_score) %>% rename(cal_ew_hsa = ss_score, random_ew_hsa = prolif_score),
  pred_ew_bliss %>% select(ss_score, prolif_score) %>% rename(cal_ew_bliss = ss_score, random_ew_bliss = prolif_score),
  pred_mw_hsa %>% select(synergy_prob_ss, synergy_prob_prolif) %>%
    rename(cal_mw_hsa = synergy_prob_ss, random_mw_hsa = synergy_prob_prolif),
  pred_mw_bliss %>% select(synergy_prob_ss, synergy_prob_prolif) %>%
    rename(cal_mw_bliss = synergy_prob_ss, random_mw_bliss = synergy_prob_prolif)
)

M = cor(synergy_scores, method = "kendall")
res = cor.mtest(synergy_scores, method = "kendall")
corrplot(corr = M, type = "upper", p.mat = res$p, sig.level = c(.001, .01, .05),
  pch.cex = 1, pch.col = "white", insig = "label_sig", tl.col = "black", tl.srt = 45)
- Model-wise results do not correlate strongly with the ensemble-wise results (top-right part of the correlation plot).
- HSA and Bliss results correlate, more strongly for the model-wise (bottom-right) than the ensemble-wise results (top-left).
- Calibrated results also show some correlation with the random results.
Fitness Evolution
We did a test run of Gitsbe with \(1000\) simulations, fitting to steady state (thus generating calibrated models). The only differences between the following results and the ones above are the total number of simulations specified in the configuration and that the option bootstrap_mutations_factor was set to \(1\) (to avoid reaching good-fitness models in the earlier generations).
Firstly, we show the fitness evolution of the first \(20\) simulations. Each data point is the average fitness in that generation out of \(20\) models. Note that some simulations end early because the target fitness (\(0.99\)) is reached by some of the models.
read_summary_file = function(file_name) {
  lines = readr::read_lines(file = file_name, skip = 5, skip_empty_rows = TRUE)

  data_list = list()
  index = 1

  gen_fit_list = list()
  gen_index = 1
  for (line_index in 1:length(lines)) {
    line = lines[line_index]
    if (stringr::str_detect(string = line, pattern = "Simulation")) {
      data_list[[index]] = dplyr::bind_cols(gen_fit_list)
      index = index + 1

      gen_fit_list = list()
      gen_index = 1
    } else { # read fitness values
      gen_fit_list[[gen_index]] = tibble::as_tibble_col(as.numeric(unlist(strsplit(line, split = '\t'))), column_name = paste0(gen_index))
      gen_index = gen_index + 1
    }
  }

  # add the last simulation's values
  data_list[[index]] = dplyr::bind_cols(gen_fit_list)

  return(data_list)
}
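As a usage sketch on a hypothetical miniature file (the layout is inferred from the parser: \(5\) header lines, then one tab-separated line of model fitness values per generation, with "Simulation" marker lines between consecutive runs):

# Hypothetical miniature summary file, illustrating the layout the parser expects
tmp_file = tempfile()
writeLines(c(paste("header line", 1:5),
  "0.41\t0.38\t0.45", # simulation 1, generation 1: one fitness value per model
  "0.62\t0.59\t0.66", # simulation 1, generation 2
  "Simulation 2",
  "0.39\t0.44\t0.40"), con = tmp_file)
fit_demo = read_summary_file(file_name = tmp_file)
fit_demo[[1]] # 3 models (rows) x 2 generations (columns)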
= "results/link-only/cascade_1.0_ss_1000sim_fixpoints_hsa_summary.txt"
fitness_summary_file
# `fit_res` is a list of tibbles
# Each tibble has the fitness results of a simulation
# Rows represent the models and columns are the generations
= read_summary_file(file_name = fitness_summary_file)
fit_res
= colMeans(fit_res[[1]])
first_sim_data plot(1:length(first_sim_data), y = first_sim_data, ylim = c(0,1),
xlim = c(0,20), type = 'l', lwd = 1.5,
main = 'Fitness Evolution across Generations', xlab = 'Generations',
ylab = 'Average Fitness', col = usefun:::colors.100[1])
= 2
index for (fit_data in fit_res) {
if (index > 20) break
#if (ncol(fit_data) != 20) next
= colMeans(fit_data)
mean_fit_per_gen lines(x = 1:length(mean_fit_per_gen), y = mean_fit_per_gen, lwd = 1.5,
col = usefun:::colors.100[index])
= index + 1
index
}grid(lwd = 0.5)
Next, we plot the average fitness + standard deviation per generation across all \(1000\) simulations:
# `avg_fit` is a tibble with rows the number of simulations and
# columns the generations. Each value in a (sim,gen) cell is the average
# fitness of models in that particular (sim,gen) combination
avg_fit = do.call(dplyr::bind_rows, sapply(fit_res, colMeans))
avg_fit_long = avg_fit %>% pivot_longer(cols = everything()) %>% mutate(name = as.integer(name))
ggline(data = avg_fit_long, x = "name", y = "value", color = my_palette[2],
add = "mean_sd", add.params = list(color = "black"), ylim = c(0, 1),
main = "Fitness Evolution across Generations",
xlab = "Generations", ylab = "Fitness") +
theme(plot.title = element_text(hjust = 0.5)) + grids()
# DIY way:
# df = avg_fit_long %>% group_by(name) %>%
# summarise(median = median(value, na.rm = TRUE),
# mean = mean(value, na.rm = TRUE),
# sd = sd(value, na.rm = TRUE))
#
# ggplot(data = df, aes(x=name, y=mean)) +
# ggtitle("Fitness Evolution across Generations") +
# xlab("Generations") + ylab("Fitness") +
# geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd), width=.2) +
# geom_line(color='steelblue') +
# geom_point(color='steelblue') +
# theme_pubr() + theme(plot.title = element_text(hjust = 0.5)) +
# grids()
- The average fitness stabilizes after \(\approx 10-15\) generations, and so does the standard deviation: new models are still being created through the crossover phase of the genetic algorithm, exploring various model parameterizations while keeping the fitness score relatively high.
- The S-shaped (sigmoid) curve is in agreement with Holland’s schema theorem (Holland 1992).
Fitness vs Ensemble Performance
We check for correlation between the calibrated models' fitness to the AGS steady state and their ensemble performance, after normalization against the random model predictions.
The main idea here is that we generate different training data samples in which the boolean steady-state nodes have their values flipped (so they are only partially correct) and fit models to these (\(50\) simulations => \(150\) models per training data sample, \(205\) training data samples in total). These calibrated model ensembles can then be tested for their prediction performance. We then use the ensemble-wise random proliferative model predictions (\(50\) simulations) to normalize (\(\beta=-1\)) the calibrated model predictions and compute the ROC and PR AUC for each model ensemble.
Check how to generate the appropriate data, run the simulations and tidy up the results in the section Fitness vs Performance Methods.
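As a minimal sketch of the flipping idea (hypothetical code; the actual generation script is linked above), assuming `steady_state` is a named vector of 0/1 node activity values:

# Hypothetical example: flip a given fraction of the AGS steady state values
flip_data = function(steady_state, per_flipped = 0.1) {
  n_flips = round(per_flipped * length(steady_state))
  idx = sample(x = length(steady_state), size = n_flips)
  steady_state[idx] = 1 - steady_state[idx] # 0 -> 1, 1 -> 0
  steady_state
}

set.seed(42)
ss = setNames(object = sample(c(0, 1), size = 20, replace = TRUE),
  nm = paste0("node_", 1:20))
flip_data(steady_state = ss, per_flipped = 0.2) # 4 of 20 values flipped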
Load the already-stored result:
= readRDS(file = "data/res_fit_aucs_cascade1.rds") res
We check if our data is normally distributed using the Shapiro-Wilk normality test:
shapiro.test(x = res$roc_auc)
Shapiro-Wilk normality test
data: res$roc_auc
W = 0.95822, p-value = 9.995e-06
shapiro.test(x = res$pr_auc)
Shapiro-Wilk normality test
data: res$pr_auc
W = 0.86074, p-value = 9.719e-13
shapiro.test(x = res$avg_fit)
Shapiro-Wilk normality test
data: res$avg_fit
W = 0.87328, p-value = 4.518e-12
We observe from the low p-values that the data is not normally distributed. Thus, we are going to use a non-parametric correlation metric, namely the Kendall rank-based test (and its respective coefficient, \(\tau\)), to check for correlation between the ensemble model performance (ROC-AUC, PR-AUC) and the fitness to the AGS steady state:
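The Kendall test can also be run directly; the \(\tau\) values annotated on the figures below (via ggpubr's built-in `cor.coef`) should match this standalone call:

# Kendall rank correlation of ensemble performance vs average fitness
cor.test(x = res$avg_fit, y = res$roc_auc, method = "kendall")
cor.test(x = res$avg_fit, y = res$pr_auc, method = "kendall")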
p = ggpubr::ggscatter(data = res, x = "avg_fit", y = "roc_auc", color = "per_flipped_data",
  xlab = "Average Fitness per Model Ensemble",
  title = "Fitness to AGS Steady State vs Performance (ROC)",
  ylab = "ROC AUC", add = "reg.line", conf.int = TRUE,
  add.params = list(color = "blue", fill = "lightgray"),
  cor.coef = TRUE, cor.coeff.args = list(method = "kendall", label.y.npc = "top", size = 6, cor.coef.name = "tau")) +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_distiller(n.breaks = 5, labels = scales::label_percent(accuracy = 1L), palette = 'RdYlBu', guide = guide_colourbar(title = '%Data Flipped'))
ggpubr::ggpar(p, legend = "right", font.legend = 14)
p = ggpubr::ggscatter(data = res, x = "avg_fit", y = "pr_auc", color = "per_flipped_data",
  xlab = "Average Fitness per Model Ensemble",
  title = "Fitness to AGS Steady State vs Performance (Precision-Recall)",
  add.params = list(color = "blue", fill = "lightgray"),
  ylab = "PR AUC", add = "reg.line", conf.int = TRUE,
  cor.coef = TRUE, cor.coeff.args = list(method = "kendall", size = 6, cor.coef.name = "tau")) +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_distiller(n.breaks = 5, labels = scales::label_percent(accuracy = 1L), palette = 'RdYlBu', guide = guide_colourbar(title = '%Data Flipped'))
ggpubr::ggpar(p, legend = "right", font.legend = 14)
- We observe that there is some correlation between the normalized ensemble model performance and the models' fitness to the training steady-state data.
- The performance as measured by the ROC AUC is less sensitive to changes in the training data; the correlation is better with regards to the PR AUC, which is a more informative measure for our imbalanced dataset (Saito and Rehmsmeier 2015).
Scrambled Topologies Investigation
We create several scrambled topologies from the CASCADE 1.0 one, in order to assess the tolerance of the curated network topology to random edge changes with regards to model ensemble performance.
We introduce \(4\) types of topology scrambling, each performed on a varying number of edges. The more edges are changed, the more scrambled/randomized the resulting topology. The \(4\) types of scrambling are (a minimal sketch follows the list):
- Randomly permuting the source nodes of the edges (source)
- Randomly permuting the target nodes of the edges (target)
- Randomly changing the interaction effect from inhibition to activation and vice versa (sign inversion)
- Combining all of the above (all)
Note that each type of scrambling produces a topology with the same input and output degree distribution as the original one, and as such the scale-free property of the CASCADE 1.0 topology remains unchanged in the scrambled topologies.
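As a minimal sketch of the scrambling idea (hypothetical code, not the actual generation script), assuming `topo` is a tibble of interactions with `source`, `effect` ('->' for activation, '-|' for inhibition) and `target` columns:

# Permuting the source column: each node keeps its out-degree, targets keep their in-degree
scramble_source = function(topo) {
  topo %>% dplyr::mutate(source = sample(source))
}

# Sign inversion: flip the effect of n randomly chosen edges
scramble_sign = function(topo, n) {
  idx = sample(x = nrow(topo), size = n)
  topo$effect[idx] = ifelse(topo$effect[idx] == "->", "-|", "->")
  topo
}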
For each type of scrambling, we generate \(10\) random topologies for each expected similarity score between the randomized and the curated topology, ranging from \(0\) to \(0.98\) similarity over a total of \(22\) steps, thus \(10\times22=220\) random topologies per type of scrambling. See more details on how to generate these topologies in the script gen_scrambled_topologies_cascade1.R.
To get the drug combination predictions for each scrambled topology, we executed the druglogics-synergy module with the default configuration (\(50\) simulations per topology, for both calibrated to steady state and random proliferative models, using the Bliss synergy assessment method in Drabme) - see more info in the run_druglogics_synergy_scrambled_topo_cascade1.sh script.
We calculate the normalized predictor performance (\(calibrated - random\)) for each topology-specific simulation and tidy up the result data in get_syn_res_scrambled_topo_cascade1.R.
Next, we load the result data and add the ROC and PR AUC results of the combined predictor (termed Calibrated) for the curated CASCADE 1.0 topology (see above). Note that the topology scrambling type is set to none for the results that used the original/curated CASCADE 1.0 topology.
scrambled_topo_res = readRDS(file = 'data/scrambled_topo_res_cascade1.rds')

# the un-scrambled topology results have a similarity score equal to 1 and 'none'
# as `scramble_type` (no scrambling whatsoever); the ROC and PR AUC values have been
# previously calculated and shown in the figures above, but we re-do them here anyway :)
res_comb_roc = PRROC::roc.curve(scores.class0 = pred_ew_bliss %>% pull(combined_score) %>% (function(x) {-x}),
  weights.class0 = pred_ew_bliss %>% pull(observed))
res_comb_pr = PRROC::pr.curve(scores.class0 = pred_ew_bliss %>% pull(combined_score) %>% (function(x) {-x}),
  weights.class0 = pred_ew_bliss %>% pull(observed))
scrambled_topo_res = dplyr::bind_rows(scrambled_topo_res,
  tibble::tibble(sim = 1, scramble_type = 'none', roc_auc = res_comb_roc$auc,
    pr_auc = res_comb_pr$auc.davis.goadrich))
Interestingly, some scrambled topologies did not produce even \(1\) boolean model with a stable state when using the genetic algorithm of Gitsbe (so no predictions could be made for these topologies):
ordered_types = c('none', 'source', 'target', 'sign', 'all')

scrambled_topo_res %>%
  mutate(scramble_type =
    replace(x = scramble_type, list = scramble_type == 'effect', values = 'sign')) %>%
  group_by(scramble_type) %>%
  summarise(percent = sum(is.na(roc_auc))/n(), .groups = 'drop') %>%
  mutate(scramble_type = factor(scramble_type, levels = ordered_types)) %>%
  ggplot(aes(x = scramble_type, y = percent, fill = scramble_type)) +
  geom_col() +
  geom_text(aes(label = scales::percent(percent, accuracy = 1)), vjust = -0.5, size = 8) +
  scale_y_continuous(labels = scales::percent, limits = c(0,0.3)) +
  scale_fill_brewer(palette = "Set1") +
  guides(fill = guide_legend(title = latex2exp::TeX("Scramble Type"))) +
  labs(x = "", title = "Topologies with zero-stable-state boolean models", y = "") +
  theme_classic(base_size = 14) +
  theme(axis.text.x = element_text(size = 18))
So, tweaking the source nodes of the edges in the curated topology resulted in \(11\%\) of the produced topologies having a network configuration that does not allow any stable-state attractor in the link-operator parameterization space explored by the Gitsbe algorithm.
Tweaking the target nodes results in fewer such topologies (\(5\%\)).
Lastly, when tweaking the effect (activation vs inhibition), we always get topologies that can be translated to boolean models with a stable-state attractor.
Source Scrambling
In the next figures, the red dot/point is the result from using the original/unscrambled/curated CASCADE 1.0 topology:
ggpubr::ggscatter(data = scrambled_topo_res %>%
    filter(scramble_type == 'source' | scramble_type == 'none', !is.na(roc_auc)),
  x = "sim", y = "roc_auc", color = "scramble_type", palette = c('red', 'black'),
  xlab = "Similarity Score",
  title = "Source node Scrambling vs Performance (ROC)",
  ylab = "ROC AUC") +
  #, add = "reg.line", conf.int = TRUE,
  #add.params = list(color = "blue", fill = "lightgray"),
  #cor.coef = TRUE, cor.coeff.args = list(method = "kendall", label.y.npc = "top", size = 6, cor.coef.name = "tau") +
  ylim(c(0,1)) +
  geom_text(x = 0.95, y = 1, label = "CASCADE 1.0") +
  theme(plot.title = element_text(hjust = 0.5), legend.position = 'none')
ggpubr::ggscatter(data = scrambled_topo_res %>%
    filter(scramble_type == 'source' | scramble_type == 'none', !is.na(pr_auc)),
  x = "sim", y = "pr_auc", color = "scramble_type", palette = c('red', 'black'),
  xlab = "Similarity Score",
  title = "Source node Scrambling vs Performance (Precision-Recall)",
  ylab = "PR AUC") +
  #add = "reg.line", conf.int = TRUE,
  #add.params = list(color = "blue", fill = "lightgray"),
  #cor.coef = TRUE, cor.coeff.args = list(method = "kendall", label.y.npc = "top", size = 6, cor.coef.name = "tau")) +
  ylim(c(0,1)) +
  geom_text(x = 0.9, y = 0.91, label = "CASCADE 1.0") +
  theme(plot.title = element_text(hjust = 0.5), legend.position = 'none')
Target Scrambling
ggpubr::ggscatter(data = scrambled_topo_res %>%
    filter(scramble_type == 'target' | scramble_type == 'none', !is.na(roc_auc)),
  x = "sim", y = "roc_auc", color = "scramble_type", palette = c('red', 'black'),
  xlab = "Similarity Score",
  title = "Target node Scrambling vs Performance (ROC)",
  ylab = "ROC AUC") +
  ylim(c(0,1)) +
  geom_text(x = 0.95, y = 1, label = "CASCADE 1.0") +
  theme(plot.title = element_text(hjust = 0.5), legend.position = 'none')
ggpubr::ggscatter(data = scrambled_topo_res %>%
    filter(scramble_type == 'target' | scramble_type == 'none', !is.na(pr_auc)),
  x = "sim", y = "pr_auc", color = "scramble_type", palette = c('red', 'black'),
  xlab = "Similarity Score",
  title = "Target node Scrambling vs Performance (Precision-Recall)",
  ylab = "PR AUC") +
  ylim(c(0,1)) +
  geom_text(x = 0.9, y = 0.91, label = "CASCADE 1.0") +
  theme(plot.title = element_text(hjust = 0.5), legend.position = 'none')
Sign Inversion
ggpubr::ggscatter(data = scrambled_topo_res %>%
    filter(scramble_type == 'effect' | scramble_type == 'none', !is.na(roc_auc)),
  x = "sim", y = "roc_auc", color = "scramble_type", palette = c('black', 'red'),
  xlab = "Similarity Score",
  title = "Sign Inversion vs Performance (ROC)",
  ylab = "ROC AUC") +
  ylim(c(0,1)) +
  geom_text(x = 0.9, y = 0.98, label = "CASCADE 1.0") +
  theme(plot.title = element_text(hjust = 0.5), legend.position = 'none')
ggpubr::ggscatter(data = scrambled_topo_res %>%
    filter(scramble_type == 'effect' | scramble_type == 'none', !is.na(pr_auc)),
  x = "sim", y = "pr_auc", color = "scramble_type", palette = c('black', 'red'),
  xlab = "Similarity Score",
  title = "Sign Inversion vs Performance (Precision-Recall)",
  ylab = "PR AUC") +
  ylim(c(0,1)) +
  geom_text(x = 0.9, y = 0.91, label = "CASCADE 1.0") +
  theme(plot.title = element_text(hjust = 0.5), legend.position = 'none')
Source, Target Scrambling and Sign Inversion
ggpubr::ggscatter(data = scrambled_topo_res %>%
    filter(scramble_type == 'all' | scramble_type == 'none', !is.na(roc_auc)),
  x = "sim", y = "roc_auc", color = "scramble_type", palette = c('black', 'red'),
  xlab = "Similarity Score",
  title = "All types of Scrambling vs Performance (ROC)",
  ylab = "ROC AUC") +
  ylim(c(0,1)) +
  geom_text(x = 0.95, y = 1, label = "CASCADE 1.0") +
  geom_hline(yintercept = 0.5, linetype = 'dashed', color = "red") +
  geom_text(aes(x = 0.92, y = 0.45, label = "Random (AUC = 0.5)"), color = '#377EB8') +
  theme(plot.title = element_text(hjust = 0.5), legend.position = 'none')
ggpubr::ggscatter(data = scrambled_topo_res %>%
    filter(scramble_type == 'all' | scramble_type == 'none', !is.na(pr_auc)),
  x = "sim", y = "pr_auc", color = "scramble_type", palette = c('black', 'red'),
  xlab = "Similarity Score",
  title = "All types of Scrambling vs Performance (Precision-Recall)",
  ylab = "PR AUC") +
  ylim(c(0,1)) +
  geom_text(x = 0.9, y = 0.91, label = "CASCADE 1.0") +
  geom_hline(yintercept = sum(observed)/length(observed), linetype = 'dashed', color = "red") +
  geom_text(aes(x = 0.83, y = 0.08, label = "Random (AUC = 0.2)"), color = '#377EB8') +
  theme(plot.title = element_text(hjust = 0.5), legend.position = 'none')
Bootstrap Calibrated Models + Boxplots
Since almost all scrambled results (no matter the type of scrambling) are worse than the results we got when using the curated/unscrambled CASCADE 1.0 topology, we proceeded to generate bootstrap model predictions derived from the curated topology, to verify that the results we had found were not artifacts and/or outliers.
We generate a large pool of Gitsbe models (\(1000\) simulations => \(3000\) models), randomly draw a total of \(50\) batches of \(50\) models each, and assess the ROC and PR AUC performance of each batch, normalized against the random model predictions (see above).
All these bootstrapped models will be part of one category called Curated.
The rest of the scrambled topology data (that we presented in the scatter plots) will be split into multiple groups based on their similarity score (percentage of common edges with the curated topology) and we will visualize the different groups with boxplots.
See more details on how to reproduce these simulation results here.
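A minimal sketch of the batch drawing (hypothetical; the model names here are placeholder identifiers, see the linked instructions for the actual runs):

# Draw 50 bootstrap batches of 50 models each from the pooled 3000-model ensemble
set.seed(42)
model_pool = paste0("model_", 1:3000) # placeholder model identifiers
batches = lapply(1:50, function(i) sample(x = model_pool, size = 50))
# each batch is then scored with Drabme and its normalized ROC/PR AUC computed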
Load the bootstrap results and tidy up the data:
# add the bootstrapped results of the curated topology to the scrambled results
scrambled_topo_res = readRDS(file = 'data/scrambled_topo_res_cascade1.rds')
boot_cascade1_res = readRDS(file = 'data/boot_cascade1_res.rds')

scrambled_topo_res = dplyr::bind_rows(scrambled_topo_res, boot_cascade1_res)

# group by similarity score
scrambled_topo_res =
  scrambled_topo_res %>% mutate(grp = factor(x =
    case_when(sim >= 0 & sim < 0.25 ~ '0 - 0.25',
      sim >= 0.25 & sim < 0.5 ~ '0.25 - 0.5',
      sim >= 0.5 & sim < 0.75 ~ '0.5 - 0.75',
      sim >= 0.75 & sim < 0.85 ~ '0.75 - 0.85',
      sim >= 0.85 & sim < 0.95 ~ '0.85 - 0.95',
      sim >= 0.95 & sim < 1 ~ '0.95 - 1',
      sim == 1 ~ 'Curated'),
    levels = c('0 - 0.25', '0.25 - 0.5', '0.5 - 0.75',
      '0.75 - 0.85', '0.85 - 0.95', '0.95 - 1', 'Curated')))
# ROC results
scrambled_topo_res %>%
  filter(scramble_type == 'source' | scramble_type == 'none', !is.na(roc_auc)) %>%
ggplot(aes(x = grp, y = roc_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,1)) +
labs(x = 'Similarity Score to CASCADE 1.0 Topology', y = 'ROC AUC',
title = "Source Scrambling vs Performance (ROC)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = 0.5, linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.7, y = 0.45, label = "Random (AUC = 0.5)")) +
theme(plot.title = element_text(hjust = 0.5))
# PR results
scrambled_topo_res %>%
  filter(scramble_type == 'source' | scramble_type == 'none', !is.na(pr_auc)) %>%
ggplot(aes(x = grp, y = pr_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,1)) +
labs(x = 'Similarity Score to CASCADE 1.0 Topology', y = 'PR AUC',
title = "Source Scrambling vs Performance (Precision-Recall)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = sum(observed)/length(observed), linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.7, y = 0.15, label = "Random (AUC = 0.2)")) +
theme(plot.title = element_text(hjust = 0.5))
# ROC results
scrambled_topo_res %>%
  filter(scramble_type == 'target' | scramble_type == 'none', !is.na(roc_auc)) %>%
ggplot(aes(x = grp, y = roc_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,1)) +
labs(x = 'Similarity Score to CASCADE 1.0 Topology', y = 'ROC AUC',
title = "Target Scrambling vs Performance (ROC)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = 0.5, linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.7, y = 0.45, label = "Random (AUC = 0.5)")) +
theme(plot.title = element_text(hjust = 0.5))
# PR results
scrambled_topo_res %>%
  filter(scramble_type == 'target' | scramble_type == 'none', !is.na(pr_auc)) %>%
ggplot(aes(x = grp, y = pr_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,1)) +
labs(x = 'Similarity Score to CASCADE 1.0 Topology', y = 'PR AUC',
title = "Target Scrambling vs Performance (Precision-Recall)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = sum(observed)/length(observed), linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.7, y = 0.13, label = "Random (AUC = 0.2)")) +
theme(plot.title = element_text(hjust = 0.5))
# ROC results
scrambled_topo_res %>%
  filter(scramble_type == 'effect' | scramble_type == 'none', !is.na(roc_auc)) %>%
ggplot(aes(x = grp, y = roc_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,1)) +
labs(x = 'Similarity Score to CASCADE 1.0 Topology', y = 'ROC AUC',
title = "Sign Inversion vs Performance (ROC)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = 0.5, linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.7, y = 0.45, label = "Random (AUC = 0.5)")) +
theme(plot.title = element_text(hjust = 0.5))
# PR results
scrambled_topo_res %>%
  filter(scramble_type == 'effect' | scramble_type == 'none', !is.na(pr_auc)) %>%
ggplot(aes(x = grp, y = pr_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_brewer(palette = 'Set1') +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,1)) +
labs(x = 'Similarity Score to CASCADE 1.0 Topology', y = 'PR AUC',
title = "Sign Inversion vs Performance (Precision-Recall)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = sum(observed)/length(observed), linetype = 'dashed', color = "red") +
geom_text(aes(x = 6.9, y = 0.15, label = "Random (AUC = 0.2)"), size = 3) +
theme(plot.title = element_text(hjust = 0.5))
# no data points in the (0.95-1) class
set1_cols = RColorBrewer::brewer.pal(n = 7, name = 'Set1')[c(1:5,7)]
# ROC results
scrambled_topo_res %>%
  filter(scramble_type == 'all' | scramble_type == 'none', !is.na(roc_auc)) %>%
ggplot(aes(x = grp, y = roc_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_manual(values = set1_cols) +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,1)) +
labs(x = 'Similarity Score to CASCADE 1.0 Topology', y = 'ROC AUC',
title = "All types of Scrambling vs Performance (ROC)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = 0.5, linetype = 'dashed', color = "red") +
geom_text(aes(x = 5.8, y = 0.45, label = "Random (AUC = 0.5)")) +
theme(plot.title = element_text(hjust = 0.5))
# PR results
scrambled_topo_res %>%
  filter(scramble_type == 'all' | scramble_type == 'none', !is.na(pr_auc)) %>%
ggplot(aes(x = grp, y = pr_auc, fill = grp)) +
geom_boxplot(show.legend = FALSE) +
scale_fill_manual(values = set1_cols) +
geom_jitter(shape = 20, position = position_jitter(0.2), show.legend = FALSE) +
ylim(c(0,1)) +
labs(x = 'Similarity Score to CASCADE 1.0 Topology', y = 'PR AUC',
title = "All types of Scrambling vs Performance (Precision-Recall)") +
theme_classic(base_size = 14) +
geom_hline(yintercept = sum(observed)/length(observed), linetype = 'dashed', color = "red") +
geom_text(aes(x = 5.8, y = 0.15, label = "Random (AUC = 0.2)")) +
theme(plot.title = element_text(hjust = 0.5))
We observe that even a small perturbation/scrambling of the curated topology (of any type) produces results close to random prediction, significantly lower than the prediction results obtained with the curated CASCADE 1.0 topology.