This markdown contains the following analysis:
On all TCGA variants as a whole, or specifically on essential genes.
plotVarEnrich <- function(tb, cat_col = "abbrev")
{
tb$region <- factor(tb$region, levels = c("surface", "core", "interact"))
tb[which(tb$cilow == 0), "cilow"] <- 0.0001
tb$pval <- p.adjust(tb$pval)
tb$pvalsig <- (tb$pval < 0.05)
# tb$log_pval <- -log(tb$pval)
ggplot(tb, aes_string(y = "region", x = "enrich")) +
geom_vline(xintercept = 1, linetype = "dashed") + geom_violin(fill = NA) +
#geom_point(aes_string(col = "pvalsig"), position = position_jitterdodge()) +
ylab("") + scale_x_log10(name = "enrichment", limits = c(0.2, 10)) +
scale_colour_manual(values = c("TRUE" = "black", "FALSE" = "grey"),
name = "adjusted\np < 0.05?") +
cowplot::theme_cowplot()
}
processVarEnrich <- function(tb, cat_col = "abbrev")
{
tb <- tb[, c(cat_col, "surf_enrich", "surf_enrich_95CI_low",
"surf_enrich_95CI_high", "surf_pval",
"core_enrich", "core_enrich_95CI_low",
"core_enrich_95CI_high", "core_pval",
"interact_enrich", "interact_enrich_95CI_low",
"interact_enrich_95CI_high", "interact_pval")]
clust <- tb[, c("surf_enrich", "core_enrich", "interact_enrich")]
clust <- log(clust)
rownames(clust) <- tb[, cat_col]
clust <- hclust(dist(clust))
cat_order <- tb[, cat_col][clust$order]
tb <- list("surface" = tb[, c(cat_col, "surf_enrich", "surf_enrich_95CI_low",
"surf_enrich_95CI_high", "surf_pval")],
"core" = tb[, c(cat_col, "core_enrich", "core_enrich_95CI_low",
"core_enrich_95CI_high", "core_pval")],
"interact" = tb[, c(cat_col, "interact_enrich", "interact_enrich_95CI_low",
"interact_enrich_95CI_high", "interact_pval")])
for(i in 1:length(tb)){
colnames(tb[[i]]) <- c(cat_col, "enrich", "cilow", "cihigh", "pval")
tb[[i]]$region <- names(tb)[i]
}
tb <- do.call("rbind", tb)
tb[, cat_col] <- factor(tb[, cat_col], levels = cat_order)
tb
}
# good-looking names to replace TCGA abbreviations
abbrev <- read.table('/media/josefng/My_Passport/ZoomvarTCGA/cancerType_abbrev',
header = TRUE, stringsAsFactors = FALSE, sep = "\t")
# first for TCGA_CGC only
tcga_cgc <- read.csv('/media/josefng/My_Passport/ZoomvarTCGA/TCGA_general_zoomvar_CGC.csv',
stringsAsFactors = FALSE)
tcga_cgc <- tcga_cgc[, 1:24]
tcga_cgc <- merge(abbrev[, 1:2], tcga_cgc, by.x = "cancerType", by.y = "Sample",
all.x = FALSE, all.y = TRUE, sort = FALSE)
# TCGA_all
tcga_all <- read.csv('/media/josefng/My_Passport/ZoomvarTCGA/TCGA_general_zoomvar_all.csv',
stringsAsFactors = FALSE)
tcga_all <- tcga_all[, 1:24]
tcga_all <- merge(abbrev[, 1:2], tcga_all, by.x = "cancerType", by.y = "Sample",
all.x = FALSE, all.y = TRUE, sort = FALSE)
tcga_all <- processVarEnrich(tcga_all)
tcga_all$abbrev <- factor(tcga_all$abbrev,
levels = levels(plotVarEnrich(processVarEnrich(tcga_cgc))$data$abbrev))
#together
cowplot::plot_grid(plotVarEnrich(tcga_all) + ggtitle("All variants"),
plotVarEnrich(processVarEnrich(tcga_cgc)) + ggtitle("CGC Drivers only"),
nrow = 1, axis = "tb", align = "h")
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
# for TCGA each signature
tcga_sig <- read.csv('/media/josefng/My_Passport/ZoomvarTCGA/TCGA_sigs_stats_new.txt',
stringsAsFactors = FALSE)
tcga_sig$Sample <- c("5-FU", "Platinum", "POLE", "Aging", "UV", "APOBEC")
tcga_sig <- processVarEnrich(tcga_sig, cat_col = "Sample")
tcga_sig$cohort <- factor(tcga_sig$Sample,
levels = c("5-FU", "Platinum", "POLE", "Aging", "UV", "APOBEC"),
labels = c("Treated with 5-FU", "Treated with\nplatinum-based drugs",
"POLE-mut\nhypermutated", "Age over 70", "Melanoma", "APOBEC-enriched"))
tcga_sig$Sample <- factor(tcga_sig$Sample,
levels = c("Aging", "APOBEC", "POLE", "UV", "5-FU", "Platinum"))
tcga_sig$region <- factor(tcga_sig$region, levels = c("surface", "core", "interact"))
ggplot(tcga_sig, aes(x = enrich, y = region, xmin = cilow, xmax = cihigh)) +
geom_vline(xintercept = 1, linetype = "dashed") +
geom_point(size = 2) + geom_errorbar(width = 0) + ylab("") +
scale_x_continuous(name = "enrichment", limits = c(0, 2.2), breaks = c(0.5, 1, 2)) +
cowplot::theme_cowplot() + facet_wrap(Sample ~ cohort, nrow = 1)
Count substrate availability (i.e. % CDS with signature motif) over core/surface/interact for each protein and each signature.
cds <- list.files(path = '/media/josefng/My_Passport/ZoomvarTCGA',
pattern = "proteinCDS_struct_.*_NEW.csv", full.names = TRUE)
cds <- lapply(cds, read.csv, stringsAsFactor = FALSE)
names(cds) <- c("5fu", "aging", "apobec", "platinum","pole", "uv")
cds <- lapply(names(cds), function(x){
tb <- cds[[x]]
tb$signature <- x
tb
})
cds <- do.call("rbind", cds)
cds$surface <- cds$surface_sig / cds$len_surface
cds$core <- cds$core_sig / cds$len_core
cds$interact <- cds$interact_sig / cds$len_interact
cds <- cds[, c("Uniprot", "surface", "core", "interact", "signature")]
cds <- reshape2::melt(cds, id.vars = c("Uniprot", "signature"),
measure.vars = c("surface", "core", "interact"))
cds <- cds[which(!is.nan(cds$value)), ]
cds$signature <- factor(cds$signature,
levels = c("aging", "apobec", "pole", "uv", "5fu", "platinum"),
labels = c("Aging", "APOBEC", "POLE", "UV", "5-FU", "Platinum"))
ggplot(cds, aes(x = variable, y = value)) + geom_boxplot(outlier.shape = NA) +
facet_wrap(~ signature, nrow = 1) + cowplot::theme_cowplot() +
scale_x_discrete(name = "") +
scale_y_continuous(breaks = c(0, 0.5, 1),
labels = c("", "50%", "100%"),
name = "% AA positions perturbable by signature") +
theme() + coord_flip()
For any pairwise comparison, sample values from either of the categories (sample \(n=10,000\) for 1,000 iterations) under comparison, randomly assign labels. Compute the difference of medians of the two labels. This forms the null distribution of differences. Compare this with the actual observed difference to derive a p-value.
At the same time, calculation of Cohen’s \(d\) using package effsize
.
pairwiseMedianTest <- function(input, category_column, categories, data_column,
sample_times = 1000, sample_size = 10000){
actual1 <- input[which(input[, category_column] == categories[1]), data_column]
actual2 <- input[which(input[, category_column] == categories[2]), data_column]
actual_diff <- abs(median(actual1, na.rm = TRUE) - median(actual2, na.rm = TRUE))
null_diff <- sapply(1:sample_times, function(x){
n <- sample(input[which(input[, category_column] %in% categories), data_column],
sample_size * 2, replace = TRUE)
sample1 <- n[1:sample_size]; sample2 <- n[(sample_size+1):(sample_size*2)]
return( abs(median(sample1, na.rm = TRUE) - median(sample2, na.rm = TRUE)) )
})
p <- sum( actual_diff > null_diff ) / length(null_diff)
if(p > 0.5) return(1 - p) else return(p)
}
p <- sapply(levels(cds$signature), function(x){
c(
"surface-vs-core" = pairwiseMedianTest(cds[which(cds$signature == x), ],
"variable",
c("surface", "core"), "value"),
"surface-vs-interact" = pairwiseMedianTest(cds[which(cds$signature == x), ],
"variable",
c("surface", "interact"), "value"),
"core-vs-interact" = pairwiseMedianTest(cds[which(cds$signature == x), ],
"variable",
c("core", "interact"), "value")
)
})
cat("P-value\n")
## P-value
p
## Aging APOBEC POLE UV 5-FU Platinum
## surface-vs-core 0.0 0 0 0.000 0 0.000
## surface-vs-interact 0.5 0 0 0.000 0 0.000
## core-vs-interact 0.0 0 0 0.005 0 0.001
# cohen's d effect size
cds$variable <- as.character(cds$variable)
cohen_d <- sapply(levels(cds$signature), function(x){
suppressWarnings(c(
"surface-vs-core" = effsize::cohen.d(value ~ variable,
data = cds[which(cds$signature == x &
cds$variable %in% c("surface","core")),])$estimate,
"surface-vs-interact" = effsize::cohen.d(value ~ variable,
data = cds[which(cds$signature == x &
cds$variable %in% c("surface","interact")),])$estimate,
"core-vs-interact" = effsize::cohen.d(value ~ variable,
data = cds[which(cds$signature == x &
cds$variable %in% c("core", "interact")),])$estimate
))
})
cds$variable <- factor(cds$variable, levels = c("surface", "core", "interact"))
cat("Cohen's d\n")
## Cohen's d
cohen_d
## Aging APOBEC POLE UV 5-FU
## surface-vs-core 0.16270193 -0.1668344 -0.6691839 -0.19946736 -0.31875877
## surface-vs-interact -0.04463563 0.1604665 0.2076794 -0.11533111 0.06377551
## core-vs-interact 0.19240685 -0.2633472 -0.6572191 -0.07778975 -0.29834772
## Platinum
## surface-vs-core 0.22543374
## surface-vs-interact 0.12403254
## core-vs-interact 0.08952865
Uses dndscv
package. Consider for each sample, dN/dS specifically in protein surface vs core.
Three types of samples:
library("dndscv")
## Warning: replacing previous import 'Biostrings::translate' by
## 'seqinr::translate' when loading 'dndscv'
#__________________________________
# functions to parse input/output
get_dndscv_input <- function(file, filterCol = "struct_cat",
filter = FALSE, filter_value = NULL)
{
if(is.character(file)){
maf <- read.table(file, header = T, sep = "\t", stringsAsFactors = F)
} else if(is.data.frame(file)){
maf <- file
}
if(isTRUE(filter) & (filterCol %in% colnames(maf)))
{
maf <- maf[maf[, filterCol] == filter_value, ]
}
# format:
## sampleID chr pos ref mut
## 1 Sample_1 1 871244 G C
## 2 Sample_1 1 6648841 C G
## 3 Sample_1 1 17557072 G A
## 4 Sample_1 1 22838492 G C
## 5 Sample_1 1 27097733 G A
## 6 Sample_1 1 27333206 G A
maf <- maf[, c("Tumor_Sample_Barcode", "Chromosome", "End_position",
"Tumor_Seq_Allele1", "Tumor_Seq_Allele2", "Variant_Classification")]
colnames(maf) <- c("sampleID", "chr", "pos", "ref", "mut", "Variant_Classification")
return(maf)
}
get_dndscv_output <- function(input)
{
out <- dndscv(input, min_indel = 1 )
# global dnds estimates
global <- out$globaldnds
# per gene neutrality tests
pergene <- out$sel_cv
# map ensembl protein ID to the pergene table
pergene[, "ensembl_protein_id"] <- sapply(as.character(pergene[, "gene_name"]), function(x){
prot <- unique(out$annotmuts[which(as.character(out$annotmuts$gene) == x), "pid"])
if(length(prot) == 0) return(NA)
if(length(prot) == 1) return(prot)
if(length(prot) > 1) return(paste(prot, collapse = ";"))
})
return(list(pergene = pergene, global = global))
# return(out)
}
get_dndscv_output_global <- function(input)
{
if(nrow(input) == 0) return(NULL)
print(input[1, 1])
if(sum(input[, 6] == 'Missense_Mutation') == 0){
return(NULL)
}
library(dndscv)
out <- try( dndscv(input, min_indel = 1, outp = 1,
max_coding_muts_per_sample = Inf ) )
# global dnds estimates
if(class(out) == 'try-error') return(out)
else {
global <- out$globaldnds
global
}
}
# Parse dNdScv results into data frames
parse_dNdScv_all <- function(dNdSresults)
{
do.call("rbind", lapply(names(dNdSresults), function(x){
tb <- dNdSresults[[x]]
if(!is.null(tb)){
dNdS <- tb["wall", "mle"]
CIlow <- tb["wall", "cilow"]
CIhigh <- tb["wall", "cihigh"]
data.frame(sample = x, dNdS = dNdS, cilow = CIlow, cihigh = CIhigh,
stringsAsFactors = FALSE)
} else return(NULL)
}))
}
parse_dNdScv_struct <- function(surf_results, core_results)
{
sample_names <- union(names(surf_results), names(core_results))
do.call("rbind", lapply(sample_names, function(x){
surf <- surf_results[[x]]
core <- core_results[[x]]
if(!is.null(surf)){
dNdS_surf <- surf["wall", "mle"]
CIlow_surf <- surf["wall", "cilow"]
CIhigh_surf <- surf["wall", "cihigh"]
} else {
dNdS_surf <- NA; CIlow_surf <- NA; CIhigh_surf <- NA
}
if(!is.null(core)){
dNdS_core <- core["wall", "mle"]
CIlow_core <- core["wall", "cilow"]
CIhigh_core <- core["wall", "cihigh"]
} else {
dNdS_core <- NA; CIlow_core <- NA; CIhigh_core <- NA
}
data.frame(sample = x, dNdS_surface = dNdS_surf,
cilow_surface = CIlow_surf, cihigh_surface = CIhigh_surf,
dNdS_core = dNdS_core, cilow_core = CIlow_core,
cihigh_core = CIhigh_core, stringsAsFactors = FALSE)
}))
}
# dNdS per tumour + specifically restricted to surface/core
#________________________________________
# TCGA per sample
tcga <- list.files(path = "~/TCGA.firehose_maf",
pattern = glob2rx("*_maf_allmuttype_mapped_withSig_new.txt"),
full.names = T)
tcga <- tcga[ which(!grepl("TCGA_", tcga)) ]
tcga <- lapply(tcga, function(file){
maf <- read.table(file, header = T, quote = "", sep = "\t", stringsAsFactors = F)
# print(summary(as.factor(maf$Variant_Classification)))
maf$Chromosome <- sapply(maf$Chromosome, function(x){
out <- gsub("chr", "", x)
if (!(out %in% c("X", "Y"))) return(as.numeric(out)) else return(out)
})
maf$cohort <- unlist(strsplit(basename(file), split ="_"))[1]
maf
#split(maf, maf$Tumor_Sample_Barcode, drop = TRUE)
})
tcga <- do.call("rbind", tcga)
tcga <- split(tcga, tcga$Tumor_Sample_Barcode, drop = TRUE)
tcga_all <- lapply(tcga, get_dndscv_input, filter = FALSE)
tcga_surf <- lapply(tcga, get_dndscv_input, filter = TRUE,
filterCol = "struct_cat", filter_value = "surface")
tcga_core <- lapply(tcga, get_dndscv_input, filter = TRUE,
filterCol = "struct_cat", filter_value = "core")
library(parallel)
no_cores <- 2
cl <- makeCluster(no_cores)
data("refcds_hg19", package = "dndscv")
clusterExport(cl, c("gr_genes", "RefCDS"))
tcga_all <- parLapply(cl, tcga_all, get_dndscv_output_global)
stopCluster(cl)
tcga_surf <- lapply(tcga_surf, get_dndscv_output_global)
tcga_core <- lapply(tcga_core, get_dndscv_output_global)
#________________________________________
# hipsci per sample
hipsci <- "hipsci_unique_ProteinAlteringMutations_noHLA_zoomvar.maf"
hipsci <- read.table(hipsci, header = T, sep = "\t", stringsAsFactors = F)
hipsci$cohort <- 'iPSCs'
hipsci <- split(hipsci, hipsci$Tumor_Sample_Barcode, drop = TRUE)
hipsci_all <- lapply(hipsci, get_dndscv_input, filter = FALSE)
hipsci_surf <- lapply(hipsci, get_dndscv_input, filter = TRUE,
filterCol = "struct_cat", filter_value = "surface")
hipsci_core <- lapply(hipsci, get_dndscv_input, filter = TRUE,
filterCol = "struct_cat", filter_value = "core")
hipsci_all <- lapply(hipsci_all, get_dndscv_output_global)
hipsci_surf <- lapply(hipsci_surf, get_dndscv_output_global)
hipsci_core <- lapply(hipsci_core, get_dndscv_output_global)
#________________________________________
# ICGC per sample
icgc <- "ICGC_20190603_All_Metastatic_simple_somatic_mutation_exonic_oncotator_noHLA_zoomvar.maf"
icgc <- read.table(icgc, header = T, sep = "\t", stringsAsFactors = F, quote = '')
icgc$cohort <- sapply(icgc$Center, function(x) unlist(strsplit(x, split = "-"))[1])
icgc <- split(icgc, icgc$Tumor_Sample_Barcode, drop = TRUE)
icgc_all <- lapply(icgc, get_dndscv_input, filter = FALSE)
icgc_surf <- lapply(icgc, get_dndscv_input, filter = TRUE,
filterCol = "struct_cat", filter_value = "surface")
icgc_core <- lapply(icgc, get_dndscv_input, filter = TRUE,
filterCol = "struct_cat", filter_value = "core")
icgc_all <- lapply(icgc_all, get_dndscv_output_global)
icgc_surf <- lapply(icgc_surf, get_dndscv_output_global)
icgc_core <- lapply(icgc_core, get_dndscv_output_global)
#________________________________________
tcga_all <- parse_dNdScv_all(tcga_all)
tcga_struct<- parse_dNdScv_struct(tcga_surf, tcga_core)
save("tcga_all", "tcga_struct", file='dNdScv_TCGAperSample.RData')
rm(tcga_core); rm(tcga_surf)
hipsci_all <- parse_dNdScv_all(hipsci_all)
hipsci_struct<- parse_dNdScv_struct(hipsci_surf, hipsci_core)
save("hipsci_all", "hipsci_struct", file='dNdScv_HipSciperSample.RData')
rm(hipsci_core); rm(hipsci_surf)
icgc_all <- parse_dNdScv_all(icgc_all)
icgc_struct<- parse_dNdScv_struct(icgc_surf, icgc_core)
save("icgc_all", "icgc_struct", file='dNdScv_ICGCperSample.RData')
rm(icgc_core); rm(icgc_surf)
Samplewise pair (surface vs core). Orange lines indicate dN/dS at the surface is higher than that at the core; blue lines indicate otherwise.
Paired t-test to evaluate statistical significance.
#________________________________________
# Analysis
load('dNdScv_HipSciperSample.RData')
load('dNdScv_ICGCperSample.RData')
load('dNdScv_TCGAperSample.RData')
# (1) Paired analysis of surface-vs-core dnds per sample
plot.paired <- function (df, condition1, condition2, y.lab = "dN/dS", y.lim = c(0, 4),
paired_line = FALSE)
{
pairedttest <- t.test(x = df[df$Conditions == condition1, gsub("/", "", y.lab)],
y = df[df$Conditions == condition2, gsub("/", "", y.lab)],
paired = T, alternative = "two.sided")$p.value
if(pairedttest < 0.001) pairedttest <- formatC(pairedttest, format = "e", digits = 2)
else pairedttest <- round(pairedttest, digits = 3)
effectsize <- effsize::cohen.d(df[df$Conditions == condition1, gsub("/", "", y.lab)],
df[df$Conditions == condition2, gsub("/", "", y.lab)], paired = T)$estimate
pval <- data.frame(x1 = 0.7, x2 = 2.3, y1 = y.lim[2] - 0.1, y2 = y.lim[2] - 0.1,
p=paste("P =", pairedttest )) #,
# esize = paste("Cohen's d =", round(effectsize, digits = 3)))
library(ggplot2)
plotP <- ggplot(data = df) + aes_string(x = "Conditions", y = gsub("/", "", y.lab))
if(isTRUE(paired_line)){
plotP <- plotP + geom_line(aes(group = Sample, col = Comparison), size = 0.6, alpha = 0.6) +
scale_color_manual(values = c("sandybrown", "lightskyblue3"))
}
plotP <- plotP + geom_boxplot(aes(group = NULL), outlier.shape = NA, colour = "black") +
xlab("") + ylab(y.lab) + xlim(c(condition1,condition2)) + ylim(y.lim) +
theme(plot.margin = unit(c(1,1,1,1), "lines"),
legend.position = "none",
title = element_text(size = 16),
axis.text = element_text(size = 14),
axis.title = element_text(size = 14) ) +
geom_hline(aes(yintercept=1), linetype="dashed") +
geom_segment(aes(x = x1, y = y1, xend = x2, yend = y2), colour = "grey50", data = pval) +
geom_text(aes(y = y.lim[2] - 0.1, x = 1.5, label = p), nudge_y = 0.1, data = pval) # +
# geom_text(aes(y = y.lim[2] - 0.2, x = 1.5, label = esize), nudge_y = 0.2, data = pval)
return(plotP)
}
paired.t <- function(tb, pairedLine = FALSE)
{
tb <- tb[!is.infinite(tb[, "dNdS_core"]) & !is.infinite(tb[, "dNdS_surface"]), ]
tb <- tb[!is.na(tb[, "dNdS_core"]) & !is.na(tb[, "dNdS_surface"]), ]
# tb <- tb[(tb[, "N_surface"] + tb[, "S_surface"]) > 10 &
# (tb[, "N_core"] + tb[, "S_core"]) > 10 , ]
surface <- tb[, "dNdS_surface"]
core <- tb[, "dNdS_core"]
comparison <- apply(tb[, c("dNdS_surface", "dNdS_core")], MARGIN = 1, function(x){
if(x[1] < x[2]) return("smaller") else return("larger")
})
#colour <- replace(comparison, which(comparison == "smaller"), "lightskyblue3")
#colour <- replace(colour, which(comparison == "larger"), "sandybrown")
tb <- data.frame(Conditions = c(rep("surface", length(surface)), rep("core", length(core))),
dNdS = c(surface, core), Sample = rep(tb[, "sample"], 2),
Comparison = rep(comparison, 2))
# print(head(tb))
return(plot.paired(tb, "surface", "core", paired_line = pairedLine))
}
gridExtra::grid.arrange(paired.t(hipsci_struct, pairedLine = TRUE) +ggtitle("iPSCs"),
paired.t(tcga_struct, pairedLine = TRUE) +ggtitle("TCGA"),
paired.t(icgc_struct, pairedLine = TRUE) +ggtitle("Metastatic"),
ncol = 3)
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 2707 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2707 row(s) containing missing values (geom_path).
## Warning: Removed 178 rows containing non-finite values (stat_boxplot).
## Warning: Removed 178 row(s) containing missing values (geom_path).
Query pre-computed enrichment of ClinVar / COSMIC / gnomAD variants from ZoomVar.
Uses the essential genes curated CRISPR data:
# essential_genes and transition_matrix
#___________________________________
# VES from zoomvar for the Hart essential gene list
files <- list.files('../ZoomvarBrowserApp', pattern = "protein.csv",
full.names = TRUE)
files <- lapply(files, read.csv, stringsAsFactors = FALSE)
colnames(files[[1]])[2] <- 'clinvar'; colnames(files[[2]])[2] <- 'cosmic'; colnames(files[[3]])[2] <- 'gnomad_common'; colnames(files[[4]])[2] <- 'gnomad_rare'
files <- Reduce(function(f1, f2) merge(f1, f2, by = "protein"), files)
head(files)
## protein clinvar cosmic gnomad_common gnomad_rare
## 1 A0A024R161 0.7048348 8.696627e-01 0.7734538 1.874989e-04
## 2 A0A075B6Q4 0.7260977 1.367583e-01 0.4399445 4.139155e-02
## 3 A0A075B6T3 0.9422908 2.886422e-01 0.8585672 2.400534e-05
## 4 A0A087WSX6 0.1954577 8.004230e-01 0.9995812 9.999147e-01
## 5 A0A087WSY0 0.6857606 4.659894e-14 0.7476300 4.110339e-01
## 6 A0A087WSZ3 0.7162055 2.461940e-01 0.7884314 8.948293e-01
rownames(files) <- files$protein
files <- files[, 2:5]
cdf <- files
hart_essential <- read.table('~/gene_sets/Hart_CRISPR_uniprot.tab',
stringsAsFactors = FALSE,
header = TRUE, sep = "\t", comment.char = '', quote = '')
hart_essential <- hart_essential$Entry
# count of the number of genes int the Hart Essential gene set
sum(rownames(cdf) %in% hart_essential)
## [1] 587
hart_essential <- cdf[which(rownames(cdf) %in% hart_essential), ]
# those significantly enriched in COSMIC variants
cosmic_protein <- read.csv('cosmic_protein.csv', stringsAsFactors = FALSE)
hart_essential_depleted <- rownames(hart_essential)[rownames(hart_essential) %in% cosmic_protein[cosmic_protein$qval < 0.05 & cosmic_protein$cdf < 0.5, "protein"]]
hart_essential_enriched <- rownames(hart_essential)[rownames(hart_essential) %in% cosmic_protein[cosmic_protein$qval < 0.05 & cosmic_protein$cdf > 0.5, "protein"]]
Gene-set enrichment analysis (GSEA)
#______________________________________
# GSEA on own essential gene list
#gs <- GSA::GSA.read.gmt('/home/josefng/c5.bp.v7.1.symbols.gmt')
gobp <- gs$genesets
names(gobp) <- gs$geneset.names
rm(gs)
total_n <- length(unique(unlist(gobp)))# total number of genes covered
gsea_fisher <- function(query, pathways, N = total_n, fdr = 0.05) {
#overlapping analyses
#query: vector of genes on
# which to perform GSEA.
#db_query, db_pathway: obtained from the gmt le for gene sets with commands above
query_gene <- query
output = lapply(names(pathways), function (j){
pathway = pathways[[j]]
intersection = intersect(query_gene, pathway)
a = length(intersection)
c = length(pathway) - a
b = length(query_gene) - a
d = N - a - b - c
fish = fisher.test(matrix(c(a, b, c, d), 2, 2))
pval = fish$p.value
pathway_size = length(pathway)
query_size = length(query_gene)
intersection_size = a
enrichment = a * N / query_size / pathway_size
if (a == 0) {
intersection = "NA"
} else {
intersection = paste(intersection, collapse = ",")
}
ap = cbind(j, intersection_size, query_size, pathway_size,
N, pval, enrichment, intersection)
return(ap)
})
output = do.call("rbind", output)
output = as.data.frame(output)
output[, "pval"] = as.numeric(as.character(output[, "pval"]))
output[, "enrichment"] = as.numeric(as.character(output[, "enrichment"]))
qval = p.adjust(as.numeric(output[, "pval"]), method = "fdr")
output = cbind(output, qval)
output = output[order(output[, "qval"]), c(1:5, 7, 6, 9, 8)]
colnames(output)[1] <- 'pathway'
return(output)
}
essential_genes <- readLines('essential_genes/essential_genes_anno')
gsea_essential <- gsea_fisher(essential_genes, gobp)
# plot
GetGSEAGraph <- function(gsea_results, idents){
tb <- gsea_results
tb$significance <- -log10(tb$qval)
tb <- tb[order(tb$significance, decreasing = TRUE)[1:20], ]
#tb <- tb[1:15, ]
tb <- tb[order(tb$significance), ]
tb$pathway <- gsub("^GO_", "", tb$pathway)
tb$pathway <- gsub("_", " ", tb$pathway)
tb$pathway <- factor(tb$pathway, levels = tb$pathway)
tb$intersection <- sapply(as.character(tb$intersection), function(y){
top_genes <- unlist(strsplit(y, split = ","))
if(length(top_genes) > 4){
top_genes <- top_genes[1:4]
top_genes <- paste(top_genes, collapse = ", ")
top_genes <- paste0(top_genes, " etc.")
} else {
top_genes <- paste(top_genes, collapse = ", ")
}
top_genes
})
ggplot(data = tb) + geom_bar(aes(x = pathway, y = significance), stat = "identity") +
coord_flip() + ggtitle("") + theme_classic() +
ylab("-log10(q)") + xlab("") +
geom_text(aes(x = pathway, y = 0, label = intersection),
hjust = 0, col = "white", size= 3) +
# geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
scale_x_discrete(labels = function(x) stringr::str_wrap(x, width = 50))
}
GetGSEAGraph(gsea_essential, idents = NA)
Show levels of variant enrichment (surface/core/interface) in these genes:
#_________________________________________
# heatmap for enriched struct region level cdf for essential genes
essential_genes <- readLines('essential_genes/essential_genes_uniprot')
essential_genes <- union(essential_genes, rownames(hart_essential))
cosmic_region <- read.csv('cosmic_region.csv', stringsAsFactors = FALSE)
cosmic_region[which(cosmic_region$core_len == 0), "cdf_core"] <- NA
cosmic_region[which(cosmic_region$interact_len == 0), "cdf_interact"] <- NA
cosmic_region[which(cosmic_region$surf_len == 0), "cdf_surf"] <- NA
cosmic_region <- cosmic_region[which(cosmic_region$pval_core < 0.05 |
cosmic_region$pval_surf < 0.05 |
cosmic_region$pval_interact < 0.05), ]
cosmic_region <- cosmic_region[which(cosmic_region$uniprot %in% essential_genes), ]
cosmic_region <- cosmic_region[, c("uniprot", "cdf_surf", "cdf_core","cdf_interact")]
# map uniprot to gene symbol for plotting
annotateBioMart = function(l, nsplit = 400, GRCh = 38,
attributes = c('uniprotswissprot', 'hgnc_symbol'),
filters = "uniprotswissprot"){
if(GRCh == 37){
ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", GRCh = GRCh)
} else {
ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
}
ngroup = round(length(l) / nsplit + 0.5, digits = 0)
group = rep(1:ngroup, nsplit)[1:length(l)]
llist = split(l, group)
gene.list = lapply(1:length(llist), function(x){
tb = llist[[x]]
op = getBM(attributes = attributes, filters = filters, values = tb, mart = ensembl)
return(op)
})
gene = do.call("rbind", gene.list)
geneUnique = lapply(levels(as.factor(gene[, 1])), function(q){
subset = gene[gene[, 1] == q, ]
geneid = paste(subset[subset[, 2] != "", 2], collapse = ";")
return(cbind(q, geneid))
})
geneUnique = do.call("rbind", geneUnique)
colnames(geneUnique) = attributes
return(geneUnique)
}
gene_names <- annotateBioMart(cosmic_region$uniprot, GRCh = 37)
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `filter_()` is deprecated as of dplyr 0.7.0.
## Please use `filter()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Cache found
cosmic_region <- as.matrix(cosmic_region[, -1])
rownames(cosmic_region) <- gene_names[, 2]
colnames(cosmic_region) <- c("surface", "core", "interact")
cosmic_region <- cosmic_region[hclust(dist(cosmic_region, method = "euclidean"), method = "ward.D")$order, ]
gplots::heatmap.2(cosmic_region, main = "", density.info = "none",
dendrogram = "none", Rowv= NULL, Colv=NULL, na.color = "white",
breaks = seq(0, 1, length.out = 101), trace = "none",
margins = c(8, 8), key.xlab = "CDF", xlab = "", ylab = "",
col = c(colorRampPalette(c("navy", "yellow"), space = "rgb")(100)))
Count the number of substitutions changing AA types for:
# "essential genes": genes appear in any of the two essential genes lists
essential_genes <- readLines('essential_genes/essential_genes_uniprot')
essential_genes <- union(essential_genes, rownames(hart_essential))
#___________________________________
# first read COSMIC muts
cosmic_muts <- read.table('../ZoomvarBrowserApp/cosmic_variants.txt',
stringsAsFactors = FALSE, header = TRUE, sep = "\t")
cosmic_muts <- split(cosmic_muts, f = cosmic_muts$protein)
# Now parse wt_AA, mut_AA and change_type
mapChangeType <- function(tb, colNameWT, colNameMut)
{
# map whether a missense mutation involves changing AA type
aa <- list(aromatic = data.frame(aa = c("Y", "W", "F"),
type = rep("aromatic", 3),
stringsAsFactors = FALSE),
negative = data.frame(aa = c("E", "D"),
type = rep("negative", 2),
stringsAsFactors = FALSE),
positive = data.frame(aa = c("R", "K", "H"),
type = rep("positive", 3),
stringsAsFactors = FALSE),
polar = data.frame(aa = c("S", "T", "C", "P", "Q", "N"),
type = rep("polar", 6),
stringsAsFactors = FALSE),
nonpolar = data.frame(aa = c("G", "A", "V", "L", "M", "I"),
type = rep("nonpolar", 6),
stringsAsFactors = FALSE))
aa <- do.call("rbind", aa)
wt_col <- which(colnames(tb) == colNameWT)
mut_col <- which(colnames(tb) == colNameMut)
tb[, "wt_type"] <- apply(tb, 1, function(x){
type_wt <- aa[aa[, "aa"] == x[wt_col], "type"]
if(length(type_wt) == 0) return(NA) else return(type_wt)
})
tb[, "mut_type"] <- apply(tb, 1, function(x){
type_mut <- aa[aa[, "aa"] == x[mut_col], "type"]
if(length(type_mut) == 0) return(NA) else return(type_mut)
})
tb[, "change_type"] <- apply(tb, 1, function(x){
type_wt <- aa[aa[, "aa"] == x[wt_col], "type"]
type_mut <- aa[aa[, "aa"] == x[mut_col], "type"]
if(length(type_wt) == 0 | length(type_mut) == 0) return(NA)
if(type_wt == type_mut) return(FALSE) else return(TRUE)
})
return(tb)
}
cosmic_muts <- lapply(cosmic_muts, function(tb){
tb$wtAA <- substr(tb$amino_acids, 1, 1)
tb$mutAA <- substr(tb$amino_acids, 3, 3)
mapChangeType(tb, "wtAA", "mutAA")
})
# all together
cols <- colnames(cosmic_muts[[1]])
cosmic_muts <- do.call("rbind", lapply(cosmic_muts, function(x) x[, cols]))
#___________________________________
# calculate change_type by struct
percentage <- function(data, indices, region) {
d <- data[indices,] # allows boot to select sample
d <- d[which(grepl(region, d[, "region"])), ]
cat(paste(nrow(d), ".."))
if(nrow(d) > 0){
return( nrow(d[d[, "change_type"] == TRUE, ]) / nrow(d) )
} else return(NA)
}
# calculation of proportions in all cancerTypes
tb <- lapply(c("surf", "core", "interact"), function(r){
print(r)
proportion_boot <- boot::boot(data=cosmic_muts, statistic=percentage, R=1000, region=r )
#proportion_boot$t <- proportion_boot$t[which(!is.na(proportion_boot$t[, 1])), ]
proportion_CI <- boot::boot.ci(proportion_boot, conf = 0.95, type = "perc")
proportion <- proportion_CI$t0
proportion_CI <- as.numeric(proportion_CI$percent[1, 4:5])
out <- data.frame(region = r, proportion = proportion,
lowerci = proportion_CI[1], upperci = proportion_CI[2])
return( out )
})
tb <- do.call("rbind", tb)
null_sample <- lapply(1:1000, function(i){
set.seed(i)
if(i %in% seq(1, 1000, by = 50)) print(i)
struct_randomized <- cosmic_muts[sample(1:nrow(cosmic_muts), nrow(cosmic_muts)), "region"]
d <- cosmic_muts
d[, "region"] <- struct_randomized
proportion_surf <- percentage(d, indices = 1:nrow(d), region = "surf")
proportion_core <- percentage(d, indices = 1:nrow(d), region = "core")
proportion_interact <- percentage(d, indices = 1:nrow(d), region = "interact")
out <- data.frame(surface = proportion_surf, core = proportion_core,
interact = proportion_interact)
})
null_sample <- do.call("rbind", null_sample)
saveRDS(null_sample, "Cosmic_change_type_null.rds")
tb[, "pval"] <- rep(NA, nrow(tb))
tb[tb[, "region"] == "surf", "pval"] <-
sum(tb[tb[, "region"] == "surf", "proportion"] > null_sample[, "surface"]) / 1000
tb[tb[, "region"] == "core", "pval"] <-
sum(tb[tb[, "region"] == "core", "proportion"] > null_sample[, "core"]) / 1000
tb[tb[, "region"] == "interact", "pval"] <-
sum(tb[tb[, "region"] == "interact", "proportion"] > null_sample[, "interact"]) /
1000
change_type <- tb
rm(tb)
save('change_type', 'null_sample', file='Cosmic_change_type_new.RData')
cosmic_muts_essential <- cosmic_muts[cosmic_muts$protein %in% essential_genes, ]
#___________________________________
# change_type statistics for cosmic essential genes only
# calculation of proportions in all cancerTypes
tb <- lapply(c("surf", "core", "interact"), function(r){
print(r)
proportion_boot <- boot::boot(data=cosmic_muts_essential, statistic=percentage, R=1000, region=r )
#proportion_boot$t <- proportion_boot$t[which(!is.na(proportion_boot$t[, 1])), ]
proportion_CI <- boot::boot.ci(proportion_boot, conf = 0.95, type = "perc")
proportion <- proportion_CI$t0
proportion_CI <- as.numeric(proportion_CI$percent[1, 4:5])
out <- data.frame(region = r, proportion = proportion,
lowerci = proportion_CI[1], upperci = proportion_CI[2])
return( out )
})
tb <- do.call("rbind", tb)
null_sample <- lapply(1:1000, function(i){
set.seed(i)
if(i %in% seq(1, 1000, by = 50)) print(i)
struct_randomized <- cosmic_muts_essential[sample(1:nrow(cosmic_muts_essential), nrow(cosmic_muts_essential)), "region"]
d <- cosmic_muts_essential
d[, "region"] <- struct_randomized
proportion_surf <- percentage(d, indices = 1:nrow(d), region = "surf")
proportion_core <- percentage(d, indices = 1:nrow(d), region = "core")
proportion_interact <- percentage(d, indices = 1:nrow(d), region = "interact")
out <- data.frame(surface = proportion_surf, core = proportion_core,
interact = proportion_interact)
})
null_sample <- do.call("rbind", null_sample)
saveRDS(null_sample, "Cosmic_EssentialGenes_change_type_null.rds")
tb[, "pval"] <- rep(NA, nrow(tb))
tb[tb[, "region"] == "surf", "pval"] <- sum(tb[tb[, "region"] == "surf", "proportion"] > null_sample[, "surface"]) / 1000
tb[tb[, "region"] == "core", "pval"] <- sum(tb[tb[, "region"] == "core", "proportion"] > null_sample[, "core"]) / 1000
tb[tb[, "region"] == "interact", "pval"] <- sum(tb[tb[, "region"] == "interact", "proportion"] > null_sample[, "interact"]) / 1000
change_type <- tb
rm(tb)
save('change_type', 'null_sample', file='Cosmic_EssentialGenes_change_type_new.RData')
cosmic_muts_essential <- cosmic_muts[cosmic_muts$protein %in% essential_genes, ]
#___________________________________
# visualise transition matrix
cosmic_muts_essential <- cosmic_muts_essential[which(cosmic_muts_essential$region != "None"), ]
cosmic_muts_essential$mut_type <- factor(cosmic_muts_essential$mut_type,
levels = c("aromatic", "negative", "positive", "polar", "nonpolar"))
cosmic_muts_essential$wt_type <- factor(cosmic_muts_essential$wt_type,
levels = c("aromatic", "negative", "positive", "polar", "nonpolar"))
cosmic_muts_essential$region <- factor(cosmic_muts_essential$region, levels = c("surf", "core", "interact"),
labels = c("surface", "core", "interact"))
trans_all <- plyr::ddply(cosmic_muts_essential, c("region", "mut_type", "wt_type"), nrow, .drop = FALSE)
trans_all$V1 <- apply(trans_all, MARGIN = 1, function(x){
as.numeric(x[4]) / sum(trans_all[trans_all$region == x[1], 4])
})
ggplot(trans_all) +
geom_tile(aes(x = wt_type, y = mut_type, fill = V1)) +
geom_text(aes(x = wt_type, y = mut_type, label = scales::percent(V1, accuracy = 0.1)), colour = "grey80") +
scale_fill_gradient(low ="navy",high = "yellow") +
facet_wrap(~ region, ncol = 2, scales = "free") + cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
tcga <- readRDS('TCGA_allmuts_withSig_FINAL.rds')
single <- readRDS('TCGA_singleAA_withSig_FINAL.rds')
#____________________________________
# change_type
percentage <- function(data, indices, region) {
d <- data[indices,] # allows boot to select sample
d <- d[which(grepl(region, d[, "struct_cat"])), ]
cat(paste(nrow(d), ".."))
if(nrow(d) > 0){
return( nrow(d[d[, "change_type"] == TRUE, ]) / nrow(d) )
} else return(NA)
}
# calculation of proportions in all cancerTypes
tb <- lapply(c("surface", "core", "interact"), function(r){
print(r)
proportion_boot <- boot::boot(data=single, statistic=percentage, R=1000, region=r )
#proportion_boot$t <- proportion_boot$t[which(!is.na(proportion_boot$t[, 1])), ]
proportion_CI <- boot::boot.ci(proportion_boot, conf = 0.95, type = "perc")
proportion <- proportion_CI$t0
proportion_CI <- as.numeric(proportion_CI$percent[1, 4:5])
out <- data.frame(region = r, proportion = proportion,
lowerci = proportion_CI[1], upperci = proportion_CI[2])
return( out )
})
tb <- do.call("rbind", tb)
null_sample <- lapply(1:1000, function(i){
set.seed(i)
if(i %in% seq(1, 1000, by = 50)) print(i)
struct_randomized <- single[sample(1:nrow(single), nrow(single)), "struct_cat"]
d <- single
d[, "struct_cat"] <- struct_randomized
proportion_surf <- percentage(d, indices = 1:nrow(d), region = "surface")
proportion_core <- percentage(d, indices = 1:nrow(d), region = "core")
proportion_interact <- percentage(d, indices = 1:nrow(d), region = "interact")
out <- data.frame(surface = proportion_surf, core = proportion_core,
interact = proportion_interact)
})
null_sample <- do.call("rbind", null_sample)
saveRDS(null_sample, "TCGA_change_type_null.rds")
tb[, "pval"] <- rep(NA, nrow(tb))
tb[tb[, "region"] == "surface", "pval"] <- sum(tb[tb[, "region"] == "surface", "proportion"] > null_sample[, "surface"]) / 1000
tb[tb[, "region"] == "core", "pval"] <- sum(tb[tb[, "region"] == "core", "proportion"] > null_sample[, "core"]) / 1000
tb[tb[, "region"] == "interact", "pval"] <- sum(tb[tb[, "region"] == "interact", "proportion"] > null_sample[, "interact"]) / 1000
change_type <- tb
rm(tb)
save('change_type', 'null_sample', file='TCGA_change_type_new.RData')
#___________________________________
# change_type statistics for tcga essential genes only
# calculation of proportions in all cancerTypes
single <- single[single$SwissProt_acc_Id %in% essential_genes, ]
tb <- lapply(c("surface", "core", "interact"), function(r){
print(r)
proportion_boot <- boot::boot(data=single, statistic=percentage, R=1000, region=r )
#proportion_boot$t <- proportion_boot$t[which(!is.na(proportion_boot$t[, 1])), ]
proportion_CI <- boot::boot.ci(proportion_boot, conf = 0.95, type = "perc")
proportion <- proportion_CI$t0
proportion_CI <- as.numeric(proportion_CI$percent[1, 4:5])
out <- data.frame(region = r, proportion = proportion,
lowerci = proportion_CI[1], upperci = proportion_CI[2])
return( out )
})
tb <- do.call("rbind", tb)
null_sample <- lapply(1:1000, function(i){
set.seed(i)
if(i %in% seq(1, 1000, by = 50)) print(i)
struct_randomized <- single[sample(1:nrow(single), nrow(single)), "struct_cat"]
d <- single
d[, "struct_cat"] <- struct_randomized
proportion_surf <- percentage(d, indices = 1:nrow(d), region = "surface")
proportion_core <- percentage(d, indices = 1:nrow(d), region = "core")
proportion_interact <- percentage(d, indices = 1:nrow(d), region = "interact")
out <- data.frame(surface = proportion_surf, core = proportion_core,
interact = proportion_interact)
})
null_sample <- do.call("rbind", null_sample)
saveRDS(null_sample, "TCGA_EssentialGenes_change_type_null.rds")
tb[, "pval"] <- rep(NA, nrow(tb))
tb[tb[, "region"] == "surface", "pval"] <- sum(tb[tb[, "region"] == "surface", "proportion"] > null_sample[, "surface"]) / 1000
tb[tb[, "region"] == "core", "pval"] <- sum(tb[tb[, "region"] == "core", "proportion"] > null_sample[, "core"]) / 1000
tb[tb[, "region"] == "interact", "pval"] <- sum(tb[tb[, "region"] == "interact", "proportion"] > null_sample[, "interact"]) / 1000
change_type <- tb
rm(tb)
save('change_type', 'null_sample', file='TCGA_EssentialGenes_change_type_new.RData')
#________________________________
# plot change_type proportion
load('TCGA_change_type_new.RData')
change_type$cohort <- "TCGA_all"
tcga_all <- change_type
load('TCGA_EssentialGenes_change_type_new.RData')
change_type$cohort <- "TCGA_essential"
tcga_essential <- change_type
load('Cosmic_change_type_new.RData')
change_type$cohort <- "COSMIC_all"
cosmic_all <- change_type
cosmic_all$region <- as.character(cosmic_all$region)
cosmic_all[cosmic_all$region == "surf", "region"] <- "surface"
load('Cosmic_EssentialGenes_change_type_new.RData')
change_type$cohort <- "COSMIC_essential"
cosmic_essential <- change_type
cosmic_essential$region <- as.character(cosmic_essential$region)
cosmic_essential[cosmic_essential$region == "surf", "region"] <- "surface"
tb <- rbind(tcga_all, tcga_essential, cosmic_all, cosmic_essential)
ggplot(tb, aes(y = proportion, x = region)) + xlab("") +
ylab("% missense mutations\nchanging AA type") + scale_y_continuous(labels = scales::percent) +
geom_point() + geom_errorbar(aes(ymin = lowerci, ymax = upperci), width = 0) +
facet_wrap(~ cohort, nrow = 1) + cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# print table
tb
## region proportion lowerci upperci pval cohort
## 1 surface 0.5685880 0.5671886 0.5700620 1.000 TCGA_all
## 2 core 0.4687110 0.4662064 0.4714523 0.000 TCGA_all
## 3 interact 0.5971413 0.5910505 0.6031485 1.000 TCGA_all
## 4 surface 0.5782745 0.5694228 0.5869805 0.949 TCGA_essential
## 5 core 0.4477685 0.4318792 0.4631942 0.000 TCGA_essential
## 6 interact 0.5833333 0.5576202 0.6102173 0.813 TCGA_essential
## 7 surface 0.7141629 0.7128856 0.7154232 1.000 COSMIC_all
## 8 core 0.6383452 0.6359286 0.6406961 0.000 COSMIC_all
## 9 interact 0.7065232 0.7004495 0.7128175 1.000 COSMIC_all
## 10 surface 0.7057989 0.6988804 0.7134098 1.000 COSMIC_essential
## 11 core 0.5936051 0.5786205 0.6094262 0.000 COSMIC_essential
## 12 interact 0.7133758 0.6898032 0.7364533 0.995 COSMIC_essential
Variant with structural coverage submitted for estimation of \(\Delta \Delta G\) using the mCSM web-server.
Cross-map with structural region (surface/core/interface) and compare.
# take Missense mutations
tcga <- readRDS('TCGA_singleAA_withSig_FINAL.rds')
subset <- unique(tcga[grepl("Missense", tcga$Variant_Classification), ])
subset$Position <- sapply(subset$Protein_Change, function(x){
substr(x, 4, nchar(x) - 1)
})
subset$Position <- as.numeric(subset$Position)
## Warning: NAs introduced by coercion
subset <- subset[which(!is.na(subset$Position)), ]
#______________________________
# merge with zoomvar annotations
zoomvar <- read.csv('/media/josefng/My_Passport/ZoomVar_db_202006/TCGA_Missense_pos_mapping',
header = TRUE, stringsAsFactors = FALSE, quote = "'",
na.strings = "NULL")
subset_mapped <- merge(subset, zoomvar[, 1:8],
by.x = c("SwissProt_acc_Id", "Position"),
by.y = c("UNIPROT", "UNIPROT_POS"),
all.x = TRUE, all.y = FALSE, sort = FALSE)
subset_mapped <- subset_mapped[, c("Hugo_Symbol", "SwissProt_acc_Id",
"Protein_Change", "origAA", "mutAA", "struct_cat",
"STRUCTURE_POS", "STRUCTURE_AA", "EVALUE",
"IDENTITY", "PDB", "CHAIN")]
# only those variants mapped to structural region
subset_mapped <- subset_mapped[which(subset_mapped$struct_cat != "None"), ]
# only those with same amino acid in structure and in the protein
subset_mapped <- subset_mapped[which(subset_mapped$origAA == subset_mapped$STRUCTURE_AA), ]
# consolidate entries so that only PDB with top E-value match is retained
getTopPDB <- function(pdb, chain, evalue, identity, pos, aa)
{
tb <- data.frame("PDB" = pdb, "CHAIN" = chain, "EVALUE" = evalue, "ID" = identity,
"POS" = pos, "AA" = aa, stringsAsFactors = FALSE)
tb <- tb[order(tb$EVALUE, tb$ID, decreasing = c(FALSE, TRUE)), ]
paste(tb[1, c("PDB", "CHAIN", "POS")], collapse = ";")
}
subset_mapped2 <- ddply(subset_mapped,
.variables = c("Hugo_Symbol", "SwissProt_acc_Id",
"Protein_Change", "origAA", "mutAA",
"struct_cat"),
summarise,
pdb_match = getTopPDB(PDB, CHAIN, EVALUE, IDENTITY,
STRUCTURE_POS, STRUCTURE_AA))
subset_mapped2$Mutation <- apply(subset_mapped2[, c("origAA", "mutAA", "pdb_match")],
1, function(x){
paste(x[1], unlist(strsplit(x[3], split = ";"))[3], x[2], sep = "")
})
subset_mapped2$PDB <- sapply(subset_mapped2$pdb_match, function(x){
unlist(strsplit(x, split = ";"))[1]
})
subset_mapped2$CHAIN <- sapply(subset_mapped2$pdb_match, function(x){
unlist(strsplit(x, split = ";"))[2]
})
subset_mapped2$pos <- sapply(subset_mapped2$pdb_match, function(x){
unlist(strsplit(x, split = ";"))[3]
})
#______________________________________
# map back mcsm results
mCSM <- read.csv('mCSM_PDBs/TCGA_mCSM_results_unique.csv', stringsAsFactors = FALSE)
subset_mapped3 <- merge(subset_mapped2, mCSM,
by.x = c("PDB", "CHAIN", "pos", "mutAA"),
by.y = c("pdb", "chain", "pos", "mut"),
all.x = FALSE, all.y = TRUE, sort = FALSE)
subset_mapped3$struct_cat <- factor(subset_mapped3$struct_cat,
levels = c("surface", "core", "interact"))
subset_mapped3 <- subset_mapped3[ which(!is.na(subset_mapped3$struct_cat)), ]
g1 <- ggplot(subset_mapped3, aes(x = struct_cat, y = ddg)) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_violin() + xlab("") + ylab("Change in protein stability") +
cowplot::theme_cowplot() + theme(legend.position = "bottom")
# p-value (Wilcoxon)
mCSM_p <- data.frame(
comparison = c("core-vs-surface", "core-vs-interact",
"surface-vs-interact"),
p = c(
wilcox.test(subset_mapped3[subset_mapped3$struct_cat == "core", "ddg"],
subset_mapped3[subset_mapped3$struct_cat == "surface", "ddg"])$p.value,
wilcox.test(subset_mapped3[subset_mapped3$struct_cat == "core", "ddg"],
subset_mapped3[subset_mapped3$struct_cat == "interact", "ddg"])$p.value,
wilcox.test(subset_mapped3[subset_mapped3$struct_cat == "interact", "ddg"],
subset_mapped3[subset_mapped3$struct_cat == "surface", "ddg"])$p.value
), stringsAsFactors = FALSE)
mCSM_p
## comparison p
## 1 core-vs-surface 0.000000e+00
## 2 core-vs-interact 0.000000e+00
## 3 surface-vs-interact 5.858105e-21
# effect size (Cohen's d)
mCSM_cohen <- data.frame(
comparison = c("core-vs-surface", "core-vs-interact",
"surface-vs-interact"),
cohens_d = c(
effsize::cohen.d(subset_mapped3[subset_mapped3$struct_cat == "core", "ddg"],
subset_mapped3[subset_mapped3$struct_cat == "surface", "ddg"])$estimate,
effsize::cohen.d(subset_mapped3[subset_mapped3$struct_cat == "core", "ddg"],
subset_mapped3[subset_mapped3$struct_cat == "interact", "ddg"])$estimate,
effsize::cohen.d(subset_mapped3[subset_mapped3$struct_cat == "interact", "ddg"],
subset_mapped3[subset_mapped3$struct_cat == "surface", "ddg"])$estimate
), stringsAsFactors = FALSE)
mCSM_cohen
## comparison cohens_d
## 1 core-vs-surface -0.8131973
## 2 core-vs-interact -0.5858697
## 3 surface-vs-interact -0.1603988
#______________________________________
# map back rhapsody results
rhapsody <- readRDS('/media/josefng/My_Passport/ZoomvarTCGA_rhapsody/ZoomvarTCGA_rhapsody_results_AllPossibleMissense_mCSMoverlapped_SNVmutsigs.rds')
rhapsody$pdbcode <- sapply(rhapsody$pdb, function(x){
gsub(".pdb", "", unlist(strsplit(x, split = " "))[1], fixed = TRUE)
})
rhapsody$chain <- sapply(rhapsody$pdb, function(x){
unlist(strsplit(x, split = " "))[2]
})
rhapsody$pos <- sapply(rhapsody$pdb, function(x){
unlist(strsplit(x, split = " "))[3]
})
rhapsody$mut <- sapply(rhapsody$SAV, function(x){
unlist(strsplit(x, split = " "))[4]
})
subset_mapped4 <- merge(subset_mapped2, rhapsody[, c("pdbcode", "chain", "pos",
"mut", "rhapsody_prob")],
by.x = c("PDB", "CHAIN", "pos", "mutAA"),
by.y = c("pdbcode", "chain", "pos", "mut"), sort = FALSE)
subset_mapped4$struct_cat <- factor(subset_mapped4$struct_cat,
levels = c("surface", "core", "interact"))
subset_mapped4 <- subset_mapped4[ which(!is.na(subset_mapped4$struct_cat)), ]
subset_mapped4 <- subset_mapped4[ which(!is.na(subset_mapped4$rhapsody_prob)), ]
g2 <- ggplot(subset_mapped4, aes(x = struct_cat, y = rhapsody_prob)) +
geom_hline(yintercept = 0.5, linetype = "dashed") +
geom_violin() + xlab("") + ylab("1 - Pathogenic probability (rhapsody)") +
cowplot::theme_cowplot() + theme(legend.position = "bottom")
# p-value (Wilcoxon)
rhapsody_p <- data.frame(
comparison = c("core-vs-surface", "core-vs-interact",
"surface-vs-interact"),
p = c(
wilcox.test(subset_mapped4[subset_mapped4$struct_cat == "core",
"rhapsody_prob"],
subset_mapped4[subset_mapped4$struct_cat == "surface",
"rhapsody_prob"])$p.value,
wilcox.test(subset_mapped4[subset_mapped4$struct_cat == "core",
"rhapsody_prob"],
subset_mapped4[subset_mapped4$struct_cat == "interact",
"rhapsody_prob"])$p.value,
wilcox.test(subset_mapped4[subset_mapped4$struct_cat == "interact",
"rhapsody_prob"],
subset_mapped4[subset_mapped4$struct_cat == "surface",
"rhapsody_prob"])$p.value
), stringsAsFactors = FALSE)
rhapsody_p
## comparison p
## 1 core-vs-surface 0.000000e+00
## 2 core-vs-interact 3.710507e-25
## 3 surface-vs-interact 5.595391e-01
# effect size (Cohen's d)
rhapsody_cohen <- data.frame(
comparison = c("core-vs-surface", "core-vs-interact",
"surface-vs-interact"),
cohens_d = c(
effsize::cohen.d(subset_mapped4[subset_mapped4$struct_cat == "core",
"rhapsody_prob"],
subset_mapped4[subset_mapped4$struct_cat == "surface",
"rhapsody_prob"])$estimate,
effsize::cohen.d(subset_mapped4[subset_mapped4$struct_cat == "core",
"rhapsody_prob"],
subset_mapped4[subset_mapped4$struct_cat == "interact",
"rhapsody_prob"])$estimate,
effsize::cohen.d(subset_mapped4[subset_mapped4$struct_cat == "interact",
"rhapsody_prob"],
subset_mapped4[subset_mapped4$struct_cat == "surface",
"rhapsody_prob"])$estimate
), stringsAsFactors = FALSE)
rhapsody_cohen
## comparison cohens_d
## 1 core-vs-surface -0.4506119
## 2 core-vs-interact -0.4528528
## 3 surface-vs-interact -0.0147647
Plotting both mCSM and rhapsody results together:
cowplot::plot_grid(
g1 + ggtitle("mCSM"), g2 + ggtitle("rhapsody"), nrow = 1, axis = "tb", align = "h"
)
Take the subset that maps to essential genes; do they show a different pattern?
# mCSM
subset_mapped3 <- list(subset_mapped3,
subset_mapped3[which(subset_mapped3$SwissProt_acc_Id %in%
essential_genes), ])
subset_mapped3[[1]]$essential <- "all"
subset_mapped3[[2]]$essential <- "essential genes"
subset_mapped3 <- do.call("rbind", subset_mapped3)
g1 <- ggplot(subset_mapped3, aes(x = struct_cat, y = ddg, fill = essential)) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_violin() + xlab("") + ylab("Change in protein stability") +
scale_fill_manual(values = c("all" = "white", "essential genes" = "red"), name = "") +
cowplot::theme_cowplot() + theme(legend.position = "bottom")
#______________________________________
# rhapsody
subset_mapped4 <- list(subset_mapped4,
subset_mapped4[which(subset_mapped4$SwissProt_acc_Id %in%
essential_genes), ])
subset_mapped4[[1]]$essential <- "all"
subset_mapped4[[2]]$essential <- "essential genes"
subset_mapped4 <- do.call("rbind", subset_mapped4)
g2 <- ggplot(subset_mapped4, aes(x = struct_cat, y = rhapsody_prob, fill = essential)) +
geom_hline(yintercept = 0.5, linetype = "dashed") +
geom_violin() + xlab("") + ylab("1 - Pathogenic probability (rhapsody)") +
scale_fill_manual(values = c("all" = "white", "essential genes" = "red"), name = "") +
cowplot::theme_cowplot() + theme(legend.position = "bottom")
cowplot::plot_grid(
g1 + ggtitle("mCSM"), g2 + ggtitle("rhapsody"), nrow = 1, axis = "tb", align = "h"
)
No difference in the essential gene subset.
Does clonal mutations (ie those frequent across cells in a given tumour) tend to be on the surface/core/interface?
# calculate cancer cell fraction of variants
# variant cancer cell fraction drivers vs passengers and between seq contexts
tcga <- readRDS('TCGA_singleAA_withSig_FINAL.rds')
# vaf data from VarScan2
vaf <- list.files(path = '/media/josefng/My_Passport/ZoomvarTCGA/varFreq',
pattern = ".txt", full.names = TRUE)
vaf <- lapply(vaf, read.table, stringsAsFactor = FALSE, header = FALSE)
vaf <- do.call("rbind", vaf)
vaf <- vaf[which(vaf[, 2] %in% paste0("chr", c(1:22, "X", "Y"))), ]
# vaf[, 4] <- vaf[, 4] / 100
colnames(vaf) <- c("fileID", "chr", "pos", "all", "alt", "vaf")
# map fileID with tcga case name
caseID <- read.csv('/home/josefng/Documents/ZoomvarTCGA/TCGA_fileID_map.csv',
stringsAsFactors = FALSE, header = FALSE)
colnames(caseID) <- c("fileID", "caseID")
vaf <- merge(vaf, caseID, by = "fileID", all.x = TRUE, all.y = FALSE, sort = FALSE)
vaf <- vaf[!is.na(vaf$caseID), ]
vaf_lo <- GenomicRanges::GRanges(vaf$chr, ranges = IRanges::IRanges(vaf$pos, width = 1),
caseID = vaf$caseID, read_depth = vaf$all, alt = vaf$alt)
# convert to GRCh37
library(rtracklayer)
path = system.file(package="liftOver", "extdata", "hg38ToHg19.over.chain")
ch = import.chain(path)
seqlevelsStyle(vaf_lo) = "UCSC" # necessary
vaf = liftOver(vaf_lo, ch)
vaf <- as.data.frame(vaf)
rm(vaf_lo)
vaf <- vaf[, c("seqnames", "start", "caseID", "read_depth", "alt")]
colnames(vaf) <- c("chr", "pos", "caseID", "read_depth", "alt_count")
gc()
tcga$sample <- substr(tcga$Tumor_Sample_Barcode, 1, 12)
tcga <- merge(tcga, vaf, by.x = c("sample", "Chromosome", "Start_Position"),
by.y = c("caseID", "chr", "pos"), all.x = TRUE, all.y = FALSE,
sort = FALSE)
# map total copy number
# CN data from COSMIC v92
cn <- read.table('/media/josefng/My_Passport/ZoomvarTCGA/Cosmic_V92_CompleteCNA_TCGA.tsv',
header = TRUE, stringsAsFactors = FALSE, sep = "\t")
cn$Chromosome <- paste0("chr", cn$Chromosome)
tcga$sample_detailed <- sapply(tcga$Tumor_Sample_Barcode, function(x) substr(x, 1, 15))
tcga$copy_number <- apply(tcga[, c("sample_detailed", "Chromosome", "Start_Position",
"End_position")], MARGIN = 1, function(x){
m <- cn[which(cn$SAMPLE_NAME == as.character(x[1]) &
cn$Chromosome == as.character(x[2]) &
cn$G_Start <= as.numeric(x[3]) &
cn$G_Stop >= as.numeric(x[4])), ]
if(nrow(m) > 0){
return(median(m[, "TOTAL_CN"]))
} else return(2)
})
# purity from ASCAT from COSMIC v92
purity <- read.table('/media/josefng/My_Passport/ZoomvarTCGA/Cosmic_V92_ascat_acf_ploidy.tsv',
header = TRUE, stringsAsFactors = FALSE, sep = "\t")
purity <- purity[, 1:3]
colnames(purity) <- c("abbrev", "caseID", "purity")
purity[, 2] <- gsub(".", "-", purity[, 2], fixed = TRUE)
purity[, 2] <- substr(purity[, 2], 1, 15)
purity <- ddply(purity, "caseID", summarise, purity = median(purity))
tcga <- merge(tcga, purity, by.x = "sample_detailed", by.y = "caseID",
all.x = TRUE, all.y = FALSE, sort = FALSE)
#___________________________________
# calculate cancer cell fraction
ccf_calc = function(chr, cn, purity, alt_count, read_depth){
if(any(is.na(purity), is.na(alt_count), is.na(read_depth), is.na(cn))) return(NA)
if(chr %in% c("chrY", "chrX")) return(NA)
grid = seq(from = 0.01, to = 1, by = 0.01)
f_c = purity * grid / (2*(1 - purity) + cn*purity)
p_c = dbinom(x = alt_count, size = read_depth, prob = f_c)
if (is.na(as.numeric(sum(p_c))) == FALSE & sum(p_c) != 0) {
norm_pc = p_c / sum(p_c)
return( sum(norm_pc * grid) )
} else return(NA)
}
tcga$ccf <- apply(tcga[, c("Chromosome", "copy_number", "purity",
"alt_count", "read_depth")],
MARGIN = 1, function(x){
ccf_calc(x[1], as.numeric(x[2]), as.numeric(x[3]),
as.numeric(x[4]), as.numeric(x[5]))
})
# CGC
cgc <- read.table('CGC_COSMICv86_20181030.tsv',
header = TRUE, stringsAsFactors = FALSE, sep = "\t")
cgc <- cgc[which(cgc$Tier == 1),]
tcga <- tcga[which(tcga$origAA != tcga$mutAA), ]
tcga <- tcga[which(!is.na(tcga$ccf)), ]
tcga$cgc <- sapply(tcga$Hugo_Symbol, function(x) x %in% cgc$Gene.Symbol)
tcga$ccf_bin <- ggplot2::cut_number(tcga$ccf, n = 10)
saveRDS(tcga, 'TCGA_singleAA_withSig_ccf_FINAL.rds')
percentage <- function(data, indices, bin) {
d <- data[indices,] # allows boot to select sample
d <- d[which(d[, "ccf_bin"] == bin), ]
cat(paste(nrow(d), ".."))
if(nrow(d) > 0){
return( nrow(d[d[, "cgc"] == TRUE, ]) / nrow(d) )
} else return(NA)
}
# calculation of proportions CGC for each ccf bin
tb <- lapply(c("surface", "core", "interact"), function(r){
print(r)
b <- tcga[which(tcga$struct_cat == r), ]
do.call("rbind", lapply(levels(tcga$ccf_bin), function(x){
proportion_boot <- boot::boot(data=b, statistic=percentage, R=1000, bin=x )
#proportion_boot$t <- proportion_boot$t[which(!is.na(proportion_boot$t[, 1])), ]
proportion_CI <- boot::boot.ci(proportion_boot, conf = 0.95, type = "perc")
proportion <- proportion_CI$t0
proportion_CI <- as.numeric(proportion_CI$percent[1, 4:5])
out <- data.frame(region = r, proportion = proportion,
lowerci = proportion_CI[1], upperci = proportion_CI[2])
return( out )
}))
})
tb <- do.call("rbind", tb)
tb$ccf_bin <- rep(levels(tcga$ccf_bin), 3)
tb$ccf_bin <- factor(tb$ccf_bin, levels = levels(tcga$ccf_bin))
saveRDS(tb, 'TCGA_ccf_bin_stats.rds')
tcga <- readRDS('TCGA_singleAA_withSig_ccf_FINAL.rds')
tb <- readRDS('TCGA_ccf_bin_stats.rds')
cowplot::plot_grid(
ggplot(tb, aes(x = ccf_bin, y = proportion, shape = region, group = region)) +
geom_point(size = 2.5) + geom_line() + xlab("") +
geom_errorbar(aes(ymin = lowerci, ymax = upperci), width = 0) +
cowplot::theme_cowplot() + scale_y_continuous(labels = scales::percent,
name = "% in driver genes") +
theme(axis.text.x = element_blank()),
ggplot(tcga[which(tcga$struct_cat %in% c("surface", "core", "interact")), ],
aes(x = ccf_bin, y = ccf)) +
geom_boxplot(outlier.shape = NA) +
scale_y_continuous(labels = scales::percent,
name = "Estimated % cells\nharbouring variant") +
xlab("Bins") + cowplot::theme_cowplot() + theme(axis.text.x = element_blank()),
nrow = 2, align = "v", axis = "lr", rel_heights = c(2, 1)
)
# trend test (Jonckheere-Terpstra Test)
tcga$ccf_bin <- factor(tcga$ccf_bin, levels = levels(tcga$ccf_bin), ordered = TRUE)
ccf_pval <- data.frame(
regions = c("surface", "core", "interact"),
pvalue = c(
DescTools::JonckheereTerpstraTest(
as.numeric(tcga[which(tcga$struct_cat == "surface"), "cgc"]),
tcga[which(tcga$struct_cat == "surface"),
"ccf_bin"], nperm = 1000,
alternative = "increasing")$p.value,
DescTools::JonckheereTerpstraTest(
as.numeric(tcga[which(tcga$struct_cat == "core"), "cgc"]),
tcga[which(tcga$struct_cat == "core"),
"ccf_bin"], nperm = 1000,
alternative = "increasing")$p.value,
DescTools::JonckheereTerpstraTest(
as.numeric(tcga[which(tcga$struct_cat == "interact"), "cgc"]),
tcga[which(tcga$struct_cat == "interact"),
"ccf_bin"], nperm = 1000,
alternative = "increasing")$p.value
)
)
## Registered S3 method overwritten by 'DescTools':
## method from
## reorder.factor gdata
ccf_pval
## regions pvalue
## 1 surface 0.001
## 2 core 0.001
## 3 interact 0.001