Using experimental (MAVE - Multiplex assays of variant effect, see here) deep mutational scanning data and computational prediction of mutational impact we compare the impact SNVs occuring in different mutational contexts exert on the protein.
Importantly, since these experimental/computational data evaluates impact of every possible amino acid substitution (i.e. for each residue of the protein, mutate to any other 19 amino acids), this eliminates the limitation posed by analysing mutations observed in individuals/tumours which are likely to be selectively depleted of extremely damaging variants.
Considered two cancer related human proteins with available data on MAVEdb:
Tested mCSM which we used to study TCGA variants and compare \(\Delta \Delta G\) of variants in protein core vs surface - prohibitively slow to run on 1,000s of proteins.
Used here these methods:
Running rhapsody would automatically give PolyPhen2 and EVmutation results wherever available so these two methods are also considered here.
Both experimental and computational data types correspond to a ‘saturation mutagenesis’ screen where for a given protein, the substitution of every amino acid position to any other 19 amino acids are considered for their possible mutational impact.
To compare mutational impact at the protein level for SNVs occurring in different mutational contexts, we need to, for a given protein:
\[ \text{Mutational impact} = \beta_{\text{signature}}\text{signature} \]
where ‘signature’ refers to each of the 96 mutational context. We then assess the magnitude and significance of the coefficient \(\beta_{\text{signature}}\) which would tell us whether a given mutational context is associated with variants of more/less damaging impact.
The file “genetic_code_sav.R”" contains function that performs steps (1) and (2) for any given Ensembl protein. We used Ensembl v86 and the GRCh38 genome.
source('genetic_code_sav.R') # code to get all possible SAVs from SNVs etc.
# sort out which single-nucleotide substitution belongs to which signature
# mapping table of AA substitution and SNV
getPossibleMuts <- function(prot_id){
cat(paste0(prot_id, " ...\n"))
codons <- mapCodon(prot_id, EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86,
BSgenome.Hsapiens.NCBI.GRCh38::Hsapiens)
codons_muts <- mapMut(codons, getPossibleMissenseSNVs())
codons_mutsigs <- mapMutSig(prot_id, nrow(codons), codons_muts,
EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86,
BSgenome.Hsapiens.NCBI.GRCh38::Hsapiens)
codons_mutsigs
}
msh2 <- getPossibleMuts( "ENSP00000233146" )
## ENSP00000233146 ...
## Fetching CDS for 1 proteins ... 1 found
## Checking CDS and protein sequence lengths ... 1/1 OK
## Fetching CDS for 1 proteins ... 1 found
## Checking CDS and protein sequence lengths ... 1/1 OK
pten <- getPossibleMuts( "ENSP00000361021" )
## ENSP00000361021 ...
## Fetching CDS for 1 proteins ... 1 found
## Checking CDS and protein sequence lengths ... 1/1 OK
## Fetching CDS for 1 proteins ... 1 found
## Checking CDS and protein sequence lengths ... 1/1 OK
msh2 <- msh2[ which(as.character(msh2$MUT_AA) != "*"), ]
pten <- pten[ which(as.character(pten$MUT_AA) != "*"), ]
# is the mutation in a given signature?
isSignature <- function(context, signatures)
{
return( context %in% signatures )
}
# parse contexts and subs into signature format i.e. ACG & T --> A[C>T]G etc.
setupSignature <- function(signatures)
{
apply(signatures, MARGIN = 1, function(x){
paste0( substr(x[1], 1, 1), "[", substr(x[1], 2, 2), ">", x[2], "]",
substr(x[1], 3, 3))
})
}
aging <- data.frame(context = c("ACG", "CCG", "GCG", "TCG"), mut = c("T", "T", "T", "T"),
stringsAsFactors = FALSE)
apobec <- data.frame(context = c("TCA", "TCA", "TCT", "TCT"), mut = c("T", "G", "T", "G"),
stringsAsFactors = FALSE)
pole <- data.frame(context = c("TCA", "TCG"), mut = c("A", "T"),
stringsAsFactors = FALSE)
fivefu <- data.frame(context = c("CTT", "CTT"), mut = c("C", "G"),
stringsAsFactors = FALSE)
plat <- data.frame(context = c("CCC", "CCT"), mut = c("T", "T"),
stringsAsFactors = FALSE)
aging <- setupSignature(aging)
apobec <- setupSignature(apobec)
pole <- setupSignature(pole)
fivefu <- setupSignature(fivefu)
plat <- setupSignature(plat)
msh2$aging <- sapply(msh2[, "MutSig"], isSignature, signatures = aging)
msh2$apobec <- sapply(msh2[, "MutSig"], isSignature, signatures = apobec)
msh2$pole <- sapply(msh2[, "MutSig"], isSignature, signatures = pole)
msh2$fivefu <- sapply(msh2[, "MutSig"], isSignature, signatures = fivefu)
msh2$plat <- sapply(msh2[, "MutSig"], isSignature, signatures = plat)
pten$aging <- sapply(pten[, "MutSig"], isSignature, signatures = aging)
pten$apobec <- sapply(pten[, "MutSig"], isSignature, signatures = apobec)
pten$pole <- sapply(pten[, "MutSig"], isSignature, signatures = pole)
pten$fivefu <- sapply(pten[, "MutSig"], isSignature, signatures = fivefu)
pten$plat <- sapply(pten[, "MutSig"], isSignature, signatures = plat)
# convert 1-letter AA code to 3-letters
msh2$WT_AA <- seqinr::aaa( as.character(msh2$WT_AA) )
msh2$MUT_AA <- seqinr::aaa( as.character(msh2$MUT_AA) )
pten$WT_AA <- seqinr::aaa( as.character(pten$WT_AA) )
pten$MUT_AA <- seqinr::aaa( as.character(pten$MUT_AA) )
msh2_dms <- read.csv('MAVEdb/MSH2_activity_mavedb00000050-a-1_scores.csv',
skip = 4)
pten_dms_activity <- read.csv('MAVEdb/PTEN_activity_mavedb00000054-a-1_scores.csv',
skip = 4)
pten_dms_stability <- read.csv('MAVEdb/PTEN_stability_mavedb00000013-a-1_scores.csv',
skip = 4)
pten_dms_activity <- pten_dms_activity[, 1:4]
pten_dms_stability <- pten_dms_stability[, 1:4]
pten_dms_activity$wt_pro <- sapply(pten_dms_activity$hgvs_pro, function(x){
if(x %in% c("_wt", "p.=")) return(NA)
return(substr(x, 3, 5))
})
pten_dms_stability$wt_pro <- sapply(pten_dms_stability$hgvs_pro, function(x){
if(x %in% c("_wt", "p.=")) return(NA)
return(substr(x, 3, 5))
})
pten_dms_activity$mut_pro <- sapply(as.character(pten_dms_activity$hgvs_pro), function(x){
if(x %in% c("_wt", "p.=")) return(NA)
o <- substr(x, gregexpr("[0-9]+", x), nchar(x))
o <- gsub('[0-9]+', '', o)
if(o == "=") return(substr(x, 3, 5)) else return(o)
})
pten_dms_stability$mut_pro <- sapply(as.character(pten_dms_stability$hgvs_pro), function(x){
if(x %in% c("_wt", "p.=")) return(NA)
o <- substr(x, gregexpr("[0-9]+", x), nchar(x))
o <- gsub('[0-9]+', '', o)
if(o == "=") return(substr(x, 3, 5)) else return(o)
})
pten_dms_activity$pos <- sapply(as.character(pten_dms_activity$hgvs_pro), function(x){
if(x %in% c("_wt", "p.=")) return(NA)
o <- regmatches(x, gregexpr("[0-9]+", x))[[1]]
o
})
pten_dms_stability$pos <- sapply(as.character(pten_dms_stability$hgvs_pro), function(x){
if(x %in% c("_wt", "p.=")) return(NA)
o <- regmatches(x, gregexpr("[0-9]+", x))[[1]]
o
})
msh2_dms$wt_pro <- sapply(msh2_dms$hgvs_pro, function(x){
if(x %in% c("_wt", "p.=")) return(NA)
return(substr(x, 3, 5))
})
msh2_dms$mut_pro <- sapply(as.character(msh2_dms$hgvs_pro), function(x){
if(x %in% c("_wt", "p.=")) return(NA)
o <- substr(x, gregexpr("[0-9]+", x), nchar(x))
o <- gsub('[0-9]+', '', o)
if(o == "=") return(substr(x, 3, 5)) else return(o)
})
msh2_dms$pos <- sapply(as.character(msh2_dms$hgvs_pro), function(x){
if(x %in% c("_wt", "p.=")) return(NA)
o <- regmatches(x, gregexpr("[0-9]+", x))[[1]]
o
})
# map the signature information to the dms dataframe
msh2_dms <- merge(msh2_dms, msh2[, c("AA_pos", "MUT_AA", "aging", "apobec", "pole",
"fivefu", "plat", "MutSig")],
by.x = c("pos", "mut_pro"), by.y = c("AA_pos", "MUT_AA"),
allFALSE, sort= FALSE)
pten_dms_activity <- merge(pten_dms_activity,
pten[, c("AA_pos", "MUT_AA", "aging", "apobec", "pole",
"fivefu", "plat", "MutSig")],
by.x = c("pos", "mut_pro"), by.y = c("AA_pos", "MUT_AA"),
all=FALSE, sort= FALSE)
pten_dms_stability <- merge(pten_dms_stability,
pten[, c("AA_pos", "MUT_AA", "aging", "apobec", "pole",
"fivefu", "plat", "MutSig")],
by.x = c("pos", "mut_pro"), by.y = c("AA_pos", "MUT_AA"),
all=FALSE, sort= FALSE)
msh2_dms$any <- TRUE; msh2_dms <- unique(msh2_dms)
msh2_dms$score <- -msh2_dms$score # invert according to description of the assay (https://www.mavedb.org/scoreset/urn:mavedb:00000050-a-1/)
pten_dms_activity$any <- TRUE; pten_dms_activity <- unique(pten_dms_activity)
pten_dms_stability$any <- TRUE; pten_dms_stability <- unique(pten_dms_stability)
# clinvar variants
pten_clinvar <- read.table('MAVEdb/PTEN_clinvar.txt',
sep = "\t", header = TRUE, stringsAsFactors = FALSE,
quote = "", comment.char = "", fill = TRUE)[, 1]
msh2_clinvar <- read.table('MAVEdb/MSH2_clinvar.txt',
sep = "\t", header = TRUE, stringsAsFactors = FALSE,
quote = "", comment.char = "", fill = TRUE)[, 1]
pten_clinvar <- sapply(pten_clinvar, function(x) gsub(")", "", regmatches(x, gregexpr("p.*", x))))
pten_clinvar <- unname( pten_clinvar[grepl("^p.", pten_clinvar)] )
msh2_clinvar <- sapply(msh2_clinvar, function(x) gsub(")", "", regmatches(x, gregexpr("p.*", x))))
msh2_clinvar <- unname( msh2_clinvar[grepl("^p.", msh2_clinvar)] )
pten_clinvar <- do.call("rbind", lapply(pten_clinvar, function(x){
if( length(which(pten_dms_activity$hgvs_pro == x)) == 1){
score_activity <- pten_dms_activity[which(pten_dms_activity$hgvs_pro == x), "score"]
} else score_activity = NA
if( length(which(pten_dms_stability$hgvs_pro == x)) == 1){
score_stability <- pten_dms_stability[which(pten_dms_stability$hgvs_pro == x), "score"]
} else score_stability = NA
wt <- substr(x, 3, 5)
mut <- substr(x, gregexpr("[0-9]+", x), nchar(x))
mut <- gsub('[0-9]+', '', mut)
pos <- regmatches(x, gregexpr("[0-9]+", x))[[1]]
data.frame(wt = wt, mut = mut, pos = pos, score_activity = score_activity,
score_stability = score_stability, stringsAsFactors = FALSE)
}))
msh2_clinvar <- do.call("rbind", lapply(msh2_clinvar, function(x){
wt <- substr(x, 3, 5)
mut <- substr(x, gregexpr("[0-9]+", x), nchar(x))
mut <- gsub('[0-9]+', '', mut)
pos <- regmatches(x, gregexpr("[0-9]+", x))[[1]]
if( length(which(msh2_dms$hgvs_pro == x)) == 1 ){
score <- msh2_dms[which(msh2_dms$hgvs_pro == x), "score"]
} else score <- NA
data.frame(wt = wt, mut = mut, pos = pos, score = score, stringsAsFactors = FALSE)
}))
# signatures - enriched in variants that modify activity but depleted in variants
# that destabilise the protein these are likely nondrivers, typical small-effect
# modifiers of protein function vs large-effect-size drivers that pose significant
# impact to protein function (cf clinvar variants)
#______________________________
# do the same for observed TCGA missense variants on MSH2 and PTEN
library(plyr)
tcga <- readRDS('TCGA_singleAA_withSig_FINAL.rds')
tcga <- tcga[which(tcga$origAA %in% seqinr::a() & tcga$mutAA %in% seqinr::a()),]
tcga <- tcga[which(tcga$mutAA != "*"), ]
tcga <- tcga[which(tcga$mutAA != tcga$origAA), ]
# count of tcga mutations for each of the 96 contexts
tcga$MutSig <- sapply(tcga$CONTEXT....20., substr, start = 20, stop = 22)
context32 <- get32Contexts()
tcga$MutSig <- apply(tcga[, c("MutSig", "Tumor_Seq_Allele2")], MARGIN = 1,
function(x){
context <- toupper(x[1])
mut <- x[2]
if( !context %in% context32 ){
context <- Biostrings::reverseComplement( Biostrings::DNAString( context ) )
mut <- Biostrings::reverseComplement( Biostrings::DNAString( mut ) )
context <- as.character( context )
mut <- as.character( mut )
}
return( paste0( substr(context, 1, 1), "[", substr(context, 2, 2), ">", mut, "]",
substr(context, 3, 3) ) )
})
tcga_sigcount <- ddply( tcga, "MutSig", nrow)
tcga_sigcount <- tcga_sigcount[ which(tcga_sigcount$MutSig %in% get96Contexts()), ]
tcga <- tcga[which(tcga$Hugo_Symbol %in% c("MSH2", "PTEN")), ]
tcga$origAA <- seqinr::aaa(tcga$origAA)
tcga$mutAA <- seqinr::aaa(tcga$mutAA)
tcga <- split(tcga, f = tcga$Hugo_Symbol)
tcga <- list(MSH2 = ddply(tcga$MSH2, c("origAA", "UniProt_AApos", "mutAA", "Protein_Change"), nrow),
PTEN = ddply(tcga$PTEN, c("origAA", "UniProt_AApos", "mutAA", "Protein_Change"), nrow))
colnames(tcga$MSH2)[5] <- "incidence"; colnames(tcga$PTEN)[5] <- "incidence"
tcga <- list(MSH2 = merge(tcga$MSH2, unique(msh2_dms[, c("pos", "mut_pro", "hgvs_pro", "score", "wt_pro")]),
by.x = c("origAA", "UniProt_AApos", "mutAA"),
by.y = c("wt_pro", "pos", "mut_pro"),
all.x = TRUE, all.y = FALSE, sort = FALSE),
PTEN = merge(tcga$PTEN, unique(pten_dms_activity[, c("pos", "mut_pro", "hgvs_pro", "score", "wt_pro")]),
by.x = c("origAA", "UniProt_AApos", "mutAA"),
by.y = c("wt_pro", "pos", "mut_pro"),
all.x = TRUE, all.y = FALSE, sort = FALSE))
colnames(tcga$PTEN)[which(colnames(tcga$PTEN) == "score")] <- "score_activity"
tcga$PTEN <- merge(tcga$PTEN, unique(pten_dms_stability[, c("pos", "mut_pro", "score", "wt_pro")]),
by.x = c("origAA", "UniProt_AApos", "mutAA"),
by.y = c("wt_pro", "pos", "mut_pro"),
all.x = TRUE, all.y = FALSE, sort = FALSE)
colnames(tcga$PTEN)[which(colnames(tcga$PTEN) == "score")] <- "score_stability"
# prepare plot of DMS score distribution and overlay distribution for ClinVar and TCGA observed variants
msh2_dms <- reshape2::melt(msh2_dms, id.vars = c("hgvs_pro", "score", "MutSig"),
measure.vars = c("aging", "apobec", "pole",
"fivefu", "plat", "any"))
pten_dms_activity <- reshape2::melt(pten_dms_activity,
id.vars = c("hgvs_pro", "score", "MutSig"),
measure.vars = c("aging", "apobec", "pole",
"fivefu", "plat", "any"))
pten_dms_stability <- reshape2::melt(pten_dms_stability,
id.vars = c("hgvs_pro", "score", "MutSig"),
measure.vars = c("aging", "apobec", "pole",
"fivefu", "plat", "any"))
msh2_dms$subs <- sapply(msh2_dms$MutSig, function(x){
regmatches(x, gregexpr('\\[.*\\]', x, perl = TRUE)[[1]])
})
pten_dms_activity$subs <- sapply(pten_dms_activity$MutSig, function(x){
regmatches(x, gregexpr('\\[.*\\]', x, perl = TRUE)[[1]])
})
pten_dms_stability$subs <- sapply(pten_dms_stability$MutSig, function(x){
regmatches(x, gregexpr('\\[.*\\]', x, perl = TRUE)[[1]])
})
msh2_dms$subs <- factor(msh2_dms$subs,
levels = c("[C>A]", "[C>G]", "[C>T]", "[T>A]", "[T>C]", "[T>G]"),
labels = c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G"))
pten_dms_activity$subs <- factor(pten_dms_activity$subs,
levels = c("[C>A]", "[C>G]", "[C>T]", "[T>A]", "[T>C]", "[T>G]"),
labels = c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G"))
pten_dms_stability$subs <- factor(pten_dms_stability$subs,
levels = c("[C>A]", "[C>G]", "[C>T]", "[T>A]", "[T>C]", "[T>G]"),
labels = c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G"))
msh2_dms$MutSig <- factor(msh2_dms$MutSig, levels = get96Contexts())
pten_dms_activity$MutSig <- factor(pten_dms_activity$MutSig, levels = get96Contexts())
pten_dms_stability$MutSig <- factor(pten_dms_stability$MutSig, levels = get96Contexts())
plotDMS <- function(dms, clinvar_var, tcga_var, gene, score_col, x_limits, title)
{
library(ggplot2)
library(ggridges)
g1 <- ggplot(unique(dms[which(dms$variable == "any"), ]),
aes(x = score, y = variable)) + geom_density_ridges2(size = 0.5, rel_min_height = 0.01) +
scale_x_continuous(limits = x_limits) + ggtitle(title) +
cowplot::theme_cowplot() + scale_y_discrete(name = "", labels = c("DMS\nprofile"))
n_clinvar <- length(clinvar_var[which(!is.na(clinvar_var[, score_col])), score_col])
n_tcga <- length(tcga_var[[gene]][which(!is.na(tcga_var[[gene]][, score_col])), score_col])
g2 <- ggplot(rbind(data.frame(variable = paste0("ClinVar\n(n = ", n_clinvar, ")"),
value = clinvar_var[, score_col]),
data.frame(variable = paste0("TCGA\n(n = ", n_tcga, ")"),
value = tcga_var[[gene]][, score_col])),
aes(x = value, y = variable)) +
geom_point(position = position_jitter(height=0.1)) + ylab("") +
stat_summary(fun = median, fun.min = median, fun.max = median,
geom = "crossbar", width = 0.5) +
cowplot::theme_cowplot() + scale_x_continuous(limits = x_limits, name = "") +
theme(axis.line = element_blank(), axis.ticks = element_blank(),
axis.text.x = element_blank())
cowplot::plot_grid(g1, g2, align = "v", axis = "lr", ncol = 1)
}
g_pten_stability <- plotDMS(pten_dms_stability, clinvar_var = pten_clinvar, tcga_var = tcga,
gene = "PTEN", title = "PTEN stability", score_col = "score_stability",
x_limits = c(-0.2, 1.4))
## Picking joint bandwidth of 0.059
## Warning: Removed 188 rows containing non-finite values (stat_summary).
## Warning: Removed 188 rows containing missing values (geom_point).
g_pten_activity <- plotDMS(pten_dms_activity, clinvar_var = pten_clinvar, tcga_var = tcga,
gene = "PTEN", title = "PTEN activity", score_col = "score_activity",
x_limits = c(-6, 3))
## Picking joint bandwidth of 0.171
## Warning: Removed 110 rows containing non-finite values (stat_density_ridges).
## Warning: Removed 38 rows containing non-finite values (stat_summary).
## Warning: Removed 38 rows containing missing values (geom_point).
g_msh2 <- plotDMS(msh2_dms, clinvar_var = msh2_clinvar, tcga_var = tcga,
gene = "MSH2", title = "MSH2 stability", score_col = "score",
x_limits = c(-6, 8))
## Picking joint bandwidth of 0.235
## Warning: Removed 313 rows containing non-finite values (stat_density_ridges).
## Warning: Removed 21 rows containing non-finite values (stat_summary).
## Warning: Removed 21 rows containing missing values (geom_point).
cowplot::plot_grid(g_pten_activity, g_pten_stability, #g_enrich,
g_msh2, nrow = 1, align = "hv", axis = "tblr")
# glm: score ~ sig and plot coefficients
plotGLMCoef <- function(glm_obj = NULL, coefs = NULL, x_var = "MutSig",
contexts = get96Contexts())
{
library(ggplot2)
if( is.null(glm_obj) & is.null(coefs) )
stop("Either one of glm_obj or coefs must not be NULL.")
if( !is.null(glm_obj) & is.null(coefs)){
coefs <- data.frame(summary(glm_obj)$coefficients)
}
if( is.null(glm_obj) & !is.null(coefs)){
coefs <- data.frame(coefs)
}
colnames(coefs) <- c("coef", "se", "t", "p")
coefs$variable <- rownames(coefs)
# coefs <- coefs[-1, ] # remove the intercept
# coefs$variable[1] <- paste0("MutSig", contexts[1]) # the intercept refers to the level of the first level of the factor
coefs$coef <- scale(coefs$coef)
# coefs$p <- p.adjust(coefs$p)
coefs$variable <- gsub(x_var, "", coefs$variable)
coefs$subs <- sapply(coefs$variable, function(x){
regmatches(x, gregexpr('\\[.*\\]', x, perl = TRUE)[[1]])
})
coefs$symbols <- sapply(coefs$p, function(p){
if(p<0.001) return("***")
if(p<0.01) return("**")
if(p<0.05) return("*")
return("")
})
coefs$symbol_pos <- apply(coefs[, c("coef", "se")], MARGIN = 1, function(x){
if(x[1]>0) y <- x[1]+x[2]+0.2 else y <- x[1]-x[2]-0.2
return(y)
})
coefs$subs <- factor(coefs$subs,
levels = c("[C>A]", "[C>G]", "[C>T]", "[T>A]", "[T>C]", "[T>G]"),
labels = c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G"))
ggplot(coefs, aes(x = variable, y = coef, fill = subs)) +
geom_bar(stat = "identity") + xlab("") + ylab(expression(beta)) +
geom_errorbar(aes(ymin = coef-se, ymax = coef+se), width = 0) +
geom_text(aes(y = symbol_pos, label = symbols)) +
cowplot::theme_cowplot() +
scale_fill_manual(values = c("C>A" = "#1EBFF0", "C>G" = "#000000",
"C>T" = "#FF0000", "T>A" = "#CBCACB",
"T>C" = "#A1CF64", "T>G" = "#EDC8C5")) +
facet_wrap(~ subs, nrow = 1, scales = "free_x") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.pos = "none")
}
# normal transformation of the scores
msh2_dms$score_scaled <-scale(msh2_dms$score)
pten_dms_activity$score_scaled <-scale(pten_dms_activity$score)
pten_dms_stability$score_scaled <-scale(pten_dms_stability$score)
# bar plot of regression coefficients
dms_sig_regression <- list(
plotGLMCoef(glm(data = unique(msh2_dms[which(msh2_dms$variable == "any"),]), formula = score_scaled ~ 0 + MutSig)) +
ggtitle("MSH2") + ylim(-1.7, 1.7),
plotGLMCoef(glm(data = unique(pten_dms_activity[which(pten_dms_activity$variable == "any"),]), formula = score_scaled ~ 0 + MutSig)) +
ggtitle("PTEN activity") + ylim(-1.7, 1.7),
plotGLMCoef(glm(data = unique(pten_dms_stability[which(pten_dms_stability$variable == "any"),]), formula = score_scaled ~ 0 + MutSig)) +
ggtitle("PTEN stability") + ylim(-1.7, 1.7)
)
dms_sig_regression <- lapply(dms_sig_regression, function(x) x$data)
dms_sig_regression[[1]]$assay <- "MSH2 stability"
dms_sig_regression[[2]]$assay <- "PTEN activity"
dms_sig_regression[[3]]$assay <- "PTEN stability"
dms_sig_regression <- do.call("rbind", dms_sig_regression)
Mapping strategy:
This leaves 6,918 unique Ensembl proteins to consider.
rhapsody <- readRDS('/media/josefng/My_Passport/ZoomvarTCGA_rhapsody/ZoomvarTCGA_rhapsody_results_AllPossibleMissense_mCSMoverlapped_SNVmutsigs.rds')
pred_sig_regression <- list()
pred_sig_regression[[1]] <-
glm(data = rhapsody, formula = rhapsody_score_scaled ~ 0 + MutSig)
pred_sig_regression[[1]] <- summary( pred_sig_regression[[1]] )$coefficients
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 9577208 511.5 18403093 982.9 18403093 982.9
## Vcells 64586468 492.8 1336686052 10198.2 1287795960 9825.2
pred_sig_regression[[2]] <-
glm(data = rhapsody, formula = rhapsody_prob_scaled ~ 0 + MutSig)
pred_sig_regression[[2]] <- summary( pred_sig_regression[[2]] )$coefficients
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 9577385 511.5 18403093 982.9 18403093 982.9
## Vcells 64587290 492.8 1069348842 8158.5 1287796997 9825.2
pred_sig_regression[[3]] <-
glm(data = rhapsody, formula = polyphen2_score_scaled ~ 0 + MutSig)
pred_sig_regression[[3]] <- summary( pred_sig_regression[[3]] )$coefficients
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 9577409 511.5 18403093 982.9 18403093 982.9
## Vcells 64587810 492.8 1224159762 9339.6 1294643249 9877.4
pred_sig_regression[[4]] <-
glm(data = rhapsody, formula = evmutation_score_scaled ~ 0 + MutSig)
pred_sig_regression[[4]] <- summary( pred_sig_regression[[4]] )$coefficients
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 9577439 511.5 18403093 982.9 18403093 982.9
## Vcells 64588338 492.8 979327810 7471.7 1294643249 9877.4
pred_sig_regression <- list(
plotGLMCoef(coefs = pred_sig_regression[[2]]) +
ggtitle("Rhapsody Pathogenic Probability") + ylim(-1.7, 1.7),
plotGLMCoef(coefs = pred_sig_regression[[3]]) +
ggtitle("PolyPhen2") + ylim(-1.7, 1.7),
plotGLMCoef(coefs = pred_sig_regression[[4]]) +
ggtitle("EVmutation") + ylim(-1.7, 1.7)
)
pred_sig_regression <- lapply(pred_sig_regression, function(x) x$data)
pred_sig_regression[[1]]$assay <- "rhapsody"
pred_sig_regression[[2]]$assay <- "PolyPhen2"
pred_sig_regression[[3]]$assay <- "EVmutation"
pred_sig_regression <- do.call("rbind", pred_sig_regression)
dms_sig_regression <- rbind(dms_sig_regression, pred_sig_regression)
# dot plot of regression coefficients (more compact visualisation!)
dms_sig_regression$variable <- sapply(dms_sig_regression$variable, function(x){
paste0(substr(x, 1, 1), substr(x, 3, 3), substr(x, 7, 7))
})
dms_sig_regression$p <- replace(dms_sig_regression$p,
which(dms_sig_regression$p < 0.01), 0.01)
dms_sig_regression$log_p <- -log10(dms_sig_regression$p)
dms_sig_regression$assay <- factor(dms_sig_regression$assay,
levels = c("MSH2 stability", "PTEN stability",
"PTEN activity", "rhapsody",
"PolyPhen2", "EVmutation"))
dms_sig_regression$variable <- factor(dms_sig_regression$variable,
levels = get32Contexts())
ggplot(dms_sig_regression, aes(x = assay, y = variable,
fill = coef, size = log_p)) +
geom_point(pch = 21) + xlab("") + ylab("") +
scale_fill_gradient2(midpoint = 0, low = "navyblue",
mid = "white", high = "yellow",
name = bquote(beta[signature])) +
scale_size_continuous(name = "p-value", limits = c(0, 5), range = c(1,10),
breaks = -log10(rev(c(0.01, 0.05, 0.1, 0.25, 0.5, 1))),
labels = c("1.00", "0.50", "0.25", "0.10", "0.05", "0.01")) +
# trans = scales::trans_new("-log_1p",
# transform = function(x) -log10(x+1),
# inverse = function(y) ( -1 + 1 / 10^y ) )) +
cowplot::theme_cowplot() + scale_y_discrete(limits = rev) +
theme(axis.text.x = element_text(angle =45, hjust = 1)) +
facet_wrap(~ subs, nrow = 1, scales = "free_y") +
ggtitle(bquote("Mutational impact" == beta[signature] ~"signature"))
tcga_sigcount$subs <- sapply(tcga_sigcount$MutSig, function(x){
regmatches(x, gregexpr('\\[.*\\]', x, perl = TRUE)[[1]])
})
tcga_sigcount$subs <- factor(tcga_sigcount$subs,
levels = c("[C>A]", "[C>G]", "[C>T]", "[T>A]", "[T>C]", "[T>G]"),
labels = c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G"))
tcga_sigcount$variable <- sapply(tcga_sigcount$MutSig, function(x){
paste0(substr(x, 1, 1), substr(x, 3, 3), substr(x, 7, 7))
})
tcga_sigcount$variable <- factor(tcga_sigcount$variable,
levels = get32Contexts())
tcga_sigcount$logcount <- log10(tcga_sigcount$V1)
ggplot(tcga_sigcount, aes(x = subs, y = variable, fill = logcount)) +
geom_tile() + xlab("") + ylab("") +
scale_fill_gradient(low = "white", high = "black", breaks = c(3, 5),
labels = c(1000, 100000), limits = c(2.84, 4.88),
name = "Number of\nTCGA\nmissense\nvariants") +
cowplot::theme_cowplot() + scale_y_discrete(limits = rev) +
facet_wrap(~ subs, scales = "free", nrow = 1) +
theme(axis.text.x = element_text(angle =45, hjust = 1))
Calculate the density in each region:
\[ \omega_{\text{context,region}} = \frac{ N_{\text{context,region}} / Size_{\text{region}} }{ N_{\text{context,all regions}} / Size_{\text{all regions}}} \]
# calculate density
calcdensity <- function(data, indices, region) {
d <- data[indices,] # allows boot to select sample
if(nrow(d) > 0){
surface_size <- sum(d$len_surface)
core_size <- sum(d$len_core)
interact_size <- sum(d$len_interact)
all_size <- surface_size + core_size + interact_size
surface_N <- sum(d$surface_sig)
core_N <- sum(d$core_sig)
interact_N <- sum(d$interact_sig)
all_N <- surface_N + core_N + interact_N
return( ( sum(d[, paste0(region, "_sig")]) / sum(d[, paste0("len_", region)]) ) /
( all_N / all_size ) )
} else return(NA)
}
context_counts <- list.files(pattern = 'proteinCDS_zoomvar_', path = 'countContexts',
full.names = TRUE)
context_counts <- lapply(context_counts, function(x){
counts <- read.csv(x, stringsAsFactors = FALSE)
o <- lapply(c("surface", "core", "interact"), function(r){
print(r)
density_boot <- boot::boot(data=counts, statistic=calcdensity,
R=1000, region=r )
density_CI <- boot::boot.ci(density_boot, conf = 0.95, type = "perc")
density_e <- density_CI$t0
density_CI <- as.numeric(density_CI$percent[1, 4:5])
out <- data.frame(region = r, dens = density_e,
lowerci = density_CI[1], upperci = density_CI[2])
return( out )
})
o <- do.call("rbind", o)
context <- unlist(strsplit(basename(x), split = "_"))[3]
context <- gsub(".csv", "", context)
o$context <- context
o
})
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
## [1] "surface"
## [1] "core"
## [1] "interact"
context_counts <- do.call('rbind', context_counts)
context_counts$context <- factor(context_counts$context,
levels = rev(c("CTG", "CCA", "CTC", "TCC", "TTC",
"CTT", "TCA", "TCT", "TTG", "TTT",
"ATT", "ATC", "GTT", "CTA", "GTA",
"ATA", "TTA", "CCG", "GCG", "ACG",
"TCG", "ACA", "ATG", "ACC", "ACT",
"GTC", "CCC", "GCT", "GCA", "GTG",
"CCT", "GCC")))
context_counts$region <- factor(context_counts$region,
levels = c("surface", "core", "interact"))
ggplot(context_counts, aes(x = region, y = context, fill = dens)) +
geom_tile() + cowplot::theme_cowplot() + xlab("") + ylab("") +
scale_fill_gradient2(low = "navy", high = "yellow", midpoint = 1,
name = "enrichment\nof motifs in\nhuman CDS") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
context_counts
## region dens lowerci upperci context
## 1 surface 0.9535201 0.9513900 0.9558112 ACA
## 2 core 1.1386309 1.1325574 1.1444399 ACA
## 3 interact 0.9331435 0.9156328 0.9502690 ACA
## 4 surface 1.0123670 1.0102138 1.0145666 ACC
## 5 core 0.9730364 0.9670800 0.9792510 ACC
## 6 interact 0.9393603 0.9206296 0.9595201 ACC
## 7 surface 1.0281894 1.0244977 1.0318387 ACG
## 8 core 0.9051031 0.8957263 0.9146319 ACG
## 9 interact 1.1260639 1.0908245 1.1616798 ACG
## 10 surface 1.0369411 1.0344185 1.0392357 ACT
## 11 core 0.8965823 0.8900055 0.9038393 ACT
## 12 interact 0.9996819 0.9801033 1.0207682 ACT
## 13 surface 0.9079570 0.9036867 0.9118241 ATA
## 14 core 1.2486904 1.2372109 1.2615922 ATA
## 15 interact 1.0718285 1.0372289 1.1074427 ATA
## 16 surface 0.9455594 0.9421789 0.9490689 ATC
## 17 core 1.1450607 1.1358328 1.1547889 ATC
## 18 interact 1.0585451 1.0370986 1.0785066 ATC
## 19 surface 0.9299811 0.9273634 0.9326694 ATG
## 20 core 1.1910184 1.1839245 1.1981166 ATG
## 21 interact 1.0401387 1.0201745 1.0601798 ATG
## 22 surface 0.9651509 0.9621996 0.9682003 ATT
## 23 core 1.0953472 1.0871936 1.1037536 ATT
## 24 interact 1.0177999 0.9902797 1.0479093 ATT
## 25 surface 0.9873883 0.9855521 0.9893028 CCA
## 26 core 1.0409464 1.0360571 1.0460124 CCA
## 27 interact 0.9555318 0.9411115 0.9707118 CCA
## 28 surface 1.0611738 1.0582190 1.0642369 CCC
## 29 core 0.8455478 0.8381759 0.8526387 CCC
## 30 interact 0.8666358 0.8442824 0.8877296 CCC
## 31 surface 1.0955810 1.0922023 1.0988477 CCG
## 32 core 0.7178499 0.7088227 0.7260410 CCG
## 33 interact 1.1143254 1.0787888 1.1488663 CCG
## 34 surface 1.0514923 1.0496147 1.0535199 CCT
## 35 core 0.8554113 0.8500523 0.8603992 CCT
## 36 interact 1.0029906 0.9862408 1.0204931 CCT
## 37 surface 0.9200526 0.9161850 0.9236026 CTA
## 38 core 1.2124464 1.2029205 1.2215968 CTA
## 39 interact 1.0905503 1.0603602 1.1207366 CTA
## 40 surface 1.0763030 1.0743251 1.0782327 CTC
## 41 core 0.7781109 0.7733904 0.7835082 CTC
## 42 interact 1.0647606 1.0471222 1.0818463 CTC
## 43 surface 0.9888280 0.9870120 0.9906823 CTG
## 44 core 1.0325626 1.0275847 1.0375405 CTG
## 45 interact 0.9899308 0.9740813 1.0059211 CTG
## 46 surface 1.0502491 1.0478357 1.0524572 CTT
## 47 core 0.8477509 0.8416167 0.8536778 CTT
## 48 interact 1.0910587 1.0745017 1.1082029 CTT
## 49 surface 0.8948991 0.8924203 0.8971812 GCA
## 50 core 1.3064343 1.2994831 1.3132403 GCA
## 51 interact 0.9044673 0.8880159 0.9229184 GCA
## 52 surface 0.9581053 0.9552072 0.9609030 GCC
## 53 core 1.1458450 1.1381512 1.1530607 GCC
## 54 interact 0.7746216 0.7549227 0.7941796 GCC
## 55 surface 1.0349139 1.0309116 1.0388774 GCG
## 56 core 0.8895875 0.8788288 0.8995750 GCG
## 57 interact 1.0998445 1.0563562 1.1438831 GCG
## 58 surface 0.9627413 0.9601975 0.9651686 GCT
## 59 core 1.1095235 1.1033763 1.1161126 GCT
## 60 interact 0.9590873 0.9381795 0.9805513 GCT
## 61 surface 0.9065491 0.9029111 0.9101893 GTA
## 62 core 1.2515151 1.2417661 1.2609855 GTA
## 63 interact 1.0806666 1.0520615 1.1090169 GTA
## 64 surface 1.0493561 1.0469464 1.0516470 GTC
## 65 core 0.8691997 0.8628654 0.8757264 GTC
## 66 interact 0.9412918 0.9224706 0.9619783 GTC
## 67 surface 0.9511798 0.9489504 0.9535797 GTG
## 68 core 1.1528330 1.1465182 1.1599510 GTG
## 69 interact 0.8726953 0.8558855 0.8890980 GTG
## 70 surface 1.0187942 1.0161192 1.0213345 GTT
## 71 core 0.9472537 0.9405847 0.9539716 GTT
## 72 interact 1.0008770 0.9780706 1.0224903 GTT
## 73 surface 0.9770875 0.9743272 0.9798244 TCA
## 74 core 1.0604279 1.0536965 1.0668934 TCA
## 75 interact 1.0295718 1.0124472 1.0458716 TCA
## 76 surface 1.0526431 1.0507977 1.0546132 TCC
## 77 core 0.8552049 0.8498576 0.8601686 TCC
## 78 interact 0.9791457 0.9629097 0.9947551 TCC
## 79 surface 1.0625886 1.0592112 1.0657295 TCG
## 80 core 0.7906107 0.7822271 0.7987151 TCG
## 81 interact 1.2695516 1.2339239 1.3046229 TCG
## 82 surface 1.0621216 1.0600610 1.0644148 TCT
## 83 core 0.8117671 0.8054739 0.8175282 TCT
## 84 interact 1.1126643 1.0947763 1.1302360 TCT
## 85 surface 0.9267551 0.9230084 0.9305646 TTA
## 86 core 1.2068214 1.1960395 1.2177244 TTA
## 87 interact 0.9866412 0.9526920 1.0227949 TTA
## 88 surface 1.0462767 1.0440625 1.0484923 TTC
## 89 core 0.8647775 0.8584636 0.8710525 TTC
## 90 interact 1.0444139 1.0271070 1.0629287 TTC
## 91 surface 0.9979506 0.9955457 1.0002527 TTG
## 92 core 0.9975012 0.9912454 1.0038988 TTG
## 93 interact 1.0651168 1.0465694 1.0815463 TTG
## 94 surface 1.0224463 1.0194277 1.0256943 TTT
## 95 core 0.9316230 0.9239124 0.9393273 TTT
## 96 interact 1.0435791 1.0155561 1.0701989 TTT
# For variants of selected signatures, plot wild-type and mutant AA
mapChangeType <- function(tb, colNameWT, colNameMut)
{
# map whether a missense mutation involves changing AA type
aa <- list(aromatic = data.frame(aa = c("Y", "W", "F"),
type = rep("aromatic", 3), stringsAsFactors = FALSE),
negative = data.frame(aa = c("E", "D"),
type = rep("negative", 2), stringsAsFactors = FALSE),
positive = data.frame(aa = c("R", "K", "H"),
type = rep("positive", 3), stringsAsFactors = FALSE),
polar = data.frame(aa = c("S", "T", "C", "P", "Q", "N"),
type = rep("polar", 6), stringsAsFactors = FALSE),
nonpolar = data.frame(aa = c("G", "A", "V", "L", "M", "I"),
type = rep("nonpolar", 6), stringsAsFactors = FALSE))
aa <- do.call("rbind", aa)
wt_col <- which(colnames(tb) == colNameWT)
mut_col <- which(colnames(tb) == colNameMut)
tb[, "wt_type"] <- apply(tb, 1, function(x){
type_wt <- aa[aa[, "aa"] == x[wt_col], "type"]
if(length(type_wt) == 0) return(NA) else return(type_wt)
})
tb[, "mut_type"] <- apply(tb, 1, function(x){
type_mut <- aa[aa[, "aa"] == x[mut_col], "type"]
if(length(type_mut) == 0) return(NA) else return(type_mut)
})
tb[, "change_type"] <- apply(tb, 1, function(x){
type_wt <- aa[aa[, "aa"] == x[wt_col], "type"]
type_mut <- aa[aa[, "aa"] == x[mut_col], "type"]
if(length(type_wt) == 0 | length(type_mut) == 0) return(NA)
if(type_wt == type_mut) return(FALSE) else return(TRUE)
})
return(tb)
}
getTransCountsForSignature <- function(tb, signature)
{
tb <- tb[ which(tb$MutSig == signature), ]
library(plyr)
sigs <- ddply(tb, .variables = c("wt", "mut"), nrow)
sigs <- sigs[which(sigs$wt != sigs$mut), ]
colnames(sigs) <- c("wt", "mut", "count")
sigs$count <- sigs$count / sum(sigs$count)
sigs <- mapChangeType(sigs, "wt", "mut")
sigs
}
rhapsody$wt <- sapply(rhapsody$SAV, function(x){
unlist(strsplit(x, split = " "))[3]
})
rhapsody$mut <- sapply(rhapsody$SAV, function(x){
unlist(strsplit(x, split = " "))[4]
})
selected_sigs <- lapply(c("G[T>G]G", "C[T>G]G", "G[T>G]A", "C[T>G]A"), function(x){
o <- getTransCountsForSignature(
rhapsody, signature = x
)
o$signature <- x
o
})
selected_sigs <- do.call("rbind", selected_sigs)
selected_sigs$wt <- seqinr::aaa(selected_sigs$wt)
selected_sigs$mut <- seqinr::aaa(selected_sigs$mut)
library(ggalluvial)
ggplot(as.data.frame(selected_sigs),
aes(y = count, axis1 = wt, axis2 = mut)) +
geom_alluvium(aes(fill = change_type), width = 1/12) +
geom_stratum(width = 1/6, color = "grey") +
geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
scale_x_discrete(limits = c("WT", "MUT")) + ylab("") +
scale_fill_manual(values = c("TRUE" = "red", "FALSE" = "black"),
name = "Change AA type?") +
cowplot::theme_cowplot() + theme(axis.text.y = element_blank(),
axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.x = element_blank(),
legend.position = "bottom") +
facet_wrap(~ signature, nrow = 2)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.