This markdown contains the following analysis:

comparison of variants in surface/core/interface for signatures
identification of putative variants caused by specific mutagenic processes
comparison of stability of these variants from above using predictors (mCSM, rhapsody)

Driver mutations caused by specific mutagenic processes

Annotate enrichment of mutation signature per tumour

Here we calculate the enrichment (ie density of mutations that occur in a given signature) of each signature, considered per tumour. Used to classify tumours with heightened exposure towards a specific mutagenic process.

#  function to calculate calculate signature density
calcSigDensity <- function(ref, alt, alt_alleles, context, mut_context, 
                           bg_context, dinucleotide = FALSE)
{
  con_sig <- sum(stringr::str_count(toupper(context), paste(mut_context, collapse = "|")))
  if(dinucleotide){
    mut_sig <- sum(sapply(1:length(mut_context), function(x){
      sum(ref == mut_context[x] & alt == alt_alleles[x])
    }))
    tb <- unique(do.call("rbind", lapply(1:length(mut_context), function(x){
      data.frame(bg = unlist(strsplit(mut_context[x], split = "")),
                 mut = unlist(strsplit(alt_alleles[x], split = "")), 
                 stringsAsFactors = FALSE)
    })))
    con_bg <- sum(stringr::str_count(toupper(context), paste(tb$bg, collapse = "|")))
    mut_bg <- sum(apply(tb, MARGIN = 1, function(x){
      sum(ref == x[1] & alt == x[2])
    }))
  } else {
    con_bg <- sum(stringr::str_count(toupper(context), paste(bg_context, collapse = "|")))
    pos <- 21
    mut_sig <- sum(sapply(1:length(mut_context), function(x){
      sum(substr(toupper(context), pos - 1, pos + 1) == mut_context[x] & alt == alt_alleles[x])
    }))
    mut_bg <- sum(apply(unique(data.frame(bg_context, alt_alleles,
                                          stringsAsFactors = FALSE)), 
                        MARGIN = 1, function(x){
      sum(ref == x[1] & alt == x[2])
    }))
  }
  if(mut_bg > 0 & con_sig > 0 & con_bg > 0){
    ( mut_sig / con_sig ) / ( mut_bg / con_bg )
  } else {
    0
  }
}

# filter del and ins
tcga <- readRDS('TCGA_allmuts_withSig_FINAL.rds')
subsonly <- tcga[which(tcga$Reference_Allele != '-' & tcga$Tumor_Seq_Allele2 != '-'), ]
subsonly <- subsonly[which(nchar(subsonly$Reference_Allele) == nchar(subsonly$Tumor_Seq_Allele2)), ]
head(subsonly)
subsonly <- split(subsonly, f = subsonly$Tumor_Sample_Barcode)

# calculation
ApobecEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("TCT", "TCT", "TCA", "TCA", 
                                 "AGA", "AGA", "TGA", "TGA"),
                 bg_context = c("C", "C", "C", "C", "G", "G", "G", "G"),
                 alt_alleles = c("T", "G", "T", "G",  "C", "A", "C", "A"))
})
AgingEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("TCG", "ACG", "CCG", "GCG", 
                                 "CGC", "CGG", "CGT", "CGA"),
                 bg_context = c("C", "C", "C", "C", "G", "G", "G", "G"),
                 alt_alleles = c("T", "T", "T", "T",  "A", "A", "A", "A"))
})
POLEEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("TCT", "AGA", "TCG", "CGA"),
                 bg_context = c("C", "G", "C", "G"),
                 alt_alleles = c("A", "T", "T", "A"))
})
FiveFUEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("CTT", "AAG"),
                 bg_context = c("T", "A"),
                 alt_alleles = c("G", "C"))
})
PlatSBSEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("CCC", "GGG", "CCT", "AGG"),
                 bg_context = c("C", "G", "C", "G"),
                 alt_alleles = c("T", "A", "T", "A"))
})
UVEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("CC", "GG"),
                 dinucleotide = TRUE,
                 alt_alleles = c("TT", "AA"))
})
PlatDBSEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("CT", "CT", "AG", "AG", "TG", "TC", "CA", "GA"),
                 dinucleotide = TRUE,
                 alt_alleles = c("AA", "AC", "TT", "GT", "GT", "AA", "AC", "TT"))
})

sig_enrich <- Reduce(function(d1, d2) cbind(as.data.frame(d1), as.data.frame(d2)),
                     list(AgingEnrich, ApobecEnrich, POLEEnrich, FiveFUEnrich,
                          PlatSBSEnrich, PlatDBSEnrich, UVEnrich))
colnames(sig_enrich) <- c("AgingEnrich", "ApobecEnrich", "POLEEnrich", "FiveFUEnrich",
                          "PlatSBSEnrich", "PlatDBSEnrich", "UVEnrich")
saveRDS(sig_enrich, 'TCGA_sig_enrich.rds')
rm(subsonly)

top mutations by incidence for each signature

tcga <- readRDS('TCGA_allmuts_withSig_FINAL.rds')
tcga$sample <- substr(tcga$Tumor_Sample_Barcode, 1, 12)
#____________________________________
# APOBEC only
apobec_topmut <- ddply(tcga[which(tcga$ApobecSig == 1), ], 
                       c("Hugo_Symbol", "Protein_Change"), nrow)
apobec_topmut <- apobec_topmut[order(apobec_topmut$V1, decreasing = TRUE), ]
apobec_topmut <- apobec_topmut[which(apobec_topmut$Protein_Change != "."), ]
apobec_topmut <- apobec_topmut[1:12, ]
apobec_topmut$delta <- apply(apobec_topmut[, 1:2], 1, paste, collapse = " ")
apobec_topmut$delta <- factor(apobec_topmut$delta, levels = apobec_topmut$delta)
ggplot(apobec_topmut) + geom_bar(aes(x = delta, y = V1), stat = "identity") +
  cowplot::theme_cowplot() + xlab("") + ylab("Incidence") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# all signatures
topmuts <- lapply(c("ApobecSig", "AgingSig", "POLESig", "UVSig", "FiveFUSig", 
                    "PlatDBS", "PlatSBS"), function(x){
  topmut <- ddply(tcga[which(tcga[, x] == 1), ], c("Hugo_Symbol", "Protein_Change"), nrow)
  topmut <- topmut[order(topmut$V1, decreasing = TRUE), ]
  topmut <- topmut[which(topmut$Protein_Change != "."), ]
  topmut <- topmut[1:12, ]
  topmut$delta <- apply(topmut[, 1:2], 1, paste, collapse = " ")
  topmut$delta <- factor(topmut$delta, levels = topmut$delta)
  ggplot(topmut) + geom_bar(aes(x = delta, y = V1), stat = "identity") +
    cowplot::theme_cowplot() + xlab("") + ylab("Incidence") + ylim(0, 400) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle(gsub("Sig", "", x))
})
gridExtra::grid.arrange(topmuts[[1]], topmuts[[2]], topmuts[[3]], topmuts[[4]], 
                        topmuts[[5]], topmuts[[6]], topmuts[[7]], ncol = 4)

define subset of TCGA cases relevant to each mutational signature

5-FU / Platinum - cases treated with these drugs

clinFiles <- list.files(path = '/home/josefng/TCGA_clinical_all', full.names = TRUE)

drug_info <- lapply(clinFiles, function(file)
{
  cancerType <- unlist(strsplit(file, split = "/"))
  cancerType <- gsub(".clin.merged.txt", "",
                     cancerType[length(cancerType)], fixed = TRUE)
  print(cancerType)
  clin <- read.table(file, sep = "\t", quote = "", comment.char = "",
                     row.names = 1, stringsAsFactors = F)
  clin <- data.frame(t(clin), stringsAsFactors = F)
  drugs <- data.frame(clin[, grepl('patient.bcr_patient_barcode|patient.drugs.drug.*drug_name',
                        colnames(clin), perl = T)], stringsAsFactors = F)
  if(ncol(drugs) > 1){
    drugs <- apply(drugs, MARGIN = 1, function(x){
      o <- x[2:length(x)]
      o <- o[!is.na(o)]
      if(length(o) == 0){
        o <- NA
      }
      unname(o)
    })
    names(drugs) <- toupper(clin[, "patient.bcr_patient_barcode"])
    drugs
  } else return(NULL)
})
cancerTypes <- sapply(clinFiles, function(file){
  o <- unlist(strsplit(file, split = "/"))
  o <- gsub(".clin.merged.txt", "", o[length(o)], fixed = TRUE)
  o
})
names(drug_info) <- cancerTypes
drug_info <- drug_info[sapply(drug_info, function(x) !is.null(x))]

# patient barcode with platinum-based drug administration
plat_patients <- unique(names(unlist(drug_info)[grep('plat', unlist(drug_info))]))
for(patient in plat_patients){
  o <- unlist(strsplit(patient, split = ".", fixed = T))[2]
  o <- substr(o, 1, 12)
  write(o, "TCGA_platinum.txt", append = TRUE)
}

# patient barcode with 5-FU administration
fu_patients <- unique(names(unlist(drug_info)[grep('5fu|5-fu|uracil', unlist(drug_info))]))
for(patient in fu_patients){
  o <- unlist(strsplit(patient, split = ".", fixed = T))[2]
  o <- substr(o, 1, 12)
  write(o, "TCGA_5fu.txt", append = TRUE)
}

TCGA hypermutated - density (number of mutations per Mb) > 10 (ref: here). Prune (see below) to specifically focus on those with POLE mutation

# get cases with high mutational density
library(plyr)
mafs <- list.files(path = "/home/josefng/TCGA.firehose_maf",
                   pattern = "_maf_alilmuttype_mapped_withSig.txt", full.names = TRUE)
mafs <- lapply(mafs, function(maf){
 print(maf)
 tb <- read.table(maf, header = TRUE, sep = "\t", quote = "", 
                  stringsAsFactors = FALSE, comment.char = "")
 tb
})
mafs <- do.call("rbind", mafs)
nmuts <- ddply(mafs, "Tumor_Sample_Barcode", nrow)
# rm(mafs)
colnames(nmuts)[2] <- "nmut"
nmuts$density <- nmuts[, 2] / 38 # ref https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-017-0424-2
hypermutated_cases <- nmuts[nmuts$density > 10, 1] # ref https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5849393/

UV: The TCGA Melanoma cohort
Aging / APOBEC: enrichment of the signature (ie density) above-median. In the case of aging, examine association with age.

sig_enrich <- readRDS('TCGA_sig_enrich.rds')
caselist <- list('5FU' = readLines('TCGA_5fu.txt'),
                 'Platinum' = readLines('TCGA_platinum.txt'),
                 'POLE' = readLines('TCGA_hypermutated.txt'))
caselist[["UV"]] = unique(tcga[which(tcga$cancerType == "SKCM"), "Tumor_Sample_Barcode"])
caselist[["UV"]] = substr(caselist[["UV"]], 1, 12)
caselist[["Apobec"]] = rownames(sig_enrich[which(sig_enrich$ApobecEnrich > median(sig_enrich$ApobecEnrich)), ])
caselist[["Apobec"]] = substr(caselist[["Apobec"]], 1, 12)
caselist[["Aging"]] = rownames(sig_enrich[which(sig_enrich$AgingEnrich > median(sig_enrich$AgingEnrich)), ])
caselist[["Aging"]] = substr(caselist[["Aging"]], 1, 12)

cancerTypes <- ddply(tcga, c("Tumor_Sample_Barcode", "sample", "cancerType"), nrow)
sig_enrich <- merge(sig_enrich, cancerTypes, by.x = "row.names", 
                    by.y = "Tumor_Sample_Barcode",
                    all.x = TRUE, all.y=  FALSE, sort= FALSE)
colnames(sig_enrich)[ncol(sig_enrich)] <- "Mutation_load"
head(sig_enrich)

##                      Row.names AgingEnrich ApobecEnrich POLEEnrich FiveFUEnrich
## 1 TCGA-02-0003-01A-01D-1490-08    9.403312    0.4244755  0.9342055            0
## 2 TCGA-02-0033-01A-01D-1490-08    7.889610    0.5148305  1.1675676            0
## 3 TCGA-02-0047-01A-01D-1490-08    9.716312    0.4672224  1.1132178            0
## 4 TCGA-02-0055-01A-01D-1490-08    9.693034    0.1619329  0.7126736            0
## 5 TCGA-02-2470-01A-01D-1494-08   10.653325    0.9134065  0.6789966            0
## 6 TCGA-02-2483-01A-01D-1494-08   10.320122    0.3173933  1.0260167            0
##   PlatSBSEnrich PlatDBSEnrich UVEnrich       sample cancerType Mutation_load
## 1     0.9432789             0        0 TCGA-02-0003     GBMLGG            57
## 2     0.8596698             0        0 TCGA-02-0033     GBMLGG            47
## 3     0.9402883             0        0 TCGA-02-0047     GBMLGG            82
## 4     0.7446712             0        0 TCGA-02-0055     GBMLGG            76
## 5     0.0000000             0        0 TCGA-02-2470     GBMLGG            80
## 6     0.7141350             0        0 TCGA-02-2483     GBMLGG            67

Explore mutations associated with specific signatures

POLE

PTEN p.R130Q

#_______________
# prune list to select only POLE-mutated for hypermutation 
hypermutated <- caselist$POLE
msi <- list.files('clinical', pattern = "_msi", full.names = TRUE)
msi <- lapply(msi, read.table, stringsAsFactors = FALSE, row.names = 1, sep = "\t")
msi <- lapply(msi, function(tb) t(tb))
msi <- lapply(msi, function(tb){
  tb[, 1] <- toupper(tb[, 1])
  tb[, 2] <- sapply(tb[, 2], function(x){
    if(is.na(x)) return(FALSE)
    if(x == "msi-h") return(TRUE) else return(FALSE)
  })
  colnames(tb) <- c("Sample", "MSI_status")
  tb
})
msi <- do.call("rbind", msi)
hypermut_pol <- tcga[which(tcga$Hugo_Symbol %in% c("POLE", "POLD1", "PTEN") & 
                             tcga$mutAA != tcga$origAA), ]
hypermut_pol <- reshape2::acast(hypermut_pol, sample ~ Hugo_Symbol, 
                                value.var = "Protein_Change",
                                fun.aggregate = function(x) length(x) > 0)
hypermut_pol[, 3] <- sapply(
  rownames(hypermut_pol), 
  function(x) nrow(tcga[tcga$Hugo_Symbol == "PTEN" & tcga$sample == x & 
                          tcga$Protein_Change == "p.R130Q", ]) > 0
)
colnames(hypermut_pol)[3] <- "PTEN R130Q"
hypermut <- sig_enrich[, c("sample", "cancerType", "ApobecEnrich", 
                           "POLEEnrich", "Mutation_load", "Row.names")]
colnames(hypermut)[ncol(hypermut)] <- "Tumor_Sample_Barcode"
hypermut <- merge(hypermut, msi, by.x = "sample", by.y = "Sample",
                  all.x = TRUE, all.y = FALSE, sort = FALSE)
hypermut <- hypermut[which(hypermut$sample %in% caselist$POLE), ]
hypermut <- merge(hypermut, hypermut_pol, by.x = "sample", by.y = "row.names",
                  all.x = TRUE, all.y=  FALSE, sort = FALSE)
hypermut[which(is.na(hypermut$POLD1)), "POLD1"] <- FALSE
hypermut[which(is.na(hypermut$POLE)), "POLE"] <- FALSE
hypermut[which(is.na(hypermut$`PTEN R130Q`)), "PTEN R130Q"] <- FALSE
hypermut[which(is.na(hypermut$MSI_status)), "MSI_status"] <- FALSE
hypermut$ApobecStatus <- (hypermut$ApobecEnrich > 1)
hypermut$PoleStatus <- (hypermut$POLEEnrich > 1)

caselist[["POLEmut"]] <- hypermut[which(hypermut$POLE & 
                                          hypermut$sample %in% caselist$POLE), 
                                  "sample"]
names(caselist)[which(names(caselist) == "POLEmut")] <- "hypermutated"
caselist <- caselist[-which(names(caselist) == "POLE")]

# hypermutation: PTEN R130Q with POLE/POLD1
hypermut <- hypermut[order(hypermut$POLE, hypermut$POLD1, hypermut$`PTEN R130Q`,
                           hypermut$MSI_status, hypermut$Mutation_load,
                           hypermut$ApobecEnrich,
                           decreasing = TRUE), ]
Heatmap(t(hypermut)[c("ApobecStatus", "MSI_status", "POLE", "POLD1", "PTEN R130Q"), ], 
        col = c("grey80", "black"),
        show_column_names = FALSE, cluster_columns = FALSE, cluster_rows = FALSE, 
        top_annotation = HeatmapAnnotation("Mutation load" =
                                             anno_barplot(hypermut$Mutation_load)))

Aging

# age
age <- read.table('TCGA_all_age.txt', 
                  header = TRUE, stringsAsFactors = FALSE, sep = "\t")
age <- age[which(!is.na(age$age_at_initial_pathologic_diagnosis)), ]
sig_enrich <- merge(sig_enrich, age[, c(1, 3)], by.x = "sample", 
                    by.y = "bcr_patient_barcode",
                    all.x = TRUE, all.y = FALSE, sort = FALSE)
sig_enrich$age_group <- cut(sig_enrich$age_at_initial_pathologic_diagnosis,
                            breaks = seq(10, 90, by = 10))
ggplot(sig_enrich) + geom_boxplot(aes(x = age_group, y = AgingEnrich))

# weirdly the 'AgingEnrich' does not correlate with age. Rather it shows the vast majority of samples have the aging signature occuring at a rate higher than expected.
# 'High AgingEnrich' probably only means absence of other processes? Check with SigProfiler results
# https://www.synapse.org/#!Synapse:syn11801497

sigprofiler_counts <- read.csv('TCGA_WES_sigProfiler_SBS_signatures_in_samples.csv', row.names = NULL)
sigprofiler_counts$othersig <- apply(sigprofiler_counts[, c(-1, -2, -3, -4, -8)], 
                                     MARGIN = 1, sum)
sigprofiler_counts$othersig_prop <- sigprofiler_counts$othersig /
  apply(sigprofiler_counts[, c(-1, -2, -3, -69)], 1, sum)
sigprofiler_counts$sample <- substr(sigprofiler_counts$Sample.Names, 1, 12)
sigprofiler_counts <- merge(sigprofiler_counts, sig_enrich[, c(1, 2, 3)], 
                            by = "sample",
                            all.x = FALSE, all.y = TRUE, sort = FALSE)
sigprofiler_counts$AgingEnrich_group <- cut(sigprofiler_counts$AgingEnrich, 
                                            quantile(sigprofiler_counts$AgingEnrich,
                                                     probs = seq(0, 1, by = .25)), 
                                            include.lowest = TRUE, labels = FALSE)
sigprofiler_counts$AgingEnrich_group <- factor(sigprofiler_counts$AgingEnrich_group,
                                               ordered = TRUE)

ggplot(sigprofiler_counts) + geom_boxplot(aes(x = AgingEnrich_group, y = othersig_prop)) +
  cowplot::theme_cowplot() + xlab("Enrichment of Signature 1\n(binned by quartiles)") +
  ylab("Proportion of mutations NOT in Signature 1") + scale_y_continuous(labels = scales::percent)

## Warning: Removed 1183 rows containing non-finite values (stat_boxplot).

Nonsurprisingly the enrichment of Signature 1 (i.e. Aging signature) implies a low proportion of mutations outside of Signature 1.

TP53

# oncoprint for TP53 mutations selected in aging signature correlation with age
tp53_pol <- rbind(tcga[which(tcga$Hugo_Symbol %in% c("TP53") & 
                               tcga$mutAA != tcga$origAA), ])#,
                  #tcga[which(tcga$Hugo_Symbol %in% c("IDH1") & tcga$mutAA != tcga$origAA), ])
tp53_pol <- reshape2::acast(tp53_pol, sample ~ Protein_Change, 
                            value.var = "Protein_Change",
                            fun.aggregate = function(x) length(x) > 0)
tp53_pol <- tp53_pol[, c(#"p.R132H", 
  "p.R175H", "p.R248Q", "p.R273C", "p.R273H", "p.G245S")]
tp53 <- age[, c("bcr_patient_barcode", "age_at_initial_pathologic_diagnosis")]
#colnames(tp53)[ncol(tp53)] <- "Tumor_Sample_Barcode"
tp53 <- merge(tp53, tp53_pol, by.x = "bcr_patient_barcode", by.y = "row.names",
                  all.x = TRUE, all.y=  FALSE, sort = FALSE)
#tp53[which(is.na(tp53$p.R132H)), "p.R132H"] <- FALSE
tp53[which(is.na(tp53$p.R175H)), "p.R175H"] <- FALSE
tp53[which(is.na(tp53$p.R248Q)), "p.R248Q"] <- FALSE
tp53[which(is.na(tp53$p.R273C)), "p.R273C"] <- FALSE
tp53[which(is.na(tp53$p.R273H)), "p.R273H"] <- FALSE
tp53[which(is.na(tp53$p.G245S)), "p.G245S"] <- FALSE
tp53 <- tp53[order(tp53$p.R175H, tp53$p.R248Q, tp53$p.R273C,
                   tp53$p.R273H, tp53$p.G245S, decreasing = TRUE), ]
library(ComplexHeatmap)
#pdf('Documents/zoomvartcga_writeup/new_figures/aging_topmut_oncoplot.pdf', 
#    width = 11.7, height = 1.5)
Heatmap(t(as.matrix(tp53$age_at_initial_pathologic_diagnosis[1:800])), 
        col = c("navy", "yellow"), 
        show_column_names = FALSE, cluster_columns = FALSE, cluster_rows = FALSE, 
        top_annotation = HeatmapAnnotation(#"IDH1 R132H" =  tp53$p.R132H[1:800],
                                           "TP53 R175H" =  tp53$p.R175H[1:800],
                                           "TP53 R248Q" =  tp53$p.R248Q[1:800],
                                           "TP53 R273C" =  tp53$p.R273C[1:800],
                                           "TP53 R273H" =  tp53$p.R273H[1:800],
                                           "TP53 G245S" =  tp53$p.G245S[1:800],
                                           annotation_name_side = "left",
                                           col = list(#"IDH1 R132H" = c("FALSE" = "grey80", "TRUE" = "black"),
                 "TP53 R175H" = c("FALSE" = "grey80", "TRUE" = "black"),
                 "TP53 R248Q" = c("FALSE" = "grey80", "TRUE" = "black"),
                 "TP53 R273C" = c("FALSE" = "grey80", "TRUE" = "black"),
                 "TP53 R273H" = c("FALSE" = "grey80", "TRUE" = "black"),
                 "TP53 G245S" = c("FALSE" = "grey80", "TRUE" = "black"))))

mCSM analysis of specific hotspots

PTEN

PTEN <- read.table('PTEN_R130_mCSM.txt',
                   header = TRUE, sep = "\t", stringsAsFactors = FALSE)
PTEN$pos <- apply(PTEN[, c("WILD_RES", "RES_POS")], 1, paste, collapse = "")
PTEN$MUT_RES <- factor(PTEN$MUT_RES, levels = c("Y", "W", "F", "D", "E", "R", "K", "H",
                                                "S", "T", "C", "P", "Q", "N",
                                                "G", "A", "V", "L", "M", "I"))
ggplot(PTEN, aes(x = pos, y = MUT_RES)) + geom_tile(aes(fill = PRED_DDG)) +
  geom_text(aes(label = PRED_DDG), colour = "white") +
  scale_fill_gradient2(low = "red", mid = "yellow", high = "green", name = "ddG",
                       limits = c(-2.5, 2.5)) + cowplot::theme_cowplot() +
  scale_y_discrete(drop = FALSE) + ylab("") + xlab("")

PIK3CA <- read.table('PIK3CA_mCSM.txt',
                   header = TRUE, sep = "\t", stringsAsFactors = FALSE)
PIK3CA$pos <- apply(PIK3CA[, c("WILD_RES", "RES_POS")], 1, paste, collapse = "")
PIK3CA$MUT_RES <- factor(PIK3CA$MUT_RES, levels = c("Y", "W", "F", "D", "E", "R", 
                                                    "K", "H", "S", "T", "C", "P",
                                                    "Q", "N", "G", "A", "V", "L",
                                                    "M", "I"))

ggplot(PIK3CA, aes(x = pos, y = MUT_RES)) + geom_tile(aes(fill = PRED_DDG)) +
  geom_text(aes(label = PRED_DDG), colour = "white") +
  scale_fill_gradient2(low = "red", mid = "yellow", high = "green", name = "ddG",
                       limits = c(-2.5, 2.5)) + cowplot::theme_cowplot() +
  scale_y_discrete(drop = FALSE) + ylab("") + xlab("")

Annotating mutational signatures in terms of protein (structural) consequences

Joseph Ng

5 March 2021