This markdown contains the following analysis:

Driver mutations caused by specific mutagenic processes

Annotate enrichment of mutation signature per tumour

Here we calculate the enrichment (ie density of mutations that occur in a given signature) of each signature, considered per tumour. Used to classify tumours with heightened exposure towards a specific mutagenic process.

#  function to calculate calculate signature density
calcSigDensity <- function(ref, alt, alt_alleles, context, mut_context, 
                           bg_context, dinucleotide = FALSE)
{
  con_sig <- sum(stringr::str_count(toupper(context), paste(mut_context, collapse = "|")))
  if(dinucleotide){
    mut_sig <- sum(sapply(1:length(mut_context), function(x){
      sum(ref == mut_context[x] & alt == alt_alleles[x])
    }))
    tb <- unique(do.call("rbind", lapply(1:length(mut_context), function(x){
      data.frame(bg = unlist(strsplit(mut_context[x], split = "")),
                 mut = unlist(strsplit(alt_alleles[x], split = "")), 
                 stringsAsFactors = FALSE)
    })))
    con_bg <- sum(stringr::str_count(toupper(context), paste(tb$bg, collapse = "|")))
    mut_bg <- sum(apply(tb, MARGIN = 1, function(x){
      sum(ref == x[1] & alt == x[2])
    }))
  } else {
    con_bg <- sum(stringr::str_count(toupper(context), paste(bg_context, collapse = "|")))
    pos <- 21
    mut_sig <- sum(sapply(1:length(mut_context), function(x){
      sum(substr(toupper(context), pos - 1, pos + 1) == mut_context[x] & alt == alt_alleles[x])
    }))
    mut_bg <- sum(apply(unique(data.frame(bg_context, alt_alleles,
                                          stringsAsFactors = FALSE)), 
                        MARGIN = 1, function(x){
      sum(ref == x[1] & alt == x[2])
    }))
  }
  if(mut_bg > 0 & con_sig > 0 & con_bg > 0){
    ( mut_sig / con_sig ) / ( mut_bg / con_bg )
  } else {
    0
  }
}
# filter del and ins
tcga <- readRDS('TCGA_allmuts_withSig_FINAL.rds')
subsonly <- tcga[which(tcga$Reference_Allele != '-' & tcga$Tumor_Seq_Allele2 != '-'), ]
subsonly <- subsonly[which(nchar(subsonly$Reference_Allele) == nchar(subsonly$Tumor_Seq_Allele2)), ]
head(subsonly)
subsonly <- split(subsonly, f = subsonly$Tumor_Sample_Barcode)

# calculation
ApobecEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("TCT", "TCT", "TCA", "TCA", 
                                 "AGA", "AGA", "TGA", "TGA"),
                 bg_context = c("C", "C", "C", "C", "G", "G", "G", "G"),
                 alt_alleles = c("T", "G", "T", "G",  "C", "A", "C", "A"))
})
AgingEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("TCG", "ACG", "CCG", "GCG", 
                                 "CGC", "CGG", "CGT", "CGA"),
                 bg_context = c("C", "C", "C", "C", "G", "G", "G", "G"),
                 alt_alleles = c("T", "T", "T", "T",  "A", "A", "A", "A"))
})
POLEEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("TCT", "AGA", "TCG", "CGA"),
                 bg_context = c("C", "G", "C", "G"),
                 alt_alleles = c("A", "T", "T", "A"))
})
FiveFUEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("CTT", "AAG"),
                 bg_context = c("T", "A"),
                 alt_alleles = c("G", "C"))
})
PlatSBSEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("CCC", "GGG", "CCT", "AGG"),
                 bg_context = c("C", "G", "C", "G"),
                 alt_alleles = c("T", "A", "T", "A"))
})
UVEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("CC", "GG"),
                 dinucleotide = TRUE,
                 alt_alleles = c("TT", "AA"))
})
PlatDBSEnrich <- sapply(subsonly, function(tb){
  calcSigDensity(tb$Reference_Allele, tb$Tumor_Seq_Allele2, context = tb$CONTEXT....20.,
                 mut_context = c("CT", "CT", "AG", "AG", "TG", "TC", "CA", "GA"),
                 dinucleotide = TRUE,
                 alt_alleles = c("AA", "AC", "TT", "GT", "GT", "AA", "AC", "TT"))
})

sig_enrich <- Reduce(function(d1, d2) cbind(as.data.frame(d1), as.data.frame(d2)),
                     list(AgingEnrich, ApobecEnrich, POLEEnrich, FiveFUEnrich,
                          PlatSBSEnrich, PlatDBSEnrich, UVEnrich))
colnames(sig_enrich) <- c("AgingEnrich", "ApobecEnrich", "POLEEnrich", "FiveFUEnrich",
                          "PlatSBSEnrich", "PlatDBSEnrich", "UVEnrich")
saveRDS(sig_enrich, 'TCGA_sig_enrich.rds')
rm(subsonly)

top mutations by incidence for each signature

tcga <- readRDS('TCGA_allmuts_withSig_FINAL.rds')
tcga$sample <- substr(tcga$Tumor_Sample_Barcode, 1, 12)
#____________________________________
# APOBEC only
apobec_topmut <- ddply(tcga[which(tcga$ApobecSig == 1), ], 
                       c("Hugo_Symbol", "Protein_Change"), nrow)
apobec_topmut <- apobec_topmut[order(apobec_topmut$V1, decreasing = TRUE), ]
apobec_topmut <- apobec_topmut[which(apobec_topmut$Protein_Change != "."), ]
apobec_topmut <- apobec_topmut[1:12, ]
apobec_topmut$delta <- apply(apobec_topmut[, 1:2], 1, paste, collapse = " ")
apobec_topmut$delta <- factor(apobec_topmut$delta, levels = apobec_topmut$delta)
ggplot(apobec_topmut) + geom_bar(aes(x = delta, y = V1), stat = "identity") +
  cowplot::theme_cowplot() + xlab("") + ylab("Incidence") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# all signatures
topmuts <- lapply(c("ApobecSig", "AgingSig", "POLESig", "UVSig", "FiveFUSig", 
                    "PlatDBS", "PlatSBS"), function(x){
  topmut <- ddply(tcga[which(tcga[, x] == 1), ], c("Hugo_Symbol", "Protein_Change"), nrow)
  topmut <- topmut[order(topmut$V1, decreasing = TRUE), ]
  topmut <- topmut[which(topmut$Protein_Change != "."), ]
  topmut <- topmut[1:12, ]
  topmut$delta <- apply(topmut[, 1:2], 1, paste, collapse = " ")
  topmut$delta <- factor(topmut$delta, levels = topmut$delta)
  ggplot(topmut) + geom_bar(aes(x = delta, y = V1), stat = "identity") +
    cowplot::theme_cowplot() + xlab("") + ylab("Incidence") + ylim(0, 400) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle(gsub("Sig", "", x))
})
gridExtra::grid.arrange(topmuts[[1]], topmuts[[2]], topmuts[[3]], topmuts[[4]], 
                        topmuts[[5]], topmuts[[6]], topmuts[[7]], ncol = 4)

define subset of TCGA cases relevant to each mutational signature

  • 5-FU / Platinum - cases treated with these drugs
clinFiles <- list.files(path = '/home/josefng/TCGA_clinical_all', full.names = TRUE)

drug_info <- lapply(clinFiles, function(file)
{
  cancerType <- unlist(strsplit(file, split = "/"))
  cancerType <- gsub(".clin.merged.txt", "",
                     cancerType[length(cancerType)], fixed = TRUE)
  print(cancerType)
  clin <- read.table(file, sep = "\t", quote = "", comment.char = "",
                     row.names = 1, stringsAsFactors = F)
  clin <- data.frame(t(clin), stringsAsFactors = F)
  drugs <- data.frame(clin[, grepl('patient.bcr_patient_barcode|patient.drugs.drug.*drug_name',
                        colnames(clin), perl = T)], stringsAsFactors = F)
  if(ncol(drugs) > 1){
    drugs <- apply(drugs, MARGIN = 1, function(x){
      o <- x[2:length(x)]
      o <- o[!is.na(o)]
      if(length(o) == 0){
        o <- NA
      }
      unname(o)
    })
    names(drugs) <- toupper(clin[, "patient.bcr_patient_barcode"])
    drugs
  } else return(NULL)
})
cancerTypes <- sapply(clinFiles, function(file){
  o <- unlist(strsplit(file, split = "/"))
  o <- gsub(".clin.merged.txt", "", o[length(o)], fixed = TRUE)
  o
})
names(drug_info) <- cancerTypes
drug_info <- drug_info[sapply(drug_info, function(x) !is.null(x))]

# patient barcode with platinum-based drug administration
plat_patients <- unique(names(unlist(drug_info)[grep('plat', unlist(drug_info))]))
for(patient in plat_patients){
  o <- unlist(strsplit(patient, split = ".", fixed = T))[2]
  o <- substr(o, 1, 12)
  write(o, "TCGA_platinum.txt", append = TRUE)
}

# patient barcode with 5-FU administration
fu_patients <- unique(names(unlist(drug_info)[grep('5fu|5-fu|uracil', unlist(drug_info))]))
for(patient in fu_patients){
  o <- unlist(strsplit(patient, split = ".", fixed = T))[2]
  o <- substr(o, 1, 12)
  write(o, "TCGA_5fu.txt", append = TRUE)
}
  • TCGA hypermutated - density (number of mutations per Mb) > 10 (ref: here). Prune (see below) to specifically focus on those with POLE mutation
# get cases with high mutational density
library(plyr)
mafs <- list.files(path = "/home/josefng/TCGA.firehose_maf",
                   pattern = "_maf_alilmuttype_mapped_withSig.txt", full.names = TRUE)
mafs <- lapply(mafs, function(maf){
 print(maf)
 tb <- read.table(maf, header = TRUE, sep = "\t", quote = "", 
                  stringsAsFactors = FALSE, comment.char = "")
 tb
})
mafs <- do.call("rbind", mafs)
nmuts <- ddply(mafs, "Tumor_Sample_Barcode", nrow)
# rm(mafs)
colnames(nmuts)[2] <- "nmut"
nmuts$density <- nmuts[, 2] / 38 # ref https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-017-0424-2
hypermutated_cases <- nmuts[nmuts$density > 10, 1] # ref https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5849393/
  • UV: The TCGA Melanoma cohort
  • Aging / APOBEC: enrichment of the signature (ie density) above-median. In the case of aging, examine association with age.
sig_enrich <- readRDS('TCGA_sig_enrich.rds')
caselist <- list('5FU' = readLines('TCGA_5fu.txt'),
                 'Platinum' = readLines('TCGA_platinum.txt'),
                 'POLE' = readLines('TCGA_hypermutated.txt'))
caselist[["UV"]] = unique(tcga[which(tcga$cancerType == "SKCM"), "Tumor_Sample_Barcode"])
caselist[["UV"]] = substr(caselist[["UV"]], 1, 12)
caselist[["Apobec"]] = rownames(sig_enrich[which(sig_enrich$ApobecEnrich > median(sig_enrich$ApobecEnrich)), ])
caselist[["Apobec"]] = substr(caselist[["Apobec"]], 1, 12)
caselist[["Aging"]] = rownames(sig_enrich[which(sig_enrich$AgingEnrich > median(sig_enrich$AgingEnrich)), ])
caselist[["Aging"]] = substr(caselist[["Aging"]], 1, 12)

cancerTypes <- ddply(tcga, c("Tumor_Sample_Barcode", "sample", "cancerType"), nrow)
sig_enrich <- merge(sig_enrich, cancerTypes, by.x = "row.names", 
                    by.y = "Tumor_Sample_Barcode",
                    all.x = TRUE, all.y=  FALSE, sort= FALSE)
colnames(sig_enrich)[ncol(sig_enrich)] <- "Mutation_load"
head(sig_enrich)
##                      Row.names AgingEnrich ApobecEnrich POLEEnrich FiveFUEnrich
## 1 TCGA-02-0003-01A-01D-1490-08    9.403312    0.4244755  0.9342055            0
## 2 TCGA-02-0033-01A-01D-1490-08    7.889610    0.5148305  1.1675676            0
## 3 TCGA-02-0047-01A-01D-1490-08    9.716312    0.4672224  1.1132178            0
## 4 TCGA-02-0055-01A-01D-1490-08    9.693034    0.1619329  0.7126736            0
## 5 TCGA-02-2470-01A-01D-1494-08   10.653325    0.9134065  0.6789966            0
## 6 TCGA-02-2483-01A-01D-1494-08   10.320122    0.3173933  1.0260167            0
##   PlatSBSEnrich PlatDBSEnrich UVEnrich       sample cancerType Mutation_load
## 1     0.9432789             0        0 TCGA-02-0003     GBMLGG            57
## 2     0.8596698             0        0 TCGA-02-0033     GBMLGG            47
## 3     0.9402883             0        0 TCGA-02-0047     GBMLGG            82
## 4     0.7446712             0        0 TCGA-02-0055     GBMLGG            76
## 5     0.0000000             0        0 TCGA-02-2470     GBMLGG            80
## 6     0.7141350             0        0 TCGA-02-2483     GBMLGG            67

Explore mutations associated with specific signatures

POLE

PTEN p.R130Q

#_______________
# prune list to select only POLE-mutated for hypermutation 
hypermutated <- caselist$POLE
msi <- list.files('clinical', pattern = "_msi", full.names = TRUE)
msi <- lapply(msi, read.table, stringsAsFactors = FALSE, row.names = 1, sep = "\t")
msi <- lapply(msi, function(tb) t(tb))
msi <- lapply(msi, function(tb){
  tb[, 1] <- toupper(tb[, 1])
  tb[, 2] <- sapply(tb[, 2], function(x){
    if(is.na(x)) return(FALSE)
    if(x == "msi-h") return(TRUE) else return(FALSE)
  })
  colnames(tb) <- c("Sample", "MSI_status")
  tb
})
msi <- do.call("rbind", msi)
hypermut_pol <- tcga[which(tcga$Hugo_Symbol %in% c("POLE", "POLD1", "PTEN") & 
                             tcga$mutAA != tcga$origAA), ]
hypermut_pol <- reshape2::acast(hypermut_pol, sample ~ Hugo_Symbol, 
                                value.var = "Protein_Change",
                                fun.aggregate = function(x) length(x) > 0)
hypermut_pol[, 3] <- sapply(
  rownames(hypermut_pol), 
  function(x) nrow(tcga[tcga$Hugo_Symbol == "PTEN" & tcga$sample == x & 
                          tcga$Protein_Change == "p.R130Q", ]) > 0
)
colnames(hypermut_pol)[3] <- "PTEN R130Q"
hypermut <- sig_enrich[, c("sample", "cancerType", "ApobecEnrich", 
                           "POLEEnrich", "Mutation_load", "Row.names")]
colnames(hypermut)[ncol(hypermut)] <- "Tumor_Sample_Barcode"
hypermut <- merge(hypermut, msi, by.x = "sample", by.y = "Sample",
                  all.x = TRUE, all.y = FALSE, sort = FALSE)
hypermut <- hypermut[which(hypermut$sample %in% caselist$POLE), ]
hypermut <- merge(hypermut, hypermut_pol, by.x = "sample", by.y = "row.names",
                  all.x = TRUE, all.y=  FALSE, sort = FALSE)
hypermut[which(is.na(hypermut$POLD1)), "POLD1"] <- FALSE
hypermut[which(is.na(hypermut$POLE)), "POLE"] <- FALSE
hypermut[which(is.na(hypermut$`PTEN R130Q`)), "PTEN R130Q"] <- FALSE
hypermut[which(is.na(hypermut$MSI_status)), "MSI_status"] <- FALSE
hypermut$ApobecStatus <- (hypermut$ApobecEnrich > 1)
hypermut$PoleStatus <- (hypermut$POLEEnrich > 1)

caselist[["POLEmut"]] <- hypermut[which(hypermut$POLE & 
                                          hypermut$sample %in% caselist$POLE), 
                                  "sample"]
names(caselist)[which(names(caselist) == "POLEmut")] <- "hypermutated"
caselist <- caselist[-which(names(caselist) == "POLE")]

# hypermutation: PTEN R130Q with POLE/POLD1
hypermut <- hypermut[order(hypermut$POLE, hypermut$POLD1, hypermut$`PTEN R130Q`,
                           hypermut$MSI_status, hypermut$Mutation_load,
                           hypermut$ApobecEnrich,
                           decreasing = TRUE), ]
Heatmap(t(hypermut)[c("ApobecStatus", "MSI_status", "POLE", "POLD1", "PTEN R130Q"), ], 
        col = c("grey80", "black"),
        show_column_names = FALSE, cluster_columns = FALSE, cluster_rows = FALSE, 
        top_annotation = HeatmapAnnotation("Mutation load" =
                                             anno_barplot(hypermut$Mutation_load)))

Aging

# age
age <- read.table('TCGA_all_age.txt', 
                  header = TRUE, stringsAsFactors = FALSE, sep = "\t")
age <- age[which(!is.na(age$age_at_initial_pathologic_diagnosis)), ]
sig_enrich <- merge(sig_enrich, age[, c(1, 3)], by.x = "sample", 
                    by.y = "bcr_patient_barcode",
                    all.x = TRUE, all.y = FALSE, sort = FALSE)
sig_enrich$age_group <- cut(sig_enrich$age_at_initial_pathologic_diagnosis,
                            breaks = seq(10, 90, by = 10))
ggplot(sig_enrich) + geom_boxplot(aes(x = age_group, y = AgingEnrich))

# weirdly the 'AgingEnrich' does not correlate with age. Rather it shows the vast majority of samples have the aging signature occuring at a rate higher than expected.
# 'High AgingEnrich' probably only means absence of other processes? Check with SigProfiler results
# https://www.synapse.org/#!Synapse:syn11801497

sigprofiler_counts <- read.csv('TCGA_WES_sigProfiler_SBS_signatures_in_samples.csv', row.names = NULL)
sigprofiler_counts$othersig <- apply(sigprofiler_counts[, c(-1, -2, -3, -4, -8)], 
                                     MARGIN = 1, sum)
sigprofiler_counts$othersig_prop <- sigprofiler_counts$othersig /
  apply(sigprofiler_counts[, c(-1, -2, -3, -69)], 1, sum)
sigprofiler_counts$sample <- substr(sigprofiler_counts$Sample.Names, 1, 12)
sigprofiler_counts <- merge(sigprofiler_counts, sig_enrich[, c(1, 2, 3)], 
                            by = "sample",
                            all.x = FALSE, all.y = TRUE, sort = FALSE)
sigprofiler_counts$AgingEnrich_group <- cut(sigprofiler_counts$AgingEnrich, 
                                            quantile(sigprofiler_counts$AgingEnrich,
                                                     probs = seq(0, 1, by = .25)), 
                                            include.lowest = TRUE, labels = FALSE)
sigprofiler_counts$AgingEnrich_group <- factor(sigprofiler_counts$AgingEnrich_group,
                                               ordered = TRUE)

ggplot(sigprofiler_counts) + geom_boxplot(aes(x = AgingEnrich_group, y = othersig_prop)) +
  cowplot::theme_cowplot() + xlab("Enrichment of Signature 1\n(binned by quartiles)") +
  ylab("Proportion of mutations NOT in Signature 1") + scale_y_continuous(labels = scales::percent)
## Warning: Removed 1183 rows containing non-finite values (stat_boxplot).

Nonsurprisingly the enrichment of Signature 1 (i.e. Aging signature) implies a low proportion of mutations outside of Signature 1.

TP53

# oncoprint for TP53 mutations selected in aging signature correlation with age
tp53_pol <- rbind(tcga[which(tcga$Hugo_Symbol %in% c("TP53") & 
                               tcga$mutAA != tcga$origAA), ])#,
                  #tcga[which(tcga$Hugo_Symbol %in% c("IDH1") & tcga$mutAA != tcga$origAA), ])
tp53_pol <- reshape2::acast(tp53_pol, sample ~ Protein_Change, 
                            value.var = "Protein_Change",
                            fun.aggregate = function(x) length(x) > 0)
tp53_pol <- tp53_pol[, c(#"p.R132H", 
  "p.R175H", "p.R248Q", "p.R273C", "p.R273H", "p.G245S")]
tp53 <- age[, c("bcr_patient_barcode", "age_at_initial_pathologic_diagnosis")]
#colnames(tp53)[ncol(tp53)] <- "Tumor_Sample_Barcode"
tp53 <- merge(tp53, tp53_pol, by.x = "bcr_patient_barcode", by.y = "row.names",
                  all.x = TRUE, all.y=  FALSE, sort = FALSE)
#tp53[which(is.na(tp53$p.R132H)), "p.R132H"] <- FALSE
tp53[which(is.na(tp53$p.R175H)), "p.R175H"] <- FALSE
tp53[which(is.na(tp53$p.R248Q)), "p.R248Q"] <- FALSE
tp53[which(is.na(tp53$p.R273C)), "p.R273C"] <- FALSE
tp53[which(is.na(tp53$p.R273H)), "p.R273H"] <- FALSE
tp53[which(is.na(tp53$p.G245S)), "p.G245S"] <- FALSE
tp53 <- tp53[order(tp53$p.R175H, tp53$p.R248Q, tp53$p.R273C,
                   tp53$p.R273H, tp53$p.G245S, decreasing = TRUE), ]
library(ComplexHeatmap)
#pdf('Documents/zoomvartcga_writeup/new_figures/aging_topmut_oncoplot.pdf', 
#    width = 11.7, height = 1.5)
Heatmap(t(as.matrix(tp53$age_at_initial_pathologic_diagnosis[1:800])), 
        col = c("navy", "yellow"), 
        show_column_names = FALSE, cluster_columns = FALSE, cluster_rows = FALSE, 
        top_annotation = HeatmapAnnotation(#"IDH1 R132H" =  tp53$p.R132H[1:800],
                                           "TP53 R175H" =  tp53$p.R175H[1:800],
                                           "TP53 R248Q" =  tp53$p.R248Q[1:800],
                                           "TP53 R273C" =  tp53$p.R273C[1:800],
                                           "TP53 R273H" =  tp53$p.R273H[1:800],
                                           "TP53 G245S" =  tp53$p.G245S[1:800],
                                           annotation_name_side = "left",
                                           col = list(#"IDH1 R132H" = c("FALSE" = "grey80", "TRUE" = "black"),
                 "TP53 R175H" = c("FALSE" = "grey80", "TRUE" = "black"),
                 "TP53 R248Q" = c("FALSE" = "grey80", "TRUE" = "black"),
                 "TP53 R273C" = c("FALSE" = "grey80", "TRUE" = "black"),
                 "TP53 R273H" = c("FALSE" = "grey80", "TRUE" = "black"),
                 "TP53 G245S" = c("FALSE" = "grey80", "TRUE" = "black"))))

mCSM analysis of specific hotspots

PTEN

PTEN <- read.table('PTEN_R130_mCSM.txt',
                   header = TRUE, sep = "\t", stringsAsFactors = FALSE)
PTEN$pos <- apply(PTEN[, c("WILD_RES", "RES_POS")], 1, paste, collapse = "")
PTEN$MUT_RES <- factor(PTEN$MUT_RES, levels = c("Y", "W", "F", "D", "E", "R", "K", "H",
                                                "S", "T", "C", "P", "Q", "N",
                                                "G", "A", "V", "L", "M", "I"))
ggplot(PTEN, aes(x = pos, y = MUT_RES)) + geom_tile(aes(fill = PRED_DDG)) +
  geom_text(aes(label = PRED_DDG), colour = "white") +
  scale_fill_gradient2(low = "red", mid = "yellow", high = "green", name = "ddG",
                       limits = c(-2.5, 2.5)) + cowplot::theme_cowplot() +
  scale_y_discrete(drop = FALSE) + ylab("") + xlab("")

PIK3CA <- read.table('PIK3CA_mCSM.txt',
                   header = TRUE, sep = "\t", stringsAsFactors = FALSE)
PIK3CA$pos <- apply(PIK3CA[, c("WILD_RES", "RES_POS")], 1, paste, collapse = "")
PIK3CA$MUT_RES <- factor(PIK3CA$MUT_RES, levels = c("Y", "W", "F", "D", "E", "R", 
                                                    "K", "H", "S", "T", "C", "P",
                                                    "Q", "N", "G", "A", "V", "L",
                                                    "M", "I"))

ggplot(PIK3CA, aes(x = pos, y = MUT_RES)) + geom_tile(aes(fill = PRED_DDG)) +
  geom_text(aes(label = PRED_DDG), colour = "white") +
  scale_fill_gradient2(low = "red", mid = "yellow", high = "green", name = "ddG",
                       limits = c(-2.5, 2.5)) + cowplot::theme_cowplot() +
  scale_y_discrete(drop = FALSE) + ylab("") + xlab("")