diff --git a/_pre-ARC-legacy/190522_RNASeq_Report.docx b/_pre-ARC-legacy/190522_RNASeq_Report.docx new file mode 100644 index 0000000000000000000000000000000000000000..2cc34e9ad4ef570d8eb6da2aee81954f089889e6 Binary files /dev/null and b/_pre-ARC-legacy/190522_RNASeq_Report.docx differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/01_FullPipeline.R b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/01_FullPipeline.R new file mode 100644 index 0000000000000000000000000000000000000000..da65d1644c4cefc64569a50a6cda6e61199c3ca1 --- /dev/null +++ b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/01_FullPipeline.R @@ -0,0 +1,1372 @@ +### Setting up the expression.data / mother table from kallisto and +### Analysis of differential gene expression using sleuth + +# Tutorial adapted from this online tutorial: https://pachterlab.github.io/sleuth_walkthroughs/trapnell/analysis.html + + +required.packages <- c("Rmisc", "plyr", "ggplot2", "reshape2", "gridExtra", 'openxlsx', 'sleuth', 'amap', 'ggdendro', 'ellipse', 'UpSetR', 'VennDiagram') + +for(package in required.packages) +{ + print(package) + ## Check if package is installed. If not, install + if(!package %in% row.names(installed.packages())){install.packages(package, repos ="https://cran.uni-muenster.de/")} + # ## Check if package is up to date. If not, update + # update.packages(package, repos = "https://cran.uni-muenster.de/") + ## Load package + library(package, character.only = T) +} + +nice.date <- format(Sys.time(), "%y%m%d") +SampleInfo <- read.xlsx('../00_SampleInfo.xlsx') + +color.values <- brewer.pal(8, 'Dark2')[8:1] +names(color.values) <- unique(SampleInfo$Cond_short) + +theme_dominik <- + theme(panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + theme(aspect.ratio = 1) + + theme(axis.ticks = element_line(color = "black"), axis.text = element_text(color = "black")) + + theme(strip.background = element_blank(), strip.text = element_text(size = 8)) + + theme(legend.position = "bottom", legend.direction = "horizontal", legend.title = element_blank()) + #, legend.key.size = unit(0.05, "npc") + theme(plot.margin=unit(c(1, 1, 1, 1),"cm")) + + theme(plot.title = element_text(hjust = 0, face = "bold", size = 15)) + + +######################################## +#### Annotation data +######################################## + +load('RNASeq_Annotation/Arabidopsis/02.0Araport11/02.0Araport.RData') +load("RNASeq_Annotation/Arabidopsis/02.4GO/GOAnno2018-02-06.RData") +load("RNASeq_Annotation/Arabidopsis/02.3Mapman/180924_MercatorMapman.RData") +load("RNASeq_Annotation/Arabidopsis/02.3Mapman/02.3Mapman.RData") +colnames(mapman)[2] <- 'MAPMAN.BIN' +load("RNASeq_Annotation/Arabidopsis/02.9PlnTFDB/02.9PlnTFDB.RData") +load("RNASeq_Annotation/Arabidopsis/02.12Lipids/02.12lipids.RData") + +# load("02_kmeans_clustering/190520_02_ClusterResults.RData") + + +######################################## +#### Reading Kallisto results +######################################## + +# sleuth_objects <- mother.tables <- s2c.list <- mapping_stats.list <- list() +# for(type in c('not_trimmed', 'trimmed')) +# { +# base_dir <- paste0('kallisto_', type) +# +# # A list of paths to the kallisto results indexed by the sample IDs is collated with +# kal_dirs <- dir(base_dir, full.names = T) ## Sleuth requires full paths +# kal_dirs +# +# s2c <- SampleInfo[order(SampleInfo$IlluminaID), c('IlluminaID', 'Cond_short')] +# colnames(s2c)[1] <- 'sample' +# +# s2c$path <- kal_dirs +# s2c.list[[type]] <- s2c +# +# # First build a sleuth object +# sleuth_objects[[type]] <- so <- sleuth_prep(s2c, ~ Cond_short) +# +# expression.data <- kallisto_table(so) +# mother.tables[[type]] <- dcast(expression.data, target_id~sample, value.var = 'tpm') +# +# mapping_stats <- c() +# +# for(i in dir(kal_dirs, pattern = '.json', full.names = T)) +# { +# +# x <- readLines(i) +# x <- x[2:(length(x)-1)] +# y <- t(do.call(rbind, strsplit(gsub(',', '', x), split = '": '))) +# +# colnames(y) <- y[1,] +# z <- as.data.frame(t(y[-1,])) +# +# z$sample <- unlist(strsplit(i, split = '/'))[2] +# +# mapping_stats <- rbind(mapping_stats, z) +# +# } +# +# mapping_stats.list[[type]] <- mapping_stats +# +# } +# +# +# write.xlsx(mapping_stats.list, file = paste(nice.date, '01_mappingstats.xlsx', sep = '_')) +# +# save(sleuth_objects, mother.tables, s2c.list, file = '01_Kallisto.RData') + + +load(file = '01_Kallisto.RData') + + + +######################################## +#### PCA and HCL +######################################## + +pdf(paste(nice.date, '02_PCA_HCL.pdf', sep = '_'), width = 8, height = 8) +for(type in c('not_trimmed', 'trimmed')) +{ + + ### Prepare data + mother.table <- mother.tables[[type]] + + for_clust <- mother.table[, 2:ncol(mother.table)] + + # we'll filter to leave only, more abundant, always-above-0 genes + for_clust <- for_clust[apply(for_clust, 1, min) > 0, ] + # and log2 transform + for_clust <- t(log2(for_clust)) + + ### PCA + pca <- prcomp(for_clust, scale. = T, retx = T) + s <- summary(pca) + + scores <- as.data.frame(pca$x) + scores$sample <- row.names(scores) + + scores <- merge(scores, SampleInfo, by.x = 'sample', by.y = 'IlluminaID') + + + pca12 <- ggplot(data=scores,aes(x = PC1, y = PC2)) + theme_bw() + + geom_point(aes(color=Cond_short), size=2.5) + + geom_text(aes(label=sample),size=3, vjust = 2) + + labs(x = paste0("PC1 (", round(s$importance[2,1], digits = 3) *100, "%)"), + y = paste0("PC2 (", round(s$importance[2,2], digits = 3) *100, "%)")) + + theme(panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + theme(axis.text = element_text(color = "black"), axis.ticks = element_line(color = "black"), axis.title = element_text(color = "black")) + + theme(plot.title = element_text(face= "bold")) + + theme(aspect.ratio = 1) + + scale_color_manual(values = color.values) + + theme(strip.background = element_blank(), strip.text = element_text(size = 10, face = "bold")) + + ggtitle(type) + + theme(legend.position = "right", legend.title = element_blank()) #legend.direction = "vertical", + + print(pca12) + + + ### PCA with ellipses + + ellipses <- droplevels(scores[, c('PC1', 'PC2', 'Cond_short')]) + colnames(ellipses) <- c('PC1', 'PC2', 'group') + + centroids <- aggregate(cbind(ellipses$PC1, ellipses$PC2)~group, ellipses, mean) + + conf.rgn <- do.call(rbind,lapply(unique(ellipses$group),function(t) + data.frame(group=as.character(t), + + ellipse::ellipse(cov(ellipses[ellipses$group==t,1:2]), + centre=as.matrix(centroids[centroids$group==t,2:3]), + level=0.95), + stringsAsFactors=FALSE))) + + pc.elli <- ggplot(ellipses, aes(x = PC1, y = PC2, col = group)) + + geom_point(size = 2.5) + + geom_path(data=conf.rgn) + + labs(x = paste0("PC1 (", round(s$importance[2,1], digits = 3) *100, "%)"), + y = paste0("PC2 (", round(s$importance[2,2], digits = 3) *100, "%)")) + + theme(panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + theme(axis.text = element_text(color = "black"), axis.ticks = element_line(color = "black"), axis.title = element_text(color = "black")) + + theme(plot.title = element_text(face= "bold")) + + theme(aspect.ratio = 1) + + scale_color_manual(values = color.values) + + theme(strip.background = element_blank(), strip.text = element_text(size = 10, face = "bold")) + + theme(legend.position = "right", legend.title = element_blank()) + + print(pc.elli) + + + ### HCL + + clusters <- hcluster(for_clust, method = "pearson", link = "complete") + + ### Extract the cluster data to use in gglot + ddata <- dendro_data(as.dendrogram(clusters), type = "rectangle") ### alternative: type = "triangle" + + ### Extract the label data of the dendogram... + labelData <- label(ddata) + ### ... and match with the Sample info + colnames(labelData)[3] <- "IlluminaID" + labelData <- merge(labelData, SampleInfo, "IlluminaID") + + + p <- ggplot(segment(ddata)) + + ### The following line is the info for the indivSampleIDual segments (leaves) of the dendogram + geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) + + ### The following line is the info for the labels of the leaves + geom_text(data = labelData, aes(x = x, y = y, hjust = 0, label = IlluminaID, col = Cond_short)) + + scale_color_manual(values = color.values) + + coord_flip() + + scale_y_reverse(expand = c(0.4,0)) + + theme_dendro() + + # theme_dominik + + ggtitle(type) + + print(p) + + +} +dev.off() + + +######################################## +#### PCA and HCL of only single treatments +######################################## + +pdf(paste(nice.date, '02_PCA_HCL_singleTreatments.pdf', sep = '_'), width = 6, height = 6) + + mother.table <- mother.tables$not_trimmed[,c('target_id', SampleInfo[SampleInfo$Cond_short %in% c('C', 'S', 'M', 'H'), 'IlluminaID'])] + for_clust <- mother.table[, 2:ncol(mother.table)] + for_clust <- for_clust[apply(for_clust, 1, min) > 0, ] + for_clust <- t(log2(for_clust)) + + ### PCA + pca <- prcomp(for_clust, retx=T, scale = T) + # pca <- prcomp(for_clust) + s <- summary(pca) + + scores <- as.data.frame(pca$x) + scores$sample <- row.names(scores) + + scores <- merge(scores, SampleInfo, by.x = 'sample', by.y = 'IlluminaID') + + + pca12 <- ggplot(data=scores,aes(x = PC1, y = PC2)) + theme_bw() + + geom_point(aes(color=Cond_short), size=2.5) + + # geom_text(aes(label=sample),size=3, vjust = 2) + + labs(x = paste0("PC1 (", round(s$importance[2,1], digits = 3) *100, "%)"), + y = paste0("PC2 (", round(s$importance[2,2], digits = 3) *100, "%)")) + + theme(panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + theme(axis.text = element_text(color = "black"), axis.ticks = element_line(color = "black"), axis.title = element_text(color = "black")) + + theme(plot.title = element_text(face= "bold")) + + theme(aspect.ratio = 1) + + scale_color_manual(values = color.values) + + theme(strip.background = element_blank(), strip.text = element_text(size = 10, face = "bold")) + + theme(legend.position = "right", legend.title = element_blank()) #legend.direction = "vertical", + + print(pca12) + + ### PCA with ellipses + + + ellipses <- droplevels(scores[, c('PC1', 'PC2', 'Cond_short')]) + colnames(ellipses) <- c('PC1', 'PC2', 'group') + ellipses$group <- as.factor(ellipses$group) + + centroids <- aggregate(cbind(ellipses$PC1, ellipses$PC2)~group, ellipses, mean) + + conf.rgn <- do.call(rbind,lapply(unique(ellipses$group),function(t) + data.frame(group=as.character(t), + + ellipse::ellipse(cov(ellipses[ellipses$group==t,1:2]), + centre=as.matrix(centroids[centroids$group==t,2:3]), + level=0.95), + stringsAsFactors=FALSE))) + + pc.elli <- ggplot(ellipses, aes(x = PC1, y = PC2, col = group)) + + geom_point(size = 2.5) + + geom_path(data=conf.rgn) + + labs(x = paste0("PC1 (", round(s$importance[2,1], digits = 3) *100, "%)"), + y = paste0("PC2 (", round(s$importance[2,2], digits = 3) *100, "%)")) + + theme(panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + theme(axis.text = element_text(color = "black"), axis.ticks = element_line(color = "black"), axis.title = element_text(color = "black")) + + theme(plot.title = element_text(face= "bold")) + + theme(aspect.ratio = 1) + + scale_color_manual(values = color.values) + + theme(strip.background = element_blank(), strip.text = element_text(size = 10, face = "bold")) + + theme(legend.position = "right", legend.title = element_blank()) + + print(pc.elli) + + ### HCL + + clusters <- hcluster(for_clust, method = "pearson", link = "complete") + + ### Extract the cluster data to use in gglot + ddata <- dendro_data(as.dendrogram(clusters), type = "rectangle") ### alternative: type = "triangle" + + ### Extract the label data of the dendogram... + labelData <- label(ddata) + ### ... and match with the Sample info + colnames(labelData)[3] <- "IlluminaID" + labelData <- merge(labelData, SampleInfo, "IlluminaID") + + + p <- ggplot(segment(ddata)) + + ### The following line is the info for the indivSampleIDual segments (leaves) of the dendogram + geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) + + ### The following line is the info for the labels of the leaves + geom_text(data = labelData, aes(x = x, y = y, hjust = 0, label = IlluminaID, col = Cond_short)) + + scale_color_manual(values = color.values) + + coord_flip() + + scale_y_reverse(expand = c(0.4,0)) + + theme_dendro() + + print(p) + +dev.off() + + +######################################## +#### PCA separated with / without Heat +######################################## + +suppressWarnings(dir.create('02_PCA_results')) + +for(i in c('Heat', 'NoHeat', 'All')) +{ + + if(i == 'Heat'){for_clust <- mother.tables$not_trimmed[,c('target_id', SampleInfo[grepl('H', SampleInfo$Cond_short), 'IlluminaID'])]} + if(i == 'NoHeat'){for_clust <- mother.tables$not_trimmed[,c('target_id', SampleInfo[!grepl('H', SampleInfo$Cond_short), 'IlluminaID'])]} + if(i == 'All'){for_clust <- mother.tables$not_trimmed} + + for_clust <- for_clust[, 2:ncol(for_clust)] + for_clust <- for_clust[apply(for_clust, 1, min) > 0, ] + for_clust <- t(log2(for_clust)) + + pca <- prcomp(for_clust, retx=T, scale = T) + s <- summary(pca) + + scores <- as.data.frame(pca$x) + scores$sample <- row.names(scores) + + scores <- merge(scores, SampleInfo, by.x = 'sample', by.y = 'IlluminaID') + + ellipses <- droplevels(scores[, c('PC1', 'PC2', 'Cond_short')]) + colnames(ellipses) <- c('PC1', 'PC2', 'group') + ellipses$group <- as.factor(ellipses$group) + + centroids <- aggregate(cbind(ellipses$PC1, ellipses$PC2)~group, ellipses, mean) + + conf.rgn <- do.call(rbind,lapply(unique(ellipses$group),function(t) + data.frame(group=as.character(t), + + ellipse::ellipse(cov(ellipses[ellipses$group==t,1:2]), + centre=as.matrix(centroids[centroids$group==t,2:3]), + level=0.95), + stringsAsFactors=FALSE))) + + pc.elli <- ggplot(ellipses, aes(x = PC1, y = PC2, col = group)) + + geom_point(size = 2.5) + + geom_path(data=conf.rgn) + + labs(x = paste0("PC1 (", round(s$importance[2,1], digits = 3) *100, "%)"), + y = paste0("PC2 (", round(s$importance[2,2], digits = 3) *100, "%)")) + + theme(panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + theme(axis.text = element_text(color = "black"), axis.ticks = element_line(color = "black"), axis.title = element_text(color = "black")) + + theme(plot.title = element_text(face= "bold")) + + theme(aspect.ratio = 1) + + scale_color_manual(values = color.values) + + theme(strip.background = element_blank(), strip.text = element_text(size = 10, face = "bold")) + + theme(legend.position = "right", legend.title = element_blank()) + + png(paste('02_PCA_results/', nice.date, '02_PCA_', i, '.png', sep = ''), res = 300, width = 1600, height = 1600) + print(pc.elli) + dev.off() + +} + + +######################################## +#### Averaged PCA separated with / without Heat +######################################## + +suppressWarnings(dir.create('02_PCA_results')) + +for(i in c('Heat', 'NoHeat', 'All')) +{ + + if(i == 'Heat'){for_clust <- mother.tables$not_trimmed[,c('target_id', SampleInfo[grepl('H', SampleInfo$Cond_short), 'IlluminaID'])]} + if(i == 'NoHeat'){for_clust <- mother.tables$not_trimmed[,c('target_id', SampleInfo[!grepl('H', SampleInfo$Cond_short), 'IlluminaID'])]} + if(i == 'All'){for_clust <- mother.tables$not_trimmed} + + averaged <- melt(for_clust) + averaged <- merge(averaged, SampleInfo, by.x = 'variable', by.y = 'IlluminaID') + averaged.su <- summarySE(averaged, groupvars = c('target_id', 'Cond_short'), measurevar = 'value') + for_clust <- dcast(averaged.su, target_id~Cond_short, value.var = 'value') + + for_clust <- for_clust[, 2:ncol(for_clust)] + for_clust <- for_clust[apply(for_clust, 1, min) > 0, ] + for_clust <- t(log2(for_clust)) + + pca <- prcomp(for_clust, retx=T, scale = T) + s <- summary(pca) + + scores <- as.data.frame(pca$x) + scores$Cond_short <- row.names(scores) + + pc <- ggplot(scores, aes(x = PC1, y = PC2, col = Cond_short)) + + geom_text(aes(label = Cond_short)) + + labs(x = paste0("PC1 (", round(s$importance[2,1], digits = 3) *100, "%)"), + y = paste0("PC2 (", round(s$importance[2,2], digits = 3) *100, "%)")) + + theme(panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + theme(axis.text = element_text(color = "black"), axis.ticks = element_line(color = "black"), axis.title = element_text(color = "black")) + + theme(plot.title = element_text(face= "bold")) + + theme(aspect.ratio = 1) + + scale_color_manual(values = color.values) + + theme(strip.background = element_blank(), strip.text = element_text(size = 10, face = "bold")) + + theme(legend.position = "none", legend.title = element_blank()) + + + png(paste('02_PCA_results/', nice.date, '02_PCA_Averaged', i, '.png', sep = ''), res = 300, width = 1600, height = 1600) + print(pc) + dev.off() + +} + + + + + +# +# ######################################## +# #### Sleuth +# ######################################## +# +s2c <- s2c.list$not_trimmed +so <- sleuth_objects$not_trimmed +# +# +# sleuth.results <- c() +# +# # for(i in setdiff(s2c$Cond_short, 'C')) +# for(i in c('S', 'M', 'H', 'SM', 'SH', 'MH', 'SMH')) ##Nasser's Order +# { +# +# sleuthData <- droplevels(rbind(s2c[s2c$Cond_short == "C", ], s2c[s2c$Cond_short == i, ])) +# sleuthData$Cond_short <- relevel(factor(sleuthData$Cond_short), i) ### makes sure that the control / wt comes last +# +# so <- sleuth_prep(sleuthData, ~Cond_short) +# so <- sleuth_fit(so) +# so <- sleuth_fit(so, ~1, 'reduced') +# so <- sleuth_lrt(so, 'reduced', 'full') +# lrt_results <- sleuth_results(so, 'reduced:full', test_type = 'lrt') +# +# so <- sleuth_wt(so, 'Cond_shortC') +# wt_results <- sleuth_results(so, test = 'Cond_shortC', test_type = 'wt') +# +# ### the 'relative to direction' is weird for sleuth...! +# wt_results$b <- -wt_results$b +# +# results <- merge(lrt_results[, c('target_id', 'qval')], wt_results[, c('target_id', 'b', 'se_b')], by = 'target_id') +# # results <- wt_results[, c('target_id', 'qval','b', 'se_b')] +# +# sleuth.results <- rbind(sleuth.results, cbind(results, Cond_short = i)) +# } +# +# +# save(sleuth.results, means, sleuth.exprt, file = '01_Sleuth.RData') + + +means <- summarySE(expression.data, groupvars = c('Cond_short', 'target_id'), measurevar = 'tpm') +means$AGI <- do.call(rbind, strsplit(means$target_id, '.', fixed = T))[,1] + + + +for(LFC_cut in c(1, 1.5, 2)) +{ + load('01_Sleuth.RData') + + suppressWarnings(dir.create(paste0('03_ResultsWithLFC_CutOff_', LFC_cut))) + + sleuth.results[abs(sleuth.results$b) < LFC_cut & !is.na(sleuth.results$b), 'qval'] <- 1 + + q.values <- dcast(sleuth.results, target_id~Cond_short, value.var = 'qval') + colnames(q.values)[2:ncol(q.values)] <- paste('qvalue', colnames(q.values)[2:ncol(q.values)], sep = '_') + + lfc_b <- dcast(sleuth.results, target_id~Cond_short, value.var = 'b') + colnames(lfc_b)[2:ncol(lfc_b)] <- paste('b', colnames(lfc_b)[2:ncol(lfc_b)], sep = '_') + + lfc_se_b <- dcast(sleuth.results, target_id~Cond_short, value.var = 'se_b') + colnames(lfc_se_b)[2:ncol(lfc_se_b)] <- paste('se_b', colnames(lfc_se_b)[2:ncol(lfc_se_b)], sep = '_') + + + sleuth.exprt <- merge(q.values, lfc_b, 'target_id') + sleuth.exprt <- merge(sleuth.exprt, lfc_se_b, 'target_id') + + + expression.data <- kallisto_table(sleuth_objects$not_trimmed) + + + ######################################## + #### Sanity check top25 / bottom25 lfc + ######################################## + + + for(q_thresh in c(0.01, 0.05)) + { + + plotList <- list() + for(i in setdiff(s2c$Cond_short, 'C')) + { + + current.sig <- subset(sleuth.exprt, sleuth.exprt[,paste('qvalue', i, sep = '_')] <= q_thresh) + + current.top25 <- head(current.sig[order(current.sig[,paste('b', i,sep = '_')], decreasing = T),], 25) + if(nrow(current.top25) > 0) + { + plot.data <- merge(subset(means, Cond_short %in% c('C', i)), current.top25, 'target_id') + plot.data <- merge(plot.data, araport, 'AGI', all.x = T) + + plot.data$label <- paste(plot.data$target_id, plot.data$NAME, sep = '\n') + + plotList[[i]][['top25']] <- ggplot(plot.data, aes(x = Cond_short, y = tpm)) + + geom_bar(stat = 'identity') + + geom_errorbar(aes(ymin = (tpm - sd), ymax = (tpm + sd)), width = 0.1) + + theme_dominik + + facet_wrap('label', scales = 'free') + + ggtitle(paste('Top25 C vs.', i), subtitle = paste('q.value <', q_thresh)) + } + + current.bottom25 <- head(current.sig[order(current.sig[,paste('b', i,sep = '_')], decreasing = F),], 25) + if(nrow(current.bottom25) > 0) + { + + plot.data <- merge(subset(means, Cond_short %in% c('C', i)), current.bottom25, 'target_id') + plot.data <- merge(plot.data, araport, 'AGI', all.x = T) + + plot.data$label <- paste(plot.data$target_id, plot.data$NAME, sep = '\n') + + plotList[[i]][['bottom25']] <- ggplot(plot.data, aes(x = Cond_short, y = tpm)) + + geom_bar(stat = 'identity') + + geom_errorbar(aes(ymin = (tpm - sd), ymax = (tpm + sd)), width = 0.1) + + theme_dominik + + facet_wrap('label', scales = 'free') + + ggtitle(paste('Top25 C vs.', i), subtitle = paste('q.value <', q_thresh)) + + } + } + + plotList <- unlist(plotList, recursive = F) + plotList <- lapply(plotList, ggplotGrob) + plotList <- adaptGGplotSize_depth1(plotList) + + pdf(paste0('03_ResultsWithLFC_CutOff_', LFC_cut, '/', nice.date, '_03_q', q_thresh, 'SanityCheck.pdf'), width = 15, height = 15) + for(i in names(plotList)) + { + grid.arrange(plotList[[i]]) + } + dev.off() + + } + + + ######################################## + #### Venn diagrams and UpsetR type venn diagram for all treatments + ######################################## + + for(q_thresh in c(0.01, 0.05)) + { + + ######################################## + #### UpsetR type Venn diagram + ######################################## + + pdf(paste0('03_ResultsWithLFC_CutOff_', LFC_cut, '/', nice.date, '_04_q', q_thresh, 'UpsetR_Venn.pdf'), width = 10, height = 10) + #### 1. differentiating down and up + + sleuth.upset <- subset(sleuth.results, qval <= q_thresh) + sleuth.upset[sleuth.upset$b < 0, 'direction'] <- 'down' + sleuth.upset[sleuth.upset$b > 0, 'direction'] <- 'up' + sleuth.upset$set <- paste(sleuth.upset$Cond_short, sleuth.upset$direction) + + sleuth.upset$AGI <- do.call(rbind, strsplit(sleuth.upset$target_id, split = '.', fixed = T))[,1] + + set.order <- c(grep('down', sort(unique(sleuth.upset$set)), value = T), grep('up', sort(unique(sleuth.upset$set)), value = T)) + sleuth.venn <- dcast(sleuth.upset, target_id~set, value.var = 'set', fun.aggregate = length) + upset(sleuth.venn, sets = set.order, keep.order = F) + + + #### 2. Generally de-regulated + + sleuth.upset2 <- subset(sleuth.results, qval <= q_thresh) + sleuth.upset2$set <- sleuth.upset2$Cond_short + sleuth.venn2 <- dcast(sleuth.upset2, target_id~set, value.var = 'set', fun.aggregate = length) + upset(sleuth.venn2) + + count(sleuth.upset$set) + count(sleuth.upset2$set) + + ######################################## + #### Report some numbers + ######################################## + + + grid.arrange(top = paste('q <=', q_thresh), tableGrob(count(sleuth.upset2$set), rows = NULL), tableGrob(count(sleuth.upset$set), rows = NULL), ncol = 2) + + + + ######################################## + #### Standard venn diagram + ######################################## + + venn.plots <- list() + ### 1. Generally de-regulated + venn.data <- list() + for(i in setdiff(s2c$Cond_short, 'C')) + { + venn.data[[i]] <- subset(sleuth.exprt, sleuth.exprt[,paste('qvalue', i, sep = '_')] <= q_thresh, target_id, drop = T) + } + + for(j in c('H', 'M', 'S')) + { + # grid.newpage() + venn.plots[['de']][[j]] <- gTree(children = + venn.diagram( + x = venn.data[grep(j, names(venn.data))], + sub = paste(paste('q <=', q_thresh)), + main = 'de-regulated', + inverted = F, + cat.fontface = 'bold', + filename = NULL, + cat.default.pos = "text", + cat.pos = 0 + ) + ) + } + + + ### 2. Up-regulated + venn.data <- list() + for(i in setdiff(s2c$Cond_short, 'C')) + { + venn.data[[i]] <- subset(sleuth.exprt, sleuth.exprt[,paste('qvalue', i, sep = '_')] <= q_thresh & sleuth.exprt[,paste('b', i, sep = '_')] > 0, target_id, drop = T) + } + + for(j in c('H', 'M', 'S')) + { + # grid.newpage() + venn.plots[['up']][[j]] <- gTree(children = + venn.diagram( + x = venn.data[grep(j, names(venn.data))], + sub = paste(paste('q <=', q_thresh)), + main = 'Up-regulated', + inverted = F, + cat.fontface = 'bold', + filename = NULL, + cat.default.pos = "text", + cat.pos = 0 + ) + ) + } + + ### 3. Down-regulated + venn.data <- list() + for(i in setdiff(s2c$Cond_short, 'C')) + { + venn.data[[i]] <- subset(sleuth.exprt, sleuth.exprt[,paste('qvalue', i, sep = '_')] <= q_thresh & sleuth.exprt[,paste('b', i, sep = '_')] < 0, target_id, drop = T) + } + + for(j in c('H', 'M', 'S')) + { + # grid.newpage() + venn.plots[['down']][[j]] <- gTree(children = + venn.diagram( + x = venn.data[grep(j, names(venn.data))], + sub = paste(paste('q <=', q_thresh)), + main = 'Down-regulated', + inverted = F, + cat.fontface = 'bold', + filename = NULL, + cat.default.pos = "text", + cat.pos = 0 + ) + ) + } + + + + do.call(grid.arrange, unlist(venn.plots, recursive = F)) + + + dev.off() + + } + + + + ######################################## + #### Venn diagrams and UpsetR type venn diagram for single treatments + ######################################## + + for(q_thresh in c(0.01, 0.05)) + { + + ######################################## + #### UpsetR type Venn diagram + ######################################## + pdf(paste0('03_ResultsWithLFC_CutOff_', LFC_cut, '/', nice.date, '_04_q', q_thresh, 'UpsetR_Venn_singleTreatments.pdf'), width = 10, height = 10) + + + sleuth.upset <- subset(sleuth.results, qval <= q_thresh & Cond_short %in% c('H', 'M', 'S')) + sleuth.upset[sleuth.upset$b < 0, 'direction'] <- 'down' + sleuth.upset[sleuth.upset$b > 0, 'direction'] <- 'up' + sleuth.upset$set <- paste(sleuth.upset$Cond_short, sleuth.upset$direction) + + sleuth.upset2 <- subset(sleuth.results, qval <= q_thresh & Cond_short %in% c('H', 'M', 'S')) + sleuth.upset2$set <- sleuth.upset2$Cond_short + + sleuth.upset$AGI <- do.call(rbind, strsplit(sleuth.upset$target_id, split = '.', fixed = T))[,1] + + + ######################################## + #### Report some numbers + ######################################## + + + grid.arrange(top = paste('q <=', q_thresh), tableGrob(count(sleuth.upset2$set), rows = NULL), tableGrob(count(sleuth.upset$set), rows = NULL), ncol = 2) + + + ######################################## + #### Standard venn diagram + ######################################## + + venn.plots <- list() + ### 1. Generally de-regulated + venn.data <- list() + for(i in c('H', 'M', 'S')) + { + venn.data[[i]] <- subset(sleuth.exprt, sleuth.exprt[,paste('qvalue', i, sep = '_')] <= q_thresh, target_id, drop = T) + } + + venn.plots[['de']] <- gTree(children = + venn.diagram( + x = venn.data, + sub = paste(paste('q <=', q_thresh)), + main = 'de-regulated', + inverted = F, + cat.fontface = 'bold', + filename = NULL, + cat.default.pos = "text", + cat.pos = 0 + ) + ) + + + ### 2. Up-regulated + venn.data <- list() + for(i in c('H', 'M', 'S')) + { + venn.data[[i]] <- subset(sleuth.exprt, sleuth.exprt[,paste('qvalue', i, sep = '_')] <= q_thresh & sleuth.exprt[,paste('b', i, sep = '_')] > 0, target_id, drop = T) + } + + venn.plots[['up']] <- gTree(children = + venn.diagram( + x = venn.data, + sub = paste(paste('q <=', q_thresh)), + main = 'Up-regulated', + inverted = F, + cat.fontface = 'bold', + filename = NULL, + cat.default.pos = "text", + cat.pos = 0 + ) + ) + + ### 3. Down-regulated + venn.data <- list() + for(i in c('H', 'M', 'S')) + { + venn.data[[i]] <- subset(sleuth.exprt, sleuth.exprt[,paste('qvalue', i, sep = '_')] <= q_thresh & sleuth.exprt[,paste('b', i, sep = '_')] < 0, target_id, drop = T) + } + + venn.plots[['down']] <- gTree(children = + venn.diagram( + x = venn.data, + sub = paste(paste('q <=', q_thresh)), + main = 'Down-regulated', + inverted = F, + cat.fontface = 'bold', + filename = NULL, + cat.default.pos = "text", + cat.pos = 0 + ) + ) + + + + + do.call(grid.arrange, c(venn.plots, ncol = 3, nrow = 3)) + + + dev.off() + + } + + + + + + ############################################################################################################ + ### GO-term enrichment using fisher's exact + ############################################################################################################ + + alphaEnrich <- 0.05 + pAdjust <- "BH" + GO.type <- 'P' + + q_thresh <- 0.05 + + + fish.list <- list() + for(i in unique(sleuth.results$Cond_short)) + { + print(i) + + for(HighLow in c("higher", "lower")) + { + print(HighLow) + + current.sleuth <- subset(sleuth.results, Cond_short == i,select = c(target_id, qval, b)) + + current.sleuth[is.na(current.sleuth$qval), "qval"] <- 1 + current.sleuth[is.na(current.sleuth$b), "b"] <- 0 + + current.sleuth$HighLow <- as.character(cut(current.sleuth$b, breaks = c(min(current.sleuth$b), 0, max(current.sleuth$b)), labels = c("lower", "higher"), include.lowest = T)) + current.sleuth[current.sleuth$qval > q_thresh, "HighLow"] <- "Neither" + + ### Pick only genes, not transcripts + current.sleuth$AGI <- do.call(rbind, strsplit(split = ".", as.character(current.sleuth$target_id), fixed = T))[,1] + + GO.subset <- droplevels(merge(GOAnno[[GO.type]], current.sleuth, "AGI")) + + + fishers.results <- data.frame(matrix(NA, 0,3)) + for(GO.ID in unique(GO.subset[,"GO.ID"])) + { + + ############################################################################################################## + ### Fisher's exact test based on all (GO tagged) genes + ############################################################################################################## + gene.count <- length(unique(GO.subset[GO.subset$GO.ID == GO.ID,"AGI"])) + + InIn <- sum(GO.subset$HighLow == HighLow & GO.subset[,"GO.ID"] == GO.ID) + InOut <- sum(GO.subset$HighLow == HighLow & GO.subset[,"GO.ID"] != GO.ID) + OutIn <- sum((GO.subset$HighLow != HighLow & GO.subset[,"GO.ID"] == GO.ID)) + OutOut <- sum((GO.subset$HighLow != HighLow & GO.subset[,"GO.ID"] != GO.ID)) + + p <- fisher.test(matrix(c(InIn, InOut, OutIn, OutOut), nrow = 2))$p.value + + if(InOut != 0 & InIn != 0){if(InIn/InOut > OutIn/OutOut){represented = "over"}else{represented = "under"}} + + fishers.results <- rbind.data.frame(fishers.results, cbind.data.frame(GO.ID, p, represented, gene.count, InIn, InOut, OutIn, OutOut)) + + } + + fishers.results[, pAdjust] <- p.adjust(fishers.results$p, method = pAdjust) + # ### Remove non-significant + # fishers.results <- fishers.results[fishers.results[,pAdjust] <= alphaEnrich,] + # ### Remove under-represented + fishers.results <- fishers.results[fishers.results$represented == "over",] + ### Add Term + fishers.results <- merge(fishers.results, subset(GO.IDtoGO.Term, Aspect == GO.type), "GO.ID") + + + fish.list[[GO.type]][[i]][[HighLow]] <- fishers.results + + head(print(fishers.results)) + + print("Done") + } + } + + + + + fish.extraxt <- c() + for(i in names(fish.list$P)) + { + test <- do.call(rbind, fish.list$P[[i]]) + test <- subset(test, BH <= alphaEnrich) + if(nrow(test) > 0 ) + { + test$direction <- do.call(rbind, strsplit(split = ".", row.names(test), fixed = T))[,1] + test$Cond_short <- i + + fish.extraxt <- rbind(fish.extraxt, test) + } + } + + + fish.summary <- dcast(fish.extraxt, GO.ID+GO.Term+gene.count~Cond_short+direction, value.var = 'BH') + write.xlsx(fish.summary, file = paste0('03_ResultsWithLFC_CutOff_', LFC_cut, '/', nice.date, '_05_q', q_thresh, 'GO_enrichment.xlsx')) + + + + + ############################################################################################################ + ### Mapman enrichment using fisher's exact + ############################################################################################################ + + alphaEnrich <- 0.05 + pAdjust <- "BH" + + q_thresh <- 0.05 + + + + fish.list <- list() + for(i in unique(sleuth.results$Cond_short)) + { + print(i) + + for(HighLow in c("higher", "lower")) + { + print(HighLow) + + current.sleuth <- subset(sleuth.results, Cond_short == i,select = c(target_id, qval, b)) + + current.sleuth[is.na(current.sleuth$qval), "qval"] <- 1 + current.sleuth[is.na(current.sleuth$b), "b"] <- 0 + + current.sleuth$HighLow <- as.character(cut(current.sleuth$b, breaks = c(min(current.sleuth$b), 0, max(current.sleuth$b)), labels = c("lower", "higher"), include.lowest = T)) + current.sleuth[current.sleuth$qval > q_thresh, "HighLow"] <- "Neither" + + ### Pick only genes, not transcripts + current.sleuth$AGI <- do.call(rbind, strsplit(split = ".", as.character(current.sleuth$target_id), fixed = T))[,1] + + # mapman.subset <- droplevels(merge(mapman.X4, current.sleuth, "AGI")) + mapman.subset <- droplevels(merge(mapman, current.sleuth, "AGI")) + + + fishers.results <- data.frame(matrix(NA, 0,3)) + for(mapman.ID in unique(mapman.subset$BIN1)) + { + + ############################################################################################################## + ### Fisher's exact test based on all (mapman tagged) genes + ############################################################################################################## + gene.count <- length(unique(mapman.subset[mapman.subset$BIN1 == mapman.ID,"AGI"])) + + InIn <- sum(mapman.subset$HighLow == HighLow & mapman.subset$BIN1 == mapman.ID) + InOut <- sum(mapman.subset$HighLow == HighLow & mapman.subset$BIN1 != mapman.ID) + OutIn <- sum((mapman.subset$HighLow != HighLow & mapman.subset$BIN1 == mapman.ID)) + OutOut <- sum((mapman.subset$HighLow != HighLow & mapman.subset$BIN1 != mapman.ID)) + + p <- fisher.test(matrix(c(InIn, InOut, OutIn, OutOut), nrow = 2))$p.value + + if(InIn/InOut > OutIn/OutOut){represented = "over"}else{represented = "under"} + + fishers.results <- rbind.data.frame(fishers.results, cbind.data.frame(mapman.ID, p, represented, gene.count, InIn, InOut, OutIn, OutOut)) + + } + + fishers.results[, pAdjust] <- p.adjust(fishers.results$p, method = pAdjust) + # ### Remove non-significant + # fishers.results <- fishers.results[fishers.results[,pAdjust] <= alphaEnrich,] + # ### Remove under-represented + fishers.results <- fishers.results[fishers.results$represented == "over",] + + fish.list[[i]][[HighLow]] <- fishers.results + + head(print(fishers.results)) + + print("Done") + } + } + + + + + fish.extraxt <- c() + for(i in names(fish.list)) + { + test <- do.call(rbind, fish.list[[i]]) + test <- subset(test, BH <= alphaEnrich) + if(nrow(test) > 0 ) + { + test$direction <- do.call(rbind, strsplit(split = ".", row.names(test), fixed = T))[,1] + test$Cond_short <- i + + fish.extraxt <- rbind(fish.extraxt, test) + } + } + + + fish.summary <- dcast(fish.extraxt, mapman.ID+gene.count~Cond_short+direction, value.var = 'BH') + write.xlsx(fish.summary, file = paste0('03_ResultsWithLFC_CutOff_', LFC_cut, '/', nice.date, '_05_q', q_thresh, 'mapman_enrichment.xlsx')) + + ############################################################################################################ + ### TF family enrichment using fisher's exact + ############################################################################################################ + + # + # alphaEnrich <- 0.05 + # pAdjust <- "BH" + # + # q_thresh <- 0.05 + # + # + # + # fish.list <- list() + # for(i in unique(sleuth.results$Cond_short)) + # { + # print(i) + # + # for(HighLow in c("higher", "lower")) + # { + # print(HighLow) + # + # current.sleuth <- subset(sleuth.results, Cond_short == i,select = c(target_id, qval, b)) + # + # current.sleuth[is.na(current.sleuth$qval), "qval"] <- 1 + # current.sleuth[is.na(current.sleuth$b), "b"] <- 0 + # + # current.sleuth$HighLow <- as.character(cut(current.sleuth$b, breaks = c(min(current.sleuth$b), 0, max(current.sleuth$b)), labels = c("lower", "higher"), include.lowest = T)) + # current.sleuth[current.sleuth$qval > q_thresh, "HighLow"] <- "Neither" + # + # ### Pick only genes, not transcripts + # current.sleuth$AGI <- do.call(rbind, strsplit(split = ".", as.character(current.sleuth$target_id), fixed = T))[,1] + # + # TF.subset <- droplevels(merge(PlnTFDB, current.sleuth, "AGI")) + # + # + # fishers.results <- data.frame(matrix(NA, 0,3)) + # for(TF.ID in unique(TF.subset$TF_Family)) + # { + # + # ############################################################################################################## + # ### Fisher's exact test based on all (TF tagged) genes + # ############################################################################################################## + # gene.count <- length(unique(TF.subset[TF.subset$TF_Family == TF.ID,"AGI"])) + # + # InIn <- sum(TF.subset$HighLow == HighLow & TF.subset$TF_Family == TF.ID) + # InOut <- sum(TF.subset$HighLow == HighLow & TF.subset$TF_Family != TF.ID) + # OutIn <- sum((TF.subset$HighLow != HighLow & TF.subset$TF_Family == TF.ID)) + # OutOut <- sum((TF.subset$HighLow != HighLow & TF.subset$TF_Family != TF.ID)) + # + # p <- fisher.test(matrix(c(InIn, InOut, OutIn, OutOut), nrow = 2))$p.value + # + # if(InIn/InOut > OutIn/OutOut){represented = "over"}else{represented = "under"} + # + # fishers.results <- rbind.data.frame(fishers.results, cbind.data.frame(TF.ID, p, represented, gene.count, InIn, InOut, OutIn, OutOut)) + # + # } + # + # fishers.results[, pAdjust] <- p.adjust(fishers.results$p, method = pAdjust) + # # ### Remove non-significant + # # fishers.results <- fishers.results[fishers.results[,pAdjust] <= alphaEnrich,] + # # ### Remove under-represented + # fishers.results <- fishers.results[fishers.results$represented == "over",] + # + # fish.list[[i]][[HighLow]] <- fishers.results + # + # head(print(fishers.results)) + # + # print("Done") + # } + # } + # + # + # + # + # fish.extraxt <- c() + # for(i in names(fish.list)) + # { + # test <- do.call(rbind, fish.list[[i]]) + # test <- subset(test, BH <= alphaEnrich) + # if(nrow(test) > 0 ) + # { + # test$direction <- do.call(rbind, strsplit(split = ".", row.names(test), fixed = T))[,1] + # test$Cond_short <- i + # + # fish.extraxt <- rbind(fish.extraxt, test) + # } + # } + # + # + # + # fish.summary <- dcast(fish.extraxt, TF.ID+gene.count~Cond_short+direction, value.var = 'BH') + # write.xlsx(fish.summary, file = paste(nice.date, '05_q', q_thresh, 'TF_enrichment.xlsx', sep = '_')) + # + # + + + + ############################################################################################################ + ### Extracting some numbers for how many genes are up / down in the individual mapman bins + ############################################################################################################ + q_thresh <- 0.05 + + sleuth.upset <- subset(sleuth.results, qval <= q_thresh) + sleuth.upset[sleuth.upset$b < 0, 'direction'] <- 'down' + sleuth.upset[sleuth.upset$b > 0, 'direction'] <- 'up' + sleuth.upset$set <- paste(sleuth.upset$Cond_short, sleuth.upset$direction) + + sleuth.upset$AGI <- do.call(rbind, strsplit(sleuth.upset$target_id, split = '.', fixed = T))[,1] + + + + + + mapman_summary <- merge(sleuth.upset, mapman, 'AGI') + + mapman_summar2y <- list() + mapman_summar2y[['BIN1']] <- dcast(mapman_summary, BIN1~set) + mapman_summar2y[['BIN1.BIN2']] <- dcast(mapman_summary, BIN1+BIN2~set) + mapman_summar2y[['BIN1.BIN2.BIN3']] <- dcast(mapman_summary, BIN1+BIN2+BIN3~set) + mapman_summar2y[['BIN1.BIN2.BIN3.BIN4']] <- dcast(mapman_summary, BIN1+BIN2+BIN3+BIN4~set) + + write.xlsx(mapman_summar2y, file = paste0('03_ResultsWithLFC_CutOff_', LFC_cut, '/', nice.date, '_06_MapmanSummary.xlsx')) + + + + ############################################################################################################ + ### Extracting some numbers for how many genes are up / down across treatments + ############################################################################################################ + + diff.dev <- list() + + diff.dev[['p1']] <- ggplot(sleuth.upset, aes(x = Cond_short, fill = direction)) + + geom_bar(position = position_dodge()) + + theme_dominik + + scale_fill_manual(values = c(down = "#1f78b4", up = "#e41a1c")) + + geom_text(stat='count', aes(label=..count.., group = direction), vjust=-1, position = position_dodge(width = 1)) + + labs(y = 'Gene count', x = '') + theme(legend.position = c(0.1,0.8)) + ggtitle('Overview') + + + for(x in unique(sleuth.upset$set)) + { + + current.sub <- subset(sleuth.upset, AGI %in% subset(sleuth.upset, set == x, AGI, drop = T) & set != x) + + diff.dev[[x]] <- ggplot(current.sub, aes(x = Cond_short, fill = direction)) + + geom_bar(position = position_dodge()) + + theme_dominik + + scale_fill_manual(values = c(down = "#1f78b4", up = "#e41a1c")) + + geom_text(stat='count', aes(label=..count.., group = direction), vjust=-1, position = position_dodge(width = 1)) + + labs(y = 'Gene count', x = '') + + ggtitle(x) + theme(legend.position = 'none') + + + } + + + pdf(paste0('03_ResultsWithLFC_CutOff_', LFC_cut, '/', nice.date, '_DEG_counts.pdf'), width = 30, height = 30) + do.call(grid.arrange, diff.dev) + dev.off() + + + ############################################################################################################ + ### Extracting some numbers for how many genes are up / down across treatments + ############################################################################################################ + + set.order <- rev(c(grep('down', sort(unique(sleuth.upset$set)), value = T), grep('up', sort(unique(sleuth.upset$set)), value = T))) + sleuth.venn <- dcast(sleuth.upset, target_id~set, value.var = 'set', fun.aggregate = length) + + + meta <- unique(sleuth.upset[, c('set', 'direction', 'Cond_short')]) + + + pdf(paste0('03_ResultsWithLFC_CutOff_', LFC_cut, '/', nice.date, '_GeneSets_AllCombinations.pdf'), width = 60, height = 6) + upset(sleuth.venn, sets = set.order, keep.order = T, nintersects = NA, sets.bar.color=c(rep("#1f78b4", 7), rep("#e41a1c", 7)), set.metadata = list(data = meta, plots = list(list(type = "matrix_rows", column = "Cond_short", colors = color.values[names(color.values) != 'C'], alpha = 0.5)))) + # upset(sleuth.venn, sets = set.order, keep.order = T, nintersects = NA, set.metadata = list(data = meta, plots = list(list(type = "matrix_rows", column = "Cond_short", colors = color.values[names(color.values) != 'C'], alpha = 0.5)))) + dev.off() + + # + # a <- sleuth.venn + # a$combinations <- apply(a, 1, function(x){paste(colnames(a)[which(x == 1)], collapse = '|')}) + # gene.set.count <- count(a$combinations) + # + + + + + gene.set.list <- data.frame(AGI = unique(sleuth.upset$AGI)) + gene.set.plots <- list() + for(i in c('H', 'S', 'M')) + { + if(i == 'H'){label <- 'Heat'} + if(i == 'S'){label <- 'Salt'} + if(i == 'M'){label <- 'Mannitol'} + + upset.test <- subset(sleuth.upset, grepl(i, Cond_short)) + + set.order <- rev(c(grep('down', sort(unique(upset.test$set)), value = T), grep('up', sort(unique(upset.test$set)), value = T))) + sleuth.venn <- dcast(upset.test, target_id~set, value.var = 'set', fun.aggregate = length) + # upset(sleuth.venn, sets = set.order, keep.order = T, nintersects = NA) + + upset(sleuth.venn, sets = set.order, sets.bar.color=c(rep("#1f78b4", 4), rep("#e41a1c", 4)), keep.order = T, nintersects = NA, set.metadata = list(data = meta, plots = list(list(type = "matrix_rows", column = "Cond_short", colors = color.values[grepl(i, names(color.values))], alpha = 0.5)))) + + + grid.edit('arrange', name = 'bla') + vp <- grid.grab() + + gene.set.plots[[label]] <- arrangeGrob(vp, top = label) + + a <- sleuth.venn + + a$combinations <- apply(a, 1, function(x){paste(colnames(a)[which(x == 1)], collapse = '|')}) + + gene.set.count <- count(a$combinations) + + a <- merge(a, gene.set.count, by.x = 'combinations', by.y = 'x') + a[,label] <- paste(a$combinations, a$freq) + + a$AGI <- do.call(rbind, strsplit(split = ".", as.character(a$target_id), fixed = T))[,1] + b <- a[, c('AGI', label)] + + gene.set.list <- merge(gene.set.list, b, all.x = T, by = 'AGI') + + } + + + pdf(paste0('03_ResultsWithLFC_CutOff_', LFC_cut, '/', nice.date, '_GeneSets_byTreatment.pdf'), width = 16, height = 18) + do.call(grid.arrange, c(gene.set.plots, ncol = 1)) + dev.off() + + +} + + + +# +# +# +# ############################################################################################################ +# ### Gene categories / LFC tables +# ############################################################################################################ +# +# # +# # anno.cols <- colnames(supp)[1:26] +# # +# # lfc.table <- melt(supp, id.vars = anno.cols, measure.vars = grep('^b_', colnames(supp)), value.name = 'lfc') +# # q.table <- melt(supp, id.vars = 'AGI', measure.vars = grep('qvalue_', colnames(supp)), value.name = 'q') +# # head(q.table) +# # +# # lfc.table <- cbind.data.frame(lfc.table, q.table[,2:3]) +# # lfc.table$q.star <- cut(lfc.table$q, breaks = c(0, 0.001, 0.01, 0.05, 1), labels = c('***', '**', '*', ''), include.lowest = T) +# # +# # grep('cell wall', supp$MAPMAN.BIN, value = T) +# +# anno.cols <- colnames(supp)[1:26] +# +# lfc.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('^b_', colnames(sleuth.exprt)), value.name = 'lfc') +# q.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('qvalue_', colnames(sleuth.exprt)), value.name = 'q') +# head(q.table) +# +# lfc.table <- cbind.data.frame(lfc.table, q.table[,2:3]) +# lfc.table$q.star <- cut(lfc.table$q, breaks = c(0, 0.001, 0.01, 0.05, 1), labels = c('***', '**', '*', ' '), include.lowest = T) +# +# +# sig.at.some <- unique(subset(q.table, q <= 0.05, AGI, drop = T)) +# +# +# count(mapman.X4[, c('BIN1', 'BIN2')]) +# +# +# cwllwall <- merge(subset(mapman.X4, BIN1 == 'Cell wall' & BIN2 == 'cellulose'), lfc.table, 'AGI') +# cwllwall$label <- gsub('b_' , '', cwllwall$variable) +# cwllwall$label <- factor(cwllwall$label, unique(cwllwall[order(cwllwall$variable), 'label'])) +# +# ggplot(cwllwall, aes(x = label, y = AGI, label = q.star, fill = lfc)) + +# # geom_tile() + +# geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +# geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +# geom_text(size = 3, color = "black", vjust = 0.7) + +# scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# # limits = limits.lfc.sleuth.GT, +# midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +# theme_dominik + +# theme(aspect.ratio = 5) +# +# +# View(GOAnno$P) +# +# unique(grep('pathogen', GOAnno$P$GO.Term, ignore.case = T, value = T)) +# +# unique(grep('heat', araport$NAME, ignore.case = T, value = T)) +# +# +# +# +# +# candidates <- subset(lfc.table, AGI %in% subset(araport, grepl("heat shock protein", NAME, ignore.case = T), AGI, drop = T)) +# category <- "HSPs" +# +# +# View(subset(araport, grepl("late embryogenesis", NAME, ignore.case = T))) +# View(subset(araport, grepl("LEA", name, ignore.case = T))) +# +# View(subset(mapman, grepl("embryo", MAPMAN.BIN, ignore.case = T))) +# +# View(subset(mapman.X4, grepl("embryo", MAPMAN.BIN, ignore.case = T))) +# +# +# +# unique(grep('HSP', mapman.X4$MAPMAN.BIN, ignore.case = T, value = T)) +# +# +# +# candidates <- subset(lfc.table, AGI %in% subset(mapman.X4, grepl("heat-shock", MAPMAN.BIN, ignore.case = T), AGI, drop = T)) +# category <- "HSPs" +# +# View(subset(araport, grepl("late embryogenesis", NAME, ignore.case = T))) +# +# +# candidates <- subset(lfc.table, AGI %in% subset(araport, grepl("late embryogenesis", NAME, ignore.case = T), AGI, drop = T)) +# category <- "LEAs" +# +# +# candidates <- merge(subset(GOAnno$P, GO.Term =='response to heat'), lfc.table, 'AGI') +# category <- "response to heat" +# +# candidates <- merge(subset(GOAnno$P, GO.Term =='pathogenesis'), lfc.table, 'AGI') +# category <- "pathogen" +# +# # +# # +# # +# # for(i in unique(cluster.results$`14`$km.new.order)) +# # { +# # +# # current.cluster <- subset(cluster.results$`14`, km.new.order == i) +# # +# # tfs.cluster <- merge(current.cluster, PlnTFDB, 'AGI') +# # +# # +# # +# # } +# +# +# +# candidates <- merge(candidates, araport, 'AGI') +# candidates$label <- gsub('b_' , '', candidates$variable) +# candidates$label <- factor(candidates$label, unique(candidates[order(candidates$variable), 'label'])) +# # candidates$gene.label <- paste0(candidates$name, ' (', candidates$AGI, ')') +# candidates$gene.label <- paste0(candidates$NAME, ' (', candidates$AGI, ')') +# +# +# pdf(paste0(category, '.pdf'), width = nrow(candidates)/50, height = nrow(candidates)/50 * 2) +# +# ggplot(candidates, aes(x = label, y = gene.label, label = q.star, fill = lfc)) + +# geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +# geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +# geom_text(size = 3, color = "black", vjust = 0.7) + +# theme_dominik + +# scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# # limits = limits.lfc.sleuth.GT, +# midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / Ctrl)", sep = ""))) + +# theme(legend.position = 'right', legend.direction = 'vertical', legend.title = element_text(), axis.ticks = element_blank(), panel.background = element_blank()) + +# theme(aspect.ratio = nrow(candidates)/50) + +# labs(x = '', y = '') + +# scale_x_discrete(position = 'top') + +# ggtitle(category) +# +# dev.off() +# +# +# +# +# +# load('02_kmeans_clustering/190520_02_FishList.RData') +# +# View(fish.list$mapman$`14`) +# View(fish.list$go$`14`) + + + diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/01_Kallisto.RData b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/01_Kallisto.RData new file mode 100644 index 0000000000000000000000000000000000000000..0673a90dbec915f62d28b495256fa905a01ec7be Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/01_Kallisto.RData differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/01_Sleuth.RData b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/01_Sleuth.RData new file mode 100644 index 0000000000000000000000000000000000000000..eacfe0d76191502d97127dd357fd402cb063e0af Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/01_Sleuth.RData differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_All.png b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_All.png new file mode 100644 index 0000000000000000000000000000000000000000..90dd03e0b26e0d7a4607269a7055559cd30f504f Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_All.png differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_AveragedAll.png b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_AveragedAll.png new file mode 100644 index 0000000000000000000000000000000000000000..1a2c0dd5ec63922b4e11102eaa62c43649042652 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_AveragedAll.png differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_AveragedHeat.png b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_AveragedHeat.png new file mode 100644 index 0000000000000000000000000000000000000000..ba1b3d3138ff832b7a78d05d1bfff9c0b6f4a5b6 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_AveragedHeat.png differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_AveragedNoHeat.png b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_AveragedNoHeat.png new file mode 100644 index 0000000000000000000000000000000000000000..c1f0eb3985510e89ccf31832960f670852a0ef15 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_AveragedNoHeat.png differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_Heat.png b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_Heat.png new file mode 100644 index 0000000000000000000000000000000000000000..1937291d029381135e7489304824deffb3d8afed Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_Heat.png differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_NoHeat.png b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_NoHeat.png new file mode 100644 index 0000000000000000000000000000000000000000..8354e0eebb39ec5a93231d6fdf66ab0f000e0391 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/02_PCA_results/19073102_PCA_NoHeat.png differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_03_q0.01SanityCheck.pdf b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_03_q0.01SanityCheck.pdf new file mode 100644 index 0000000000000000000000000000000000000000..04d669f8cd986c43247964d12f7844f08b9c4629 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_03_q0.01SanityCheck.pdf differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_03_q0.05SanityCheck.pdf b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_03_q0.05SanityCheck.pdf new file mode 100644 index 0000000000000000000000000000000000000000..0bcbe5f47284868b848987da1662d17e9d4325a2 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_03_q0.05SanityCheck.pdf differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_04_q0.01UpsetR_Venn.pdf b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_04_q0.01UpsetR_Venn.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2d7546e414bc37b8cc497ae0519aa3736a2bf795 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_04_q0.01UpsetR_Venn.pdf differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_04_q0.01UpsetR_Venn_singleTreatments.pdf b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_04_q0.01UpsetR_Venn_singleTreatments.pdf new file mode 100644 index 0000000000000000000000000000000000000000..508ee50741d30ef2c8f8de0bc6a298dbed84c492 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_04_q0.01UpsetR_Venn_singleTreatments.pdf differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_04_q0.05UpsetR_Venn.pdf b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_04_q0.05UpsetR_Venn.pdf new file mode 100644 index 0000000000000000000000000000000000000000..0c6e1db8b94bcb3273e4c4f3be42cce192f2b735 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_04_q0.05UpsetR_Venn.pdf differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_04_q0.05UpsetR_Venn_singleTreatments.pdf b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_04_q0.05UpsetR_Venn_singleTreatments.pdf new file mode 100644 index 0000000000000000000000000000000000000000..aecd8d8b821be015057e22585d8ca64d36e1e9e0 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_04_q0.05UpsetR_Venn_singleTreatments.pdf differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_05_q0.05GO_enrichment.xlsx b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_05_q0.05GO_enrichment.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..0fe2d2245ff8e6d597e9dc27bb64fc5da24a95bf Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_05_q0.05GO_enrichment.xlsx differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_05_q0.05mapman_enrichment.xlsx b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_05_q0.05mapman_enrichment.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..407efcb5440b679f6db46a4eb3473b5547eab042 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_05_q0.05mapman_enrichment.xlsx differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_06_MapmanSummary.xlsx b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_06_MapmanSummary.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..731f1c5c37241bc41b851b64ee49f29bc1cdbc3a Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_06_MapmanSummary.xlsx differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_DEG_counts.pdf b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_DEG_counts.pdf new file mode 100644 index 0000000000000000000000000000000000000000..54ab76aa614cae0467dd833141786dc6126f3b38 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_DEG_counts.pdf differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_GeneSets_AllCombinations.pdf b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_GeneSets_AllCombinations.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e782a2bb34e76db9e1ec066d14583ac8f031591d Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_GeneSets_AllCombinations.pdf differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_GeneSets_byTreatment.pdf b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_GeneSets_byTreatment.pdf new file mode 100644 index 0000000000000000000000000000000000000000..fc1ba719fb19ad15d8b7ec5ff85a345651d8e447 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/03_ResultsWithLFC_CutOff_1/190731_GeneSets_byTreatment.pdf differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/04_ClusterResultsWithLFC_CutOff_1/190731_CLusterIteration.pdf b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/04_ClusterResultsWithLFC_CutOff_1/190731_CLusterIteration.pdf new file mode 100644 index 0000000000000000000000000000000000000000..94859954c9634103dd8b00fc4b374ed81b9fc596 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/04_ClusterResultsWithLFC_CutOff_1/190731_CLusterIteration.pdf differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/04_ClusterResultsWithLFC_CutOff_1/190731_ClusterResults.RData b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/04_ClusterResultsWithLFC_CutOff_1/190731_ClusterResults.RData new file mode 100644 index 0000000000000000000000000000000000000000..1e3fbe3ff407ef1171effbe23db97e4f2d49f9c8 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/04_ClusterResultsWithLFC_CutOff_1/190731_ClusterResults.RData differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/04_kmeans.R b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/04_kmeans.R new file mode 100644 index 0000000000000000000000000000000000000000..22de0db6f2798a33a1279164352c2ac9b089bc92 --- /dev/null +++ b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/04_kmeans.R @@ -0,0 +1,369 @@ + + +required.packages <- c("Rmisc", "plyr", "ggplot2", "reshape2", "gridExtra", 'openxlsx', 'sleuth', 'amap', 'ggdendro', 'ellipse', 'UpSetR', 'VennDiagram') + +for(package in required.packages) +{ + print(package) + ## Check if package is installed. If not, install + if(!package %in% row.names(installed.packages())){install.packages(package, repos ="https://cran.uni-muenster.de/")} + # ## Check if package is up to date. If not, update + # update.packages(package, repos = "https://cran.uni-muenster.de/") + ## Load package + library(package, character.only = T) +} + + +theme_dominik <- + theme(panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + theme(aspect.ratio = 1) + + theme(axis.ticks = element_line(color = "black"), axis.text = element_text(color = "black")) + + theme(strip.background = element_blank(), strip.text = element_text(size = 8)) + + theme(legend.position = "bottom", legend.direction = "horizontal", legend.title = element_blank()) + #, legend.key.size = unit(0.05, "npc") + theme(plot.margin=unit(c(1, 1, 1, 1),"cm")) + + theme(plot.title = element_text(hjust = 0, face = "bold", size = 8)) + + +######################################## +#### Annotation data +######################################## + +load('RNASeq_Annotation/Arabidopsis/02.0Araport11/02.0Araport.RData') +load("RNASeq_Annotation/Arabidopsis/02.4GO/GOAnno2018-02-06.RData") +load("RNASeq_Annotation/Arabidopsis/02.3Mapman/180924_MercatorMapman.RData") +load("RNASeq_Annotation/Arabidopsis/02.3Mapman/02.3Mapman.RData") +colnames(mapman)[2] <- 'MAPMAN.BIN' +load("RNASeq_Annotation/Arabidopsis/02.9PlnTFDB/02.9PlnTFDB.RData") +load("RNASeq_Annotation/Arabidopsis/02.12Lipids/02.12lipids.RData") + + +######################################## +#### Expression data +######################################## + +load('01_Kallisto.RData') + + +nice.date <- format(Sys.time(), "%y%m%d") +SampleInfo <- read.xlsx('../00_SampleInfo.xlsx') + +color.values <- brewer.pal(8, 'Dark2')[8:1] +names(color.values) <- unique(SampleInfo$Cond_short) + + +#### Pull count / tpm data +expression.data <- kallisto_table(sleuth_objects$not_trimmed) +expression.data$AGI <- do.call(rbind, strsplit(split = ".", as.character(expression.data$target_id), fixed = T))[,1] +expression.data <- merge(expression.data, SampleInfo, by.x = 'sample', by.y = 'IlluminaID') + +#### Calculate means and sd +means <- summarySE(expression.data, groupvars = c('Cond_short.y', 'AGI'), measurevar = 'tpm') +mean.tpm <- dcast(means, AGI~Cond_short.y, value.var = 'tpm') +mean.tpm <- mean.tpm[, c('AGI', 'C', 'S', 'M', 'H', 'SM', 'SH', 'MH', 'SMH')] +colnames(mean.tpm)[2:ncol(mean.tpm)] <- paste('mean.tpm', colnames(mean.tpm)[2:ncol(mean.tpm)], sep = '_') + +kmeans.elbows <- list() +for(LFC_cut in c(1, 1.5, 2, 3)) +{ + load('01_Sleuth.RData') + sleuth.results[abs(sleuth.results$b) < LFC_cut & !is.na(sleuth.results$b), 'qval'] <- 1 + + +######################################## +#### k-means of only genes with at least one sig diff +######################################## + +sleuth.exprt$AGI <- do.call(rbind, strsplit(split = ".", as.character(sleuth.exprt$target_id), fixed = T))[,1] +sleuth.results$AGI <- do.call(rbind, strsplit(split = ".", as.character(sleuth.results$target_id), fixed = T))[,1] + + +######################################## +#### Option b) k-means of LFCs (only genes with at least one sig diff) + +lfcs <- sleuth.exprt[,grep('^b_', colnames(sleuth.exprt))] +row.names(lfcs) <- sleuth.exprt$AGI + +sig.lfcs <- lfcs[row.names(lfcs) %in% unique(subset(sleuth.results, qval <= 0.05, AGI, drop = T)), ] +sig.lfcs[is.na(sig.lfcs)] <- 0 + +sub_to_cluster <- sig.lfcs +######################################## + +k_to_test <- 2:30 +km_k2up <- sapply(k_to_test,function(x) kmeans(sub_to_cluster, x, nstart=3)$tot.withinss) +to_plot <- data.frame(ss=km_k2up, k=k_to_test, run="real") + +n_negative_controls <- 7 +for (i in 1:n_negative_controls){ + scrambled_to_cluster <- t(apply(sub_to_cluster, 1, sample)) + km_k2tok24negative <- sapply(k_to_test, + function(x) kmeans(scrambled_to_cluster, x)$tot.withinss) + to_plot <- rbind(to_plot, data.frame(ss=km_k2tok24negative, k=k_to_test, run=paste0("neg", i))) + +} + +choose.k <- ggplot(data=to_plot, aes(x=k, y=ss, color=run, group=run)) + geom_line() + + geom_point(size=1, shape=20) + scale_color_manual(values=brewer.pal(length(unique(to_plot$run)), "Set1")) +choose.k + + +neg.minus.real <- sapply(paste0("neg", 1:n_negative_controls), function(x) to_plot$ss[to_plot$run==x]) +neg.minus.real <- to_plot[to_plot$run=="neg1", "ss"] - to_plot[to_plot$run=="real", "ss"] +neg.minus.real <- data.frame(neg.minus.real, k=k_to_test) + + +choose.k.by.diff <- ggplot(neg.minus.real, aes(y=neg.minus.real, x=k))+ geom_bar(stat="identity") + ggtitle(LFC_cut) + +kmeans.elbows[[LFC_cut]] <- choose.k.by.diff + +} + + + +for(LFC_cut in c(1, 1.5, 2)) +{ + load('01_Sleuth.RData') + + suppressWarnings(dir.create(paste0('04_ClusterResultsWithLFC_CutOff_', LFC_cut))) + + sleuth.results[abs(sleuth.results$b) < LFC_cut & !is.na(sleuth.results$b), 'qval'] <- 1 + + + ######################################## + #### k-means of only genes with at least one sig diff + ######################################## + + sleuth.exprt$AGI <- do.call(rbind, strsplit(split = ".", as.character(sleuth.exprt$target_id), fixed = T))[,1] + sleuth.results$AGI <- do.call(rbind, strsplit(split = ".", as.character(sleuth.results$target_id), fixed = T))[,1] + + + ######################################## + #### Option b) k-means of LFCs (only genes with at least one sig diff) + + lfcs <- sleuth.exprt[,grep('^b_', colnames(sleuth.exprt))] + row.names(lfcs) <- sleuth.exprt$AGI + + sig.lfcs <- lfcs[row.names(lfcs) %in% unique(subset(sleuth.results, qval <= 0.05, AGI, drop = T)), ] + sig.lfcs[is.na(sig.lfcs)] <- 0 + + sub_to_cluster <- sig.lfcs + + cluster.results <- c() + for(i in 3:10) + { + + km <- kmeans(sub_to_cluster, i, nstart=100) + + cluster.mship <- as.data.frame(km$cluster) + colnames(cluster.mship) <- 'km' + cluster.mship$AGI <- rownames(cluster.mship) + + plot.d <- as.data.frame(sub_to_cluster) + plot.d$AGI <- row.names(plot.d) + + scaled.tpm <- melt(plot.d) + colnames(scaled.tpm) <- c("AGI", "Condition", "z.score") + scaled.tpm$Condition <- gsub('b_', '', scaled.tpm$Condition) + scaled.tpm$Condition <- factor(gsub('mean.tpm_', '', scaled.tpm$Condition), levels = c('C' ,'S', 'M', 'H', 'SM', 'SH', 'MH', 'SMH')) + + ###Building correlations between the center means for resorting + + ## 1. Calculate mean scores + mean.scores <- merge(scaled.tpm, cluster.mship, 'AGI') + mean.scores <- summarySE(mean.scores, measurevar = 'z.score', groupvars = c('km', 'Condition')) + ## 2. Pick the cluster with the least deviation from zero (most boring cluster) as a reference + + min.dev <- order(abs(summarySE(mean.scores, measurevar = 'z.score', groupvars = c('km'))$z.score))[1] + + cor.mat <- c() + for(j in unique(mean.scores$km)) + { + cor.mat[j] <- cor(x = subset(mean.scores, km == min.dev, z.score), y = subset(mean.scores, km == j, z.score)) + } + + re.ordercluster <- cbind(km.new.order = sort(unique(mean.scores$km)), km = order(cor.mat, decreasing = T)) + + cluster.mship <- merge(cluster.mship, re.ordercluster, 'km') + cluster.mship <- merge(cluster.mship, count(cluster.mship$km.new.order), by.x = 'km.new.order', by.y = 'x') + + cluster.mship$label <- paste0('Cluster ', cluster.mship$km.new.order, ' (', cluster.mship$freq, ' genes)') + + cluster.results[[as.character(i)]] <- cluster.mship[,c(1, 3, 5)] + + + } + + save(cluster.results, file = paste0('04_ClusterResultsWithLFC_CutOff_', LFC_cut, '/', nice.date, '_ClusterResults.RData')) + + + + + ############################################################################################################ + ### Category enrichment using fisher's exact + ############################################################################################################ + + + alphaEnrich <- 0.05 + pAdjust <- "BH" + + GO.type <- 'P' + + q_thresh <- 0.05 + + fish.list <- list() + for(k in names(cluster.results)) + { + + current.cluster.appr <- cluster.results[[k]] + + # mapman.subset <- droplevels(merge(mapman.X4, current.cluster.appr, "AGI")) + mapman.subset <- droplevels(merge(mapman, current.cluster.appr, "AGI")) + GO.subset <- droplevels(merge(GOAnno[[GO.type]], current.cluster.appr, "AGI")) + + for(i in unique(current.cluster.appr$km.new.order)) + { + + ############################################################################################################ + ### Mapman + ############################################################################################################ + + + fishers.results <- data.frame(matrix(NA, 0,3)) + for(mapman.ID in unique(mapman.subset$BIN1)) + { + gene.count <- length(unique(mapman.subset[mapman.subset$BIN1 == mapman.ID,"AGI"])) + + InIn <- sum(mapman.subset$km.new.order == i & mapman.subset$BIN1 == mapman.ID) + InOut <- sum(mapman.subset$km.new.order == i & mapman.subset$BIN1 != mapman.ID) + OutIn <- sum((mapman.subset$km.new.order != i & mapman.subset$BIN1 == mapman.ID)) + OutOut <- sum((mapman.subset$km.new.order != i & mapman.subset$BIN1 != mapman.ID)) + + p <- fisher.test(matrix(c(InIn, InOut, OutIn, OutOut), nrow = 2))$p.value + + if(InIn/InOut > OutIn/OutOut){represented = "over"}else{represented = "under"} + + fishers.results <- rbind.data.frame(fishers.results, cbind.data.frame(mapman.ID, p, represented, gene.count, InIn, InOut, OutIn, OutOut)) + + } + + fishers.results[, pAdjust] <- p.adjust(fishers.results$p, method = pAdjust) + # ### Remove non-significant + # fishers.results <- fishers.results[fishers.results[,pAdjust] <= alphaEnrich,] + # ### Remove under-represented + fishers.results <- fishers.results[fishers.results$represented == "over",] + + fishers.results$km.new.order <- i + + fish.list[['mapman']][[k]] <- rbind(fish.list[['mapman']][[k]], fishers.results) + + + ############################################################################################################ + ### GO terms + ############################################################################################################ + + + fishers.results <- data.frame(matrix(NA, 0,3)) + for(GO.Term in unique(GO.subset$GO.Term)) + { + + gene.count <- length(unique(GO.subset[GO.subset$GO.Term == GO.Term,"AGI"])) + + InIn <- sum(GO.subset$km.new.order == i & GO.subset$GO.Term == GO.Term) + InOut <- sum(GO.subset$km.new.order == i & GO.subset$GO.Term != GO.Term) + OutIn <- sum((GO.subset$km.new.order != i & GO.subset$GO.Term == GO.Term)) + OutOut <- sum((GO.subset$km.new.order != i & GO.subset$GO.Term != GO.Term)) + + p <- fisher.test(matrix(c(InIn, InOut, OutIn, OutOut), nrow = 2))$p.value + + + if(InIn/InOut > OutIn/OutOut){represented = "over"}else{represented = "under"} + + fishers.results <- rbind.data.frame(fishers.results, cbind.data.frame(GO.Term, p, represented, gene.count, InIn, InOut, OutIn, OutOut)) + + } + + fishers.results[, pAdjust] <- p.adjust(fishers.results$p, method = pAdjust) + # ### Remove non-significant + # fishers.results <- fishers.results[fishers.results[,pAdjust] <= alphaEnrich,] + # ### Remove under-represented + fishers.results <- fishers.results[fishers.results$represented == "over",] + + + fishers.results$km.new.order <- i + + fish.list[['go']][[k]] <- rbind(fish.list[['go']][[k]], fishers.results) + + + } + } + + # save(fish.list, file = paste0(nice.date, '_02_FishList.RData')) + + + + + pdf(paste0('04_ClusterResultsWithLFC_CutOff_', LFC_cut, '/', nice.date, '_CLusterIteration.pdf'), width = 20, height = 20) + + for(k in names(cluster.results)) + { + + go.ex <- drop.levels(subset(fish.list[['go']][[k]], BH <= alphaEnrich)) + go.ex$GO.Term <- factor(go.ex$GO.Term, levels = rev(sort(unique(go.ex$GO.Term)))) + + go.plot <- ggplot(go.ex, aes(x=as.factor(km.new.order), y=GO.Term, label = InIn)) + + geom_label() + + xlab('k-means cluster') + + theme_dominik + theme(aspect.ratio = 2) + + theme(panel.grid = element_line(color = 'grey70'), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + + mm.ex <- drop.levels(subset(fish.list[['mapman']][[k]], BH <= alphaEnrich)) + mm.ex$mapman.ID <- factor(mm.ex$mapman.ID, levels = rev(sort(unique(mm.ex$mapman.ID)))) + + + mm.plot <- ggplot(mm.ex, aes(x=as.factor(km.new.order), y=mapman.ID, label = InIn)) + + geom_label() + + xlab('k-means cluster') + + theme_dominik + theme(aspect.ratio = 2) + + theme(panel.grid = element_line(color = 'grey70'), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + + plot.data <- merge(scaled.tpm, cluster.results[[k]], 'AGI') + plot.data$label <- factor(plot.data$label, levels = unique(plot.data[order(plot.data$km.new.order), 'label'])) + + cluster.plot <- ggplot(plot.data, aes(x=Condition, y=z.score, group=AGI)) + + # geom_boxplot(aes(group = Condition)) + + geom_violin(aes(group = Condition), scale = 'width') + + geom_hline(yintercept = 0, color = 'red') + + labs(x="Treatment", y="log-FC (Trtmt / Control)") + + facet_wrap(~label, scales = 'free', nrow = 2) + + # facet_wrap(~km, scales = 'free') + + theme_dominik + + theme(strip.background = element_blank(), strip.text = element_text(size = 12)) + + grid.arrange(cluster.plot, gtable_cbind(ggplotGrob(mm.plot), ggplotGrob(go.plot)), nrow = 2, + top = 'k-means clustering of relative expression', + bottom = 'A, log2-FC (Trtmt / Control) as violin diagram; Red line: control reference\n + B, mapman terms over-represented in each cluster (according to Fisher´s Exact Test, q < 0.05); Numbers: gene count\n + C, gene ontology terms over-represented in each cluster (according to Fisher´s Exact Test, q < 0.05); Numbers: gene count') + + } + + dev.off() + + + +} + + + + + +load("04_ClusterResultsWithLFC_CutOff_1/190731_ClusterResults.RData") +cluster.results <- cluster.results$`8`[,1:2] + +colnames(cluster.results) <- c('kmeans_cluster', 'AGI') +save(cluster.results, file = '04_190731_ClusterResults_final.RData') + + + diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_190731_ClusterResults_final.RData b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_190731_ClusterResults_final.RData new file mode 100644 index 0000000000000000000000000000000000000000..4100f4628d8fdcd49937be013685d6dbe7393194 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_190731_ClusterResults_final.RData differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans.R b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans.R new file mode 100644 index 0000000000000000000000000000000000000000..0418dc1ec29aa90beddb1320697bd8f878a4e6b5 --- /dev/null +++ b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans.R @@ -0,0 +1,357 @@ + + +required.packages <- c("Rmisc", "plyr", "ggplot2", "reshape2", "gridExtra", 'openxlsx', 'sleuth', 'amap', 'ggdendro', 'ellipse', 'UpSetR', 'VennDiagram') + +for(package in required.packages) +{ + print(package) + ## Check if package is installed. If not, install + if(!package %in% row.names(installed.packages())){install.packages(package, repos ="https://cran.uni-muenster.de/")} + # ## Check if package is up to date. If not, update + # update.packages(package, repos = "https://cran.uni-muenster.de/") + ## Load package + library(package, character.only = T) +} + + +theme_dominik <- + theme(panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + theme(aspect.ratio = 1) + + theme(axis.ticks = element_line(color = "black"), axis.text = element_text(color = "black")) + + theme(strip.background = element_blank(), strip.text = element_text(size = 8)) + + theme(legend.position = "bottom", legend.direction = "horizontal", legend.title = element_blank()) + #, legend.key.size = unit(0.05, "npc") + theme(plot.margin=unit(c(1, 1, 1, 1),"cm")) + + theme(plot.title = element_text(hjust = 0, face = "bold", size = 8)) + + +load("04_ClusterResultsWithLFC_CutOff_1/190731_ClusterResults.RData") + +cluster.resultsexport <- cluster.results$`8` + +# colnames(cluster.resultsexport) <- c('kmeans_cluster', 'AGI') +save(cluster.resultsexport, file = '05_190731_ClusterResults_final.RData') + + LFC_cut <- 1 + sleuth.results[abs(sleuth.results$b) < LFC_cut & !is.na(sleuth.results$b), 'qval'] <- 1 + + + ######################################## + #### k-means of only genes with at least one sig diff + ######################################## + + sleuth.exprt$AGI <- do.call(rbind, strsplit(split = ".", as.character(sleuth.exprt$target_id), fixed = T))[,1] + sleuth.results$AGI <- do.call(rbind, strsplit(split = ".", as.character(sleuth.results$target_id), fixed = T))[,1] + + lfcs <- sleuth.exprt[,grep('^b_', colnames(sleuth.exprt))] + row.names(lfcs) <- sleuth.exprt$AGI + + sig.lfcs <- lfcs[row.names(lfcs) %in% unique(subset(sleuth.results, qval <= 0.05, AGI, drop = T)), ] + sig.lfcs[is.na(sig.lfcs)] <- 0 + + sub_to_cluster <- sig.lfcs + + plot.d <- as.data.frame(sub_to_cluster) + plot.d$AGI <- row.names(plot.d) + + scaled.tpm <- melt(plot.d) + colnames(scaled.tpm) <- c("AGI", "Condition", "z.score") + scaled.tpm$Condition <- gsub('b_', '', scaled.tpm$Condition) + scaled.tpm$Condition <- factor(gsub('mean.tpm_', '', scaled.tpm$Condition), levels = c('C' ,'S', 'M', 'H', 'SM', 'SH', 'MH', 'SMH')) + + + + ############################################################################################################ + ### Category enrichment using fisher's exact + ############################################################################################################ + + + alphaEnrich <- 0.05 + pAdjust <- "BH" + + GO.type <- 'P' + + q_thresh <- 0.05 + + fish.list <- list() + + current.cluster.appr <- cluster.resultsexport + + # mapman.subset <- droplevels(merge(mapman.X4, current.cluster.appr, "AGI")) + mapman.subset <- droplevels(merge(mapman, current.cluster.appr, "AGI")) + GO.subset <- droplevels(merge(GOAnno[[GO.type]], current.cluster.appr, "AGI")) + + for(i in unique(current.cluster.appr$km.new.order)) + { + + ############################################################################################################ + ### Mapman + ############################################################################################################ + + + fishers.results <- data.frame(matrix(NA, 0,3)) + for(mapman.ID in unique(mapman.subset$BIN1)) + { + gene.count <- length(unique(mapman.subset[mapman.subset$BIN1 == mapman.ID,"AGI"])) + + InIn <- sum(mapman.subset$km.new.order == i & mapman.subset$BIN1 == mapman.ID) + InOut <- sum(mapman.subset$km.new.order == i & mapman.subset$BIN1 != mapman.ID) + OutIn <- sum((mapman.subset$km.new.order != i & mapman.subset$BIN1 == mapman.ID)) + OutOut <- sum((mapman.subset$km.new.order != i & mapman.subset$BIN1 != mapman.ID)) + + p <- fisher.test(matrix(c(InIn, InOut, OutIn, OutOut), nrow = 2))$p.value + + if(InIn/InOut > OutIn/OutOut){represented = "over"}else{represented = "under"} + + fishers.results <- rbind.data.frame(fishers.results, cbind.data.frame(mapman.ID, p, represented, gene.count, InIn, InOut, OutIn, OutOut)) + + } + + fishers.results[, pAdjust] <- p.adjust(fishers.results$p, method = pAdjust) + # ### Remove non-significant + # fishers.results <- fishers.results[fishers.results[,pAdjust] <= alphaEnrich,] + # ### Remove under-represented + fishers.results <- fishers.results[fishers.results$represented == "over",] + + fishers.results$km.new.order <- i + + fish.list[['mapman']] <- rbind(fish.list[['mapman']], fishers.results) + + + ############################################################################################################ + ### GO terms + ############################################################################################################ + + + fishers.results <- data.frame(matrix(NA, 0,3)) + for(GO.Term in unique(GO.subset$GO.Term)) + { + + gene.count <- length(unique(GO.subset[GO.subset$GO.Term == GO.Term,"AGI"])) + + InIn <- sum(GO.subset$km.new.order == i & GO.subset$GO.Term == GO.Term) + InOut <- sum(GO.subset$km.new.order == i & GO.subset$GO.Term != GO.Term) + OutIn <- sum((GO.subset$km.new.order != i & GO.subset$GO.Term == GO.Term)) + OutOut <- sum((GO.subset$km.new.order != i & GO.subset$GO.Term != GO.Term)) + + p <- fisher.test(matrix(c(InIn, InOut, OutIn, OutOut), nrow = 2))$p.value + + + if(InIn/InOut > OutIn/OutOut){represented = "over"}else{represented = "under"} + + fishers.results <- rbind.data.frame(fishers.results, cbind.data.frame(GO.Term, p, represented, gene.count, InIn, InOut, OutIn, OutOut)) + + } + + fishers.results[, pAdjust] <- p.adjust(fishers.results$p, method = pAdjust) + # ### Remove non-significant + # fishers.results <- fishers.results[fishers.results[,pAdjust] <= alphaEnrich,] + # ### Remove under-represented + fishers.results <- fishers.results[fishers.results$represented == "over",] + + + fishers.results$km.new.order <- i + + fish.list[['go']] <- rbind(fish.list[['go']], fishers.results) + + + } + + + + + ############################################################################################################ + ### Plot stuff + ############################################################################################################ + + + + + + + + go.ex <- drop.levels(subset(fish.list[['go']], BH <= alphaEnrich)) + go.ex$GO.Term <- factor(go.ex$GO.Term, levels = rev(sort(unique(go.ex$GO.Term)))) + + go.plot <- ggplot(go.ex, aes(x=as.factor(km.new.order), y=GO.Term, label = InIn)) + + geom_label() + + xlab('k-means cluster') + + theme_dominik + theme(aspect.ratio = 2) + + theme(panel.grid = element_line(color = 'grey70'), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + + mm.ex <- drop.levels(subset(fish.list[['mapman']], BH <= alphaEnrich)) + mm.ex$mapman.ID <- factor(mm.ex$mapman.ID, levels = rev(sort(unique(mm.ex$mapman.ID)))) + + + mm.plot <- ggplot(mm.ex, aes(x=as.factor(km.new.order), y=mapman.ID, label = InIn)) + + geom_label() + + xlab('k-means cluster') + + theme_dominik + theme(aspect.ratio = 2) + + theme(panel.grid = element_line(color = 'grey70'), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + + plot.data <- merge(scaled.tpm, cluster.resultsexport, 'AGI') + plot.data$label <- factor(plot.data$label, levels = unique(plot.data[order(plot.data$km.new.order), 'label'])) + + cluster.plot <- ggplot(plot.data, aes(x=Condition, y=z.score, group=AGI)) + + geom_violin(aes(group = Condition), scale = 'width') + + geom_hline(yintercept = 0, color = 'red') + + labs(x="Treatment", y="log-FC (Trtmt / Control)") + + facet_wrap(~label, scales = 'free', nrow = 2) + + theme_dominik + + theme(strip.background = element_blank(), strip.text = element_text(size = 12)) + + + + + dir.create('05_kmeans_final') + setwd('05_kmeans_final') + + + png(paste(nice.date, 'go.plot', '.png', sep = ''), res = 300, width = 3200, height = 2000) + print(go.plot) + dev.off() + + png(paste(nice.date, 'mm.plot', '.png', sep = ''), res = 300, width = 3200, height = 2000) + print(mm.plot) + dev.off() + + png(paste(nice.date, 'cluster.plot', '.png', sep = ''), res = 300, width = 3200, height = 2000) + print(cluster.plot) + dev.off() + + + + + + ############################################################################################################ + ### Plot heat maps with LFCs + ############################################################################################################ + + + lfc.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('^b_', colnames(sleuth.exprt)), value.name = 'lfc') + q.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('qvalue_', colnames(sleuth.exprt)), value.name = 'q') + + lfc.table <- cbind.data.frame(lfc.table, q.table[,2:3]) + lfc.table$q.star <- cut(lfc.table$q, breaks = c(0, 0.001, 0.01, 0.05, 1), labels = c('***', '**', '*', ' '), include.lowest = T) + + lfc.table$condition <- gsub('b_' , '', lfc.table$variable) + lfc.table$condition <- factor(lfc.table$condition, unique(lfc.table[order(lfc.table$variable), 'condition'])) + + lfc.table <- merge(lfc.table, araport,'AGI', all.x = T) + lfc.table[is.na(lfc.table$name), 'name'] <- lfc.table[is.na(lfc.table$name), 'AGI'] + + + lfc.table[!lfc.table$name == lfc.table$AGI, 'gene.label'] <- paste0(lfc.table[!lfc.table$name == lfc.table$AGI, 'name'], ' (', lfc.table[!lfc.table$name == lfc.table$AGI, 'AGI'], ')') + lfc.table[lfc.table$name == lfc.table$AGI, 'gene.label'] <- lfc.table[lfc.table$name == lfc.table$AGI, 'AGI'] + + + #### Cluster 4 + + km4 <- merge(subset(cluster.resultsexport, km.new.order == '4'), lfc.table, 'AGI') + km4$gene.label <- factor(km4$gene.label, levels = sort(unique(km4$gene.label), decreasing = T)) + + png(paste(nice.date, 'heatmap_lfc_cluster4', '.png', sep = ''), res = 300, width = 1200, height = 4000) + + ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + + # geom_tile() + + geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + + geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + + geom_text(size = 3, color = "black", vjust = 0.7) + + scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", + midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + + theme_dominik + + theme(aspect.ratio = 7) + + theme(axis.title = element_blank()) + + + dev.off() + + + + #### Cluster 7 + + km7 <- merge(subset(cluster.resultsexport, km.new.order == '7'), lfc.table, 'AGI') + km7$gene.label <- factor(km7$gene.label, levels = sort(unique(km7$gene.label), decreasing = T)) + + png(paste(nice.date, 'heatmap_lfc_cluster7', '.png', sep = ''), res = 300, width = 1200, height = 4000) + + ggplot(km7, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + + # geom_tile() + + geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + + geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + + geom_text(size = 3, color = "black", vjust = 0.7) + + scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", + midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + + theme_dominik + + theme(aspect.ratio = 7) + + theme(axis.title = element_blank()) + + + dev.off() + + # + # + # ############################################################################################################ + # ### Plot heat maps with means + # ############################################################################################################ + # + # q.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('qvalue_', colnames(sleuth.exprt)), value.name = 'q') + # + # mean.table <- merge(means, q.table, 'AGI') + # mean.table$q.star <- cut(mean.table$q, breaks = c(0, 0.001, 0.01, 0.05, 1), labels = c('***', '**', '*', ' '), include.lowest = T) + # + # mean.table$condition <- factor(mean.table$Cond_short.y, c('C', 'S', 'M', 'H', 'SM', 'SH', 'MH', 'SMH')) + # + # mean.table <- merge(mean.table, araport,'AGI', all.x = T) + # mean.table[is.na(mean.table$name), 'name'] <- mean.table[is.na(mean.table$name), 'AGI'] + # + # mean.table[!mean.table$name == mean.table$AGI, 'gene.label'] <- paste0(mean.table[!mean.table$name == mean.table$AGI, 'name'], ' (', mean.table[!mean.table$name == mean.table$AGI, 'AGI'], ')') + # mean.table[mean.table$name == mean.table$AGI, 'gene.label'] <- mean.table[mean.table$name == mean.table$AGI, 'AGI'] + # + # + # #### Cluster 4 + # + # km4 <- merge(subset(cluster.resultsexport, km.new.order == '4'), mean.table, 'AGI') + # km4$gene.label <- factor(km4$gene.label, levels = sort(unique(km4$gene.label), decreasing = T)) + # + # png(paste(nice.date, 'heatmap_means_cluster4', '.png', sep = ''), res = 300, width = 1200, height = 4000) + # + # ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = tpm)) + + # # geom_tile() + + # geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + + # geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + + # geom_text(size = 3, color = "black", vjust = 0.7) + + # scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c") + + # theme_dominik + + # theme(aspect.ratio = 7) + + # theme(axis.title = element_blank()) + # + # + # dev.off() + # + # + # + # #### Cluster 7 + # + # km7 <- merge(subset(cluster.resultsexport, km.new.order == '7'), mean.table, 'AGI') + # km7$gene.label <- factor(km7$gene.label, levels = sort(unique(km7$gene.label), decreasing = T)) + # + # png(paste(nice.date, 'heatmap_means_cluster7', '.png', sep = ''), res = 300, width = 1200, height = 4000) + # + # ggplot(km7, aes(x = condition, y = gene.label, label = q.star, fill = tpm)) + + # # geom_tile() + + # geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + + # geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + + # geom_text(size = 3, color = "black", vjust = 0.7) + + # scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c") + + # theme_dominik + + # theme(aspect.ratio = 7) + + # theme(axis.title = element_blank()) + # + # + # dev.off() + # + # + + diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/.Rhistory b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/.Rhistory new file mode 100644 index 0000000000000000000000000000000000000000..1c4ed75bf74ac6e7629921ea405b3f383604da12 --- /dev/null +++ b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/.Rhistory @@ -0,0 +1,512 @@ +sig.lfcs <- lfcs[row.names(lfcs) %in% unique(subset(sleuth.results, qval <= 0.05, AGI, drop = T)), ] +sig.lfcs[is.na(sig.lfcs)] <- 0 +sub_to_cluster <- sig.lfcs +plot.d <- as.data.frame(sub_to_cluster) +plot.d$AGI <- row.names(plot.d) +scaled.tpm <- melt(plot.d) +colnames(scaled.tpm) <- c("AGI", "Condition", "z.score") +scaled.tpm$Condition <- gsub('b_', '', scaled.tpm$Condition) +scaled.tpm$Condition <- factor(gsub('mean.tpm_', '', scaled.tpm$Condition), levels = c('C' ,'S', 'M', 'H', 'SM', 'SH', 'MH', 'SMH')) +cluster.resultsexport +alphaEnrich <- 0.05 +pAdjust <- "BH" +GO.type <- 'P' +q_thresh <- 0.05 +plot.data <- merge(scaled.tpm, cluster.resultsexport, 'AGI') +plot.data$label <- factor(plot.data$label, levels = unique(plot.data[order(plot.data$kmeans_cluster), 'label'])) +order(plot.data$kmeans_cluster) +plot.data$kmeans_cluster +cluster.resultsexport <- cluster.results$`8` +# colnames(cluster.resultsexport) <- c('kmeans_cluster', 'AGI') +save(cluster.resultsexport, file = '05_190731_ClusterResults_final.RData') +load("04_ClusterResultsWithLFC_CutOff_1/190731_ClusterResults.RData") +cluster.resultsexport <- cluster.results$`8` +# colnames(cluster.resultsexport) <- c('kmeans_cluster', 'AGI') +save(cluster.resultsexport, file = '05_190731_ClusterResults_final.RData') +current.cluster.appr <- cluster.resultsexport +# mapman.subset <- droplevels(merge(mapman.X4, current.cluster.appr, "AGI")) +mapman.subset <- droplevels(merge(mapman, current.cluster.appr, "AGI")) +GO.subset <- droplevels(merge(GOAnno[[GO.type]], current.cluster.appr, "AGI")) +unique(current.cluster.appr$km.new.order) +names(cluster.results) +alphaEnrich <- 0.05 +pAdjust <- "BH" +GO.type <- 'P' +q_thresh <- 0.05 +fish.list <- list() +current.cluster.appr <- cluster.resultsexport +# mapman.subset <- droplevels(merge(mapman.X4, current.cluster.appr, "AGI")) +mapman.subset <- droplevels(merge(mapman, current.cluster.appr, "AGI")) +GO.subset <- droplevels(merge(GOAnno[[GO.type]], current.cluster.appr, "AGI")) +for(i in unique(current.cluster.appr$km.new.order)) +{ +############################################################################################################ +### Mapman +############################################################################################################ +fishers.results <- data.frame(matrix(NA, 0,3)) +for(mapman.ID in unique(mapman.subset$BIN1)) +{ +gene.count <- length(unique(mapman.subset[mapman.subset$BIN1 == mapman.ID,"AGI"])) +InIn <- sum(mapman.subset$km.new.order == i & mapman.subset$BIN1 == mapman.ID) +InOut <- sum(mapman.subset$km.new.order == i & mapman.subset$BIN1 != mapman.ID) +OutIn <- sum((mapman.subset$km.new.order != i & mapman.subset$BIN1 == mapman.ID)) +OutOut <- sum((mapman.subset$km.new.order != i & mapman.subset$BIN1 != mapman.ID)) +p <- fisher.test(matrix(c(InIn, InOut, OutIn, OutOut), nrow = 2))$p.value +if(InIn/InOut > OutIn/OutOut){represented = "over"}else{represented = "under"} +fishers.results <- rbind.data.frame(fishers.results, cbind.data.frame(mapman.ID, p, represented, gene.count, InIn, InOut, OutIn, OutOut)) +} +fishers.results[, pAdjust] <- p.adjust(fishers.results$p, method = pAdjust) +# ### Remove non-significant +# fishers.results <- fishers.results[fishers.results[,pAdjust] <= alphaEnrich,] +# ### Remove under-represented +fishers.results <- fishers.results[fishers.results$represented == "over",] +fishers.results$km.new.order <- i +fish.list[['mapman']] <- rbind(fish.list[['mapman']], fishers.results) +############################################################################################################ +### GO terms +############################################################################################################ +fishers.results <- data.frame(matrix(NA, 0,3)) +for(GO.Term in unique(GO.subset$GO.Term)) +{ +gene.count <- length(unique(GO.subset[GO.subset$GO.Term == GO.Term,"AGI"])) +InIn <- sum(GO.subset$km.new.order == i & GO.subset$GO.Term == GO.Term) +InOut <- sum(GO.subset$km.new.order == i & GO.subset$GO.Term != GO.Term) +OutIn <- sum((GO.subset$km.new.order != i & GO.subset$GO.Term == GO.Term)) +OutOut <- sum((GO.subset$km.new.order != i & GO.subset$GO.Term != GO.Term)) +p <- fisher.test(matrix(c(InIn, InOut, OutIn, OutOut), nrow = 2))$p.value +if(InIn/InOut > OutIn/OutOut){represented = "over"}else{represented = "under"} +fishers.results <- rbind.data.frame(fishers.results, cbind.data.frame(GO.Term, p, represented, gene.count, InIn, InOut, OutIn, OutOut)) +} +fishers.results[, pAdjust] <- p.adjust(fishers.results$p, method = pAdjust) +# ### Remove non-significant +# fishers.results <- fishers.results[fishers.results[,pAdjust] <= alphaEnrich,] +# ### Remove under-represented +fishers.results <- fishers.results[fishers.results$represented == "over",] +fishers.results$km.new.order <- i +fish.list[['go']] <- rbind(fish.list[['go']], fishers.results) +} +plot.data <- merge(scaled.tpm, cluster.resultsexport, 'AGI') +plot.data$label <- factor(plot.data$label, levels = unique(plot.data[order(plot.data$km.new.order), 'label'])) +cluster.plot <- ggplot(plot.data, aes(x=Condition, y=z.score, group=AGI)) + +geom_violin(aes(group = Condition), scale = 'width') + +geom_hline(yintercept = 0, color = 'red') + +labs(x="Treatment", y="log-FC (Trtmt / Control)") + +facet_wrap(~label, scales = 'free', nrow = 2) + +theme_dominik + +theme(strip.background = element_blank(), strip.text = element_text(size = 12)) +cluster.plot +dir.create('05_kmeans_final') +go.ex <- drop.levels(subset(fish.list[['go']], BH <= alphaEnrich)) +go.ex$GO.Term <- factor(go.ex$GO.Term, levels = rev(sort(unique(go.ex$GO.Term)))) +go.plot <- ggplot(go.ex, aes(x=as.factor(km.new.order), y=GO.Term, label = InIn)) + +geom_label() + +xlab('k-means cluster') + +theme_dominik + theme(aspect.ratio = 2) + +theme(panel.grid = element_line(color = 'grey70'), panel.background = element_blank(), panel.border = element_rect(fill = NA)) +go.plot +setwd('05_kmeans_final') +png(paste(nice.date, 'go.plot', '.png', sep = ''), res = 300, width = 1600, height = 1600) +print(go.plot) +dev.off() +png(paste(nice.date, 'go.plot', '.png', sep = ''), res = 300, width = 1600, height = 800) +print(go.plot) +dev.off() +png(paste(nice.date, 'go.plot', '.png', sep = ''), res = 300, width = 3200, height = 1600) +print(go.plot) +dev.off() +png(paste(nice.date, 'go.plot', '.png', sep = ''), res = 300, width = 3200, height = 2000) +print(go.plot) +dev.off() +png(paste(nice.date, 'mm.plot', '.png', sep = ''), res = 300, width = 3200, height = 2000) +print(mm.plot) +dev.off() +mm.ex <- drop.levels(subset(fish.list[['mapman']], BH <= alphaEnrich)) +mm.ex$mapman.ID <- factor(mm.ex$mapman.ID, levels = rev(sort(unique(mm.ex$mapman.ID)))) +mm.plot <- ggplot(mm.ex, aes(x=as.factor(km.new.order), y=mapman.ID, label = InIn)) + +geom_label() + +xlab('k-means cluster') + +theme_dominik + theme(aspect.ratio = 2) + +theme(panel.grid = element_line(color = 'grey70'), panel.background = element_blank(), panel.border = element_rect(fill = NA)) +plot.data <- merge(scaled.tpm, cluster.resultsexport, 'AGI') +plot.data$label <- factor(plot.data$label, levels = unique(plot.data[order(plot.data$km.new.order), 'label'])) +cluster.plot <- ggplot(plot.data, aes(x=Condition, y=z.score, group=AGI)) + +geom_violin(aes(group = Condition), scale = 'width') + +geom_hline(yintercept = 0, color = 'red') + +labs(x="Treatment", y="log-FC (Trtmt / Control)") + +facet_wrap(~label, scales = 'free', nrow = 2) + +theme_dominik + +theme(strip.background = element_blank(), strip.text = element_text(size = 12)) +png(paste(nice.date, 'mm.plot', '.png', sep = ''), res = 300, width = 3200, height = 2000) +print(mm.plot) +dev.off() +png(paste(nice.date, 'cluster.plot', '.png', sep = ''), res = 300, width = 3200, height = 2000) +print(cluster.plot) +dev.off() +cluster.plot +View(plot.data) +lfc.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('^b_', colnames(sleuth.exprt)), value.name = 'lfc') +View(lfc.table) +q.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('qvalue_', colnames(sleuth.exprt)), value.name = 'q') +head(q.table) +lfc.table <- cbind.data.frame(lfc.table, q.table[,2:3]) +lfc.table$q.star <- cut(lfc.table$q, breaks = c(0, 0.001, 0.01, 0.05, 1), labels = c('***', '**', '*', ' '), include.lowest = T) +sig.at.some <- unique(subset(q.table, q <= 0.05, AGI, drop = T)) +count(mapman.X4[, c('BIN1', 'BIN2')]) +cwllwall <- merge(subset(mapman.X4, BIN1 == 'Cell wall' & BIN2 == 'cellulose'), lfc.table, 'AGI') +cwllwall$label <- gsub('b_' , '', cwllwall$variable) +cwllwall$label <- factor(cwllwall$label, unique(cwllwall[order(cwllwall$variable), 'label'])) +ggplot(cwllwall, aes(x = label, y = AGI, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 5) +candidates <- merge(candidates, araport, 'AGI') +candidates$label <- gsub('b_' , '', candidates$variable) +candidates <- merge(subset(GOAnno$P, GO.Term =='pathogenesis'), lfc.table, 'AGI') +category <- "pathogen" +candidates <- merge(candidates, araport, 'AGI') +candidates$label <- gsub('b_' , '', candidates$variable) +candidates$label <- factor(candidates$label, unique(candidates[order(candidates$variable), 'label'])) +# candidates$gene.label <- paste0(candidates$name, ' (', candidates$AGI, ')') +candidates$gene.label <- paste0(candidates$NAME, ' (', candidates$AGI, ')') +pdf(paste0(category, '.pdf'), width = nrow(candidates)/50, height = nrow(candidates)/50 * 2) +ggplot(candidates, aes(x = label, y = gene.label, label = q.star, fill = lfc)) + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +theme_dominik + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / Ctrl)", sep = ""))) + +theme(legend.position = 'right', legend.direction = 'vertical', legend.title = element_text(), axis.ticks = element_blank(), panel.background = element_blank()) + +theme(aspect.ratio = nrow(candidates)/50) + +labs(x = '', y = '') + +scale_x_discrete(position = 'top') + +ggtitle(category) +dev.off() +getwd() +cwllwall <- merge(subset(cluster.resultsexport, km.new.order == '4'), lfc.table, 'AGI') +km4 <- merge(subset(cluster.resultsexport, km.new.order == '4'), lfc.table, 'AGI') +cwllwall$variable +lfc.table$label <- gsub('b_' , '', lfc.table$variable) +lfc.table$label <- factor(lfc.table$label, unique(lfc.table[order(lfc.table$variable), 'label'])) +km4 <- merge(subset(cluster.resultsexport, km.new.order == '4'), lfc.table, 'AGI') +View(cluster.resultsexport) +lfc.table$condition <- gsub('b_' , '', lfc.table$variable) +lfc.table$condition <- factor(lfc.table$condition, unique(lfc.table[order(lfc.table$variable), 'condition'])) +lfc.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('^b_', colnames(sleuth.exprt)), value.name = 'lfc') +q.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('qvalue_', colnames(sleuth.exprt)), value.name = 'q') +head(q.table) +lfc.table <- cbind.data.frame(lfc.table, q.table[,2:3]) +lfc.table$q.star <- cut(lfc.table$q, breaks = c(0, 0.001, 0.01, 0.05, 1), labels = c('***', '**', '*', ' '), include.lowest = T) +lfc.table$condition <- gsub('b_' , '', lfc.table$variable) +lfc.table$condition <- factor(lfc.table$condition, unique(lfc.table[order(lfc.table$variable), 'condition'])) +km4 <- merge(subset(cluster.resultsexport, km.new.order == '4'), lfc.table, 'AGI') +ggplot(km4, aes(x = condition, y = AGI, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 5) +View(km4) +lfc.table <- merge(lfc.table, araport,'AGI') +lfc.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('^b_', colnames(sleuth.exprt)), value.name = 'lfc') +q.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('qvalue_', colnames(sleuth.exprt)), value.name = 'q') +head(q.table) +lfc.table <- cbind.data.frame(lfc.table, q.table[,2:3]) +lfc.table$q.star <- cut(lfc.table$q, breaks = c(0, 0.001, 0.01, 0.05, 1), labels = c('***', '**', '*', ' '), include.lowest = T) +lfc.table$condition <- gsub('b_' , '', lfc.table$variable) +lfc.table$condition <- factor(lfc.table$condition, unique(lfc.table[order(lfc.table$variable), 'condition'])) +lfc.table <- merge(lfc.table, araport,'AGI', all.x = T) +lfc.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('^b_', colnames(sleuth.exprt)), value.name = 'lfc') +q.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('qvalue_', colnames(sleuth.exprt)), value.name = 'q') +head(q.table) +lfc.table <- cbind.data.frame(lfc.table, q.table[,2:3]) +lfc.table$q.star <- cut(lfc.table$q, breaks = c(0, 0.001, 0.01, 0.05, 1), labels = c('***', '**', '*', ' '), include.lowest = T) +lfc.table$condition <- gsub('b_' , '', lfc.table$variable) +lfc.table$condition <- factor(lfc.table$condition, unique(lfc.table[order(lfc.table$variable), 'condition'])) +lfc.table <- merge(lfc.table, araport,'AGI', all.x = T) +View(lfc.table) +lfc.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('^b_', colnames(sleuth.exprt)), value.name = 'lfc') +q.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('qvalue_', colnames(sleuth.exprt)), value.name = 'q') +lfc.table <- cbind.data.frame(lfc.table, q.table[,2:3]) +lfc.table$q.star <- cut(lfc.table$q, breaks = c(0, 0.001, 0.01, 0.05, 1), labels = c('***', '**', '*', ' '), include.lowest = T) +lfc.table$condition <- gsub('b_' , '', lfc.table$variable) +lfc.table$condition <- factor(lfc.table$condition, unique(lfc.table[order(lfc.table$variable), 'condition'])) +lfc.table <- merge(lfc.table, araport,'AGI', all.x = T) +km4 <- merge(subset(cluster.resultsexport, km.new.order == '4'), lfc.table, 'AGI') +# ggplot(km4, aes(x = condition, y = AGI, label = q.star, fill = lfc)) + +ggplot(km4, aes(x = condition, y = name, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 5) +lfc.table[is.na(lfc.table$name), 'name'] <- lfc.table[is.na(lfc.table$name), 'AGI'] +paste0(lfc.table[!is.na(lfc.table$name), 'name'], ' (', lfc.table[!is.na(lfc.table$name), 'AGI'], ')') +lfc.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('^b_', colnames(sleuth.exprt)), value.name = 'lfc') +q.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('qvalue_', colnames(sleuth.exprt)), value.name = 'q') +lfc.table <- cbind.data.frame(lfc.table, q.table[,2:3]) +lfc.table$q.star <- cut(lfc.table$q, breaks = c(0, 0.001, 0.01, 0.05, 1), labels = c('***', '**', '*', ' '), include.lowest = T) +lfc.table$condition <- gsub('b_' , '', lfc.table$variable) +lfc.table$condition <- factor(lfc.table$condition, unique(lfc.table[order(lfc.table$variable), 'condition'])) +lfc.table <- merge(lfc.table, araport,'AGI', all.x = T) +lfc.table[is.na(lfc.table$name), 'name'] <- lfc.table[is.na(lfc.table$name), 'AGI'] +lfc.table$name == lfc.table$AGI +lfc.table[!lfc.table$name == lfc.table$AGI, 'gene.label'] <- paste0(lfc.table[!lfc.table$name == lfc.table$AGI, 'name'], ' (', lfc.table[!lfc.table$name == lfc.table$AGI, 'AGI'], ')') +lfc.table$gene.label +lfc.table[lfc.table$name == lfc.table$AGI, 'gene.label'] <- lfc.table[!lfc.table$name == lfc.table$AGI, 'AGI'] +lfc.table[!lfc.table$name == lfc.table$AGI, 'gene.label'] <- paste0(lfc.table[!lfc.table$name == lfc.table$AGI, 'name'], ' (', lfc.table[!lfc.table$name == lfc.table$AGI, 'AGI'], ')') +lfc.table[lfc.table$name == lfc.table$AGI, 'gene.label'] <- lfc.table[lfc.table$name == lfc.table$AGI, 'AGI'] +lfc.table$gene.label +km4 <- merge(subset(cluster.resultsexport, km.new.order == '4'), lfc.table, 'AGI') +# ggplot(km4, aes(x = condition, y = AGI, label = q.star, fill = lfc)) + +ggplot(km4, aes(x = condition, y = name, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 5) +km4$gene.label +factor(km4$gene.label, levels = rev(unique(km4$gene.label))) +km4$gene.label <- factor(km4$gene.label, levels = sort(unique(km4$gene.label), decreasing = T)) +km4$gene.label +# ggplot(km4, aes(x = condition, y = AGI, label = q.star, fill = lfc)) + +ggplot(km4, aes(x = condition, y = name, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 5) +# ggplot(km4, aes(x = condition, y = AGI, label = q.star, fill = lfc)) + +ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 5) +km4 <- merge(subset(cluster.resultsexport, km.new.order == '4'), lfc.table, 'AGI') +km4$gene.label <- factor(km4$gene.label, levels = sort(unique(km4$gene.label), decreasing = T)) +png(paste(nice.date, 'heatmap_cluster4', '.png', sep = ''), res = 300, width = 3200, height = 2000) +ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 5) + +theme(axis.title = element_blank()) +dev.off() +png(paste(nice.date, 'heatmap_cluster4', '.png', sep = ''), res = 300, width = 3200, height = 6400) +ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 5) + +theme(axis.title = element_blank()) +dev.off() +png(paste(nice.date, 'heatmap_cluster4', '.png', sep = ''), res = 300, width = 600, height = 3000) +ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 5) + +theme(axis.title = element_blank()) +dev.off() +png(paste(nice.date, 'heatmap_cluster4', '.png', sep = ''), res = 300, width = 1200, height = 3000) +ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 5) + +theme(axis.title = element_blank()) +dev.off() +png(paste(nice.date, 'heatmap_cluster4', '.png', sep = ''), res = 300, width = 1200, height = 3500) +ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 5) + +theme(axis.title = element_blank()) +dev.off() +png(paste(nice.date, 'heatmap_cluster4', '.png', sep = ''), res = 300, width = 1200, height = 4000) +ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 5) + +theme(axis.title = element_blank()) +dev.off() +png(paste(nice.date, 'heatmap_cluster4', '.png', sep = ''), res = 300, width = 1200, height = 4000) +ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 6) + +theme(axis.title = element_blank()) +dev.off() +png(paste(nice.date, 'heatmap_cluster4', '.png', sep = ''), res = 300, width = 1200, height = 4000) +ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 7) + +theme(axis.title = element_blank()) +dev.off() +km7 <- merge(subset(cluster.resultsexport, km.new.order == '7'), lfc.table, 'AGI') +km7$gene.label <- factor(km7$gene.label, levels = sort(unique(km7$gene.label), decreasing = T)) +png(paste(nice.date, 'heatmap_cluster7', '.png', sep = ''), res = 300, width = 1200, height = 4000) +ggplot(km7, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 7) + +theme(axis.title = element_blank()) +dev.off() +View(means) +View(q.table) +q.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('qvalue_', colnames(sleuth.exprt)), value.name = 'q') +mean.table <- merge(means, q.table, 'AGI') +mean.table$q.star <- cut(mean.table$q, breaks = c(0, 0.001, 0.01, 0.05, 1), labels = c('***', '**', '*', ' '), include.lowest = T) +View(mean.table) +order(lfc.table$variable) +lfc.table$condition +order(mean.table$Cond_short.y) +mean.table$condition <- factor(mean.table$condition, unique(mean.table[order(mean.table$variable), 'Cond_short.y'])) +mean.table$condition <- factor(mean.table$Cond_short.y, unique(mean.table[order(mean.table$variable), 'Cond_short.y'])) +mean.table$condition +mean.table$condition <- factor(mean.table$Cond_short.y, c('C', 'S', 'M', 'H', 'SM', 'SH', 'MH', 'SMH')) +mean.table$condition +mean.table <- merge(mean.table, araport,'AGI', all.x = T) +mean.table[is.na(mean.table$name), 'name'] <- mean.table[is.na(mean.table$name), 'AGI'] +mean.table[!mean.table$name == mean.table$AGI, 'gene.label'] <- paste0(mean.table[!mean.table$name == mean.table$AGI, 'name'], ' (', mean.table[!mean.table$name == mean.table$AGI, 'AGI'], ')') +mean.table[mean.table$name == mean.table$AGI, 'gene.label'] <- mean.table[mean.table$name == mean.table$AGI, 'AGI'] +km4 <- merge(subset(cluster.resultsexport, km.new.order == '4'), lfc.table, 'AGI') +km4$gene.label <- factor(km4$gene.label, levels = sort(unique(km4$gene.label), decreasing = T)) +png(paste(nice.date, 'heatmap_means_cluster4', '.png', sep = ''), res = 300, width = 1200, height = 4000) +ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +# limits = limits.lfc.sleuth.GT, +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 7) + +theme(axis.title = element_blank()) +dev.off() +View(km4) +km4 <- merge(subset(cluster.resultsexport, km.new.order == '4'), mean.table, 'AGI') +km4$gene.label <- factor(km4$gene.label, levels = sort(unique(km4$gene.label), decreasing = T)) +png(paste(nice.date, 'heatmap_means_cluster4', '.png', sep = ''), res = 300, width = 1200, height = 4000) +ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = tpm)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c") + +theme_dominik + +theme(aspect.ratio = 7) + +theme(axis.title = element_blank()) +dev.off() +View(scaled.tpm) +View(scaled.tpm) +#### Cluster 4 +km4 <- merge(subset(cluster.resultsexport, km.new.order == '4'), lfc.table, 'AGI') +km4$gene.label <- factor(km4$gene.label, levels = sort(unique(km4$gene.label), decreasing = T)) +png(paste(nice.date, 'heatmap_lfc_cluster4', '.png', sep = ''), res = 300, width = 1200, height = 4000) +ggplot(km4, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 7) + +theme(axis.title = element_blank()) +dev.off() +#### Cluster 7 +km7 <- merge(subset(cluster.resultsexport, km.new.order == '7'), lfc.table, 'AGI') +km7$gene.label <- factor(km7$gene.label, levels = sort(unique(km7$gene.label), decreasing = T)) +png(paste(nice.date, 'heatmap_lfc_cluster7', '.png', sep = ''), res = 300, width = 1200, height = 4000) +ggplot(km7, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + +# geom_tile() + +geom_point(color = "black", stroke = 1, fill = NA, size = 6, shape = 22) + +geom_point(alpha = 1, stroke = 1, size = 6, shape = 22) + +geom_text(size = 3, color = "black", vjust = 0.7) + +scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", +midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + +theme_dominik + +theme(aspect.ratio = 7) + +theme(axis.title = element_blank()) +dev.off() diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731cluster.plot.png b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731cluster.plot.png new file mode 100644 index 0000000000000000000000000000000000000000..47f7c0614622a9db3a145610dc386c1cc5abfe30 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731cluster.plot.png differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731go.plot.png b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731go.plot.png new file mode 100644 index 0000000000000000000000000000000000000000..46a52cbca177d460ed95fbcc301348e716518cba Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731go.plot.png differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731heatmap_lfc_cluster4.png b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731heatmap_lfc_cluster4.png new file mode 100644 index 0000000000000000000000000000000000000000..a60a84f8200f67297126a8884babca7e2a175190 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731heatmap_lfc_cluster4.png differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731heatmap_lfc_cluster7.png b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731heatmap_lfc_cluster7.png new file mode 100644 index 0000000000000000000000000000000000000000..16ae9ac940ea811543c0491e3e03ba7c034e4b93 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731heatmap_lfc_cluster7.png differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731mm.plot.png b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731mm.plot.png new file mode 100644 index 0000000000000000000000000000000000000000..0959e87e88fa18f78bb279b05550e4d2a5e841b6 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/05_kmeans_final/190731mm.plot.png differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/06_HeatMaps.R b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/06_HeatMaps.R new file mode 100644 index 0000000000000000000000000000000000000000..08626bbd48e4e218b453c19512b1dfc7045997d1 --- /dev/null +++ b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/06_HeatMaps.R @@ -0,0 +1,193 @@ + +required.packages <- c("Rmisc", "plyr", "ggplot2", "reshape2", "gridExtra", 'openxlsx', 'sleuth', 'amap', 'ggdendro', 'ellipse', 'UpSetR', 'VennDiagram') + +for(package in required.packages) +{ + print(package) + ## Check if package is installed. If not, install + if(!package %in% row.names(installed.packages())){install.packages(package, repos ="https://cran.uni-muenster.de/")} + # ## Check if package is up to date. If not, update + # update.packages(package, repos = "https://cran.uni-muenster.de/") + ## Load package + library(package, character.only = T) +} + + +theme_dominik <- + theme(panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(fill = NA, color = NA)) + + theme(aspect.ratio = 1) + + theme(axis.ticks = element_line(color = "black"), axis.text = element_text(color = "black")) + + # theme(legend.position = "bottom", legend.direction = "horizontal", legend.title = element_blank()) + #, legend.key.size = unit(0.05, "npc") + theme(plot.margin=unit(c(1, 1, 1, 1),"cm")) + + theme(plot.title = element_text(hjust = 0, face = "bold", size = 8)) + + +load('01_Sleuth.RData') +load('RNASeq_Annotation/Arabidopsis/02.0Araport11/02.0Araport.RData') + + # LFC_cut <- 1 + # sleuth.results[abs(sleuth.results$b) < LFC_cut & !is.na(sleuth.results$b), 'qval'] <- 1 + sleuth.results[is.na(sleuth.results$b), 'qval'] <- 1 + + sleuth.exprt$AGI <- do.call(rbind, strsplit(split = ".", as.character(sleuth.exprt$target_id), fixed = T))[,1] + sleuth.results$AGI <- do.call(rbind, strsplit(split = ".", as.character(sleuth.results$target_id), fixed = T))[,1] + + lfcs <- sleuth.exprt[,grep('^b_', colnames(sleuth.exprt))] + row.names(lfcs) <- sleuth.exprt$AGI + + sig.lfcs <- lfcs[row.names(lfcs) %in% unique(subset(sleuth.results, qval <= 0.05, AGI, drop = T)), ] + sig.lfcs[is.na(sig.lfcs)] <- 0 + # + # plot.d <- as.data.frame(sig.lfcs) + # plot.d$AGI <- row.names(plot.d) + # + # scaled.tpm <- melt(plot.d) + # colnames(scaled.tpm) <- c("AGI", "Condition", "z.score") + # scaled.tpm$Condition <- gsub('b_', '', scaled.tpm$Condition) + # scaled.tpm$Condition <- factor(gsub('mean.tpm_', '', scaled.tpm$Condition), levels = c('C' ,'S', 'M', 'H', 'SM', 'SH', 'MH', 'SMH')) + # + # + + ############################################################################################################ + ### Plot heat maps with LFCs + ############################################################################################################ + + + lfc.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('^b_', colnames(sleuth.exprt)), value.name = 'lfc') + q.table <- melt(sleuth.exprt, id.vars = 'AGI', measure.vars = grep('qvalue_', colnames(sleuth.exprt)), value.name = 'q') + + lfc.table <- cbind.data.frame(lfc.table, q.table[,2:3]) + lfc.table$q.star <- cut(lfc.table$q, breaks = c(0, 0.001, 0.01, 0.05, 1), labels = c('***', '**', '*', ' '), include.lowest = T) + + lfc.table$condition <- gsub('b_' , '', lfc.table$variable) + lfc.table$condition <- factor(lfc.table$condition, unique(lfc.table[order(lfc.table$variable), 'condition'])) + + lfc.table <- merge(lfc.table, araport,'AGI', all.x = T) + lfc.table[is.na(lfc.table$name), 'name'] <- lfc.table[is.na(lfc.table$name), 'AGI'] + + + lfc.table[!lfc.table$name == lfc.table$AGI, 'gene.label'] <- paste0(lfc.table[!lfc.table$name == lfc.table$AGI, 'name'], ' (', lfc.table[!lfc.table$name == lfc.table$AGI, 'AGI'], ')') + lfc.table[lfc.table$name == lfc.table$AGI, 'gene.label'] <- lfc.table[lfc.table$name == lfc.table$AGI, 'AGI'] + + + + + # + # current.set <- sample(lfc.table$AGI, 10) + # plot.data <- subset(lfc.table, AGI %in% current.set) + # + # + # p1 <- ggplot(plot.data, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + + # geom_point(color = "black", stroke = 0.5, fill = NA, size = 6, shape = 22) + + # geom_point(alpha = 1, stroke = 0.5, size = 6, shape = 22) + + # geom_text(size = 3, color = "black", vjust = 0.7) + + # scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", + # midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + + # theme_dominik + + # theme(aspect.ratio = length(unique(plot.data$AGI))/4) + + # theme(axis.title = element_blank()) + # + # + # + # current.set <- sample(lfc.table$AGI, 30) + # plot.data <- subset(lfc.table, AGI %in% current.set) + # + # p2 <- ggplot(plot.data, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + + # geom_point(color = "black", stroke = 0.5, fill = NA, size = 6, shape = 22) + + # geom_point(alpha = 1, stroke = 0.5, size = 6, shape = 22) + + # geom_text(size = 3, color = "black", vjust = 0.7) + + # scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", + # midpoint = 0, name = expression(paste(log["2"], "-FC (trtmt / WT)", sep = ""))) + + # theme_dominik + + # theme(aspect.ratio = length(unique(plot.data$AGI))/4) + + # theme(axis.title = element_blank()) + # + # + # ggplotGrob(p1) + # + # + # + # png('test.png', res = 300, height = 4000, width = 1600) + # grid.arrange(gtable_rbind(ggplotGrob(p1), ggplotGrob(p2))) + # dev.off() + + + + + gene.cats <- read.xlsx('06_NasserHeatMaps.xlsx') + plot.cats <- merge(lfc.table, gene.cats) + + + plotlist <- list() + for(i in sort(unique(plot.cats$Table))) + { + + plot.data <- subset(plot.cats, Table %in% i) + + plotlist[[i]] <- ggplot(plot.data, aes(x = condition, y = gene.label, label = q.star, fill = lfc)) + + geom_point(color = "black", stroke = 0.5, fill = NA, size = 6, shape = 22) + + geom_point(alpha = 1, stroke = 0.5, size = 6, shape = 22) + + geom_text(size = 3, color = "black", vjust = 0.7) + + scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", + midpoint = 0, name = expression(paste(log["2"], "-FC", sep = ""))) + + theme_dominik + + theme(aspect.ratio = length(unique(plot.data$AGI))/5) + + theme(plot.margin = unit(c(2.5, 0, 2.5, 0), 'lines')) + + theme(axis.title = element_blank()) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + + } + + plotlist <- lapply(plotlist, ggplotGrob) + + + png('test.png', res = 500, height = 24000, width = 3000) + + grid.arrange(do.call(gtable_rbind, plotlist)) + + dev.off() + + + + + + gene.cats <- read.xlsx('06_NasserHeatMaps.xlsx', sheet = 2) + gene.cats$yposition <- -1*gene.cats$yposition + + plot.cats <- merge(lfc.table, gene.cats) + + i <- "T4_WRKY" + + + plotlist <- list() + for(i in sort(unique(plot.cats$Table))) + { + + + plot.data <- subset(plot.cats, Table %in% i) + + + plot.data$Subcategory <- factor(plot.data$Subcategory, levels = unique(plot.data[order(plot.data$Group), 'Subcategory'])) + plot.data$LongLabel <- factor(plot.data$LongLabel, levels = unique(plot.data[order(plot.data$yposition), 'LongLabel'])) + plot.data$SuperLongLabel <- factor(plot.data$SuperLongLabel, levels = unique(plot.data[order(plot.data$yposition), 'SuperLongLabel'])) + + + plotlist[[i]] <- + ggplot(plot.data, aes(x = condition, y = SuperLongLabel, label = q.star, fill = lfc)) + + geom_point(color = "black", stroke = 0.5, fill = NA, size = 6, shape = 22) + + geom_point(alpha = 1, stroke = 0.5, size = 6, shape = 22) + + geom_text(size = 3, color = "black", vjust = 0.7) + + scale_fill_gradient2(low = "#1f78b4", high = "#e41a1c", + midpoint = 0, name = expression(paste(log["2"], "-FC", sep = ""))) + + theme_dominik + + facet_grid(Subcategory~. , scales = "free_y", space = "free", switch = 'y', labeller = label_wrap_gen(width=25)) + + theme(strip.background = element_rect(fill = NA, color = 'black'), strip.placement = 'outside', strip.text = element_text(face = 'bold')) + + theme(axis.title = element_blank(), axis.text.y = element_text(hjust =1), axis.ticks = element_blank()) + + scale_x_discrete(position = 'top') + + theme(axis.text.x.top = element_text(angle = 90, vjust = 0.5, hjust = 0)) + + + } + +plotlist <- lapply(plotlist, ggplotGrob) + diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/06_NasserHeatMaps.xlsx b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/06_NasserHeatMaps.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..cd191479c4191de63554b25f12f10613dc21619b Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/06_NasserHeatMaps.xlsx differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/10_CollectSupplement.R b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/10_CollectSupplement.R new file mode 100644 index 0000000000000000000000000000000000000000..b87081bc80e4c7bd76bd1aaa2b7445e146effca4 --- /dev/null +++ b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/10_CollectSupplement.R @@ -0,0 +1,125 @@ + +required.packages <- c("Rmisc", "plyr", "ggplot2", "reshape2", "gridExtra", 'openxlsx', 'sleuth', 'amap', 'ggdendro', 'ellipse', 'UpSetR', 'VennDiagram') + +for(package in required.packages) +{ + print(package) + ## Check if package is installed. If not, install + if(!package %in% row.names(installed.packages())){install.packages(package, repos ="https://cran.uni-muenster.de/")} + # ## Check if package is up to date. If not, update + # update.packages(package, repos = "https://cran.uni-muenster.de/") + ## Load package + library(package, character.only = T) +} + +nice.date <- format(Sys.time(), "%y%m%d") +SampleInfo <- read.xlsx('../00_SampleInfo.xlsx') + +color.values <- brewer.pal(8, 'Dark2')[8:1] +names(color.values) <- unique(SampleInfo$Cond_short) + +theme_dominik <- + theme(panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(fill = NA)) + + theme(aspect.ratio = 1) + + theme(axis.ticks = element_line(color = "black"), axis.text = element_text(color = "black")) + + theme(strip.background = element_blank(), strip.text = element_text(size = 8)) + + theme(legend.position = "bottom", legend.direction = "horizontal", legend.title = element_blank()) + #, legend.key.size = unit(0.05, "npc") + theme(plot.margin=unit(c(1, 1, 1, 1),"cm")) + + theme(plot.title = element_text(hjust = 0, face = "bold", size = 15)) + + +######################################## +#### Annotation data +######################################## + +load('RNASeq_Annotation/Arabidopsis/02.0Araport11/02.0Araport.RData') +load("RNASeq_Annotation/Arabidopsis/02.4GO/GOAnno2018-02-06.RData") +load("RNASeq_Annotation/Arabidopsis/02.3Mapman/180924_MercatorMapman.RData") +load("RNASeq_Annotation/Arabidopsis/02.3Mapman/02.3Mapman.RData") +colnames(mapman)[2] <- 'MAPMAN.BIN' +load("RNASeq_Annotation/Arabidopsis/02.9PlnTFDB/02.9PlnTFDB.RData") +load("RNASeq_Annotation/Arabidopsis/02.12Lipids/02.12lipids.RData") + +load(file = '01_Kallisto.RData') +load(file = '01_Sleuth.RData') + + +load('05_190731_ClusterResults_final.RData') + + +############################################################################################################ +### Writing the supplementary data file +############################################################################################################ + +#### Prepare mapman as a one line to prevent gene duplication +mapman_edit <- unique(mapman[,c(1,2,4,6)]) + +mapman_edit$pivot <- 'MAPMAN.BIN' +mapman_OneLiner.term <- dcast(mapman_edit, AGI~pivot, value.var = 'MAPMAN.BIN', fun.aggregate = + function(x){paste(unique(x), collapse = '|')}) + +mapman_edit$pivot <- 'MAPMAN.DESCRIPTION' +mapman_OneLiner.Desc <- dcast(mapman_edit, AGI~pivot, value.var = 'DESCRIPTION', fun.aggregate = + function(x){paste(unique(x), collapse = '|')}) + +mapman_OneLiner <- merge(mapman_OneLiner.term, mapman_OneLiner.Desc, 'AGI') + +#### Pull count / tpm data +expression.data <- kallisto_table(sleuth_objects$not_trimmed) +expression.data$AGI <- do.call(rbind, strsplit(split = ".", as.character(expression.data$target_id), fixed = T))[,1] +expression.data <- merge(expression.data, SampleInfo, by.x = 'sample', by.y = 'IlluminaID') + +est_counts <- dcast(expression.data, AGI+target_id~SampleID, value.var = 'est_counts') +colnames(est_counts)[3:ncol(est_counts)] <- paste('est_counts', colnames(est_counts)[3:ncol(est_counts)], sep = '_') + +est_counts <- est_counts[, + c(1:2, grep('_C_', colnames(est_counts)), grep('_S_', colnames(est_counts)), grep('_M_', colnames(est_counts)), + grep('_H_', colnames(est_counts)), grep('_SM_', colnames(est_counts)), grep('_SH_', colnames(est_counts)), + grep('_MH_', colnames(est_counts)), grep('_SMH_', colnames(est_counts)))] + + +tpm <- dcast(expression.data, AGI~SampleID, value.var = 'tpm') +colnames(tpm)[2:ncol(tpm)] <- paste('tpm', colnames(tpm)[2:ncol(tpm)], sep = '_') + +tpm <- tpm[, c(1, grep('_C_', colnames(tpm)), grep('_S_', colnames(tpm)), grep('_M_', colnames(tpm)), + grep('_H_', colnames(tpm)), grep('_SM_', colnames(tpm)), grep('_SH_', colnames(tpm)), + grep('_MH_', colnames(tpm)), grep('_SMH_', colnames(tpm)))] + + +#### Calculate means and sd +means <- summarySE(expression.data, groupvars = c('Cond_short.y', 'AGI'), measurevar = 'tpm') + +mean.tpm <- dcast(means, AGI~Cond_short.y, value.var = 'tpm') +mean.tpm <- mean.tpm[, c('AGI', 'C', 'S', 'M', 'H', 'SM', 'SH', 'MH', 'SMH')] +colnames(mean.tpm)[2:ncol(mean.tpm)] <- paste('mean.tpm', colnames(mean.tpm)[2:ncol(mean.tpm)], sep = '_') + + +sd.tpm <- dcast(means, AGI~Cond_short.y, value.var = 'sd') +sd.tpm <- sd.tpm[, c('AGI', 'C', 'S', 'M', 'H', 'SM', 'SH', 'MH', 'SMH')] +colnames(sd.tpm)[2:ncol(sd.tpm)] <- paste('sd.tpm', colnames(sd.tpm)[2:ncol(sd.tpm)], sep = '_') + +sleuth.exprt$AGI <- do.call(rbind, strsplit(split = ".", as.character(sleuth.exprt$target_id), fixed = T))[,1] + + +#### Combine everything together +# supp <- merge(unique(araport[,c(1:3, 5, 7:9)]), unique(mapman.X4[,c(1:3, 5)]), 'AGI', all.x = F, all.y = F) +supp <- merge(unique(araport[,c(1:3, 5, 7:9)]), mapman_OneLiner, 'AGI', all.x = F, all.y = F) +supp <- merge(supp, unique(GO.export), 'AGI', all.x = T, all.y = F) +supp <- merge(supp, unique(PlnTFDB[!duplicated(PlnTFDB$AGI) ,c(2, 4, 5)]), 'AGI', all.x = T, all.y = F) +supp <- merge(supp, unique(lipids), 'AGI', all.x = T, all.y = F) + +supp <- merge(supp, est_counts, 'AGI', all.y = T) +supp <- merge(supp, tpm, 'AGI', all.y = T) +supp <- merge(supp, mean.tpm, 'AGI', all.y = T) +supp <- merge(supp, sd.tpm, 'AGI', all.y = T) + +supp <- merge(supp, sleuth.exprt[,!colnames(sleuth.exprt) == 'target_id' & !grepl('se_b_', colnames(sleuth.exprt))], 'AGI', all.y = T) + +supp <- merge(supp, cluster.results, 'AGI', all = T) +# supp <- merge(supp, gene.set.list, 'AGI', all = T) + + +write.table(supp, file = paste(nice.date, 'SupplementalDataset.txt', sep = '_'), row.names = F, sep = '\t', na = '') + + + diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/190731_SupplementalDataset.txt b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/190731_SupplementalDataset.txt new file mode 100644 index 0000000000000000000000000000000000000000..6174e0f74aff8f467e69e5dd259659ede7298849 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/190731_SupplementalDataset.txt differ diff --git a/_pre-ARC-legacy/190731_PipelineWithlogCutOff/190731_SupplementalDataset.xlsx b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/190731_SupplementalDataset.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..dc3e8b11a00d278a29909d803ab16dcc9dccb261 Binary files /dev/null and b/_pre-ARC-legacy/190731_PipelineWithlogCutOff/190731_SupplementalDataset.xlsx differ diff --git a/_pre-ARC-legacy/200225_GEO_Submission/00_SampleInfo.xlsx b/_pre-ARC-legacy/200225_GEO_Submission/00_SampleInfo.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..222ca446c463d8a79742fc8c291fc4a615906d6b Binary files /dev/null and b/_pre-ARC-legacy/200225_GEO_Submission/00_SampleInfo.xlsx differ diff --git a/_pre-ARC-legacy/200225_GEO_Submission/200302_GEO_MetaData_DB.xlsx b/_pre-ARC-legacy/200225_GEO_Submission/200302_GEO_MetaData_DB.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..2fc83ffcff017d6f4fc94a364c56dee4ac2573a7 Binary files /dev/null and b/_pre-ARC-legacy/200225_GEO_Submission/200302_GEO_MetaData_DB.xlsx differ diff --git a/_pre-ARC-legacy/200225_GEO_Submission/SupplementalDatasetS1.xlsx b/_pre-ARC-legacy/200225_GEO_Submission/SupplementalDatasetS1.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..bdacae683c060e4d4869b9960691be0ace438935 Binary files /dev/null and b/_pre-ARC-legacy/200225_GEO_Submission/SupplementalDatasetS1.xlsx differ diff --git a/_pre-ARC-legacy/200225_GEO_Submission/md5sums.xlsx b/_pre-ARC-legacy/200225_GEO_Submission/md5sums.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..66dccf50b8d4cdeea153c8232ac32b323b0b5433 Binary files /dev/null and b/_pre-ARC-legacy/200225_GEO_Submission/md5sums.xlsx differ diff --git a/_pre-ARC-legacy/Kallisto-Pipeline.sh b/_pre-ARC-legacy/Kallisto-Pipeline.sh new file mode 100644 index 0000000000000000000000000000000000000000..5bf0fc7821143736a1998af002d347d4affbf126 --- /dev/null +++ b/_pre-ARC-legacy/Kallisto-Pipeline.sh @@ -0,0 +1,71 @@ +#### 04.04.2019 +#### RNASeq Analysis for AG Maurino + +############################################ +#### Prepare genome reference for Kallisto + +GENOME=~/190404_NasserJessica/2015_Nasser_JessicaSchmitz_AGMaurino_DB/Araport11_genes.201606.cds.repr.fasta +/home/bin/kallisto_linux-v0.45.1/kallisto index -i $GENOME'_index' $GENOME + +############################################ +#### One Script for everything +#### (See comments for details) + +#### Specify the genome reference +GENOME=Araport11_genes.201606.cds.repr.fasta + +#### prep some directories to collect data +mkdir fastqc_results +mkdir kallisto_trimmed +mkdir kallisto_not_trimmed + +#### Loop through all samples one after the other +for j in Sample_*/; + do + + cd $j + + sampleName=$(echo $j | sed -e 's/\///g') + echo $sampleName + + ############################################ + ### Concatenate data from multiple fastq.gz files originating from the same library (and split for size limit reasons) + + ### Unzip files + gunzip *fastq.gz + + ### Concatenate by direction (fwd or rev) + cat *R1_00* > $sampleName'_R1.fastq' + cat *R2_00* > $sampleName'_R2.fastq' + + + ### Remove original files + rm *_001.fastq *_002.fastq + + ### Gzip back to save space + gzip *fastq + + ############################################ + ### Trimmomatic + java -jar /home/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE $sampleName'_R1.fastq.gz' $sampleName'_R2.fastq.gz' $sampleName'_fwd_paired.fastq.gz' $sampleName'_fwd_unpaired.fastq.gz' $sampleName'_rev_paired.fastq.gz' $sampleName'_rev_unpaired.fastq.gz' ILLUMINACLIP:TruSeq3-PE.fa:2:30:10 MAXINFO:50:0.8 MINLEN:36 + + echo 'Trimmomatic done' + + ############################################ + ### Run FastQC + + fastqc *fastq.gz -o '../fastqc_results/'$sampleName + + echo 'FastQC done' + + ############################################ + ### Run Kallisto on trimmed and non-trimmed reads + + /home/bin/kallisto_linux-v0.45.1/kallisto quant -b 50 -t 10 -i $GENOME'_index' -o '../kallisto_trimmed/'$sampleName $sampleName'_fwd_paired.fastq.gz' $sampleName'_rev_paired.fastq.gz' + /home/bin/kallisto_linux-v0.45.1/kallisto quant -b 50 -t 10 -i $GENOME'_index' -o '../kallisto_not_trimmed/'$sampleName $sampleName'_R1.fastq.gz' $sampleName'_R2.fastq.gz' + + echo 'Kallisto done' + + cd .. + +done