diff --git a/workflows/3D_PCA_plot.R b/workflows/3D_PCA_plot.R new file mode 100644 index 0000000000000000000000000000000000000000..498ca44c4b805291fe071d13813b3401d6b0515a --- /dev/null +++ b/workflows/3D_PCA_plot.R @@ -0,0 +1,167 @@ + # 3d R plot + saveat <- "/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Hoecker" + + file.rpkm <- '/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Hoecker/31315.p.sort.rpkm' + + + ## Windows ### + #-#saveat <- "Q:/haasf/leubner_RNA-seq/Celery/DEGs_0.9/" + + #-#file.rpkm <- 'Q:/haasf/leubner_RNA-seq/Celery/DEGs/Celery_merged_isoforms.p.sort.rpkm' + #-#file.count <- 'Q:/haasf/leubner_RNA-seq/Celery/DEGs_0.9/Celery_merged_isoforms.p.sort.counts' + ### + + data.rpkm <- read.table(file.rpkm, header=T, sep="\t", row.names=1) + + + # sort by colnames + data.rpkm <- data.rpkm[,order(colnames(data.rpkm))] + + + librariesName <- list( + cop_D = c("cop_D", "red"), + cop_L = c("cop_L", "blue"), + spa_D = c("spa_D", "green"), + spa_L = c("spa_L", "yellow"), + WT_D = c("WT_D", "black"), + WT_L = c("WT_L", "violet") + ) + + # + # header.ori <- c("56754_WT_Naturally_3.bam.sort.fastq.unmapped.sam.sort.bam", "56753_WT_Naturally_2.bam.sort.fastq.unmapped.sam.sort.bam", "56752_WT_Naturally_1.bam.sort.fastq.unmapped.sam.sort.bam", "56751_tt_6_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56750_tt_6_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56749_tt_6_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56748_tt_0_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56747_tt_0_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56746_tt_0_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56745_WT_10_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56744_WT_10_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56743_WT_10_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56742_WT_6_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56741_WT_6_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56740_WT_6_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56739_WT_0_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56738_WT_0_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56737_WT_0_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56736_tt_T1_tt_6d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56735_tt_T1_tt_6d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56734_tt_T1_tt_6d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56733_tt_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56732_tt_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56731_tt_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56730_tt_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56729_tt_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56728_tt_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56727_WT_T1_NA_3.bam.sort.fastq.unmapped.sam.sort.bam", "56726_WT_T1_NA_2.bam.sort.fastq.unmapped.sam.sort.bam", "56725_WT_T1_NA_1.bam.sort.fastq.unmapped.sam.sort.bam", "56724_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56723_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56722_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56721_WT_T1_WT_10d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56720_WT_T1_WT_10d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56719_WT_T1_WT_10d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56718_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56717_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56716_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56715_WT_T1_WT_6d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56714_WT_T1_WT_6d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56713_WT_T1_WT_6d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56712_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56711_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56710_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56709_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56708_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56707_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56706_WT_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56705_WT_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56704_WT_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam") + # header.new <- c("T17_Dry_Naturally_Freezedryseed", "T17_Dry_Naturally_Freezedryseed.1", "T17_Dry_Naturally_Freezedryseed.2", "T16_Dry_6days_Freezeafter6dageing", "T16_Dry_6days_Freezeafter6dageing.1", "T16_Dry_6days_Freezeafter6dageing.2", "T15_Dry_0days_Freezedryseed", "T15_Dry_0days_Freezedryseed.1", "T15_Dry_0days_Freezedryseed.2", "T14_Dry_10days_Freezeafter10dageing", "T14_Dry_10days_Freezeafter10dageing.1", "T14_Dry_10days_Freezeafter10dageing.2", "T13_Dry_6days_Freezeafter6dageing", "T13_Dry_6days_Freezeafter6dageing.1", "T13_Dry_6days_Freezeafter6dageing.2", "T12_Dry_0days_Freezedryseed", "T12_Dry_0days_Freezedryseed.1", "T12_Dry_0days_Freezedryseed.2", "T11_Imbibe_6days_12h", "T11_Imbibe_6days_12h.1", "T11_Imbibe_6days_12h.2", "T10_Imbibe_6days_5h", "T10_Imbibe_6days_5h.1", "T10_Imbibe_6days_5h.2", "T9_Imbibe_0dyas_5h", "T9_Imbibe_0dyas_5h.1", "T9_Imbibe_0dyas_5h.2", "T8_Imbibe_Naturally_47h", "T8_Imbibe_Naturally_47h.1", "T8_Imbibe_Naturally_47h.2", "T7_Imbibe_Naturally_24h", "T7_Imbibe_Naturally_24h.1", "T7_Imbibe_Naturally_24h.2", "T6_Imbibe_10days_72h", "T6_Imbibe_10days_72h.1", "T6_Imbibe_10days_72h.2", "T5_Imbibe_10days_24h", "T5_Imbibe_10days_24h.1", "T5_Imbibe_10days_24h.2", "T4_Imbibe_6days_47h", "T4_Imbibe_6days_47h.1", "T4_Imbibe_6days_47h.2", "T3_Imbibe_6days_24h", "T3_Imbibe_6days_24h.1", "T3_Imbibe_6days_24h.2", "T2_Imbibe_0days_24h", "T2_Imbibe_0days_24h.1", "T2_Imbibe_0days_24h.2", "T1_Imbibe_0days_5h", "T1_Imbibe_0days_5h.1", "T1_Imbibe_0days_5h.2") + # + # header.new <- header.new[order(header.ori)] + # header.ori <- header.ori[order(header.ori)] + # + # col.header <- header.new + # + # colnames(data.rpkm) <- col.header + + library("DESeq2") + library("ggplot2") + library("RColorBrewer") + library("pheatmap") + library("BiocGenerics") + library("rgl") + library("magick") + library("sjmisc") + + + ################### running ###################### + ### PCA RPKM ### + + set.seed(0) + + data.inv <- t(data.rpkm) + data.dist <-dist(data.inv, method="euclidean") # "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski" + data.dist.hc <- hclust(data.dist,method="ward.D2") + data.dist.pca <-princomp(data.dist,cor=T) + + pc1 <- data.dist.pca$scores[,1] + pc2 <- data.dist.pca$scores[,2] + pc3 <- data.dist.pca$scores[,3] + + # create data frame for pc1 pc2 pc3 + data.dist.pca.frame = data.frame(pc1,pc2,pc3) + rownames(data.dist.pca.frame) <- names(data.dist.pca$scale) + colnames(data.dist.pca.frame) <- c("pc1","pc2","pc3") + + condition.values <- c() + condition.values.color <- c() + + + for(a in colnames(data.rpkm)) { + + v <- substr(a, nchar(a), nchar(a)) + if(str_contains(v, c(1,2,3,4,5,6,7,8,9,0), logic = "or")) { + if (substr(substr(a, 1, nchar(a)-2), nchar(substr(a, 1, nchar(a)-2)), nchar(substr(a, 1, nchar(a)-2))) == ".") { + n <- substr(a, 1, nchar(a)-3) + } else { + n <- substr(a, 1, nchar(a)-2) + } + + } else { + n <- a + + } + condition.values <- c(condition.values, librariesName[n][[1]][1]) + condition.values.color <- c(condition.values.color, librariesName[n][[1]][2]) + } + + + data.dist.pca.frame["tissue"] <- condition.values + data.dist.pca.frame["color"] <- condition.values.color + data.dist.pca.frame["name"] <- names(data.dist.pca$scale) + attr(data.dist.pca.frame, "percentVar") <- (data.dist.pca$sdev)^2 / sum(data.dist.pca$sdev^2) # cumsum() + + # simple plot + png(filename=paste0(saveat, "/HC_RPKM_normalized.png")) + plot(data.dist.hc) # hc plot + dev.off() + png(filename=paste0(saveat, "/PCA_variance_RPKM_normalized.png")) + plot(data.dist.pca) # variances; var(data.dist.pca$sdev[1:9]) + dev.off() + # get the parcent variation + percentVar <- round(100 * attr(data.dist.pca.frame, "percentVar")) + + # 3d plot + plot3d(pc1, pc2, pc3, + type = "s", # p, s, l, h, n + #pch = c(1:3), + col = condition.values.color, + size = 1, + xlab = paste0("PC1: ",percentVar[1],"% variance"), + ylab = paste0("PC2: ",percentVar[2],"% variance"), + zlab = paste0("PC3: ",percentVar[3],"% variance"), + cex = 2, + main = "", # -> princomp", + ) + + # shift <- matrix(4, 4, 4, byrow = TRUE) + # text3d(shift,texts=1:3) + grid3d(c("x", "y", "z")) + + ## add legend + legend3d("right", unique(condition.values), pch = 19, col = unique(condition.values.color)) + + #### video ##### + M <- par3d("userMatrix") + play3d( par3dinterp( userMatrix=list(M, + rotate3d(M, pi/2, 1, 0, 0), + rotate3d(M, pi/2, 0, 1, 0) ) ), + duration=2 ) + + movie3d(spin3d(axis = c(1, 2, 1)), duration = 5, + dir = saveat) + + #### video end #### + + # pc1, pc2 + png(filename=paste0(saveat, "/PCA_RPKM_normalized.png")) + ggplot( + data.dist.pca.frame, + aes( + pc1, + pc2, + color=tissue) + ) + + geom_point(size=2.5) + + xlab( + paste0("PC1: ",percentVar[1],"% variance") + ) + + ylab( + paste0("PC2: ",percentVar[2],"% variance") + ) + + #theme() + #, face="bold" + scale_colour_manual( + values= c("red", "blue", "green", "yellow", "black", "violet") # dodgerblue3 + ) + + ggtitle("PCA of all samples (RPKM normalized)") + + theme( + plot.title = element_text(lineheight=.8), + panel.background = element_rect(fill = "gray95") + ) + dev.off() + + diff --git a/workflows/DEG_pipeline.R b/workflows/DEG_pipeline.R new file mode 100644 index 0000000000000000000000000000000000000000..83bf87a4b9328cbda2b6e70a619c11fa700af894 --- /dev/null +++ b/workflows/DEG_pipeline.R @@ -0,0 +1,923 @@ + ############################################################### + ## DEG pipeline with edgeR, DESeq2 and NOISeq # + ## Modified from the template of Dr. Kristian Ullrich 2014 # + ## @fabianhaas, AG Rensing, Marburg, Jan 2016 # + ############################################################### + # The pipeline expects sample triplicates. + # If you don't have three samples of one condition, use face files with zero counts. + # RPKM calculation is optional. + # Additional program to sum up the tsv files: /mnt/nas_temp/home/haasf/Program/sumup_htseq_counts.pl folder file_prefix + + ########################################### + ## please define your variables # + ########################################### + + saveat <- "/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Hoecker/DEGs_0.9" + + file <- '/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Hoecker/31315.p.sort.counts' + + # Table with the length and the GC content + LenGCTable <- "/mnt/NAS_coruscant_datashare/haasf/GeneAtlas_RNASeq/DEG_2nd_round/cosmoss_V3.3.release_2016-07-22.IsoformV31.fasta.length_GC_Contig_Daten_for_34840_1nd.csv" + + # Column position of the gene name and the width/gc content (first column is gene name!) + myLenPos <- 1 + myGCPos <- 2 + + # + # hoo <- c("","") + # sampleLists <- c() + # for(a in hoo) { + # for(b in hoo) { + # if(a!=b) { + # tmp <- c(a,b) + # sampleLists <- c(sampleLists, list(tmp)) + # } + # } + # } + + + sampleLists <- list(c("cop_D", "cop_L"), + c("cop_D", "spa_D"), + c("cop_D", "spa_L"), + c("cop_D", "WT_D"), + c("cop_D", "WT_L"), + c("cop_L", "spa_D"), + c("cop_L", "spa_L"), + c("cop_L", "WT_D"), + c("cop_L", "WT_L"), + c("spa_D", "spa_L"), + c("spa_D", "WT_D"), + c("spa_D", "WT_L"), + c("spa_L", "WT_D"), + c("spa_L", "WT_L"), + c("WT_D", "WT_L") + ) + + + # Do you want a RPKM value table (1/0) + # Be warned, it takes a while to do this. approx. 10 minutes for 35000 genes and 6 samples. + rpkm <- 0 + # Are the data original HTSeq files? yes <- 1 ; no <- 0 + htseq <- 0 + + # threshold settings for the R packages + #edgeR + pvtedgeR <- 0.001 + #DESeq2 + pvtDESeq2 <- 0.001 + #NOISeq + pvtNOISeq <- 0.9 + + # Output folder names + edgeRDir <- "edgeR" + DESeq2Dir <- "DESeq2" + NOISeqDir <- "NOISeq" + + ######################### + ## stop editing here!! ## + ######################### + + + ########################################### + ## executiv function # + ########################################### + + workflow <- function(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq) + { + # setwd(workingFolder) # set workspace + + sampleNameI <- SampleNames[1] + sampleNameII <- SampleNames[2] + + # Comparison name + cName <- paste(sampleNameI, sampleNameII, sep = " vs. ") + + filename <- strsplit(file, "/") + filename <- filename[[1]][length(filename[[1]])] # long but it's working + sampleName <- gsub(", ","_",toString(SampleNames)) + samples <- SampleNames + # create output dir + dir.create(saveat, showWarnings = FALSE) + outdir <- paste0(saveat,"/",sampleName) + dir.create(outdir) + + # Start the log file + sink(paste0(outdir, "/DEGpipe_",cName, ".log")) + + # print parameter into the log file + print("set parameters:") + print(paste0("Counts file:", file)) + print(paste0("Results stored at:", saveat)) + print(paste0("SampleNames: ",SampleNames)) + print(paste0("LenGCTable: ",LenGCTable)) + print(paste0("myLenPos: ",myLenPos)) + print(paste0("myGCPos: ",myGCPos)) + print(paste0("rpkm: ",rpkm)) + print(paste0("htseq: ",htseq)) + print(paste0("pvtedgeR: ",pvtedgeR)) + print(paste0("pvtDESeq2: ",pvtDESeq2)) + print(paste0("pvtNOISeq: ",pvtNOISeq)) + print(paste0("edgeRDir: ",edgeRDir)) + print(paste0("DESeq2Dir: ",DESeq2Dir)) + print(paste0("NOISeqDir: ",NOISeqDir)) + # session and package infos for the log file + print(sessionInfo()) + print(Sys.info()) + + # load spike-in count tables + data0 <- read.table(file, header=T, sep="\t", row.names=1) + # just if the gene ids not ending with .1 (it is nessessery for NOISeqbio iteration step) + #row.names(data0) <- paste0(row.names(data0), ".1") + + # remove HTSeq rows like "__no_feature" + data0 <- data0[ with(data0, !grepl('__', rownames(data0))) , ] + data0 <- data0[ with(data0, !grepl('ERCC', rownames(data0))) , ] + #data0 <- data0[ , with(data0, !grepl('X.1', colnames(data0)))] + #rename col names (library name to roman number) + + # librariesName <- c("56754_WT_Naturally_3.bam.sort.fastq.unmapped.sam.sort.bam", "56753_WT_Naturally_2.bam.sort.fastq.unmapped.sam.sort.bam", "56752_WT_Naturally_1.bam.sort.fastq.unmapped.sam.sort.bam", "56751_tt_6_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56750_tt_6_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56749_tt_6_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56748_tt_0_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56747_tt_0_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56746_tt_0_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56745_WT_10_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56744_WT_10_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56743_WT_10_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56742_WT_6_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56741_WT_6_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56740_WT_6_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56739_WT_0_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56738_WT_0_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56737_WT_0_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56736_tt_T1_tt_6d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56735_tt_T1_tt_6d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56734_tt_T1_tt_6d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56733_tt_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56732_tt_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56731_tt_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56730_tt_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56729_tt_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56728_tt_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56727_WT_T1_NA_3.bam.sort.fastq.unmapped.sam.sort.bam", "56726_WT_T1_NA_2.bam.sort.fastq.unmapped.sam.sort.bam", "56725_WT_T1_NA_1.bam.sort.fastq.unmapped.sam.sort.bam", "56724_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56723_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56722_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56721_WT_T1_WT_10d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56720_WT_T1_WT_10d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56719_WT_T1_WT_10d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56718_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56717_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56716_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56715_WT_T1_WT_6d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56714_WT_T1_WT_6d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56713_WT_T1_WT_6d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56712_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56711_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56710_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56709_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56708_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56707_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56706_WT_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56705_WT_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56704_WT_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam") + # #librariesName <- c("80484_Celery_3_day_imbibed_embryo_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80483_Celery_3_day_imbibed_embryo_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80482_Celery_3_day_imbibed_embryo_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80481_Celery_1_day_imbibed_embryo_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80480_Celery_1_day_imbibed_embryo_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80479_Celery_1_day_imbibed_embryo_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80478_Celery_5_day_imbibed_whole_fruit_29C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80477_Celery_5_day_imbibed_whole_fruit_29C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80460_Celery_3_day_imbibed_whole_fruit_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80459_Celery_3_day_imbibed_whole_fruit_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80458_Celery_1_day_imbibed_whole_fruit_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80457_Celery_1_day_imbibed_whole_fruit_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80456_Celery_1_day_imbibed_whole_fruit_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80455_Celery_Dry_whole_fruit_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80454_Celery_Dry_whole_fruit_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80453_Celery_Dry_whole_fruit_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80452_Celery_5_day_imbibed_whole_fruit_29C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80451_Celery_5_day_imbibed_whole_fruit_100uM_ABA_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80450_Celery_5_day_imbibed_whole_fruit_100uM_ABA_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80449_Celery_5_day_imbibed_whole_fruit_100uM_ABA_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80448_Celery_5_day_imbibed_whole_fruit_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80447_Celery_5_day_imbibed_whole_fruit_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80446_Celery_5_day_imbibed_whole_fruit_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80445_Celery_3_day_imbibed_whole_fruit_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam.featureCounts.p.txt") + # romanNumber <- c("T17_Dry_Naturally_Freezedryseed", "T17_Dry_Naturally_Freezedryseed.1", "T17_Dry_Naturally_Freezedryseed.2", "T16_Dry_6days_Freezeafter6dageing", "T16_Dry_6days_Freezeafter6dageing.1", "T16_Dry_6days_Freezeafter6dageing.2", "T15_Dry_0days_Freezedryseed", "T15_Dry_0days_Freezedryseed.1", "T15_Dry_0days_Freezedryseed.2", "T14_Dry_10days_Freezeafter10dageing", "T14_Dry_10days_Freezeafter10dageing.1", "T14_Dry_10days_Freezeafter10dageing.2", "T13_Dry_6days_Freezeafter6dageing", "T13_Dry_6days_Freezeafter6dageing.1", "T13_Dry_6days_Freezeafter6dageing.2", "T12_Dry_0days_Freezedryseed", "T12_Dry_0days_Freezedryseed.1", "T12_Dry_0days_Freezedryseed.2", "T11_Imbibe_6days_12h", "T11_Imbibe_6days_12h.1", "T11_Imbibe_6days_12h.2", "T10_Imbibe_6days_5h", "T10_Imbibe_6days_5h.1", "T10_Imbibe_6days_5h.2", "T9_Imbibe_0dyas_5h", "T9_Imbibe_0dyas_5h.1", "T9_Imbibe_0dyas_5h.2", "T8_Imbibe_Naturally_47h", "T8_Imbibe_Naturally_47h.1", "T8_Imbibe_Naturally_47h.2", "T7_Imbibe_Naturally_24h", "T7_Imbibe_Naturally_24h.1", "T7_Imbibe_Naturally_24h.2", "T6_Imbibe_10days_72h", "T6_Imbibe_10days_72h.1", "T6_Imbibe_10days_72h.2", "T5_Imbibe_10days_24h", "T5_Imbibe_10days_24h.1", "T5_Imbibe_10days_24h.2", "T4_Imbibe_6days_47h", "T4_Imbibe_6days_47h.1", "T4_Imbibe_6days_47h.2", "T3_Imbibe_6days_24h", "T3_Imbibe_6days_24h.1", "T3_Imbibe_6days_24h.2", "T2_Imbibe_0days_24h", "T2_Imbibe_0days_24h.1", "T2_Imbibe_0days_24h.2", "T1_Imbibe_0days_5h", "T1_Imbibe_0days_5h.1", "T1_Imbibe_0days_5h.2") + # #romanNumber <- c("emb_3d_20.2", "emb_3d_20.1", "emb_3d_20", "emb_1d_20.2", "emb_1d_20.1", "emb_1d_20", "wf_5d_29.2", "wf_5d_29.1", "wf_3d_20.1", "wf_3d_20", "wf_1d_20.2", "wf_1d_20.1", "wf_1d_20", "wf_dry.2", "wf_dry.1", "wf_dry", "wf_5d_29", "wf_5d_ABA_20.2", "wf_5d_ABA_20.1", "wf_5d_ABA_20", "wf_5d_20.2", "wf_5d_20.1", "wf_5d_20", "wf_3d_20.2") + # + # + # for(i in 1:length(librariesName)) { + # colnames(data0)[colnames(data0)==grep(librariesName[i], names(data0), value = TRUE)] <- romanNumber[i] + # } + + # creats a list of the column names/samples you would like to compare + SampleNames <- makeSampleList(sampleNameI, length(grep(paste0("^",sampleNameI,"[^A-Z]"), colnames(data0))) + 1, + sampleNameII, length(grep(paste0("^",sampleNameII,"[^A-Z]"), colnames(data0))) + 1) + print(SampleNames) + + ## mal anschauen als ersatz für das darüber ## + ## newenw <- table.new[, grepl(paste(LIST,collapse="|"), colnames(table.new))] + + + # exctract the samples from the data0 if needed + if(length(SampleNames) > 0) { + RTableInFiles <- data0[,SampleNames] + data <- data0[,SampleNames] + } + print(head(RTableInFiles)) + + # sort by column name + # data <- data[ , order(names(data))] # dangeros, the order of query and referenz might be not the same + # see how many rows and columns are in the tables + print(paste("Raw data row number: ",nrow(data))) + print(paste("Raw data column number: ",ncol(data))) + + # 2) A typical differential expression analysis workflow + # 2.1) Filtering and exploratory data analysis + # filter: One single gene/spike-in must have at least 5 counts in minimum 2 libraries + filter <- apply(data, 1, function(x) length(x[x>5])>=2) + dataFiltered <- data[filter,] + # see how many rows and columns are left + print(paste("Filtered data row number: ",nrow(dataFiltered))) + print(paste("Filtered data row number: ",ncol(dataFiltered))) + # filter gene names + genes <- rownames(dataFiltered)[grep("^Pp3", rownames(dataFiltered))] + # filter spike-in names + spikes <- rownames(dataFiltered)[grep("^ERCC", rownames(dataFiltered))] + # get the column name list as.factors + xtmp <- colnames(dataFiltered) + x <- split2list(xtmp) + + sampleNameList <- gsub(".[1234567890]+$", "", SampleNames) + columnPos <- makeIndexList(SampleNames) + + # run ... + ### edgeR ### + print(paste0("edgeR is running with ", cName)) + packageDescription("edgeR") + edgeRFunction(outdir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList) + + #### DESeq2 #### + print(paste0("DESeq2 is running with ", cName)) + packageDescription("DESeq2") + DESeq2Function(outdir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList) + + ### NOISeq ### + print(paste0("NOISeq is running with ", cName)) + packageDescription("NOISeq") + NOISeqFunction(outdir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames) + + ### Overlap of all three methods ### + # intersection with DEG direction check => 1, without => 0 + intersection_4.file <- intersection(outdir,1,4) + write.table(intersection_4.file, file = paste0(outdir, "/", cName,"_intersection_four_methods_",nrow(intersection_4.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # edgeR, DESeq2, NOISeq + intersection_3.file <- intersection(outdir,1,3) + write.table(intersection_3.file, file = paste0(outdir, "/", cName,"_intersection_three_methods_",nrow(intersection_3.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + sink() # End of log file + + } + + + + + + #################### + ## main functions # + #################### + + plotGraphics <- function(data, librariesName, cName, outdir, romanNumber) + { + library("DESeq2") + library("ggplot2") + library("RColorBrewer") + library("pheatmap") + library("genefilter") + + # create sample annotation lists + # sample.name.list <- gsub(".[12]", "", librariesName[colnames(data)]) + sample.name.list <- gsub(".[12]", "", romanNumber) + sample.name.uniq <- unique(unlist(sample.name.list, use.names = FALSE)) + + col.data.condition = data.frame(row.names = librariesName, condition = sample.name.list) + + # run DESeq2 functions + dds.matrix <- DESeqDataSetFromMatrix(countData = data, colData = col.data.condition, design = ~condition) # data must be integer! (Ganze Zahlen) + colData(dds.matrix)$condition <- factor(colData(dds.matrix)$condition, levels = sample.name.uniq) + dds.matrix.deseq <- DESeq(dds.matrix) + dds.matrix.res <- results(dds.matrix.deseq) + dds.matrix.res <- dds.matrix.res[order(dds.matrix.res$padj), ] + dds.matrix.res.fdr <- dds.matrix.res[!is.na(dds.matrix.res$padj), ] + dds.matrix.res.fdr <- dds.matrix.res.fdr[dds.matrix.res.fdr$padj < 0.0001, ] + dds.matrix.rld <- rlogTransformation(dds.matrix, blind = TRUE) + + # data for PCA plot + pcaData <- plotPCA(dds.matrix.rld, intgroup = "condition", returnData=TRUE) + percentVar <- round(100 * attr(pcaData, "percentVar")) + # color for samples + sample.colors <- c("gray60","#D73027","#4575B4", "purple", "orange","#113FF7","green", "black") + + # plot the PCA + # https://cran.r-project.org/web/packages/ggfortify/vignettes/plot_pca.html + png(filename=paste0(outdir,"PCA_all_samples_DEseq2_normalized.png")) + ggplot( + pcaData, + aes( + PC1, + PC2, + color=condition, + shape=condition) + ) + + scale_shape_manual(values = c(4, 1, 2, 3, 0, 5, 6, 7)) + + geom_point(size=2.5) + + xlab( + paste0("PC1: ",percentVar[1],"% variance") + ) + + ylab( + paste0("PC2: ",percentVar[2],"% variance") + ) + + theme() + #, face="bold" + scale_colour_manual( + values=sample.colors # dodgerblue3 + ) + + ggtitle("PCA of all samples (DESeq2 normalized)") + + theme( + plot.title = element_text(lineheight=.8), + panel.background = element_rect(fill = "gray95") + ) + dev.off() + + # how many genes will be involved in the heatmap calculation + numbGenes <- 100 + # pheatmap data + topVarGenes <- head(order(rowVars(assay(dds.matrix.rld)),decreasing=TRUE),numbGenes) + mat <- assay(dds.matrix.rld)[ topVarGenes, ] + mat <- mat - rowMeans(mat) + # pheatmap data annotation + anno <-as.data.frame(colData(dds.matrix.rld)[,c("condition")]) + rownames(anno) <- colnames(data) # check out the row names of annotation + colnames(anno) <- "condition" + # pheatmap data annotation color + condition <- sample.colors + names(condition) <- sample.name.uniq + anno_colors <- list(condition = condition) + + #heatmap(mat) + #pheatmap(mat) + # ftp://cran.r-project.org/pub/R/web/packages/pheatmap/pheatmap.pdf + png(filename=paste0(outdir,"HC_heatmap_all_samples_DESeq2_normalized.png")) + pheatmap(mat , + annotation = anno, + annotation_colors = anno_colors, + fontsize = 10, + fontsize_row = 6, + fontsize_col = 10, + main = paste0("Heatmap of all samples (DESeq2 normalized) ", numbGenes," genes\n"), + annotation_names_col = F, + show_rownames = F, + cluster_rows = F, + border_color = F, + treeheight_col = 20 + ) + dev.off() + } + + + edgeRFunction <- function(odir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList) + { + library(edgeR) + + outdir <- paste0(odir, "/",edgeRDir,"/") # set outdir + + dir.create(outdir, showWarnings = FALSE) # create output folder + dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" + + comparison <- cName + counts <- RTableInFiles + + pvt <- pvtedgeR + + group <- factor(sampleNameList) + + # edgeR 'normal' + + y.normal <- DGEList(counts = counts, group = group) + + y.normal[,1] + + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotMDS"))) + plotMDS(y.normal, method = "bcv", main = paste0(cName, "\nedgeR - plotMDS")) + dev.off() + + y.normal <- calcNormFactors(y.normal) + y.normal <- estimateCommonDisp(y.normal) + y.normal <- estimateTagwiseDisp(y.normal) + + + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV"))) + plotBCV(y.normal, main = paste0(cName, "\nedgeR - plotBCV")) + dev.off() + + et.normal <- exactTest(y.normal, pair = c(sampleNameI, sampleNameII)) + + topTags(et.normal) + # adjust.method ==> character string specifying p-value adjustment method. Possible values are "none", "BH", "fdr" (equivalent to "BH"), "BY" and "holm". See p.adjust for details. + summary(de.normal <- decideTestsDGE(et.normal, p = pvt, adjust = "BH")) + et.normal.fdr <- cbind(et.normal$table, p.adjust(et.normal$table[, 3], method = "BH")) + et.normal.fdr <- et.normal.fdr[et.normal.fdr[, 4] < pvt, ] + + write.table(et.normal.fdr, file = paste0(outdir, cName, "_", nrow(et.normal.fdr), "_edgeR_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + detags.normal <- rownames(de.normal)[as.logical(de.normal)] + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotSmear"))) + plotSmear(et.normal, de.normal, tags = detags.normal, main = paste0(cName, "\nedgeR - plotSmear")) + abline(h = c(-2, 2), col = "blue") + dev.off() + + # edgeR 'GLM' + + y.glm <- DGEList(counts = counts, group = group) + + design <- model.matrix(~0 + group, data = y.glm$samples) + design + + y.glm <- estimateGLMCommonDisp(y.glm, design) + y.glm <- estimateGLMTrendedDisp(y.glm, design) + + ## Loading required package: splines + y.glm <- estimateGLMTagwiseDisp(y.glm, design) + + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV - GLM"))) + plotBCV(y.glm, main = paste0(cName, "\nedgeR - plotBCV - GLM")) + dev.off() + + y.fit <- glmFit(y.glm, design) + y.lrt <- glmLRT(y.fit) + + FDR <- p.adjust(y.lrt$table$PValue, method = "BH") + sum(FDR < pvt) + + topTags(y.lrt) + + summary(de.glm <- decideTestsDGE(y.lrt, p = pvt, adjust = "BH")) + + } + + + DESeq2Function <- function(odir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList) + { + library(DESeq2) + library(RColorBrewer) + library(gplots) + library(vsn) + + outdir <- paste0(odir, "/",DESeq2Dir,"/") # set outdir + + dir.create(outdir, showWarnings = FALSE) + dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" + + + pvt <- pvtDESeq2 + + + + # DESeqDataSet(se, design, ignoreRank = FALSE) + # DESeqDataSetFromMatrix(countData, colData, design, tidy = FALSE, ignoreRank = FALSE, ...) + # DESeqDataSetFromHTSeqCount(sampleTable, directory = ".", design, ignoreRank = FALSE, ...) + # DESeqDataSetFromTximport(txi, colData, design, ...) + coldata = data.frame(row.names = colnames(RTableInFiles), condition = sampleNameList) + + ddsHTSeq <- DESeqDataSetFromMatrix(countData = RTableInFiles, colData = coldata, design = ~ condition) + + colData(ddsHTSeq)$condition <- factor(colData(ddsHTSeq)$condition, levels = c(sampleNameI, sampleNameII)) + + ddsHTSeq <- DESeq(ddsHTSeq) + + ddsHTSeq.res <- results(ddsHTSeq) + ddsHTSeq.res <- ddsHTSeq.res[order(ddsHTSeq.res$padj), ] + head(ddsHTSeq.res) + + # png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - plotMA"))) + # plotMA(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotMA"), ylim = c(-2, 2)) + # dev.off() + + ddsHTSeq.res.fdr <- ddsHTSeq.res[!is.na(ddsHTSeq.res$padj), ] + ddsHTSeq.res.fdr <- ddsHTSeq.res.fdr[ddsHTSeq.res.fdr$padj < pvt, ] + + write.table(as.matrix(ddsHTSeq.res.fdr), file = paste0(outdir, cName, "_", nrow(as.matrix(ddsHTSeq.res.fdr)), "_DESeq2_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + ddsHTSeq.rld <- rlogTransformation(ddsHTSeq, blind = TRUE) + ddsHTSeq.vsd <- varianceStabilizingTransformation(ddsHTSeq, blind = TRUE) + + par(mfrow = c(1, 3)) + notAllZero <- (rowSums(counts(ddsHTSeq)) > 0) + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.1"))) + meanSdPlot(log2(counts(ddsHTSeq, normalized = TRUE)[notAllZero, ] + 1)) # , ylim = c(0, 2.5) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.2"))) # , ylim = c(0, 2.5) + meanSdPlot(assay(ddsHTSeq.rld[notAllZero, ])) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.3"))) # , ylim = c(0, 2.5) + meanSdPlot(assay(ddsHTSeq.vsd[notAllZero, ])) + dev.off() + + hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100) + distRL <- dist(t(assay(ddsHTSeq.rld))) + mat <- as.matrix(distRL) + rownames(mat) <- colnames(mat) <- with(colData(ddsHTSeq), paste0(condition)) + + # png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - heatmap"))) + # heatmap(mat, trace = "none", col = rev(hmcol), margin = c(13, 13)) + # dev.off() + + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - PCA"))) + print(plotPCA(ddsHTSeq.rld, intgroup = "condition")) + dev.off() + + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - DispEsts"))) + plotDispEsts(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotDispEsts")) + dev.off() + } + + NOISeqFunction <- function(odir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames) + { + library(NOISeq) + + outdir <- paste0(odir, "/",NOISeqDir,"/") # set outdir + + dir.create(outdir, showWarnings = FALSE) + dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" + + pvt <- pvtNOISeq + + counts <- RTableInFiles[, columnPos] + head(counts) + + # must be changed if there are no triplicates + # !! Please change this setting if needed !! + myfactors <- data.frame(Treatment = sampleNameList, TreatmentRun = SampleNames) + # sort counts by row names + mycounts <- counts[order(rownames(counts)), ] + # filter out low counts + mycounts.filter <- filtered.data(mycounts, factor = myfactors$Treatment, norm = FALSE, depth = NULL, method = 1, cv.cutoff = 100, cpm = 1) + mycounts.rowname <- rownames(mycounts.filter) + + # Extra file with length and GC content + myLengthTable <- read.table(LenGCTable, sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names=1) + myLengthTable <- myLengthTable[order(rownames(myLengthTable)),] # sort by rownames + myLengthTable.filter <- myLengthTable[mycounts.rowname,] + + mylength <- myLengthTable.filter[,myLenPos] + names(mylength) <- rownames(myLengthTable.filter) + + mygc <- myLengthTable.filter[,myGCPos] + names(mygc) <- rownames(myLengthTable.filter) + + + + + + mydata <- readData(data = mycounts.filter, length = mylength, gc = mygc, factors = myfactors) + mydata + + mydata.GCbias <- dat(mydata, type = "GCbias") + #-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.1"))) + #-# explo.plot(mydata.GCbias) + #-# dev.off() + + mydata.GCbias.group <- dat(mydata, factor = "Treatment", type = "GCbias") + #-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.2"))) + #-# explo.plot(mydata.GCbias.group) + #-# dev.off() + + show(mydata.GCbias.group) + + mydata.lengthbias <- dat(mydata, type = "lengthbias") + #-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.1"))) + #-# explo.plot(mydata.lengthbias) + #-# dev.off() + + mydata.lengthbias.group <- dat(mydata, factor = "Treatment", type = "lengthbias") + #-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.2"))) + #-# explo.plot(mydata.lengthbias.group) + #-# dev.off() + + show(mydata.lengthbias.group) + + # looks different + mydata.cd <- dat(mydata, type = "cd") + #-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - cd"))) + #-# explo.plot(mydata.cd) + #-# dev.off() + mylength + + myRPKM <- NOISeq::rpkm(assayData(mydata)$exprs) + myUQUA <- NOISeq::uqua(assayData(mydata)$exprs, long = mylength, lc = 0.5, k = 0) + myTMM <- NOISeq::tmm(assayData(mydata)$exprs, long = 1000, lc = 0) + + mynoiseq.rpkm <- noiseq(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") + mynoiseq.nnorm <- noiseq(mydata, k = 0.5, norm = "n", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") + mynoiseq.tmm <- noiseq(mydata, k = 0.5, norm = "tmm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") + + # RPKM + mynoiseq.rpkm.prob <- mynoiseq.rpkm@results[[1]][mynoiseq.rpkm@results[[1]]$prob > pvt, ] + mynoiseq.rpkm.prob <- mynoiseq.rpkm.prob[!is.na(mynoiseq.rpkm.prob$prob), ] + write.table(mynoiseq.rpkm.prob, file = paste0(outdir, cName, "_", nrow(mynoiseq.rpkm.prob), "_NOISeq_rpkm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # none + mynoiseq.nnorm.prob <- mynoiseq.nnorm@results[[1]][mynoiseq.nnorm@results[[1]]$prob > pvt, ] + mynoiseq.nnorm.prob <- mynoiseq.nnorm.prob[!is.na(mynoiseq.nnorm.prob$prob), ] + write.table(mynoiseq.nnorm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.nnorm.prob), "_NOISeq_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # TMM + mynoiseq.tmm.prob <- mynoiseq.tmm@results[[1]][mynoiseq.tmm@results[[1]]$prob > pvt, ] + mynoiseq.tmm.prob <- mynoiseq.tmm.prob[!is.na(mynoiseq.tmm.prob$prob), ] + write.table(mynoiseq.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.tmm.prob), "_NOISeq_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + ############ NOISeq bio ################ + + pvtnsb <- 0.995 + a0per_value <- 0.99 + + iterations <- 5 + x5 <- c(sample(10000:99999, iterations, replace=F), 12345) + x5 + + collector.rpkm <- NULL + collector.tmm <- NULL + + + # for (iter in x5) { + # # RPKM + # mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE) + # mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ] + # collector.rpkm <- rbind(collector.rpkm, mynoiseqbio.rpkm.prob) + # #collector <- rbind(collector, mynoiseqbio.rpkm.prob[rownames(mynoiseqbio.rpkm.prob) %in% rownames(collector),]) + # # TMM + # mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE) + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] + # collector.tmm <- rbind(collector.tmm, mynoiseqbio.tmm.prob) + # } + # + # # RPKM + # print(sort(row.names(collector.rpkm))) + # library(stringr) + # mynoiseqbio.rpkm.prob.overlap <- collector.rpkm[str_split_fixed(row.names(collector.rpkm), "\\.",2)[,2]>=iterations+10,] + # row.names(mynoiseqbio.rpkm.prob.overlap) <- substr(row.names(mynoiseqbio.rpkm.prob.overlap),1,str_length(row.names(mynoiseqbio.rpkm.prob.overlap))-1) + # + # write.table(mynoiseqbio.rpkm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob.overlap),"_NOISeqbio_rpkm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # + # #TMM + # print(sort(row.names(collector.tmm))) + # library(stringr) + # mynoiseqbio.tmm.prob.overlap <- collector.tmm[str_split_fixed(row.names(collector.tmm), "\\.",2)[,2]>=iterations+10,] + # row.names(mynoiseqbio.tmm.prob.overlap) <- substr(row.names(mynoiseqbio.tmm.prob.overlap),1,str_length(row.names(mynoiseqbio.tmm.prob.overlap))-1) + # + # write.table(mynoiseqbio.tmm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob.overlap),"_NOISeqbio_tmm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + # # none + # mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm@results[[1]][mynoiseqbio.nnorm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm.prob[!is.na(mynoiseqbio.nnorm.prob$prob), ] + # write.table(mynoiseqbio.nnorm.prob, file = paste0(outdir, cName, "_NOISeqbio_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # # TMM + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] + # write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName, "_NOISeqbio_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + + + + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiorpkm"))) + mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) + dev.off() + + mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ] + mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ] + write.table(mynoiseqbio.rpkm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob),"_NOISeqbio_rpkm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiontmm"))) + mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) + dev.off() + + mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] + mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] + write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob),"_NOISeqbio_tmm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbionnorm"))) + mynoiseqbio.nnorm <- noiseqbio(mydata, k = 0.5, norm = "n", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) + dev.off() + + head(mynoiseqbio.rpkm@results[[1]]) + head(mynoiseqbio.nnorm@results[[1]]) + + mynoiseqbio.rpkm.deg <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = NULL) + + mynoiseqbio.rpkm.deg.up <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "up") + mynoiseqbio.rpkm.deg.down <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "down") + mynoiseqbio.nnorm.deg <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = NULL) + mynoiseqbio.nnorm.deg.up <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "up") + mynoiseqbio.nnorm.deg.down <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "down") + + par(mfrow = c(3, 2)) + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9"))) + DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.8"))) + DE.plot(mynoiseq.rpkm, q = 0.8, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.9"))) + DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.8"))) + DE.plot(mynoiseq.nnorm, q = 0.8, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.9"))) + DE.plot(mynoiseq.tmm, q = 0.9, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.8"))) + DE.plot(mynoiseq.tmm, q = 0.8, graphic = "expr", log.scale = TRUE) + dev.off() + + par(mfrow = c(2, 2)) + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvt))) + DE.plot(mynoiseqbio.rpkm, q = pvt, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvtnsb))) + DE.plot(mynoiseqbio.rpkm, q = pvtnsb, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvt))) + DE.plot(mynoiseqbio.nnorm, q = pvt, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvtnsb))) + DE.plot(mynoiseqbio.nnorm, q = pvtnsb, graphic = "expr", log.scale = TRUE) + dev.off() + + par(mfrow = c(1, 2)) + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9_MD"))) + DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "MD", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorm_0.9_MD"))) + DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "MD", log.scale = TRUE) + dev.off() + } + + ########################################### + ## additional function for more usability # + ########################################### + + calcRPKM <- function(c,n,l) + { + RPKM <- (10^9* as.numeric(c)) / (as.numeric(n) * as.numeric(l)) + return(RPKM) + } + + RPKMMatrix <- function(InTable,directory,LenGCTable,myLenPos) + { + myLengthTable <- read.table(paste0(directory,LenGCTable), sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names = 1) + + newtable <- matrix(data = c(1), nrow = nrow(InTable), ncol(InTable), byrow= TRUE) + colnames(newtable) <- colnames(InTable) + rownames(newtable) <- rownames(InTable) + + sumCol <- apply(InTable,2,sum) # list of the all col sums + + for(s in colnames(InTable)) { + for(r in rownames(InTable)) { + w <- InTable[r, s] + len <- myLengthTable[r, myLenPos] + sum <- as.numeric(sumCol[s]) + + newtable[r, s] <- calcRPKM(w, sum, len) + } + } + return(newtable) + } + + + # rmHTSeqLines removes the last five rows + rmHTSeqLines <- function(counts) + { + counts <- head(counts, -5) + return(counts) + } + + + ############### + ## functions ## + ############### + + # split list to a factor list + split2list <- function(tmp) + { + xx <- c() + for(i in 1:length(tmp)) { + if(grepl("\\.", tmp[i])) { + foo <- strsplit(tmp[i], '\\.')[[1]] + fooo <- strsplit(foo, " ")[[1]] + xx <- c(xx, fooo) + } else { + xx <- c(xx, tmp[i]) + } + } + return(as.factor(xx)) + } + + # makeGroups + # https://github.com/drisso/RUVSeq/blob/master/R/helper.R + makeGroups <- function(xs) + { + xs <- factor(xs) + groups <- matrix(-1, nrow = length(levels(xs)), ncol = max(table(xs))) + for (i in 1:length(levels(xs))) { + idxs <- which(xs == levels(xs)[i]) + groups[i,1:length(idxs)] <- idxs + } + return(groups) + } + + # The function makeSampleList creates a list of the columns(samples) you would like to compare + makeSampleList <- function(A, Al, B, Bl) + { + l <- c() + if(exists("Al", mode = "numeric")) { + l <- makeSampleList.helper(A, Al, l) + l <- makeSampleList.helper(B, Bl, l) + } + return(l) + } + makeSampleList.helper <- function(v,k,l) + { + for(h in 1:k) { + if(h == 1) { + l <- c(l, v) + } else { + l <- c(l, paste0(v,".",h-1)) + } + } + return(l) + } + + # function creates a list of numbers out of another list. The List is the index of the given list. + makeIndexList <- function(l) + { + n <- 1 + jj <- c() + for(v in l) { + jj <- c(jj, n) + n <- n + 1 + } + return(jj) + } + + intersection <- function(l,check,t) + { + l.list <- list.files(path = l, pattern = "*(rpkm|edgeR|DESeq2).*tsv", recursive = TRUE) + if(grepl("bio", l.list[4],fixed = TRUE)) { + l.list.noiseq <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1) + l.list.noiseqbio <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1) + } else { + l.list.noiseq <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1) + l.list.noiseqbio <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1) + } + l.list.edgeR <- read.table(paste0(l,"/",l.list[2]), header=T, sep="\t", row.names=1) + l.list.deseq2 <- read.table(paste0(l,"/",l.list[1]), header=T, sep="\t", row.names=1) + + + print("DEGs edgeR") + print(nrow(l.list.edgeR)) + print("DEGs deseq2") + print(nrow(l.list.deseq2)) + print("DEGs noiseq") + print(nrow(l.list.noiseq)) + print("DEGs noiseqbio") + print(nrow(l.list.noiseqbio)) + + interect.list <- "" + interect.list.direct <- "" + + if(t==4) { + # List of overlaps without direction check + interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), intersect(rownames(l.list.noiseqbio), rownames(l.list.deseq2)))) + + print("DEGs intersection without direction check (four methods)") + print(length(interect.list)) + print(interect.list) + print("DEGs intersection with direction check") + + # List of overlaps with direction check + if(length(interect.list)!=0) { + interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, l.list.noiseqbio, interect.list) + print(length(interect.list.direct)) + print(interect.list.direct) + } else { + print("0") + } + } + if(t==3) { + # List of overlaps without direction check + interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), rownames(l.list.deseq2))) + + print("DEGs intersection without direction check (three methods)") + print(length(interect.list)) + print(interect.list) + print("DEGs intersection with direction check") + + # List of overlaps with direction check + if(length(interect.list)!=0) { + # fake data frame with 0 rows => nrow == 0 + df_fake <- data.frame(Date=as.Date(character()), + File=character(), + User=character(), + stringsAsFactors=FALSE) + + interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, df_fake, interect.list) + print(length(interect.list.direct)) + print(interect.list.direct) + } else { + print("0") + } + } + + + if(check == 1){ + return(l.list.noiseq[interect.list.direct,]) + } else { + return(l.list.noiseq[interect.list,]) + } + } + + interDirect <- function(e,d,n,nb,i) + { + + if(nrow(nb) == 0) { + nbx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) # use noiseq twice to simulate noiseqbio + nby <- paste0(nbx[,"M"],rownames(nbx)) + } else { + nbx <- apply(nb[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) + nby <- paste0(nbx[,"log2FC"],rownames(nbx)) + } + + ex <- apply(e[i,], 2, function(x) {ifelse(x < 0, "-", "+")}) + dx <- apply(d[i,], 2, function(x) {ifelse(x < 0, "-", "+")}) + nx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) # twisted, because edgeR and DESeq2 are calculating vice versa + + ey <- paste0(ex[,"logFC"],rownames(ex)) + dy <- paste0(dx[,"log2FoldChange"],rownames(dx)) + ny <- paste0(nx[,"M"],rownames(nx)) + + interect.list.direct <- intersect(ey, intersect(dy, intersect(ny, nby))) + + + return(substr(interect.list.direct,2,100)) + } + + ########################################### + ## executiv part of the script # + ########################################### + for(SampleNames in sampleLists) { + workflow(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq) + } + print("Thank you for using the DEG pipeline :) \n") + + # fin + + ### compare DEGs analysis ### + # used venny and Excel + +