User Tools

Site Tools


Hotfix release available: 2023-04-04a "Jack Jackrum". upgrade now! [54.1] (what's this?)
New release available: 2023-04-04 "Jack Jackrum". upgrade now! [54] (what's this?)
Hotfix release available: 2022-07-31b "Igor". upgrade now! [53.1] (what's this?)
Hotfix release available: 2022-07-31a "Igor". upgrade now! [53] (what's this?)
New release available: 2022-07-31 "Igor". upgrade now! [52.2] (what's this?)
New release candidate 2 available: rc2022-06-26 "Igor". upgrade now! [52.1] (what's this?)
New release candidate available: 2022-06-26 "Igor". upgrade now! [52] (what's this?)
Hotfix release available: 2020-07-29a "Hogfather". upgrade now! [51.4] (what's this?)
personal:haasf:2021:madland:new_deg_pipe_four_methods_noiseqbio_hoecker_20210527
:personal:haasf:2021:madland:new_deg_pipe_four_methods_noiseqbio_hoecker_20210527.r
###############################################################
## DEG pipeline with edgeR, DESeq2 and NOISeq                 #
## Modified from the template of Dr. Kristian Ullrich 2014    #
## @fabianhaas, AG Rensing, Marburg, Jan 2016                 #
###############################################################
# The pipeline expects sample triplicates.
#   If you don't have three samples of one condition, use face files with zero counts.
# RPKM calculation is optional.
# Additional program to sum up the tsv files: /mnt/nas_temp/home/haasf/Program/sumup_htseq_counts.pl folder file_prefix
 
###########################################
## please define your variables           #
###########################################
 
saveat <- "/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Hoecker/DEGs_0.9"
 
file <- '/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Hoecker/31315.p.sort.counts'
 
# Table with the length and the GC content
LenGCTable <- "/mnt/NAS_coruscant_datashare/haasf/GeneAtlas_RNASeq/DEG_2nd_round/cosmoss_V3.3.release_2016-07-22.IsoformV31.fasta.length_GC_Contig_Daten_for_34840_1nd.csv"
 
# Column position of the gene name and the width/gc content (first column is gene name!)
myLenPos <- 1
myGCPos <- 2
 
# 
# hoo <- c("","")
# sampleLists <- c()
# for(a in hoo) {
#   for(b in hoo) {
#     if(a!=b) {
#       tmp <- c(a,b)
#       sampleLists <- c(sampleLists, list(tmp))
#     }
#   }
# }
 
 
sampleLists <- list(c("cop_D", "cop_L"),
                    c("cop_D", "spa_D"),
                    c("cop_D", "spa_L"),
                    c("cop_D", "WT_D"),
                    c("cop_D", "WT_L"),
                    c("cop_L", "spa_D"),
                    c("cop_L", "spa_L"),
                    c("cop_L", "WT_D"),
                    c("cop_L", "WT_L"),
                    c("spa_D", "spa_L"),
                    c("spa_D", "WT_D"),
                    c("spa_D", "WT_L"),
                    c("spa_L", "WT_D"),
                    c("spa_L", "WT_L"),
                    c("WT_D", "WT_L")
                  )
 
 
# Do you want a RPKM value table (1/0)
# Be warned, it takes a while to do this. approx. 10 minutes for 35000 genes and 6 samples.
rpkm <- 0
# Are the data original HTSeq files? yes <- 1 ; no <- 0
htseq <- 0
 
# threshold settings for the R packages
#edgeR
pvtedgeR <- 0.001
#DESeq2
pvtDESeq2 <- 0.001
#NOISeq
pvtNOISeq <- 0.9
 
# Output folder names
edgeRDir <- "edgeR"
DESeq2Dir <- "DESeq2"
NOISeqDir <- "NOISeq"
 
#########################
## stop editing here!! ##
#########################
 
 
###########################################
## executiv function                      #
###########################################
 
workflow <- function(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq)
{
    # setwd(workingFolder) # set workspace
 
            sampleNameI <- SampleNames[1]
            sampleNameII <- SampleNames[2]
 
            # Comparison name
            cName <- paste(sampleNameI, sampleNameII, sep = " vs. ")
 
            filename <- strsplit(file, "/")
            filename <- filename[[1]][length(filename[[1]])] # long but it's working
            sampleName <- gsub(", ","_",toString(SampleNames))
            samples <- SampleNames
            # create output dir
            dir.create(saveat, showWarnings = FALSE)
            outdir <- paste0(saveat,"/",sampleName)
            dir.create(outdir)
 
            # Start the log file
            sink(paste0(outdir, "/DEGpipe_",cName, ".log"))
 
            # print parameter into the log file
            print("set parameters:")
            print(paste0("Counts file:", file))
            print(paste0("Results stored at:", saveat))
            print(paste0("SampleNames: ",SampleNames))
            print(paste0("LenGCTable: ",LenGCTable))
            print(paste0("myLenPos: ",myLenPos))
            print(paste0("myGCPos: ",myGCPos))
            print(paste0("rpkm: ",rpkm))
            print(paste0("htseq: ",htseq))
            print(paste0("pvtedgeR: ",pvtedgeR))
            print(paste0("pvtDESeq2: ",pvtDESeq2))
            print(paste0("pvtNOISeq: ",pvtNOISeq))
            print(paste0("edgeRDir: ",edgeRDir))
            print(paste0("DESeq2Dir: ",DESeq2Dir))
            print(paste0("NOISeqDir: ",NOISeqDir))
            # session and package infos for the log file
            print(sessionInfo())
            print(Sys.info())
 
            # load spike-in count tables
            data0 <- read.table(file, header=T, sep="\t", row.names=1)
            # just if the gene ids not ending with .1 (it is nessessery for NOISeqbio iteration step)
            #row.names(data0) <- paste0(row.names(data0), ".1")
 
            # remove HTSeq rows like "__no_feature"
            data0 <- data0[ with(data0,  !grepl('__', rownames(data0))) , ]
            data0 <- data0[ with(data0,  !grepl('ERCC', rownames(data0))) , ]
            #data0 <- data0[ , with(data0,  !grepl('X.1', colnames(data0)))]
            #rename col names (library name to roman number)
 
          # librariesName <- c("56754_WT_Naturally_3.bam.sort.fastq.unmapped.sam.sort.bam", "56753_WT_Naturally_2.bam.sort.fastq.unmapped.sam.sort.bam", "56752_WT_Naturally_1.bam.sort.fastq.unmapped.sam.sort.bam", "56751_tt_6_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56750_tt_6_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56749_tt_6_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56748_tt_0_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56747_tt_0_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56746_tt_0_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56745_WT_10_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56744_WT_10_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56743_WT_10_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56742_WT_6_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56741_WT_6_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56740_WT_6_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56739_WT_0_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56738_WT_0_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56737_WT_0_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56736_tt_T1_tt_6d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56735_tt_T1_tt_6d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56734_tt_T1_tt_6d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56733_tt_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56732_tt_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56731_tt_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56730_tt_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56729_tt_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56728_tt_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56727_WT_T1_NA_3.bam.sort.fastq.unmapped.sam.sort.bam", "56726_WT_T1_NA_2.bam.sort.fastq.unmapped.sam.sort.bam", "56725_WT_T1_NA_1.bam.sort.fastq.unmapped.sam.sort.bam", "56724_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56723_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56722_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56721_WT_T1_WT_10d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56720_WT_T1_WT_10d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56719_WT_T1_WT_10d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56718_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56717_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56716_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56715_WT_T1_WT_6d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56714_WT_T1_WT_6d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56713_WT_T1_WT_6d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56712_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56711_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56710_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56709_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56708_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56707_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56706_WT_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56705_WT_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56704_WT_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam")
          # #librariesName <- c("80484_Celery_3_day_imbibed_embryo_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80483_Celery_3_day_imbibed_embryo_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80482_Celery_3_day_imbibed_embryo_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80481_Celery_1_day_imbibed_embryo_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80480_Celery_1_day_imbibed_embryo_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80479_Celery_1_day_imbibed_embryo_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80478_Celery_5_day_imbibed_whole_fruit_29C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80477_Celery_5_day_imbibed_whole_fruit_29C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80460_Celery_3_day_imbibed_whole_fruit_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80459_Celery_3_day_imbibed_whole_fruit_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80458_Celery_1_day_imbibed_whole_fruit_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80457_Celery_1_day_imbibed_whole_fruit_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80456_Celery_1_day_imbibed_whole_fruit_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80455_Celery_Dry_whole_fruit_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80454_Celery_Dry_whole_fruit_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80453_Celery_Dry_whole_fruit_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80452_Celery_5_day_imbibed_whole_fruit_29C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80451_Celery_5_day_imbibed_whole_fruit_100uM_ABA_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80450_Celery_5_day_imbibed_whole_fruit_100uM_ABA_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80449_Celery_5_day_imbibed_whole_fruit_100uM_ABA_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80448_Celery_5_day_imbibed_whole_fruit_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80447_Celery_5_day_imbibed_whole_fruit_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80446_Celery_5_day_imbibed_whole_fruit_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80445_Celery_3_day_imbibed_whole_fruit_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam.featureCounts.p.txt")
          # romanNumber <- c("T17_Dry_Naturally_Freezedryseed", "T17_Dry_Naturally_Freezedryseed.1", "T17_Dry_Naturally_Freezedryseed.2", "T16_Dry_6days_Freezeafter6dageing", "T16_Dry_6days_Freezeafter6dageing.1", "T16_Dry_6days_Freezeafter6dageing.2", "T15_Dry_0days_Freezedryseed", "T15_Dry_0days_Freezedryseed.1", "T15_Dry_0days_Freezedryseed.2", "T14_Dry_10days_Freezeafter10dageing", "T14_Dry_10days_Freezeafter10dageing.1", "T14_Dry_10days_Freezeafter10dageing.2", "T13_Dry_6days_Freezeafter6dageing", "T13_Dry_6days_Freezeafter6dageing.1", "T13_Dry_6days_Freezeafter6dageing.2", "T12_Dry_0days_Freezedryseed", "T12_Dry_0days_Freezedryseed.1", "T12_Dry_0days_Freezedryseed.2", "T11_Imbibe_6days_12h", "T11_Imbibe_6days_12h.1", "T11_Imbibe_6days_12h.2", "T10_Imbibe_6days_5h", "T10_Imbibe_6days_5h.1", "T10_Imbibe_6days_5h.2", "T9_Imbibe_0dyas_5h", "T9_Imbibe_0dyas_5h.1", "T9_Imbibe_0dyas_5h.2", "T8_Imbibe_Naturally_47h", "T8_Imbibe_Naturally_47h.1", "T8_Imbibe_Naturally_47h.2", "T7_Imbibe_Naturally_24h", "T7_Imbibe_Naturally_24h.1", "T7_Imbibe_Naturally_24h.2", "T6_Imbibe_10days_72h", "T6_Imbibe_10days_72h.1", "T6_Imbibe_10days_72h.2", "T5_Imbibe_10days_24h", "T5_Imbibe_10days_24h.1", "T5_Imbibe_10days_24h.2", "T4_Imbibe_6days_47h", "T4_Imbibe_6days_47h.1", "T4_Imbibe_6days_47h.2", "T3_Imbibe_6days_24h", "T3_Imbibe_6days_24h.1", "T3_Imbibe_6days_24h.2", "T2_Imbibe_0days_24h", "T2_Imbibe_0days_24h.1", "T2_Imbibe_0days_24h.2", "T1_Imbibe_0days_5h", "T1_Imbibe_0days_5h.1", "T1_Imbibe_0days_5h.2")
          # #romanNumber <- c("emb_3d_20.2", "emb_3d_20.1", "emb_3d_20", "emb_1d_20.2", "emb_1d_20.1", "emb_1d_20", "wf_5d_29.2", "wf_5d_29.1", "wf_3d_20.1", "wf_3d_20", "wf_1d_20.2", "wf_1d_20.1", "wf_1d_20", "wf_dry.2", "wf_dry.1", "wf_dry", "wf_5d_29", "wf_5d_ABA_20.2", "wf_5d_ABA_20.1", "wf_5d_ABA_20", "wf_5d_20.2", "wf_5d_20.1", "wf_5d_20", "wf_3d_20.2")
          # 
          # 
          # for(i in 1:length(librariesName)) {
          #     colnames(data0)[colnames(data0)==grep(librariesName[i], names(data0), value = TRUE)] <- romanNumber[i]
          # }
 
            # creats a list of the column names/samples you would like to compare
            SampleNames <- makeSampleList(sampleNameI, length(grep(paste0("^",sampleNameI,"[^A-Z]"), colnames(data0))) + 1,
                                          sampleNameII, length(grep(paste0("^",sampleNameII,"[^A-Z]"), colnames(data0))) + 1)
            print(SampleNames)
 
            ## mal anschauen als ersatz für das darüber ##
            ## newenw <- table.new[, grepl(paste(LIST,collapse="|"), colnames(table.new))]
 
 
            # exctract the samples from the data0 if needed
            if(length(SampleNames) > 0) {
                RTableInFiles <- data0[,SampleNames]
                data <- data0[,SampleNames]
            }
            print(head(RTableInFiles))
 
            # sort by column name
            # data <- data[ , order(names(data))] # dangeros, the order of query and referenz might be not the same
            # see how many rows and columns are in the tables
            print(paste("Raw data row number: ",nrow(data)))
            print(paste("Raw data column number: ",ncol(data)))
 
            # 2) A typical differential expression analysis workflow
            # 2.1) Filtering and exploratory data analysis
            # filter: One single gene/spike-in must have at least 5 counts in minimum 2 libraries
            filter <- apply(data, 1, function(x) length(x[x>5])>=2)
            dataFiltered <- data[filter,]
            # see how many rows and columns are left
            print(paste("Filtered data row number: ",nrow(dataFiltered)))
            print(paste("Filtered data row number: ",ncol(dataFiltered)))
            # filter gene names
            genes <- rownames(dataFiltered)[grep("^Pp3", rownames(dataFiltered))]
            # filter spike-in names
            spikes <- rownames(dataFiltered)[grep("^ERCC", rownames(dataFiltered))]
            # get the column name list as.factors
            xtmp <- colnames(dataFiltered)
            x <- split2list(xtmp)
 
            sampleNameList <- gsub(".[1234567890]+$", "", SampleNames)
            columnPos <- makeIndexList(SampleNames)
 
            # run ...
            ### edgeR ###
            print(paste0("edgeR is running with ", cName))
            packageDescription("edgeR")
            edgeRFunction(outdir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList)
 
            #### DESeq2 ####
            print(paste0("DESeq2 is running with ", cName))
            packageDescription("DESeq2")
            DESeq2Function(outdir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList)
 
            ### NOISeq ###
            print(paste0("NOISeq is running with ", cName))
            packageDescription("NOISeq")
           NOISeqFunction(outdir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames)
 
            ### Overlap of all three methods ###
            # intersection with DEG direction check => 1, without => 0
            intersection_4.file <- intersection(outdir,1,4)
            write.table(intersection_4.file, file = paste0(outdir, "/", cName,"_intersection_four_methods_",nrow(intersection_4.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
            # edgeR, DESeq2, NOISeq
            intersection_3.file <- intersection(outdir,1,3)
            write.table(intersection_3.file, file = paste0(outdir, "/", cName,"_intersection_three_methods_",nrow(intersection_3.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
 
            sink() # End of log file
 
}
 
 
 
 
 
####################
## main functions  #
####################
 
plotGraphics <- function(data, librariesName, cName, outdir, romanNumber)
{
    library("DESeq2")
    library("ggplot2")
    library("RColorBrewer")
    library("pheatmap")
    library("genefilter")
 
    # create sample annotation lists
    # sample.name.list <- gsub(".[12]", "", librariesName[colnames(data)])
    sample.name.list <- gsub(".[12]", "", romanNumber)
    sample.name.uniq <- unique(unlist(sample.name.list, use.names = FALSE))
 
    col.data.condition = data.frame(row.names = librariesName, condition = sample.name.list)
 
    # run DESeq2 functions
    dds.matrix <- DESeqDataSetFromMatrix(countData = data, colData = col.data.condition, design = ~condition) # data must be integer! (Ganze Zahlen)
    colData(dds.matrix)$condition <- factor(colData(dds.matrix)$condition, levels = sample.name.uniq)
    dds.matrix.deseq <- DESeq(dds.matrix)
    dds.matrix.res <- results(dds.matrix.deseq)
    dds.matrix.res <- dds.matrix.res[order(dds.matrix.res$padj), ]
    dds.matrix.res.fdr <- dds.matrix.res[!is.na(dds.matrix.res$padj), ]
    dds.matrix.res.fdr <- dds.matrix.res.fdr[dds.matrix.res.fdr$padj < 0.0001, ]
    dds.matrix.rld <- rlogTransformation(dds.matrix, blind = TRUE)
 
    # data for PCA plot
    pcaData <- plotPCA(dds.matrix.rld, intgroup = "condition", returnData=TRUE)
    percentVar <- round(100 * attr(pcaData, "percentVar"))
    # color for samples
    sample.colors <- c("gray60","#D73027","#4575B4", "purple", "orange","#113FF7","green", "black")
 
    # plot the PCA
    # https://cran.r-project.org/web/packages/ggfortify/vignettes/plot_pca.html
    png(filename=paste0(outdir,"PCA_all_samples_DEseq2_normalized.png"))
      ggplot(
          pcaData,
          aes(
              PC1,
              PC2,
              color=condition,
              shape=condition)
          ) +
          scale_shape_manual(values = c(4, 1, 2, 3, 0, 5, 6, 7)) +
          geom_point(size=2.5) +
          xlab(
              paste0("PC1: ",percentVar[1],"% variance")
              ) +
          ylab(
              paste0("PC2: ",percentVar[2],"% variance")
              ) +
          theme() + #, face="bold"
          scale_colour_manual(
              values=sample.colors  # dodgerblue3
              ) +
          ggtitle("PCA of all samples (DESeq2 normalized)") +
          theme(
              plot.title = element_text(lineheight=.8),
              panel.background = element_rect(fill = "gray95")
          )
    dev.off()
 
    # how many genes will be involved in the heatmap calculation
    numbGenes <- 100
    # pheatmap data
    topVarGenes <- head(order(rowVars(assay(dds.matrix.rld)),decreasing=TRUE),numbGenes)
    mat <- assay(dds.matrix.rld)[ topVarGenes, ]
    mat <- mat - rowMeans(mat)
    # pheatmap data annotation
    anno <-as.data.frame(colData(dds.matrix.rld)[,c("condition")])
    rownames(anno) <- colnames(data) # check out the row names of annotation
    colnames(anno) <- "condition"
    # pheatmap data annotation color
    condition <- sample.colors
    names(condition) <- sample.name.uniq
    anno_colors <- list(condition = condition)
 
    #heatmap(mat)
    #pheatmap(mat)
    # ftp://cran.r-project.org/pub/R/web/packages/pheatmap/pheatmap.pdf
    png(filename=paste0(outdir,"HC_heatmap_all_samples_DESeq2_normalized.png"))
      pheatmap(mat ,
               annotation = anno,
               annotation_colors = anno_colors,
               fontsize = 10,
               fontsize_row = 6,
               fontsize_col = 10,
               main = paste0("Heatmap of all samples (DESeq2 normalized) ", numbGenes," genes\n"),
               annotation_names_col = F,
               show_rownames = F,
               cluster_rows = F,
               border_color = F,
               treeheight_col = 20
      )
    dev.off()
}
 
 
edgeRFunction <- function(odir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList)
{
    library(edgeR)
 
    outdir <- paste0(odir, "/",edgeRDir,"/") # set outdir
 
    dir.create(outdir, showWarnings = FALSE) # create output folder
    dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots"
 
    comparison <- cName
    counts <- RTableInFiles
 
    pvt <- pvtedgeR
 
    group <- factor(sampleNameList)
 
    # edgeR 'normal'
 
    y.normal <- DGEList(counts = counts, group = group)
 
    y.normal[,1]
 
    png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotMDS")))
        plotMDS(y.normal, method = "bcv", main = paste0(cName, "\nedgeR - plotMDS"))
    dev.off()
 
    y.normal <- calcNormFactors(y.normal)
    y.normal <- estimateCommonDisp(y.normal)
    y.normal <- estimateTagwiseDisp(y.normal)
 
 
    png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV")))
        plotBCV(y.normal, main = paste0(cName, "\nedgeR - plotBCV"))
    dev.off()
 
    et.normal <- exactTest(y.normal, pair = c(sampleNameI, sampleNameII))
 
    topTags(et.normal)
    # adjust.method   ==>  character string specifying p-value adjustment method. Possible values are "none", "BH", "fdr" (equivalent to "BH"), "BY" and "holm". See p.adjust for details.
    summary(de.normal <- decideTestsDGE(et.normal, p = pvt, adjust = "BH"))
    et.normal.fdr <- cbind(et.normal$table, p.adjust(et.normal$table[, 3], method = "BH"))
    et.normal.fdr <- et.normal.fdr[et.normal.fdr[, 4] < pvt, ]
 
    write.table(et.normal.fdr, file = paste0(outdir, cName, "_", nrow(et.normal.fdr), "_edgeR_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
 
    detags.normal <- rownames(de.normal)[as.logical(de.normal)]
    png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotSmear")))
    plotSmear(et.normal, de.normal, tags = detags.normal, main = paste0(cName, "\nedgeR - plotSmear"))
    abline(h = c(-2, 2), col = "blue")
    dev.off()
 
    # edgeR 'GLM'
 
    y.glm <- DGEList(counts = counts, group = group)
 
    design <- model.matrix(~0 + group, data = y.glm$samples)
    design
 
    y.glm <- estimateGLMCommonDisp(y.glm, design)
    y.glm <- estimateGLMTrendedDisp(y.glm, design)
 
    ## Loading required package: splines
    y.glm <- estimateGLMTagwiseDisp(y.glm, design)
 
    png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV - GLM")))
    plotBCV(y.glm, main = paste0(cName, "\nedgeR - plotBCV - GLM"))
    dev.off()
 
    y.fit <- glmFit(y.glm, design)
    y.lrt <- glmLRT(y.fit)
 
    FDR <- p.adjust(y.lrt$table$PValue, method = "BH")
    sum(FDR < pvt)
 
    topTags(y.lrt)
 
    summary(de.glm <- decideTestsDGE(y.lrt, p = pvt, adjust = "BH"))
 
}
 
 
DESeq2Function <- function(odir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList)
{
    library(DESeq2)
    library(RColorBrewer)
    library(gplots)
    library(vsn)
 
    outdir <- paste0(odir, "/",DESeq2Dir,"/") # set outdir
 
    dir.create(outdir, showWarnings = FALSE)
    dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots"
 
 
    pvt <- pvtDESeq2
 
 
 
    #   DESeqDataSet(se, design, ignoreRank = FALSE)
    #   DESeqDataSetFromMatrix(countData, colData, design, tidy = FALSE, ignoreRank = FALSE, ...)
    #   DESeqDataSetFromHTSeqCount(sampleTable, directory = ".", design, ignoreRank = FALSE, ...)
    #   DESeqDataSetFromTximport(txi, colData, design, ...)
    coldata = data.frame(row.names = colnames(RTableInFiles), condition = sampleNameList)
 
    ddsHTSeq <- DESeqDataSetFromMatrix(countData = RTableInFiles, colData = coldata, design = ~ condition)
 
    colData(ddsHTSeq)$condition <- factor(colData(ddsHTSeq)$condition, levels = c(sampleNameI, sampleNameII))
 
    ddsHTSeq <- DESeq(ddsHTSeq)
 
    ddsHTSeq.res <- results(ddsHTSeq)
    ddsHTSeq.res <- ddsHTSeq.res[order(ddsHTSeq.res$padj), ]
    head(ddsHTSeq.res)
 
    # png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - plotMA")))
    #     plotMA(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotMA"), ylim = c(-2, 2))
    # dev.off()
 
    ddsHTSeq.res.fdr <- ddsHTSeq.res[!is.na(ddsHTSeq.res$padj), ]
    ddsHTSeq.res.fdr <- ddsHTSeq.res.fdr[ddsHTSeq.res.fdr$padj < pvt, ]
 
    write.table(as.matrix(ddsHTSeq.res.fdr), file = paste0(outdir, cName, "_", nrow(as.matrix(ddsHTSeq.res.fdr)), "_DESeq2_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
 
    ddsHTSeq.rld <- rlogTransformation(ddsHTSeq, blind = TRUE)
    ddsHTSeq.vsd <- varianceStabilizingTransformation(ddsHTSeq, blind = TRUE)
 
    par(mfrow = c(1, 3))
    notAllZero <- (rowSums(counts(ddsHTSeq)) > 0)
    png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.1")))
        meanSdPlot(log2(counts(ddsHTSeq, normalized = TRUE)[notAllZero, ] + 1)) # , ylim = c(0, 2.5)
    dev.off()
    png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.2"))) # , ylim = c(0, 2.5)
        meanSdPlot(assay(ddsHTSeq.rld[notAllZero, ]))
    dev.off()
    png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.3"))) # , ylim = c(0, 2.5)
        meanSdPlot(assay(ddsHTSeq.vsd[notAllZero, ]))
    dev.off()
 
    hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
    distRL <- dist(t(assay(ddsHTSeq.rld)))
    mat <- as.matrix(distRL)
    rownames(mat) <- colnames(mat) <- with(colData(ddsHTSeq), paste0(condition))
 
    # png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - heatmap")))
    #     heatmap(mat, trace = "none", col = rev(hmcol), margin = c(13, 13))
    # dev.off()
 
    png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - PCA")))
        print(plotPCA(ddsHTSeq.rld, intgroup = "condition"))
    dev.off()
 
    png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - DispEsts")))
        plotDispEsts(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotDispEsts"))
    dev.off()
}
 
NOISeqFunction <- function(odir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames)
{
    library(NOISeq)
 
    outdir <- paste0(odir, "/",NOISeqDir,"/") # set outdir
 
    dir.create(outdir, showWarnings = FALSE)
    dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots"
 
    pvt <- pvtNOISeq
 
    counts <- RTableInFiles[, columnPos]
    head(counts)
 
    # must be changed if there are no triplicates
    # !! Please change this setting if needed !!
    myfactors <- data.frame(Treatment = sampleNameList, TreatmentRun = SampleNames)
    # sort counts by row names
    mycounts <- counts[order(rownames(counts)), ]
    # filter out low counts
    mycounts.filter <- filtered.data(mycounts, factor = myfactors$Treatment, norm = FALSE, depth = NULL, method = 1, cv.cutoff = 100, cpm = 1)
    mycounts.rowname <- rownames(mycounts.filter)
 
    # Extra file with length and GC content
    myLengthTable <- read.table(LenGCTable, sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names=1)
    myLengthTable <- myLengthTable[order(rownames(myLengthTable)),] # sort by rownames
    myLengthTable.filter <- myLengthTable[mycounts.rowname,]
 
    mylength <- myLengthTable.filter[,myLenPos]
    names(mylength) <- rownames(myLengthTable.filter)
 
    mygc <- myLengthTable.filter[,myGCPos]
    names(mygc) <- rownames(myLengthTable.filter)
 
 
 
 
 
    mydata <- readData(data = mycounts.filter, length = mylength, gc = mygc, factors = myfactors)
    mydata
 
    mydata.GCbias <- dat(mydata, type = "GCbias")
#-#    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.1")))
#-#        explo.plot(mydata.GCbias)
#-#    dev.off()
 
    mydata.GCbias.group <- dat(mydata, factor = "Treatment", type = "GCbias")
#-#    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.2")))
#-#      explo.plot(mydata.GCbias.group)
#-#    dev.off()
 
    show(mydata.GCbias.group)
 
    mydata.lengthbias <- dat(mydata, type = "lengthbias")
#-#    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.1")))
#-#        explo.plot(mydata.lengthbias)
#-#    dev.off()
 
    mydata.lengthbias.group <- dat(mydata, factor = "Treatment", type = "lengthbias")
#-#    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.2")))
#-#        explo.plot(mydata.lengthbias.group)
#-#    dev.off()
 
    show(mydata.lengthbias.group)
 
    # looks different
    mydata.cd <- dat(mydata, type = "cd")
#-#    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - cd")))
#-#        explo.plot(mydata.cd)
#-#    dev.off()
    mylength
 
    myRPKM <- NOISeq::rpkm(assayData(mydata)$exprs)
    myUQUA <- NOISeq::uqua(assayData(mydata)$exprs, long = mylength, lc = 0.5, k = 0)
    myTMM <- NOISeq::tmm(assayData(mydata)$exprs, long = 1000, lc = 0)
 
    mynoiseq.rpkm <- noiseq(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological")
    mynoiseq.nnorm <- noiseq(mydata, k = 0.5, norm = "n", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological")
    mynoiseq.tmm <- noiseq(mydata, k = 0.5, norm = "tmm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological")
 
    # RPKM
    mynoiseq.rpkm.prob <- mynoiseq.rpkm@results[[1]][mynoiseq.rpkm@results[[1]]$prob > pvt, ]
    mynoiseq.rpkm.prob <- mynoiseq.rpkm.prob[!is.na(mynoiseq.rpkm.prob$prob), ]
    write.table(mynoiseq.rpkm.prob, file = paste0(outdir, cName, "_", nrow(mynoiseq.rpkm.prob), "_NOISeq_rpkm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
    # none
    mynoiseq.nnorm.prob <- mynoiseq.nnorm@results[[1]][mynoiseq.nnorm@results[[1]]$prob > pvt, ]
    mynoiseq.nnorm.prob <- mynoiseq.nnorm.prob[!is.na(mynoiseq.nnorm.prob$prob), ]
    write.table(mynoiseq.nnorm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.nnorm.prob), "_NOISeq_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
    # TMM
    mynoiseq.tmm.prob <- mynoiseq.tmm@results[[1]][mynoiseq.tmm@results[[1]]$prob > pvt, ]
    mynoiseq.tmm.prob <- mynoiseq.tmm.prob[!is.na(mynoiseq.tmm.prob$prob), ]
    write.table(mynoiseq.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.tmm.prob), "_NOISeq_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
 
    ############ NOISeq bio ################
 
    pvtnsb <- 0.995
    a0per_value <- 0.99
 
    iterations <- 5
    x5 <- c(sample(10000:99999, iterations, replace=F), 12345)
    x5
 
    collector.rpkm <- NULL
    collector.tmm <- NULL
 
 
    # for (iter in x5) {
    #   # RPKM
    #   mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE)
    #   mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ]
    #   mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ]
    #   collector.rpkm <- rbind(collector.rpkm, mynoiseqbio.rpkm.prob)
    #   #collector <- rbind(collector, mynoiseqbio.rpkm.prob[rownames(mynoiseqbio.rpkm.prob) %in% rownames(collector),])
    #   # TMM
    #   mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE)
    #   mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ]
    #   mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ]
    #   collector.tmm <- rbind(collector.tmm, mynoiseqbio.tmm.prob)
    # }
    #
    # # RPKM
    # print(sort(row.names(collector.rpkm)))
    # library(stringr)
    # mynoiseqbio.rpkm.prob.overlap <- collector.rpkm[str_split_fixed(row.names(collector.rpkm), "\\.",2)[,2]>=iterations+10,]
    # row.names(mynoiseqbio.rpkm.prob.overlap) <- substr(row.names(mynoiseqbio.rpkm.prob.overlap),1,str_length(row.names(mynoiseqbio.rpkm.prob.overlap))-1)
    #
    # write.table(mynoiseqbio.rpkm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob.overlap),"_NOISeqbio_rpkm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
    #
    # #TMM
    # print(sort(row.names(collector.tmm)))
    # library(stringr)
    # mynoiseqbio.tmm.prob.overlap <- collector.tmm[str_split_fixed(row.names(collector.tmm), "\\.",2)[,2]>=iterations+10,]
    # row.names(mynoiseqbio.tmm.prob.overlap) <- substr(row.names(mynoiseqbio.tmm.prob.overlap),1,str_length(row.names(mynoiseqbio.tmm.prob.overlap))-1)
    #
    # write.table(mynoiseqbio.tmm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob.overlap),"_NOISeqbio_tmm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
 
    # # none
    # mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm@results[[1]][mynoiseqbio.nnorm@results[[1]]$prob > pvtnsb, ]
    # mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm.prob[!is.na(mynoiseqbio.nnorm.prob$prob), ]
    # write.table(mynoiseqbio.nnorm.prob, file = paste0(outdir, cName, "_NOISeqbio_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
    # # TMM
    # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ]
    # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ]
    # write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName, "_NOISeqbio_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
 
 
 
 
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiorpkm")))
        mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE)
    dev.off()
 
    mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ]
    mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ]
    write.table(mynoiseqbio.rpkm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob),"_NOISeqbio_rpkm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
 
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiontmm")))
        mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE)
    dev.off()
 
    mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ]
    mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ]
    write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob),"_NOISeqbio_tmm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
 
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbionnorm")))
        mynoiseqbio.nnorm <- noiseqbio(mydata, k = 0.5, norm = "n", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE)
    dev.off()
 
    head(mynoiseqbio.rpkm@results[[1]])
    head(mynoiseqbio.nnorm@results[[1]])
 
    mynoiseqbio.rpkm.deg <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = NULL)
 
    mynoiseqbio.rpkm.deg.up <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "up")
    mynoiseqbio.rpkm.deg.down <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "down")
    mynoiseqbio.nnorm.deg <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = NULL)
    mynoiseqbio.nnorm.deg.up <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "up")
    mynoiseqbio.nnorm.deg.down <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "down")
 
    par(mfrow = c(3, 2))
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9")))
        DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "expr", log.scale = TRUE)
    dev.off()
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.8")))
        DE.plot(mynoiseq.rpkm, q = 0.8, graphic = "expr", log.scale = TRUE)
    dev.off()
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.9")))
        DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "expr", log.scale = TRUE)
    dev.off()
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.8")))
        DE.plot(mynoiseq.nnorm, q = 0.8, graphic = "expr", log.scale = TRUE)
    dev.off()
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.9")))
        DE.plot(mynoiseq.tmm, q = 0.9, graphic = "expr", log.scale = TRUE)
    dev.off()
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.8")))
        DE.plot(mynoiseq.tmm, q = 0.8, graphic = "expr", log.scale = TRUE)
    dev.off()
 
    par(mfrow = c(2, 2))
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvt)))
        DE.plot(mynoiseqbio.rpkm, q = pvt, graphic = "expr", log.scale = TRUE)
    dev.off()
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvtnsb)))
        DE.plot(mynoiseqbio.rpkm, q = pvtnsb, graphic = "expr", log.scale = TRUE)
    dev.off()
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvt)))
        DE.plot(mynoiseqbio.nnorm, q = pvt, graphic = "expr", log.scale = TRUE)
    dev.off()
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvtnsb)))
        DE.plot(mynoiseqbio.nnorm, q = pvtnsb, graphic = "expr", log.scale = TRUE)
    dev.off()
 
    par(mfrow = c(1, 2))
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9_MD")))
        DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "MD", log.scale = TRUE)
    dev.off()
    png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorm_0.9_MD")))
        DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "MD", log.scale = TRUE)
    dev.off()
}
 
###########################################
## additional function for more usability #
###########################################
 
calcRPKM <- function(c,n,l)
{
    RPKM <- (10^9* as.numeric(c)) / (as.numeric(n) * as.numeric(l))
    return(RPKM)
}
 
RPKMMatrix <- function(InTable,directory,LenGCTable,myLenPos)
{
    myLengthTable <- read.table(paste0(directory,LenGCTable), sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names = 1)
 
    newtable <- matrix(data = c(1), nrow = nrow(InTable), ncol(InTable), byrow= TRUE)
    colnames(newtable) <- colnames(InTable)
    rownames(newtable) <- rownames(InTable)
 
    sumCol <- apply(InTable,2,sum) # list of the all col sums
 
    for(s in colnames(InTable)) {
        for(r in rownames(InTable)) {
            w <- InTable[r, s]
            len <- myLengthTable[r, myLenPos]
            sum <- as.numeric(sumCol[s])
 
            newtable[r, s] <- calcRPKM(w, sum, len)
        }
    }
    return(newtable)
}
 
 
# rmHTSeqLines removes the last five rows
rmHTSeqLines <- function(counts)
{
    counts <- head(counts, -5)
    return(counts)
}
 
 
###############
## functions ##
###############
 
# split list to a factor list
split2list <- function(tmp)
{
    xx <- c()
    for(i in 1:length(tmp)) {
        if(grepl("\\.", tmp[i])) {
            foo <- strsplit(tmp[i], '\\.')[[1]]
            fooo <- strsplit(foo, " ")[[1]]
            xx <- c(xx, fooo)
        } else {
            xx <- c(xx, tmp[i])
        }
    }
    return(as.factor(xx))
}
 
# makeGroups
# https://github.com/drisso/RUVSeq/blob/master/R/helper.R
makeGroups <- function(xs)
{
    xs <- factor(xs)
    groups <- matrix(-1, nrow = length(levels(xs)), ncol = max(table(xs)))
    for (i in 1:length(levels(xs))) {
        idxs <- which(xs == levels(xs)[i])
        groups[i,1:length(idxs)] <- idxs
    }
    return(groups)
}
 
# The function makeSampleList creates a list of the columns(samples) you would like to compare
makeSampleList <- function(A, Al, B, Bl)
{
    l <- c()
    if(exists("Al", mode = "numeric")) {
        l <- makeSampleList.helper(A, Al, l)
        l <- makeSampleList.helper(B, Bl, l)
    }
    return(l)
}
makeSampleList.helper <- function(v,k,l)
{
    for(h in 1:k) {
        if(h == 1) {
            l <- c(l, v)
        } else {
            l <- c(l, paste0(v,".",h-1))
        }
    }
    return(l)
}
 
# function creates a list of numbers out of another list. The List is the index of the given list.
makeIndexList <- function(l)
{
    n <- 1
    jj <- c()
    for(v in l) {
        jj <- c(jj, n)
        n <- n + 1
    }
    return(jj)
}
 
intersection <- function(l,check,t)
{
  l.list <- list.files(path = l, pattern = "*(rpkm|edgeR|DESeq2).*tsv", recursive = TRUE)
  if(grepl("bio", l.list[4],fixed = TRUE)) {
    l.list.noiseq <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1)
    l.list.noiseqbio <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1)
  } else {
    l.list.noiseq <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1)
    l.list.noiseqbio <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1)
  }
  l.list.edgeR <- read.table(paste0(l,"/",l.list[2]), header=T, sep="\t", row.names=1)
  l.list.deseq2 <- read.table(paste0(l,"/",l.list[1]), header=T, sep="\t", row.names=1)
 
 
  print("DEGs edgeR")
  print(nrow(l.list.edgeR))
  print("DEGs deseq2")
  print(nrow(l.list.deseq2))
  print("DEGs noiseq")
  print(nrow(l.list.noiseq))
  print("DEGs noiseqbio")
  print(nrow(l.list.noiseqbio))
 
  interect.list <- ""
  interect.list.direct <- ""
 
  if(t==4) {
    # List of overlaps without direction check
    interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), intersect(rownames(l.list.noiseqbio), rownames(l.list.deseq2))))
 
    print("DEGs intersection without direction check (four methods)")
    print(length(interect.list))
    print(interect.list)
    print("DEGs intersection with direction check")
 
    # List of overlaps with direction check
    if(length(interect.list)!=0) {
      interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, l.list.noiseqbio, interect.list)
      print(length(interect.list.direct))
      print(interect.list.direct)
    } else {
      print("0")
    }
  }
  if(t==3) {
    # List of overlaps without direction check
    interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), rownames(l.list.deseq2)))
 
    print("DEGs intersection without direction check  (three methods)")
    print(length(interect.list))
    print(interect.list)
    print("DEGs intersection with direction check")
 
    # List of overlaps with direction check
    if(length(interect.list)!=0) {
      # fake data frame with 0 rows => nrow == 0
      df_fake <- data.frame(Date=as.Date(character()), 
                       File=character(), 
                       User=character(), 
                       stringsAsFactors=FALSE) 
 
      interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, df_fake, interect.list)
      print(length(interect.list.direct))
      print(interect.list.direct)
    } else {
      print("0")
    }
  }
 
 
  if(check == 1){
    return(l.list.noiseq[interect.list.direct,])
  } else {
    return(l.list.noiseq[interect.list,])
  }
}
 
interDirect <- function(e,d,n,nb,i)
{
 
  if(nrow(nb) == 0) {
    nbx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) # use noiseq twice to simulate noiseqbio
    nby <- paste0(nbx[,"M"],rownames(nbx))
  } else {
    nbx <- apply(nb[i,], 2, function(x) {ifelse(x > 0, "-", "+")})
    nby <- paste0(nbx[,"log2FC"],rownames(nbx))
  }
 
  ex <- apply(e[i,], 2, function(x) {ifelse(x < 0, "-", "+")}) 
  dx <- apply(d[i,], 2, function(x) {ifelse(x < 0, "-", "+")})
  nx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) # twisted, because edgeR and DESeq2 are calculating vice versa
 
  ey <- paste0(ex[,"logFC"],rownames(ex))
  dy <- paste0(dx[,"log2FoldChange"],rownames(dx))
  ny <- paste0(nx[,"M"],rownames(nx))
 
  interect.list.direct <- intersect(ey, intersect(dy, intersect(ny, nby)))
 
 
  return(substr(interect.list.direct,2,100))
}
 
###########################################
## executiv part of the script            #
###########################################
for(SampleNames in sampleLists) {
   workflow(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq)
}
print("Thank you for using the DEG pipeline :) \n")
 
# fin
 
### compare DEGs analysis ###
# used venny and Excel
personal/haasf/2021/madland/new_deg_pipe_four_methods_noiseqbio_hoecker_20210527.txt · Last modified: 2021/06/01 12:26 by Fabian Haas