############################################################### ## DEG pipeline with edgeR, DESeq2 and NOISeq # ## Modified from the template of Dr. Kristian Ullrich 2014 # ## @fabianhaas, AG Rensing, Marburg, Jan 2016 # ############################################################### # The pipeline expects sample triplicates. # If you don't have three samples of one condition, use face files with zero counts. # RPKM calculation is optional. # Additional program to sum up the tsv files: /mnt/nas_temp/home/haasf/Program/sumup_htseq_counts.pl folder file_prefix ########################################### ## please define your variables # ########################################### saveat <- "/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Hoecker/DEGs_0.9" file <- '/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Hoecker/31315.p.sort.counts' # Table with the length and the GC content LenGCTable <- "/mnt/NAS_coruscant_datashare/haasf/GeneAtlas_RNASeq/DEG_2nd_round/cosmoss_V3.3.release_2016-07-22.IsoformV31.fasta.length_GC_Contig_Daten_for_34840_1nd.csv" # Column position of the gene name and the width/gc content (first column is gene name!) myLenPos <- 1 myGCPos <- 2 # # hoo <- c("","") # sampleLists <- c() # for(a in hoo) { # for(b in hoo) { # if(a!=b) { # tmp <- c(a,b) # sampleLists <- c(sampleLists, list(tmp)) # } # } # } sampleLists <- list(c("cop_D", "cop_L"), c("cop_D", "spa_D"), c("cop_D", "spa_L"), c("cop_D", "WT_D"), c("cop_D", "WT_L"), c("cop_L", "spa_D"), c("cop_L", "spa_L"), c("cop_L", "WT_D"), c("cop_L", "WT_L"), c("spa_D", "spa_L"), c("spa_D", "WT_D"), c("spa_D", "WT_L"), c("spa_L", "WT_D"), c("spa_L", "WT_L"), c("WT_D", "WT_L") ) # Do you want a RPKM value table (1/0) # Be warned, it takes a while to do this. approx. 10 minutes for 35000 genes and 6 samples. rpkm <- 0 # Are the data original HTSeq files? yes <- 1 ; no <- 0 htseq <- 0 # threshold settings for the R packages #edgeR pvtedgeR <- 0.001 #DESeq2 pvtDESeq2 <- 0.001 #NOISeq pvtNOISeq <- 0.9 # Output folder names edgeRDir <- "edgeR" DESeq2Dir <- "DESeq2" NOISeqDir <- "NOISeq" ######################### ## stop editing here!! ## ######################### ########################################### ## executiv function # ########################################### workflow <- function(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq) { # setwd(workingFolder) # set workspace sampleNameI <- SampleNames[1] sampleNameII <- SampleNames[2] # Comparison name cName <- paste(sampleNameI, sampleNameII, sep = " vs. ") filename <- strsplit(file, "/") filename <- filename[[1]][length(filename[[1]])] # long but it's working sampleName <- gsub(", ","_",toString(SampleNames)) samples <- SampleNames # create output dir dir.create(saveat, showWarnings = FALSE) outdir <- paste0(saveat,"/",sampleName) dir.create(outdir) # Start the log file sink(paste0(outdir, "/DEGpipe_",cName, ".log")) # print parameter into the log file print("set parameters:") print(paste0("Counts file:", file)) print(paste0("Results stored at:", saveat)) print(paste0("SampleNames: ",SampleNames)) print(paste0("LenGCTable: ",LenGCTable)) print(paste0("myLenPos: ",myLenPos)) print(paste0("myGCPos: ",myGCPos)) print(paste0("rpkm: ",rpkm)) print(paste0("htseq: ",htseq)) print(paste0("pvtedgeR: ",pvtedgeR)) print(paste0("pvtDESeq2: ",pvtDESeq2)) print(paste0("pvtNOISeq: ",pvtNOISeq)) print(paste0("edgeRDir: ",edgeRDir)) print(paste0("DESeq2Dir: ",DESeq2Dir)) print(paste0("NOISeqDir: ",NOISeqDir)) # session and package infos for the log file print(sessionInfo()) print(Sys.info()) # load spike-in count tables data0 <- read.table(file, header=T, sep="\t", row.names=1) # just if the gene ids not ending with .1 (it is nessessery for NOISeqbio iteration step) #row.names(data0) <- paste0(row.names(data0), ".1") # remove HTSeq rows like "__no_feature" data0 <- data0[ with(data0, !grepl('__', rownames(data0))) , ] data0 <- data0[ with(data0, !grepl('ERCC', rownames(data0))) , ] #data0 <- data0[ , with(data0, !grepl('X.1', colnames(data0)))] #rename col names (library name to roman number) # librariesName <- c("56754_WT_Naturally_3.bam.sort.fastq.unmapped.sam.sort.bam", "56753_WT_Naturally_2.bam.sort.fastq.unmapped.sam.sort.bam", "56752_WT_Naturally_1.bam.sort.fastq.unmapped.sam.sort.bam", "56751_tt_6_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56750_tt_6_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56749_tt_6_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56748_tt_0_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56747_tt_0_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56746_tt_0_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56745_WT_10_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56744_WT_10_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56743_WT_10_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56742_WT_6_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56741_WT_6_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56740_WT_6_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56739_WT_0_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56738_WT_0_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56737_WT_0_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56736_tt_T1_tt_6d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56735_tt_T1_tt_6d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56734_tt_T1_tt_6d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56733_tt_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56732_tt_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56731_tt_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56730_tt_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56729_tt_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56728_tt_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56727_WT_T1_NA_3.bam.sort.fastq.unmapped.sam.sort.bam", "56726_WT_T1_NA_2.bam.sort.fastq.unmapped.sam.sort.bam", "56725_WT_T1_NA_1.bam.sort.fastq.unmapped.sam.sort.bam", "56724_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56723_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56722_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56721_WT_T1_WT_10d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56720_WT_T1_WT_10d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56719_WT_T1_WT_10d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56718_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56717_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56716_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56715_WT_T1_WT_6d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56714_WT_T1_WT_6d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56713_WT_T1_WT_6d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56712_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56711_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56710_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56709_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56708_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56707_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56706_WT_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56705_WT_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56704_WT_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam") # #librariesName <- c("80484_Celery_3_day_imbibed_embryo_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80483_Celery_3_day_imbibed_embryo_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80482_Celery_3_day_imbibed_embryo_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80481_Celery_1_day_imbibed_embryo_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80480_Celery_1_day_imbibed_embryo_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80479_Celery_1_day_imbibed_embryo_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80478_Celery_5_day_imbibed_whole_fruit_29C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80477_Celery_5_day_imbibed_whole_fruit_29C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80460_Celery_3_day_imbibed_whole_fruit_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80459_Celery_3_day_imbibed_whole_fruit_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80458_Celery_1_day_imbibed_whole_fruit_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80457_Celery_1_day_imbibed_whole_fruit_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80456_Celery_1_day_imbibed_whole_fruit_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80455_Celery_Dry_whole_fruit_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80454_Celery_Dry_whole_fruit_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80453_Celery_Dry_whole_fruit_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80452_Celery_5_day_imbibed_whole_fruit_29C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80451_Celery_5_day_imbibed_whole_fruit_100uM_ABA_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80450_Celery_5_day_imbibed_whole_fruit_100uM_ABA_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80449_Celery_5_day_imbibed_whole_fruit_100uM_ABA_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80448_Celery_5_day_imbibed_whole_fruit_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80447_Celery_5_day_imbibed_whole_fruit_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80446_Celery_5_day_imbibed_whole_fruit_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80445_Celery_3_day_imbibed_whole_fruit_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam.featureCounts.p.txt") # romanNumber <- c("T17_Dry_Naturally_Freezedryseed", "T17_Dry_Naturally_Freezedryseed.1", "T17_Dry_Naturally_Freezedryseed.2", "T16_Dry_6days_Freezeafter6dageing", "T16_Dry_6days_Freezeafter6dageing.1", "T16_Dry_6days_Freezeafter6dageing.2", "T15_Dry_0days_Freezedryseed", "T15_Dry_0days_Freezedryseed.1", "T15_Dry_0days_Freezedryseed.2", "T14_Dry_10days_Freezeafter10dageing", "T14_Dry_10days_Freezeafter10dageing.1", "T14_Dry_10days_Freezeafter10dageing.2", "T13_Dry_6days_Freezeafter6dageing", "T13_Dry_6days_Freezeafter6dageing.1", "T13_Dry_6days_Freezeafter6dageing.2", "T12_Dry_0days_Freezedryseed", "T12_Dry_0days_Freezedryseed.1", "T12_Dry_0days_Freezedryseed.2", "T11_Imbibe_6days_12h", "T11_Imbibe_6days_12h.1", "T11_Imbibe_6days_12h.2", "T10_Imbibe_6days_5h", "T10_Imbibe_6days_5h.1", "T10_Imbibe_6days_5h.2", "T9_Imbibe_0dyas_5h", "T9_Imbibe_0dyas_5h.1", "T9_Imbibe_0dyas_5h.2", "T8_Imbibe_Naturally_47h", "T8_Imbibe_Naturally_47h.1", "T8_Imbibe_Naturally_47h.2", "T7_Imbibe_Naturally_24h", "T7_Imbibe_Naturally_24h.1", "T7_Imbibe_Naturally_24h.2", "T6_Imbibe_10days_72h", "T6_Imbibe_10days_72h.1", "T6_Imbibe_10days_72h.2", "T5_Imbibe_10days_24h", "T5_Imbibe_10days_24h.1", "T5_Imbibe_10days_24h.2", "T4_Imbibe_6days_47h", "T4_Imbibe_6days_47h.1", "T4_Imbibe_6days_47h.2", "T3_Imbibe_6days_24h", "T3_Imbibe_6days_24h.1", "T3_Imbibe_6days_24h.2", "T2_Imbibe_0days_24h", "T2_Imbibe_0days_24h.1", "T2_Imbibe_0days_24h.2", "T1_Imbibe_0days_5h", "T1_Imbibe_0days_5h.1", "T1_Imbibe_0days_5h.2") # #romanNumber <- c("emb_3d_20.2", "emb_3d_20.1", "emb_3d_20", "emb_1d_20.2", "emb_1d_20.1", "emb_1d_20", "wf_5d_29.2", "wf_5d_29.1", "wf_3d_20.1", "wf_3d_20", "wf_1d_20.2", "wf_1d_20.1", "wf_1d_20", "wf_dry.2", "wf_dry.1", "wf_dry", "wf_5d_29", "wf_5d_ABA_20.2", "wf_5d_ABA_20.1", "wf_5d_ABA_20", "wf_5d_20.2", "wf_5d_20.1", "wf_5d_20", "wf_3d_20.2") # # # for(i in 1:length(librariesName)) { # colnames(data0)[colnames(data0)==grep(librariesName[i], names(data0), value = TRUE)] <- romanNumber[i] # } # creats a list of the column names/samples you would like to compare SampleNames <- makeSampleList(sampleNameI, length(grep(paste0("^",sampleNameI,"[^A-Z]"), colnames(data0))) + 1, sampleNameII, length(grep(paste0("^",sampleNameII,"[^A-Z]"), colnames(data0))) + 1) print(SampleNames) ## mal anschauen als ersatz für das darüber ## ## newenw <- table.new[, grepl(paste(LIST,collapse="|"), colnames(table.new))] # exctract the samples from the data0 if needed if(length(SampleNames) > 0) { RTableInFiles <- data0[,SampleNames] data <- data0[,SampleNames] } print(head(RTableInFiles)) # sort by column name # data <- data[ , order(names(data))] # dangeros, the order of query and referenz might be not the same # see how many rows and columns are in the tables print(paste("Raw data row number: ",nrow(data))) print(paste("Raw data column number: ",ncol(data))) # 2) A typical differential expression analysis workflow # 2.1) Filtering and exploratory data analysis # filter: One single gene/spike-in must have at least 5 counts in minimum 2 libraries filter <- apply(data, 1, function(x) length(x[x>5])>=2) dataFiltered <- data[filter,] # see how many rows and columns are left print(paste("Filtered data row number: ",nrow(dataFiltered))) print(paste("Filtered data row number: ",ncol(dataFiltered))) # filter gene names genes <- rownames(dataFiltered)[grep("^Pp3", rownames(dataFiltered))] # filter spike-in names spikes <- rownames(dataFiltered)[grep("^ERCC", rownames(dataFiltered))] # get the column name list as.factors xtmp <- colnames(dataFiltered) x <- split2list(xtmp) sampleNameList <- gsub(".[1234567890]+$", "", SampleNames) columnPos <- makeIndexList(SampleNames) # run ... ### edgeR ### print(paste0("edgeR is running with ", cName)) packageDescription("edgeR") edgeRFunction(outdir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList) #### DESeq2 #### print(paste0("DESeq2 is running with ", cName)) packageDescription("DESeq2") DESeq2Function(outdir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList) ### NOISeq ### print(paste0("NOISeq is running with ", cName)) packageDescription("NOISeq") NOISeqFunction(outdir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames) ### Overlap of all three methods ### # intersection with DEG direction check => 1, without => 0 intersection_4.file <- intersection(outdir,1,4) write.table(intersection_4.file, file = paste0(outdir, "/", cName,"_intersection_four_methods_",nrow(intersection_4.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) # edgeR, DESeq2, NOISeq intersection_3.file <- intersection(outdir,1,3) write.table(intersection_3.file, file = paste0(outdir, "/", cName,"_intersection_three_methods_",nrow(intersection_3.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) sink() # End of log file } #################### ## main functions # #################### plotGraphics <- function(data, librariesName, cName, outdir, romanNumber) { library("DESeq2") library("ggplot2") library("RColorBrewer") library("pheatmap") library("genefilter") # create sample annotation lists # sample.name.list <- gsub(".[12]", "", librariesName[colnames(data)]) sample.name.list <- gsub(".[12]", "", romanNumber) sample.name.uniq <- unique(unlist(sample.name.list, use.names = FALSE)) col.data.condition = data.frame(row.names = librariesName, condition = sample.name.list) # run DESeq2 functions dds.matrix <- DESeqDataSetFromMatrix(countData = data, colData = col.data.condition, design = ~condition) # data must be integer! (Ganze Zahlen) colData(dds.matrix)$condition <- factor(colData(dds.matrix)$condition, levels = sample.name.uniq) dds.matrix.deseq <- DESeq(dds.matrix) dds.matrix.res <- results(dds.matrix.deseq) dds.matrix.res <- dds.matrix.res[order(dds.matrix.res$padj), ] dds.matrix.res.fdr <- dds.matrix.res[!is.na(dds.matrix.res$padj), ] dds.matrix.res.fdr <- dds.matrix.res.fdr[dds.matrix.res.fdr$padj < 0.0001, ] dds.matrix.rld <- rlogTransformation(dds.matrix, blind = TRUE) # data for PCA plot pcaData <- plotPCA(dds.matrix.rld, intgroup = "condition", returnData=TRUE) percentVar <- round(100 * attr(pcaData, "percentVar")) # color for samples sample.colors <- c("gray60","#D73027","#4575B4", "purple", "orange","#113FF7","green", "black") # plot the PCA # https://cran.r-project.org/web/packages/ggfortify/vignettes/plot_pca.html png(filename=paste0(outdir,"PCA_all_samples_DEseq2_normalized.png")) ggplot( pcaData, aes( PC1, PC2, color=condition, shape=condition) ) + scale_shape_manual(values = c(4, 1, 2, 3, 0, 5, 6, 7)) + geom_point(size=2.5) + xlab( paste0("PC1: ",percentVar[1],"% variance") ) + ylab( paste0("PC2: ",percentVar[2],"% variance") ) + theme() + #, face="bold" scale_colour_manual( values=sample.colors # dodgerblue3 ) + ggtitle("PCA of all samples (DESeq2 normalized)") + theme( plot.title = element_text(lineheight=.8), panel.background = element_rect(fill = "gray95") ) dev.off() # how many genes will be involved in the heatmap calculation numbGenes <- 100 # pheatmap data topVarGenes <- head(order(rowVars(assay(dds.matrix.rld)),decreasing=TRUE),numbGenes) mat <- assay(dds.matrix.rld)[ topVarGenes, ] mat <- mat - rowMeans(mat) # pheatmap data annotation anno <-as.data.frame(colData(dds.matrix.rld)[,c("condition")]) rownames(anno) <- colnames(data) # check out the row names of annotation colnames(anno) <- "condition" # pheatmap data annotation color condition <- sample.colors names(condition) <- sample.name.uniq anno_colors <- list(condition = condition) #heatmap(mat) #pheatmap(mat) # ftp://cran.r-project.org/pub/R/web/packages/pheatmap/pheatmap.pdf png(filename=paste0(outdir,"HC_heatmap_all_samples_DESeq2_normalized.png")) pheatmap(mat , annotation = anno, annotation_colors = anno_colors, fontsize = 10, fontsize_row = 6, fontsize_col = 10, main = paste0("Heatmap of all samples (DESeq2 normalized) ", numbGenes," genes\n"), annotation_names_col = F, show_rownames = F, cluster_rows = F, border_color = F, treeheight_col = 20 ) dev.off() } edgeRFunction <- function(odir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList) { library(edgeR) outdir <- paste0(odir, "/",edgeRDir,"/") # set outdir dir.create(outdir, showWarnings = FALSE) # create output folder dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" comparison <- cName counts <- RTableInFiles pvt <- pvtedgeR group <- factor(sampleNameList) # edgeR 'normal' y.normal <- DGEList(counts = counts, group = group) y.normal[,1] png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotMDS"))) plotMDS(y.normal, method = "bcv", main = paste0(cName, "\nedgeR - plotMDS")) dev.off() y.normal <- calcNormFactors(y.normal) y.normal <- estimateCommonDisp(y.normal) y.normal <- estimateTagwiseDisp(y.normal) png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV"))) plotBCV(y.normal, main = paste0(cName, "\nedgeR - plotBCV")) dev.off() et.normal <- exactTest(y.normal, pair = c(sampleNameI, sampleNameII)) topTags(et.normal) # adjust.method ==> character string specifying p-value adjustment method. Possible values are "none", "BH", "fdr" (equivalent to "BH"), "BY" and "holm". See p.adjust for details. summary(de.normal <- decideTestsDGE(et.normal, p = pvt, adjust = "BH")) et.normal.fdr <- cbind(et.normal$table, p.adjust(et.normal$table[, 3], method = "BH")) et.normal.fdr <- et.normal.fdr[et.normal.fdr[, 4] < pvt, ] write.table(et.normal.fdr, file = paste0(outdir, cName, "_", nrow(et.normal.fdr), "_edgeR_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) detags.normal <- rownames(de.normal)[as.logical(de.normal)] png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotSmear"))) plotSmear(et.normal, de.normal, tags = detags.normal, main = paste0(cName, "\nedgeR - plotSmear")) abline(h = c(-2, 2), col = "blue") dev.off() # edgeR 'GLM' y.glm <- DGEList(counts = counts, group = group) design <- model.matrix(~0 + group, data = y.glm$samples) design y.glm <- estimateGLMCommonDisp(y.glm, design) y.glm <- estimateGLMTrendedDisp(y.glm, design) ## Loading required package: splines y.glm <- estimateGLMTagwiseDisp(y.glm, design) png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV - GLM"))) plotBCV(y.glm, main = paste0(cName, "\nedgeR - plotBCV - GLM")) dev.off() y.fit <- glmFit(y.glm, design) y.lrt <- glmLRT(y.fit) FDR <- p.adjust(y.lrt$table$PValue, method = "BH") sum(FDR < pvt) topTags(y.lrt) summary(de.glm <- decideTestsDGE(y.lrt, p = pvt, adjust = "BH")) } DESeq2Function <- function(odir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList) { library(DESeq2) library(RColorBrewer) library(gplots) library(vsn) outdir <- paste0(odir, "/",DESeq2Dir,"/") # set outdir dir.create(outdir, showWarnings = FALSE) dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" pvt <- pvtDESeq2 # DESeqDataSet(se, design, ignoreRank = FALSE) # DESeqDataSetFromMatrix(countData, colData, design, tidy = FALSE, ignoreRank = FALSE, ...) # DESeqDataSetFromHTSeqCount(sampleTable, directory = ".", design, ignoreRank = FALSE, ...) # DESeqDataSetFromTximport(txi, colData, design, ...) coldata = data.frame(row.names = colnames(RTableInFiles), condition = sampleNameList) ddsHTSeq <- DESeqDataSetFromMatrix(countData = RTableInFiles, colData = coldata, design = ~ condition) colData(ddsHTSeq)$condition <- factor(colData(ddsHTSeq)$condition, levels = c(sampleNameI, sampleNameII)) ddsHTSeq <- DESeq(ddsHTSeq) ddsHTSeq.res <- results(ddsHTSeq) ddsHTSeq.res <- ddsHTSeq.res[order(ddsHTSeq.res$padj), ] head(ddsHTSeq.res) # png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - plotMA"))) # plotMA(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotMA"), ylim = c(-2, 2)) # dev.off() ddsHTSeq.res.fdr <- ddsHTSeq.res[!is.na(ddsHTSeq.res$padj), ] ddsHTSeq.res.fdr <- ddsHTSeq.res.fdr[ddsHTSeq.res.fdr$padj < pvt, ] write.table(as.matrix(ddsHTSeq.res.fdr), file = paste0(outdir, cName, "_", nrow(as.matrix(ddsHTSeq.res.fdr)), "_DESeq2_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) ddsHTSeq.rld <- rlogTransformation(ddsHTSeq, blind = TRUE) ddsHTSeq.vsd <- varianceStabilizingTransformation(ddsHTSeq, blind = TRUE) par(mfrow = c(1, 3)) notAllZero <- (rowSums(counts(ddsHTSeq)) > 0) png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.1"))) meanSdPlot(log2(counts(ddsHTSeq, normalized = TRUE)[notAllZero, ] + 1)) # , ylim = c(0, 2.5) dev.off() png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.2"))) # , ylim = c(0, 2.5) meanSdPlot(assay(ddsHTSeq.rld[notAllZero, ])) dev.off() png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.3"))) # , ylim = c(0, 2.5) meanSdPlot(assay(ddsHTSeq.vsd[notAllZero, ])) dev.off() hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100) distRL <- dist(t(assay(ddsHTSeq.rld))) mat <- as.matrix(distRL) rownames(mat) <- colnames(mat) <- with(colData(ddsHTSeq), paste0(condition)) # png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - heatmap"))) # heatmap(mat, trace = "none", col = rev(hmcol), margin = c(13, 13)) # dev.off() png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - PCA"))) print(plotPCA(ddsHTSeq.rld, intgroup = "condition")) dev.off() png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - DispEsts"))) plotDispEsts(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotDispEsts")) dev.off() } NOISeqFunction <- function(odir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames) { library(NOISeq) outdir <- paste0(odir, "/",NOISeqDir,"/") # set outdir dir.create(outdir, showWarnings = FALSE) dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" pvt <- pvtNOISeq counts <- RTableInFiles[, columnPos] head(counts) # must be changed if there are no triplicates # !! Please change this setting if needed !! myfactors <- data.frame(Treatment = sampleNameList, TreatmentRun = SampleNames) # sort counts by row names mycounts <- counts[order(rownames(counts)), ] # filter out low counts mycounts.filter <- filtered.data(mycounts, factor = myfactors$Treatment, norm = FALSE, depth = NULL, method = 1, cv.cutoff = 100, cpm = 1) mycounts.rowname <- rownames(mycounts.filter) # Extra file with length and GC content myLengthTable <- read.table(LenGCTable, sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names=1) myLengthTable <- myLengthTable[order(rownames(myLengthTable)),] # sort by rownames myLengthTable.filter <- myLengthTable[mycounts.rowname,] mylength <- myLengthTable.filter[,myLenPos] names(mylength) <- rownames(myLengthTable.filter) mygc <- myLengthTable.filter[,myGCPos] names(mygc) <- rownames(myLengthTable.filter) mydata <- readData(data = mycounts.filter, length = mylength, gc = mygc, factors = myfactors) mydata mydata.GCbias <- dat(mydata, type = "GCbias") #-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.1"))) #-# explo.plot(mydata.GCbias) #-# dev.off() mydata.GCbias.group <- dat(mydata, factor = "Treatment", type = "GCbias") #-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.2"))) #-# explo.plot(mydata.GCbias.group) #-# dev.off() show(mydata.GCbias.group) mydata.lengthbias <- dat(mydata, type = "lengthbias") #-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.1"))) #-# explo.plot(mydata.lengthbias) #-# dev.off() mydata.lengthbias.group <- dat(mydata, factor = "Treatment", type = "lengthbias") #-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.2"))) #-# explo.plot(mydata.lengthbias.group) #-# dev.off() show(mydata.lengthbias.group) # looks different mydata.cd <- dat(mydata, type = "cd") #-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - cd"))) #-# explo.plot(mydata.cd) #-# dev.off() mylength myRPKM <- NOISeq::rpkm(assayData(mydata)$exprs) myUQUA <- NOISeq::uqua(assayData(mydata)$exprs, long = mylength, lc = 0.5, k = 0) myTMM <- NOISeq::tmm(assayData(mydata)$exprs, long = 1000, lc = 0) mynoiseq.rpkm <- noiseq(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") mynoiseq.nnorm <- noiseq(mydata, k = 0.5, norm = "n", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") mynoiseq.tmm <- noiseq(mydata, k = 0.5, norm = "tmm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") # RPKM mynoiseq.rpkm.prob <- mynoiseq.rpkm@results[[1]][mynoiseq.rpkm@results[[1]]$prob > pvt, ] mynoiseq.rpkm.prob <- mynoiseq.rpkm.prob[!is.na(mynoiseq.rpkm.prob$prob), ] write.table(mynoiseq.rpkm.prob, file = paste0(outdir, cName, "_", nrow(mynoiseq.rpkm.prob), "_NOISeq_rpkm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) # none mynoiseq.nnorm.prob <- mynoiseq.nnorm@results[[1]][mynoiseq.nnorm@results[[1]]$prob > pvt, ] mynoiseq.nnorm.prob <- mynoiseq.nnorm.prob[!is.na(mynoiseq.nnorm.prob$prob), ] write.table(mynoiseq.nnorm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.nnorm.prob), "_NOISeq_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) # TMM mynoiseq.tmm.prob <- mynoiseq.tmm@results[[1]][mynoiseq.tmm@results[[1]]$prob > pvt, ] mynoiseq.tmm.prob <- mynoiseq.tmm.prob[!is.na(mynoiseq.tmm.prob$prob), ] write.table(mynoiseq.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.tmm.prob), "_NOISeq_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) ############ NOISeq bio ################ pvtnsb <- 0.995 a0per_value <- 0.99 iterations <- 5 x5 <- c(sample(10000:99999, iterations, replace=F), 12345) x5 collector.rpkm <- NULL collector.tmm <- NULL # for (iter in x5) { # # RPKM # mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE) # mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ] # mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ] # collector.rpkm <- rbind(collector.rpkm, mynoiseqbio.rpkm.prob) # #collector <- rbind(collector, mynoiseqbio.rpkm.prob[rownames(mynoiseqbio.rpkm.prob) %in% rownames(collector),]) # # TMM # mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE) # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] # collector.tmm <- rbind(collector.tmm, mynoiseqbio.tmm.prob) # } # # # RPKM # print(sort(row.names(collector.rpkm))) # library(stringr) # mynoiseqbio.rpkm.prob.overlap <- collector.rpkm[str_split_fixed(row.names(collector.rpkm), "\\.",2)[,2]>=iterations+10,] # row.names(mynoiseqbio.rpkm.prob.overlap) <- substr(row.names(mynoiseqbio.rpkm.prob.overlap),1,str_length(row.names(mynoiseqbio.rpkm.prob.overlap))-1) # # write.table(mynoiseqbio.rpkm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob.overlap),"_NOISeqbio_rpkm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) # # #TMM # print(sort(row.names(collector.tmm))) # library(stringr) # mynoiseqbio.tmm.prob.overlap <- collector.tmm[str_split_fixed(row.names(collector.tmm), "\\.",2)[,2]>=iterations+10,] # row.names(mynoiseqbio.tmm.prob.overlap) <- substr(row.names(mynoiseqbio.tmm.prob.overlap),1,str_length(row.names(mynoiseqbio.tmm.prob.overlap))-1) # # write.table(mynoiseqbio.tmm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob.overlap),"_NOISeqbio_tmm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) # # none # mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm@results[[1]][mynoiseqbio.nnorm@results[[1]]$prob > pvtnsb, ] # mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm.prob[!is.na(mynoiseqbio.nnorm.prob$prob), ] # write.table(mynoiseqbio.nnorm.prob, file = paste0(outdir, cName, "_NOISeqbio_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) # # TMM # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] # write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName, "_NOISeqbio_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiorpkm"))) mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) dev.off() mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ] mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ] write.table(mynoiseqbio.rpkm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob),"_NOISeqbio_rpkm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiontmm"))) mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) dev.off() mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob),"_NOISeqbio_tmm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbionnorm"))) mynoiseqbio.nnorm <- noiseqbio(mydata, k = 0.5, norm = "n", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) dev.off() head(mynoiseqbio.rpkm@results[[1]]) head(mynoiseqbio.nnorm@results[[1]]) mynoiseqbio.rpkm.deg <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = NULL) mynoiseqbio.rpkm.deg.up <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "up") mynoiseqbio.rpkm.deg.down <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "down") mynoiseqbio.nnorm.deg <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = NULL) mynoiseqbio.nnorm.deg.up <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "up") mynoiseqbio.nnorm.deg.down <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "down") par(mfrow = c(3, 2)) png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9"))) DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "expr", log.scale = TRUE) dev.off() png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.8"))) DE.plot(mynoiseq.rpkm, q = 0.8, graphic = "expr", log.scale = TRUE) dev.off() png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.9"))) DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "expr", log.scale = TRUE) dev.off() png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.8"))) DE.plot(mynoiseq.nnorm, q = 0.8, graphic = "expr", log.scale = TRUE) dev.off() png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.9"))) DE.plot(mynoiseq.tmm, q = 0.9, graphic = "expr", log.scale = TRUE) dev.off() png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.8"))) DE.plot(mynoiseq.tmm, q = 0.8, graphic = "expr", log.scale = TRUE) dev.off() par(mfrow = c(2, 2)) png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvt))) DE.plot(mynoiseqbio.rpkm, q = pvt, graphic = "expr", log.scale = TRUE) dev.off() png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvtnsb))) DE.plot(mynoiseqbio.rpkm, q = pvtnsb, graphic = "expr", log.scale = TRUE) dev.off() png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvt))) DE.plot(mynoiseqbio.nnorm, q = pvt, graphic = "expr", log.scale = TRUE) dev.off() png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvtnsb))) DE.plot(mynoiseqbio.nnorm, q = pvtnsb, graphic = "expr", log.scale = TRUE) dev.off() par(mfrow = c(1, 2)) png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9_MD"))) DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "MD", log.scale = TRUE) dev.off() png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorm_0.9_MD"))) DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "MD", log.scale = TRUE) dev.off() } ########################################### ## additional function for more usability # ########################################### calcRPKM <- function(c,n,l) { RPKM <- (10^9* as.numeric(c)) / (as.numeric(n) * as.numeric(l)) return(RPKM) } RPKMMatrix <- function(InTable,directory,LenGCTable,myLenPos) { myLengthTable <- read.table(paste0(directory,LenGCTable), sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names = 1) newtable <- matrix(data = c(1), nrow = nrow(InTable), ncol(InTable), byrow= TRUE) colnames(newtable) <- colnames(InTable) rownames(newtable) <- rownames(InTable) sumCol <- apply(InTable,2,sum) # list of the all col sums for(s in colnames(InTable)) { for(r in rownames(InTable)) { w <- InTable[r, s] len <- myLengthTable[r, myLenPos] sum <- as.numeric(sumCol[s]) newtable[r, s] <- calcRPKM(w, sum, len) } } return(newtable) } # rmHTSeqLines removes the last five rows rmHTSeqLines <- function(counts) { counts <- head(counts, -5) return(counts) } ############### ## functions ## ############### # split list to a factor list split2list <- function(tmp) { xx <- c() for(i in 1:length(tmp)) { if(grepl("\\.", tmp[i])) { foo <- strsplit(tmp[i], '\\.')[[1]] fooo <- strsplit(foo, " ")[[1]] xx <- c(xx, fooo) } else { xx <- c(xx, tmp[i]) } } return(as.factor(xx)) } # makeGroups # https://github.com/drisso/RUVSeq/blob/master/R/helper.R makeGroups <- function(xs) { xs <- factor(xs) groups <- matrix(-1, nrow = length(levels(xs)), ncol = max(table(xs))) for (i in 1:length(levels(xs))) { idxs <- which(xs == levels(xs)[i]) groups[i,1:length(idxs)] <- idxs } return(groups) } # The function makeSampleList creates a list of the columns(samples) you would like to compare makeSampleList <- function(A, Al, B, Bl) { l <- c() if(exists("Al", mode = "numeric")) { l <- makeSampleList.helper(A, Al, l) l <- makeSampleList.helper(B, Bl, l) } return(l) } makeSampleList.helper <- function(v,k,l) { for(h in 1:k) { if(h == 1) { l <- c(l, v) } else { l <- c(l, paste0(v,".",h-1)) } } return(l) } # function creates a list of numbers out of another list. The List is the index of the given list. makeIndexList <- function(l) { n <- 1 jj <- c() for(v in l) { jj <- c(jj, n) n <- n + 1 } return(jj) } intersection <- function(l,check,t) { l.list <- list.files(path = l, pattern = "*(rpkm|edgeR|DESeq2).*tsv", recursive = TRUE) if(grepl("bio", l.list[4],fixed = TRUE)) { l.list.noiseq <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1) l.list.noiseqbio <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1) } else { l.list.noiseq <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1) l.list.noiseqbio <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1) } l.list.edgeR <- read.table(paste0(l,"/",l.list[2]), header=T, sep="\t", row.names=1) l.list.deseq2 <- read.table(paste0(l,"/",l.list[1]), header=T, sep="\t", row.names=1) print("DEGs edgeR") print(nrow(l.list.edgeR)) print("DEGs deseq2") print(nrow(l.list.deseq2)) print("DEGs noiseq") print(nrow(l.list.noiseq)) print("DEGs noiseqbio") print(nrow(l.list.noiseqbio)) interect.list <- "" interect.list.direct <- "" if(t==4) { # List of overlaps without direction check interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), intersect(rownames(l.list.noiseqbio), rownames(l.list.deseq2)))) print("DEGs intersection without direction check (four methods)") print(length(interect.list)) print(interect.list) print("DEGs intersection with direction check") # List of overlaps with direction check if(length(interect.list)!=0) { interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, l.list.noiseqbio, interect.list) print(length(interect.list.direct)) print(interect.list.direct) } else { print("0") } } if(t==3) { # List of overlaps without direction check interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), rownames(l.list.deseq2))) print("DEGs intersection without direction check (three methods)") print(length(interect.list)) print(interect.list) print("DEGs intersection with direction check") # List of overlaps with direction check if(length(interect.list)!=0) { # fake data frame with 0 rows => nrow == 0 df_fake <- data.frame(Date=as.Date(character()), File=character(), User=character(), stringsAsFactors=FALSE) interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, df_fake, interect.list) print(length(interect.list.direct)) print(interect.list.direct) } else { print("0") } } if(check == 1){ return(l.list.noiseq[interect.list.direct,]) } else { return(l.list.noiseq[interect.list,]) } } interDirect <- function(e,d,n,nb,i) { if(nrow(nb) == 0) { nbx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) # use noiseq twice to simulate noiseqbio nby <- paste0(nbx[,"M"],rownames(nbx)) } else { nbx <- apply(nb[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) nby <- paste0(nbx[,"log2FC"],rownames(nbx)) } ex <- apply(e[i,], 2, function(x) {ifelse(x < 0, "-", "+")}) dx <- apply(d[i,], 2, function(x) {ifelse(x < 0, "-", "+")}) nx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) # twisted, because edgeR and DESeq2 are calculating vice versa ey <- paste0(ex[,"logFC"],rownames(ex)) dy <- paste0(dx[,"log2FoldChange"],rownames(dx)) ny <- paste0(nx[,"M"],rownames(nx)) interect.list.direct <- intersect(ey, intersect(dy, intersect(ny, nby))) return(substr(interect.list.direct,2,100)) } ########################################### ## executiv part of the script # ########################################### for(SampleNames in sampleLists) { workflow(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq) } print("Thank you for using the DEG pipeline :) \n") # fin ### compare DEGs analysis ### # used venny and Excel