Skip to content
Snippets Groups Projects
Commit 35149420 authored by Saskia Hiltemann's avatar Saskia Hiltemann
Browse files

add Fabian's R scripts

parent 8ebefab1
No related branches found
No related tags found
No related merge requests found
# 3d R plot
saveat <- "/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Hoecker"
file.rpkm <- '/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Hoecker/31315.p.sort.rpkm'
## Windows ###
#-#saveat <- "Q:/haasf/leubner_RNA-seq/Celery/DEGs_0.9/"
#-#file.rpkm <- 'Q:/haasf/leubner_RNA-seq/Celery/DEGs/Celery_merged_isoforms.p.sort.rpkm'
#-#file.count <- 'Q:/haasf/leubner_RNA-seq/Celery/DEGs_0.9/Celery_merged_isoforms.p.sort.counts'
###
data.rpkm <- read.table(file.rpkm, header=T, sep="\t", row.names=1)
# sort by colnames
data.rpkm <- data.rpkm[,order(colnames(data.rpkm))]
librariesName <- list(
cop_D = c("cop_D", "red"),
cop_L = c("cop_L", "blue"),
spa_D = c("spa_D", "green"),
spa_L = c("spa_L", "yellow"),
WT_D = c("WT_D", "black"),
WT_L = c("WT_L", "violet")
)
#
# header.ori <- c("56754_WT_Naturally_3.bam.sort.fastq.unmapped.sam.sort.bam", "56753_WT_Naturally_2.bam.sort.fastq.unmapped.sam.sort.bam", "56752_WT_Naturally_1.bam.sort.fastq.unmapped.sam.sort.bam", "56751_tt_6_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56750_tt_6_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56749_tt_6_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56748_tt_0_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56747_tt_0_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56746_tt_0_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56745_WT_10_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56744_WT_10_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56743_WT_10_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56742_WT_6_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56741_WT_6_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56740_WT_6_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56739_WT_0_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56738_WT_0_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56737_WT_0_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56736_tt_T1_tt_6d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56735_tt_T1_tt_6d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56734_tt_T1_tt_6d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56733_tt_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56732_tt_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56731_tt_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56730_tt_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56729_tt_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56728_tt_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56727_WT_T1_NA_3.bam.sort.fastq.unmapped.sam.sort.bam", "56726_WT_T1_NA_2.bam.sort.fastq.unmapped.sam.sort.bam", "56725_WT_T1_NA_1.bam.sort.fastq.unmapped.sam.sort.bam", "56724_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56723_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56722_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56721_WT_T1_WT_10d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56720_WT_T1_WT_10d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56719_WT_T1_WT_10d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56718_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56717_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56716_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56715_WT_T1_WT_6d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56714_WT_T1_WT_6d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56713_WT_T1_WT_6d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56712_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56711_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56710_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56709_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56708_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56707_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56706_WT_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56705_WT_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56704_WT_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam")
# header.new <- c("T17_Dry_Naturally_Freezedryseed", "T17_Dry_Naturally_Freezedryseed.1", "T17_Dry_Naturally_Freezedryseed.2", "T16_Dry_6days_Freezeafter6dageing", "T16_Dry_6days_Freezeafter6dageing.1", "T16_Dry_6days_Freezeafter6dageing.2", "T15_Dry_0days_Freezedryseed", "T15_Dry_0days_Freezedryseed.1", "T15_Dry_0days_Freezedryseed.2", "T14_Dry_10days_Freezeafter10dageing", "T14_Dry_10days_Freezeafter10dageing.1", "T14_Dry_10days_Freezeafter10dageing.2", "T13_Dry_6days_Freezeafter6dageing", "T13_Dry_6days_Freezeafter6dageing.1", "T13_Dry_6days_Freezeafter6dageing.2", "T12_Dry_0days_Freezedryseed", "T12_Dry_0days_Freezedryseed.1", "T12_Dry_0days_Freezedryseed.2", "T11_Imbibe_6days_12h", "T11_Imbibe_6days_12h.1", "T11_Imbibe_6days_12h.2", "T10_Imbibe_6days_5h", "T10_Imbibe_6days_5h.1", "T10_Imbibe_6days_5h.2", "T9_Imbibe_0dyas_5h", "T9_Imbibe_0dyas_5h.1", "T9_Imbibe_0dyas_5h.2", "T8_Imbibe_Naturally_47h", "T8_Imbibe_Naturally_47h.1", "T8_Imbibe_Naturally_47h.2", "T7_Imbibe_Naturally_24h", "T7_Imbibe_Naturally_24h.1", "T7_Imbibe_Naturally_24h.2", "T6_Imbibe_10days_72h", "T6_Imbibe_10days_72h.1", "T6_Imbibe_10days_72h.2", "T5_Imbibe_10days_24h", "T5_Imbibe_10days_24h.1", "T5_Imbibe_10days_24h.2", "T4_Imbibe_6days_47h", "T4_Imbibe_6days_47h.1", "T4_Imbibe_6days_47h.2", "T3_Imbibe_6days_24h", "T3_Imbibe_6days_24h.1", "T3_Imbibe_6days_24h.2", "T2_Imbibe_0days_24h", "T2_Imbibe_0days_24h.1", "T2_Imbibe_0days_24h.2", "T1_Imbibe_0days_5h", "T1_Imbibe_0days_5h.1", "T1_Imbibe_0days_5h.2")
#
# header.new <- header.new[order(header.ori)]
# header.ori <- header.ori[order(header.ori)]
#
# col.header <- header.new
#
# colnames(data.rpkm) <- col.header
library("DESeq2")
library("ggplot2")
library("RColorBrewer")
library("pheatmap")
library("BiocGenerics")
library("rgl")
library("magick")
library("sjmisc")
################### running ######################
### PCA RPKM ###
set.seed(0)
data.inv <- t(data.rpkm)
data.dist <-dist(data.inv, method="euclidean") # "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski"
data.dist.hc <- hclust(data.dist,method="ward.D2")
data.dist.pca <-princomp(data.dist,cor=T)
pc1 <- data.dist.pca$scores[,1]
pc2 <- data.dist.pca$scores[,2]
pc3 <- data.dist.pca$scores[,3]
# create data frame for pc1 pc2 pc3
data.dist.pca.frame = data.frame(pc1,pc2,pc3)
rownames(data.dist.pca.frame) <- names(data.dist.pca$scale)
colnames(data.dist.pca.frame) <- c("pc1","pc2","pc3")
condition.values <- c()
condition.values.color <- c()
for(a in colnames(data.rpkm)) {
v <- substr(a, nchar(a), nchar(a))
if(str_contains(v, c(1,2,3,4,5,6,7,8,9,0), logic = "or")) {
if (substr(substr(a, 1, nchar(a)-2), nchar(substr(a, 1, nchar(a)-2)), nchar(substr(a, 1, nchar(a)-2))) == ".") {
n <- substr(a, 1, nchar(a)-3)
} else {
n <- substr(a, 1, nchar(a)-2)
}
} else {
n <- a
}
condition.values <- c(condition.values, librariesName[n][[1]][1])
condition.values.color <- c(condition.values.color, librariesName[n][[1]][2])
}
data.dist.pca.frame["tissue"] <- condition.values
data.dist.pca.frame["color"] <- condition.values.color
data.dist.pca.frame["name"] <- names(data.dist.pca$scale)
attr(data.dist.pca.frame, "percentVar") <- (data.dist.pca$sdev)^2 / sum(data.dist.pca$sdev^2) # cumsum()
# simple plot
png(filename=paste0(saveat, "/HC_RPKM_normalized.png"))
plot(data.dist.hc) # hc plot
dev.off()
png(filename=paste0(saveat, "/PCA_variance_RPKM_normalized.png"))
plot(data.dist.pca) # variances; var(data.dist.pca$sdev[1:9])
dev.off()
# get the parcent variation
percentVar <- round(100 * attr(data.dist.pca.frame, "percentVar"))
# 3d plot
plot3d(pc1, pc2, pc3,
type = "s", # p, s, l, h, n
#pch = c(1:3),
col = condition.values.color,
size = 1,
xlab = paste0("PC1: ",percentVar[1],"% variance"),
ylab = paste0("PC2: ",percentVar[2],"% variance"),
zlab = paste0("PC3: ",percentVar[3],"% variance"),
cex = 2,
main = "", # -> princomp",
)
# shift <- matrix(4, 4, 4, byrow = TRUE)
# text3d(shift,texts=1:3)
grid3d(c("x", "y", "z"))
## add legend
legend3d("right", unique(condition.values), pch = 19, col = unique(condition.values.color))
#### video #####
M <- par3d("userMatrix")
play3d( par3dinterp( userMatrix=list(M,
rotate3d(M, pi/2, 1, 0, 0),
rotate3d(M, pi/2, 0, 1, 0) ) ),
duration=2 )
movie3d(spin3d(axis = c(1, 2, 1)), duration = 5,
dir = saveat)
#### video end ####
# pc1, pc2
png(filename=paste0(saveat, "/PCA_RPKM_normalized.png"))
ggplot(
data.dist.pca.frame,
aes(
pc1,
pc2,
color=tissue)
) +
geom_point(size=2.5) +
xlab(
paste0("PC1: ",percentVar[1],"% variance")
) +
ylab(
paste0("PC2: ",percentVar[2],"% variance")
) +
#theme() + #, face="bold"
scale_colour_manual(
values= c("red", "blue", "green", "yellow", "black", "violet") # dodgerblue3
) +
ggtitle("PCA of all samples (RPKM normalized)") +
theme(
plot.title = element_text(lineheight=.8),
panel.background = element_rect(fill = "gray95")
)
dev.off()
###############################################################
## DEG pipeline with edgeR, DESeq2 and NOISeq #
## Modified from the template of Dr. Kristian Ullrich 2014 #
## @fabianhaas, AG Rensing, Marburg, Jan 2016 #
###############################################################
# The pipeline expects sample triplicates.
# If you don't have three samples of one condition, use face files with zero counts.
# RPKM calculation is optional.
# Additional program to sum up the tsv files: /mnt/nas_temp/home/haasf/Program/sumup_htseq_counts.pl folder file_prefix
###########################################
## please define your variables #
###########################################
saveat <- "/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Hoecker/DEGs_0.9"
file <- '/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Hoecker/31315.p.sort.counts'
# Table with the length and the GC content
LenGCTable <- "/mnt/NAS_coruscant_datashare/haasf/GeneAtlas_RNASeq/DEG_2nd_round/cosmoss_V3.3.release_2016-07-22.IsoformV31.fasta.length_GC_Contig_Daten_for_34840_1nd.csv"
# Column position of the gene name and the width/gc content (first column is gene name!)
myLenPos <- 1
myGCPos <- 2
#
# hoo <- c("","")
# sampleLists <- c()
# for(a in hoo) {
# for(b in hoo) {
# if(a!=b) {
# tmp <- c(a,b)
# sampleLists <- c(sampleLists, list(tmp))
# }
# }
# }
sampleLists <- list(c("cop_D", "cop_L"),
c("cop_D", "spa_D"),
c("cop_D", "spa_L"),
c("cop_D", "WT_D"),
c("cop_D", "WT_L"),
c("cop_L", "spa_D"),
c("cop_L", "spa_L"),
c("cop_L", "WT_D"),
c("cop_L", "WT_L"),
c("spa_D", "spa_L"),
c("spa_D", "WT_D"),
c("spa_D", "WT_L"),
c("spa_L", "WT_D"),
c("spa_L", "WT_L"),
c("WT_D", "WT_L")
)
# Do you want a RPKM value table (1/0)
# Be warned, it takes a while to do this. approx. 10 minutes for 35000 genes and 6 samples.
rpkm <- 0
# Are the data original HTSeq files? yes <- 1 ; no <- 0
htseq <- 0
# threshold settings for the R packages
#edgeR
pvtedgeR <- 0.001
#DESeq2
pvtDESeq2 <- 0.001
#NOISeq
pvtNOISeq <- 0.9
# Output folder names
edgeRDir <- "edgeR"
DESeq2Dir <- "DESeq2"
NOISeqDir <- "NOISeq"
#########################
## stop editing here!! ##
#########################
###########################################
## executiv function #
###########################################
workflow <- function(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq)
{
# setwd(workingFolder) # set workspace
sampleNameI <- SampleNames[1]
sampleNameII <- SampleNames[2]
# Comparison name
cName <- paste(sampleNameI, sampleNameII, sep = " vs. ")
filename <- strsplit(file, "/")
filename <- filename[[1]][length(filename[[1]])] # long but it's working
sampleName <- gsub(", ","_",toString(SampleNames))
samples <- SampleNames
# create output dir
dir.create(saveat, showWarnings = FALSE)
outdir <- paste0(saveat,"/",sampleName)
dir.create(outdir)
# Start the log file
sink(paste0(outdir, "/DEGpipe_",cName, ".log"))
# print parameter into the log file
print("set parameters:")
print(paste0("Counts file:", file))
print(paste0("Results stored at:", saveat))
print(paste0("SampleNames: ",SampleNames))
print(paste0("LenGCTable: ",LenGCTable))
print(paste0("myLenPos: ",myLenPos))
print(paste0("myGCPos: ",myGCPos))
print(paste0("rpkm: ",rpkm))
print(paste0("htseq: ",htseq))
print(paste0("pvtedgeR: ",pvtedgeR))
print(paste0("pvtDESeq2: ",pvtDESeq2))
print(paste0("pvtNOISeq: ",pvtNOISeq))
print(paste0("edgeRDir: ",edgeRDir))
print(paste0("DESeq2Dir: ",DESeq2Dir))
print(paste0("NOISeqDir: ",NOISeqDir))
# session and package infos for the log file
print(sessionInfo())
print(Sys.info())
# load spike-in count tables
data0 <- read.table(file, header=T, sep="\t", row.names=1)
# just if the gene ids not ending with .1 (it is nessessery for NOISeqbio iteration step)
#row.names(data0) <- paste0(row.names(data0), ".1")
# remove HTSeq rows like "__no_feature"
data0 <- data0[ with(data0, !grepl('__', rownames(data0))) , ]
data0 <- data0[ with(data0, !grepl('ERCC', rownames(data0))) , ]
#data0 <- data0[ , with(data0, !grepl('X.1', colnames(data0)))]
#rename col names (library name to roman number)
# librariesName <- c("56754_WT_Naturally_3.bam.sort.fastq.unmapped.sam.sort.bam", "56753_WT_Naturally_2.bam.sort.fastq.unmapped.sam.sort.bam", "56752_WT_Naturally_1.bam.sort.fastq.unmapped.sam.sort.bam", "56751_tt_6_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56750_tt_6_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56749_tt_6_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56748_tt_0_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56747_tt_0_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56746_tt_0_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56745_WT_10_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56744_WT_10_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56743_WT_10_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56742_WT_6_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56741_WT_6_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56740_WT_6_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56739_WT_0_days_3.bam.sort.fastq.unmapped.sam.sort.bam", "56738_WT_0_days_2.bam.sort.fastq.unmapped.sam.sort.bam", "56737_WT_0_days_1.bam.sort.fastq.unmapped.sam.sort.bam", "56736_tt_T1_tt_6d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56735_tt_T1_tt_6d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56734_tt_T1_tt_6d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56733_tt_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56732_tt_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56731_tt_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56730_tt_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56729_tt_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56728_tt_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56727_WT_T1_NA_3.bam.sort.fastq.unmapped.sam.sort.bam", "56726_WT_T1_NA_2.bam.sort.fastq.unmapped.sam.sort.bam", "56725_WT_T1_NA_1.bam.sort.fastq.unmapped.sam.sort.bam", "56724_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56723_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56722_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56721_WT_T1_WT_10d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56720_WT_T1_WT_10d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56719_WT_T1_WT_10d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56718_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56717_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56716_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56715_WT_T1_WT_6d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56714_WT_T1_WT_6d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56713_WT_T1_WT_6d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56712_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56711_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56710_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56709_WT_T1_WT_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56708_WT_T1_WT_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56707_WT_T1_WT_0d_1.bam.sort.fastq.unmapped.sam.sort.bam", "56706_WT_T1_tt_0d_3.bam.sort.fastq.unmapped.sam.sort.bam", "56705_WT_T1_tt_0d_2.bam.sort.fastq.unmapped.sam.sort.bam", "56704_WT_T1_tt_0d_1.bam.sort.fastq.unmapped.sam.sort.bam")
# #librariesName <- c("80484_Celery_3_day_imbibed_embryo_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80483_Celery_3_day_imbibed_embryo_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80482_Celery_3_day_imbibed_embryo_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80481_Celery_1_day_imbibed_embryo_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80480_Celery_1_day_imbibed_embryo_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80479_Celery_1_day_imbibed_embryo_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80478_Celery_5_day_imbibed_whole_fruit_29C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80477_Celery_5_day_imbibed_whole_fruit_29C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80460_Celery_3_day_imbibed_whole_fruit_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80459_Celery_3_day_imbibed_whole_fruit_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80458_Celery_1_day_imbibed_whole_fruit_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80457_Celery_1_day_imbibed_whole_fruit_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80456_Celery_1_day_imbibed_whole_fruit_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80455_Celery_Dry_whole_fruit_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80454_Celery_Dry_whole_fruit_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80453_Celery_Dry_whole_fruit_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80452_Celery_5_day_imbibed_whole_fruit_29C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80451_Celery_5_day_imbibed_whole_fruit_100uM_ABA_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80450_Celery_5_day_imbibed_whole_fruit_100uM_ABA_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80449_Celery_5_day_imbibed_whole_fruit_100uM_ABA_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80448_Celery_5_day_imbibed_whole_fruit_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80447_Celery_5_day_imbibed_whole_fruit_20C_rep_2_2.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80446_Celery_5_day_imbibed_whole_fruit_20C_rep_1_1.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam", "80445_Celery_3_day_imbibed_whole_fruit_20C_rep_3_3.bam.sort.fastq.unmapped.concordant_uniqsort.bam_reads.Celery_genome_GG_trinity.500bp_new.sam.sort.bam.featureCounts.p.txt")
# romanNumber <- c("T17_Dry_Naturally_Freezedryseed", "T17_Dry_Naturally_Freezedryseed.1", "T17_Dry_Naturally_Freezedryseed.2", "T16_Dry_6days_Freezeafter6dageing", "T16_Dry_6days_Freezeafter6dageing.1", "T16_Dry_6days_Freezeafter6dageing.2", "T15_Dry_0days_Freezedryseed", "T15_Dry_0days_Freezedryseed.1", "T15_Dry_0days_Freezedryseed.2", "T14_Dry_10days_Freezeafter10dageing", "T14_Dry_10days_Freezeafter10dageing.1", "T14_Dry_10days_Freezeafter10dageing.2", "T13_Dry_6days_Freezeafter6dageing", "T13_Dry_6days_Freezeafter6dageing.1", "T13_Dry_6days_Freezeafter6dageing.2", "T12_Dry_0days_Freezedryseed", "T12_Dry_0days_Freezedryseed.1", "T12_Dry_0days_Freezedryseed.2", "T11_Imbibe_6days_12h", "T11_Imbibe_6days_12h.1", "T11_Imbibe_6days_12h.2", "T10_Imbibe_6days_5h", "T10_Imbibe_6days_5h.1", "T10_Imbibe_6days_5h.2", "T9_Imbibe_0dyas_5h", "T9_Imbibe_0dyas_5h.1", "T9_Imbibe_0dyas_5h.2", "T8_Imbibe_Naturally_47h", "T8_Imbibe_Naturally_47h.1", "T8_Imbibe_Naturally_47h.2", "T7_Imbibe_Naturally_24h", "T7_Imbibe_Naturally_24h.1", "T7_Imbibe_Naturally_24h.2", "T6_Imbibe_10days_72h", "T6_Imbibe_10days_72h.1", "T6_Imbibe_10days_72h.2", "T5_Imbibe_10days_24h", "T5_Imbibe_10days_24h.1", "T5_Imbibe_10days_24h.2", "T4_Imbibe_6days_47h", "T4_Imbibe_6days_47h.1", "T4_Imbibe_6days_47h.2", "T3_Imbibe_6days_24h", "T3_Imbibe_6days_24h.1", "T3_Imbibe_6days_24h.2", "T2_Imbibe_0days_24h", "T2_Imbibe_0days_24h.1", "T2_Imbibe_0days_24h.2", "T1_Imbibe_0days_5h", "T1_Imbibe_0days_5h.1", "T1_Imbibe_0days_5h.2")
# #romanNumber <- c("emb_3d_20.2", "emb_3d_20.1", "emb_3d_20", "emb_1d_20.2", "emb_1d_20.1", "emb_1d_20", "wf_5d_29.2", "wf_5d_29.1", "wf_3d_20.1", "wf_3d_20", "wf_1d_20.2", "wf_1d_20.1", "wf_1d_20", "wf_dry.2", "wf_dry.1", "wf_dry", "wf_5d_29", "wf_5d_ABA_20.2", "wf_5d_ABA_20.1", "wf_5d_ABA_20", "wf_5d_20.2", "wf_5d_20.1", "wf_5d_20", "wf_3d_20.2")
#
#
# for(i in 1:length(librariesName)) {
# colnames(data0)[colnames(data0)==grep(librariesName[i], names(data0), value = TRUE)] <- romanNumber[i]
# }
# creats a list of the column names/samples you would like to compare
SampleNames <- makeSampleList(sampleNameI, length(grep(paste0("^",sampleNameI,"[^A-Z]"), colnames(data0))) + 1,
sampleNameII, length(grep(paste0("^",sampleNameII,"[^A-Z]"), colnames(data0))) + 1)
print(SampleNames)
## mal anschauen als ersatz für das darüber ##
## newenw <- table.new[, grepl(paste(LIST,collapse="|"), colnames(table.new))]
# exctract the samples from the data0 if needed
if(length(SampleNames) > 0) {
RTableInFiles <- data0[,SampleNames]
data <- data0[,SampleNames]
}
print(head(RTableInFiles))
# sort by column name
# data <- data[ , order(names(data))] # dangeros, the order of query and referenz might be not the same
# see how many rows and columns are in the tables
print(paste("Raw data row number: ",nrow(data)))
print(paste("Raw data column number: ",ncol(data)))
# 2) A typical differential expression analysis workflow
# 2.1) Filtering and exploratory data analysis
# filter: One single gene/spike-in must have at least 5 counts in minimum 2 libraries
filter <- apply(data, 1, function(x) length(x[x>5])>=2)
dataFiltered <- data[filter,]
# see how many rows and columns are left
print(paste("Filtered data row number: ",nrow(dataFiltered)))
print(paste("Filtered data row number: ",ncol(dataFiltered)))
# filter gene names
genes <- rownames(dataFiltered)[grep("^Pp3", rownames(dataFiltered))]
# filter spike-in names
spikes <- rownames(dataFiltered)[grep("^ERCC", rownames(dataFiltered))]
# get the column name list as.factors
xtmp <- colnames(dataFiltered)
x <- split2list(xtmp)
sampleNameList <- gsub(".[1234567890]+$", "", SampleNames)
columnPos <- makeIndexList(SampleNames)
# run ...
### edgeR ###
print(paste0("edgeR is running with ", cName))
packageDescription("edgeR")
edgeRFunction(outdir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList)
#### DESeq2 ####
print(paste0("DESeq2 is running with ", cName))
packageDescription("DESeq2")
DESeq2Function(outdir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList)
### NOISeq ###
print(paste0("NOISeq is running with ", cName))
packageDescription("NOISeq")
NOISeqFunction(outdir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames)
### Overlap of all three methods ###
# intersection with DEG direction check => 1, without => 0
intersection_4.file <- intersection(outdir,1,4)
write.table(intersection_4.file, file = paste0(outdir, "/", cName,"_intersection_four_methods_",nrow(intersection_4.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
# edgeR, DESeq2, NOISeq
intersection_3.file <- intersection(outdir,1,3)
write.table(intersection_3.file, file = paste0(outdir, "/", cName,"_intersection_three_methods_",nrow(intersection_3.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
sink() # End of log file
}
####################
## main functions #
####################
plotGraphics <- function(data, librariesName, cName, outdir, romanNumber)
{
library("DESeq2")
library("ggplot2")
library("RColorBrewer")
library("pheatmap")
library("genefilter")
# create sample annotation lists
# sample.name.list <- gsub(".[12]", "", librariesName[colnames(data)])
sample.name.list <- gsub(".[12]", "", romanNumber)
sample.name.uniq <- unique(unlist(sample.name.list, use.names = FALSE))
col.data.condition = data.frame(row.names = librariesName, condition = sample.name.list)
# run DESeq2 functions
dds.matrix <- DESeqDataSetFromMatrix(countData = data, colData = col.data.condition, design = ~condition) # data must be integer! (Ganze Zahlen)
colData(dds.matrix)$condition <- factor(colData(dds.matrix)$condition, levels = sample.name.uniq)
dds.matrix.deseq <- DESeq(dds.matrix)
dds.matrix.res <- results(dds.matrix.deseq)
dds.matrix.res <- dds.matrix.res[order(dds.matrix.res$padj), ]
dds.matrix.res.fdr <- dds.matrix.res[!is.na(dds.matrix.res$padj), ]
dds.matrix.res.fdr <- dds.matrix.res.fdr[dds.matrix.res.fdr$padj < 0.0001, ]
dds.matrix.rld <- rlogTransformation(dds.matrix, blind = TRUE)
# data for PCA plot
pcaData <- plotPCA(dds.matrix.rld, intgroup = "condition", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
# color for samples
sample.colors <- c("gray60","#D73027","#4575B4", "purple", "orange","#113FF7","green", "black")
# plot the PCA
# https://cran.r-project.org/web/packages/ggfortify/vignettes/plot_pca.html
png(filename=paste0(outdir,"PCA_all_samples_DEseq2_normalized.png"))
ggplot(
pcaData,
aes(
PC1,
PC2,
color=condition,
shape=condition)
) +
scale_shape_manual(values = c(4, 1, 2, 3, 0, 5, 6, 7)) +
geom_point(size=2.5) +
xlab(
paste0("PC1: ",percentVar[1],"% variance")
) +
ylab(
paste0("PC2: ",percentVar[2],"% variance")
) +
theme() + #, face="bold"
scale_colour_manual(
values=sample.colors # dodgerblue3
) +
ggtitle("PCA of all samples (DESeq2 normalized)") +
theme(
plot.title = element_text(lineheight=.8),
panel.background = element_rect(fill = "gray95")
)
dev.off()
# how many genes will be involved in the heatmap calculation
numbGenes <- 100
# pheatmap data
topVarGenes <- head(order(rowVars(assay(dds.matrix.rld)),decreasing=TRUE),numbGenes)
mat <- assay(dds.matrix.rld)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
# pheatmap data annotation
anno <-as.data.frame(colData(dds.matrix.rld)[,c("condition")])
rownames(anno) <- colnames(data) # check out the row names of annotation
colnames(anno) <- "condition"
# pheatmap data annotation color
condition <- sample.colors
names(condition) <- sample.name.uniq
anno_colors <- list(condition = condition)
#heatmap(mat)
#pheatmap(mat)
# ftp://cran.r-project.org/pub/R/web/packages/pheatmap/pheatmap.pdf
png(filename=paste0(outdir,"HC_heatmap_all_samples_DESeq2_normalized.png"))
pheatmap(mat ,
annotation = anno,
annotation_colors = anno_colors,
fontsize = 10,
fontsize_row = 6,
fontsize_col = 10,
main = paste0("Heatmap of all samples (DESeq2 normalized) ", numbGenes," genes\n"),
annotation_names_col = F,
show_rownames = F,
cluster_rows = F,
border_color = F,
treeheight_col = 20
)
dev.off()
}
edgeRFunction <- function(odir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList)
{
library(edgeR)
outdir <- paste0(odir, "/",edgeRDir,"/") # set outdir
dir.create(outdir, showWarnings = FALSE) # create output folder
dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots"
comparison <- cName
counts <- RTableInFiles
pvt <- pvtedgeR
group <- factor(sampleNameList)
# edgeR 'normal'
y.normal <- DGEList(counts = counts, group = group)
y.normal[,1]
png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotMDS")))
plotMDS(y.normal, method = "bcv", main = paste0(cName, "\nedgeR - plotMDS"))
dev.off()
y.normal <- calcNormFactors(y.normal)
y.normal <- estimateCommonDisp(y.normal)
y.normal <- estimateTagwiseDisp(y.normal)
png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV")))
plotBCV(y.normal, main = paste0(cName, "\nedgeR - plotBCV"))
dev.off()
et.normal <- exactTest(y.normal, pair = c(sampleNameI, sampleNameII))
topTags(et.normal)
# adjust.method ==> character string specifying p-value adjustment method. Possible values are "none", "BH", "fdr" (equivalent to "BH"), "BY" and "holm". See p.adjust for details.
summary(de.normal <- decideTestsDGE(et.normal, p = pvt, adjust = "BH"))
et.normal.fdr <- cbind(et.normal$table, p.adjust(et.normal$table[, 3], method = "BH"))
et.normal.fdr <- et.normal.fdr[et.normal.fdr[, 4] < pvt, ]
write.table(et.normal.fdr, file = paste0(outdir, cName, "_", nrow(et.normal.fdr), "_edgeR_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
detags.normal <- rownames(de.normal)[as.logical(de.normal)]
png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotSmear")))
plotSmear(et.normal, de.normal, tags = detags.normal, main = paste0(cName, "\nedgeR - plotSmear"))
abline(h = c(-2, 2), col = "blue")
dev.off()
# edgeR 'GLM'
y.glm <- DGEList(counts = counts, group = group)
design <- model.matrix(~0 + group, data = y.glm$samples)
design
y.glm <- estimateGLMCommonDisp(y.glm, design)
y.glm <- estimateGLMTrendedDisp(y.glm, design)
## Loading required package: splines
y.glm <- estimateGLMTagwiseDisp(y.glm, design)
png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV - GLM")))
plotBCV(y.glm, main = paste0(cName, "\nedgeR - plotBCV - GLM"))
dev.off()
y.fit <- glmFit(y.glm, design)
y.lrt <- glmLRT(y.fit)
FDR <- p.adjust(y.lrt$table$PValue, method = "BH")
sum(FDR < pvt)
topTags(y.lrt)
summary(de.glm <- decideTestsDGE(y.lrt, p = pvt, adjust = "BH"))
}
DESeq2Function <- function(odir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList)
{
library(DESeq2)
library(RColorBrewer)
library(gplots)
library(vsn)
outdir <- paste0(odir, "/",DESeq2Dir,"/") # set outdir
dir.create(outdir, showWarnings = FALSE)
dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots"
pvt <- pvtDESeq2
# DESeqDataSet(se, design, ignoreRank = FALSE)
# DESeqDataSetFromMatrix(countData, colData, design, tidy = FALSE, ignoreRank = FALSE, ...)
# DESeqDataSetFromHTSeqCount(sampleTable, directory = ".", design, ignoreRank = FALSE, ...)
# DESeqDataSetFromTximport(txi, colData, design, ...)
coldata = data.frame(row.names = colnames(RTableInFiles), condition = sampleNameList)
ddsHTSeq <- DESeqDataSetFromMatrix(countData = RTableInFiles, colData = coldata, design = ~ condition)
colData(ddsHTSeq)$condition <- factor(colData(ddsHTSeq)$condition, levels = c(sampleNameI, sampleNameII))
ddsHTSeq <- DESeq(ddsHTSeq)
ddsHTSeq.res <- results(ddsHTSeq)
ddsHTSeq.res <- ddsHTSeq.res[order(ddsHTSeq.res$padj), ]
head(ddsHTSeq.res)
# png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - plotMA")))
# plotMA(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotMA"), ylim = c(-2, 2))
# dev.off()
ddsHTSeq.res.fdr <- ddsHTSeq.res[!is.na(ddsHTSeq.res$padj), ]
ddsHTSeq.res.fdr <- ddsHTSeq.res.fdr[ddsHTSeq.res.fdr$padj < pvt, ]
write.table(as.matrix(ddsHTSeq.res.fdr), file = paste0(outdir, cName, "_", nrow(as.matrix(ddsHTSeq.res.fdr)), "_DESeq2_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
ddsHTSeq.rld <- rlogTransformation(ddsHTSeq, blind = TRUE)
ddsHTSeq.vsd <- varianceStabilizingTransformation(ddsHTSeq, blind = TRUE)
par(mfrow = c(1, 3))
notAllZero <- (rowSums(counts(ddsHTSeq)) > 0)
png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.1")))
meanSdPlot(log2(counts(ddsHTSeq, normalized = TRUE)[notAllZero, ] + 1)) # , ylim = c(0, 2.5)
dev.off()
png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.2"))) # , ylim = c(0, 2.5)
meanSdPlot(assay(ddsHTSeq.rld[notAllZero, ]))
dev.off()
png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.3"))) # , ylim = c(0, 2.5)
meanSdPlot(assay(ddsHTSeq.vsd[notAllZero, ]))
dev.off()
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
distRL <- dist(t(assay(ddsHTSeq.rld)))
mat <- as.matrix(distRL)
rownames(mat) <- colnames(mat) <- with(colData(ddsHTSeq), paste0(condition))
# png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - heatmap")))
# heatmap(mat, trace = "none", col = rev(hmcol), margin = c(13, 13))
# dev.off()
png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - PCA")))
print(plotPCA(ddsHTSeq.rld, intgroup = "condition"))
dev.off()
png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - DispEsts")))
plotDispEsts(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotDispEsts"))
dev.off()
}
NOISeqFunction <- function(odir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames)
{
library(NOISeq)
outdir <- paste0(odir, "/",NOISeqDir,"/") # set outdir
dir.create(outdir, showWarnings = FALSE)
dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots"
pvt <- pvtNOISeq
counts <- RTableInFiles[, columnPos]
head(counts)
# must be changed if there are no triplicates
# !! Please change this setting if needed !!
myfactors <- data.frame(Treatment = sampleNameList, TreatmentRun = SampleNames)
# sort counts by row names
mycounts <- counts[order(rownames(counts)), ]
# filter out low counts
mycounts.filter <- filtered.data(mycounts, factor = myfactors$Treatment, norm = FALSE, depth = NULL, method = 1, cv.cutoff = 100, cpm = 1)
mycounts.rowname <- rownames(mycounts.filter)
# Extra file with length and GC content
myLengthTable <- read.table(LenGCTable, sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names=1)
myLengthTable <- myLengthTable[order(rownames(myLengthTable)),] # sort by rownames
myLengthTable.filter <- myLengthTable[mycounts.rowname,]
mylength <- myLengthTable.filter[,myLenPos]
names(mylength) <- rownames(myLengthTable.filter)
mygc <- myLengthTable.filter[,myGCPos]
names(mygc) <- rownames(myLengthTable.filter)
mydata <- readData(data = mycounts.filter, length = mylength, gc = mygc, factors = myfactors)
mydata
mydata.GCbias <- dat(mydata, type = "GCbias")
#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.1")))
#-# explo.plot(mydata.GCbias)
#-# dev.off()
mydata.GCbias.group <- dat(mydata, factor = "Treatment", type = "GCbias")
#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.2")))
#-# explo.plot(mydata.GCbias.group)
#-# dev.off()
show(mydata.GCbias.group)
mydata.lengthbias <- dat(mydata, type = "lengthbias")
#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.1")))
#-# explo.plot(mydata.lengthbias)
#-# dev.off()
mydata.lengthbias.group <- dat(mydata, factor = "Treatment", type = "lengthbias")
#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.2")))
#-# explo.plot(mydata.lengthbias.group)
#-# dev.off()
show(mydata.lengthbias.group)
# looks different
mydata.cd <- dat(mydata, type = "cd")
#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - cd")))
#-# explo.plot(mydata.cd)
#-# dev.off()
mylength
myRPKM <- NOISeq::rpkm(assayData(mydata)$exprs)
myUQUA <- NOISeq::uqua(assayData(mydata)$exprs, long = mylength, lc = 0.5, k = 0)
myTMM <- NOISeq::tmm(assayData(mydata)$exprs, long = 1000, lc = 0)
mynoiseq.rpkm <- noiseq(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological")
mynoiseq.nnorm <- noiseq(mydata, k = 0.5, norm = "n", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological")
mynoiseq.tmm <- noiseq(mydata, k = 0.5, norm = "tmm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological")
# RPKM
mynoiseq.rpkm.prob <- mynoiseq.rpkm@results[[1]][mynoiseq.rpkm@results[[1]]$prob > pvt, ]
mynoiseq.rpkm.prob <- mynoiseq.rpkm.prob[!is.na(mynoiseq.rpkm.prob$prob), ]
write.table(mynoiseq.rpkm.prob, file = paste0(outdir, cName, "_", nrow(mynoiseq.rpkm.prob), "_NOISeq_rpkm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
# none
mynoiseq.nnorm.prob <- mynoiseq.nnorm@results[[1]][mynoiseq.nnorm@results[[1]]$prob > pvt, ]
mynoiseq.nnorm.prob <- mynoiseq.nnorm.prob[!is.na(mynoiseq.nnorm.prob$prob), ]
write.table(mynoiseq.nnorm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.nnorm.prob), "_NOISeq_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
# TMM
mynoiseq.tmm.prob <- mynoiseq.tmm@results[[1]][mynoiseq.tmm@results[[1]]$prob > pvt, ]
mynoiseq.tmm.prob <- mynoiseq.tmm.prob[!is.na(mynoiseq.tmm.prob$prob), ]
write.table(mynoiseq.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.tmm.prob), "_NOISeq_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
############ NOISeq bio ################
pvtnsb <- 0.995
a0per_value <- 0.99
iterations <- 5
x5 <- c(sample(10000:99999, iterations, replace=F), 12345)
x5
collector.rpkm <- NULL
collector.tmm <- NULL
# for (iter in x5) {
# # RPKM
# mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE)
# mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ]
# mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ]
# collector.rpkm <- rbind(collector.rpkm, mynoiseqbio.rpkm.prob)
# #collector <- rbind(collector, mynoiseqbio.rpkm.prob[rownames(mynoiseqbio.rpkm.prob) %in% rownames(collector),])
# # TMM
# mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE)
# mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ]
# mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ]
# collector.tmm <- rbind(collector.tmm, mynoiseqbio.tmm.prob)
# }
#
# # RPKM
# print(sort(row.names(collector.rpkm)))
# library(stringr)
# mynoiseqbio.rpkm.prob.overlap <- collector.rpkm[str_split_fixed(row.names(collector.rpkm), "\\.",2)[,2]>=iterations+10,]
# row.names(mynoiseqbio.rpkm.prob.overlap) <- substr(row.names(mynoiseqbio.rpkm.prob.overlap),1,str_length(row.names(mynoiseqbio.rpkm.prob.overlap))-1)
#
# write.table(mynoiseqbio.rpkm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob.overlap),"_NOISeqbio_rpkm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
#
# #TMM
# print(sort(row.names(collector.tmm)))
# library(stringr)
# mynoiseqbio.tmm.prob.overlap <- collector.tmm[str_split_fixed(row.names(collector.tmm), "\\.",2)[,2]>=iterations+10,]
# row.names(mynoiseqbio.tmm.prob.overlap) <- substr(row.names(mynoiseqbio.tmm.prob.overlap),1,str_length(row.names(mynoiseqbio.tmm.prob.overlap))-1)
#
# write.table(mynoiseqbio.tmm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob.overlap),"_NOISeqbio_tmm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
# # none
# mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm@results[[1]][mynoiseqbio.nnorm@results[[1]]$prob > pvtnsb, ]
# mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm.prob[!is.na(mynoiseqbio.nnorm.prob$prob), ]
# write.table(mynoiseqbio.nnorm.prob, file = paste0(outdir, cName, "_NOISeqbio_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
# # TMM
# mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ]
# mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ]
# write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName, "_NOISeqbio_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiorpkm")))
mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE)
dev.off()
mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ]
mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ]
write.table(mynoiseqbio.rpkm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob),"_NOISeqbio_rpkm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiontmm")))
mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE)
dev.off()
mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ]
mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ]
write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob),"_NOISeqbio_tmm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbionnorm")))
mynoiseqbio.nnorm <- noiseqbio(mydata, k = 0.5, norm = "n", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE)
dev.off()
head(mynoiseqbio.rpkm@results[[1]])
head(mynoiseqbio.nnorm@results[[1]])
mynoiseqbio.rpkm.deg <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = NULL)
mynoiseqbio.rpkm.deg.up <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "up")
mynoiseqbio.rpkm.deg.down <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "down")
mynoiseqbio.nnorm.deg <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = NULL)
mynoiseqbio.nnorm.deg.up <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "up")
mynoiseqbio.nnorm.deg.down <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "down")
par(mfrow = c(3, 2))
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9")))
DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "expr", log.scale = TRUE)
dev.off()
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.8")))
DE.plot(mynoiseq.rpkm, q = 0.8, graphic = "expr", log.scale = TRUE)
dev.off()
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.9")))
DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "expr", log.scale = TRUE)
dev.off()
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.8")))
DE.plot(mynoiseq.nnorm, q = 0.8, graphic = "expr", log.scale = TRUE)
dev.off()
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.9")))
DE.plot(mynoiseq.tmm, q = 0.9, graphic = "expr", log.scale = TRUE)
dev.off()
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.8")))
DE.plot(mynoiseq.tmm, q = 0.8, graphic = "expr", log.scale = TRUE)
dev.off()
par(mfrow = c(2, 2))
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvt)))
DE.plot(mynoiseqbio.rpkm, q = pvt, graphic = "expr", log.scale = TRUE)
dev.off()
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvtnsb)))
DE.plot(mynoiseqbio.rpkm, q = pvtnsb, graphic = "expr", log.scale = TRUE)
dev.off()
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvt)))
DE.plot(mynoiseqbio.nnorm, q = pvt, graphic = "expr", log.scale = TRUE)
dev.off()
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvtnsb)))
DE.plot(mynoiseqbio.nnorm, q = pvtnsb, graphic = "expr", log.scale = TRUE)
dev.off()
par(mfrow = c(1, 2))
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9_MD")))
DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "MD", log.scale = TRUE)
dev.off()
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorm_0.9_MD")))
DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "MD", log.scale = TRUE)
dev.off()
}
###########################################
## additional function for more usability #
###########################################
calcRPKM <- function(c,n,l)
{
RPKM <- (10^9* as.numeric(c)) / (as.numeric(n) * as.numeric(l))
return(RPKM)
}
RPKMMatrix <- function(InTable,directory,LenGCTable,myLenPos)
{
myLengthTable <- read.table(paste0(directory,LenGCTable), sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names = 1)
newtable <- matrix(data = c(1), nrow = nrow(InTable), ncol(InTable), byrow= TRUE)
colnames(newtable) <- colnames(InTable)
rownames(newtable) <- rownames(InTable)
sumCol <- apply(InTable,2,sum) # list of the all col sums
for(s in colnames(InTable)) {
for(r in rownames(InTable)) {
w <- InTable[r, s]
len <- myLengthTable[r, myLenPos]
sum <- as.numeric(sumCol[s])
newtable[r, s] <- calcRPKM(w, sum, len)
}
}
return(newtable)
}
# rmHTSeqLines removes the last five rows
rmHTSeqLines <- function(counts)
{
counts <- head(counts, -5)
return(counts)
}
###############
## functions ##
###############
# split list to a factor list
split2list <- function(tmp)
{
xx <- c()
for(i in 1:length(tmp)) {
if(grepl("\\.", tmp[i])) {
foo <- strsplit(tmp[i], '\\.')[[1]]
fooo <- strsplit(foo, " ")[[1]]
xx <- c(xx, fooo)
} else {
xx <- c(xx, tmp[i])
}
}
return(as.factor(xx))
}
# makeGroups
# https://github.com/drisso/RUVSeq/blob/master/R/helper.R
makeGroups <- function(xs)
{
xs <- factor(xs)
groups <- matrix(-1, nrow = length(levels(xs)), ncol = max(table(xs)))
for (i in 1:length(levels(xs))) {
idxs <- which(xs == levels(xs)[i])
groups[i,1:length(idxs)] <- idxs
}
return(groups)
}
# The function makeSampleList creates a list of the columns(samples) you would like to compare
makeSampleList <- function(A, Al, B, Bl)
{
l <- c()
if(exists("Al", mode = "numeric")) {
l <- makeSampleList.helper(A, Al, l)
l <- makeSampleList.helper(B, Bl, l)
}
return(l)
}
makeSampleList.helper <- function(v,k,l)
{
for(h in 1:k) {
if(h == 1) {
l <- c(l, v)
} else {
l <- c(l, paste0(v,".",h-1))
}
}
return(l)
}
# function creates a list of numbers out of another list. The List is the index of the given list.
makeIndexList <- function(l)
{
n <- 1
jj <- c()
for(v in l) {
jj <- c(jj, n)
n <- n + 1
}
return(jj)
}
intersection <- function(l,check,t)
{
l.list <- list.files(path = l, pattern = "*(rpkm|edgeR|DESeq2).*tsv", recursive = TRUE)
if(grepl("bio", l.list[4],fixed = TRUE)) {
l.list.noiseq <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1)
l.list.noiseqbio <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1)
} else {
l.list.noiseq <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1)
l.list.noiseqbio <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1)
}
l.list.edgeR <- read.table(paste0(l,"/",l.list[2]), header=T, sep="\t", row.names=1)
l.list.deseq2 <- read.table(paste0(l,"/",l.list[1]), header=T, sep="\t", row.names=1)
print("DEGs edgeR")
print(nrow(l.list.edgeR))
print("DEGs deseq2")
print(nrow(l.list.deseq2))
print("DEGs noiseq")
print(nrow(l.list.noiseq))
print("DEGs noiseqbio")
print(nrow(l.list.noiseqbio))
interect.list <- ""
interect.list.direct <- ""
if(t==4) {
# List of overlaps without direction check
interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), intersect(rownames(l.list.noiseqbio), rownames(l.list.deseq2))))
print("DEGs intersection without direction check (four methods)")
print(length(interect.list))
print(interect.list)
print("DEGs intersection with direction check")
# List of overlaps with direction check
if(length(interect.list)!=0) {
interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, l.list.noiseqbio, interect.list)
print(length(interect.list.direct))
print(interect.list.direct)
} else {
print("0")
}
}
if(t==3) {
# List of overlaps without direction check
interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), rownames(l.list.deseq2)))
print("DEGs intersection without direction check (three methods)")
print(length(interect.list))
print(interect.list)
print("DEGs intersection with direction check")
# List of overlaps with direction check
if(length(interect.list)!=0) {
# fake data frame with 0 rows => nrow == 0
df_fake <- data.frame(Date=as.Date(character()),
File=character(),
User=character(),
stringsAsFactors=FALSE)
interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, df_fake, interect.list)
print(length(interect.list.direct))
print(interect.list.direct)
} else {
print("0")
}
}
if(check == 1){
return(l.list.noiseq[interect.list.direct,])
} else {
return(l.list.noiseq[interect.list,])
}
}
interDirect <- function(e,d,n,nb,i)
{
if(nrow(nb) == 0) {
nbx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) # use noiseq twice to simulate noiseqbio
nby <- paste0(nbx[,"M"],rownames(nbx))
} else {
nbx <- apply(nb[i,], 2, function(x) {ifelse(x > 0, "-", "+")})
nby <- paste0(nbx[,"log2FC"],rownames(nbx))
}
ex <- apply(e[i,], 2, function(x) {ifelse(x < 0, "-", "+")})
dx <- apply(d[i,], 2, function(x) {ifelse(x < 0, "-", "+")})
nx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) # twisted, because edgeR and DESeq2 are calculating vice versa
ey <- paste0(ex[,"logFC"],rownames(ex))
dy <- paste0(dx[,"log2FoldChange"],rownames(dx))
ny <- paste0(nx[,"M"],rownames(nx))
interect.list.direct <- intersect(ey, intersect(dy, intersect(ny, nby)))
return(substr(interect.list.direct,2,100))
}
###########################################
## executiv part of the script #
###########################################
for(SampleNames in sampleLists) {
workflow(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq)
}
print("Thank you for using the DEG pipeline :) \n")
# fin
### compare DEGs analysis ###
# used venny and Excel
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment