diff --git a/README.md b/README.md index 0fc960c4032ab95cfe4dc5efc5aedcdc40cfe494..a92020df90698733cf1ac36770cc0298b4b8945f 100644 --- a/README.md +++ b/README.md @@ -100,3 +100,7 @@ cd $arc_root ```bash echo "~/03DataPLANT_gitlab/samplearc_rnaseq/" > workflows/_arc_local_wd ``` + + +# replace +# runs/ by runs/'$run_name'/ \ No newline at end of file diff --git a/runs/run1/05_shinyPrep.RData b/runs/run1/05_shinyPrep.RData new file mode 100644 index 0000000000000000000000000000000000000000..e05d66658d5b95026e2731cf8c6ac993fe04f4c2 Binary files /dev/null and b/runs/run1/05_shinyPrep.RData differ diff --git a/workflows/01_KallistoQuant.sh b/workflows/01_KallistoQuant.sh index c63f4647df1232f6a847f964b318953ba9f9e3d8..bca3fd80d639a2bdb2232e5d6c8c59f5319f4eb6 100644 --- a/workflows/01_KallistoQuant.sh +++ b/workflows/01_KallistoQuant.sh @@ -1,20 +1,9 @@ - -######################## -#### To be replaced by CWL routine -######################## - -# Execute within <ARC root>/workflows -# chmod a+x 01_KallistoQuant.sh -# ./01_KallistoQuant.sh > $ARC_root'runs/01_kallisto.log' 2>&1 & -# pointers to and from `runs` need to be replaced - -ARC_root=$(cat ./_arc_local_wd) - -######################## - - +#!/usr/bin/bash # Map RNASeq reads via kallisto +run_name=$1 +arc_root=$2 + ## Manual: http://pachterlab.github.io/kallisto/manual.html kallisto version @@ -22,21 +11,21 @@ kallisto cite ### Build index -kall_ref=$ARC_root'studies/TalinumGenomeDraft/resources/Talinum.gm.CDS.nt.fa' -kallisto index -i $ARC_root'runs/01_kallisto_index' $kall_ref +kall_ref=$arc_root'/studies/TalinumGenomeDraft/resources/Talinum.gm.CDS.nt.fa' +kallisto index -i $arc_root'/runs/'$run_name'/01_kallisto_index' $kall_ref ### Align reads -ILLUMINASAMPLES=$(ls ${ARC_root}'assays/Talinum_RNASeq_minimal/dataset/'*fastq.gz) +ILLUMINASAMPLES=$(ls ${arc_root}'/assays/Talinum_RNASeq_minimal/dataset/'*fastq.gz) -mkdir $ARC_root'/runs/01_kallisto_results/' +mkdir $arc_root'/runs/'$run_name'/01_kallisto_results/' for j in $ILLUMINASAMPLES; do sampleName=$(echo $j | sed -e 's|.*/||' | cut -c -6) # cut away path. retain only first six chars of file name echo $sampleName - kallisto quant --single -b 100 -t 30 -l 200 -s 20 -i $ARC_root'/runs/01_kallisto_index' -o $ARC_root'/runs/01_kallisto_results/'$sampleName $j + kallisto quant --single -b 100 -t 30 -l 200 -s 20 -i $arc_root'/runs/'$run_name'/01_kallisto_index' -o $arc_root'/runs/'$run_name'/01_kallisto_results/'$sampleName $j echo 'Kallisto done' diff --git a/workflows/02_InstallPackages.R b/workflows/02_InstallPackages.R new file mode 100644 index 0000000000000000000000000000000000000000..f328f617d63b25b4f5b537ad96aca3fda6c90c50 --- /dev/null +++ b/workflows/02_InstallPackages.R @@ -0,0 +1,50 @@ +#!/usr/bin/env Rscript + +######################## +#### To be replaced by CWL routine +######################## + + + +######################## +# Installation of R dependencies +######################## + +### sleuth installation + +if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") +BiocManager::install() +BiocManager::install("devtools") # only if devtools not yet installed +# BiocManager::install("pachterlab/sleuth") +# There's currently (early 2022) an issue with sleuth installation. +# Using this workaround from https://github.com/pachterlab/sleuth/issues/259#issuecomment-1001076030 +remotes::install_github("pachterlab/sleuth#260") + +library(sleuth) + +### other packages + +library(tidyverse) +library(jsonlite) +library(openxlsx) + +required_packages <- + c('tidyverse', ## data wrangling and plotting + 'jsonlite', ## read/write json files + 'openxlsx') ## read/write xlsx files + +for(package in required_packages) +{ + ## Check if package is installed. If not, install + if(!package %in% row.names(installed.packages())){install.packages(package, + repos ="https://cran.uni-muenster.de/")} + # ## Check if package is up to date. If not, update + # update.packages(package, repos = "https://cran.uni-muenster.de/") + ## Load package + library(package, character.only = T) + print(paste("installed R package", package)) +} + + + + diff --git a/workflows/03_KallistoCollect.R b/workflows/03_KallistoCollect.R index a103b38f3e9c422bd1550f867cf259e670be1237..ff069265dcd52435dceaccd2253d5bacc16562ea 100644 --- a/workflows/03_KallistoCollect.R +++ b/workflows/03_KallistoCollect.R @@ -1,31 +1,26 @@ - +#!/usr/bin/env Rscript ######################## #### To be replaced by CWL routine ######################## -# Execute within <ARC root>/workflows -# Rscript 03_KallistoCollect.R -# pointers to and from `runs` need to be replaced - -ARC_root=readLines("./_arc_local_wd") - - -######################## +args = commandArgs(trailingOnly=TRUE) + +# test if <Run name> and <ARC root> argument are supplied: if not, return an error +if (length(args)!=2) { + stop("<Run name> and <ARC root> must be supplied as arguments", call.=FALSE) +} else if (length(args)==2) { + # default output file + run_name <- args[1] + print(paste("Storing outputs in:", run_name)) + arc_root <- args[2] + print(paste("ARC root:", arc_root)) +} ######################## # Collect kallisto data ######################## - -# ### sleuth installation -# -# if (!requireNamespace("BiocManager", quietly = TRUE)) -# install.packages("BiocManager") -# BiocManager::install() -# BiocManager::install("devtools") # only if devtools not yet installed -# BiocManager::install("pachterlab/sleuth") - library(sleuth) library(tidyverse) library(jsonlite) @@ -33,8 +28,8 @@ library(openxlsx) ## read experimental metadata from isa.assay wb -isa_assay <- paste0(ARC_root, 'assays/Talinum_RNASeq_minimal/isa.assay.xlsx') -isa_study <- paste0(ARC_root, 'studies/TalinumFacultativeCAM/isa.study.xlsx') +isa_assay <- paste0(arc_root, 'assays/Talinum_RNASeq_minimal/isa.assay.xlsx') +isa_study <- paste0(arc_root, 'studies/TalinumFacultativeCAM/isa.study.xlsx') study_data <- readWorkbook(isa_study, "plant_growth", startRow = 1) @@ -50,7 +45,7 @@ assay_data <- merge(study_data, assay_data, by.x = "Sample.Name", by.y = "Sourc assay_data <- assay_data[, !apply(assay_data, 2, function(x){sum(is.na(x)) == nrow(assay_data)})] # Pointer to kallisto results folder -base_dir <- paste0(ARC_root, 'runs/01_kallisto_results/') +base_dir <- paste0(arc_root, 'runs/', run_name, '/01_kallisto_results/') # A list of paths to the kallisto results indexed by the sample IDs is collated with kal_dirs <- dir(base_dir, full.names = T) ## Sleuth requires full paths @@ -64,17 +59,17 @@ s2c <- s2c[order(s2c$sample), ] # Build a sleuth object so <- sleuth_prep(s2c, ~Photosynthesis.mode) -save(so, file = paste0(ARC_root, 'runs/03_kallisto_sleuthObject.RData')) +save(so, file = paste0(arc_root, 'runs/', run_name, '/03_kallisto_sleuthObject.RData')) # Extract expression tables ## as data.frame expression_data <- kallisto_table(so) -write.csv(expression_data, paste0(ARC_root, 'runs/03_kallisto_df.csv'), row.names = F) +write.csv(expression_data, paste0(arc_root, 'runs/', run_name, '/03_kallisto_df.csv'), row.names = F) ## as tpm matrix (gene x sample) tpm_table <- pivot_wider(expression_data, id_cols = target_id, names_from = sample, values_from = tpm) -write.csv(tpm_table, paste0(ARC_root, 'runs/03_kallisto_tpmMatrix.csv'), row.names = F) +write.csv(tpm_table, paste0(arc_root, 'runs/', run_name, '/03_kallisto_tpmMatrix.csv'), row.names = F) # Summarize mapping stats @@ -88,7 +83,7 @@ for(i in dir(kal_dirs, pattern = '.json', full.names = T)) mapping_stats <- rbind(mapping_stats, z) } -write.csv(mapping_stats, paste0(ARC_root, 'runs/03_kallisto_mappingStats.csv'), row.names = F) +write.csv(mapping_stats, paste0(arc_root, 'runs/', run_name, '/03_kallisto_mappingStats.csv'), row.names = F) diff --git a/workflows/04_Sleuth.R b/workflows/04_Sleuth.R index 97894a987cea97fea991ce67b889f70544a637f0..41fb94d4aaae670c0229eadef22cbed779d76767 100644 --- a/workflows/04_Sleuth.R +++ b/workflows/04_Sleuth.R @@ -1,17 +1,25 @@ - +#!/usr/bin/env Rscript ######################## #### To be replaced by CWL routine ######################## -# Execute within <ARC root>/workflows -# Rscript 04_Sleuth.R -# pointers to and from `runs` need to be replaced +args = commandArgs(trailingOnly=TRUE) -ARC_root=readLines("./_arc_local_wd") +# test if <Run name> and <ARC root> argument are supplied: if not, return an error +if (length(args)!=2) { + stop("<Run name> and <ARC root> must be supplied as arguments", call.=FALSE) +} else if (length(args)==2) { + # default output file + run_name <- args[1] + print(paste("Storing outputs in:", run_name)) + arc_root <- args[2] + print(paste("ARC root:", arc_root)) +} ######################## + ######################## # Determine diff. gene expression with sleuth ######################## @@ -19,18 +27,17 @@ ARC_root=readLines("./_arc_local_wd") library(sleuth) # Load the sleuth object -load(file = paste0(ARC_root, 'runs/03_kallisto_sleuthObject.RData')) +load(file = paste0(arc_root, 'runs/', run_name, '/03_kallisto_sleuthObject.RData')) so <- sleuth_fit(so) -so <- sleuth_fit(so, ~Group, 'full') +so <- sleuth_fit(so, ~Photosynthesis.mode, 'full') so <- sleuth_fit(so, ~1, 'reduced') so <- sleuth_lrt(so, 'reduced', 'full') sleuth_table <- sleuth_results(so, 'reduced:full', 'lrt', show_all = FALSE) - -write.csv(sleuth_table, paste0(ARC_root, 'runs/04_sleuth_dge.csv'), row.names = F) +write.csv(sleuth_table, paste0(arc_root, 'runs/', run_name, '/04_sleuth_dge.csv'), row.names = F) diff --git a/workflows/05_plot_shinyPrep.R b/workflows/05_plot_shinyPrep.R index 075c567040119d7ce18e546a49accca8af4494b5..4cabdc3f5d9375e6c3b90488e05822acd45fcc5f 100644 --- a/workflows/05_plot_shinyPrep.R +++ b/workflows/05_plot_shinyPrep.R @@ -1,16 +1,21 @@ - +#!/usr/bin/env Rscript ######################## #### To be replaced by CWL routine ######################## -# Execute within <ARC root>/workflows -# Rscript 05_plot_shinyPrep.R -# pointers to and from `runs` need to be replaced - -ARC_root=readLines("./_arc_local_wd") +args = commandArgs(trailingOnly=TRUE) -######################## +# test if <Run name> and <ARC root> argument are supplied: if not, return an error +if (length(args)!=2) { + stop("<Run name> and <ARC root> must be supplied as arguments", call.=FALSE) +} else if (length(args)==2) { + # default output file + run_name <- args[1] + print(paste("Storing outputs in:", run_name)) + arc_root <- args[2] + print(paste("ARC root:", arc_root)) +} ######################## # Prep data for shiny app @@ -18,7 +23,7 @@ ARC_root=readLines("./_arc_local_wd") library(openxlsx) -expression_data <- read.csv(file = paste0(ARC_root, 'runs/03_kallisto_df.csv')) +expression_data <- read.csv(file = paste0(arc_root, 'runs/', run_name, '/03_kallisto_df.csv')) available_genes <- unique(expression_data$target_id) -save(expression_data, available_genes, file = paste0(ARC_root, 'runs/05_shinyPrep.RData')) +save(expression_data, available_genes, file = paste0(arc_root, 'runs/', run_name, '/05_shinyPrep.RData')) diff --git a/workflows/_arc_local_wd b/workflows/_arc_local_wd deleted file mode 100644 index aca1e009538a1e623f7c13a9f169d4a707c39a6b..0000000000000000000000000000000000000000 --- a/workflows/_arc_local_wd +++ /dev/null @@ -1 +0,0 @@ -~/03DataPLANT_gitlab/samplearc_rnaseq/ diff --git a/workflows/_workflow_wrapper.sh b/workflows/_workflow_wrapper.sh new file mode 100755 index 0000000000000000000000000000000000000000..f34687e4d51665ce9f12e4a6ec357439e69d0699 --- /dev/null +++ b/workflows/_workflow_wrapper.sh @@ -0,0 +1,26 @@ + #!/usr/bin/bash + +####################################### +#### To be replaced by CWL routine #### +####################################### + +# This is supposed to wrap the workflow from kallisto quantification to sleuth diff gene expression +# Execute within <ARC root>/workflows: +## change rights: `chmod a+x _workflow_wrapper.sh` +## execute script: `./_workflow_wrapper.sh <run_name> <arc_root>` +## Example: ./_workflow_wrapper.sh "run1" "~/03DataPLANT_gitlab/samplearc_rnaseq/" + +run_name=$1 +arc_root=$2 + +# chmod a+x 01_KallistoQuant.sh +# ./01_KallistoQuant.sh $run_name $arc_root + +Rscript 02_InstallPackages.R +Rscript 03_KallistoCollect.R $run_name $arc_root +Rscript 04_Sleuth.R $run_name $arc_root +Rscript 05_plot_shinyPrep.R $run_name $arc_root + +# store variables for test runs in R +# run_name="run1" +# arc_root="~/03DataPLANT_gitlab/samplearc_rnaseq/" \ No newline at end of file