diff --git a/runs/kallisto_quant/README.md b/runs/kallisto_quant/README.md new file mode 100644 index 0000000000000000000000000000000000000000..65511777825456ab11c634b495723e7e60f62fe9 --- /dev/null +++ b/runs/kallisto_quant/README.md @@ -0,0 +1,21 @@ + +# cwltool README + +## First cd into the (runs) folder, with the .yml file + +cd /Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq/runs/kallisto_quant + +## Let it flow + +```bash +### store arc root (two levels up from here) as variable +arc_root=$(echo ${PWD%/*/*}) + +### replace arc root line in yml (specific to the machine from where this is run) +### not sure, if this works on linux... +sed -i '' "s|^arc_root:.*|arc_root: $arc_root|g" kallisto_quant.yml + +### run with cwltool +cwltool ../../workflows/kallisto_quant.cwl kallisto_index.yml + +``` diff --git a/runs/kallisto_quant/kallisto_quant.yml b/runs/kallisto_quant/kallisto_quant.yml new file mode 100644 index 0000000000000000000000000000000000000000..19031ec02334edfebea3533c0685a3290629c764 --- /dev/null +++ b/runs/kallisto_quant/kallisto_quant.yml @@ -0,0 +1,12 @@ +cores: 4 +sh_script: + class: File + path: ../../workflows/kallisto_quant.sh +arc_root: /Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq +out_folder: runs/kallisto_quant +in_kallisto_index: runs/kallisto_index/kallisto_index +in_fastq_folder: assays/Talinum_RNASeq_minimal/dataset +kallisto_bootstrap: 100 +kallisto_threads: 4 +kallisto_fragmentLength: 200 +kallisto_stdDev: 20 \ No newline at end of file diff --git a/runs/shiny_prep/shiny_prep.RData b/runs/shiny_prep/shiny_prep.RData new file mode 100644 index 0000000000000000000000000000000000000000..258a091981fee5ee5389d9e76ca25e2a237768b8 Binary files /dev/null and b/runs/shiny_prep/shiny_prep.RData differ diff --git a/runs/shiny_prep/shiny_prep.yml b/runs/shiny_prep/shiny_prep.yml new file mode 100644 index 0000000000000000000000000000000000000000..cdaed0ed18ac7f3660030169f7af8ffee314471d --- /dev/null +++ b/runs/shiny_prep/shiny_prep.yml @@ -0,0 +1,7 @@ +cores: 1 +r_script: + class: File + path: ../../workflows/shiny_prep.R +in_kallisto_df: runs/kallisto_collect/kallisto_df.csv +out_folder: runs/shiny_prep +arc_root: /Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq diff --git a/workflows/kallisto_quant.cwl b/workflows/kallisto_quant.cwl new file mode 100644 index 0000000000000000000000000000000000000000..9259357a2df7043a94a49eba13d0d1b1f534db28 --- /dev/null +++ b/workflows/kallisto_quant.cwl @@ -0,0 +1,53 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +inputs: +- id: sh_script + type: File + inputBinding: + position: 0 +- id: arc_root + type: string + inputBinding: + position: 1 +- id: out_folder + type: string + inputBinding: + position: 2 +- id: in_kallisto_index + type: string + inputBinding: + position: 3 +- id: in_fastq_folder + type: string + inputBinding: + position: 4 +- id: kallisto_bootstrap + type: int + inputBinding: + position: 5 +- id: kallisto_threads + type: int + inputBinding: + position: 6 +- id: kallisto_fragmentLength + type: int + inputBinding: + position: 7 +- id: kallisto_stdDev + type: int + inputBinding: + position: 8 + +outputs: +- id: outdir + type: + type: array + items: Directory + outputBinding: + glob: $(runtime.outdir)/$(inputs.out_folder) + +baseCommand: +- bash diff --git a/workflows/kallisto_quant.sh b/workflows/kallisto_quant.sh new file mode 100755 index 0000000000000000000000000000000000000000..0f81cd18bb9f96853a68473e86a73b22d2f4ee1a --- /dev/null +++ b/workflows/kallisto_quant.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +### Map RNASeq reads via kallisto +### Note, this is written for single-end mode only + +################################################ +#### Read arguments from CLI +################################################ + +arc_root=$1 +out_folder=$2 +in_kallisto_index=$3 +in_fastq_folder=$4 +kallisto_bootstrap=$5 +kallisto_threads=$6 +kallisto_fragmentLength=$7 +kallisto_stdDev=$8 + +################################################ +#### If it does not exist, create out dir +################################################ + +mkdir -p "$arc_root/$out_folder/" + +################################################ +#### Store fastq files in variable +################################################ + +fastq_files=$(ls "${arc_root}"/"${in_fastq_folder}"/*fastq*) + +################################################ +#### Loop over fastq files and quantify reads +################################################ + +for j in $fastq_files; do + + # cut away path. retain only first six chars of file name + sampleName=$(echo $j | sed -e 's|.*/||' | sed -e 's|.fastq.*||') + + echo $sampleName + + kallisto quant --single -b $kallisto_bootstrap -t $kallisto_threads -l $kallisto_fragmentLength -s $kallisto_stdDev -i "$arc_root/$in_kallisto_index" -o "$arc_root/$out_folder/$sampleName" $j + + echo 'Kallisto done' + +done + + + diff --git a/workflows/no_CWL_yet/kallisto_sleuth/01_KallistoIndex.sh b/workflows/no_CWL_yet/kallisto_sleuth/01_KallistoIndex.sh deleted file mode 100755 index ec37c24ab97dbfffeb68fa38476cd435af4fd046..0000000000000000000000000000000000000000 --- a/workflows/no_CWL_yet/kallisto_sleuth/01_KallistoIndex.sh +++ /dev/null @@ -1,25 +0,0 @@ - -run_name=$1 -arc_root=$2 - -## Manual: http://pachterlab.github.io/kallisto/manual.html - -## MacOS Installation -# ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" -# brew install kallisto - - -# git clone https://github.com/pachterlab/kallisto.git -# cd kallisto -# mkdir build -# cd build -# cmake .. -DUSE_HDF5=ON - - -kallisto version -kallisto cite - -### Build index - -kall_ref=$arc_root'/studies/TalinumGenomeDraft/resources/Talinum.gm.CDS.nt.fa' -kallisto index -i $arc_root'/runs/'$run_name'/01_kallisto_index' $kall_ref \ No newline at end of file diff --git a/workflows/no_CWL_yet/kallisto_sleuth/01_KallistoQuant.sh b/workflows/no_CWL_yet/kallisto_sleuth/01_KallistoQuant.sh deleted file mode 100755 index f65e0ac4c0df747d4af8fc4630295cfc54119fda..0000000000000000000000000000000000000000 --- a/workflows/no_CWL_yet/kallisto_sleuth/01_KallistoQuant.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash - -# Map RNASeq reads via kallisto - -run_name=$1 -arc_root=$2 - -### Align reads - -ILLUMINASAMPLES=$(ls ${arc_root}"/assays/Talinum_RNASeq_minimal/dataset/"*"fastq.gz") - -mkdir -p $arc_root'/runs/'$run_name'/01_kallisto_results/' - -for j in $ILLUMINASAMPLES; do - - sampleName=$(echo $j | sed -e 's|.*/||' | cut -c -6) # cut away path. retain only first six chars of file name - echo $sampleName - - kallisto quant --single -b 100 -t 8 -l 200 -s 20 -i $arc_root'/runs/'$run_name'/01_kallisto_index' -o $arc_root'/runs/'$run_name'/01_kallisto_results/'$sampleName $j - - echo 'Kallisto done' - -done - - - diff --git a/workflows/no_CWL_yet/kallisto_sleuth/02_InstallPackages.R b/workflows/no_CWL_yet/kallisto_sleuth/02_InstallPackages.R deleted file mode 100644 index 163e75e2125be651e24cee07081b7eda37a0c477..0000000000000000000000000000000000000000 --- a/workflows/no_CWL_yet/kallisto_sleuth/02_InstallPackages.R +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env Rscript - -######################## -#### To be replaced by CWL routine -######################## - - - -######################## -# Installation of R dependencies -######################## - -### sleuth installation - -if (!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager")} -if(!"BiocManager" %in% row.names(installed.packages())){BiocManager::install()} -if(!"devtools" %in% row.names(installed.packages())){BiocManager::install("devtools")} - -# BiocManager::install("pachterlab/sleuth") -# There's currently (early 2022) an issue with sleuth installation. -# Using this workaround from https://github.com/pachterlab/sleuth/issues/259#issuecomment-1001076030 - -if(!"sleuth" %in% row.names(installed.packages())){remotes::install_github("pachterlab/sleuth#260")} - -library(sleuth) - -### other packages - -library(tidyverse) -library(jsonlite) -library(openxlsx) - -required_packages <- - c('tidyverse', ## data wrangling and plotting - 'jsonlite', ## read/write json files - 'openxlsx') ## read/write xlsx files - -for(package in required_packages) -{ - ## Check if package is installed. If not, install - if(!package %in% row.names(installed.packages())){install.packages(package, - repos ="https://cran.uni-muenster.de/")} - # ## Check if package is up to date. If not, update - # update.packages(package, repos = "https://cran.uni-muenster.de/") - ## Load package - library(package, character.only = T) - print(paste("installed R package", package)) -} - - - - diff --git a/workflows/no_CWL_yet/kallisto_sleuth/03_KallistoCollect.R b/workflows/no_CWL_yet/kallisto_sleuth/03_KallistoCollect.R deleted file mode 100644 index f36e8f7d960afa0d05dc5bdd2aa2bd43ec01bc5c..0000000000000000000000000000000000000000 --- a/workflows/no_CWL_yet/kallisto_sleuth/03_KallistoCollect.R +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env Rscript - -######################## -#### To be replaced by CWL routine -######################## - -args = commandArgs(trailingOnly=TRUE) - -# test if <Run name> and <ARC root> argument are supplied: if not, return an error -if (length(args)!=2) { - stop("<Run name> and <ARC root> must be supplied as arguments", call.=FALSE) -} else if (length(args)==2) { - # default output file - run_name <- args[1] - print(paste("Storing outputs in:", run_name)) - arc_root <- args[2] - print(paste("ARC root:", arc_root)) -} - -######################## -# Collect kallisto data -######################## - -library(sleuth) -library(tidyverse) -library(jsonlite) -library(openxlsx) - -## read experimental metadata from isa.assay wb - -isa_assay <- paste0(arc_root, 'assays/Talinum_RNASeq_minimal/isa.assay.xlsx') -isa_study <- paste0(arc_root, 'studies/TalinumFacultativeCAM/isa.study.xlsx') - -study_data <- readWorkbook(isa_study, "plant_growth", startRow = 1) - -assay_data <- merge(readWorkbook(isa_assay, "2EXT01_RNA", startRow = 1), - readWorkbook(isa_assay, "3ASY01_RNASeq", startRow = 1), - by.x = "Sample.Name", - by.y = "Source.Name" - ) - -assay_data <- merge(study_data, assay_data, by.x = "Sample.Name", by.y = "Source.Name") - -## remove empty cols -assay_data <- assay_data[, !apply(assay_data, 2, function(x){sum(is.na(x)) == nrow(assay_data)})] - -# Pointer to kallisto results folder -# base_dir <- paste0(arc_root, 'runs/', run_name, '/01_kallisto_results/') - -######################## -### Note: hard coded link due to issues with kallisto version -base_dir <- paste0(arc_root, 'runs/kallisto_sleuth/run1/01_kallisto_results') -######################## - -# A list of paths to the kallisto results indexed by the sample IDs is collated with -kal_dirs <- dir(base_dir, full.names = T) ## Sleuth requires full paths - -s2c <- assay_data[order(assay_data$Sample.Name), c('Sample.Name', "Factor.[Photosynthesis.mode]")] -# For kallisto / sleuth: 's2c' (sample_to_covariates) must contain a column named 'sample' -colnames(s2c) <- c("sample", "Photosynthesis.mode") - -s2c$path <- kal_dirs -s2c <- s2c[order(s2c$sample), ] - -# Build a sleuth object -so <- sleuth_prep(s2c, ~Photosynthesis.mode, num_cores = 1) -save(so, file = paste0(arc_root, 'runs/', run_name, '/03_kallisto_sleuthObject.RData')) - -# Extract expression tables - -## as data.frame -expression_data <- kallisto_table(so) -write.csv(expression_data, paste0(arc_root, 'runs/', run_name, '/03_kallisto_df.csv'), row.names = F) - -## as tpm matrix (gene x sample) -tpm_table <- pivot_wider(expression_data, id_cols = target_id, names_from = sample, values_from = tpm) -write.csv(tpm_table, paste0(arc_root, 'runs/', run_name, '/03_kallisto_tpmMatrix.csv'), row.names = F) - - -# Summarize mapping stats - -mapping_stats <- c() -for(i in dir(kal_dirs, pattern = '.json', full.names = T)) -{ - id <- unlist(strsplit(i, split = '/')) - - z <- data.frame(ID = id[length(id) - 1], read_json(i, simplifyVector = T)) - mapping_stats <- rbind(mapping_stats, z) -} - -write.csv(mapping_stats, paste0(arc_root, 'runs/', run_name, '/03_kallisto_mappingStats.csv'), row.names = F) - - - - - diff --git a/workflows/no_CWL_yet/kallisto_sleuth/04_Sleuth.R b/workflows/no_CWL_yet/kallisto_sleuth/04_Sleuth.R deleted file mode 100644 index 41fb94d4aaae670c0229eadef22cbed779d76767..0000000000000000000000000000000000000000 --- a/workflows/no_CWL_yet/kallisto_sleuth/04_Sleuth.R +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env Rscript - -######################## -#### To be replaced by CWL routine -######################## - -args = commandArgs(trailingOnly=TRUE) - -# test if <Run name> and <ARC root> argument are supplied: if not, return an error -if (length(args)!=2) { - stop("<Run name> and <ARC root> must be supplied as arguments", call.=FALSE) -} else if (length(args)==2) { - # default output file - run_name <- args[1] - print(paste("Storing outputs in:", run_name)) - arc_root <- args[2] - print(paste("ARC root:", arc_root)) -} - -######################## - - -######################## -# Determine diff. gene expression with sleuth -######################## - -library(sleuth) - -# Load the sleuth object -load(file = paste0(arc_root, 'runs/', run_name, '/03_kallisto_sleuthObject.RData')) - - -so <- sleuth_fit(so) -so <- sleuth_fit(so, ~Photosynthesis.mode, 'full') -so <- sleuth_fit(so, ~1, 'reduced') -so <- sleuth_lrt(so, 'reduced', 'full') - -sleuth_table <- sleuth_results(so, 'reduced:full', 'lrt', show_all = FALSE) - -write.csv(sleuth_table, paste0(arc_root, 'runs/', run_name, '/04_sleuth_dge.csv'), row.names = F) - - - - - diff --git a/workflows/no_CWL_yet/kallisto_sleuth/05_plot_shinyPrep.R b/workflows/no_CWL_yet/kallisto_sleuth/05_plot_shinyPrep.R deleted file mode 100644 index 4cabdc3f5d9375e6c3b90488e05822acd45fcc5f..0000000000000000000000000000000000000000 --- a/workflows/no_CWL_yet/kallisto_sleuth/05_plot_shinyPrep.R +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env Rscript - -######################## -#### To be replaced by CWL routine -######################## - -args = commandArgs(trailingOnly=TRUE) - -# test if <Run name> and <ARC root> argument are supplied: if not, return an error -if (length(args)!=2) { - stop("<Run name> and <ARC root> must be supplied as arguments", call.=FALSE) -} else if (length(args)==2) { - # default output file - run_name <- args[1] - print(paste("Storing outputs in:", run_name)) - arc_root <- args[2] - print(paste("ARC root:", arc_root)) -} - -######################## -# Prep data for shiny app -######################## - -library(openxlsx) - -expression_data <- read.csv(file = paste0(arc_root, 'runs/', run_name, '/03_kallisto_df.csv')) -available_genes <- unique(expression_data$target_id) - -save(expression_data, available_genes, file = paste0(arc_root, 'runs/', run_name, '/05_shinyPrep.RData')) diff --git a/workflows/no_CWL_yet/kallisto_sleuth_wrapper.sh b/workflows/no_CWL_yet/kallisto_sleuth_wrapper.sh deleted file mode 100755 index 641796bbe02cfbd66ab214fc6718ec3e4ff95733..0000000000000000000000000000000000000000 --- a/workflows/no_CWL_yet/kallisto_sleuth_wrapper.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash - -####################################### -#### To be replaced by CWL routine #### -####################################### - -# This is supposed to wrap the workflow from kallisto quantification to sleuth diff gene expression -# Execute within <ARC root>/workflows: -## change rights: `chmod a+x kallisto_sleuth_wrapper.sh` -## execute script: `./kallisto_sleuth_wrapper.sh <run_name> <arc_root>` -## Example: ./kallisto_sleuth_wrapper.sh "run2" "/Users/dominikbrilhaus/03DataPLANT_gitlab/samplearc_rnaseq/" - -run_name='kallisto_sleuth/'$1 -arc_root=$2 - -# Note: due to some not well documented change in kallisto development (use of hd5) the kallisto quant part is currently not reproducible and commented out -# Pointer to `runs/kallisto_sleuth/run1/01_kallisto_results` is thus hard coded into 03_KallistoCollect.R - -mkdir -p $arc_root'/runs/'$run_name - -# chmod a+x ./kallisto_sleuth/01_KallistoIndex.sh -# ./kallisto_sleuth/01_KallistoIndex.sh $run_name $arc_root -# chmod a+x kallisto_sleuth/01_KallistoQuant.sh -# ./kallisto_sleuth/01_KallistoQuant.sh $run_name $arc_root - -Rscript ./kallisto_sleuth/02_InstallPackages.R -echo "r insallations done" -Rscript ./kallisto_sleuth/03_KallistoCollect.R $run_name $arc_root -echo "kallisto collect done" -Rscript ./kallisto_sleuth/04_Sleuth.R $run_name $arc_root -echo "sleuth done" -Rscript ./kallisto_sleuth/05_plot_shinyPrep.R $run_name $arc_root -echo "shiny prep done" - -# store variables for test runs in R -# run_name="kallisto_sleuth/run2" -# arc_root="/Users/dominikbrilhaus/03DataPLANT_gitlab/samplearc_rnaseq/" \ No newline at end of file diff --git a/workflows/shiny_prep.R b/workflows/shiny_prep.R new file mode 100644 index 0000000000000000000000000000000000000000..b883bb1aa3e3e1f90b4a8ff95b92f7a7f462e9a9 --- /dev/null +++ b/workflows/shiny_prep.R @@ -0,0 +1,41 @@ +#!/usr/bin/env Rscript + +################################################ +#### Test area (Within R) +################################################ + +# arc_root <- "~/03_DataPLANT_gitlab/samplearc_rnaseq/" +# out_folder <- "runs/shiny_prep" +# in_kallisto_df <- "runs/kallisto_collect/kallisto_df.csv" + +################################################ +#### Load required library +################################################ + + + +################################################ +#### Read arguments from CLI +################################################ + +args <- commandArgs(trailingOnly = T) + +arc_root <- args[1] +out_folder <- args[2] +in_kallisto_df <- args[3] + +################################################ +#### If it does not exist, create out dir +################################################ + +dir.create(paste(arc_root, out_folder, sep = "/"), recursive = T, showWarnings = F) + +################################################ +#### Prep data for shiny app +################################################ + +expression_data <- read.csv(file = paste(arc_root, in_kallisto_df, sep = "/")) + +available_genes <- unique(expression_data$target_id) + +save(expression_data, available_genes, file = paste(arc_root, out_folder, 'shiny_prep.RData', sep = "/")) diff --git a/workflows/shiny_prep.cwl b/workflows/shiny_prep.cwl new file mode 100644 index 0000000000000000000000000000000000000000..918aaa0aefa401caed518703a64ac7fd6c333e3b --- /dev/null +++ b/workflows/shiny_prep.cwl @@ -0,0 +1,33 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +inputs: +- id: r_script + type: File + inputBinding: + position: 0 +- id: arc_root + type: string + inputBinding: + position: 1 +- id: out_folder + type: string + inputBinding: + position: 2 +- id: in_kallisto_df + type: string + inputBinding: + position: 3 + +outputs: +- id: outdir + type: + type: array + items: Directory + outputBinding: + glob: $(runtime.outdir)/$(inputs.out_folder) + +baseCommand: +- Rscript \ No newline at end of file