Skip to content
Snippets Groups Projects
Commit 8ce8c2f5 authored by Dominik Brilhaus's avatar Dominik Brilhaus
Browse files

cwl kallisto quant and shiny prep

parent 614203d4
No related branches found
No related tags found
No related merge requests found
Showing
with 216 additions and 310 deletions
# cwltool README
## First cd into the (runs) folder, with the .yml file
cd /Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq/runs/kallisto_quant
## Let it flow
```bash
### store arc root (two levels up from here) as variable
arc_root=$(echo ${PWD%/*/*})
### replace arc root line in yml (specific to the machine from where this is run)
### not sure, if this works on linux...
sed -i '' "s|^arc_root:.*|arc_root: $arc_root|g" kallisto_quant.yml
### run with cwltool
cwltool ../../workflows/kallisto_quant.cwl kallisto_index.yml
```
cores: 4
sh_script:
class: File
path: ../../workflows/kallisto_quant.sh
arc_root: /Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq
out_folder: runs/kallisto_quant
in_kallisto_index: runs/kallisto_index/kallisto_index
in_fastq_folder: assays/Talinum_RNASeq_minimal/dataset
kallisto_bootstrap: 100
kallisto_threads: 4
kallisto_fragmentLength: 200
kallisto_stdDev: 20
\ No newline at end of file
File added
cores: 1
r_script:
class: File
path: ../../workflows/shiny_prep.R
in_kallisto_df: runs/kallisto_collect/kallisto_df.csv
out_folder: runs/shiny_prep
arc_root: /Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq
#!/usr/bin/env cwl-runner
cwlVersion: v1.2
class: CommandLineTool
inputs:
- id: sh_script
type: File
inputBinding:
position: 0
- id: arc_root
type: string
inputBinding:
position: 1
- id: out_folder
type: string
inputBinding:
position: 2
- id: in_kallisto_index
type: string
inputBinding:
position: 3
- id: in_fastq_folder
type: string
inputBinding:
position: 4
- id: kallisto_bootstrap
type: int
inputBinding:
position: 5
- id: kallisto_threads
type: int
inputBinding:
position: 6
- id: kallisto_fragmentLength
type: int
inputBinding:
position: 7
- id: kallisto_stdDev
type: int
inputBinding:
position: 8
outputs:
- id: outdir
type:
type: array
items: Directory
outputBinding:
glob: $(runtime.outdir)/$(inputs.out_folder)
baseCommand:
- bash
#!/usr/bin/env bash
### Map RNASeq reads via kallisto
### Note, this is written for single-end mode only
################################################
#### Read arguments from CLI
################################################
arc_root=$1
out_folder=$2
in_kallisto_index=$3
in_fastq_folder=$4
kallisto_bootstrap=$5
kallisto_threads=$6
kallisto_fragmentLength=$7
kallisto_stdDev=$8
################################################
#### If it does not exist, create out dir
################################################
mkdir -p "$arc_root/$out_folder/"
################################################
#### Store fastq files in variable
################################################
fastq_files=$(ls "${arc_root}"/"${in_fastq_folder}"/*fastq*)
################################################
#### Loop over fastq files and quantify reads
################################################
for j in $fastq_files; do
# cut away path. retain only first six chars of file name
sampleName=$(echo $j | sed -e 's|.*/||' | sed -e 's|.fastq.*||')
echo $sampleName
kallisto quant --single -b $kallisto_bootstrap -t $kallisto_threads -l $kallisto_fragmentLength -s $kallisto_stdDev -i "$arc_root/$in_kallisto_index" -o "$arc_root/$out_folder/$sampleName" $j
echo 'Kallisto done'
done
run_name=$1
arc_root=$2
## Manual: http://pachterlab.github.io/kallisto/manual.html
## MacOS Installation
# ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
# brew install kallisto
# git clone https://github.com/pachterlab/kallisto.git
# cd kallisto
# mkdir build
# cd build
# cmake .. -DUSE_HDF5=ON
kallisto version
kallisto cite
### Build index
kall_ref=$arc_root'/studies/TalinumGenomeDraft/resources/Talinum.gm.CDS.nt.fa'
kallisto index -i $arc_root'/runs/'$run_name'/01_kallisto_index' $kall_ref
\ No newline at end of file
#!/usr/bin/env bash
# Map RNASeq reads via kallisto
run_name=$1
arc_root=$2
### Align reads
ILLUMINASAMPLES=$(ls ${arc_root}"/assays/Talinum_RNASeq_minimal/dataset/"*"fastq.gz")
mkdir -p $arc_root'/runs/'$run_name'/01_kallisto_results/'
for j in $ILLUMINASAMPLES; do
sampleName=$(echo $j | sed -e 's|.*/||' | cut -c -6) # cut away path. retain only first six chars of file name
echo $sampleName
kallisto quant --single -b 100 -t 8 -l 200 -s 20 -i $arc_root'/runs/'$run_name'/01_kallisto_index' -o $arc_root'/runs/'$run_name'/01_kallisto_results/'$sampleName $j
echo 'Kallisto done'
done
#!/usr/bin/env Rscript
########################
#### To be replaced by CWL routine
########################
########################
# Installation of R dependencies
########################
### sleuth installation
if (!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager")}
if(!"BiocManager" %in% row.names(installed.packages())){BiocManager::install()}
if(!"devtools" %in% row.names(installed.packages())){BiocManager::install("devtools")}
# BiocManager::install("pachterlab/sleuth")
# There's currently (early 2022) an issue with sleuth installation.
# Using this workaround from https://github.com/pachterlab/sleuth/issues/259#issuecomment-1001076030
if(!"sleuth" %in% row.names(installed.packages())){remotes::install_github("pachterlab/sleuth#260")}
library(sleuth)
### other packages
library(tidyverse)
library(jsonlite)
library(openxlsx)
required_packages <-
c('tidyverse', ## data wrangling and plotting
'jsonlite', ## read/write json files
'openxlsx') ## read/write xlsx files
for(package in required_packages)
{
## Check if package is installed. If not, install
if(!package %in% row.names(installed.packages())){install.packages(package,
repos ="https://cran.uni-muenster.de/")}
# ## Check if package is up to date. If not, update
# update.packages(package, repos = "https://cran.uni-muenster.de/")
## Load package
library(package, character.only = T)
print(paste("installed R package", package))
}
#!/usr/bin/env Rscript
########################
#### To be replaced by CWL routine
########################
args = commandArgs(trailingOnly=TRUE)
# test if <Run name> and <ARC root> argument are supplied: if not, return an error
if (length(args)!=2) {
stop("<Run name> and <ARC root> must be supplied as arguments", call.=FALSE)
} else if (length(args)==2) {
# default output file
run_name <- args[1]
print(paste("Storing outputs in:", run_name))
arc_root <- args[2]
print(paste("ARC root:", arc_root))
}
########################
# Collect kallisto data
########################
library(sleuth)
library(tidyverse)
library(jsonlite)
library(openxlsx)
## read experimental metadata from isa.assay wb
isa_assay <- paste0(arc_root, 'assays/Talinum_RNASeq_minimal/isa.assay.xlsx')
isa_study <- paste0(arc_root, 'studies/TalinumFacultativeCAM/isa.study.xlsx')
study_data <- readWorkbook(isa_study, "plant_growth", startRow = 1)
assay_data <- merge(readWorkbook(isa_assay, "2EXT01_RNA", startRow = 1),
readWorkbook(isa_assay, "3ASY01_RNASeq", startRow = 1),
by.x = "Sample.Name",
by.y = "Source.Name"
)
assay_data <- merge(study_data, assay_data, by.x = "Sample.Name", by.y = "Source.Name")
## remove empty cols
assay_data <- assay_data[, !apply(assay_data, 2, function(x){sum(is.na(x)) == nrow(assay_data)})]
# Pointer to kallisto results folder
# base_dir <- paste0(arc_root, 'runs/', run_name, '/01_kallisto_results/')
########################
### Note: hard coded link due to issues with kallisto version
base_dir <- paste0(arc_root, 'runs/kallisto_sleuth/run1/01_kallisto_results')
########################
# A list of paths to the kallisto results indexed by the sample IDs is collated with
kal_dirs <- dir(base_dir, full.names = T) ## Sleuth requires full paths
s2c <- assay_data[order(assay_data$Sample.Name), c('Sample.Name', "Factor.[Photosynthesis.mode]")]
# For kallisto / sleuth: 's2c' (sample_to_covariates) must contain a column named 'sample'
colnames(s2c) <- c("sample", "Photosynthesis.mode")
s2c$path <- kal_dirs
s2c <- s2c[order(s2c$sample), ]
# Build a sleuth object
so <- sleuth_prep(s2c, ~Photosynthesis.mode, num_cores = 1)
save(so, file = paste0(arc_root, 'runs/', run_name, '/03_kallisto_sleuthObject.RData'))
# Extract expression tables
## as data.frame
expression_data <- kallisto_table(so)
write.csv(expression_data, paste0(arc_root, 'runs/', run_name, '/03_kallisto_df.csv'), row.names = F)
## as tpm matrix (gene x sample)
tpm_table <- pivot_wider(expression_data, id_cols = target_id, names_from = sample, values_from = tpm)
write.csv(tpm_table, paste0(arc_root, 'runs/', run_name, '/03_kallisto_tpmMatrix.csv'), row.names = F)
# Summarize mapping stats
mapping_stats <- c()
for(i in dir(kal_dirs, pattern = '.json', full.names = T))
{
id <- unlist(strsplit(i, split = '/'))
z <- data.frame(ID = id[length(id) - 1], read_json(i, simplifyVector = T))
mapping_stats <- rbind(mapping_stats, z)
}
write.csv(mapping_stats, paste0(arc_root, 'runs/', run_name, '/03_kallisto_mappingStats.csv'), row.names = F)
#!/usr/bin/env Rscript
########################
#### To be replaced by CWL routine
########################
args = commandArgs(trailingOnly=TRUE)
# test if <Run name> and <ARC root> argument are supplied: if not, return an error
if (length(args)!=2) {
stop("<Run name> and <ARC root> must be supplied as arguments", call.=FALSE)
} else if (length(args)==2) {
# default output file
run_name <- args[1]
print(paste("Storing outputs in:", run_name))
arc_root <- args[2]
print(paste("ARC root:", arc_root))
}
########################
########################
# Determine diff. gene expression with sleuth
########################
library(sleuth)
# Load the sleuth object
load(file = paste0(arc_root, 'runs/', run_name, '/03_kallisto_sleuthObject.RData'))
so <- sleuth_fit(so)
so <- sleuth_fit(so, ~Photosynthesis.mode, 'full')
so <- sleuth_fit(so, ~1, 'reduced')
so <- sleuth_lrt(so, 'reduced', 'full')
sleuth_table <- sleuth_results(so, 'reduced:full', 'lrt', show_all = FALSE)
write.csv(sleuth_table, paste0(arc_root, 'runs/', run_name, '/04_sleuth_dge.csv'), row.names = F)
#!/usr/bin/env Rscript
########################
#### To be replaced by CWL routine
########################
args = commandArgs(trailingOnly=TRUE)
# test if <Run name> and <ARC root> argument are supplied: if not, return an error
if (length(args)!=2) {
stop("<Run name> and <ARC root> must be supplied as arguments", call.=FALSE)
} else if (length(args)==2) {
# default output file
run_name <- args[1]
print(paste("Storing outputs in:", run_name))
arc_root <- args[2]
print(paste("ARC root:", arc_root))
}
########################
# Prep data for shiny app
########################
library(openxlsx)
expression_data <- read.csv(file = paste0(arc_root, 'runs/', run_name, '/03_kallisto_df.csv'))
available_genes <- unique(expression_data$target_id)
save(expression_data, available_genes, file = paste0(arc_root, 'runs/', run_name, '/05_shinyPrep.RData'))
#!/usr/bin/env bash
#######################################
#### To be replaced by CWL routine ####
#######################################
# This is supposed to wrap the workflow from kallisto quantification to sleuth diff gene expression
# Execute within <ARC root>/workflows:
## change rights: `chmod a+x kallisto_sleuth_wrapper.sh`
## execute script: `./kallisto_sleuth_wrapper.sh <run_name> <arc_root>`
## Example: ./kallisto_sleuth_wrapper.sh "run2" "/Users/dominikbrilhaus/03DataPLANT_gitlab/samplearc_rnaseq/"
run_name='kallisto_sleuth/'$1
arc_root=$2
# Note: due to some not well documented change in kallisto development (use of hd5) the kallisto quant part is currently not reproducible and commented out
# Pointer to `runs/kallisto_sleuth/run1/01_kallisto_results` is thus hard coded into 03_KallistoCollect.R
mkdir -p $arc_root'/runs/'$run_name
# chmod a+x ./kallisto_sleuth/01_KallistoIndex.sh
# ./kallisto_sleuth/01_KallistoIndex.sh $run_name $arc_root
# chmod a+x kallisto_sleuth/01_KallistoQuant.sh
# ./kallisto_sleuth/01_KallistoQuant.sh $run_name $arc_root
Rscript ./kallisto_sleuth/02_InstallPackages.R
echo "r insallations done"
Rscript ./kallisto_sleuth/03_KallistoCollect.R $run_name $arc_root
echo "kallisto collect done"
Rscript ./kallisto_sleuth/04_Sleuth.R $run_name $arc_root
echo "sleuth done"
Rscript ./kallisto_sleuth/05_plot_shinyPrep.R $run_name $arc_root
echo "shiny prep done"
# store variables for test runs in R
# run_name="kallisto_sleuth/run2"
# arc_root="/Users/dominikbrilhaus/03DataPLANT_gitlab/samplearc_rnaseq/"
\ No newline at end of file
#!/usr/bin/env Rscript
################################################
#### Test area (Within R)
################################################
# arc_root <- "~/03_DataPLANT_gitlab/samplearc_rnaseq/"
# out_folder <- "runs/shiny_prep"
# in_kallisto_df <- "runs/kallisto_collect/kallisto_df.csv"
################################################
#### Load required library
################################################
################################################
#### Read arguments from CLI
################################################
args <- commandArgs(trailingOnly = T)
arc_root <- args[1]
out_folder <- args[2]
in_kallisto_df <- args[3]
################################################
#### If it does not exist, create out dir
################################################
dir.create(paste(arc_root, out_folder, sep = "/"), recursive = T, showWarnings = F)
################################################
#### Prep data for shiny app
################################################
expression_data <- read.csv(file = paste(arc_root, in_kallisto_df, sep = "/"))
available_genes <- unique(expression_data$target_id)
save(expression_data, available_genes, file = paste(arc_root, out_folder, 'shiny_prep.RData', sep = "/"))
#!/usr/bin/env cwl-runner
cwlVersion: v1.2
class: CommandLineTool
inputs:
- id: r_script
type: File
inputBinding:
position: 0
- id: arc_root
type: string
inputBinding:
position: 1
- id: out_folder
type: string
inputBinding:
position: 2
- id: in_kallisto_df
type: string
inputBinding:
position: 3
outputs:
- id: outdir
type:
type: array
items: Directory
outputBinding:
glob: $(runtime.outdir)/$(inputs.out_folder)
baseCommand:
- Rscript
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment