Skip to content
Snippets Groups Projects
Commit 283b1086 authored by Dominik Brilhaus's avatar Dominik Brilhaus
Browse files

cwl kallisto collect

parent a2656643
No related branches found
No related tags found
No related merge requests found
# cwltool README
## First cd into the (runs) folder, with the .yml file
cd /Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq/runs/kallisto_collect
## Let it flow
```bash
### store arc root (two levels up from here) as variable
arc_root=$(echo ${PWD%/*/*})
### replace arc root line in yml (specific to the machine from where this is run)
### not sure, if this works on linux...
sed -i '' "s|^arc_root:.*|arc_root: $arc_root|g" kallisto_collect.yml
### run with cwltool
cwltool ../../workflows/kallisto_collect.cwl kallisto_collect.yml
```
cores: 1
r_script:
class: File
path: ../../workflows/kallisto_collect.R
in_kallisto_results: "runs/no_CWL_yet/kallisto_sleuth/run1/01_kallisto_results"
in_metadata_file: "runs/merged_isa_metadata/merged_isa.tsv"
in_metadata_sample: "Sample.Name.2"
in_metadata_factor: "Factor..Photosynthesis.mode."
out_folder: runs/kallisto_collect
arc_root: /Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq
\ No newline at end of file
"ID","n_targets","n_bootstraps","n_processed","n_pseudoaligned","n_unique","p_pseudoaligned","p_unique","kallisto_version","index_version","start_time","call"
"DB_097",31434,100,11071656,7663654,7522185,69.2,67.9,"0.44.0",10,"Fri Aug 6 15:48:13 2021","kallisto quant --single -b 100 -t 30 -l 200 -s 20 -i /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005//runs/01_kallisto_index -o /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005//runs/01_kallisto_results/DB_097 /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005/assays/Talinum_RNASeq_minimal/dataset/DB_097_CAMMD_CAGATC_L001_R1_001.fastq.gz"
"DB_099",31434,100,14332278,10066506,9848811,70.2,68.7,"0.44.0",10,"Fri Aug 6 15:49:19 2021","kallisto quant --single -b 100 -t 30 -l 200 -s 20 -i /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005//runs/01_kallisto_index -o /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005//runs/01_kallisto_results/DB_099 /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005/assays/Talinum_RNASeq_minimal/dataset/DB_099_CAMMD_CTTGTA_L001_R1_001.fastq.gz"
"DB_103",31434,100,13083957,9007720,8828236,68.8,67.5,"0.44.0",10,"Fri Aug 6 15:50:41 2021","kallisto quant --single -b 100 -t 30 -l 200 -s 20 -i /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005//runs/01_kallisto_index -o /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005//runs/01_kallisto_results/DB_103 /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005/assays/Talinum_RNASeq_minimal/dataset/DB_103_CAMMD_AGTCAA_L001_R1_001.fastq.gz"
"DB_161",31434,100,12785081,9058396,8740183,70.9,68.4,"0.44.0",10,"Fri Aug 6 15:51:58 2021","kallisto quant --single -b 100 -t 30 -l 200 -s 20 -i /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005//runs/01_kallisto_index -o /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005//runs/01_kallisto_results/DB_161 /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005/assays/Talinum_RNASeq_minimal/dataset/DB_161_reC3MD_GTCCGC_L001_R1_001.fastq.gz"
"DB_163",31434,100,14312514,10054493,9707941,70.2,67.8,"0.44.0",10,"Fri Aug 6 15:53:12 2021","kallisto quant --single -b 100 -t 30 -l 200 -s 20 -i /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005//runs/01_kallisto_index -o /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005//runs/01_kallisto_results/DB_163 /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005/assays/Talinum_RNASeq_minimal/dataset/DB_163_reC3MD_GTGAAA_L001_R1_001.fastq.gz"
"DB_165",31434,100,13679479,9552504,9264346,69.8,67.7,"0.44.0",10,"Fri Aug 6 15:54:35 2021","kallisto quant --single -b 100 -t 30 -l 200 -s 20 -i /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005//runs/01_kallisto_index -o /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005//runs/01_kallisto_results/DB_165 /Users/dominikbrilhaus/sciebo/CEPLAS_DM/CEPLAS_ARCs/ARC0005/assays/Talinum_RNASeq_minimal/dataset/DB_165_re-C3MD_GTGAAA_L002_R1_001.fastq.gz"
This diff is collapsed.
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
cd /Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq/runs/merged_isa_metadata cd /Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq/runs/merged_isa_metadata
## Let it flow ## Let it flow
```bash ```bash
...@@ -12,15 +13,11 @@ cd /Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq/runs/merged_isa_m ...@@ -12,15 +13,11 @@ cd /Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq/runs/merged_isa_m
### store arc root (two levels up from here) as variable ### store arc root (two levels up from here) as variable
arc_root=$(echo ${PWD%/*/*}) arc_root=$(echo ${PWD%/*/*})
# arc_root=/Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq/ ### replace arc root line in yml (specific to the machine from where this is run)
### not sure, if this works on linux...
### write a temporary yml sed -i '' "s|^arc_root:.*|arc_root: $arc_root|g" merge_isa_metadata.yml
cat merge_isa_metadata.yml > merge_isa_metadata_rooted.yml
### add the arc root to that yml
printf "\narc_root: $arc_root\n" >> merge_isa_metadata_rooted.yml
### run with cwltool ### run with cwltool
cwltool ../../workflows/merge_isa_metadata.cwl merge_isa_metadata_rooted.yml cwltool ../../workflows/merge_isa_metadata.cwl merge_isa_metadata.yml
``` ```
...@@ -4,4 +4,5 @@ r_script: ...@@ -4,4 +4,5 @@ r_script:
path: ../../workflows/merge_isa_metadata.R path: ../../workflows/merge_isa_metadata.R
in_isa_study: studies/TalinumFacultativeCAM/isa.study.xlsx:plant_growth in_isa_study: studies/TalinumFacultativeCAM/isa.study.xlsx:plant_growth
in_isa_assay: assays/Talinum_RNASeq_minimal/isa.assay.xlsx:2EXT01_RNA:3ASY01_RNASeq in_isa_assay: assays/Talinum_RNASeq_minimal/isa.assay.xlsx:2EXT01_RNA:3ASY01_RNASeq
out_folder: runs/merged_isa_metadata out_folder: runs/merged_isa_metadata
\ No newline at end of file arc_root: /Users/dominikbrilhaus/03_DataPLANT_gitlab/samplearc_rnaseq
\ No newline at end of file
#!/usr/bin/env Rscript
################################################
#### Test area (Within R)
################################################
# arc_root <- "~/03_DataPLANT_gitlab/samplearc_rnaseq/"
# in_kallisto_results <- "runs/kallisto_sleuth/run1/01_kallisto_results"
# in_metadata_file <- "runs/merged_isa_metadata/merged_isa.tsv"
# in_metadata_sample <- "Sample.Name.2"
# in_metadata_factor <- "Factor..Photosynthesis.mode."
# out_folder <- "runs/kallisto_collect"
################################################
#### Load required library
################################################
library(sleuth)
library(tidyverse)
library(jsonlite)
################################################
#### Read arguments from CLI
################################################
args <- commandArgs(trailingOnly = T)
arc_root <- args[1]
in_kallisto_results <- args[2]
in_metadata_file <- args[3]
in_metadata_sample <- args[4]
in_metadata_factor <- args[5]
out_folder <- args[6]
################################################
#### If it does not exist, create out dir
################################################
dir.create(paste(arc_root, out_folder, sep = "/"), recursive = T, showWarnings = F)
################################################
#### Read ISA sample metadata
################################################
samples <- read.table(file = paste(arc_root, in_metadata_file, sep = "/"), sep = "\t")
################################################
#### Read Kallisto results
################################################
base_dir <- paste(arc_root, in_kallisto_results, sep = "/")
# A list of paths to the kallisto results indexed by the sample IDs is collated with
kal_dirs <- dir(base_dir, full.names = T) ## Sleuth requires full paths
s2c <- samples[order(samples[[in_metadata_sample]]), c(in_metadata_sample, in_metadata_factor)]
# For kallisto / sleuth: 's2c' (sample_to_covariates) must contain a column named 'sample'
colnames(s2c) <- c("sample", "condition")
s2c$path <- kal_dirs
s2c <- s2c[order(s2c$sample), ]
# Build a sleuth object
so <- sleuth_prep(s2c, full_model = ~condition, num_cores = 1)
save(so, file = paste(arc_root, out_folder, "kallisto_sleuthObject.RData", sep = "/"))
################################################
#### Extract expression tables
################################################
## as data.frame
expression_data <- kallisto_table(so)
## write to file
write.csv(expression_data, paste(arc_root, out_folder, "/kallisto_df.csv", sep = "/"), row.names = F)
## as tpm matrix (gene x sample)
tpm_table <- pivot_wider(expression_data, id_cols = target_id, names_from = sample, values_from = tpm)
## write to file
write.csv(tpm_table, paste(arc_root, out_folder, "/kallisto_tpmMatrix.csv", sep = "/"), row.names = F)
################################################
#### Summarize mapping stats
################################################
mapping_stats <- c()
for (i in dir(kal_dirs, pattern = ".json", full.names = T))
{
id <- unlist(strsplit(i, split = "/"))
z <- data.frame(ID = id[length(id) - 1], read_json(i, simplifyVector = T))
mapping_stats <- rbind(mapping_stats, z)
}
write.csv(mapping_stats, paste(arc_root, out_folder, "/kallisto_mappingStats.csv", sep = "/"), row.names = F)
#!/usr/bin/env cwl-runner
cwlVersion: v1.2
class: CommandLineTool
inputs:
- id: r_script
type: File
inputBinding:
position: 0
- id: arc_root
type: string
inputBinding:
position: 1
- id: in_kallisto_results
type: string
inputBinding:
position: 2
- id: in_metadata_file
type: string
inputBinding:
position: 3
- id: in_metadata_sample
type: string
inputBinding:
position: 4
- id: in_metadata_factor
type: string
inputBinding:
position: 5
- id: out_folder
type: string
inputBinding:
position: 6
outputs:
- id: outdir
type:
type: array
items: Directory
outputBinding:
glob: $(runtime.outdir)/$(inputs.out_folder)
baseCommand:
- Rscript
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment