diff --git a/README.md b/README.md index 736fe639f54fae9b4da24667b05acd5d71d64850..05f29a6c0047281c6b8f5ec62d46390e415521dc 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,30 @@ -# SampleARC_RNASeq +# ARC minimal Example RNASeq + + +## Notes + +- CWL not yet implemented + + + +### isa.assay +- split GEO SWATE templates into four sheets + - 1SPL01_plants + - 2EXT01_RNA + - 3ASY01_RNASeq + - 4COM01_RNASeq + + +### adding raw data via git lfs + +``` +git lfs track "*.fastq.gz" +``` + +add data to assays folder + +``` +git add assays/Talinum_RNASeq_minimal/dataset/ +``` + diff --git a/TalinumFacultativeCAM.study.xlsx b/TalinumFacultativeCAM.study.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..3dc159704211be4bd2b6d62bfc315a48f567e134 Binary files /dev/null and b/TalinumFacultativeCAM.study.xlsx differ diff --git a/assays/Talinum_RNASeq_minimal/README.md b/assays/Talinum_RNASeq_minimal/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/assays/Talinum_RNASeq_minimal/dataset/DB_097_CAMMD_CAGATC_L001_R1_001.fastq.gz b/assays/Talinum_RNASeq_minimal/dataset/DB_097_CAMMD_CAGATC_L001_R1_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..f741badf5f61e7497ede70baa210a26f0359422c --- /dev/null +++ b/assays/Talinum_RNASeq_minimal/dataset/DB_097_CAMMD_CAGATC_L001_R1_001.fastq.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f0b61abc9971ade750ce54f0f10aa50a6bc42a517be8ba60257e0cf6776d1c7 +size 1451886904 diff --git a/assays/Talinum_RNASeq_minimal/dataset/DB_099_CAMMD_CTTGTA_L001_R1_001.fastq.gz b/assays/Talinum_RNASeq_minimal/dataset/DB_099_CAMMD_CTTGTA_L001_R1_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..aca6b925031ec2410da74d7840ef280d7b2c7f06 --- /dev/null +++ b/assays/Talinum_RNASeq_minimal/dataset/DB_099_CAMMD_CTTGTA_L001_R1_001.fastq.gz @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:2d4200a78e57bc821b46c1b6c84b1c5bcf5889eaba82e6baff16cc35f2e77651 +size 1879439049 diff --git a/assays/Talinum_RNASeq_minimal/dataset/DB_103_CAMMD_AGTCAA_L001_R1_001.fastq.gz b/assays/Talinum_RNASeq_minimal/dataset/DB_103_CAMMD_AGTCAA_L001_R1_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..11c69db04d237cd06134fb6afcaedef400680c9f --- /dev/null +++ b/assays/Talinum_RNASeq_minimal/dataset/DB_103_CAMMD_AGTCAA_L001_R1_001.fastq.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7f7f26c5eb54c04aad9b4168b3104fcf17c6cbaff4ce4300d07c8955dbbeb28 +size 1713418642 diff --git a/assays/Talinum_RNASeq_minimal/dataset/DB_161_reC3MD_GTCCGC_L001_R1_001.fastq.gz b/assays/Talinum_RNASeq_minimal/dataset/DB_161_reC3MD_GTCCGC_L001_R1_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..7505cdb1fcd631eee30c5b37ab157f0efe1dfacd --- /dev/null +++ b/assays/Talinum_RNASeq_minimal/dataset/DB_161_reC3MD_GTCCGC_L001_R1_001.fastq.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:827908c02722e3bf22ca4cdfaeda269d999a36ae82e54ffeefc1602f7d3938f5 +size 1684786710 diff --git a/assays/Talinum_RNASeq_minimal/dataset/DB_163_reC3MD_GTGAAA_L001_R1_001.fastq.gz b/assays/Talinum_RNASeq_minimal/dataset/DB_163_reC3MD_GTGAAA_L001_R1_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..baa83fb94ed5a8a8b19bfd47f4070bdca4d6ccc7 --- /dev/null +++ b/assays/Talinum_RNASeq_minimal/dataset/DB_163_reC3MD_GTGAAA_L001_R1_001.fastq.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08a331f71ef4ba111d5c8c0a76cb5e767b93b208ba9d50c93c4b632115bfea03 +size 1880798768 diff --git a/assays/Talinum_RNASeq_minimal/dataset/DB_165_re-C3MD_GTGAAA_L002_R1_001.fastq.gz b/assays/Talinum_RNASeq_minimal/dataset/DB_165_re-C3MD_GTGAAA_L002_R1_001.fastq.gz new file mode 100644 index 
0000000000000000000000000000000000000000..e09a781dc1ce35496e27d02ce233f9106cb54ee9 --- /dev/null +++ b/assays/Talinum_RNASeq_minimal/dataset/DB_165_re-C3MD_GTGAAA_L002_R1_001.fastq.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc8e318352b10c72c032ebf1609ea14cd57c454e0f922a7a0c14f1d8f754dfb9 +size 1801995178 diff --git a/assays/Talinum_RNASeq_minimal/isa.assay.xlsx b/assays/Talinum_RNASeq_minimal/isa.assay.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..c6e6a3420aef7726b7b274ea429597e991cfa749 Binary files /dev/null and b/assays/Talinum_RNASeq_minimal/isa.assay.xlsx differ diff --git a/assays/Talinum_RNASeq_minimal/protocols/01_plant_material.md b/assays/Talinum_RNASeq_minimal/protocols/01_plant_material.md new file mode 100644 index 0000000000000000000000000000000000000000..4622948c73164021e191ec5a2dcdb0a8e7e5b697 --- /dev/null +++ b/assays/Talinum_RNASeq_minimal/protocols/01_plant_material.md @@ -0,0 +1,3 @@ +# Plant Material and Growth Conditions + +Talinum triangulare plants were grown in Miracle-Gro Potting Mix (Miracle-Gro) in “Short-One” treepots, 1.6 l (Stuewe and Sons). The experiment was initiated with 28-d-old plants in a controlled environment chamber (Environmental Growth Chambers) maintained under 12 h light (30°C, 37% relative humidity)/12 h dark (22°C) cycles. Photon flux density at leaf level was 425 µmol m⁻² s⁻¹. Irrigation was withheld on day 1 and recommenced on day 14. Leaves were harvested when plants were well-watered as well as after 4, 9, and 12 d of water deprivation and watered for two days following the drought period. 
\ No newline at end of file diff --git a/assays/Talinum_RNASeq_minimal/protocols/02_RNAex_libraries.md b/assays/Talinum_RNASeq_minimal/protocols/02_RNAex_libraries.md new file mode 100644 index 0000000000000000000000000000000000000000..2ffb4c49322f11a3737bd05f957f9a12ae0b4332 --- /dev/null +++ b/assays/Talinum_RNASeq_minimal/protocols/02_RNAex_libraries.md @@ -0,0 +1,3 @@ +# RNA Extraction, Preparation, and Sequencing of Illumina Libraries + +The topmost mature unshaded leaves (of approximately 3–4.5 cm length) of T. triangulare were harvested in the middle of the light or the middle of the dark period and immediately frozen in liquid nitrogen. RNA was isolated from ground tissue using the GeneMatrix Universal RNA Purification Kit (EURx Ltd.). Residues of DNA were removed with DNase (New England Biolabs). RNA integrity, sequencing library, and fragment size were analyzed on a 2100 Bioanalyzer (Agilent). Libraries were prepared using the TruSeq RNA Sample Prep Kit v2 (Illumina) and quantified with a Qubit 2.0 (Invitrogen). Samples were multiplexed with 12 libraries per lane and sequenced in single-end mode (Rapid Run, 150 bp read length) on an Illumina HiSeq 2000 platform, yielding ~14 million reads per library. 
diff --git a/assays/Talinum_RNASeq_minimal/protocols/README.md b/assays/Talinum_RNASeq_minimal/protocols/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/externals/README.md b/externals/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c28804c6a77108569f9ea2358f15589455d9689d
--- /dev/null
+++ b/externals/README.md
@@ -0,0 +1,7 @@
+
+
+### Talinum Genome ref
+File: Talinum.gm.CDS.nt.fa
+Source: weber_fileshare/data/Eva_Maleckova-CAM/Genomics/Genome/Talinum-Flye-Polca_v201017/Talinum.gm.CDS.nt.fa
+Contributor: Eva Maleckova
+
diff --git a/isa.investigation.xlsx b/isa.investigation.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..c0e3330b608535e018150962d00fe31111d9b9b9
Binary files /dev/null and b/isa.investigation.xlsx differ
diff --git a/workflows/01_KallistoQuant.sh b/workflows/01_KallistoQuant.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8975e04cb25efc27a85979848de45744ec5be05a
--- /dev/null
+++ b/workflows/01_KallistoQuant.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+########################
+#### To be replaced by CWL routine
+########################
+
+# Stop at the first failing command, unset variable or broken pipe stage
+# instead of carrying on with a missing index or partial results.
+set -euo pipefail
+
+ARC_root=~/Hackathon_ARCexample_rnaseq/
+cd "${ARC_root}workflows/"
+
+# chmod a+x 01_KallistoQuant.sh
+# ./01_KallistoQuant.sh > $ARC_root'runs/01_kallisto.log' 2>&1 &
+
+########################
+
+
+# Map RNASeq reads via kallisto
+
+## Manual: http://pachterlab.github.io/kallisto/manual.html
+
+kallisto version
+kallisto cite
+
+# Create runs/ and the results folder before anything is written there;
+# -p also keeps a re-run from failing on the existing folder.
+mkdir -p "${ARC_root}runs/01_kallisto_results/"
+
+### Build index
+
+kall_ref="${ARC_root}externals/Talinum.gm.CDS.nt.fa"
+kallisto index -i "${ARC_root}runs/01_kallisto_index" "$kall_ref"
+
+### Align reads
+
+# Loop over the glob itself rather than over 'ls' output, so file names
+# are never re-split on whitespace.
+for j in "${ARC_root}assays/Talinum_RNASeq_minimal/dataset/"*fastq.gz; do
+
+    # An unmatched glob is passed through literally; skip it
+    [ -e "$j" ] || continue
+
+    sampleName=$(basename "$j" | cut -c -6) # cut away path. retain only first six chars of file name
+    echo "$sampleName"
+
+    kallisto quant --single -b 100 -t 30 -l 200 -s 20 -i "${ARC_root}runs/01_kallisto_index" -o "${ARC_root}runs/01_kallisto_results/$sampleName" "$j"
+
+    echo 'Kallisto done'
+
+done
diff --git a/workflows/03_KallistoCollect.R b/workflows/03_KallistoCollect.R
new file mode 100644
index 0000000000000000000000000000000000000000..796a61218717e37e36b59cb01da0e0f5c738e875
--- /dev/null
+++ b/workflows/03_KallistoCollect.R
@@ -0,0 +1,97 @@
+
+
+########################
+#### To be replaced by CWL routine
+########################
+
+ARC_root <- "~/Hackathon_ARCexample_rnaseq/"
+setwd(paste0(ARC_root, "workflows/"))
+
+########################
+
+########################
+# Collect kallisto data
+########################
+
+
+# ### sleuth installation
+#
+# if (!requireNamespace("BiocManager", quietly = TRUE))
+#   install.packages("BiocManager")
+# BiocManager::install()
+# BiocManager::install("devtools") # only if devtools not yet installed
+# BiocManager::install("pachterlab/sleuth")
+
+library(sleuth)
+library(tidyverse)
+library(jsonlite)
+library(openxlsx)
+
+## read experimental metadata from isa.assay wb
+
+# The workbook is stored in the assay folder as 'isa.assay.xlsx'
+isa_assay <- paste0(ARC_root, "assays/Talinum_RNASeq_minimal/isa.assay.xlsx")
+
+# Join the plant sheet and the sequencing sheet on their shared sample name
+assay_data <- merge(
+  readWorkbook(isa_assay, "1SPL01_plants", startRow = 2),
+  readWorkbook(isa_assay, "3ASY01_RNASeq", startRow = 2),
+  by = "Sample.Name"
+)
+
+## remove empty cols
+# drop = FALSE keeps a data.frame even when a single column is left
+empty_cols <- colSums(is.na(assay_data)) == nrow(assay_data)
+assay_data <- assay_data[, !empty_cols, drop = FALSE]
+
+# Pointer to kallisto results folder
+base_dir <- paste0(ARC_root, "runs/01_kallisto_results")
+
+# A list of paths to the kallisto results indexed by the sample IDs is collated with
+kal_dirs <- dir(base_dir, full.names = TRUE) ## Sleuth requires full paths
+
+s2c <- assay_data[
+  order(assay_data$Sample.Name),
+  c("Sample.Name", "Characteristics.[Photosynthesis.mode]")
+]
+# For kallisto / sleuth: 's2c' (sample_to_covariates) must contain a column named 'sample'
+colnames(s2c) <- c("sample", "Photosynthesis.mode")
+
+# Result folders are paired with samples purely by sort order, so at least
+# make sure there are as many folders as samples before assigning them
+# (a shorter vector could otherwise be recycled silently).
+stopifnot(length(kal_dirs) == nrow(s2c))
+s2c$path <- kal_dirs
+s2c <- s2c[order(s2c$sample), ]
+
+# Build a sleuth object
+so <- sleuth_prep(s2c, ~Photosynthesis.mode)
+save(so, file = paste0(ARC_root, "runs/03_kallisto_sleuthObject.RData"))
+
+# Extract expression tables
+
+## as data.frame
+expression_data <- kallisto_table(so)
+write.csv(expression_data, paste0(ARC_root, "runs/03_kallisto_df.csv"), row.names = FALSE)
+
+## as tpm matrix (gene x sample)
+tpm_table <- pivot_wider(expression_data, id_cols = target_id, names_from = sample, values_from = tpm)
+write.csv(tpm_table, paste0(ARC_root, "runs/03_kallisto_tpmMatrix.csv"), row.names = FALSE)
+
+
+# Summarize mapping stats
+
+# Read every JSON file in the result folders into one row each; the name of
+# the containing folder serves as ID. Rows are bound once instead of
+# growing a data.frame inside a loop.
+json_files <- dir(kal_dirs, pattern = "\\.json$", full.names = TRUE)
+mapping_stats <- do.call(
+  rbind,
+  lapply(json_files, function(json_file) {
+    data.frame(
+      ID = basename(dirname(json_file)),
+      read_json(json_file, simplifyVector = TRUE)
+    )
+  })
+)
+
+write.csv(mapping_stats, paste0(ARC_root, "runs/03_kallisto_mappingStats.csv"), row.names = FALSE)
diff --git a/workflows/04_Sleuth.R b/workflows/04_Sleuth.R
new file mode 100644
index 0000000000000000000000000000000000000000..ab2f5791cc20d8343cee2f934443d87b5e317c8e
--- /dev/null
+++ b/workflows/04_Sleuth.R
@@ -0,0 +1,32 @@
+
+
+########################
+#### To be replaced by CWL routine
+########################
+
+ARC_root <- "~/Hackathon_ARCexample_rnaseq/"
+setwd(paste0(ARC_root, "workflows/"))
+
+########################
+
+########################
+# Determine diff. gene expression with sleuth
+########################
+
+library(sleuth)
+
+# Load the sleuth object
+load(file = paste0(ARC_root, "runs/03_kallisto_sleuthObject.RData"))
+
+
+# Likelihood ratio test of the full model against the intercept-only model.
+# The covariate is called 'Photosynthesis.mode' in the sample table that
+# 03_KallistoCollect.R passes to sleuth_prep(); there is no 'Group' column.
+so <- sleuth_fit(so, ~Photosynthesis.mode, "full")
+so <- sleuth_fit(so, ~1, "reduced")
+so <- sleuth_lrt(so, "reduced", "full")
+
+sleuth_table <- sleuth_results(so, "reduced:full", "lrt", show_all = FALSE)
+
+
+write.csv(sleuth_table, paste0(ARC_root, "runs/04_sleuth_dge.csv"), row.names = FALSE)
diff --git a/workflows/05_plot_shinyPrep.R b/workflows/05_plot_shinyPrep.R
new file mode 100644
index 0000000000000000000000000000000000000000..e0b2fe554819c1c0f21057a7cf283c021319de3f
--- /dev/null
+++ b/workflows/05_plot_shinyPrep.R
@@ -0,0 +1,23 @@
+
+
+########################
+#### To be replaced by CWL routine
+########################
+
+ARC_root <- "~/Hackathon_ARCexample_rnaseq/"
+setwd(paste0(ARC_root, "workflows/"))
+
+########################
+
+########################
+# Prep data for shiny app
+########################
+
+library(openxlsx)
+
+# Long-format expression table written by 03_KallistoCollect.R
+expression_data <- read.csv(file = paste0(ARC_root, "runs/03_kallisto_df.csv"))
+available_genes <- unique(expression_data$target_id)
+
+# Bundle both objects for 06_plot_shinyApp.Rmd
+save(expression_data, available_genes, file = paste0(ARC_root, "runs/05_shinyPrep.RData"))
diff --git a/workflows/05_plotshinyPrep/05_plot_shinyPrep.R b/workflows/05_plotshinyPrep/05_plot_shinyPrep.R
new file mode 100644
index 0000000000000000000000000000000000000000..664bf3919e0b69d7d8c62100728e8f97aa09048d
--- /dev/null
+++ b/workflows/05_plotshinyPrep/05_plot_shinyPrep.R
@@ -0,0 +1,23 @@
+
+########################
+# Prep data for shiny app
+########################
+
+# Usage: Rscript 05_plot_shinyPrep.R <kallisto_df.csv>
+# Writes 05_shinyPrep.RData into the current working directory.
+
+# Fetch over https so the download cannot be altered in transit
+install.packages("openxlsx", dependencies = TRUE, lib = "./lib", repos = "https://cran.rstudio.com/")
+library(openxlsx, lib.loc = "./lib")
+
+# Named 'cli_args' to avoid reusing the name of base::options()
+cli_args <- commandArgs(trailingOnly = TRUE)
+if (length(cli_args) < 1) {
+  stop("Usage: Rscript 05_plot_shinyPrep.R <kallisto_df.csv>", call. = FALSE)
+}
+wd <- getwd()
+
+expression_data <- read.csv(file = cli_args[1])
+available_genes <- unique(expression_data$target_id)
+
+save(expression_data, available_genes, file = paste0(wd, "/05_shinyPrep.RData"))
diff --git a/workflows/05_plotshinyPrep/plot_shinyPrep.cwl b/workflows/05_plotshinyPrep/plot_shinyPrep.cwl
new file mode 100644
index 0000000000000000000000000000000000000000..04ffba69da5da065507a0df34d17bc7def4834f3
--- /dev/null
+++ b/workflows/05_plotshinyPrep/plot_shinyPrep.cwl
@@ -0,0 +1,33 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.2
+class: CommandLineTool
+hints:
+  DockerRequirement:
+    dockerPull: rocker/tidyverse:4.1
+requirements:
+  # The R script installs openxlsx from CRAN at run time
+  - class: NetworkAccess
+    networkAccess: true
+  - class: InlineJavascriptRequirement
+  # Stage an empty, writable ./lib for install.packages() to install into
+  - class: InitialWorkDirRequirement
+    listing:
+      - entry: "$({class: 'Directory', listing: []})"
+        entryname: "./lib"
+        writable: true
+baseCommand: Rscript
+inputs:
+  rScript:
+    type: File
+    inputBinding:
+      position: 1
+  kallistoResults:
+    type: File
+    inputBinding:
+      position: 2
+outputs:
+  outFile:
+    type: File
+    outputBinding:
+      glob: "*05_shinyPrep.RData"
\ No newline at end of file
diff --git a/workflows/05_plotshinyPrep/plot_shinyPrep.yml b/workflows/05_plotshinyPrep/plot_shinyPrep.yml
new file mode 100644
index 0000000000000000000000000000000000000000..426ca847efcbc3e014897beafd870ce2409ebcd7
--- /dev/null
+++ b/workflows/05_plotshinyPrep/plot_shinyPrep.yml
@@ -0,0 +1,6 @@
+rScript:
+  class: File
+  path: ./05_plot_shinyPrep.R
+kallistoResults:
+  class: File
+  path: ./../../runs/run1/03_kallisto_df.csv
\ No newline at end of file
diff --git a/workflows/06_plot_shinyApp.Rmd b/workflows/06_plot_shinyApp.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..d48a3b6337495a47fc9b91a8eb759aa5f4fc060e
--- /dev/null
+++ b/workflows/06_plot_shinyApp.Rmd
@@ -0,0 +1,108 @@
+---
+title: "Plot RNASeq data mapped against Talinum genome"
+output: html_document
+runtime: shiny
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+
+```
+
+
+```{r load_data, include=FALSE}
+
+# Load data
+
+ARC_root <- "~/Hackathon_ARCexample_rnaseq/"
+load(file = paste0(ARC_root, "runs/05_shinyPrep.RData"))
+
+```
+
+
+```{r plot_setup, message=TRUE, warning=TRUE, include=FALSE}
+
+# Setup plot environment
+
+required.packages <- c(
+  "knitr", "kableExtra",  ## RMarkdown
+  "shiny", "tidyverse",   ## data loading and shaping
+  "RColorBrewer"          ## plotting
+)
+
+for (package in required.packages) {
+  print(package)
+  ## Check if package is installed. If not, install
+  if (!package %in% row.names(installed.packages())) {
+    install.packages(package, repos = "https://cran.uni-muenster.de/")
+  }
+  ## Load package
+  library(package, character.only = TRUE)
+}
+
+```
+
+
+```{r, eval=FALSE, echo=FALSE}
+
+
+# Non-interactive test
+
+current_selection <- sample(expression_data$target_id, 10)
+
+plot_set <- subset(expression_data, target_id %in% current_selection)
+
+# The condition column is called 'Photosynthesis.mode' (see 03_KallistoCollect.R)
+ggplot(plot_set, aes(x = Photosynthesis.mode, y = tpm, group = Photosynthesis.mode)) +
+  stat_summary(fun = "mean", geom = "bar") +
+  geom_point(size = 0.5) +
+  facet_wrap(~ target_id, scales = "free") +
+  theme_minimal()
+
+```
+
+# Let it shine
+
+```{r shiny_part, echo=FALSE}
+
+sidebarLayout(
+
+  sidebarPanel(
+
+    selectizeInput(
+      "target",
+      label = "Select Gene by target id",
+      choices = available_genes,
+      selected = sample(available_genes, size = 1),
+      multiple = TRUE,
+      options = list(
+        delimiter = " ",
+        create = I("function(input, callback){return {value: input, text: input};}")
+      )
+    ),
+    helpText("You can copy/paste target ids from excel")
+
+  ),
+
+
+  mainPanel(
+
+    renderPlot({
+
+      plot_set <- subset(expression_data, target_id %in% input$target)
+
+      ## Facetted by gene only
+
+      ggplot(plot_set, aes(x = Photosynthesis.mode, y = tpm)) +
+        stat_summary(fun = "mean", geom = "bar") +
+        geom_point(size = 0.5) +
+        facet_wrap(~ target_id, scales = "free") +
+        theme_minimal() +
+        theme(aspect.ratio = 1)
+
+    })
+
+  )
+)
+
+```