Skip to content
Snippets Groups Projects
Commit 799e19c5 authored by omaus's avatar omaus
Browse files

Add ARC

parent 0034b030
No related branches found
No related tags found
No related merge requests found
Showing
with 260 additions and 1 deletion
# SampleARC_RNASeq # ARC minimal Example RNASeq
## Notes
- CWL not yet implemented
### isa.assay
- split GEO SWATE templates into four sheets
- 1SPL01_plants
- 2EXT01_RNA
- 3ASY01_RNASeq
- 4COM01_RNASeq
### adding raw data via git lfs
```
git lfs track "*.fastq.gz"
```
add data to assays folder
```
git add assays/Talinum_RNASeq_minimal/dataset/
```
File added
File added
File added
File added
File added
File added
File added
File added
# Plant Material and Growth Conditions
Talinum triangulare plants were grown in Miracle-Gro Potting Mix (Miracle-Gro) in “Short-One” treepots, 1.6 l (Stuewe and Sons). The experiment was initiated with 28-d-old plants in a controlled environment chamber (Environmental Growth Chambers) maintained under 12 h light (30°C, 37% relative humidity)/12 h dark (22°C) cycles. Photon flux density at leaf level was 425 µmol m⁻² s⁻¹. Irrigation was withheld on day 1 and recommenced on day 14. Leaves were harvested when plants were well-watered, after 4, 9, and 12 d of water deprivation, and after two days of re-watering following the drought period.
\ No newline at end of file
# RNA Extraction, Preparation, and Sequencing of Illumina Libraries
The topmost mature unshaded leaves (of approximately 3–4.5 cm length) of T. triangulare were harvested in the middle of the light or the middle of the dark period and immediately frozen in liquid nitrogen. RNA was isolated from ground tissue using the GeneMatrix Universal RNA Purification Kit (EURx Ltd.). Residues of DNA were removed with DNase (New England Biolabs). RNA integrity, sequencing library, and fragment size were analyzed on a 2100 Bioanalyzer (Agilent). Libraries were prepared using the TruSeq RNA Sample Prep Kit v2 (Illumina) and quantified with a Qubit 2.0 (Invitrogen). Samples were multiplexed with 12 libraries per lane and sequenced in single-end mode (Rapid Run, 150 bp read length) on an Illumina HiSeq 2000 platform, yielding ~14 million reads per library.
### Talinum Genome ref
File: Talinum.gm.CDS.nt.fa
Source: weber_fileshare/data/Eva_Maleckova-CAM/Genomics/Genome/Talinum-Flye-Polca_v201017/Talinum.gm.CDS.nt.fa
Contributor: Eva Maleckova
File added
########################
#### To be replaced by CWL routine
########################
# Root folder of the ARC (kept with its trailing slash; paths below append to it).
ARC_root=~/Hackathon_ARCexample_rnaseq/
# Abort if the workflows folder is missing instead of running from the wrong place.
cd "${ARC_root}workflows/" || exit 1
# chmod a+x 01_KallistoQuant.sh
# ./01_KallistoQuant.sh > $ARC_root'runs/01_kallisto.log' 2>&1 &
########################
# Map RNASeq reads via kallisto
## Manual: http://pachterlab.github.io/kallisto/manual.html
# Record tool version and citation in the log.
kallisto version
kallisto cite
### Build index
kall_ref="${ARC_root}externals/Talinum.gm.CDS.nt.fa"
kall_index="${ARC_root}runs/01_kallisto_index"
kallisto index -i "$kall_index" "$kall_ref"
### Align reads
kall_out="${ARC_root}runs/01_kallisto_results/"
# -p: do not fail when the results folder already exists (e.g. on a re-run).
mkdir -p "$kall_out"
# Iterate over the glob directly instead of parsing `ls` output, so file names
# containing whitespace are handled correctly.
for j in "${ARC_root}assays/Talinum_RNASeq_minimal/dataset/"*fastq.gz; do
# An unmatched glob stays literal; skip it so kallisto is not called on a non-existent file.
[ -e "$j" ] || continue
sampleName=$(basename "$j" | cut -c -6) # cut away path. retain only first six chars of file name
echo "$sampleName"
# Single-end quantification: 100 bootstraps, 30 threads, fragment length 200 +/- 20.
kallisto quant --single -b 100 -t 30 -l 200 -s 20 -i "$kall_index" -o "${kall_out}${sampleName}" "$j"
echo 'Kallisto done'
done
\ No newline at end of file
########################
#### To be replaced by CWL routine
########################
# Root folder of the ARC; every input and output path below is built from it.
ARC_root <- "~/Hackathon_ARCexample_rnaseq/"
setwd(paste0(ARC_root, "workflows/"))
########################
########################
# Collect kallisto data
########################
# ### sleuth installation
#
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install()
# BiocManager::install("devtools") # only if devtools not yet installed
# BiocManager::install("pachterlab/sleuth")
library(sleuth)
library(tidyverse)
library(jsonlite)
library(openxlsx)

## read experimental metadata from isa.assay wb
# The plant sheet and the RNASeq assay sheet are joined on their shared
# "Sample.Name" column; startRow = 2 starts reading at the second sheet row.
isa_assay <- paste0(ARC_root, "assays/Talinum_RNASeq_minimal/assay.isa.xlsx")
assay_data <- merge(
  readWorkbook(isa_assay, "1SPL01_plants", startRow = 2),
  readWorkbook(isa_assay, "3ASY01_RNASeq", startRow = 2),
  by = "Sample.Name"
)

## remove empty cols
# colSums(is.na(.)) avoids apply() on a data.frame; drop = FALSE keeps a
# data.frame even if only a single column survives.
all_na_cols <- colSums(is.na(assay_data)) == nrow(assay_data)
assay_data <- assay_data[, !all_na_cols, drop = FALSE]

# Pointer to kallisto results folder
base_dir <- paste0(ARC_root, "runs/01_kallisto_results/")
# A list of paths to the kallisto results indexed by the sample IDs is collated with
kal_dirs <- dir(base_dir, full.names = TRUE) ## Sleuth requires full paths

# Sample-to-covariates table, sorted by sample name.
s2c <- assay_data[
  order(assay_data$Sample.Name),
  c("Sample.Name", "Characteristics.[Photosynthesis.mode]")
]
# For kallisto / sleuth: 's2c' (sample_to_covariates) must contain a column named 'sample'
colnames(s2c) <- c("sample", "Photosynthesis.mode")

# Paths are attached by position, so there must be exactly one result folder
# per sample; fail loudly instead of letting R recycle a shorter vector.
# NOTE(review): this also assumes the sorted sample names and the sorted
# folder names line up one-to-one - confirm against the dataset naming.
stopifnot(length(kal_dirs) == nrow(s2c))
s2c$path <- kal_dirs

# Build a sleuth object
so <- sleuth_prep(s2c, ~Photosynthesis.mode)
save(so, file = paste0(ARC_root, "runs/03_kallisto_sleuthObject.RData"))

# Extract expression tables
## as data.frame
expression_data <- kallisto_table(so)
write.csv(expression_data, paste0(ARC_root, "runs/03_kallisto_df.csv"), row.names = FALSE)
## as tpm matrix (gene x sample)
tpm_table <- pivot_wider(
  expression_data,
  id_cols = target_id,
  names_from = sample,
  values_from = tpm
)
write.csv(tpm_table, paste0(ARC_root, "runs/03_kallisto_tpmMatrix.csv"), row.names = FALSE)

# Summarize mapping stats
# One row per JSON file found in the result folders; the ID is the name of the
# folder that contains the file. The pattern is anchored so that only files
# ending in ".json" match (the pattern argument is a regular expression).
run_info_files <- dir(kal_dirs, pattern = "\\.json$", full.names = TRUE)
mapping_stats <- do.call(
  rbind,
  lapply(run_info_files, function(json_file) {
    data.frame(
      ID = basename(dirname(json_file)),
      read_json(json_file, simplifyVector = TRUE)
    )
  })
)
write.csv(mapping_stats, paste0(ARC_root, "runs/03_kallisto_mappingStats.csv"), row.names = FALSE)
########################
#### To be replaced by CWL routine
########################
# Root folder of the ARC; every input and output path below is built from it.
ARC_root <- "~/Hackathon_ARCexample_rnaseq/"
setwd(paste0(ARC_root, "workflows/"))
########################
########################
# Determine diff. gene expression with sleuth
########################
library(sleuth)

# Load the sleuth object
# Restores the object `so` saved by the kallisto collection step.
load(file = paste0(ARC_root, "runs/03_kallisto_sleuthObject.RData"))

# Full model: the only covariate in the sample table is "Photosynthesis.mode"
# (there is no "Group" column), so the full model has to be fitted on it.
so <- sleuth_fit(so, ~Photosynthesis.mode, "full")
# Reduced (intercept-only) model to compare the full model against.
so <- sleuth_fit(so, ~1, "reduced")
# Likelihood ratio test of the reduced model against the full model.
so <- sleuth_lrt(so, "reduced", "full")

sleuth_table <- sleuth_results(so, "reduced:full", "lrt", show_all = FALSE)
write.csv(sleuth_table, paste0(ARC_root, "runs/04_sleuth_dge.csv"), row.names = FALSE)
########################
#### To be replaced by CWL routine
########################
# Root folder of the ARC; input and output paths below are built from it.
ARC_root <- "~/Hackathon_ARCexample_rnaseq/"
setwd(paste0(ARC_root, "workflows/"))
########################
########################
# Prep data for shiny app
########################
library(openxlsx)

# Read the expression table from the runs folder.
expression_data <- read.csv(
  file = paste0(ARC_root, "runs/03_kallisto_df.csv")
)

# Distinct target IDs present in the expression table.
available_genes <- unique(expression_data[["target_id"]])

# Store both objects together in one RData file.
save(
  list = c("expression_data", "available_genes"),
  file = paste0(ARC_root, "runs/05_shinyPrep.RData")
)
########################
# Prep data for shiny app
########################
# Usage: the first trailing command-line argument is the path to the
# expression table (.csv). The result is written to 05_shinyPrep.RData in the
# current working directory.

# openxlsx is kept in a private library next to the working directory.
# Create that library if needed and install only when the package is missing,
# instead of re-installing on every run.
lib_dir <- "./lib"
if (!dir.exists(lib_dir)) {
  dir.create(lib_dir, recursive = TRUE)
}
if (!nzchar(system.file(package = "openxlsx", lib.loc = lib_dir))) {
  install.packages(
    "openxlsx",
    dependencies = TRUE,
    lib = lib_dir,
    repos = "http://cran.rstudio.com/"
  )
}
library(openxlsx, lib.loc = lib_dir)

# Named cli_args rather than `options` to avoid shadowing base::options.
cli_args <- commandArgs(trailingOnly = TRUE)
if (length(cli_args) < 1L) {
  stop(
    "Missing argument: path to the expression table (.csv) is required.",
    call. = FALSE
  )
}

expression_data <- read.csv(file = cli_args[1])
# Distinct target IDs present in the expression table.
available_genes <- unique(expression_data$target_id)
save(
  expression_data, available_genes,
  file = file.path(getwd(), "05_shinyPrep.RData")
)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment