diff --git a/assays/GCqTOF_targets/isa.assay.xlsx b/assays/GCqTOF_targets/isa.assay.xlsx index a50438bc4b38a8e8d4bdc25f3f2afef524cdcfaa..b0f78c63340f8a9c3f37e9d6e662d8ceea66e4d9 100644 Binary files a/assays/GCqTOF_targets/isa.assay.xlsx and b/assays/GCqTOF_targets/isa.assay.xlsx differ diff --git a/assays/MassHunter_targets/isa.assay.xlsx b/assays/MassHunter_targets/isa.assay.xlsx index de9f3d94c6c077ee8e1ed4ca754d5bba18400706..2dc984f14f01cea782498176f8355792d705f209 100644 Binary files a/assays/MassHunter_targets/isa.assay.xlsx and b/assays/MassHunter_targets/isa.assay.xlsx differ diff --git a/assays/Talinum_RNASeq_minimal/dataset/.gitkeep b/assays/Talinum_RNASeq_minimal/dataset/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/assays/Talinum_RNASeq_minimal/isa.assay.xlsx b/assays/Talinum_RNASeq_minimal/isa.assay.xlsx index cdfc308b31bec50f61d40bab871c219a256bc1f8..6871289c0ec15ffc27db9685cf97d94519d89c39 100644 Binary files a/assays/Talinum_RNASeq_minimal/isa.assay.xlsx and b/assays/Talinum_RNASeq_minimal/isa.assay.xlsx differ diff --git a/runs/deseq2-run/job.yml b/runs/deseq2-run/job.yml index b0386bd4bbf18d2b229848e339880d5f9477e99e..25f66f6d26c6cdd6179212db52d906332973f912 100644 --- a/runs/deseq2-run/job.yml +++ b/runs/deseq2-run/job.yml @@ -5,5 +5,5 @@ inMetadataFile: class: File path: ../../runs/merged_isa_metadata/out/merged_isa.tsv inMetadataSample: "Source.Name" -inMetadataFactor: +inMetadataFactorList: - "Factor..Photosynthesis.mode." \ No newline at end of file diff --git a/studies/TalinumFacultativeCAM/isa.study.xlsx b/studies/TalinumFacultativeCAM/isa.study.xlsx index faedcffca8c32e6a6951fc1bc60e18b28c59ded6..96bcec10f2783141e0216d6f03f75b079f5e0ac5 100644 Binary files a/studies/TalinumFacultativeCAM/isa.study.xlsx and b/studies/TalinumFacultativeCAM/isa.study.xlsx differ diff --git a/studies/TalinumGenomeDraft/isa.study.xlsx b/studies/TalinumGenomeDraft/isa.study.xlsx index b1f113e3228dd803dc86c4598002b38c786e4a55..84b3218ada3e1b3451e0f24edb8cd70c885d16c2 100644 Binary files a/studies/TalinumGenomeDraft/isa.study.xlsx and b/studies/TalinumGenomeDraft/isa.study.xlsx differ diff --git a/studies/TalinumGenomeDraft/resources/.gitkeep b/studies/TalinumGenomeDraft/resources/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/workflows/deseq2/README.md b/workflows/deseq2/README.md index 03944ea1446b8c7a58ed5a530e9a238d10e68c6a..39cf2dcfacc76eeaee434d0f5d1abc07809fbe5d 100644 --- a/workflows/deseq2/README.md +++ b/workflows/deseq2/README.md @@ -10,12 +10,38 @@ Workflow used for **differential gene expression analysis** - https://bioconductor.org/packages/release/bioc/vignettes/tximport/inst/doc/tximport.html#kallisto +## Run pure script (to test) -## Run pure script +### Install R dependencies for deseq2 + +```R +if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +BiocManager::install("DESeq2") +library("DESeq2") + +BiocManager::install("tximport") +library("tximport") + +BiocManager::install("rhdf5") +library("rhdf5") +``` + +### test ```bash RScript deseq2.R "../../runs/kallisto/kallisto_results" "../../runs/merged_isa_metadata/out/merged_isa.tsv" "Source.Name" "Factor..Photosynthesis.mode." ``` -## Run CWL +## Run CWL-wrapped script + +see [runs/deseq2-run](../../runs/deseq2-run) + + +## Multi-package containers + +- R and combinations of library dependencies are available as multi-package containers from [BioContainers](https://github.com/BioContainers/multi-package-containers) +- Searched for `repo:BioContainers/multi-package-containers deseq2 tximport rhdf5` +- and found `quay.io/biocontainers/mulled-v2-05fd88b9ac812a9149da2f2d881d62f01cc49835:a10f0e3a7a70fc45494f8781d33901086d2214d0-0` :tada: diff --git a/workflows/deseq2/dependencies.R b/workflows/deseq2/dependencies.R deleted file mode 100644 index d010bbb7caa76ff4fbb114655812cb7ccea20dfb..0000000000000000000000000000000000000000 --- a/workflows/deseq2/dependencies.R +++ /dev/null @@ -1,14 +0,0 @@ - -# Install dependencies for deseq2 - -if (!require("BiocManager", quietly = TRUE)) - install.packages("BiocManager") - -BiocManager::install("DESeq2") -library("DESeq2") - -BiocManager::install("tximport") -library("tximport") - -BiocManager::install("rhdf5") -library("rhdf5") diff --git a/workflows/deseq2/deseq2.R b/workflows/deseq2/deseq2.R index 0384bc5d321b4d2ab18b59322be90aaf47d770d3..565142189e96783199acbd5fa1df5573a2f21601 100644 --- a/workflows/deseq2/deseq2.R +++ b/workflows/deseq2/deseq2.R @@ -9,10 +9,10 @@ library("ggplot2") ## In-and-out -inKallistoResults <- "../../runs/kallisto/kallisto_results" -inMetadataFile <- "../../runs/merged_isa_metadata/out/merged_isa.tsv" -inMetadataSample <- "Source.Name" -inMetadataFactor <- "Factor..Photosynthesis.mode." +# inKallistoResults <- "../../runs/kallisto/kallisto_results" +# inMetadataFile <- "../../runs/merged_isa_metadata/out/merged_isa.tsv" +# inMetadataSample <- "Source.Name" +# inMetadataFactorList <- list("Factor..Photosynthesis.mode.", "Factor..Biosource.amount.") ### Read arguments from CLI @@ -21,7 +21,7 @@ args <- commandArgs(trailingOnly = T) inKallistoResults <- args[1] inMetadataFile <- args[2] inMetadataSample <- args[3] -inMetadataFactor <- args[4] +inMetadataFactorList <- args[4] ## Import kallisto count data @@ -35,15 +35,16 @@ head(txi$counts) ## Read sample metadata samples_metadata <- read.table(file = inMetadataFile, sep = "\t") +samples <- samples_metadata[order(samples_metadata[[inMetadataSample]]), c(inMetadataSample, unlist(inMetadataFactorList))] +rownames(samples) <- samples[,inMetadataSample] -samples <- samples_metadata[order(samples_metadata[[inMetadataSample]]), c(inMetadataSample, inMetadataFactor)] -colnames(samples)[1:2] <- c("sampleID", "condition") +factors <- sapply(inMetadataFactorList, function(x) x[[1]]) +design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + "))) -rownames(samples) <- samples$sampleID ## DESeq -dds <- DESeqDataSetFromTximport(txi, colData = samples, design = ~ condition) +dds <- DESeqDataSetFromTximport(txi, colData = samples, design = design_formula) dds <- DESeq(dds) @@ -52,7 +53,7 @@ dds <- DESeq(dds) ### Extract results res <- results(dds) -write.csv(res, file = "results_stats.csv", append = FALSE, quote = TRUE) +write.csv(res, file = "results_stats.csv", quote = TRUE) ### Generate and save default plots @@ -61,10 +62,10 @@ png("results_ma-plot.png") dev.off() vsd <- vst(dds, blind=FALSE) -pcaData <- plotPCA(vsd, intgroup=c("condition"), returnData=TRUE) +pcaData <- plotPCA(vsd, intgroup=factors, returnData=TRUE) percentVar <- round(100 * attr(pcaData, "percentVar")) -p2 <- ggplot(pcaData, aes(PC1, PC2, color=condition)) + +p2 <- ggplot(pcaData, aes(PC1, PC2, color=factors[[1]])) + geom_point(size=3) + xlab(paste0("PC1: ",percentVar[1],"% variance")) + ylab(paste0("PC2: ",percentVar[2],"% variance")) + @@ -73,8 +74,3 @@ p2 <- ggplot(pcaData, aes(PC1, PC2, color=condition)) + png("results_pca-plot.png") print(p2) dev.off() - - - - - diff --git a/workflows/deseq2/deseq2.cwl b/workflows/deseq2/deseq2.cwl index 2f926aaf9f58afb994efa1d20ff809c56a5f86a9..c3dd72790b6493ced407c5525754856d7ab4b441 100644 --- a/workflows/deseq2/deseq2.cwl +++ b/workflows/deseq2/deseq2.cwl @@ -1,8 +1,10 @@ +#!/usr/bin/env cwl-runner + cwlVersion: v1.2 class: CommandLineTool -# hints: -# DockerRequirement: -# dockerPull: r-base:4.4.2 +hints: + DockerRequirement: + dockerPull: quay.io/biocontainers/mulled-v2-05fd88b9ac812a9149da2f2d881d62f01cc49835:a10f0e3a7a70fc45494f8781d33901086d2214d0-0 requirements: - class: InitialWorkDirRequirement listing: @@ -11,7 +13,7 @@ requirements: $include: deseq2.R - class: NetworkAccess networkAccess: true -baseCommand: [RScript, deseq2.R] +baseCommand: [Rscript, deseq2.R] inputs: inKallistoResults: type: Directory @@ -25,7 +27,7 @@ inputs: type: string inputBinding: position: 3 - inMetadataFactor: + inMetadataFactorList: type: string[] inputBinding: position: 4 diff --git a/workflows/deseq2/mutli-docker-test.cwl b/workflows/deseq2/mutli-docker-test.cwl new file mode 100644 index 0000000000000000000000000000000000000000..136d87e72742f52c9ac1cdfe287273e5462b541a --- /dev/null +++ b/workflows/deseq2/mutli-docker-test.cwl @@ -0,0 +1,14 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +requirements: + - class: DockerRequirement + dockerPull: quay.io/biocontainers/mulled-v2-05fd88b9ac812a9149da2f2d881d62f01cc49835:a10f0e3a7a70fc45494f8781d33901086d2214d0-0 + +baseCommand: [Rscript, --help] + +inputs: [] + +outputs: [] diff --git a/workflows/deseq2/r-docker-test.cwl b/workflows/deseq2/r-docker-test.cwl deleted file mode 100644 index 3b2627952f299875eaf7e742f572317226f0ed1d..0000000000000000000000000000000000000000 --- a/workflows/deseq2/r-docker-test.cwl +++ /dev/null @@ -1,15 +0,0 @@ -cwlVersion: v1.2 -class: CommandLineTool - -requirements: - - class: NetworkAccess - networkAccess: true - # - class: DockerRequirement - # dockerPull: r-base:4.4.2 - -baseCommand: [RScript, --help] - -inputs: [] - -outputs: [] - \ No newline at end of file diff --git a/workflows/isaSampleToRawDataSeq/isaSampleToRawDataSeq.fsx b/workflows/isaSampleToRawDataSeq/isaSampleToRawDataSeq.fsx index 4a9483c8630a1699465acb19c6dd027097e7193a..a67f56a5aff383094c39d229adb6b87f87428c6f 100644 --- a/workflows/isaSampleToRawDataSeq/isaSampleToRawDataSeq.fsx +++ b/workflows/isaSampleToRawDataSeq/isaSampleToRawDataSeq.fsx @@ -1,92 +1,129 @@ -// Pull out the full ISA process sequence (incl. all metadata) leading to the first Raw Data Node - -// Dependencies - -#r "nuget: ARCtrl.NET, 2.0.2" -#r "nuget: ARCtrl.QueryModel, 2.0.2" - -open System.IO -open ARCtrl.NET -open ARCtrl -open ARCtrl.QueryModel - -// input parameters - -let args : string array = fsi.CommandLineArgs |> Array.tail -let arcPath = args.[0] -let assayName = args.[1] -let startingNodeNum = args.[2] |> int -let outName = args.[3] - - -// test parameters -let source = __SOURCE_DIRECTORY__ -let arcPath = Path.Combine(source, "../../") -let assayName = "Talinum_RNASeq_minimal" -let startingNodeNum = 1 -let outName = "rnaseq-samples" - - -// Load ARC - -let arc = ARC.load(arcPath) - -let inv = arc.ISA.Value - -// Load first data node - -let firstData = inv.GetAssay(assayName).FirstData - -// Create headers for output table -let headers = [ - CompositeHeader.Input IOType.Sample - for v in inv.ArcTables.ValuesOf firstData.[0] do - if v.IsCharacteristicValue then - CompositeHeader.Characteristic v.Category - elif v.IsParameterValue then - CompositeHeader.Parameter v.Category - elif v.IsFactorValue then - CompositeHeader.Factor v.Category - elif v.IsComponent then - CompositeHeader.Component v.Category - else failwithf "what the f is %O" v - - CompositeHeader.Output IOType.Data -] - -// Create rows - -let getRow (d: QNode) = - [| - - CompositeCell.createFreeText (inv.ArcTables.SamplesOf d).[startingNodeNum].Name - - for v in inv.ArcTables.ValuesOf d do - if v.HasUnit then - CompositeCell.Unitized(v.ValueText, v.Unit) - else - CompositeCell.Term(v.Value.AsOntology()) - - CompositeCell.FreeText d.Name - - |] - -// Combine into table - -let t = ArcTable.init "FullTable" -t.Headers <- ResizeArray headers - -for d in firstData do - t.AddRow (getRow d) - -// Small detour via workbook -let ws = Spreadsheet.ArcTable.toFsWorksheet t - -let wb = new FsSpreadsheet.FsWorkbook() - -wb.AddWorksheet ws - -// Write to csv - -wb.ToCsvFile (outName + ".tsv", Separator = '\t') -wb.ToXlsxFile (outName + ".xlsx") +// Pull out the full ISA process sequence (incl. all metadata) leading to the first Raw Data Node + +// Dependencies + +#r "nuget: ARCtrl.NET" +#r "nuget: ARCtrl.QueryModel" + +open System.IO +open ARCtrl.NET +open ARCtrl +open ARCtrl.QueryModel +open ARCtrl.Helper +open FsSpreadsheet +open FsSpreadsheet.Net + +// input parameters + +// let args : string array = fsi.CommandLineArgs |> Array.tail +// let arcPath = args.[0] +// let assayName = args.[1] +// let startingNodeNum = args.[2] |> int +// let outName = args.[3] + + +type ArcTables with + + member this.IgnoreShitty() : ArcTables = + this.Tables + |> ResizeArray.filter (fun t -> + t.TryGetInputColumn().IsSome && t.TryGetOutputColumn().IsSome + ) + |> ArcTables + + +// test parameters +let source = __SOURCE_DIRECTORY__ +let arcPath = Path.Combine(source, "../../") +let assayName = "Talinum_RNASeq_minimal" +let startingNodeNum = 1 +let outName = "rnaseq-samples" + +// Load ARC + +let clean (a : ARC) = + a.ISA.Value.Assays |> Seq.iter (fun a -> + a.Tables + |> Seq.toArray + |> Seq.iter (fun t -> + if not (t.TryGetInputColumn().IsSome && t.TryGetOutputColumn().IsSome) then + a.RemoveTable t.Name + + ) + + ) + a.ISA.Value.Studies |> Seq.iter (fun s -> + s.Tables + |> Seq.toArray + |> Seq.iter (fun t -> + if not (t.TryGetInputColumn().IsSome && t.TryGetOutputColumn().IsSome) then + s.RemoveTable t.Name + + ) + + ) + a + +let arc = ARC.load(arcPath) |> clean + + +let inv = arc.ISA.Value + +// Load first data node + +let firstData = inv.GetAssay(assayName).FirstData + +// Create headers for output table +let headers = [ + CompositeHeader.Input IOType.Sample + for v in inv.ArcTables.IgnoreShitty().ValuesOf firstData.[0].Name do + if v.IsCharacteristicValue then + CompositeHeader.Characteristic v.Category + elif v.IsParameterValue then + CompositeHeader.Parameter v.Category + elif v.IsFactorValue then + CompositeHeader.Factor v.Category + elif v.IsComponent then + CompositeHeader.Component v.Category + else failwithf "what the f is %O" v + + CompositeHeader.Output IOType.Data +] + + +// Create rows + +let getRow (d: QNode) = + [| + + CompositeCell.createFreeText (inv.ArcTables.SamplesOf d).[startingNodeNum].Name + + for v in inv.ArcTables.ValuesOf d do + if v.HasUnit then + CompositeCell.Unitized(v.ValueText, v.Unit) + else + CompositeCell.Term(v.Value.AsOntology()) + + CompositeCell.FreeText d.Name + + |] + +// Combine into table + +let t = ArcTable.init "FullTable" +t.Headers <- ResizeArray headers + +for d in firstData do + t.AddRow (getRow d) + +// Small detour via workbook +let ws = Spreadsheet.ArcTable.toFsWorksheet None t + +let wb = new FsSpreadsheet.FsWorkbook() + +wb.AddWorksheet ws + +// Write to csv + +// wb.To (outName + ".tsv", Separator = '\t') +wb.ToXlsxFile (outName + ".xlsx")