From ffdece03c88225752dce1f59d1979c128c663f70 Mon Sep 17 00:00:00 2001
From: Dominik Brilhaus <brilhaus@nfdi4plants.org>
Date: Thu, 7 Nov 2024 10:12:35 +0100
Subject: [PATCH] multi-docker works, cleanup

---
 workflows/deseq2/Dockerfile                   |  5 --
 workflows/deseq2/README.md                    | 28 ++++++-
 workflows/deseq2/dependencies.R               | 14 ----
 workflows/deseq2/deseq2-dev.R                 | 76 +++++++++++++++++++
 workflows/deseq2/deseq2-localDocker-test.cwl  | 16 ----
 workflows/deseq2/deseq2.R                     |  8 +-
 workflows/deseq2/deseq2.cwl                   |  5 +-
 ...-docker-test.cwl => mutli-docker-test.cwl} |  2 +-
 8 files changed, 109 insertions(+), 45 deletions(-)
 delete mode 100644 workflows/deseq2/Dockerfile
 delete mode 100644 workflows/deseq2/dependencies.R
 create mode 100644 workflows/deseq2/deseq2-dev.R
 delete mode 100644 workflows/deseq2/deseq2-localDocker-test.cwl
 rename workflows/deseq2/{r-docker-test.cwl => mutli-docker-test.cwl} (56%)

diff --git a/workflows/deseq2/Dockerfile b/workflows/deseq2/Dockerfile
deleted file mode 100644
index fb3edf1..0000000
--- a/workflows/deseq2/Dockerfile
+++ /dev/null
@@ -1,5 +0,0 @@
-FROM rstudio/r-base:4.3-jammy
-
-RUN apt-get update && \ 
-    R -e "install.packages(c('BiocManager'), repos='https://cloud.r-project.org/'); BiocManager::install('DESeq2'); BiocManager::install('tximport'); BiocManager::install('rhdf5'); \\" && \
-    apt-get clean -y
\ No newline at end of file
diff --git a/workflows/deseq2/README.md b/workflows/deseq2/README.md
index 673fb4d..39cf2dc 100644
--- a/workflows/deseq2/README.md
+++ b/workflows/deseq2/README.md
@@ -12,10 +12,36 @@ Workflow used for **differential gene expression analysis**
 
 ## Run pure script (to test)
 
+### Install R dependencies for deseq2
+
+```R
+if (!require("BiocManager", quietly = TRUE))
+    install.packages("BiocManager")
+
+BiocManager::install("DESeq2")
+library("DESeq2")
+
+BiocManager::install("tximport")
+library("tximport")
+
+BiocManager::install("rhdf5")
+library("rhdf5")
+```
+
+### test
+
 ```bash
 RScript deseq2.R "../../runs/kallisto/kallisto_results" "../../runs/merged_isa_metadata/out/merged_isa.tsv" "Source.Name" "Factor..Photosynthesis.mode."
 ```
 
+
 ## Run CWL-wrapped script
 
-see [runs/deseq2-run](../../runs/deseq2-run)
\ No newline at end of file
+see [runs/deseq2-run](../../runs/deseq2-run)
+
+
+## Multi-package containers
+
+- R and combinations of library dependencies are available as multi-package containers from [BioContainers](https://github.com/BioContainers/multi-package-containers)
+- Searched for `repo:BioContainers/multi-package-containers deseq2 tximport rhdf5`
+- and found `quay.io/biocontainers/mulled-v2-05fd88b9ac812a9149da2f2d881d62f01cc49835:a10f0e3a7a70fc45494f8781d33901086d2214d0-0` :tada:
diff --git a/workflows/deseq2/dependencies.R b/workflows/deseq2/dependencies.R
deleted file mode 100644
index d010bbb..0000000
--- a/workflows/deseq2/dependencies.R
+++ /dev/null
@@ -1,14 +0,0 @@
-
-# Install dependencies for deseq2
-
-if (!require("BiocManager", quietly = TRUE))
-    install.packages("BiocManager")
-
-BiocManager::install("DESeq2")
-library("DESeq2")
-
-BiocManager::install("tximport")
-library("tximport")
-
-BiocManager::install("rhdf5")
-library("rhdf5")
diff --git a/workflows/deseq2/deseq2-dev.R b/workflows/deseq2/deseq2-dev.R
new file mode 100644
index 0000000..5651421
--- /dev/null
+++ b/workflows/deseq2/deseq2-dev.R
@@ -0,0 +1,76 @@
+# DESeq2
+
+## Libraries
+
+library("DESeq2")
+library("tximport")
+library("rhdf5")
+library("ggplot2")
+
+## In-and-out
+
+# inKallistoResults <- "../../runs/kallisto/kallisto_results"
+# inMetadataFile <- "../../runs/merged_isa_metadata/out/merged_isa.tsv"
+# inMetadataSample <- "Source.Name"
+# inMetadataFactorList <- list("Factor..Photosynthesis.mode.", "Factor..Biosource.amount.")
+
+### Read arguments from CLI
+
+args <- commandArgs(trailingOnly = T)
+
+inKallistoResults <- args[1]
+inMetadataFile <- args[2]
+inMetadataSample <- args[3]
+inMetadataFactorList <- args[4]
+
+## Import kallisto count data
+
+files <- dir(inKallistoResults, recursive = T, full.names = T ,"abundance.h5")
+names(files) <- dir(inKallistoResults)
+
+txi <- tximport(files, type = "kallisto", txOut = TRUE)
+
+head(txi$counts)
+
+## Read sample metadata
+
+samples_metadata <- read.table(file = inMetadataFile, sep = "\t")
+samples <- samples_metadata[order(samples_metadata[[inMetadataSample]]), c(inMetadataSample, unlist(inMetadataFactorList))]
+rownames(samples) <- samples[,inMetadataSample]
+
+factors <- sapply(inMetadataFactorList, function(x) x[[1]])
+design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + ")))
+
+
+## DESeq
+
+dds <- DESeqDataSetFromTximport(txi, colData = samples, design = design_formula)
+
+dds <- DESeq(dds)
+
+## Outputs
+
+### Extract results
+
+res <- results(dds)
+write.csv(res, file = "results_stats.csv", quote = TRUE)
+
+### Generate and save default plots
+
+png("results_ma-plot.png")
+  plotMA(res, ylim=c(-2,2))
+dev.off()
+
+vsd <- vst(dds, blind=FALSE)
+pcaData <- plotPCA(vsd, intgroup=factors, returnData=TRUE)
+percentVar <- round(100 * attr(pcaData, "percentVar"))
+
+p2 <- ggplot(pcaData, aes(PC1, PC2, color=factors[[1]])) +
+  geom_point(size=3) +
+  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
+  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
+  coord_fixed()
+
+png("results_pca-plot.png")
+  print(p2)
+dev.off()
diff --git a/workflows/deseq2/deseq2-localDocker-test.cwl b/workflows/deseq2/deseq2-localDocker-test.cwl
deleted file mode 100644
index 3e26cb8..0000000
--- a/workflows/deseq2/deseq2-localDocker-test.cwl
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env cwl-runner
-
-cwlVersion: v1.2
-class: CommandLineTool
-hints:
-  DockerRequirement:
-    dockerFile: {$include: "./Dockerfile"}
-    dockerImageId: "deseq-docker"
-requirements:
-  - class: NetworkAccess
-    networkAccess: true
-baseCommand: [Rscript, --help]
-
-inputs: []
-
-outputs: []
diff --git a/workflows/deseq2/deseq2.R b/workflows/deseq2/deseq2.R
index 0384bc5..e9ad421 100644
--- a/workflows/deseq2/deseq2.R
+++ b/workflows/deseq2/deseq2.R
@@ -9,10 +9,10 @@ library("ggplot2")
 
 ## In-and-out
 
-inKallistoResults <- "../../runs/kallisto/kallisto_results"
-inMetadataFile <- "../../runs/merged_isa_metadata/out/merged_isa.tsv"
-inMetadataSample <- "Source.Name"
-inMetadataFactor <- "Factor..Photosynthesis.mode."
+# inKallistoResults <- "../../runs/kallisto/kallisto_results"
+# inMetadataFile <- "../../runs/merged_isa_metadata/out/merged_isa.tsv"
+# inMetadataSample <- "Source.Name"
+# inMetadataFactor <- "Factor..Photosynthesis.mode."
 
 ### Read arguments from CLI
 
diff --git a/workflows/deseq2/deseq2.cwl b/workflows/deseq2/deseq2.cwl
index b35f8a4..4cdda2f 100644
--- a/workflows/deseq2/deseq2.cwl
+++ b/workflows/deseq2/deseq2.cwl
@@ -3,11 +3,8 @@
 cwlVersion: v1.2
 class: CommandLineTool
 hints:
-  # DockerRequirement:
-  #   dockerPull: quay.io/biocontainers/bioconductor-deseq2:1.42.0--r43hf17093f_2
   DockerRequirement:
-    dockerFile: {$include: "./Dockerfile"}
-    dockerImageId: "deseq-docker"
+    dockerPull: quay.io/biocontainers/mulled-v2-05fd88b9ac812a9149da2f2d881d62f01cc49835:a10f0e3a7a70fc45494f8781d33901086d2214d0-0
 requirements:
   - class: InitialWorkDirRequirement
     listing:
diff --git a/workflows/deseq2/r-docker-test.cwl b/workflows/deseq2/mutli-docker-test.cwl
similarity index 56%
rename from workflows/deseq2/r-docker-test.cwl
rename to workflows/deseq2/mutli-docker-test.cwl
index b964a83..136d87e 100644
--- a/workflows/deseq2/r-docker-test.cwl
+++ b/workflows/deseq2/mutli-docker-test.cwl
@@ -5,7 +5,7 @@ class: CommandLineTool
 
 requirements:
   - class: DockerRequirement
-    dockerPull: quay.io/biocontainers/bioconductor-deseq2:1.42.0--r43hf17093f_2
+    dockerPull: quay.io/biocontainers/mulled-v2-05fd88b9ac812a9149da2f2d881d62f01cc49835:a10f0e3a7a70fc45494f8781d33901086d2214d0-0
 
 baseCommand: [Rscript, --help]
 
-- 
GitLab