From 211ac2be5acdf160d1e7394d773c5b2875636773 Mon Sep 17 00:00:00 2001
From: Dominik Brilhaus <brilhaus@nfdi4plants.org>
Date: Mon, 24 Mar 2025 10:17:13 +0100
Subject: [PATCH] redesign sleuth

---
 runs/sleuth/run.cwl               |  14 ++--
 runs/sleuth/run.yml               |   2 +-
 workflows/sleuth/gather-files.cwl |  24 +++++++
 workflows/sleuth/sleuth.R         |  23 ++----
 workflows/sleuth/sleuth.cwl       |  87 +++++++++++++++++++++++
 workflows/sleuth/workflow.cwl     | 113 +++++++++++++++---------------
 6 files changed, 182 insertions(+), 81 deletions(-)
 create mode 100644 workflows/sleuth/gather-files.cwl
 create mode 100644 workflows/sleuth/sleuth.cwl

diff --git a/runs/sleuth/run.cwl b/runs/sleuth/run.cwl
index 7884d8d..789736f 100644
--- a/runs/sleuth/run.cwl
+++ b/runs/sleuth/run.cwl
@@ -3,13 +3,16 @@
 cwlVersion: v1.2
 class: Workflow
 
+requirements:  # presumably needed because the sleuth step now targets a class: Workflow — its run: line is outside this hunk, confirm
+  SubworkflowFeatureRequirement: {}
+
 inputs:
   inKallistoResults: Directory
   inMetadataFile: File
   inMetadataSample: string
   inMetadataFactorList: string[]
   inMetadataDataCol: string
-  outFolder: string
+  resultsoutdir: string
 
 steps: 
   sleuth:
@@ -20,14 +23,13 @@ steps:
       inMetadataSample: inMetadataSample
       inMetadataFactorList: inMetadataFactorList
       inMetadataDataCol: inMetadataDataCol
-      outFolder: outFolder
-    out: [outdir]
+      finaloutdir: resultsoutdir
+    out: [ sleuth_outdir ]
 
 outputs:
  outdir:
-    type: Directory[]
-    outputSource: sleuth/outdir
-
+    type: Directory
+    outputSource: sleuth/sleuth_outdir
 
 $namespaces:
   s: https://schema.org/
diff --git a/runs/sleuth/run.yml b/runs/sleuth/run.yml
index 1996f50..61c509d 100644
--- a/runs/sleuth/run.yml
+++ b/runs/sleuth/run.yml
@@ -8,4 +8,4 @@ inMetadataSample: "Input [Source Name]"
 inMetadataFactorList:
   - "Factor [Photosynthesis mode]"
 inMetadataDataCol: "Output [Data]"
-outFolder: results
+resultsoutdir: results
diff --git a/workflows/sleuth/gather-files.cwl b/workflows/sleuth/gather-files.cwl
new file mode 100644
index 0000000..67d23ec
--- /dev/null
+++ b/workflows/sleuth/gather-files.cwl
@@ -0,0 +1,24 @@
+cwlVersion: v1.2
+class: ExpressionTool
+label: Gather files
+doc: |
+  Helper tool to organize workflow outputs
+
+  Takes an array of files (e.g. from a workflow step) and yields them in a destination directory.
+
+  Adapted from: https://github.com/common-workflow-language/cwl-v1.1/blob/a22b7580c6b50e77c0a181ca59d3828dd5c69143/tests/dir7.cwl
+requirements:
+  - class: InlineJavascriptRequirement  # required to evaluate the ${...} body of `expression`
+inputs:
+  inFiles: File[]  # becomes the `listing` of the returned Directory
+  destination: string  # becomes the `basename` of the returned Directory
+expression: |
+  ${
+    return {"outDir": {
+      "class": "Directory",
+      "basename": inputs.destination,
+      "listing": inputs.inFiles
+    } };
+  }
+outputs:
+  outDir: Directory  # the Directory literal built by `expression`
diff --git a/workflows/sleuth/sleuth.R b/workflows/sleuth/sleuth.R
index 2b2283c..a9e0b52 100644
--- a/workflows/sleuth/sleuth.R
+++ b/workflows/sleuth/sleuth.R
@@ -18,7 +18,6 @@ inMetadataFile <- args[2]
 inMetadataSample <- args[3]
 inMetadataFactorList <- args[4]
 inMetadataDataCol <- args[5]
-outFolder <- args[6]
 
 # inKallistoResults <- "../../runs/kallisto/kallisto_results"
 # inMetadataFile <- "../../runs/isaSampleToRawDataSeq/rnaseq-samples.csv"
@@ -27,12 +26,6 @@ outFolder <- args[6]
 # inMetadataDataCol <- "Output [Data]"
 # outFolder <- "."
 
-################################################
-#### If it does not exist, create out dir
-################################################
-
-dir.create(outFolder, recursive = T, showWarnings = F)
-
 ################################################
 #### Read ISA sample metadata
 ################################################
@@ -43,10 +36,8 @@ samples <- read.csv(file = inMetadataFile, check.names = FALSE)
 #### Read Kallisto results
 ################################################
 
-base_dir <- inKallistoResults
-
 # A list of paths to the kallisto results indexed by the sample IDs is collated with
-kal_dirs <- dir(base_dir, full.names = T) ## Sleuth requires full paths
+kal_dirs <- dir(inKallistoResults, full.names = T) ## Sleuth requires full paths
 
 s2c <- samples[order(samples[[inMetadataSample]]), c(inMetadataSample, unlist(inMetadataFactorList), inMetadataDataCol)]
 
@@ -66,7 +57,7 @@ design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + ")))
 
 so <- sleuth_prep(s2c, full_model = design_formula)
 
-save(so, file = file.path(outFolder, "kallisto_sleuthObject.RData"))
+save(so, file = "kallisto_sleuthObject.RData")
 
 ################################################
 #### Extract expression tables
@@ -76,13 +67,13 @@ save(so, file = file.path(outFolder, "kallisto_sleuthObject.RData"))
 expression_data <- kallisto_table(so)
 
 ## write to file
-write.csv(expression_data, file.path(outFolder, "kallisto_df.csv"), row.names = F)
+write.csv(expression_data, "kallisto_df.csv", row.names = F)
 
 ## as tpm matrix (gene x sample)
 tpm_table <- reshape(expression_data, idvar = "target_id", timevar = "sample", direction = "wide", v.names = "tpm")
 
 # Write to file
-write.csv(tpm_table, file.path(outFolder, "kallisto_tpmMatrix.csv"), row.names = F)
+write.csv(tpm_table, "kallisto_tpmMatrix.csv", row.names = F)
 
 ################################################
 #### Summarize mapping stats
@@ -95,11 +86,11 @@ for (i in dir(kal_dirs, pattern = ".json", full.names = T)) {
   mapping_stats <- rbind(mapping_stats, z)
 }
 
-write.csv(mapping_stats, file.path(outFolder, "kallisto_mappingStats.csv"), row.names = F)
+write.csv(mapping_stats, "kallisto_mappingStats.csv", row.names = F)
 
 
 ################################################
-#### Run sleuth to identify DEGs
+#### Run sleuth differential gene expression (DGE) analysis
 ################################################
 
 so <- sleuth_fit(so)
@@ -111,4 +102,4 @@ sleuth_table <- sleuth_results(so, "reduced:full", "lrt", show_all = FALSE)
 
 ### write to file
 
-write.csv(sleuth_table, file.path(outFolder, "sleuth_dge.csv"), row.names = F)
+write.csv(sleuth_table, "sleuth_dge.csv", row.names = F)
diff --git a/workflows/sleuth/sleuth.cwl b/workflows/sleuth/sleuth.cwl
new file mode 100644
index 0000000..3ffc997
--- /dev/null
+++ b/workflows/sleuth/sleuth.cwl
@@ -0,0 +1,87 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.2
+class: CommandLineTool
+
+hints:
+  DockerRequirement:
+    dockerPull: quay.io/biocontainers/mulled-v2-fdd016122f200fdc6dc30f6ea2fd0000e8067dff:f9531f6ac1f44332eff70b5912d7d5f3ebe8df38-0
+  SoftwareRequirement:
+    packages:
+      - package: R
+        specs:
+          - https://identifiers.org/rrid/RRID:SCR_001905
+          - https://identifiers.org/biotools/r
+          - https://anaconda.org/bioconda/r
+        version: [ "4.2.3" ]
+      - package: sleuth
+        version: [ "0.30.1" ]
+        specs:
+          - https://identifiers.org/rrid/RRID:SCR_016883
+          - https://identifiers.org/biotools/sleuth
+
+requirements:
+  - class: InitialWorkDirRequirement  # stages sleuth.R into the job directory so baseCommand can run it
+    listing:
+      - entryname: sleuth.R
+        entry:
+          $include: sleuth.R
+
+baseCommand: [Rscript, sleuth.R]
+
+inputs:  # positions 1-5 correspond to args[1]..args[5] as read by sleuth.R
+  inKallistoResults:
+    type: Directory
+    inputBinding:
+      position: 1
+  inMetadataFile:
+    type: File
+    inputBinding:
+      position: 2
+  inMetadataSample:
+    type: string
+    inputBinding:
+      position: 3
+  inMetadataFactorList:
+    type: string[]
+    inputBinding:  # NOTE(review): no itemSeparator — with >1 factor each item is presumably its own argument, shifting args[5]; confirm
+      position: 4
+  inMetadataDataCol:
+    type: string
+    inputBinding:
+      position: 5
+
+outputs:  # each glob matches a file name sleuth.R writes into its working directory
+  kallisto_sleuthObject:
+    type: File
+    outputBinding:
+      glob: "kallisto_sleuthObject.RData"
+  kallisto_df:
+    type: File
+    outputBinding:
+      glob: "kallisto_df.csv"
+  kallisto_tpmMatrix:
+    type: File
+    outputBinding:
+      glob: "kallisto_tpmMatrix.csv"
+  kallisto_mappingStats:
+    type: File
+    outputBinding:
+      glob: "kallisto_mappingStats.csv"
+  sleuth_dge:
+    type: File
+    outputBinding:
+      glob: "sleuth_dge.csv"
+
+$namespaces:
+  s: https://schema.org/
+  edam: http://edamontology.org/
+
+$schemas:
+  - https://schema.org/version/latest/schemaorg-current-https.rdf
+  - http://edamontology.org/EDAM_1.18.owl
+
+s:author:
+  - class: s:Person
+    s:name: Dominik Brilhaus
+    s:identifier: https://orcid.org/0000-0001-9021-3197
diff --git a/workflows/sleuth/workflow.cwl b/workflows/sleuth/workflow.cwl
index 497533c..5f08a47 100644
--- a/workflows/sleuth/workflow.cwl
+++ b/workflows/sleuth/workflow.cwl
@@ -1,66 +1,63 @@
 #!/usr/bin/env cwl-runner
-
 cwlVersion: v1.2
-class: CommandLineTool
-
-hints:
-  DockerRequirement: 
-    dockerPull: quay.io/biocontainers/mulled-v2-fdd016122f200fdc6dc30f6ea2fd0000e8067dff:f9531f6ac1f44332eff70b5912d7d5f3ebe8df38-0
-  SoftwareRequirement:  
-    packages:
-      - package: R
-        specs:
-          - https://identifiers.org/rrid/RRID:SCR_001905
-          - https://identifiers.org/biotools/r
-          - https://anaconda.org/bioconda/r
-        version: [ "4.2.3" ]
-      - package: sleuth
-        version: [ "0.30.1" ]
-        specs: 
-          - https://identifiers.org/rrid/RRID:SCR_016883
-          - https://identifiers.org/biotools/sleuth
+class: Workflow
 
 requirements:
-  - class: InitialWorkDirRequirement
-    listing:
-      - entryname: sleuth.R
-        entry:
-          $include: sleuth.R
-  # - class: NetworkAccess
-  #   networkAccess: true
-
-baseCommand: [Rscript, sleuth.R]
+  ScatterFeatureRequirement: {}  # NOTE(review): no step in this workflow uses scatter — confirm this is still needed
+  SubworkflowFeatureRequirement: {}  # NOTE(review): both steps run tools (CommandLineTool / ExpressionTool), not Workflows — confirm
+  MultipleInputFeatureRequirement: {}  # collectFiles.inFiles lists several sources combined via linkMerge
 
 inputs:
-  inKallistoResults:
+  inKallistoResults: Directory
+  inMetadataFile: File
+  inMetadataSample: string
+  inMetadataFactorList: string[]
+  inMetadataDataCol: string
+  finaloutdir: string
+ 
+steps:
+  sleuth:  # runs sleuth.R through the sleuth.cwl CommandLineTool
+    run: sleuth.cwl
+    in:
+      inKallistoResults: inKallistoResults
+      inMetadataFile: inMetadataFile
+      inMetadataSample: inMetadataSample
+      inMetadataFactorList: inMetadataFactorList
+      inMetadataDataCol: inMetadataDataCol
+    out:
+      - kallisto_sleuthObject
+      - kallisto_df
+      - kallisto_tpmMatrix
+      - kallisto_mappingStats
+      - sleuth_dge
+  collectFiles:  # bundles the five result files into one Directory named by finaloutdir
+    run: gather-files.cwl
+    in:
+      inFiles:
+        source:
+          - sleuth/kallisto_sleuthObject
+          - sleuth/kallisto_df
+          - sleuth/kallisto_tpmMatrix
+          - sleuth/kallisto_mappingStats
+          - sleuth/sleuth_dge
+        linkMerge: merge_flattened  # combine the single-File sources into the File[] that inFiles expects
+      destination: finaloutdir
+    out: [outDir]
+
+outputs:
+  sleuth_outdir:
     type: Directory
-    inputBinding:
-      position: 1
-  inMetadataFile:
-    type: File
-    inputBinding:
-      position: 2
-  inMetadataSample:
-    type: string
-    inputBinding:
-      position: 3
-  inMetadataFactorList:
-    type: string[]
-    inputBinding:
-      position: 4
-  inMetadataDataCol:
-    type: string
-    inputBinding:
-      position: 5
-  outFolder:
-    type: string
-    inputBinding:
-      position: 6
+    outputSource: collectFiles/outDir
 
-outputs:
-- id: outdir
-  type:
-    type: array
-    items: Directory
-  outputBinding:
-    glob: $(runtime.outdir)/$(inputs.outFolder)
+$namespaces:
+  s: https://schema.org/
+  edam: http://edamontology.org/
+
+$schemas:
+  - https://schema.org/version/latest/schemaorg-current-https.rdf
+  - http://edamontology.org/EDAM_1.18.owl
+
+s:author:
+  - class: s:Person
+    s:name: Dominik Brilhaus
+    s:identifier: https://orcid.org/0000-0001-9021-3197
-- 
GitLab