From 211ac2be5acdf160d1e7394d773c5b2875636773 Mon Sep 17 00:00:00 2001 From: Dominik Brilhaus <brilhaus@nfdi4plants.org> Date: Mon, 24 Mar 2025 10:17:13 +0100 Subject: [PATCH] redesign sleuth --- runs/sleuth/run.cwl | 14 ++-- runs/sleuth/run.yml | 2 +- workflows/sleuth/gather-files.cwl | 24 +++++++ workflows/sleuth/sleuth.R | 23 ++---- workflows/sleuth/sleuth.cwl | 87 +++++++++++++++++++++++ workflows/sleuth/workflow.cwl | 113 +++++++++++++++--------------- 6 files changed, 182 insertions(+), 81 deletions(-) create mode 100644 workflows/sleuth/gather-files.cwl create mode 100644 workflows/sleuth/sleuth.cwl diff --git a/runs/sleuth/run.cwl b/runs/sleuth/run.cwl index 7884d8d..789736f 100644 --- a/runs/sleuth/run.cwl +++ b/runs/sleuth/run.cwl @@ -3,13 +3,16 @@ cwlVersion: v1.2 class: Workflow +requirements: + SubworkflowFeatureRequirement: {} + inputs: inKallistoResults: Directory inMetadataFile: File inMetadataSample: string inMetadataFactorList: string[] inMetadataDataCol: string - outFolder: string + resultsoutdir: string steps: sleuth: @@ -20,14 +23,13 @@ steps: inMetadataSample: inMetadataSample inMetadataFactorList: inMetadataFactorList inMetadataDataCol: inMetadataDataCol - outFolder: outFolder - out: [outdir] + finaloutdir: resultsoutdir + out: [ sleuth_outdir ] outputs: outdir: - type: Directory[] - outputSource: sleuth/outdir - + type: Directory + outputSource: sleuth/sleuth_outdir $namespaces: s: https://schema.org/ diff --git a/runs/sleuth/run.yml b/runs/sleuth/run.yml index 1996f50..61c509d 100644 --- a/runs/sleuth/run.yml +++ b/runs/sleuth/run.yml @@ -8,4 +8,4 @@ inMetadataSample: "Input [Source Name]" inMetadataFactorList: - "Factor [Photosynthesis mode]" inMetadataDataCol: "Output [Data]" -outFolder: results +resultsoutdir: results diff --git a/workflows/sleuth/gather-files.cwl b/workflows/sleuth/gather-files.cwl new file mode 100644 index 0000000..67d23ec --- /dev/null +++ b/workflows/sleuth/gather-files.cwl @@ -0,0 +1,24 @@ +cwlVersion: v1.2 +class: ExpressionTool +label: Gather files +doc: | + Helper tool to organize workflow outputs + + Takes an array of files (e.g. from a workflow step) and yields them in a destination directory. + + Adapted from: https://github.com/common-workflow-language/cwl-v1.1/blob/a22b7580c6b50e77c0a181ca59d3828dd5c69143/tests/dir7.cwl +requirements: + - class: InlineJavascriptRequirement +inputs: + inFiles: File[] + destination: string +expression: | + ${ + return {"outDir": { + "class": "Directory", + "basename": inputs.destination, + "listing": inputs.inFiles + } }; + } +outputs: + outDir: Directory \ No newline at end of file diff --git a/workflows/sleuth/sleuth.R b/workflows/sleuth/sleuth.R index 2b2283c..a9e0b52 100644 --- a/workflows/sleuth/sleuth.R +++ b/workflows/sleuth/sleuth.R @@ -18,7 +18,6 @@ inMetadataFile <- args[2] inMetadataSample <- args[3] inMetadataFactorList <- args[4] inMetadataDataCol <- args[5] -outFolder <- args[6] # inKallistoResults <- "../../runs/kallisto/kallisto_results" # inMetadataFile <- "../../runs/isaSampleToRawDataSeq/rnaseq-samples.csv" @@ -27,12 +26,6 @@ outFolder <- args[6] # inMetadataDataCol <- "Output [Data]" # outFolder <- "." -################################################ -#### If it does not exist, create out dir -################################################ - -dir.create(outFolder, recursive = T, showWarnings = F) - ################################################ #### Read ISA sample metadata ################################################ @@ -43,10 +36,8 @@ samples <- read.csv(file = inMetadataFile, check.names = FALSE) #### Read Kallisto results ################################################ -base_dir <- inKallistoResults - # A list of paths to the kallisto results indexed by the sample IDs is collated with -kal_dirs <- dir(base_dir, full.names = T) ## Sleuth requires full paths +kal_dirs <- dir(inKallistoResults, full.names = T) ## Sleuth requires full paths s2c <- samples[order(samples[[inMetadataSample]]), c(inMetadataSample, unlist(inMetadataFactorList), inMetadataDataCol)] @@ -66,7 +57,7 @@ design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + "))) so <- sleuth_prep(s2c, full_model = design_formula) -save(so, file = file.path(outFolder, "kallisto_sleuthObject.RData")) +save(so, file = "kallisto_sleuthObject.RData") ################################################ #### Extract expression tables @@ -76,13 +67,13 @@ save(so, file = file.path(outFolder, "kallisto_sleuthObject.RData")) expression_data <- kallisto_table(so) ## write to file -write.csv(expression_data, file.path(outFolder, "kallisto_df.csv"), row.names = F) +write.csv(expression_data, "kallisto_df.csv", row.names = F) ## as tpm matrix (gene x sample) tpm_table <- reshape(expression_data, idvar = "target_id", timevar = "sample", direction = "wide", v.names = "tpm") # Write to file -write.csv(tpm_table, file.path(outFolder, "kallisto_tpmMatrix.csv"), row.names = F) +write.csv(tpm_table, "kallisto_tpmMatrix.csv", row.names = F) ################################################ #### Summarize mapping stats @@ -95,11 +86,11 @@ for (i in dir(kal_dirs, pattern = ".json", full.names = T)) { mapping_stats <- rbind(mapping_stats, z) } -write.csv(mapping_stats, file.path(outFolder, "kallisto_mappingStats.csv"), row.names = F) +write.csv(mapping_stats, "kallisto_mappingStats.csv", row.names = F) ################################################ -#### Run sleuth to identify DEGs +#### Run sleuth to identify DGE ################################################ so <- sleuth_fit(so) @@ -111,4 +102,4 @@ sleuth_table <- sleuth_results(so, "reduced:full", "lrt", show_all = FALSE) ### write to file -write.csv(sleuth_table, file.path(outFolder, "sleuth_dge.csv"), row.names = F) +write.csv(sleuth_table, "sleuth_dge.csv", row.names = F) diff --git a/workflows/sleuth/sleuth.cwl b/workflows/sleuth/sleuth.cwl new file mode 100644 index 0000000..3ffc997 --- /dev/null +++ b/workflows/sleuth/sleuth.cwl @@ -0,0 +1,87 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +hints: + DockerRequirement: + dockerPull: quay.io/biocontainers/mulled-v2-fdd016122f200fdc6dc30f6ea2fd0000e8067dff:f9531f6ac1f44332eff70b5912d7d5f3ebe8df38-0 + SoftwareRequirement: + packages: + - package: R + specs: + - https://identifiers.org/rrid/RRID:SCR_001905 + - https://identifiers.org/biotools/r + - https://anaconda.org/bioconda/r + version: [ "4.2.3" ] + - package: sleuth + version: [ "0.30.1" ] + specs: + - https://identifiers.org/rrid/RRID:SCR_016883 + - https://identifiers.org/biotools/sleuth + +requirements: + - class: InitialWorkDirRequirement + listing: + - entryname: sleuth.R + entry: + $include: sleuth.R + +baseCommand: [Rscript, sleuth.R] + +inputs: + inKallistoResults: + type: Directory + inputBinding: + position: 1 + inMetadataFile: + type: File + inputBinding: + position: 2 + inMetadataSample: + type: string + inputBinding: + position: 3 + inMetadataFactorList: + type: string[] + inputBinding: + position: 4 + inMetadataDataCol: + type: string + inputBinding: + position: 5 + +outputs: + kallisto_sleuthObject: + type: File + outputBinding: + glob: "kallisto_sleuthObject.RData" + kallisto_df: + type: File + outputBinding: + glob: "kallisto_df.csv" + kallisto_tpmMatrix: + type: File + outputBinding: + glob: "kallisto_tpmMatrix.csv" + kallisto_mappingStats: + type: File + outputBinding: + glob: "kallisto_mappingStats.csv" + sleuth_dge: + type: File + outputBinding: + glob: "sleuth_dge.csv" + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: + - https://schema.org/version/latest/schemaorg-current-https.rdf + - http://edamontology.org/EDAM_1.18.owl + +s:author: + - class: s:Person + s:name: Dominik Brilhaus + s:identifier: https://orcid.org/0000-0001-9021-3197 \ No newline at end of file diff --git a/workflows/sleuth/workflow.cwl b/workflows/sleuth/workflow.cwl index 497533c..5f08a47 100644 --- a/workflows/sleuth/workflow.cwl +++ b/workflows/sleuth/workflow.cwl @@ -1,66 +1,63 @@ #!/usr/bin/env cwl-runner - cwlVersion: v1.2 -class: CommandLineTool - -hints: - DockerRequirement: - dockerPull: quay.io/biocontainers/mulled-v2-fdd016122f200fdc6dc30f6ea2fd0000e8067dff:f9531f6ac1f44332eff70b5912d7d5f3ebe8df38-0 - SoftwareRequirement: - packages: - - package: R - specs: - - https://identifiers.org/rrid/RRID:SCR_001905 - - https://identifiers.org/biotools/r - - https://anaconda.org/bioconda/r - version: [ "4.2.3" ] - - package: sleuth - version: [ "0.30.1" ] - specs: - - https://identifiers.org/rrid/RRID:SCR_016883 - - https://identifiers.org/biotools/sleuth +class: Workflow requirements: - - class: InitialWorkDirRequirement - listing: - - entryname: sleuth.R - entry: - $include: sleuth.R - # - class: NetworkAccess - # networkAccess: true - -baseCommand: [Rscript, sleuth.R] + ScatterFeatureRequirement: {} + SubworkflowFeatureRequirement: {} + MultipleInputFeatureRequirement: {} inputs: - inKallistoResults: + inKallistoResults: Directory + inMetadataFile: File + inMetadataSample: string + inMetadataFactorList: string[] + inMetadataDataCol: string + finaloutdir: string + +steps: + sleuth: + run: sleuth.cwl + in: + inKallistoResults: inKallistoResults + inMetadataFile: inMetadataFile + inMetadataSample: inMetadataSample + inMetadataFactorList: inMetadataFactorList + inMetadataDataCol: inMetadataDataCol + out: + - kallisto_sleuthObject + - kallisto_df + - kallisto_tpmMatrix + - kallisto_mappingStats + - sleuth_dge + collectFiles: + run: ./gather-files.cwl + in: + inFiles: + source: + - sleuth/kallisto_sleuthObject + - sleuth/kallisto_df + - sleuth/kallisto_tpmMatrix + - sleuth/kallisto_mappingStats + - sleuth/sleuth_dge + linkMerge: merge_flattened + destination: finaloutdir + out: [outDir] + +outputs: + sleuth_outdir: type: Directory - inputBinding: - position: 1 - inMetadataFile: - type: File - inputBinding: - position: 2 - inMetadataSample: - type: string - inputBinding: - position: 3 - inMetadataFactorList: - type: string[] - inputBinding: - position: 4 - inMetadataDataCol: - type: string - inputBinding: - position: 5 - outFolder: - type: string - inputBinding: - position: 6 + outputSource: collectFiles/outDir -outputs: -- id: outdir - type: - type: array - items: Directory - outputBinding: - glob: $(runtime.outdir)/$(inputs.outFolder) +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: + - https://schema.org/version/latest/schemaorg-current-https.rdf + - http://edamontology.org/EDAM_1.18.owl + +s:author: + - class: s:Person + s:name: Dominik Brilhaus + s:identifier: https://orcid.org/0000-0001-9021-3197 -- GitLab