From 52a5e1f6d44557a68bb9496ce512b99feac0a58a Mon Sep 17 00:00:00 2001 From: Dominik Brilhaus <brilhaus@nfdi4plants.org> Date: Fri, 21 Mar 2025 10:59:12 +0100 Subject: [PATCH 1/3] CWL metadata notes --- .cwl/cwl-metadata-checklist.md | 80 +++++++++++++++++++++++++++ workflows/fastqc/fastqc.cwl | 6 +- workflows/kallisto/kallisto-index.cwl | 2 +- workflows/kallisto/kallisto-quant.cwl | 2 +- 4 files changed, 84 insertions(+), 6 deletions(-) create mode 100644 .cwl/cwl-metadata-checklist.md diff --git a/.cwl/cwl-metadata-checklist.md b/.cwl/cwl-metadata-checklist.md new file mode 100644 index 0000000..e7b070f --- /dev/null +++ b/.cwl/cwl-metadata-checklist.md @@ -0,0 +1,80 @@ + +# Checklist for good CWL documents + +based on recommendations from: + +- https://www.commonwl.org/user_guide/topics/best-practices.html +- https://www.commonwl.org/user_guide/topics/metadata-and-authorship.html + +- [Design](#design) + - [Keep it Simple: Single-step](#keep-it-simple-single-step) +- [Dependencies](#dependencies) + - [Soft requirements = `hints`](#soft-requirements--hints) + - [Hard requirements = `requirements`](#hard-requirements--requirements) +- [Metadata](#metadata) + - [Namespaces and schemas](#namespaces-and-schemas) + +## Design + +### Keep it Simple: Single-step + +- follow the [KISS principle](https://en.wikipedia.org/wiki/KISS_principle) +- a `CommandLineTool` document should only execute one process +- use `Workflow` documents to design more complex, multi-step pipelines +- use `scatter` to execute the process on multiple inputs +- do not hard-code input or output paths + +## Dependencies + +### Soft requirements = `hints` + +Specify software and resource requirements under `hints` + +- add `SoftwareRequirement` to specify software version and reference + - `package: ` name of the software or package + - `specs: ` reference url from https://identifiers.org/biotools/ or SciCrunch https://identifiers.org/rrid/ + - `version: [ "0.11.9" ]` +- add `DockerRequirement` + - reference a local 
`Dockerfile` or a published Docker image +- add `ResourceRequirement` to specify the required compute resources + +### Hard requirements = `requirements` + +Use `requirements` primarily to specify hard requirements needed to run the current `CommandLineTool` or `Workflow` document + +## Metadata + +### Namespaces and schemas + +Adding namespaces and schemas allows them to be reused elsewhere in a CWL document + +```yaml +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: + - https://schema.org/version/latest/schemaorg-current-https.rdf + - http://edamontology.org/EDAM_1.18.owl +``` + +### Attribute authors and contributors + +```yaml +s:author: + - class: s:Person + s:identifier: <author ORCID> + s:email: mailto:<author email> + s:name: <author name> + +s:contributor: + - class: s:Person + s:identifier: <contributor ORCID> + s:email: mailto:<contributor email> + s:name: <contributor name> + +s:citation: <DOI to software paper> +s:codeRepository: <URL to software repo (e.g. github)> +s:dateCreated: "2016-12-13" +s:license: <URL to license, e.g. 
from https://spdx.org/licenses/> +``` diff --git a/workflows/fastqc/fastqc.cwl b/workflows/fastqc/fastqc.cwl index 1954ad0..4f96fb6 100644 --- a/workflows/fastqc/fastqc.cwl +++ b/workflows/fastqc/fastqc.cwl @@ -19,10 +19,10 @@ hints: dockerPull: quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1 SoftwareRequirement: packages: - fastqc: + - package: fastqc specs: - https://identifiers.org/biotools/fastqc - - - https://identifiers.org/rrid/RRID:SCR_014583 + - https://identifiers.org/rrid/RRID:SCR_014583 version: [ "0.11.9" ] baseCommand: "fastqc" @@ -58,5 +58,3 @@ $schemas: - https://edamontology.org/EDAM_1.25.owl s:license: https://spdx.org/licenses/GPL-3.0-or-later - - diff --git a/workflows/kallisto/kallisto-index.cwl b/workflows/kallisto/kallisto-index.cwl index 10f879d..595142c 100644 --- a/workflows/kallisto/kallisto-index.cwl +++ b/workflows/kallisto/kallisto-index.cwl @@ -33,7 +33,7 @@ hints: dockerPull: quay.io/biocontainers/kallisto:0.51.1--ha4fb952_1 SoftwareRequirement: packages: - kallisto: + - package: kallisto version: [ "0.51.1" ] specs: - https://identifiers.org/rrid/RRID:SCR_016582 diff --git a/workflows/kallisto/kallisto-quant.cwl b/workflows/kallisto/kallisto-quant.cwl index 1a8e6ea..e250c04 100755 --- a/workflows/kallisto/kallisto-quant.cwl +++ b/workflows/kallisto/kallisto-quant.cwl @@ -45,7 +45,7 @@ hints: dockerPull: quay.io/biocontainers/kallisto:0.51.1--ha4fb952_1 SoftwareRequirement: packages: - kallisto: + - package: kallisto version: [ "0.51.1" ] specs: [ https://identifiers.org/biotools/kallisto ] -- GitLab From be3024307de8401d3db5637b5caccae00cfd5552 Mon Sep 17 00:00:00 2001 From: Dominik Brilhaus <brilhaus@nfdi4plants.org> Date: Fri, 21 Mar 2025 11:02:23 +0100 Subject: [PATCH 2/3] add author to all run.cwl --- runs/deseq2/run.cwl | 13 +++++++++++++ runs/fastqc/run.cwl | 15 ++++++++++++++- runs/isaSampleToRawDataSeq/run.cwl | 15 ++++++++++++++- runs/kallisto/run.cwl | 13 +++++++++++++ runs/shiny/run.cwl | 13 +++++++++++++ 
runs/sleuth/run.cwl | 14 ++++++++++++++ 6 files changed, 81 insertions(+), 2 deletions(-) diff --git a/runs/deseq2/run.cwl b/runs/deseq2/run.cwl index 849cd98..e0fdbf4 100644 --- a/runs/deseq2/run.cwl +++ b/runs/deseq2/run.cwl @@ -22,3 +22,16 @@ outputs: output: type: File[] outputSource: deseq2/output + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: + - https://schema.org/version/latest/schemaorg-current-https.rdf + - http://edamontology.org/EDAM_1.18.owl + +s:author: + - class: s:Person + s:name: Dominik Brilhaus + s:identifier: https://orcid.org/0000-0001-9021-3197 \ No newline at end of file diff --git a/runs/fastqc/run.cwl b/runs/fastqc/run.cwl index b049a5f..fb4552b 100644 --- a/runs/fastqc/run.cwl +++ b/runs/fastqc/run.cwl @@ -21,4 +21,17 @@ steps: outputs: fastqc_outdir: type: Directory - outputSource: fastqc/fastqc_outdir \ No newline at end of file + outputSource: fastqc/fastqc_outdir + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: + - https://schema.org/version/latest/schemaorg-current-https.rdf + - http://edamontology.org/EDAM_1.18.owl + +s:author: + - class: s:Person + s:name: Dominik Brilhaus + s:identifier: https://orcid.org/0000-0001-9021-3197 \ No newline at end of file diff --git a/runs/isaSampleToRawDataSeq/run.cwl b/runs/isaSampleToRawDataSeq/run.cwl index aca29e7..803f531 100644 --- a/runs/isaSampleToRawDataSeq/run.cwl +++ b/runs/isaSampleToRawDataSeq/run.cwl @@ -25,4 +25,17 @@ outputs: outputSource: isaSampleToRawDataSeq/sampleseqCsv sampleseqXlsx: type: File - outputSource: isaSampleToRawDataSeq/sampleseqXlsx \ No newline at end of file + outputSource: isaSampleToRawDataSeq/sampleseqXlsx + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: + - https://schema.org/version/latest/schemaorg-current-https.rdf + - http://edamontology.org/EDAM_1.18.owl + +s:author: + - class: s:Person + s:name: Dominik Brilhaus + s:identifier: 
https://orcid.org/0000-0001-9021-3197 \ No newline at end of file diff --git a/runs/kallisto/run.cwl b/runs/kallisto/run.cwl index d702ed4..f06fae2 100644 --- a/runs/kallisto/run.cwl +++ b/runs/kallisto/run.cwl @@ -41,3 +41,16 @@ outputs: kallistoOutDir: type: Directory outputSource: kallisto/kallistoOutDir + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: + - https://schema.org/version/latest/schemaorg-current-https.rdf + - http://edamontology.org/EDAM_1.18.owl + +s:author: + - class: s:Person + s:name: Dominik Brilhaus + s:identifier: https://orcid.org/0000-0001-9021-3197 \ No newline at end of file diff --git a/runs/shiny/run.cwl b/runs/shiny/run.cwl index 71435f9..532fa88 100644 --- a/runs/shiny/run.cwl +++ b/runs/shiny/run.cwl @@ -16,3 +16,16 @@ steps: out: [] outputs: [] + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: + - https://schema.org/version/latest/schemaorg-current-https.rdf + - http://edamontology.org/EDAM_1.18.owl + +s:author: + - class: s:Person + s:name: Dominik Brilhaus + s:identifier: https://orcid.org/0000-0001-9021-3197 diff --git a/runs/sleuth/run.cwl b/runs/sleuth/run.cwl index e00ce51..7884d8d 100644 --- a/runs/sleuth/run.cwl +++ b/runs/sleuth/run.cwl @@ -27,3 +27,17 @@ outputs: outdir: type: Directory[] outputSource: sleuth/outdir + + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: + - https://schema.org/version/latest/schemaorg-current-https.rdf + - http://edamontology.org/EDAM_1.18.owl + +s:author: + - class: s:Person + s:name: Dominik Brilhaus + s:identifier: https://orcid.org/0000-0001-9021-3197 \ No newline at end of file -- GitLab From ed5710494ee579783346a29c0ca506f8259c19ce Mon Sep 17 00:00:00 2001 From: Dominik Brilhaus <brilhaus@nfdi4plants.org> Date: Fri, 21 Mar 2025 12:49:32 +0100 Subject: [PATCH 3/3] add citation and repo to kallisto --- workflows/kallisto/kallisto-index.cwl | 3 +++ 
workflows/kallisto/kallisto-quant.cwl | 3 +++ 2 files changed, 6 insertions(+) diff --git a/workflows/kallisto/kallisto-index.cwl b/workflows/kallisto/kallisto-index.cwl index 595142c..1a4b3c9 100644 --- a/workflows/kallisto/kallisto-index.cwl +++ b/workflows/kallisto/kallisto-index.cwl @@ -82,5 +82,8 @@ $namespaces: s: https://schema.org/ $schemas: - https://edamontology.org/EDAM_1.25.owl + - https://schema.org/version/latest/schemaorg-current-https.rdf s:license: https://spdx.org/licenses/BSD-2-Clause +s:citation: https://dx.doi.org/10.1038/nbt.3519 +s:codeRepository: https://github.com/pachterlab/kallisto diff --git a/workflows/kallisto/kallisto-quant.cwl b/workflows/kallisto/kallisto-quant.cwl index e250c04..93ff361 100755 --- a/workflows/kallisto/kallisto-quant.cwl +++ b/workflows/kallisto/kallisto-quant.cwl @@ -175,5 +175,8 @@ $namespaces: s: https://schema.org/ $schemas: - https://edamontology.org/EDAM_1.25.owl + - https://schema.org/version/latest/schemaorg-current-https.rdf s:license: https://spdx.org/licenses/BSD-2-Clause +s:citation: https://dx.doi.org/10.1038/nbt.3519 +s:codeRepository: https://github.com/pachterlab/kallisto -- GitLab