From 43934494bd887687d86b548cd93ea9205be40fa2 Mon Sep 17 00:00:00 2001
From: Dominik Brilhaus <brilhaus@nfdi4plants.org>
Date: Tue, 18 Mar 2025 08:50:50 +0100
Subject: [PATCH] add and run fastqc workflow

---
 .gitattributes                                | 14 +++++-
 .../Talinum_RNASeq_minimal/dataset/.gitkeep   |  0
 ...B_097_CAMMD_CAGATC_L001_R1_001_fastqc.html |  3 ++
 ...DB_097_CAMMD_CAGATC_L001_R1_001_fastqc.zip |  3 ++
 ...B_099_CAMMD_CTTGTA_L001_R1_001_fastqc.html |  3 ++
 ...DB_099_CAMMD_CTTGTA_L001_R1_001_fastqc.zip |  3 ++
 ...B_103_CAMMD_AGTCAA_L001_R1_001_fastqc.html |  3 ++
 ...DB_103_CAMMD_AGTCAA_L001_R1_001_fastqc.zip |  3 ++
 ..._161_reC3MD_GTCCGC_L001_R1_001_fastqc.html |  3 ++
 ...B_161_reC3MD_GTCCGC_L001_R1_001_fastqc.zip |  3 ++
 ..._163_reC3MD_GTGAAA_L001_R1_001_fastqc.html |  3 ++
 ...B_163_reC3MD_GTGAAA_L001_R1_001_fastqc.zip |  3 ++
 ...165_re-C3MD_GTGAAA_L002_R1_001_fastqc.html |  3 ++
 ..._165_re-C3MD_GTGAAA_L002_R1_001_fastqc.zip |  3 ++
 runs/fastqc/run.cwl                           | 24 ++++++++++
 runs/fastqc/run.yml                           | 14 ++++++
 workflows/fastqc/collectFilesInDir.cwl        | 20 ++++++++
 workflows/fastqc/fastqc.cwl                   | 48 +++++++++++++++++++
 workflows/fastqc/workflow.cwl                 | 32 +++++++++++++
 19 files changed, 187 insertions(+), 1 deletion(-)
 delete mode 100644 assays/Talinum_RNASeq_minimal/dataset/.gitkeep
 create mode 100644 runs/fastqc/results/DB_097_CAMMD_CAGATC_L001_R1_001_fastqc.html
 create mode 100644 runs/fastqc/results/DB_097_CAMMD_CAGATC_L001_R1_001_fastqc.zip
 create mode 100644 runs/fastqc/results/DB_099_CAMMD_CTTGTA_L001_R1_001_fastqc.html
 create mode 100644 runs/fastqc/results/DB_099_CAMMD_CTTGTA_L001_R1_001_fastqc.zip
 create mode 100644 runs/fastqc/results/DB_103_CAMMD_AGTCAA_L001_R1_001_fastqc.html
 create mode 100644 runs/fastqc/results/DB_103_CAMMD_AGTCAA_L001_R1_001_fastqc.zip
 create mode 100644 runs/fastqc/results/DB_161_reC3MD_GTCCGC_L001_R1_001_fastqc.html
 create mode 100644 runs/fastqc/results/DB_161_reC3MD_GTCCGC_L001_R1_001_fastqc.zip
 create mode 100644 runs/fastqc/results/DB_163_reC3MD_GTGAAA_L001_R1_001_fastqc.html
 create mode 100644 runs/fastqc/results/DB_163_reC3MD_GTGAAA_L001_R1_001_fastqc.zip
 create mode 100644 runs/fastqc/results/DB_165_re-C3MD_GTGAAA_L002_R1_001_fastqc.html
 create mode 100644 runs/fastqc/results/DB_165_re-C3MD_GTGAAA_L002_R1_001_fastqc.zip
 create mode 100644 runs/fastqc/run.cwl
 create mode 100644 runs/fastqc/run.yml
 create mode 100644 workflows/fastqc/collectFilesInDir.cwl
 create mode 100644 workflows/fastqc/fastqc.cwl
 create mode 100644 workflows/fastqc/workflow.cwl

diff --git a/.gitattributes b/.gitattributes
index 07a4a56..574339b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -32,4 +32,16 @@ runs/kallisto_sleuth/sleuth_dge.csv filter=lfs diff=lfs merge=lfs -text
 studies/TalinumGenomeDraft/resources/Talinum.gm.CDS.nt.fa filter=lfs diff=lfs merge=lfs -text
 runs/sleuth/kallisto_sleuthObject.RData filter=lfs diff=lfs merge=lfs -text
 runs/sleuth/out/kallisto_sleuthObject.RData filter=lfs diff=lfs merge=lfs -text
-runs/kallisto/kallisto_results/** filter=lfs diff=lfs merge=lfs -text
\ No newline at end of file
+runs/kallisto/kallisto_results/** filter=lfs diff=lfs merge=lfs -text
+runs/fastqc/results/DB_097_CAMMD_CAGATC_L001_R1_001_fastqc.zip filter=lfs diff=lfs merge=lfs -text
+runs/fastqc/results/DB_163_reC3MD_GTGAAA_L001_R1_001_fastqc.html filter=lfs diff=lfs merge=lfs -text
+runs/fastqc/results/DB_097_CAMMD_CAGATC_L001_R1_001_fastqc.html filter=lfs diff=lfs merge=lfs -text
+runs/fastqc/results/DB_099_CAMMD_CTTGTA_L001_R1_001_fastqc.html filter=lfs diff=lfs merge=lfs -text
+runs/fastqc/results/DB_099_CAMMD_CTTGTA_L001_R1_001_fastqc.zip filter=lfs diff=lfs merge=lfs -text
+runs/fastqc/results/DB_103_CAMMD_AGTCAA_L001_R1_001_fastqc.html filter=lfs diff=lfs merge=lfs -text
+runs/fastqc/results/DB_103_CAMMD_AGTCAA_L001_R1_001_fastqc.zip filter=lfs diff=lfs merge=lfs -text
+runs/fastqc/results/DB_161_reC3MD_GTCCGC_L001_R1_001_fastqc.html filter=lfs diff=lfs merge=lfs -text
+runs/fastqc/results/DB_161_reC3MD_GTCCGC_L001_R1_001_fastqc.zip filter=lfs diff=lfs merge=lfs -text
+runs/fastqc/results/DB_163_reC3MD_GTGAAA_L001_R1_001_fastqc.zip filter=lfs diff=lfs merge=lfs -text
+runs/fastqc/results/DB_165_re-C3MD_GTGAAA_L002_R1_001_fastqc.html filter=lfs diff=lfs merge=lfs -text
+runs/fastqc/results/DB_165_re-C3MD_GTGAAA_L002_R1_001_fastqc.zip filter=lfs diff=lfs merge=lfs -text
diff --git a/assays/Talinum_RNASeq_minimal/dataset/.gitkeep b/assays/Talinum_RNASeq_minimal/dataset/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/runs/fastqc/results/DB_097_CAMMD_CAGATC_L001_R1_001_fastqc.html b/runs/fastqc/results/DB_097_CAMMD_CAGATC_L001_R1_001_fastqc.html
new file mode 100644
index 0000000..51bd550
--- /dev/null
+++ b/runs/fastqc/results/DB_097_CAMMD_CAGATC_L001_R1_001_fastqc.html
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:496b053b05cdbff7213e1ce93e168c9aae01830b875ba4f140f0445add3a4c9d
+size 608772
diff --git a/runs/fastqc/results/DB_097_CAMMD_CAGATC_L001_R1_001_fastqc.zip b/runs/fastqc/results/DB_097_CAMMD_CAGATC_L001_R1_001_fastqc.zip
new file mode 100644
index 0000000..9cf6ee9
--- /dev/null
+++ b/runs/fastqc/results/DB_097_CAMMD_CAGATC_L001_R1_001_fastqc.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:453098e0f3046efe2317023867e5859e92ee91b254da02f4c84c6a569a12c358
+size 423523
diff --git a/runs/fastqc/results/DB_099_CAMMD_CTTGTA_L001_R1_001_fastqc.html b/runs/fastqc/results/DB_099_CAMMD_CTTGTA_L001_R1_001_fastqc.html
new file mode 100644
index 0000000..141b06f
--- /dev/null
+++ b/runs/fastqc/results/DB_099_CAMMD_CTTGTA_L001_R1_001_fastqc.html
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d20c29bba7d056d66f6a6516c4d1044cdb26184971499eb8814de46ebcbaf6c
+size 610809
diff --git a/runs/fastqc/results/DB_099_CAMMD_CTTGTA_L001_R1_001_fastqc.zip b/runs/fastqc/results/DB_099_CAMMD_CTTGTA_L001_R1_001_fastqc.zip
new file mode 100644
index 0000000..3424223
--- /dev/null
+++ b/runs/fastqc/results/DB_099_CAMMD_CTTGTA_L001_R1_001_fastqc.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c51133d51ff8d98ddaf9df5373e5dc832a66de48b603fc1fca788b75fc9314f8
+size 425882
diff --git a/runs/fastqc/results/DB_103_CAMMD_AGTCAA_L001_R1_001_fastqc.html b/runs/fastqc/results/DB_103_CAMMD_AGTCAA_L001_R1_001_fastqc.html
new file mode 100644
index 0000000..1bbad97
--- /dev/null
+++ b/runs/fastqc/results/DB_103_CAMMD_AGTCAA_L001_R1_001_fastqc.html
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:462d62cb603020068aff4606f26b62d818d2bfb56eb1894dc993638a5763c7ae
+size 611722
diff --git a/runs/fastqc/results/DB_103_CAMMD_AGTCAA_L001_R1_001_fastqc.zip b/runs/fastqc/results/DB_103_CAMMD_AGTCAA_L001_R1_001_fastqc.zip
new file mode 100644
index 0000000..3ebe761
--- /dev/null
+++ b/runs/fastqc/results/DB_103_CAMMD_AGTCAA_L001_R1_001_fastqc.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e7d78a994d307c6a57adbb4b41d22207d0ae7ebf2ace568f9fbe70aa606da03
+size 424835
diff --git a/runs/fastqc/results/DB_161_reC3MD_GTCCGC_L001_R1_001_fastqc.html b/runs/fastqc/results/DB_161_reC3MD_GTCCGC_L001_R1_001_fastqc.html
new file mode 100644
index 0000000..25b6a55
--- /dev/null
+++ b/runs/fastqc/results/DB_161_reC3MD_GTCCGC_L001_R1_001_fastqc.html
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e689e4f749d29fff8ec52ada03471255579e43f04e49191bb8b19896348e576
+size 608061
diff --git a/runs/fastqc/results/DB_161_reC3MD_GTCCGC_L001_R1_001_fastqc.zip b/runs/fastqc/results/DB_161_reC3MD_GTCCGC_L001_R1_001_fastqc.zip
new file mode 100644
index 0000000..545e4ba
--- /dev/null
+++ b/runs/fastqc/results/DB_161_reC3MD_GTCCGC_L001_R1_001_fastqc.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1772958d490994dabde1091c2506c634b78d12698027b37e9d8e15d20a72fbc1
+size 420951
diff --git a/runs/fastqc/results/DB_163_reC3MD_GTGAAA_L001_R1_001_fastqc.html b/runs/fastqc/results/DB_163_reC3MD_GTGAAA_L001_R1_001_fastqc.html
new file mode 100644
index 0000000..9df6845
--- /dev/null
+++ b/runs/fastqc/results/DB_163_reC3MD_GTGAAA_L001_R1_001_fastqc.html
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f54fd0ae336e95fa780a9b0fb097beb1f50cdbbc24610d6247066eba3b0454a
+size 610537
diff --git a/runs/fastqc/results/DB_163_reC3MD_GTGAAA_L001_R1_001_fastqc.zip b/runs/fastqc/results/DB_163_reC3MD_GTGAAA_L001_R1_001_fastqc.zip
new file mode 100644
index 0000000..d94d295
--- /dev/null
+++ b/runs/fastqc/results/DB_163_reC3MD_GTGAAA_L001_R1_001_fastqc.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be44017bdb5956ac2b479115fd26a5f605b493b3224050bb0a0af6deb53786ac
+size 424179
diff --git a/runs/fastqc/results/DB_165_re-C3MD_GTGAAA_L002_R1_001_fastqc.html b/runs/fastqc/results/DB_165_re-C3MD_GTGAAA_L002_R1_001_fastqc.html
new file mode 100644
index 0000000..5c6ddb7
--- /dev/null
+++ b/runs/fastqc/results/DB_165_re-C3MD_GTGAAA_L002_R1_001_fastqc.html
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24474916df5d793336d0806f8ae28754e3ce1c03c3c3979f91a37550f26eb126
+size 609445
diff --git a/runs/fastqc/results/DB_165_re-C3MD_GTGAAA_L002_R1_001_fastqc.zip b/runs/fastqc/results/DB_165_re-C3MD_GTGAAA_L002_R1_001_fastqc.zip
new file mode 100644
index 0000000..1e9c80d
--- /dev/null
+++ b/runs/fastqc/results/DB_165_re-C3MD_GTGAAA_L002_R1_001_fastqc.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c7efe5eb2d68cb409f26228692e0b0d19da26787a2a3716faafd6f88722fdb0
+size 426039
diff --git a/runs/fastqc/run.cwl b/runs/fastqc/run.cwl
new file mode 100644
index 0000000..974e992
--- /dev/null
+++ b/runs/fastqc/run.cwl
@@ -0,0 +1,24 @@
+cwlVersion: v1.2
+class: Workflow  # run-level wrapper: its only step delegates to workflows/fastqc/workflow.cwl
+
+requirements:
+  SubworkflowFeatureRequirement: {}  # the single step below runs another Workflow
+  ScatterFeatureRequirement: {}  # no scatter in this file; the sub-workflow scatters over fastq
+  MultipleInputFeatureRequirement: {}  # no multi-source input in this file; the sub-workflow uses linkMerge
+
+inputs:
+  fastq: File[]  # forwarded unchanged to the sub-workflow input of the same name
+  finaloutdir: string  # forwarded unchanged to the sub-workflow input of the same name
+
+steps:
+  fastqc:
+    run: ../../workflows/fastqc/workflow.cwl
+    in:
+      fastq: fastq
+      finaloutdir: finaloutdir
+    out: [outdir]
+
+outputs:
+  outdir:
+    type: Directory  # the sub-workflow's collected output directory, re-exported as-is
+    outputSource: fastqc/outdir
\ No newline at end of file
diff --git a/runs/fastqc/run.yml b/runs/fastqc/run.yml
new file mode 100644
index 0000000..d9744cf
--- /dev/null
+++ b/runs/fastqc/run.yml
@@ -0,0 +1,14 @@
+finaloutdir: "results"  # key matches the `finaloutdir: string` input declared in run.cwl
+fastq:  # key matches the `fastq: File[]` input declared in run.cwl; one entry per read file
+  - class: File
+    path: ../../assays/Talinum_RNASeq_minimal/dataset/DB_097_CAMMD_CAGATC_L001_R1_001.fastq.gz
+  - class: File
+    path: ../../assays/Talinum_RNASeq_minimal/dataset/DB_099_CAMMD_CTTGTA_L001_R1_001.fastq.gz
+  - class: File
+    path: ../../assays/Talinum_RNASeq_minimal/dataset/DB_103_CAMMD_AGTCAA_L001_R1_001.fastq.gz
+  - class: File
+    path: ../../assays/Talinum_RNASeq_minimal/dataset/DB_161_reC3MD_GTCCGC_L001_R1_001.fastq.gz
+  - class: File
+    path: ../../assays/Talinum_RNASeq_minimal/dataset/DB_163_reC3MD_GTGAAA_L001_R1_001.fastq.gz
+  - class: File
+    path: ../../assays/Talinum_RNASeq_minimal/dataset/DB_165_re-C3MD_GTGAAA_L002_R1_001.fastq.gz
\ No newline at end of file
diff --git a/workflows/fastqc/collectFilesInDir.cwl b/workflows/fastqc/collectFilesInDir.cwl
new file mode 100644
index 0000000..39f9ff8
--- /dev/null
+++ b/workflows/fastqc/collectFilesInDir.cwl
@@ -0,0 +1,20 @@
+cwlVersion: v1.2
+class: ExpressionTool  # no command is executed; the output is computed by the expression below
+label: Collect files in a directory
+doc: |
+  Takes Files (e.g. from a workflow step) and yields them in a desired directory.
+requirements:
+  - class: InlineJavascriptRequirement  # the expression below is a ${...} JavaScript body
+inputs:
+  files: File[]  # used verbatim as the "listing" of the returned Directory
+  destination: string  # used as the "basename" of the returned Directory
+expression: |
+  ${
+    return {"outDir": {
+      "class": "Directory", 
+      "basename": inputs.destination,
+      "listing": inputs.files
+    } };
+  }
+outputs:
+  outDir: Directory  # filled from the "outDir" key of the object the expression returns
\ No newline at end of file
diff --git a/workflows/fastqc/fastqc.cwl b/workflows/fastqc/fastqc.cwl
new file mode 100644
index 0000000..a2e408d
--- /dev/null
+++ b/workflows/fastqc/fastqc.cwl
@@ -0,0 +1,48 @@
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.2
+class: CommandLineTool  # wraps one `fastqc` invocation on a single input file
+
+label: Run fastqc on raw reads in FASTQ format (single or paired end) or aligned reads in BAM.
+
+doc: |
+  simplified from: https://github.com/common-workflow-library/bio-cwl-tools/blob/66f620da5b0a11e934a6da83272205a2516bcd91/fastqc/fastqc_1.cwl
+  
+  Run fastqc on raw reads in FASTQ format (single or paired end) or aligned reads in BAM.
+
+hints:  # everything below is declared as a hint, not as a requirement
+  ResourceRequirement:
+    coresMin: 1
+    ramMin: 5000  # NOTE(review): CWL expresses ramMin in MiB — confirm ~5 GiB is intended
+  DockerRequirement:
+    dockerPull: quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1  # image tag carries the same 0.11.9 as the SoftwareRequirement below
+  SoftwareRequirement:
+    packages:
+      fastqc:
+        specs: [ https://identifiers.org/biotools/fastqc ]
+        version: [ "0.11.9" ]
+
+
+baseCommand: "fastqc"
+arguments:  # fixed options added to the command line besides the fastq input binding
+  - valueFrom: $(runtime.outdir)  # combined with the prefix below into `-o <runtime.outdir>`
+    prefix: "-o"
+  - valueFrom: "--noextract"  # presumably leaves the result zip unexpanded — confirm against `fastqc --help`
+
+inputs:
+  fastq:
+    type: File  # a single file per invocation; arrays are handled by the calling workflow's scatter
+    inputBinding:
+      position: 1
+ 
+outputs:
+  fastqc_zip:
+    doc: all data e.g. figures
+    type: File  # declared as a single File collected via the glob below
+    outputBinding:
+      glob: "*_fastqc.zip"
+  fastqc_html:
+    doc: html report showing results from zip
+    type: File  # declared as a single File collected via the glob below
+    outputBinding:
+      glob: "*_fastqc.html"
+    
diff --git a/workflows/fastqc/workflow.cwl b/workflows/fastqc/workflow.cwl
new file mode 100644
index 0000000..d80a7ea
--- /dev/null
+++ b/workflows/fastqc/workflow.cwl
@@ -0,0 +1,32 @@
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.2
+class: Workflow  # runs fastqc.cwl once per input file, then gathers all reports into one Directory
+
+requirements:
+  ScatterFeatureRequirement: {}  # the fastqc step scatters over the fastq array
+  MultipleInputFeatureRequirement: {}  # collectFiles/files has two sources joined via linkMerge
+
+inputs:
+  fastq: File[]  # one fastqc.cwl invocation is scheduled per array element
+  finaloutdir: string  # passed to collectFilesInDir.cwl as `destination`
+
+steps:
+  fastqc:
+    run: fastqc.cwl
+    scatter: fastq
+    in:
+      fastq: fastq
+    out: [fastqc_zip, fastqc_html]  # each becomes an array because the step is scattered
+  collectFiles:
+    run: ./collectFilesInDir.cwl
+    in:
+      destination: finaloutdir
+      files:
+        source: [fastqc/fastqc_html, fastqc/fastqc_zip]
+        linkMerge: merge_flattened  # joins the two File arrays into the single File[] the tool expects
+    out: [outDir]
+
+outputs:
+  outdir:
+    type: Directory  # the Directory produced by the collectFiles step
+    outputSource: collectFiles/outDir
\ No newline at end of file
-- 
GitLab