From 9952aa5f47cea2170299b1cfc9c88d49021dfc31 Mon Sep 17 00:00:00 2001 From: Dominik Brilhaus <brilhaus@nfdi4plants.org> Date: Wed, 7 Aug 2024 13:42:24 +0200 Subject: [PATCH] add ebi trimmomatic workflows --- workflows/trimmomatic/README.md | 5 + workflows/trimmomatic/trimmomatic-Dockerfile | 20 ++ workflows/trimmomatic/trimmomatic-docker.yml | 4 + .../trimmomatic/trimmomatic-end_mode.yaml | 3 + .../trimmomatic-illumina_clipping.yaml | 41 +++ .../trimmomatic/trimmomatic-max_info.yaml | 7 + workflows/trimmomatic/trimmomatic-phred.yaml | 3 + .../trimmomatic-sliding_window.yaml | 7 + workflows/trimmomatic/trimmomatic.cwl | 321 ++++++++++++++++++ 9 files changed, 411 insertions(+) create mode 100644 workflows/trimmomatic/README.md create mode 100644 workflows/trimmomatic/trimmomatic-Dockerfile create mode 100644 workflows/trimmomatic/trimmomatic-docker.yml create mode 100644 workflows/trimmomatic/trimmomatic-end_mode.yaml create mode 100755 workflows/trimmomatic/trimmomatic-illumina_clipping.yaml create mode 100755 workflows/trimmomatic/trimmomatic-max_info.yaml create mode 100644 workflows/trimmomatic/trimmomatic-phred.yaml create mode 100644 workflows/trimmomatic/trimmomatic-sliding_window.yaml create mode 100755 workflows/trimmomatic/trimmomatic.cwl diff --git a/workflows/trimmomatic/README.md b/workflows/trimmomatic/README.md new file mode 100644 index 0000000..87b73ce --- /dev/null +++ b/workflows/trimmomatic/README.md @@ -0,0 +1,5 @@ + +# Trimmomatic + +adapted from: https://github.com/EBI-Metagenomics/ebi-metagenomics-cwl/commit/7bb76f33bf40b5cd2604001cac46f967a209c47f + diff --git a/workflows/trimmomatic/trimmomatic-Dockerfile b/workflows/trimmomatic/trimmomatic-Dockerfile new file mode 100644 index 0000000..9d08d88 --- /dev/null +++ b/workflows/trimmomatic/trimmomatic-Dockerfile @@ -0,0 +1,20 @@ +################################################################# +# Dockerfile +# +# Software: trimmomatic +# Software Version: 0.32+dfsg-1 +# Description: DukeGCB 
trimmomatic image +# Website: http://www.usadellab.org/cms/?page=trimmomatic +# Provides: trimmomatic +# Base Image: dukegcb/trimmomatic +# Build Cmd: docker build --rm -t dukegcb/trimmomatic . +# Pull Cmd: docker pull dukegcb/trimmomatic +# Run Cmd: docker run --rm -it dukegcb/trimmomatic +################################################################# + +FROM phusion/baseimage +MAINTAINER Dan Leehr <dan.leehr@duke.edu> + +RUN apt-get update && apt-get install -y \ + openjdk-7-jre-headless \ + trimmomatic="0.32+dfsg-1" diff --git a/workflows/trimmomatic/trimmomatic-docker.yml b/workflows/trimmomatic/trimmomatic-docker.yml new file mode 100644 index 0000000..8a8df26 --- /dev/null +++ b/workflows/trimmomatic/trimmomatic-docker.yml @@ -0,0 +1,4 @@ +class: DockerRequirement +dockerPull: dukegcb/trimmomatic +dockerFile: > + $import: trimmomatic-Dockerfile diff --git a/workflows/trimmomatic/trimmomatic-end_mode.yaml b/workflows/trimmomatic/trimmomatic-end_mode.yaml new file mode 100644 index 0000000..2c53a6f --- /dev/null +++ b/workflows/trimmomatic/trimmomatic-end_mode.yaml @@ -0,0 +1,3 @@ +type: enum +name: end_mode +symbols: [ SE, PE ] diff --git a/workflows/trimmomatic/trimmomatic-illumina_clipping.yaml b/workflows/trimmomatic/trimmomatic-illumina_clipping.yaml new file mode 100755 index 0000000..8a18874 --- /dev/null +++ b/workflows/trimmomatic/trimmomatic-illumina_clipping.yaml @@ -0,0 +1,41 @@ +type: record +name: illuminaClipping +fields: + - name: adapters + type: File + doc: | + FASTA file containing adapters, PCR sequences, etc. It is used to search + for and remove these sequences in the input FASTQ file(s) + - name: seedMismatches + type: int + doc: | + specifies the maximum mismatch count which will still allow a full match + to be performed + - name: palindromeClipThreshold + type: int + doc: | + specifies how accurate the match between the two 'adapter ligated' reads + must be for PE palindrome read alignment. 
+ - name: simpleClipThreshold + type: int + doc: | + specifies how accurate the match between any adapter etc. sequence must + be against a read + - name: minAdapterLength + type: int? + doc: | + In addition to the alignment score, palindrome mode can verify that a + minimum length of adapter has been detected. If unspecified, this + defaults to 8 bases, for historical reasons. However, since palindrome + mode has a very low false positive rate, this can be safely reduced, even + down to 1, to allow shorter adapter fragments to be removed. + - name: keepBothReads + type: boolean + doc: | + After read-through has been detected by palindrome mode, and the adapter + sequence removed, the reverse read contains the same sequence information + as the forward read, albeit in reverse complement. For this reason, the + default behaviour is to entirely drop the reverse read. By specifying + "true" for this parameter, the reverse read will also be retained, which + may be useful e.g. if the downstream tools cannot handle a combination of + paired and unpaired reads. 
diff --git a/workflows/trimmomatic/trimmomatic-max_info.yaml b/workflows/trimmomatic/trimmomatic-max_info.yaml new file mode 100755 index 0000000..086b295 --- /dev/null +++ b/workflows/trimmomatic/trimmomatic-max_info.yaml @@ -0,0 +1,7 @@ +type: record +name: maxinfo +fields: + - name: targetLength + type: int + - name: strictness + type: float # fraction between 0 and 1, so it cannot be an int diff --git a/workflows/trimmomatic/trimmomatic-phred.yaml b/workflows/trimmomatic/trimmomatic-phred.yaml new file mode 100644 index 0000000..6015346 --- /dev/null +++ b/workflows/trimmomatic/trimmomatic-phred.yaml @@ -0,0 +1,3 @@ +type: enum +name: phred +symbols: [ '64', '33' ] diff --git a/workflows/trimmomatic/trimmomatic-sliding_window.yaml b/workflows/trimmomatic/trimmomatic-sliding_window.yaml new file mode 100644 index 0000000..66d3b8c --- /dev/null +++ b/workflows/trimmomatic/trimmomatic-sliding_window.yaml @@ -0,0 +1,7 @@ +type: record +name: slidingWindow +fields: + - name: windowSize + type: int + - name: requiredQuality + type: int diff --git a/workflows/trimmomatic/trimmomatic.cwl b/workflows/trimmomatic/trimmomatic.cwl new file mode 100755 index 0000000..2aaf27e --- /dev/null +++ b/workflows/trimmomatic/trimmomatic.cwl @@ -0,0 +1,321 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: CommandLineTool + +hints: + SoftwareRequirement: + packages: + trimmomatic: + specs: [ "https://identifiers.org/rrid/RRID:SCR_011848" ] + version: [ "0.32", "0.35", "0.36" ] + +requirements: + ResourceRequirement: + ramMin: 10240 + coresMin: 8 + SchemaDefRequirement: + types: + - $import: trimmomatic-end_mode.yaml + - $import: trimmomatic-sliding_window.yaml + - $import: trimmomatic-phred.yaml + - $import: trimmomatic-illumina_clipping.yaml + - $import: trimmomatic-max_info.yaml + InlineJavascriptRequirement: {} + ShellCommandRequirement: {} + +# hints: +# - $import: trimmomatic-docker.yml + +inputs: + phred: + type: trimmomatic-phred.yaml#phred? 
+ inputBinding: + prefix: -phred + separate: false + position: 4 + doc: | + "33" or "64" specifies the base quality encoding. Default: 64 + + tophred64: + type: boolean? + inputBinding: + position: 12 + prefix: TOPHRED64 + separate: false + doc: This (re)encodes the quality part of the FASTQ file to base 64. + + headcrop: + type: int? + inputBinding: + position: 13 + prefix: 'HEADCROP:' + separate: false + doc: | + Removes the specified number of bases, regardless of quality, from the + beginning of the read. + The number specified is the number of bases to remove from the start of + the read. + + tophred33: + type: boolean? + inputBinding: + position: 12 + prefix: TOPHRED33 + separate: false + doc: This (re)encodes the quality part of the FASTQ file to base 33. + + minlen: + type: int? + inputBinding: + position: 100 + prefix: 'MINLEN:' + separate: false + doc: | + This module removes reads that fall below the specified minimal length. + If required, it should normally be after all other processing steps. + Reads removed by this step will be counted and included in the "dropped + reads" count presented in the trimmomatic summary. + + java_opts: + type: string? + inputBinding: + position: 1 + shellQuote: false + doc: | + JVM arguments should be a quoted, space separated list + (e.g. "-Xms128m -Xmx512m") + + leading: + type: int? + inputBinding: + position: 14 + prefix: 'LEADING:' + separate: false + doc: | + Remove low quality bases from the beginning. As long as a base has a + value below this threshold the base is removed and the next base will be + investigated. + + slidingwindow: + type: trimmomatic-sliding_window.yaml#slidingWindow? + inputBinding: + position: 15 + valueFrom: | + ${ if ( self ) { + return "SLIDINGWINDOW:" + self.windowSize + ":" + + self.requiredQuality; + } else { + return self; + } + } + doc: | + Perform a sliding window trimming, cutting once the average quality + within the window falls below a threshold. 
By considering multiple + bases, a single poor quality base will not cause the removal of high + quality data later in the read. + <windowSize> specifies the number of bases to average across + <requiredQuality> specifies the average quality required + + illuminaClip: + type: trimmomatic-illumina_clipping.yaml#illuminaClipping? + inputBinding: + valueFrom: | + ${ if ( self ) { + return "ILLUMINACLIP:" + inputs.illuminaClip.adapters.path + ":" + + self.seedMismatches + ":" + self.palindromeClipThreshold + ":" + + self.simpleClipThreshold + ":" + (self.minAdapterLength == null ? 8 /* optional field: fall back to the documented default instead of emitting "null" */ : self.minAdapterLength) + ":" + + self.keepBothReads; + } else { + return self; + } + } + position: 11 + doc: Cut adapter and other illumina-specific sequences from the read. + + crop: + type: int? + inputBinding: + position: 13 + prefix: 'CROP:' + separate: false + doc: | + Removes bases regardless of quality from the end of the read, so that the + read has maximally the specified length after this step has been + performed. Steps performed after CROP might of course further shorten the + read. The value is the number of bases to keep, from the start of the read. + + reads2: + type: File? + format: edam:format_1930 # fastq + inputBinding: + position: 6 + doc: FASTQ file of R2 reads in Paired End mode + + reads1: + type: File + format: edam:format_1930 # fastq + inputBinding: + position: 5 + doc: FASTQ file of reads (R1 reads in Paired End mode) + + avgqual: + type: int? + inputBinding: + position: 101 + prefix: 'AVGQUAL:' + separate: false + doc: | + Drop the read if the average quality is below the specified level + + trailing: + type: int? + inputBinding: + position: 14 + prefix: 'TRAILING:' + separate: false + doc: | + Remove low quality bases from the end. As long as a base has a value + below this threshold the base is removed and the next base (which as + trimmomatic is starting from the 3' end would be the base preceding + the just removed base) will be investigated. 
This approach can be used for + removing the special Illumina "low quality segment" regions (which are + marked with a quality score of 2), but we recommend Sliding Window or + MaxInfo instead. + + maxinfo: + type: trimmomatic-max_info.yaml#maxinfo? + inputBinding: + position: 15 + valueFrom: | + ${ if ( self ) { + return "MAXINFO:" + self.targetLength + ":" + self.strictness; + } else { + return self; + } + } + doc: | + Performs an adaptive quality trim, balancing the benefits of retaining + longer reads against the costs of retaining bases with errors. + <targetLength>: This specifies the read length which is likely to allow + the location of the read within the target sequence to be determined. + <strictness>: This value, which should be set between 0 and 1, specifies + the balance between preserving as much read length as possible vs. + removal of incorrect bases. A low value of this parameter (<0.2) favours + longer reads, while a high value (>0.8) favours read correctness. + + end_mode: + type: trimmomatic-end_mode.yaml#end_mode + inputBinding: + position: 3 + doc: | + Single End (SE) or Paired End (PE) mode + +outputs: + reads1_trimmed: + type: File + format: edam:format_1930 # fastq + outputBinding: + glob: $(inputs.reads1.nameroot).trimmed.fastq + + output_log: + type: File + outputBinding: + glob: trim.log + label: Trimmomatic log + doc: | + log of all read trimmings, indicating the following details: + the read name + the surviving sequence length + the location of the first surviving base, aka. the amount trimmed from the start + the location of the last surviving base in the original read + the amount trimmed from the end + + reads1_trimmed_unpaired: + type: File? + format: edam:format_1930 # fastq + outputBinding: + glob: $(inputs.reads1.nameroot).unpaired.trimmed.fastq + + reads2_trimmed_paired: + type: File? 
+ format: edam:format_1930 # fastq + outputBinding: + glob: | + ${ if (inputs.reads2 ) { + return inputs.reads2.nameroot + '.trimmed.fastq'; + } else { + return null; + } + } + + reads2_trimmed_unpaired: + type: File? + format: edam:format_1930 # fastq + outputBinding: + glob: | + ${ if (inputs.reads2 ) { + return inputs.reads2.nameroot + '.unpaired.trimmed.fastq'; + } else { + return null; + } + } + +baseCommand: [ java, org.usadellab.trimmomatic.Trimmomatic ] + +arguments: +- valueFrom: trim.log + prefix: -trimlog + position: 4 +- valueFrom: $(runtime.cores) + position: 4 + prefix: -threads +- valueFrom: $(inputs.reads1.nameroot).trimmed.fastq + position: 7 +- valueFrom: | + ${ + if (inputs.end_mode == "PE" && inputs.reads2) { + return inputs.reads1.nameroot + '.unpaired.trimmed.fastq'; /* must match the reads1_trimmed_unpaired output glob */ + } else { + return null; + } + } + position: 8 +- valueFrom: | + ${ + if (inputs.end_mode == "PE" && inputs.reads2) { + return inputs.reads2.nameroot + '.trimmed.fastq'; + } else { + return null; + } + } + position: 9 +- valueFrom: | + ${ + if (inputs.end_mode == "PE" && inputs.reads2) { + return inputs.reads2.nameroot + '.unpaired.trimmed.fastq'; /* must match the reads2_trimmed_unpaired output glob */ + } else { + return null; + } + } + position: 10 + +doc: | + Trimmomatic is a fast, multithreaded command line tool that can be used to trim and crop + Illumina (FASTQ) data as well as to remove adapters. These adapters can pose a real problem + depending on the library preparation and downstream application. + There are two major modes of the program: Paired end mode and Single end mode. The + paired end mode will maintain correspondence of read pairs and also use the additional + information contained in paired reads to better find adapter or PCR primer fragments + introduced by the library preparation process. + Trimmomatic works with FASTQ files (using phred + 33 or phred + 64 quality scores, + depending on the Illumina pipeline used). 
+ +$namespaces: + edam: http://edamontology.org/ + s: http://schema.org/ +$schemas: + - http://edamontology.org/EDAM_1.16.owl + +s:license: "https://www.apache.org/licenses/LICENSE-2.0" +s:copyrightHolder: "EMBL - European Bioinformatics Institute" -- GitLab