#!/bin/bash

#. /etc/profile.d/modules.sh

export PATH=$PATH:/vol/biotools/bin
export MANPATH=$MANPATH:/vol/biotools/man
export PATH=$PATH:/vol/software/bin


# install required R packages - the lummerland cluster does not have them ...
#echo "install.packages(\"gplots\", repos=\"https://cran.rstudio.com\")" | R --vanilla --no-save
#R CMD INSTALL /vol/agrensing/tools/R_packages/bitops_1.0-6.tar.gz
#R CMD INSTALL /vol/agrensing/tools/R_packages/caTools_1.17.1.1.tar.gz
#R CMD INSTALL /vol/agrensing/tools/R_packages/gtools_3.8.1.tar.gz
#R CMD INSTALL /vol/agrensing/tools/R_packages/gdata_2.18.0.tar.gz
#R CMD INSTALL /vol/agrensing/tools/R_packages/gplots_3.0.1.tar.gz

#functions

startle ()
{
	number_row_reads=`wc -l $fw | awk '{print $1/4}'`
	echo "## Start log ##"
	echo "Number of row reads: "$number_row_reads
}

trimmo ()
 {
   # Remove leading low quality or N bases (below quality 3) (LEADING:3)
   # Remove trailing low quality or N bases (below quality 3) (TRAILING:3)
   # Scan the read with a 4-base wide sliding window, cutting when the average quality per base drops below 15 (SLIDINGWINDOW:4:15)
   # HEADCROP: Cut the specified Number of bases from the start of the read
   # MINLEN was first set to 50 bp, but some libraries start with reads of only 50 bp, so it is set to 30 bp here.
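   # ILLUMINACLIP takes <adapter fasta>:<seed mismatches>:<palindrome clip threshold>:<simple clip threshold>;
   # the 2:30:10 used below are the values recommended in the Trimmomatic manual.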

   if [ -e $rv ]; then
	java $java_ram -jar /nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads $cpu -phred33 -summary $trimmo_out".trimmo.stats" \
	  $fw $rv $trimmo_out".PE.fw" $trimmo_out".SE.fw" $trimmo_out".PE.rv" $trimmo_out".SE.rv" \
	  ILLUMINACLIP:/nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/adapters/TruSeq-PE_all.fna:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 HEADCROP:10 MINLEN:30

	# stats
	echo "## Trimmo statistics ##"
	printf "trim\t"$trimmo_out".PE.fw\t"; grep -c "^\+$" $trimmo_out".PE.fw"
	# fastqc
	/nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" $trimmo_out".PE."*
    fi
    if [ ! -e $rv ]; then
	java $java_ram -jar /nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/trimmomatic-0.39.jar SE -threads $cpu -phred33 -summary $trimmo_out".trimmo.stats" \
	  $fw $trimmo_out".SE" \
	  ILLUMINACLIP:/nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/adapters/TruSeq-PE_all.fna:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 HEADCROP:10 MINLEN:30

	# stats
	echo "## Trimmo statistics ##"
	# printf "trim\t"$trimmo_out".SE\t"; grep -c "^\+$" $trimmo_out".SE"
        printf "trim\t"$trimmo_out".SE\t"; wc -l $trimmo_out".SE" | awk '{print $1/4}'
	# fastqc
	/nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" $trimmo_out".SE"
    fi

 }

polyA ()
 {
   # min_len was first set to 50 bp, but some libraries start with reads of only 50 bp, so it is set to 30 bp here.
   # If you specify any statistics option, no other output will be generated. To preprocess data, do not specify a statistics option.
   # -stats_dupl -stats_all
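   # prinseq-lite options used below (per the prinseq-lite documentation): -trim_tail_left/-trim_tail_right 5
   # trim poly-A/T tails of at least 5 bases from either read end, -min_len 30 drops reads shorter than
   # 30 bp afterwards, and -out_good/-out_bad name the passed/failed output files.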

   inpA=$polyA_out".polyA.passed"
   infA=$polyA_out".polyA.failed"

   if [ -e $rv ]; then
	/nfs/agrensing/tools/prinseq/prinseq-lite-0.20.4/./prinseq-lite.pl -verbose -no_qual_header -fastq $trimmo_out".PE.fw" -fastq2 $trimmo_out".PE.rv" \
	  -trim_tail_left 5 -trim_tail_right 5 -min_len 30 -out_good $inpA -out_bad $infA \
	  -out_format 3 -log $polyA_out"_polyA.log"

	# stats
	echo "## PolyA statistics ##"
	printf "polyA\t"$inpA"_1.fastq\t"; wc -l $inpA"_1.fastq" | awk '{print $1/4}'
	#fastqc
	/nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" $inpA"_"[12]".fastq"
   fi
   if [ ! -e $rv ]; then
	/nfs/agrensing/tools/prinseq/prinseq-lite-0.20.4/./prinseq-lite.pl -verbose -no_qual_header -fastq $trimmo_out".SE" \
	  -trim_tail_left 5 -trim_tail_right 5 -min_len 30 -out_good $inpA -out_bad $infA \
	  -out_format 3 -log $polyA_out"_polyA.log"

	# stats
	echo "## PolyA statistics ##"
	printf "polyA\t"$inpA".fastq\t"; grep -c "^\+$" $inpA".fastq"
	#fastqc
	/nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" $inpA".fastq"
   fi
 }

duple ()
{
	# number of 100% identical duplicate reads.
	echo "Duplication:"

        if [ -e $rv ]; then

                #SE 1
                IN=$polyA_out".polyA.passed"

                num_dup_se1=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_1_dupl_histogram"}}' $IN"_1.fastq" | wc -l`
                printf $num_dup_se1"\t"; printf %.5f\\t "$(( 100000 * num_dup_se1 / number_row_reads ))e-5"
                #SE 2
                num_dup_se2=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_2_dupl_histogram"}}' $IN"_2.fastq" | wc -l`
                printf $num_dup_se2"\t"; printf %.5f\\t "$(( 100000 * num_dup_se2 / number_row_reads ))e-5"


                #PE
                num_dup_pe=`awk -v nof=$IN 'NR % 4 == 1 || NR % 4 == 2 {if(n==1){if(foo[name]==""){c=0}; foo[name]=foo[name]$1; if(c==1){seq[foo[name]]++} else {c=1}; n=0} else if($1~/^@/){split($1,w,"/"); name=w[1]; foo[name]; n=1}} END {for(a in seq){print seq[a]; p[seq[a]]++} for(l in p){print l"\t"p[l] >> nof"_1_2_dupl_histogram"}}' $IN"_1.fastq" $IN"_2.fastq" | wc -l`
                printf $num_dup_pe"\t"; printf %.5f\\n "$(( 100000 * num_dup_pe / number_row_reads ))e-5"


        fi
        if [ ! -e $rv ]; then
                for IN in $polyA_out".polyA.passed"*;
                do
                  num_dup_se1=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_dupl_histogram"}}' $IN | wc -l`
                  printf $IN"\t"$num_dup_se1"\t"; printf %.5f\\n "$(( 100000 * num_dup_se1 / number_row_reads ))e-5"
                done
        fi


}

mappi()
{
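    # gsnap options (per the GSNAP documentation): -D/-d select the genome index, -N 1 allows novel
    # splicing, -n/--npaths limit the number of reported alignments, -A sam selects SAM output,
    # --split-output writes one file per mapping category (concordant_uniq, ...), and --failed-input
    # collects the unmapped reads.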
    if [ -e $rv ]; then
	/homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -N 1 -n 7 -A sam --npaths=5 \
          --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN  \
          --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \
          $polyA_out".polyA.passed_1.fastq" $polyA_out".polyA.passed_2.fastq" 2> $mappi_out".mappi.err"

	unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped.1"`

    fi
    if [ ! -e $rv ]; then
	/ceph/agrensing/tools/gmap-gsnap/gmap-2021-03-08/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -N 1 -n 5 -A sam --npaths=5 \
          --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN  \
          --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \
          $polyA_out".polyA.passed.fastq" 2> $mappi_out".mappi.err"

	unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped"`
    fi



    # mapping statistics
        echo "## Pp genome version 3 ##"
        echo "Unmapped reads: "$unmapped_reads

}

organelli()
{
        if [ -e $rv ]; then
                for IN in $mappi_out".genome.concordant_"*;
                do
                  print "Fragments:"
                  printf $IN"\t";
                  with=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Ei "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
                  wo=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Eiv "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
                  awk -v with=$with -v wo=$wo 'BEGIN {print with"\t"wo"\t"with/(wo+with)*100" %"}' ;
                done
        fi
        if [ ! -e $rv ]; then
                for IN in $mappi_out".genome"*"uniq"*;
                do
                  print "SE reads:"
                  printf $IN"\t";
                  with=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Ei "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
                  wo=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Eiv "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
                  awk -v with=$with -v wo=$wo 'BEGIN {print with"\t"wo"\t"with/(wo+with)*100" %"}' ;
                done

        fi


}

mappi_gDNA()
{
     if [ -e $rv ]; then
          /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -n 7 -A sam --npaths=5 \
            --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN  \
            --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \
            $polyA_out".polyA.passed_1.fastq" $polyA_out".polyA.passed_2.fastq" 2> $mappi_out".mappi.err"

          unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped.1"`
     fi

     if [ ! -e $rv ]; then
          /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -n 5 -A sam --npaths=5 \
           --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN  \
           --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \
           $polyA_out".polyA.passed.fastq" 2> $mappi_out".mappi.err"

         unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped"`
     fi

  # mapping statistics
   echo "## Pp genome version 3 ##"
   echo "Unmapped reads: "$unmapped_reads
}


sample ()
{
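     # For every gsnap split-output SAM file: (1) create a coordinate-sorted BAM, (2) drop
     # organellar/ribosomal/ERCC hits and run fixmate + markdup -r to remove duplicates into
     # *.filter.fixmate.sort.markdup_r.bam, then index both BAMs; finally merge all *uniq* BAMs
     # into $mappi_out".uniq.merged.bam" and $mappi_out".uniq.merged.rmdup.bam".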
     # if PE use ".genome"*[tcqg]
     for IN in $mappi_out".genome"*[tcqg]
     do
	  printf $IN"\tsort.bam\n" >&2
	   /vol/agrensing/tools/samtools/samtools-1.12/samtools view -@ $cpu -bhS $IN | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G - -o $IN".sort.bam"

	  printf $IN"\trmdup.bam\n" >&2
	   grep -vE "chloro|ribo|mito|ERCC" $IN | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G -n -O BAM - | \
           /vol/agrensing/tools/samtools/samtools-1.12/samtools fixmate -@ $cpu -m - - | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G -O BAM - | \
           /vol/agrensing/tools/samtools/samtools-1.12/samtools markdup -@ $cpu -s -r - \
	   $IN".filter.fixmate.sort.markdup_r.bam"

	  # samtools statistics
	  v_tmp=`awk '$1!~"@" {print $1}' $IN | sort | uniq | wc -l`
	  printf "Number of mapped reads: "$IN"\t"$v_tmp"\n"

          # compress the sam text files
          # gzip $IN
	  # rm sam file
          rm $IN


	  # bam index
          /vol/agrensing/tools/samtools/samtools-1.12/samtools index $IN*"_r.bam"
          /vol/agrensing/tools/samtools/samtools-1.12/samtools index $IN".sort.bam"

     done

     ls -l $mappi_out*"bam" >&2

     # merge all unique bam files
     echo "merging all .uniq bam files" >&2
     echo "merging all .uniq bam files"
     /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.bam" $mappi_out*"uniq"*".sort.bam"
     /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.bam"

     /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.rmdup.bam" $mappi_out*"uniq"*"_r.bam"
     /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.rmdup.bam"
}

counti()
{
     if [ -e $rv ]; then
        # read count v3.3 and ERCC combined
        /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \
          -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.pB.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.pB.log"
        /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \
          -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.p.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.p.log"
        /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \
          -t exon -g Parent -o $counti_out"_v3.3_31315_ercc.exon.pB.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_31315_ercc.exon.pB.log"
        /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \
          -t exon -g Parent -o $counti_out"_v3.3_31315_ercc.exon.p.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_31315_ercc.exon.p.log"


       /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \
         -t exon -g Parent -o $counti_out"_all_v3.3_31315_ercc.exon.pB.txt" $mappi_out".uniq.merged.bam" 2> $counti_out"_all_v3.3_31315_ercc.exon.pB.log"
       /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \
         -t exon -g Parent -o $counti_out"_all_v3.3_31315_ercc.exon.p.txt" $mappi_out".uniq.merged.bam" 2> $counti_out"_all_v3.3_31315_ercc.exon.p.log"

        # statistics
        printf "\n### FeatureCount statisitcs v3.3 ###\n"
        # 32458
        echo "Fragment counts on gene models exon PE 32458:"
        grep "Successfully assigned fragments" $counti_out"_v3.3_32458_ercc.exon.pB.log"
        echo "Fragment counts on gene models exon PE+SE 32458:"
        grep "Successfully assigned fragments" $counti_out"_v3.3_32458_ercc.exon.p.log"
        # 31315
        echo "Fragment counts on gene models exon PE 31315:"
        grep "Successfully assigned fragments" $counti_out"_v3.3_31315_ercc.exon.pB.log"
        echo "Fragment counts on gene models exon PE+SE 31315:"
        grep "Successfully assigned fragments" $counti_out"_v3.3_31315_ercc.exon.p.log"
        # 31315 no rmdup
        echo "Fragment counts on gene models exon PE 31315 (no rmdup):"
        grep "Successfully assigned fragments" $counti_out"_all_v3.3_31315_ercc.exon.pB.log"
        echo "Fragment counts on gene models exon PE+SE 31315 (no rmdup):"
        grep "Successfully assigned fragments" $counti_out"_all_v3.3_31315_ercc.exon.p.log"

     fi
     if [ ! -e $rv ]; then

        /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \
          -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.SE.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.SE.log"
        /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \
          -t exon -g Parent -o $counti_out"_v3.3_31315_ercc.exon.SE.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_31315_ercc.exon.SE.log"

        # statistics
        printf "\n### FeatureCount statisitcs v3.3 ###"
        # 32458
        echo "Fragment counts on gene models exon SE 32458:"
        grep "Successfully assigned " $counti_out"_v3.3_32458_ercc.exon.SE.log"
        # 31315
        echo "Fragment counts on gene models exon SE 31315:"
        grep "Successfully assigned " $counti_out"_v3.3_31315_ercc.exon.SE.log"

     fi
}

depthi()
{
  /vol/agrensing/tools/samtools/samtools-1.12/samtools depth -aa $mappi_out".uniq.merged.rmdup.bam" > $mappi_out".uniq.merged.rmdup.depth"
}


snpi()
{
  gatkrun='/vol/agrensing/tools/gatk/gatk-4.2.0.0/gatk --java-options -Xmx50G'
  threading_option='--native-pair-hmm-threads 8'
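  # Workflow of this function: initial HaplotypeCaller -> extract high-quality SNPs (GetHighQualVcfs.py)
  # as known sites -> BaseRecalibrator/ApplyBQSR -> AnalyzeCovariates report -> PrintReads ->
  # HaplotypeCaller on the BAMs with and without BQSR -> hard-filter both call sets with VariantFiltration.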
  # V3
   # start the SNP detection
   $gatkrun HaplotypeCaller $threading_option -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -ploidy 1 -O $snpi_out".HaplotypeCaller.vcf" 2> $snpi_out".HaplotypeCaller.err"

   $gatkrun SelectVariants -R $reffna --variant $snpi_out".HaplotypeCaller.vcf" -select-type INDEL -O $snpi_out".SelectVariants.vcf" 2> $snpi_out".SelectVariants.err"

   # HQ SNPs
   python2.7 /vol/agrensing/fhaas/GeneAtlas/SNP/GetHighQualVcfs.py -i $snpi_out".HaplotypeCaller.vcf" -o $output_dir"/05_SNP_calling/"$species"/"$prefix"/" --ploidy 1 --percentile 90 --GQ 90 2> $snpi_out".GetHighQualVcfs.HQ.err"

#   mv $snpi_out".GetHighQualVcfs.HQ.vcf" $snpi_out".GetHighQualVcfs.V3.HQ.vcf"

   $gatkrun IndexFeatureFile -I  $snpi_out".GetHighQualVcfs.HQ.vcf" 2> $snpi_out".GetHighQualVcfs.HQ.idx.err"

   $gatkrun BaseRecalibrator -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.table" 2> $snpi_out".BaseRecalibrator.err"

   $gatkrun ApplyBQSR -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" --bqsr-recal-file $snpi_out".BaseRecalibrator.table" -O $snpi_out".BaseRecalibrator.ApplyBQSR.bam" \
	2> $snpi_out".BaseRecalibrator.ApplyBQSR.err"

   $gatkrun BaseRecalibrator -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.recal.table" \
	2> $snpi_out".BaseRecalibrator.recal.err"

   $gatkrun AnalyzeCovariates -before $snpi_out".BaseRecalibrator.table" -after $snpi_out".BaseRecalibrator.recal.table" -verbosity DEBUG -csv $snpi_out".AnalyzeCovariates.calibration-report.csv" \
     -plots $snpi_out".AnalyzeCovariates.calibration-report.QC.pdf" 2> $snpi_out".AnalyzeCovariates.calibration-report.err"

   $gatkrun PrintReads -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -O $snpi_out".PrintReads.bam" 2> $snpi_out".PrintReads.err"

   $gatkrun PrintReads -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -O $snpi_out".PrintReads.ApplyBQSR.bam" 2> $snpi_out".PrintReads.ApplyBQSR.err"

   # HaplotypeCaller and VariantFiltration w/o ApplyBQSR BAM file
   $gatkrun HaplotypeCaller $threading_option -R $reffna -I $snpi_out".PrintReads.bam" -ploidy 1 -O $snpi_out".PrintReads.HaplotypeCaller.vcf" 2> $snpi_out".PrintReads.HaplotypeCaller.err"

   $gatkrun HaplotypeCaller $threading_option -R $reffna -I $snpi_out".PrintReads.ApplyBQSR.bam" -ploidy 1 -O $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.vcf" \
	2> $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.err"

   #  filter flags ## NOTE: mind float vs. integer literals (e.g. FS must be 60.0, not 60) ##
   # --filter-expression "FS > 60" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0"
   # --filter-name "LowMQ" --filter-expression "QUAL > 30.0 && QUAL < 60.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage"
   #
   $gatkrun VariantFiltration -R $reffna --variant $snpi_out".PrintReads.HaplotypeCaller.vcf" \
     --filter-expression "FS > 60.0" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" \
     --filter-name "LowMQ" --filter-expression "QUAL > 30.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" \
     -O $snpi_out".VariantFiltration.vcf" 2> $snpi_out".VariantFiltration.err"

   $gatkrun VariantFiltration -R $reffna --variant $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.vcf" \
     --filter-expression "FS > 60.0" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" \
     --filter-name "LowMQ" --filter-expression "QUAL > 30.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" \
     -O $snpi_out".ApplyBQSR.VariantFiltration.vcf" 2> $snpi_out".ApplyBQSR.VariantFiltration.err"
}

corri()
{
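  # Recovery/correction helper: re-create sorted, indexed BAMs from the gzipped split-output SAM files,
  # re-merge the *uniq* BAMs, and re-run the BaseRecalibrator/AnalyzeCovariates report step of snpi
  # (assumes snpi already produced the *.BaseRecalibrator.table and *.GetHighQualVcfs.HQ.vcf files).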
  for IN in $mappi_out*.gz
  do
    gunzip $IN
    file=`echo $IN | awk '{print substr($1, 1, length($1)-3)}'`
    /vol/agrensing/tools/samtools/samtools-1.12/samtools view -@ $cpu -bhS $file | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G - -o $file".sort.bam"
    /vol/agrensing/tools/samtools/samtools-1.12/samtools index $file".sort.bam"
    gzip $file

  done
  /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.bam" $mappi_out*"uniq"*".sort.bam"
  /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.bam"



  gatkrun='/ceph/agrensing/tools/gatk/gatk-4.2.0.0/gatk --java-options -Xmx50G'
  threading_option='--native-pair-hmm-threads 8'
  $gatkrun BaseRecalibrator -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.recal.table" \
         2> $snpi_out".BaseRecalibrator.recal.err"

  $gatkrun AnalyzeCovariates -before $snpi_out".BaseRecalibrator.table" -after $snpi_out".BaseRecalibrator.recal.table" -verbosity DEBUG -csv $snpi_out".AnalyzeCovariates.calibration-report.csv" \
    -plots $snpi_out".AnalyzeCovariates.calibration-report.QC.pdf" 2> $snpi_out".AnalyzeCovariates.calibration-report.err"


}


# script input variables

cpu=$1
output_dir=$2
prefix=$3
species=$4
infile_path=$5

# set variables
java_ram="-Xmx50G -Xms25G"
id=$prefix

fw=$infile_path"/fw."$prefix
rv=$infile_path"/rv."$prefix
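
# Example invocation (sketch; paths and sample names are placeholders):
#   bash SNP_pipe.sh 16 /vol/agrensing/results sample01 Ppatens /vol/agrensing/reads
# This expects the input reads as $infile_path"/fw."$prefix (and, for paired-end data,
# $infile_path"/rv."$prefix), i.e. /vol/agrensing/reads/fw.sample01 and rv.sample01 here.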

db_path_V3="/vol/agrensing/fhaas/DB/Pp_core_chloro_mito_ribo_ercc_sNone/Pp_core_chloro_mito_ribo_ercc_sNone"
db_name_V3="Pp_core_chloro_mito_ribo_ercc_sNone"

reffna="/vol/agrensing/fhaas/DB/Pp_core_chloro_mito_ribo_ercc/Pp_core_chloro_mito_ribo_ercc.fna"



# outdir variables
# output=$output_dir"/"$prefix
trimmo_out=$output_dir"/02_trimmomatic/"$species"/"$prefix"/"$prefix
polyA_out=$output_dir"/03_poly-A/"$species"/"$prefix"/"$prefix
mappi_out=$output_dir"/04_genome_mapping/"$species"/"$prefix"/"$prefix
snpi_out=$output_dir"/05_SNP_calling/"$species"/"$prefix"/"$prefix
counti_out=$output_dir"/06_counts/"$species"/"$prefix"/"$prefix

# create result folder
mkdir -p $output_dir"/02_trimmomatic/"$species"/"$prefix
mkdir -p $output_dir"/03_poly-A/"$species"/"$prefix
mkdir -p $output_dir"/04_genome_mapping/"$species"/"$prefix
mkdir -p $output_dir"/05_SNP_calling/"$species"/"$prefix
mkdir -p $output_dir"/06_counts/"$species"/"$prefix

mkdir -p $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc"
mkdir -p $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc"





# print running machine and start date
hostname
date

# print all variables
printf $cpu"\n"$output_dir"\n"$prefix"\n"$species"\n"$fw"\n"

if [ -e $rv ]; then
  printf $rv"\n"
fi

printf "\'$java_ram\'""\n"$id"\n"$db_path_V3"\n"$db_name_V3"\n"

echo ""

echo "Phase 0 assamble data"
# startle

date
echo "Phase 1 run trimming"
# trimmo

date
echo "Phase 2 run polyA"
# polyA
# duple
date
echo "Phase 3 run mapping"
# mappi
#-# mappi_gDNA
# organelli
# sample
# counti
# depthi
date
echo "Phase 4 run SNP calling"
snpi
#-# corri
date