-
Saskia Hiltemann authoredSaskia Hiltemann authored
SNP_pipe.sh 22.23 KiB
#!/bin/bash
#. /etc/profile.d/modules.sh
export PATH=$PATH:/vol/biotools/bin
export MANPATH=$MANPATH:/vol/biotools/man
export PATH=$PATH:/vol/software/bin
# install important R package - lummerland cluster does not have it ...
#echo "install.packages(\"gplots\", repos=\"https://cran.rstudio.com\")" | R --vanilla --no-save
#R CMD INSTALL /vol/agrensing/tools/R_packages/bitops_1.0-6.tar.gz
#R CMD INSTALL /vol/agrensing/tools/R_packages/caTools_1.17.1.1.tar.gz
#R CMD INSTALL /vol/agrensing/tools/R_packages/gtools_3.8.1.tar.gz
#R CMD INSTALL /vol/agrensing/tools/R_packages/gdata_2.18.0.tar.gz
#R CMD INSTALL /vol/agrensing/tools/R_packages/gplots_3.0.1.tar.gz
#functions
startle ()
{
number_row_reads=`wc -l $fw | awk '{print $1/4}'`
echo "## Start log ##"
echo "Number of row reads: "$number_row_reads
}
trimmo ()
{
# Remove leading low quality or N bases (below quality 3) (LEADING:3)
# Remove trailing low quality or N bases (below quality 3) (TRAILING:3)
# Scan the read with a 4-base wide sliding window, cutting when the average quality per base drops below 15 (SLIDINGWINDOW:4:15)
# HEADCROP: Cut the specified Number of bases from the start of the read
# MINLEN was first set to 50 bp. But there are libraries with read start length of 50 bp.
if [ -e $rv ]; then
java $java_ram -jar /nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads $cpu -phred33 -summary $trimmo_out".trimmo.stats" \
$fw $rv $trimmo_out".PE.fw" $trimmo_out".SE.fw" $trimmo_out".PE.rv" $trimmo_out".SE.rv" \
ILLUMINACLIP:/nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/adapters/TruSeq-PE_all.fna:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 HEADCROP:10 MINLEN:30
# stats
echo "## Trimmo statistics ##"
printf "trim\t"$trimmo_out".PE.fw\t"; grep -c "^\+$" $trimmo_out".PE.fw"
# fastqc
/nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" $trimmo_out".PE."*
fi
if [ ! -e $rv ]; then
java $java_ram -jar /nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/trimmomatic-0.39.jar SE -threads $cpu -phred33 -summary $trimmo_out".trimmo.stats" \
$fw $trimmo_out".SE" \
ILLUMINACLIP:/nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/adapters/TruSeq-PE_all.fna:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 HEADCROP:10 MINLEN:30
# stats
echo "## Trimmo statistics ##"
# printf "trim\t"$trimmo_out".SE\t"; grep -c "^\+$" $trimmo_out".SE"
printf "trim\t"$trimmo_out".SE\t"; wc -l $trimmo_out".SE" | awk '{print $1/4}'
# fastqc
/nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" $trimmo_out".SE"
fi
}
polyA ()
{
# min_len was first set to 50 bp. But there are libraries with read start length of 50 bp.
# If you specify any statistic option, no other ouput will be generated. To preprocess data, do not specify a statistics option.
# -stats_dupl -stats_all
inpA=$polyA_out".polyA.passed"
infA=$polyA_out".polyA.failed"
if [ -e $rv ]; then
/nfs/agrensing/tools/prinseq/prinseq-lite-0.20.4/./prinseq-lite.pl -verbose -no_qual_header -fastq $trimmo_out".PE.fw" -fastq2 $trimmo_out".PE.rv" \
-trim_tail_left 5 -trim_tail_right 5 -min_len 30 -out_good $inpA -out_bad $infA \
-out_format 3 -log $polyA_out"_polyA.log"
# stats
echo "## PolyA statistics ##"
printf "polyA\t"$inpA"_1.fastq\t"; wc -l $inpA"_1.fastq" | awk '{print $1/4}'
#fastqc
/nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" $inpA"_"[12]".fastq"
fi
if [ ! -e $rv ]; then
/nfs/agrensing/tools/prinseq/prinseq-lite-0.20.4/./prinseq-lite.pl -verbose -no_qual_header -fastq $trimmo_out".SE" \
-trim_tail_left 5 -trim_tail_right 5 -min_len 30 -out_good $inpA -out_bad $infA \
-out_format 3 -log $polyA_out"_polyA.log"
# stats
echo "## PolyA statistics ##"
printf "polyA\t"$inpA".fastq\t"; grep -c "^\+$" $inpA".fastq"
#fastqc
/nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" $inpA".fastq"
fi
}
duple ()
{
# number of 100% identical duplicat reads.
echo "Duplication:"
if [ -e $rv ]; then
#SE 1
IN=$polyA_out".polyA.passed"
num_dup_se1=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_1_dupl_histogram"}}' $IN"_1.fastq" | wc -l`
printf $num_dup_se1"\t"; printf %.5f\\t "$(( 100000 * num_dup_se1 / number_row_reads ))e-5"
#SE 2
num_dup_se2=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_2_dupl_histogram"}}' $IN"_2.fastq" | wc -l`
printf $num_dup_se2"\t"; printf %.5f\\t "$(( 100000 * num_dup_se2 / number_row_reads ))e-5"
#PE
num_dup_pe=`awk -v nof=$IN 'NR % 4 == 1 || NR % 4 == 2 {if(n==1){if(foo[name]==""){c=0}; foo[name]=foo[name]$1; if(c==1){seq[foo[name]]++} else {c=1}; n=0} else if($1~/^@/){split($1,w,"/"); name=w[1]; foo[name]; n=1}} END {for(a in seq){print seq[a]; p[seq[a]]++} for(l in p){print l"\t"p[l] >> nof"_1_2_dupl_histogram"}}' $IN"_1.fastq" $IN"_2.fastq" | wc -l`
printf $num_dup_pe"\t"; printf %.5f\\n "$(( 100000 * num_dup_pe / number_row_reads ))e-5"
fi
if [ ! -e $rv ]; then
for IN in $polyA_out".polyA.passed"*;
do
num_dup_se1=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_dupl_histogram"}}' $IN | wc -l`
printf $IN"\t"$num_dup_se1"\t"; printf %.5f\\n "$(( 100000 * num_dup_se1 / number_row_reads ))e-5"
done
fi
}
mappi()
{
if [ -e $rv ]; then
/homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -N 1 -n 7 -A sam --npaths=5 \
--read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \
--failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \
$polyA_out".polyA.passed_1.fastq" $polyA_out".polyA.passed_2.fastq" 2> $mappi_out".mappi.err"
unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped.1"`
fi
if [ ! -e $rv ]; then
/ceph/agrensing/tools/gmap-gsnap/gmap-2021-03-08/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -N 1 -n 5 -A sam --npaths=5 \
--read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \
--failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \
$polyA_out".polyA.passed.fastq" 2> $mappi_out".mappi.err"
unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped"`
fi
# mapping statistics
echo "## Pp genome version 3 ##"
echo "Unmapped reads: "$unmapped_reads
}
organelli()
{
if [ -e $rv ]; then
for IN in $mappi_out".genome.concordant_"*;
do
print "Fragments:"
printf $IN"\t";
with=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Ei "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
wo=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Eiv "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
awk -v with=$with -v wo=$wo 'BEGIN {print with"\t"wo"\t"with/(wo+with)*100" %"}' ;
done
fi
if [ ! -e $rv ]; then
for IN in $mappi_out".genome"*"uniq"*;
do
print "SE reads:"
printf $IN"\t";
with=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Ei "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
wo=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Eiv "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
awk -v with=$with -v wo=$wo 'BEGIN {print with"\t"wo"\t"with/(wo+with)*100" %"}' ;
done
fi
}
mappi_gDNA()
{
if [ -e $rv ]; then
/homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -n 7 -A sam --npaths=5 \
--read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \
--failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \
$polyA_out".polyA.passed_1.fastq" $polyA_out".polyA.passed_2.fastq" 2> $mappi_out".mappi.err"
unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped.1"`
fi
if [ ! -e $rv ]; then
/homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -n 5 -A sam --npaths=5 \
--read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \
--failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \
$polyA_out".polyA.passed.fastq" 2> $mappi_out".mappi.err"
unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped"`
fi
# mapping statistics
echo "## Pp genome version 3 ##"
echo "Unmapped reads: "$unmapped_reads
}
sample ()
{
# if PE use ".genome"*[tcqg]
for IN in $mappi_out".genome"*[tcqg]
do
printf $IN"\tsort.bam\n" >&2
/vol/agrensing/tools/samtools/samtools-1.12/samtools view -@ $cpu -bhS $IN | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G - -o $IN".sort.bam"
printf $IN"\trmdup.bam\n" >&2
grep -vE "chloro|ribo|mito|ERCC" $IN | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G -n -O BAM - | \
/vol/agrensing/tools/samtools/samtools-1.12/samtools fixmate -@ $cpu -m - - | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G -O BAM - | \
/vol/agrensing/tools/samtools/samtools-1.12/samtools markdup -@ $cpu -s -r - \
$IN".filter.fixmate.sort.markdup_r.bam"
# samtools statistics
v_tmp=`awk '$1!~"@" {print $1}' $IN | sort | uniq | wc -l`
printf "Number of mapped reads: "$IN"\t"$v_tmp"\n"
# compress the sam text files
# gzip $IN
# rm sam file
rm $IN
# bam index
/vol/agrensing/tools/samtools/samtools-1.12/samtools index $IN*"_r.bam"
/vol/agrensing/tools/samtools/samtools-1.12/samtools index $IN".sort.bam"
done
ls -l $mappi_out*"bam" >&2
# merge all unique bam files
echo "merging all .uniq bam files" >&2
echo "merging all .uniq bam files"
/vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.bam" $mappi_out*"uniq"*".sort.bam"
/vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.bam"
/vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.rmdup.bam" $mappi_out*"uniq"*"_r.bam"
/vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.rmdup.bam"
}
counti()
{
if [ -e $rv ]; then
# read count v3.3 and ERCC combined
/vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \
-t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.pB.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.pB.log"
/vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \
-t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.p.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.p.log"
/vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \
-t exon -g Parent -o $counti_out"_v3.3_31315_ercc.exon.pB.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_31315_ercc.exon.pB.log"
/vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \
-t exon -g Parent -o $counti_out"_v3.3_31315_ercc.exon.p.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_31315_ercc.exon.p.log"
/vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \
-t exon -g Parent -o $counti_out"_all_v3.3_31315_ercc.exon.pB.txt" $mappi_out".uniq.merged.bam" 2> $counti_out"_all_v3.3_31315_ercc.exon.pB.log"
/vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \
-t exon -g Parent -o $counti_out"_all_v3.3_31315_ercc.exon.p.txt" $mappi_out".uniq.merged.bam" 2> $counti_out"_all_v3.3_31315_ercc.exon.p.log"
# statistics
printf "\n### FeatureCount statisitcs v3.3 ###\n"
# 32458
echo "Fragment counts on gene models exon PE 32458:"
grep "Successfully assigned fragments" $counti_out"_v3.3_32458_ercc.exon.pB.log"
echo "Fragment counts on gene models exon PE+SE 32458:"
grep "Successfully assigned fragments" $counti_out"_v3.3_32458_ercc.exon.p.log"
# 31315
echo "Fragment counts on gene models exon PE 31315:"
grep "Successfully assigned fragments" $counti_out"_v3.3_31315_ercc.exon.pB.log"
echo "Fragment counts on gene models exon PE+SE 31315:"
grep "Successfully assigned fragments" $counti_out"_v3.3_31315_ercc.exon.p.log"
# 31315 no rmdup
echo "Fragment counts on gene models exon PE 31315 (no rmdup):"
grep "Successfully assigned fragments" $counti_out"_all_v3.3_31315_ercc.exon.pB.log"
echo "Fragment counts on gene models exon PE+SE 31315 (no rmdup):"
grep "Successfully assigned fragments" $counti_out"_all_v3.3_31315_ercc.exon.p.log"
fi
if [ ! -e $rv ]; then
/vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \
-t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.SE.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.SE.log"
/vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \
-t exon -g Parent -o $counti_out"_v3.3_31315_ercc.exon.SE.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_31315_ercc.exon.SE.log"
# statistics
printf "\n### FeatureCount statisitcs v3.3 ###"
# 32458
echo "Fragment counts on gene models exon SE 32458:"
grep "Successfully assigned " $counti_out"_v3.3_32458_ercc.exon.SE.log"
# 31315
echo "Fragment counts on gene models exon SE 31315:"
grep "Successfully assigned " $counti_out"_v3.3_31315_ercc.exon.SE.log"
fi
}
depthi()
{
/vol/agrensing/tools/samtools/samtools-1.12/samtools depth -aa $mappi_out".uniq.merged.rmdup.bam" > $mappi_out".uniq.merged.rmdup.depth"
}
snpi()
{
gatkrun='/vol/agrensing/tools/gatk/gatk-4.2.0.0/gatk --java-options -Xmx50G'
threading_option='--native-pair-hmm-threads 8'
# V3
# start the SNP detection
$gatkrun HaplotypeCaller $threading_option -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -ploidy 1 -O $snpi_out".HaplotypeCaller.vcf" 2> $snpi_out".HaplotypeCaller.err"
$gatkrun SelectVariants -R $reffna --variant $snpi_out".HaplotypeCaller.vcf" -select-type INDEL -O $snpi_out".SelectVariants.vcf" 2> $snpi_out".SelectVariants.err"
# HQ SNPs
python2.7 /vol/agrensing/fhaas/GeneAtlas/SNP/GetHighQualVcfs.py -i $snpi_out".HaplotypeCaller.vcf" -o $output_dir"/05_SNP_calling/"$species"/"$prefix"/" --ploidy 1 --percentile 90 --GQ 90 2> $snpi_out".GetHighQualVcfs.HQ.err"
# mv $snpi_out".GetHighQualVcfs.HQ.vcf" $snpi_out".GetHighQualVcfs.V3.HQ.vcf"
$gatkrun IndexFeatureFile -I $snpi_out".GetHighQualVcfs.HQ.vcf" 2> $snpi_out".GetHighQualVcfs.HQ.idx.err"
$gatkrun BaseRecalibrator -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.table" 2> $snpi_out".BaseRecalibrator.err"
$gatkrun ApplyBQSR -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" --bqsr-recal-file $snpi_out".BaseRecalibrator.table" -O $snpi_out".BaseRecalibrator.ApplyBQSR.bam" \
2> $snpi_out".BaseRecalibrator.ApplyBQSR.err"
$gatkrun BaseRecalibrator -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.recal.table" \
2> $snpi_out".BaseRecalibrator.recal.err"
$gatkrun AnalyzeCovariates -before $snpi_out".BaseRecalibrator.table" -after $snpi_out".BaseRecalibrator.recal.table" -verbosity DEBUG -csv $snpi_out".AnalyzeCovariates.calibration-report.csv" \
-plots $snpi_out".AnalyzeCovariates.calibration-report.QC.pdf" 2> $snpi_out".AnalyzeCovariates.calibration-report.err"
$gatkrun PrintReads -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -O $snpi_out".PrintReads.bam" 2> $snpi_out".PrintReads.err"
$gatkrun PrintReads -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -O $snpi_out".PrintReads.ApplyBQSR.bam" 2> $snpi_out".PrintReads.ApplyBQSR.err"
# HaplotypeCaller and VariantFiltration w/o ApplyBQSR BAM file
$gatkrun HaplotypeCaller $threading_option -R $reffna -I $snpi_out".PrintReads.bam" -ploidy 1 -O $snpi_out".PrintReads.HaplotypeCaller.vcf" 2> $snpi_out".PrintReads.HaplotypeCaller.err"
$gatkrun HaplotypeCaller $threading_option -R $reffna -I $snpi_out".PrintReads.ApplyBQSR.bam" -ploidy 1 -O $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.vcf" \
2> $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.err"
# filter flags ## NOTE: it is important to look for float and integer numbers (e.g. FS must be 60.0 not 60) ##
# --filter-expression "FS > 60" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0"
# --filter-name "LowMQ" --filter-expression "QUAL > 30.0 && QUAL < 60.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage"
#
$gatkrun VariantFiltration -R $reffna --variant $snpi_out".PrintReads.HaplotypeCaller.vcf" \
--filter-expression "FS > 60.0" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" \
--filter-name "LowMQ" --filter-expression "QUAL > 30.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" \
-O $snpi_out".VariantFiltration.vcf" 2> $snpi_out".VariantFiltration.err"
$gatkrun VariantFiltration -R $reffna --variant $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.vcf" \
--filter-expression "FS > 60.0" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" \
--filter-name "LowMQ" --filter-expression "QUAL > 30.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" \
-O $snpi_out".ApplyBQSR.VariantFiltration.vcf" 2> $snpi_out".ApplyBQSR.VariantFiltration.err"
}
corri()
{
for IN in $mappi_out*.gz
do
gunzip $IN
file=`echo $IN | awk '{print substr($1, 1, length($1)-3)}'`
/vol/agrensing/tools/samtools/samtools-1.12/samtools view -@ $cpu -bhS $file | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G - -o $file".sort.bam"
/vol/agrensing/tools/samtools/samtools-1.12/samtools index $file".sort.bam"
gzip $file
done
/vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.bam" $mappi_out*"uniq"*".sort.bam"
/vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.bam"
gatkrun='/ceph/agrensing/tools/gatk/gatk-4.2.0.0/gatk --java-options -Xmx50G'
threading_option='--native-pair-hmm-threads 8'
$gatkrun BaseRecalibrator -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.recal.table" \
2> $snpi_out".BaseRecalibrator.recal.err"
$gatkrun AnalyzeCovariates -before $snpi_out".BaseRecalibrator.table" -after $snpi_out".BaseRecalibrator.recal.table" -verbosity DEBUG -csv $snpi_out".AnalyzeCovariates.calibration-report.csv" \
-plots $snpi_out".AnalyzeCovariates.calibration-report.QC.pdf" 2> $snpi_out".AnalyzeCovariates.calibration-report.err"
}
# script input variables
cpu=$1
output_dir=$2
prefix=$3
species=$4
infile_path=$5
# set variables
java_ram="-Xmx50G -Xms25G"
id=$prefix
fw=$infile_path"/fw."$prefix
rv=$infile_path"/rv."$prefix
db_path_V3="/vol/agrensing/fhaas/DB/Pp_core_chloro_mito_ribo_ercc_sNone/Pp_core_chloro_mito_ribo_ercc_sNone"
db_name_V3="Pp_core_chloro_mito_ribo_ercc_sNone"
reffna="/vol/agrensing/fhaas/DB/Pp_core_chloro_mito_ribo_ercc/Pp_core_chloro_mito_ribo_ercc.fna"
# outdir variables
# output=$output_dir"/"$prefix
trimmo_out=$output_dir"/02_trimmomatic/"$species"/"$prefix"/"$prefix
polyA_out=$output_dir"/03_poly-A/"$species"/"$prefix"/"$prefix
mappi_out=$output_dir"/04_genome_mapping/"$species"/"$prefix"/"$prefix
snpi_out=$output_dir"/05_SNP_calling/"$species"/"$prefix"/"$prefix
counti_out=$output_dir"/06_counts/"$species"/"$prefix"/"$prefix
# create result folder
mkdir -p $output_dir"/02_trimmomatic/"$species"/"$prefix
mkdir -p $output_dir"/03_poly-A/"$species"/"$prefix
mkdir -p $output_dir"/04_genome_mapping/"$species"/"$prefix
mkdir -p $output_dir"/05_SNP_calling/"$species"/"$prefix
mkdir -p $output_dir"/06_counts/"$species"/"$prefix
mkdir -p $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc"
mkdir -p $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc"
# print running machine and start date
hostname
date
# print all variables
printf $cpu"\n"$output_dir"\n"$prefix"\n"$species"\n"$fw"\n"
if [ -e $rv ]; then
printf $rv"\n"
fi
printf "\'$java_ram\'""\n"$id"\n"$db_path_V3"\n"$db_name_V3"\n"
echo ""
echo "Phase 0 assamble data"
# startle
date
echo "Phase 1 run trimming"
# trimmo
date
echo "Phase 2 run polyA"
# polyA
# duple
date
echo "Phase 3 run mapping"
# mappi
#-# mappi_gDNA
# organelli
# sample
# counti
# depthi
date
echo "Phase 4 run SNP calling"
snpi
#-# corri
date