diff --git a/assays/BSA_RNAseq/adapters/custom_adapters.fa b/assays/BSA_RNAseq/adapters/custom_adapters.fa new file mode 100644 index 0000000000000000000000000000000000000000..c6da458fd5c4e43e5c5b9758062a9062867a9e98 --- /dev/null +++ b/assays/BSA_RNAseq/adapters/custom_adapters.fa @@ -0,0 +1,24 @@ +>read1_1/1 +AGATCGGAAGAGCACACGTCTGAACTCCAGTCA +>read1_2/1 +AGATCGGAAGAGCACACGTCTGAAC +>read1_3/1 +TGGAATTCTCGGGTGCCAAGG +>read1_4/1 +AGATCGGAAGAGCACACGTCT +>read1_5/1 +CTGTCTCTTATACACATCT +>read1_6/1 +AGATGTGTATAAGAGACAG +>read2_1/2 +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>read2_2/2 +AGATCGGAAGAGCGTCGTGTAGGGA +>read2_3/2 +TGGAATTCTCGGGTGCCAAGG +>read2_4/2 +AGATCGGAAGAGCACACGTCT +>read2_5/2 +CTGTCTCTTATACACATCT +>read2_6/2 +AGATGTGTATAAGAGACAG diff --git a/assays/BSA_RNAseq/adapters/readme.txt b/assays/BSA_RNAseq/adapters/readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..a029aca99c1b4ba0432cb9989de49109ef0d2645 --- /dev/null +++ b/assays/BSA_RNAseq/adapters/readme.txt @@ -0,0 +1,3 @@ +## raw_data/BSA_rnaseq/adapters/ + +Sequencing adapters used for reads trimming diff --git a/assays/BSA_RNAseq/isa.assay.xlsx b/assays/BSA_RNAseq/isa.assay.xlsx index 2d21f88fb855a54a1f3982155f0bb99249397390..399f4b8a40fb6354edb648cda747f4dc1d558145 100644 Binary files a/assays/BSA_RNAseq/isa.assay.xlsx and b/assays/BSA_RNAseq/isa.assay.xlsx differ diff --git a/assays/BSA_WGS/isa.assay.xlsx b/assays/BSA_WGS/isa.assay.xlsx index 1397489d55497f42e3468d5e7c53118165c9f3de..2e357f7219a77f02119427594574cd6d4d116af0 100644 Binary files a/assays/BSA_WGS/isa.assay.xlsx and b/assays/BSA_WGS/isa.assay.xlsx differ diff --git a/assays/betalain_LCMS_quantification/isa.assay.xlsx b/assays/betalain_LCMS_quantification/isa.assay.xlsx index 134a053fcd23d521beea075532f59f2dd93aa81c..d361dd4b2ffae902a2be61a221938294149b6bfa 100644 Binary files a/assays/betalain_LCMS_quantification/isa.assay.xlsx and b/assays/betalain_LCMS_quantification/isa.assay.xlsx differ diff --git a/assays/betalain_photometric_quantification/isa.assay.xlsx b/assays/betalain_photometric_quantification/isa.assay.xlsx index e18f639f6add7f87f39ecc7590f4455803e5d5b1..68931c6c470e924b1bd598f466bf6f20196dc4dc 100644 Binary files a/assays/betalain_photometric_quantification/isa.assay.xlsx and b/assays/betalain_photometric_quantification/isa.assay.xlsx differ diff --git a/assays/isoform_sequencing/isa.assay.xlsx b/assays/isoform_sequencing/isa.assay.xlsx index 9719eb30c1c6f3ccd645f4189923b8d6215fce9d..ae346d235c02e0ea6c0a10e07149fcc41e5f20f7 100644 Binary files a/assays/isoform_sequencing/isa.assay.xlsx and b/assays/isoform_sequencing/isa.assay.xlsx differ diff --git a/isa.investigation.xlsx b/isa.investigation.xlsx index 8c37020a85cca1fe30c338cad3743015e7e4fa3b..586f555874be0956c7b05f5da13c824073eaa385 100644 Binary files a/isa.investigation.xlsx and b/isa.investigation.xlsx differ diff --git a/studies/BSA_parent_betalain_quant/isa.study.xlsx b/studies/BSA_parent_betalain_quant/isa.study.xlsx index ec46c0474b3ff8ea33525da7511c111656479c76..486411ac1c8c88a7585c68191536289931207512 100644 Binary files a/studies/BSA_parent_betalain_quant/isa.study.xlsx and b/studies/BSA_parent_betalain_quant/isa.study.xlsx differ diff --git a/studies/Bulk_segregant_analysis/isa.study.xlsx b/studies/Bulk_segregant_analysis/isa.study.xlsx index 43a7b33b2beb38dc6c65334db05c480f743bd916..b7b89a26b6cd86616539825c87df590f547b6e4e 100644 Binary files a/studies/Bulk_segregant_analysis/isa.study.xlsx and b/studies/Bulk_segregant_analysis/isa.study.xlsx differ diff --git a/studies/additional_data/README.md b/studies/additional_data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/studies/additional_data/isa.study.xlsx b/studies/additional_data/isa.study.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..566ca09ccb44bcd0d23eb8c1fe9b6e8b6b8de26d Binary files /dev/null and b/studies/additional_data/isa.study.xlsx differ diff --git a/studies/additional_data/protocols/.gitkeep b/studies/additional_data/protocols/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/studies/additional_data/resources/.gitkeep b/studies/additional_data/resources/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/studies/additional_data/resources/color_pathway_genes.bed b/studies/additional_data/resources/color_pathway_genes.bed new file mode 100644 index 0000000000000000000000000000000000000000..c4ee58173ccc48f2eae2fa0b2e9b3e28eae49bcc --- /dev/null +++ b/studies/additional_data/resources/color_pathway_genes.bed @@ -0,0 +1,36 @@ +chr start end strand gene_id transcript_id Gene Pathway +Scaffold_1 7523181 7527088 - AHp000674 AHp000674.1 AhCYP76AD5 Betalain +Scaffold_1 28687404 28688868 - AHp001663 AHp001663.1 AhBetanidin5GT Betalain +Scaffold_2 6806355 6807984 + AHp003152 AHp003152.1 F3-H_3 Flavonoid +Scaffold_2 6806355 6807984 + AHp003152 AHp003152.2 F3-H_3 Flavonoid +Scaffold_2 31518210 31522095 + AHp004305 AHp004305.1 CHS Flavonoid +Scaffold_2 31518210 31522095 + AHp004305 AHp004305.2 CHS Flavonoid +Scaffold_3 14652200 14653652 + AHp005940 AHp005940.1 AhBetanidin6GT Betalain +Scaffold_3 29507485 29509926 + AHp006454 AHp006454.1 F3H Flavonoid +Scaffold_4 10226509 10227946 - AHp007219 AHp007219.1 AhcDOPA5GT Betalain +Scaffold_5 9470955 9474986 - AHp008991 AHp008991.1 FLS Flavonoid +Scaffold_5 9470955 9474986 - AHp008991 AHp008991.2 FLS Flavonoid +Scaffold_5 19537497 19543306 + AHp009303 AHp009303.1 DFR Flavonoid +Scaffold_6 3461614 3469253 - AHp009962 AHp009962.1 CHI1 Flavonoid +Scaffold_6 13877397 13882497 + AHp010386 AHp010386.1 AhDODAα2 Betalain +Scaffold_8 3308349 3313908 + AHp012752 AHp012752.1 PAL_1 Flavonoid +Scaffold_8 7925628 7930892 - AHp013217 AHp013217.1 C4H_1 Flavonoid +Scaffold_8 7925628 7930892 - AHp013217 AHp013217.2 C4H_1 Flavonoid +Scaffold_9 12115220 12125730 - AHp014409 AHp014409.1 4CL_1 Flavonoid +Scaffold_10 21762620 21764924 - AHp016530 AHp016530.1 AhMYB3 Betalain +Scaffold_10 21780564 21783050 - AHp016531 AHp016531.1 AhMYB4 Betalain +Scaffold_11 16155424 16159543 + AHp017409 AHp017409.1 LAR Flavonoid +Scaffold_11 17204504 17209738 - AHp017497 AHp017497.1 F3-H_1 Flavonoid +Scaffold_14 12286837 12295353 + AHp020962 AHp020962.1 4CL_2 Flavonoid +Scaffold_15 1098208 1100359 + AHp021795 AHp021795.1 ANS Flavonoid +Scaffold_15 2886482 2891165 - AHp021980 AHp021980.1 PAL_2 Flavonoid +Scaffold_15 4193988 4197809 + AHp022120 AHp022120.1 F3-H_4 Flavonoid +Scaffold_15 4211064 4214362 + AHp022122 AHp022122.1 F3-H_2 Flavonoid +Scaffold_15 4227367 4229108 - AHp022123 AHp022123.1 F3-H_5 Flavonoid +Scaffold_15 7959672 7965298 + AHp022382 AHp022382.1 C4H_3 Flavonoid +Scaffold_15 7959672 7965298 + AHp022382 AHp022382.2 C4H_3 Flavonoid +Scaffold_15 8003999 8008333 + AHp022384 AHp022384.1 C4H_2 Flavonoid +Scaffold_16 988733 991447 + AHp022773 AHp022773.1 AhMYB2 Betalain +Scaffold_16 988733 991447 + AHp022773 AHp022773.2 AhMYB2 Betalain +Scaffold_16 5231548 5239058 - AHp023147 AHp023147.1 AhDODAα1 Betalain +Scaffold_16 5301961 5305973 + AHp023148 AHp023148.1 AhCYP76AD2 Betalain diff --git a/studies/additional_data/resources/color_pathway_genes.csv b/studies/additional_data/resources/color_pathway_genes.csv new file mode 100644 index 0000000000000000000000000000000000000000..a427fab2d4361cc316c62eb25f0080f53dcf3c69 --- /dev/null +++ b/studies/additional_data/resources/color_pathway_genes.csv @@ -0,0 +1,30 @@ +Gene,Pathway,Gene_id +PAL_1,Flavonoid,AHp012752 +PAL_2,Flavonoid,AHp021980 +C4H_1,Flavonoid,AHp013217 +C4H_2,Flavonoid,AHp022384 +C4H_3,Flavonoid,AHp022382 +4CL_1,Flavonoid,AHp014409 +4CL_2,Flavonoid,AHp020962 +CHS,Flavonoid,AHp004305 +CHI1,Flavonoid,AHp009962 +F3-H_1,Flavonoid,AHp017497 +F3-H_2,Flavonoid,AHp022122 +F3-H_3,Flavonoid,AHp003152 +F3-H_4,Flavonoid,AHp022120 +F3-H_5,Flavonoid,AHp022123 +DFR,Flavonoid,AHp009303 +F3H,Flavonoid,AHp006454 +FLS,Flavonoid,AHp008991 +LAR,Flavonoid,AHp017409 +ANS,Flavonoid,AHp021795 +AhCYP76AD2,Betalain,AHp023148 +AhCYP76AD5,Betalain,AHp000674 +AhDODAα2,Betalain,AHp010386 +AhDODAα1,Betalain,AHp023147 +AhcDOPA5GT,Betalain,AHp007219 +AhBetanidin5GT,Betalain,AHp001663 +AhBetanidin6GT,Betalain,AHp005940 +AhMYB2,Betalain,AHp022773 +AhMYB3,Betalain,AHp016530 +AhMYB4,Betalain,AHp016531 diff --git a/studies/isoform_sequencing/isa.study.xlsx b/studies/isoform_sequencing/isa.study.xlsx index 60867b92cbe45252333f78b49c485bd39a3e02b2..48af51fa63c14650219e5e214e849d894b1120dd 100644 Binary files a/studies/isoform_sequencing/isa.study.xlsx and b/studies/isoform_sequencing/isa.study.xlsx differ diff --git a/workflows/BSA/RNAseq/QC.sh b/workflows/BSA/RNAseq/QC.sh index 64db67753b630dbf68ec8969414684f3d63565bf..947a4ff755356b5808130b9f984c477c0759599a 100644 --- a/workflows/BSA/RNAseq/QC.sh +++ b/workflows/BSA/RNAseq/QC.sh @@ -22,8 +22,8 @@ GTFIN=polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_correc # Quality control # Initialize the output directory -QCIN=raw_data/BSA_rnaseq/ -QCOUT=raw_data/BSA_rnaseq/fastqc +QCIN=assays/BSA_RNAseq/dataset +QCOUT=assays/BSA_RNAseq/fastqc mkdir -p $QCOUT # run quality control fastqc @@ -32,14 +32,14 @@ fastqc -t 10 -o $QCOUT "$QCIN"*P.fq.gz # Quality control # Initialize the output directory -QCIN=raw_data/BSA_rnaseq/ -QCOUT=raw_data/BSA_rnaseq/fastqc +QCIN=assays/BSA_RNAseq/ +QCOUT=assays/BSA_RNAseq/fastqc mkdir -p $QCOUT # run quality control fastqc fastqc -t 10 -o $QCOUT "$QCIN"*P.fq.gz -cp -r $QCOUT data/BSA/RNAseq/STAR_flower_mappings/QC/ +cp -r $QCOUT runs/BSA/RNAseq/STAR_flower_mappings/QC/ module load samtools/1.13 @@ -53,11 +53,11 @@ GTFQM=/scratch/twinkle1/temp.gtf sed 's/CDS/exon/' $GTFIN > $GTFQM # define input files -QMIN=data/BSA/RNAseq/STAR_flower_mappings/ +QMIN=runs/BSA/RNAseq/STAR_flower_mappings/ # define output directory -QMOUT=data/BSA/RNAseq/STAR_flower_mappings/QC/qualimap +QMOUT=runs/BSA/RNAseq/STAR_flower_mappings/QC/qualimap # create main output directory mkdir -p $QMOUT @@ -85,8 +85,8 @@ done rm $GTFQM # run multiqc to combine the results from fastqc and qualimap into a single report -MULTIQCOUT=data/BSA/RNAseq/STAR_flower_mappings/multiqc -MULTIQCIN=data/BSA/RNAseq/STAR_flower_mappings/QC/ +MULTIQCOUT=runs/BSA/RNAseq/STAR_flower_mappings/multiqc +MULTIQCIN=runs/BSA/RNAseq/STAR_flower_mappings/QC/ mkdir -p $MULTIQCOUT diff --git a/workflows/BSA/RNAseq/adapter_trimming.sh b/workflows/BSA/RNAseq/adapter_trimming.sh index 15d5ab7bef2f182a0afe15ae0caadef289200a67..2821fec969a558c3c2ec65cc964789464e4e58d4 100644 --- a/workflows/BSA/RNAseq/adapter_trimming.sh +++ b/workflows/BSA/RNAseq/adapter_trimming.sh @@ -20,7 +20,7 @@ module load trimmomatic/0.39 # run this part as array job # create array of read fastq files (R1 only): -SOURCE_DIR=raw_data/BSA_rnaseq/ +SOURCE_DIR=assays/BSA_RNAseq/dataset/ FILES=("$SOURCE_DIR"/*R1.fastq.gz) # run trimmomatic, use 6 threads, taking advantage of the baseout function to name output files @@ -32,4 +32,4 @@ java -jar $TRIMMOMATIC/trimmomatic.jar PE \ "${FILES["${SLURM_ARRAY_TASK_ID}"]}" \ "${FILES["${SLURM_ARRAY_TASK_ID}"]/R1.fastq.gz/R2.fastq.gz}" \ -baseout "${FILES["${SLURM_ARRAY_TASK_ID}"]/R1.fastq.gz/trimmed.fq.gz}" \ - ILLUMINACLIP:raw_data/BSA_rnaseq/adapters/custom_adapters.fa:2:30:10 + ILLUMINACLIP:assays/BSA_RNAseq/adapters/custom_adapters.fa:2:30:10 diff --git a/workflows/BSA/RNAseq/index_STAR.sh b/workflows/BSA/RNAseq/index_STAR.sh index cb8606a419e341318008238f456a836ae017649b..6d12c7f12c3562cabf82f0d9e93ac0eb977eab97 100644 --- a/workflows/BSA/RNAseq/index_STAR.sh +++ b/workflows/BSA/RNAseq/index_STAR.sh @@ -18,10 +18,10 @@ module load star/2.7.8a # more specific settings: use the polished, softmasked reference assembly # sjdbOverhang dependend on input read length # as SJDB file, use the newly generated braker2 protein gtf file -REFGENOME=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta -SJDBFILE=polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf +REFGENOME=runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta +SJDBFILE=runs/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf -mkdir -p data/BSA/RNAseq/STAR_flower_index +mkdir -p runs/BSA/RNAseq/STAR_flower_index STAR --runThreadN 8 \ --runMode genomeGenerate \ diff --git a/workflows/BSA/RNAseq/run_STAR.sh b/workflows/BSA/RNAseq/run_STAR.sh index 1064306650a392d552fdf51ac9ccde1a1fc2c725..099d4e6f420777b3c9f843d55f9bc292f558e0c4 100644 --- a/workflows/BSA/RNAseq/run_STAR.sh +++ b/workflows/BSA/RNAseq/run_STAR.sh @@ -12,9 +12,9 @@ module load star/2.7.8a # create array of read fastq files (R1 only): -SOURCE_DIR=raw_data/BSA_rnaseq/ +SOURCE_DIR=assays/BSA_RNAseq/dataset/ FILES=("$SOURCE_DIR"/*_1P.fq.gz) -OUTPUTDIR=data/BSA/RNAseq/STAR_flower_mappings +OUTPUTDIR=runs/BSA/RNAseq/STAR_flower_mappings # change directory of outprefix OUTPREFIX1=("${FILES["${SLURM_ARRAY_TASK_ID}"]/$SOURCE_DIR/$OUTPUTDIR}") diff --git a/workflows/BSA/RNAseq/run_kallisto.sh b/workflows/BSA/RNAseq/run_kallisto.sh index ae124fb57e4ca3d234851d6c12be5104ed6cde41..88243f01138aef81632ddb4975627402e2a94826 100644 --- a/workflows/BSA/RNAseq/run_kallisto.sh +++ b/workflows/BSA/RNAseq/run_kallisto.sh @@ -23,9 +23,9 @@ ##### Perform quantification # create array of read fastq files (R1 only): -SOURCE_DIR=raw_data/flower_color_mapping/ +SOURCE_DIR=assays/BSA_RNAseq/dataset/flower_color_mapping/ FILES=("$SOURCE_DIR"AM*_1P.fq.gz) -OUTDIR=data/flower_color_mapping/kallisto_quant/ +OUTDIR=runs/flower_color_mapping/kallisto_quant/ TISSUE_NAMES=("AM_00331_gf" "AM_00331_rf" "AM_00332_gf" "AM_00332_rf") mkdir -p $OUTDIR diff --git a/workflows/BSA/WGS/combined_filter.sh b/workflows/BSA/WGS/combined_filter.sh index 707a9bd4b68212b56b80b324d637e9a69f9c326b..78e6a1718cb1f58098fd970d689df38466db7bc7 100644 --- a/workflows/BSA/WGS/combined_filter.sh +++ b/workflows/BSA/WGS/combined_filter.sh @@ -12,13 +12,13 @@ module load samtools/1.13 -REFERENCE=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta +REFERENCE=runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta PROVIDER=CCG -OUTDIR=data/BSA/wgs/vcf +OUTDIR=runs/BSA/wgs/vcf mkdir -p $OUTDIR -ALLSAMP=$(for i in data/BSA/wgs/bam_files/gvcf/AM_00*.vcf; do echo -V $i;done) +ALLSAMP=$(for i in runs/BSA/wgs/bam_files/gvcf/AM_00*.vcf; do echo -V $i;done) $MYUTIL/tools/gatk-4.1.7.0/gatk --java-options "-Xmx48G" \ CombineGVCFs \ diff --git a/workflows/BSA/WGS/map_reads.sh b/workflows/BSA/WGS/map_reads.sh index ad2bcf23142c774d731e1471b5639fca1422593d..3a1d2a877b9082e1f8b2906e8527196ac9349aed 100644 --- a/workflows/BSA/WGS/map_reads.sh +++ b/workflows/BSA/WGS/map_reads.sh @@ -17,15 +17,15 @@ module load bwamem2/2.0_gnu module load samtools/1.13 -REFERENCE=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta +REFERENCE=runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta #bwa-mem2 index $REFERENCE PROVIDER=CCG -INPUTPATH=raw_data/BSA_wgs/ # -OUTPUTPATH=data/BSA/wgs/bam_files/ # Change this for different generations +INPUTPATH=assays/BSA_WGS/dataset # +OUTPUTPATH=runs/BSA/wgs/bam_files/ # Change this for different generations mkdir -p $OUTPUTPATH mkdir -p ${OUTPUTPATH}/metrics/ diff --git a/workflows/BSA/betalain_quantification.Rmd b/workflows/BSA/betalain_quantification.Rmd index 8ab62650b42d6d117839495a3a60751e804a7235..31a9b69c32f1ca1ec6726be542ae9c0e1d5c1223 100644 --- a/workflows/BSA/betalain_quantification.Rmd +++ b/workflows/BSA/betalain_quantification.Rmd @@ -20,7 +20,7 @@ library(ggbeeswarm) Create output directory for plots ```{bash} -mkdir ../../plots/betalain_quantification +mkdir ../../runs/betalain_quantification ``` @@ -28,7 +28,7 @@ Read in measured data ```{r} # read in data -betalain_quantification <- read_excel(path = "../../raw_data/betalain_quantification/Betalain_quantification_summary.xlsx", +betalain_quantification <- read_excel(path = "../../assays/betalain_photometric_quantification/dataset/Betalain_quantification_summary.xlsx", sheet = "Photometric_quantification") # normalize to the same input weight @@ -219,11 +219,11 @@ p_leaf <- ggplot(data = melted_quant %>% filter(tissue == "leaf")) + yend = mean_content), linewidth = 0.9, color = "black") + - geom_text(data = leaf_bc_groups, + geom_text(data = leaf_bc_groups, aes(x = as.numeric(factors) - 0.19, y = 1.15, - label = groups), - size=8, + label = groups), + size=8, inherit.aes = F, color = "red3") + scale_color_manual(values = c( "red3","#F0B327"), labels = c("Betacyanins", "Betaxanthins")) + @@ -246,8 +246,8 @@ p_leaf # plot betalain quantification from flower p_flower <- ggplot(data = melted_quant %>% filter(tissue == "flower")) + - # geom_boxplot(aes(x = accession, - # y = content, + # geom_boxplot(aes(x = accession, + # y = content, # fill = metabolite), # color = "black", # #color = "red4", @@ -282,18 +282,18 @@ p_flower <- ggplot(data = melted_quant %>% filter(tissue == "flower")) + yend = mean_content), linewidth = 0.9, color = "black") + - geom_text(data = flower_bc_groups, + geom_text(data = flower_bc_groups, aes(x = as.numeric(factors) - 0.19, y = 1.15, - label = groups), - size=8, + label = groups), + size=8, inherit.aes = F, color = "red3") + - geom_text(data = flower_bx_groups, + geom_text(data = flower_bx_groups, aes(x = as.numeric(factors) + 0.19, y = 1.15, - label = groups), - size=8, + label = groups), + size=8, inherit.aes = F, color = "#F0B327") + scale_fill_manual(values = c( "red3","#F0B327"), labels = c("Betacyanins", "Betaxanthins")) + @@ -322,7 +322,7 @@ patchplot_betalains <- p_flower / p_leaf + patchplot_betalains -ggsave(filename = "../../plots/betalain_quantification/betalain_quant_photometric_content.png", +ggsave(filename = "../../runs/betalain_quantification/betalain_quant_photometric_content.png", plot = patchplot_betalains, bg = "white", dpi = 450, @@ -335,7 +335,7 @@ HPLC quantification ```{r} # hplc quantification -hplc_quantification <- read_excel(path = "../../raw_data/betalain_quantification/Betalain_quantification_summary.xlsx", +hplc_quantification <- read_excel(path = "../../assays/betalain_photometric_quantification/dataset/Betalain_quantification_summary.xlsx", sheet = "LC-MS_quantification") # get blank measurements @@ -447,14 +447,14 @@ Plot results amaranthin ```{r} # plot pa_leaf <- ggplot(data = hplc_quant_normalised %>% filter(tissue == "leaf")) + - # geom_point(aes(x = accession, - # y = Amaranthin_ratio, + # geom_point(aes(x = accession, + # y = Amaranthin_ratio, # group = uniq_ind), # color = "red3", # position = position_dodge(width = 0.75), # size = 2.8) + - geom_beeswarm(aes(x = accession, - y = Amaranthin_ratio, + geom_beeswarm(aes(x = accession, + y = Amaranthin_ratio, group = uniq_ind), color = "red3", cex = 1.5, @@ -489,14 +489,14 @@ pa_leaf <- ggplot(data = hplc_quant_normalised %>% filter(tissue == "leaf")) + pa_flower <- ggplot(data = hplc_quant_normalised %>% filter(tissue == "flower")) + - # geom_point(aes(x = accession, - # y = Amaranthin_ratio, + # geom_point(aes(x = accession, + # y = Amaranthin_ratio, # group = uniq_ind), # color = "red3", # position = position_dodge(width = 0.75), # size = 2.8) + - geom_beeswarm(aes(x = accession, - y = Amaranthin_ratio, + geom_beeswarm(aes(x = accession, + y = Amaranthin_ratio, group = uniq_ind), color = "red3", cex = 1.5, @@ -536,8 +536,8 @@ Plot results betanin ```{r} # plot pb_leaf <- ggplot(data = hplc_quant_normalised %>% filter(tissue == "leaf")) + - geom_beeswarm(aes(x = accession, - y = Betanin_ratio, + geom_beeswarm(aes(x = accession, + y = Betanin_ratio, group = uniq_ind), color = "red3", cex = 1.5, @@ -572,8 +572,8 @@ pb_leaf <- ggplot(data = hplc_quant_normalised %>% filter(tissue == "leaf")) + pb_flower <- ggplot(data = hplc_quant_normalised %>% filter(tissue == "flower")) + - geom_beeswarm(aes(x = accession, - y = Betanin_ratio, + geom_beeswarm(aes(x = accession, + y = Betanin_ratio, group = uniq_ind), color = "red3", cex = 1.5, @@ -612,8 +612,8 @@ Plot results betalamic acid ```{r} # plot pba_leaf <- ggplot(data = hplc_quant_normalised %>% filter(tissue == "leaf")) + - geom_beeswarm(aes(x = accession, - y = Betalamic_acid_ratio, + geom_beeswarm(aes(x = accession, + y = Betalamic_acid_ratio, group = uniq_ind), color = "grey40", cex = 1.5, @@ -648,8 +648,8 @@ pba_leaf <- ggplot(data = hplc_quant_normalised %>% filter(tissue == "leaf")) + pba_flower <- ggplot(data = hplc_quant_normalised %>% filter(tissue == "flower")) + - geom_beeswarm(aes(x = accession, - y = Betalamic_acid_ratio, + geom_beeswarm(aes(x = accession, + y = Betalamic_acid_ratio, group = uniq_ind), color = "grey40", cex = 1.5, @@ -688,8 +688,8 @@ Plot results vulgaxanthin ```{r} # plot pv_leaf <- ggplot(data = hplc_quant_normalised %>% filter(tissue == "leaf")) + - geom_beeswarm(aes(x = accession, - y = Vulgaxanthin_IV_ratio, + geom_beeswarm(aes(x = accession, + y = Vulgaxanthin_IV_ratio, group = uniq_ind), color = "#F0B327", method = "swarm", @@ -718,8 +718,8 @@ pv_leaf <- ggplot(data = hplc_quant_normalised %>% filter(tissue == "leaf")) + pv_flower <- ggplot(data = hplc_quant_normalised %>% filter(tissue == "flower")) + - geom_beeswarm(aes(x = accession, - y = Vulgaxanthin_IV_ratio, + geom_beeswarm(aes(x = accession, + y = Vulgaxanthin_IV_ratio, group = uniq_ind), color = "#F0B327", method = "swarm", @@ -762,7 +762,7 @@ overview_patchplot <- p_leaf + p_flower + pa_leaf + pa_flower + pb_leaf + pb_flo legend.direction = "vertical", plot.tag = element_text(vjust = 2, size = 17, face = "bold")) -ggsave(filename = "../../plots/betalain_quantification/overview_betalain_quantification.png", +ggsave(filename = "../../runs/betalain_quantification/overview_betalain_quantification.png", plot = overview_patchplot, bg = "white", dpi = 450, @@ -785,7 +785,7 @@ photo_hplc_patchplot <- plot_spacer() / (p_leaf + p_flower) / (pa_leaf + pa_flow -ggsave(filename = "../../plots/betalain_quantification/quantification_without_picture.png", +ggsave(filename = "../../runs/betalain_quantification/quantification_without_picture.png", plot = photo_hplc_patchplot, bg = "white", dpi = 450, @@ -798,7 +798,7 @@ Analyse HPLC results from amaranth roots: ```{r} # hplc quantification -root_quant <- read_excel(path = "../../raw_data/betalain_quantification/Betalain_quantification_summary.xlsx", +root_quant <- read_excel(path = "../../assays/betalain_photometric_quantification/dataset/Betalain_quantification_summary.xlsx", sheet = "transgenic_roots_LC-MS_quantification") root_quant <- root_quant %>% mutate(sample_id = paste0("plate_", plate, "_", root_type)) %>% @@ -834,7 +834,7 @@ root_quant_plot <- ggplot(data = root_quant_norm) + legend.position = "none") root_quant_plot -ggsave(filename = "../../plots/betalain_quantification/root_quantification.png", +ggsave(filename = "../../runs/betalain_quantification/root_quantification.png", plot = root_quant_plot, bg = "white", dpi = 450, @@ -842,5 +842,3 @@ ggsave(filename = "../../plots/betalain_quantification/root_quantification.png", height = 5) ``` - - diff --git a/workflows/BSA/bsa_and_plotting.R b/workflows/BSA/bsa_and_plotting.R index 4f9f2667b3ec478e80153811738a96efea580f7c..ed0b831766c4cf6e4baa13a866298f91a75f3d4e 100644 --- a/workflows/BSA/bsa_and_plotting.R +++ b/workflows/BSA/bsa_and_plotting.R @@ -1,5 +1,5 @@ -setwd("/home/tom/Documents/projects/Ahyp_v2_2_publication/") +setwd("/home/tom/Documents/ARC_projects/betalain_regulation_amaranth/") library(QTLseqr) library(tidyverse) @@ -16,7 +16,7 @@ bsa_analysis <- function(rawData,HighBulk,LowBulk,Chroms,nhigh,nlow){ highBulk = HighBulk, lowBulk = LowBulk, chromList = Chroms) - + df <- df %>% select(CHROM, POS, REF, @@ -38,7 +38,7 @@ bsa_analysis <- function(rawData,HighBulk,LowBulk,Chroms,nhigh,nlow){ mutate(CHROM=as.factor(as.numeric(gsub("Scaffold_","",CHROM))), POS=as.numeric(POS)) %>% filter(REF!='*',ALT!='*') - + df_filt <-filterSNPs(SNPset = df, refAlleleFreq = 0.2, minTotalDepth = 50, @@ -46,7 +46,7 @@ bsa_analysis <- function(rawData,HighBulk,LowBulk,Chroms,nhigh,nlow){ minSampleDepth = 20, minGQ = 99, verbose = TRUE) - + df_filt <- runGprimeAnalysis( SNPset = df_filt, windowSize = 2e6, @@ -64,14 +64,14 @@ bsa_analysis <- function(rawData,HighBulk,LowBulk,Chroms,nhigh,nlow){ return(df_filt) } -AM_00332_leaf_green_red <- bsa_analysis(rawData = 'data/BSA/wgs/vcf/bulk_snps05.table', +AM_00332_leaf_green_red <- bsa_analysis(rawData = 'runs/BSA/wgs/vcf/bulk_snps05.table', HighBulk = "AM_00332_gl", LowBulk = "AM_00332_rl", Chroms = paste0(rep("Scaffold_", 16),1:16), nhigh=80, nlow=80) -AM_00331_flower_red_green <- bsa_analysis(rawData = 'data/BSA/wgs/vcf/bulk_snps05.table', +AM_00331_flower_red_green <- bsa_analysis(rawData = 'runs/BSA/wgs/vcf/bulk_snps05.table', HighBulk = "AM_00331_rf", LowBulk = "AM_00331_gf", Chroms = paste0(rep("Scaffold_", @@ -83,15 +83,15 @@ AM_00331_flower_red_green <- bsa_analysis(rawData = 'data/BSA/wgs/vcf/bulk_snps0 # plot all results # leaf plotGresults <- function(Gresults,betalain_genes){ - qval <- Gresults %>% - filter(qvalue<=0.01) + qval <- Gresults %>% + filter(qvalue<=0.01) #qval <- min(qval$Gprime) qval <- 3 - - + + mG <- Gresults %>% filter(Gprime==max(Gresults$Gprime)) - + p1 <- ggplot()+ geom_line(data=Gresults,aes(POS/1e6,Gprime), size=2) + labs(x= 'Position (Mb)',y= "G' value") + @@ -99,7 +99,7 @@ plotGresults <- function(Gresults,betalain_genes){ geom_hline(data=data.frame(yint=qval), aes(yintercept =yint, linetype ='dashed', - color=alpha('red',0.6)), + color=alpha('red',0.6)), size=1.7)+ facet_grid(.~CHROM, space = 'free_x',scales='free_x') + theme(panel.spacing.x=unit(0.25, "lines")) + @@ -117,23 +117,23 @@ plotGresults <- function(Gresults,betalain_genes){ axis.ticks = element_line(linewidth = 1.5), axis.ticks.length = unit(.25, "cm")) + scale_x_continuous(guide = guide_axis(check.overlap = T)) - #geom_gene_arrow(data=betalain_genes, + #geom_gene_arrow(data=betalain_genes, # aes(xmin = start/1e6, xmax = end/1e6, y = max(Gresults$Gprime), fill = type)) - + #ggsave(outfile,p1,width = 18,height = 7,,bg='white') return(p1) } # flower plotGresults1 <- function(Gresults,betalain_genes){ - qval <- Gresults %>% - filter(qvalue<=0.01) + qval <- Gresults %>% + filter(qvalue<=0.01) #qval <- min(qval$Gprime) qval <- 3 - - + + mG <- Gresults %>% filter(Gprime==max(Gresults$Gprime)) - + p1 <- ggplot()+ geom_line(data=Gresults,aes(POS/1e6,Gprime), size=2) + labs(x= 'Position (Mb)',y= "G' value") + @@ -142,7 +142,7 @@ plotGresults1 <- function(Gresults,betalain_genes){ geom_hline(data=data.frame(yint=qval), aes(yintercept =yint, linetype ='dashed', - color=alpha('red',0.6)), + color=alpha('red',0.6)), size=1.7)+ facet_grid(.~CHROM,space = 'free_x',scales='free_x') + theme(panel.spacing.x=unit(0.25, "lines")) + @@ -158,9 +158,9 @@ plotGresults1 <- function(Gresults,betalain_genes){ axis.ticks = element_line(linewidth = 1.5), axis.ticks.length = unit(.25, "cm")) + scale_x_continuous(guide = guide_axis(check.overlap = T)) - #geom_gene_arrow(data=betalain_genes, + #geom_gene_arrow(data=betalain_genes, # aes(xmin = start/1e6, xmax = end/1e6, y = max(Gresults$Gprime), fill = type)) - + #ggsave(outfile,p1,width = 18,height = 7,,bg='white') return(p1) } @@ -168,23 +168,23 @@ plotGresults1 <- function(Gresults,betalain_genes){ # plot individual chromosomes # flower plotGqtl <- function(Gresults,chr,genes){ - - qval <- Gresults %>% - filter(qvalue<=0.01) + + qval <- Gresults %>% + filter(qvalue<=0.01) #qval <- min(qval$Gprime) qval <- 3 my_qtl <- getQTLTable(SNPset = Gresults, alpha = 0.01,export = F) - + ggplot()+ geom_line(data=filter(Gresults,CHROM==chr),aes(POS/1e6,Gprime),size=2) + labs(x= 'Position (Mb)',y= "G' value") + scale_x_continuous(breaks = c(0,10,20,30))+ geom_hline(data=data.frame(yint=qval), - aes(yintercept = yint, - linetype ='dashed', + aes(yintercept = yint, + linetype ='dashed', color=alpha('red',0.6)), size = 2) + - facet_grid(.~CHROM,space = 'free_x',scales='free_x') + + facet_grid(.~CHROM,space = 'free_x',scales='free_x') + theme(panel.spacing.x=unit(0.25, "lines")) + ylim(0,10) + theme( strip.background = element_rect(fill = alpha('lightblue',0.2)), @@ -199,19 +199,19 @@ plotGqtl <- function(Gresults,chr,genes){ axis.line = element_line(linewidth = 2), axis.ticks = element_line(linewidth = 1.5), axis.ticks.length = unit(.25, "cm")) + - geom_gene_arrow(data=filter(genes, CHROM==chr, type == "transcript") %>% droplevels(), + geom_gene_arrow(data=filter(genes, CHROM==chr, type == "transcript") %>% droplevels(), aes(xmin = start/1e6, xmax = end/1e6, y = 9.5, color = attributes), size=6) + scale_color_manual(values = c(alpha('red',0.6), "black", "black","grey","grey")) } # leaf plotGqtl1 <- function(Gresults,chr,genes){ - - qval <- Gresults %>% - filter(qvalue<=0.01) + + qval <- Gresults %>% + filter(qvalue<=0.01) #qval <- min(qval$Gprime) qval <- 3 my_qtl <- getQTLTable(SNPset = Gresults, alpha = 0.01,export = F) - + p1 <- ggplot() + geom_line(data=filter(Gresults,CHROM==chr),aes(POS/1e6,Gprime),size=2) + labs(x= 'Position (Mb)',y= "G' value") + @@ -239,22 +239,22 @@ plotGqtl1 <- function(Gresults,chr,genes){ axis.line = element_line(linewidth = 2), axis.ticks = element_line(linewidth = 1.5), axis.ticks.length = unit(.25, "cm")) - - + + p2 <- ggplot() + - geom_gene_arrow(data=filter(genes, - CHROM==chr, + geom_gene_arrow(data=filter(genes, + CHROM==chr, type == "transcript", attributes == "ID=AHp023147.1;geneID=AHp023147" | attributes == "ID=AHp023148.1;geneID=AHp023148") %>% droplevels(), - aes(xmin = start, - xmax = end, - y = "chr16", - fill = attributes, + aes(xmin = start, + xmax = end, + y = "chr16", + fill = attributes, forward = c(F,T)), size = 1.5, color = "black", - arrowhead_height = unit(12, "mm"), - arrowhead_width = unit(6, "mm"), + arrowhead_height = unit(12, "mm"), + arrowhead_width = unit(6, "mm"), arrow_body_height = grid::unit(6, "mm")) + geom_text(aes(x = c(5246000,5290000), y = "chr16", @@ -268,7 +268,7 @@ plotGqtl1 <- function(Gresults,chr,genes){ axis.line = element_line(linewidth = 2), axis.ticks = element_line(linewidth = 1.5), axis.text.x = element_text(size=20), - panel.grid.major.y = ggplot2::element_line(colour = "grey", + panel.grid.major.y = ggplot2::element_line(colour = "grey", linewidth = 1), #axis.title.x = element_text(size=40), axis.ticks.length = unit(.25, "cm"), @@ -278,12 +278,12 @@ plotGqtl1 <- function(Gresults,chr,genes){ axis.ticks.y = element_blank(), axis.text.y = element_blank()) + scale_fill_manual(values = c("chocolate2","cyan3",'red')) - + # combine plots: out <- plot_grid(p2,p1, nrow = 2, rel_heights = c(0.3,0.7)) - + out } @@ -348,15 +348,9 @@ MYB_plot <- plot_grid(cowplot_flower, pathway_plot, alignment_plot, labels = c("", "C", "D"), label_size = 34) -ggsave(filename = "plots/paper_myb_combined_alignment.png", +ggsave(filename = "runs/plots/paper_myb_combined_alignment.png", plot = MYB_plot, dpi = 400, width = 25, height = 25, bg = "white") - - - - - - diff --git a/workflows/BSA/read_count_analysis.Rmd b/workflows/BSA/read_count_analysis.Rmd index 4154c86c51ede7b0b6f006ccf5b84574c203946f..f7c97bb2a060eac383eb12231e19872e8ecd811d 100644 --- a/workflows/BSA/read_count_analysis.Rmd +++ b/workflows/BSA/read_count_analysis.Rmd @@ -11,14 +11,14 @@ library(tidyverse) library(DESeq2) library(factoextra) library(patchwork) -knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publication/") +knitr::opts_knit$set(root.dir = "/home/tom/Documents/ARC_projects/betalain_regulation_amaranth/") ``` ```{r} ########################## Create a function to generate plots for all betalain pathway genes # load object with names of all betalain and flavonoid genes -pathway_genes <- read.csv(file = "data/manual_sheets/color_pathway_genes.csv", header=T) +pathway_genes <- read.csv(file = "studies/additional_data/resources/color_pathway_genes.csv", header=T) colnames(pathway_genes) <- c("pathway_gene", "pathway", "gene_id") betalain.genes <- pathway_genes %>% filter(pathway == "Betalain") @@ -32,13 +32,13 @@ Transcript level gene expression quantification from kallisto: ```{r} # vector of input directories -sample_names <- dir(path = "data/flower_color_mapping/kallisto_quant/") +sample_names <- dir(path = "runs/flower_color_mapping/kallisto_quant/") # read in tables kallisto_quant <- c() for (i in 1:length(sample_names)){ - x <- read_table(file = paste0("data/flower_color_mapping/kallisto_quant/", + x <- read_table(file = paste0("runs/flower_color_mapping/kallisto_quant/", sample_names[i], "/abundance.tsv")) # set column names and keep relevant columns @@ -126,7 +126,7 @@ patchplot <- betalain_plots[[8]] + betalain_plots[[9]] + betalain_plots[[11]] + theme(plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm"), plot.tag = element_text(size = 35)) -ggsave(filename = "plots/flower_mapping_expression/betalain_gene_kallisto.png", +ggsave(filename = "runs/plots/flower_mapping_expression/betalain_gene_kallisto.png", width = 28, height = 20) @@ -182,7 +182,7 @@ betalain_plots <- plot_betalain_counts(gene_ID_list = betalain_quant$transcript_ # save all plots for (i in 1:length(betalain_plots)){ - ggsave(filename = paste0("plots/flower_mapping_expression/", gene_names[i], ".png"), + ggsave(filename = paste0("runs/plots/flower_mapping_expression/", gene_names[i], ".png"), plot = betalain_plots[[i]], height = 6, width = 8) @@ -219,7 +219,7 @@ ggplot(data=regulator_quant) + legend.position = "bottom", legend.direction = "vertical") -ggsave(filename = "plots/flower_mapping_expression/flower_expression_grid.png", +ggsave(filename = "runs/plots/flower_mapping_expression/flower_expression_grid.png", width = 4, height = 8, dpi = 500) @@ -289,7 +289,7 @@ ggplot(data=flavonoid_grid) + legend.position = "right", legend.direction = "vertical") -ggsave(filename = "plots/flower_mapping_expression/flavonoid_expression_grid.png", +ggsave(filename = "runs/plots/flower_mapping_expression/flavonoid_expression_grid.png", width = 8, height = 8, dpi = 500) diff --git a/workflows/BSA/read_count_analysis_from_bam.Rmd b/workflows/BSA/read_count_analysis_from_bam.Rmd index a3d44236380fe6efe5f4547dc8fb470af44d4dff..f442e2299bfd4934b573a95800a86283e27c16ef 100644 --- a/workflows/BSA/read_count_analysis_from_bam.Rmd +++ b/workflows/BSA/read_count_analysis_from_bam.Rmd @@ -11,40 +11,40 @@ library(tidyverse) library(chromstaR) library(ggtranscript) library(RColorBrewer) -knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2/") +knitr::opts_knit$set(root.dir = "/home/tom/Documents/ARC_projects/betalain_regulation_amaranth/") ``` Extract the sequencing reads from the bam files which overlap both the two non-synonymous SNPs and the stop-gained SNP. Samtools view can be used to extract reads covering a particular position (and their pairs). Non-primary alignments can be discarded. Reads should cover the position of the stop-gained variant (Scaffold 16, 5305851, C->T) and the position of the right non-synonymous variant (Scaffold 16, 5305727, A->T). ```{bash} -mkdir -p data/BSA/RNAseq/phased_reads/ +mkdir -p runs/BSA/RNAseq/phased_reads/ # green flower # index and extract everything overlapping the right non-synonymous variant position -samtools index data/BSA/RNAseq/STAR_flower_mappings/AM_00332_gf_Aligned.sortedByCoord.out.bam -samtools view -b -h -F 256 -P data/BSA/RNAseq/STAR_flower_mappings/AM_00332_gf_Aligned.sortedByCoord.out.bam Scaffold_16:5305727-5305727 > data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_mismatch.bam +samtools index runs/BSA/RNAseq/STAR_flower_mappings/AM_00332_gf_Aligned.sortedByCoord.out.bam +samtools view -b -h -F 256 -P runs/BSA/RNAseq/STAR_flower_mappings/AM_00332_gf_Aligned.sortedByCoord.out.bam Scaffold_16:5305727-5305727 > runs/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_mismatch.bam # index and extract everything overlapping the stop-gained variant position -samtools index data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_mismatch.bam -samtools view -b -h -P data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_mismatch.bam Scaffold_16:5305851-5305851 > data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam -samtools index data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam +samtools index runs/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_mismatch.bam +samtools view -b -h -P runs/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_mismatch.bam Scaffold_16:5305851-5305851 > runs/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam +samtools index runs/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam # save as tsv file using sam2tsv from jvarkit -java -jar /home/tom/Documents/tools/jvarkit/dist/sam2tsv.jar -R polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam > data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam.tsv +java -jar /home/tom/Documents/tools/jvarkit/dist/sam2tsv.jar -R runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta runs/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam > runs/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam.tsv # red flower # index and extract everything overlapping the right non-synonymous variant position -samtools index data/BSA/RNAseq/STAR_flower_mappings/AM_00332_rf_Aligned.sortedByCoord.out.bam -samtools view -b -h -F 256 -P data/BSA/RNAseq/STAR_flower_mappings/AM_00332_rf_Aligned.sortedByCoord.out.bam Scaffold_16:5305727-5305727 > data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_mismatch.bam +samtools index runs/BSA/RNAseq/STAR_flower_mappings/AM_00332_rf_Aligned.sortedByCoord.out.bam +samtools view -b -h -F 256 -P runs/BSA/RNAseq/STAR_flower_mappings/AM_00332_rf_Aligned.sortedByCoord.out.bam Scaffold_16:5305727-5305727 > runs/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_mismatch.bam # index and extract everything overlapping the stop-gained variant position -samtools index data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_mismatch.bam -samtools view -b -h -P data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_mismatch.bam Scaffold_16:5305851-5305851 > data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam -samtools index data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam +samtools index runs/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_mismatch.bam +samtools view -b -h -P runs/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_mismatch.bam Scaffold_16:5305851-5305851 > runs/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam +samtools index runs/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam # save as tsv file using sam2tsv from jvarkit -java -jar /home/tom/Documents/tools/jvarkit/dist/sam2tsv.jar -R polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam > data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam.tsv +java -jar /home/tom/Documents/tools/jvarkit/dist/sam2tsv.jar -R runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta runs/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam > runs/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam.tsv ``` Put in loading and filtering of the data (tsv and sequencing data) into separate functions: @@ -72,9 +72,9 @@ read_bam.tsv <- function(filename){ arrange(read_name) %>% ungroup() %>% #mutate(allele = ifelse(tolower(read_base) == ref_base, "ref", "alt")) %>% # is it the reference or alternative allele? - mutate(allele = ifelse(ref_pos == 5305722 & read_base == "A", - "alt", - ifelse(ref_pos == 5305727 & read_base == "T", + mutate(allele = ifelse(ref_pos == 5305722 & read_base == "A", + "alt", + ifelse(ref_pos == 5305727 & read_base == "T", "alt", ifelse(ref_pos == 5305851 & read_base == "T", "alt", "ref")))) @@ -104,22 +104,22 @@ plot_reads <- function(df, tsv, title){ # maybe rather join the two tables? number_alt_alleles <- tsv %>% filter(allele == "alt") %>% - group_by(read_name) %>% + group_by(read_name) %>% summarise(n = n(), snp_grouped = sum(snp_group)) # join the table detailing the number of alt alleles snp.bam.df <- left_join(df, number_alt_alleles, by = c("qname" = "read_name")) - + # first plot, showing specific reads snpplot <- ggplot() + - geom_range(data = snp.bam.df, + geom_range(data = snp.bam.df, aes(xstart = start, xend = end, y = factor(qname, levels=unique(qname[order(n,snp_grouped,qname)]), ordered=TRUE))) - + # second plot showing alternativ allele positions snpplot2 <- snpplot + - geom_point(data = tsv %>% filter(allele == "alt"), - aes(x = ref_pos, - y = read_name, + geom_point(data = tsv %>% filter(allele == "alt"), + aes(x = ref_pos, + y = read_name, color = as.factor(snp_group)), size = 2.2) + theme_classic() + @@ -149,9 +149,9 @@ Load in data for the red and green flower bulks of the BSA on AM_00332: ```{r} # green flower -snp_tsv.gf <- read_bam.tsv(filename = "data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam.tsv") +snp_tsv.gf <- read_bam.tsv(filename = "runs/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam.tsv") -snp_df.gf <- read_bam_snps_as_df(filename = "data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam", +snp_df.gf <- read_bam_snps_as_df(filename = "runs/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam", tsv = snp_tsv.gf) # how many reads after filtering? @@ -159,9 +159,9 @@ snp_tsv.gf %>% dplyr::count(read_name) # 99 reads # red flower -snp_tsv.rf <- read_bam.tsv(filename = "data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam.tsv") +snp_tsv.rf <- read_bam.tsv(filename = "runs/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam.tsv") -snp_df.rf <- read_bam_snps_as_df(filename = "data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam", +snp_df.rf <- read_bam_snps_as_df(filename = "runs/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam", tsv = snp_tsv.rf) # how many reads after filtering? @@ -179,7 +179,7 @@ plot.gf <- plot_reads(df = snp_df.gf, plot.gf # save plot -ggsave(filename = "plots/rna_seq_reads_gf.png", +ggsave(filename = "runs/plots/rna_seq_reads_gf.png", plot = plot.gf, width = 10, height = 8) @@ -192,17 +192,8 @@ plot.rf <- plot_reads(df = snp_df.rf, plot.rf # save plot -ggsave(filename = "plots/rna_seq_reads_rf.png", +ggsave(filename = "runs/plots/rna_seq_reads_rf.png", plot = plot.rf, width = 10, height = 8) ``` - - - - - - - - - diff --git a/workflows/BSA/snpEff_analysis.Rmd b/workflows/BSA/snpEff_analysis.Rmd index f74a6b0d82f193bc9cedfa9a4c5920b764b79e30..28c1f06fe01e3477ba31fb75e5f70c02470dcb17 100644 --- a/workflows/BSA/snpEff_analysis.Rmd +++ b/workflows/BSA/snpEff_analysis.Rmd @@ -14,7 +14,7 @@ library(ggtranscript) library(reshape2) library(cowplot) library(patchwork) -knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publication/") +knitr::opts_knit$set(root.dir = "/home/tom/Documents/ARC_projects/betalain_regulation_amaranth/") ``` ## Database creation @@ -22,15 +22,15 @@ knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publicat Run snpEff database creation on the fixed annotation files. Copy the fixed files to the snpeff directory: ```{bash} -mkdir -p data/annotation_analysis/snpEff/databases/AHv2.2/ +mkdir -p runs/annotation_analysis/snpEff/databases/AHv2.2/ # snpEff analysis # add genome file to snpEff database -cp polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta data/annotation_analysis/snpEff/databases/AHv2.2/sequences.fa +cp runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta runs/annotation_analysis/snpEff/databases/AHv2.2/sequences.fa # add annotation file to snpEff database -cp data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff data/annotation_analysis/snpEff/databases/AHv2.2/genes.gff -cp data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta data/annotation_analysis/snpEff/databases/AHv2.2/cds.fa -cp data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.prot.fasta data/annotation_analysis/snpEff/databases/AHv2.2/protein.fa +cp runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff runs/annotation_analysis/snpEff/databases/AHv2.2/genes.gff +cp runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta runs/annotation_analysis/snpEff/databases/AHv2.2/cds.fa +cp runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.prot.fasta runs/annotation_analysis/snpEff/databases/AHv2.2/protein.fa # create database: java -jar /home/tom/Documents/tools/snpEff/snpEff.jar build -v AHv2.2 @@ -42,18 +42,18 @@ java -jar /home/tom/Documents/tools/snpEff/snpEff.jar build -v AHv2.2 Run using data from Markus color/sterility mapping bulks: ```{bash} -mkdir -p data/annotation_analysis/snpEff/bsa_sterility_color/analysis +mkdir -p runs/annotation_analysis/snpEff/bsa_sterility_color/analysis # get data and run snpEff on the example data: -java -jar /home/tom/Documents/tools/snpEff/snpEff.jar -csvStats data/annotation_analysis/snpEff/bsa_sterility_color/output.stats.csv -v AHv2.2 data/annotation_analysis/snpEff/bsa_sterility_color/gatk_filter_maxmissing05_biallelic.vcf.gz > data/annotation_analysis/snpEff/bsa_sterility_color/output.snpeff.vcf +java -jar /home/tom/Documents/tools/snpEff/snpEff.jar -csvStats runs/annotation_analysis/snpEff/bsa_sterility_color/output.stats.csv -v AHv2.2 runs/annotation_analysis/snpEff/bsa_sterility_color/gatk_filter_maxmissing05_biallelic.vcf.gz > runs/annotation_analysis/snpEff/bsa_sterility_color/output.snpeff.vcf # two files are not saved in the output directory but in the current working directory -mv snpEff_* data/annotation_analysis/snpEff/bsa_sterility_color/ +mv snpEff_* runs/annotation_analysis/snpEff/bsa_sterility_color/ # it is challenging to process the snpEff output for downstream analysis # snpsift is a software package distributed with snpEff that eases processing -cat data/annotation_analysis/snpEff/bsa_sterility_color/output.snpeff.chr16.vcf | /home/tom/Documents/tools/snpEff/scripts/vcfEffOnePerLine.pl | java -jar /home/tom/Documents/tools/snpEff/SnpSift.jar extractFields - CHROM POS "ANN[*].GENEID" "ANN[*].EFFECT" > data/annotation_analysis/snpEff/bsa_sterility_color/output.snpeff.chr16.snpsift.txt +cat runs/annotation_analysis/snpEff/bsa_sterility_color/output.snpeff.chr16.vcf | /home/tom/Documents/tools/snpEff/scripts/vcfEffOnePerLine.pl | java -jar /home/tom/Documents/tools/snpEff/SnpSift.jar extractFields - CHROM POS "ANN[*].GENEID" "ANN[*].EFFECT" > runs/annotation_analysis/snpEff/bsa_sterility_color/output.snpeff.chr16.snpsift.txt ``` @@ -63,11 +63,11 @@ Analyze the snpEff test run and check for high impact variants that can be manua ```{r} # load in snpEff summary file -snpEff.tab <- read.table("data/annotation_analysis/snpEff/bsa_sterility_color/snpEff_genes.txt", skip = 1, header = T, comment.char = "") +snpEff.tab <- read.table("runs/annotation_analysis/snpEff/bsa_sterility_color/snpEff_genes.txt", skip = 1, header = T, comment.char = "") colnames(snpEff.tab)[1] <- "GeneName" # load in list of betalain and flavonoid pathway genes -color_pathways <- read.csv("data/manual_sheets/color_pathway_genes.csv", header = T) +color_pathways <- read.csv("runs/manual_sheets/color_pathway_genes.csv", header = T) snpEff.tab <- left_join(snpEff.tab, color_pathways, by = c("GeneId" = "Gene_id")) # check color pathway genes for high impact variants, moderate and in theory also modifier might also be relevant @@ -88,14 +88,14 @@ Subset a more detailed table of betalain genes and their respective positions in ```{r} # load in list of betalain and flavonoid pathway genes -color_pathways <- read.csv("data/manual_sheets/color_pathway_genes.csv", header = T) +color_pathways <- read.csv("runs/manual_sheets/color_pathway_genes.csv", header = T) betalain_chr16 <- color_pathways %>% filter(Gene_id == "AHp022773" | Gene_id == "AHp023148" | Gene_id == "AHp023147") # add BvMYB1like gene betalain_chr16[2,] <- c("BvMYB1like", "Betalain", "AHp022773") -write.table(betalain_chr16, - file = "data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.txt", +write.table(betalain_chr16, + file = "runs/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.txt", quote = F) # set up function for reading in a gtf file @@ -123,18 +123,18 @@ read.gtf <- function(file){ } # read in annotation -annotation.gtf <- read.gtf("polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf") +annotation.gtf <- read.gtf("runs/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf") # subset betalain genes betalain_chr16.gtf <- annotation.gtf %>% filter(gene_id %in% betalain_chr16$Gene_id) -saveRDS(betalain_chr16.gtf, file = "data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.gtf") +saveRDS(betalain_chr16.gtf, file = "runs/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.gtf") #################### generate BED file of relevant positions -betalain_chr16.gtf <- readRDS("data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.gtf") +betalain_chr16.gtf <- readRDS("runs/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.gtf") # snpeff by default uses the 5000 positions before and after a gene # create a bed file that can be used to subset the vcf file into relevant variants betalain_chr16.bed <- betalain_chr16.gtf %>% @@ -145,16 +145,16 @@ betalain_chr16.bed <- betalain_chr16.gtf %>% select(chrom, chromStart, chromEnd) %>% unique() -write_tsv(betalain_chr16.bed, file = "data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.bed") +write_tsv(betalain_chr16.bed, file = "runs/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.bed") ``` Subset the vcf file and extract the allele frequencies using vcftools ```{bash} # to extract the format field -vcftools --gzvcf data/BSA/wgs/vcf/gatk_filter_maxmissing05_biallelic.vcf.gz --bed data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.bed --indv AM_00331_gf --indv AM_00331_rf --indv AM_00332_gl --indv AM_00332_rl --extract-FORMAT-info AD --out data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16 +vcftools --gzvcf runs/BSA/wgs/vcf/gatk_filter_maxmissing05_biallelic.vcf.gz --bed runs/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.bed --indv AM_00331_gf --indv AM_00331_rf --indv AM_00332_gl --indv AM_00332_rl --extract-FORMAT-info AD --out runs/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16 # also subset the vcf file to include only the variants around the betalain genes -vcftools --gzvcf data/BSA/wgs/vcf/gatk_filter_maxmissing05_biallelic.vcf.gz --bed data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.bed --indv AM_00331_gf --indv AM_00331_rf --indv AM_00332_gl --indv AM_00332_rl --recode --recode-INFO-all --out data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16 +vcftools --gzvcf runs/BSA/wgs/vcf/gatk_filter_maxmissing05_biallelic.vcf.gz --bed runs/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.bed --indv AM_00331_gf --indv AM_00331_rf --indv AM_00332_gl --indv AM_00332_rl --recode --recode-INFO-all --out runs/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16 ``` @@ -162,10 +162,10 @@ Plot the annotated variants in betalain genes. In general, it could be interesti ```{r} # read in betalain gene list on chr 16 -betalain_chr16 <- read.table("data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.txt") +betalain_chr16 <- read.table("runs/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.txt") # read in variant count of extracted SNPs -allele_depth.tab <- read.table(file = "data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.AD.FORMAT", header = T) +allele_depth.tab <- read.table(file = "runs/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.AD.FORMAT", header = T) allele_depth.tab <- allele_depth.tab %>% mutate(customid = paste0("16_", POS)) # allele depth denotes first the reference allele and then the alternative allele, only those reads which were involved in allele calling @@ -176,7 +176,7 @@ allele_depth.tab <- separate(data = allele_depth.tab, col = "AM_00332_gl", sep = allele_depth.tab <- separate(data = allele_depth.tab, col = "AM_00332_rl", sep = ",", into = c("AM00332_rl_ref", "AM00332_rl_alt")) # read in snpsift output and subset for betalain genes -snpsift.tab <- read.table("data/annotation_analysis/snpEff/bsa_sterility_color/output.snpeff.chr16.snpsift.txt", header = T) +snpsift.tab <- read.table("runs/annotation_analysis/snpEff/bsa_sterility_color/output.snpeff.chr16.snpsift.txt", header = T) snpsift.tab <- snpsift.tab %>% filter(ANN....GENEID %in% betalain_chr16$Gene_id) %>% mutate(customid = paste0("16_", POS)) %>% # add customid column to snpsift table to enable merging of the two tables @@ -184,7 +184,7 @@ snpsift.tab <- snpsift.tab %>% # read in betalain gene gtf on chr 16 -betalain_chr16.gtf <- readRDS("data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.gtf") +betalain_chr16.gtf <- readRDS("runs/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.gtf") # join the snpsift table with the allele depth table joined.df <- left_join(snpsift.tab, allele_depth.tab, by = c("CHROM", "POS", "customid")) @@ -201,7 +201,7 @@ Set up plotting functions: filter_variants <- function(variants, gene, bulk1_ref, bulk1_alt, bulk2_ref, bulk2_alt){ # prepare data by only keeping the relevant gene variants and samples - dat <- variants %>% + dat <- variants %>% filter(ANN....GENEID == gene) %>% select(CHROM, POS, ANN....EFFECT, bulk1_ref, bulk1_alt, bulk2_ref, bulk2_alt) %>% filter(bulk1_alt != 0 & bulk2_alt != 0) @@ -226,13 +226,13 @@ plot_bulk_comparison <- function(annotation, trans_id, filtered_variants){ filter(transcript_id == trans_id, type == "CDS") filtered_variants <- filtered_variants[filtered_variants$POS >= min(annotation.filtered$start) & filtered_variants$POS <= max(annotation.filtered$end),] - filtered_variants$ANN....EFFECT <- factor(filtered_variants$ANN....EFFECT, levels = c("intron_variant", - "synonymous_variant", + filtered_variants$ANN....EFFECT <- factor(filtered_variants$ANN....EFFECT, levels = c("intron_variant", + "synonymous_variant", "missense_variant", "stop_gained")) min_pos <- min(annotation.filtered$start) max_pos <- max(annotation.filtered$end) - + # plot the gene with variants p2 <- ggplot() + geom_range(data = annotation.filtered, @@ -253,7 +253,7 @@ plot_bulk_comparison <- function(annotation, trans_id, filtered_variants){ x = "Position Scaffold 16 (bp)") + theme_classic() + #scale_fill_brewer(palette = "RdBl", direction = -1) + # think about color palette to use - scale_fill_viridis_d(direction = -1, + scale_fill_viridis_d(direction = -1, labels = c("Intron variant", "Synonymous variant", "Missense variant", "Stop gained")) + theme(text = element_text(size = 21), #legend.position = "none", @@ -277,23 +277,23 @@ plot_bulk_comparison <- function(annotation, trans_id, filtered_variants){ out_plot <- plot_grid(p2, legend, nrow = 1, rel_widths = c(0.75, 0.25)) - + # # rearrange data: # dat1 <- filtered_variants[,-(4:5)] # dat2 <- filtered_variants[,-(6:7)] # dat1.melt <- melt(dat1, id.vars = c("CHROM", "POS", "ANN....EFFECT")) # dat2.melt <- melt(dat2, id.vars = c("CHROM", "POS", "ANN....EFFECT")) - # + # # # plot relative allele frequency # p1 <- ggplot() + # geom_col(data = dat1.melt, # aes(x = POS, y = as.numeric(value), fill = variable), position = "stack", width = 20) + # xlim(c(min_pos, max_pos)) + - # labs(y = "", - # fill = "red_bulk", + # labs(y = "", + # fill = "red_bulk", # x = "") + # theme_classic() + - # scale_fill_brewer(palette = "Set1", + # scale_fill_brewer(palette = "Set1", # labels = c("Reference allele", "Alternative allele"), # direction = -1, # guide = guide_legend(override.aes = list(alpha = 0))) + # make legend invisible @@ -307,12 +307,12 @@ plot_bulk_comparison <- function(annotation, trans_id, filtered_variants){ # axis.title.x = element_blank(), # legend.title = element_text(color = "transparent"), # legend.text = element_text(color = "transparent")) - # + # # bars <- map(unique(dat2.melt$POS) # , ~geom_col(position = "stack", # width = 20 # , data = dat2.melt %>% filter(POS == .x))) - # + # # p3 <- ggplot(data = dat2.melt, # aes(x=POS, # y=as.numeric(value), @@ -322,8 +322,8 @@ plot_bulk_comparison <- function(annotation, trans_id, filtered_variants){ # labs(fill = "green_bulk", # x = "Position on Scaffold 16") + # theme_classic() + - # scale_fill_brewer(palette = "Set1", - # labels = c("Reference allele", "Alternative allele"), + # scale_fill_brewer(palette = "Set1", + # labels = c("Reference allele", "Alternative allele"), # direction = -1, # guide = guide_legend(override.aes = list(alpha = 0))) + # theme(text = element_text(size = 21), @@ -331,9 +331,9 @@ plot_bulk_comparison <- function(annotation, trans_id, filtered_variants){ # legend.position = "none", # legend.title = element_text(color = "transparent"), # legend.text = element_text(color = "transparent")) - - - #allplots <- p1 + p2 + p3 + + + + #allplots <- p1 + p2 + p3 + #plot_layout(ncol = 1) return(out_plot) } @@ -346,7 +346,7 @@ Plot for CYP76AD2: ```{r} # plot for one gene # filter all homozygous reference variants -dat <- joined.df %>% +dat <- joined.df %>% filter(ANN....GENEID == "AHp023148") %>% filter(AM00332_gl_alt != 0) @@ -355,7 +355,7 @@ dat <- joined.df %>% # filter to only include specific gene -dat.filtered <- filter_variants(variants = joined.df, +dat.filtered <- filter_variants(variants = joined.df, gene = "AHp023148", bulk1_ref = "AM00332_gl_ref", bulk1_alt = "AM00332_gl_alt", @@ -373,8 +373,8 @@ AM00332_CYP76AD <- plot_bulk_comparison(annotation = betalain_chr16.gtf, AM00332_CYP76AD -ggsave(filename = "plots/CYP76AD_AHp023148_bsa_snpeff.png", - width = 14, +ggsave(filename = "runs/plots/CYP76AD_AHp023148_bsa_snpeff.png", + width = 14, height = 6) ``` @@ -387,19 +387,19 @@ plot_all_genes <- function(gene_id, transcript_id, bulk){ output <- list() for (i in 1:length(transcript_id)){ # filter all homozygous reference variants - dat <- joined.df %>% + dat <- joined.df %>% filter(ANN....GENEID == gene_id[i]) %>% filter(AM00332_gl_alt != 0) # filter to only include specific gene if (bulk == "AM00332"){ - dat.filtered <- filter_variants(variants = joined.df, + dat.filtered <- filter_variants(variants = joined.df, gene = gene_id[i], bulk1_ref = "AM00332_gl_ref", bulk1_alt = "AM00332_gl_alt", bulk2_ref = "AM00332_rl_ref", bulk2_alt = "AM00332_rl_alt") } else if (bulk == "AM00331"){ - dat.filtered <- filter_variants(variants = joined.df, + dat.filtered <- filter_variants(variants = joined.df, gene = gene_id[i], bulk1_ref = "AM00331_gf_ref", bulk1_alt = "AM00331_gf_alt", @@ -425,9 +425,9 @@ bulk_AM00332_plot_list <- plot_all_genes(gene_id = gene_id, bulk = "AM00332") for (i in 1:length(bulk_AM00332_plot_list)){ - ggsave(filename = paste0("plots/BSA/AM00332_color_loss_", gene_id[i], ".png"), + ggsave(filename = paste0("runs/plots/BSA/AM00332_color_loss_", gene_id[i], ".png"), plot = bulk_AM00332_plot_list[[i]], - width = 14, + width = 14, height = 6) } @@ -437,9 +437,9 @@ bulk_AM00331_plot_list <- plot_all_genes(gene_id = gene_id, bulk = "AM00331") for (i in 1:length(bulk_AM00331_plot_list)){ - ggsave(filename = paste0("plots/BSA/AM00331_color_loss_", gene_id[i], ".png"), + ggsave(filename = paste0("runs/plots/BSA/AM00331_color_loss_", gene_id[i], ".png"), plot = bulk_AM00331_plot_list[[i]], - width = 14, + width = 14, height = 6) } ``` @@ -456,8 +456,8 @@ plot_bulk_comparison_zoom <- function(annotation, trans_id, filtered_variants){ filter(transcript_id == trans_id, type == "CDS") filtered_variants <- filtered_variants[filtered_variants$POS >= min(annotation.filtered$start) & filtered_variants$POS <= max(annotation.filtered$end),] - filtered_variants$ANN....EFFECT <- factor(filtered_variants$ANN....EFFECT, levels = c("intron_variant", - "synonymous_variant", + filtered_variants$ANN....EFFECT <- factor(filtered_variants$ANN....EFFECT, levels = c("intron_variant", + "synonymous_variant", "missense_variant", "stop_gained")) # plot the gene with variants @@ -486,26 +486,26 @@ plot_bulk_comparison_zoom <- function(annotation, trans_id, filtered_variants){ axis.line.x = element_blank(), axis.text.x = element_blank(), legend.position = "none") # this increases the legend margin - # margin has to be increased so that other legends are not cut off, + # margin has to be increased so that other legends are not cut off, # since the first legend seems to determine the margins - + # rearrange data: dat1 <- filtered_variants[,-(4:5)] dat2 <- filtered_variants[,-(6:7)] dat1.melt <- melt(dat1, id.vars = c("CHROM", "POS", "ANN....EFFECT")) dat2.melt <- melt(dat2, id.vars = c("CHROM", "POS", "ANN....EFFECT")) - + # plot relative allele frequency p1 <- ggplot() + geom_col(data = dat1.melt, aes(x = POS, y = as.numeric(value), fill = variable), position = "stack", width = 2) + xlim(c(min(annotation.filtered %>% select(start)), max(annotation.filtered %>% select(end)))) + - labs(y = "Red bulk\n allele depth", - fill = "red_bulk", + labs(y = "Red bulk\n allele depth", + fill = "red_bulk", x = "") + theme_classic() + - scale_fill_brewer(palette = "Set1", + scale_fill_brewer(palette = "Set1", labels = c("Reference allele", "Alternative allele"), direction = -1, guide = guide_legend(override.aes = list(alpha = 0))) + # make legend invisible @@ -524,12 +524,12 @@ plot_bulk_comparison_zoom <- function(annotation, trans_id, filtered_variants){ axis.title.x = element_blank(), legend.title = element_text(color = "transparent"), legend.text = element_text(color = "transparent")) - + bars <- map(unique(dat2.melt$POS) , ~geom_col(position = "stack", width = 2 , data = dat2.melt %>% filter(POS == .x))) - + p3 <- ggplot(data = dat2.melt, aes(x=POS, y=as.numeric(value), @@ -541,7 +541,7 @@ plot_bulk_comparison_zoom <- function(annotation, trans_id, filtered_variants){ fill = "", y = "Green bulk\n allele depth") + theme_classic() + - scale_fill_brewer(palette = "Set1", + scale_fill_brewer(palette = "Set1", labels = c("Reference allele", "Alternative allele"), #guide = guide_legend(override.aes = list(alpha = 0)), direction = -1) + @@ -564,13 +564,13 @@ plot_bulk_comparison_zoom <- function(annotation, trans_id, filtered_variants){ p3 <- p3 + theme(legend.position = "none") # combine the three plots allplots <- plot_grid(p1, p2, p3, - ncol = 1, + ncol = 1, align = "v", rel_heights = c(0.3, 0.25, 0.45)) - - + + # combine other plots with legend - allplots <- plot_grid(legend, allplots, + allplots <- plot_grid(legend, allplots, ncol = 1, #align = "v", rel_heights = c(0.2,0.8)) @@ -582,7 +582,7 @@ AM00332_CYP76AD_zoom <- plot_bulk_comparison_zoom(annotation = betalain_chr16.gt filtered_variants = dat.filtered) AM00332_CYP76AD_zoom -ggsave(filename = "plots/CYP76AD_AHp023148_bsa_snpeff_zoom.png") +ggsave(filename = "runs/plots/CYP76AD_AHp023148_bsa_snpeff_zoom.png") ``` @@ -620,16 +620,10 @@ grid_with_BSA <- plot_grid(cowplot_leaf, complete_grid, #grid_with_BSA # save plot -ggsave(filename = "plots/BSA_with_grid.png", +ggsave(filename = "runs/plots/BSA_with_grid.png", plot = grid_with_BSA, width = 25, height = 20, bg = "white", dpi = 500) ``` - - - - - - diff --git a/workflows/genome_polishing/helper_script.R b/workflows/genome_polishing/helper_script.R index 2a936488bd83484a3b9b39ff8ca99e7401ab80c0..1571dce75ee67341da985e78ed4191ec42e8ad34 100644 --- a/workflows/genome_polishing/helper_script.R +++ b/workflows/genome_polishing/helper_script.R @@ -1,20 +1,20 @@ # set working directory -setwd("/home/tom/Documents/projects/Ahyp_v2_2_publication/") +setwd("/home/tom/Documents/ARC_projects/betalain_regulation_amaranth/") # read in list of all headers with the correct order -headers <- read.table("data/NextPolish/input/out.headers.txt") +headers <- read.table("runs/NextPolish/input/out.headers.txt") headers <- gsub(">","",headers$V1) headers <- gsub("quiver_","quiver",headers) # read in prefiltered fasta index -prefilter <- read.table("data/NextPolish/input/out.prefiltered.renamed.txt.fai") +prefilter <- read.table("runs/NextPolish/input/out.prefiltered.renamed.txt.fai") -# use the -# every sequence that is in +# use the +# every sequence that is in no_seq <- headers[!headers %in% prefilter[,1]] no_seq <- sub("",">",no_seq) -write.table(no_seq, file="data/NextPolish/processed/header_without_sequence.fa", +write.table(no_seq, file="runs/NextPolish/processed/header_without_sequence.fa", quote=F, row.names = F, col.names = F) diff --git a/workflows/genome_polishing/process_nextpolish_output.sh b/workflows/genome_polishing/process_nextpolish_output.sh index 65ab4586b98162e5bf5109c16f8a7c8a97d992bd..d24e30688ebd3274ae376c0e3874b55e2aefaa23 100644 --- a/workflows/genome_polishing/process_nextpolish_output.sh +++ b/workflows/genome_polishing/process_nextpolish_output.sh @@ -4,8 +4,8 @@ # (see master_thesis/code/process_nextpolish_output.sh for more information about the input file preparation) # Setup -NPOUT=data/NextPolish/output/ -NPPROCESSED=data/NextPolish/processed/ +NPOUT=runs/NextPolish/output/ +NPPROCESSED=runs/NextPolish/processed/ mkdir -p "$NPPROCESSED" @@ -15,10 +15,10 @@ mkdir -p "$NPPROCESSED" cut -f1,2 -d'_' "$NPOUT"genome.nextpolish.fa > "$NPPROCESSED"genome.nextpolish.renamed.fa # index the prefiltered fasta file for use in R -samtools faidx data/NextPolish/input/out.prefiltered.renamed.txt +samtools faidx runs/NextPolish/input/out.prefiltered.renamed.txt # filter the prefiltered file for everything that is not in Nextpolish file: -/home/tom/Documents/tools/bbmap/filterbyname.sh in=data/NextPolish/input/out.prefiltered.renamed.txt \ +/home/tom/Documents/tools/bbmap/filterbyname.sh in=runs/NextPolish/input/out.prefiltered.renamed.txt \ names="$NPPROCESSED"genome.nextpolish.renamed.fa \ out="$NPPROCESSED"prefilter_not_in_Nextpolish.fa @@ -34,7 +34,7 @@ cat "$NPPROCESSED"genome.nextpolish.renamed.fa \ LC_ALL=C awk -v RS=">" -v FS="\n" -v ORS="\n" -v OFS="" '$0 {$1=">"$1"\n"; print}' "$NPPROCESSED"combined.fa > "$NPPROCESSED"combined.linear.fa # rename header file by removing trailing underscore character of Contigs: -sed 's/quiver_/quiver/' data/NextPolish/input/out.headers.txt > "$NPPROCESSED"out.header.renamed.txt +sed 's/quiver_/quiver/' runs/NextPolish/input/out.headers.txt > "$NPPROCESSED"out.header.renamed.txt # order file: ORDER="$NPPROCESSED"out.header.renamed.txt @@ -60,5 +60,5 @@ rm "$NPPROCESSED"out* rm "$NPPROCESSED"prefilter* # copy to final output directory -mkdir -p polished_reference_genome/polished_genome_annotation/assembly/ -cp "$NPPROCESSED"Ahypochondriacus_2.2_polished.fasta polished_reference_genome/polished_genome_annotation/assembly/ +mkdir -p runs/polished_reference_genome/polished_genome_annotation/assembly/ +cp "$NPPROCESSED"Ahypochondriacus_2.2_polished.fasta runs/polished_reference_genome/polished_genome_annotation/assembly/ diff --git a/workflows/genome_polishing/remove_ambiguous_bases.sh b/workflows/genome_polishing/remove_ambiguous_bases.sh index c810a8a1099eb86902fb3ee23a0a589b6ce2d095..1cb8a10921dbfae66f89075283543dba9a6bdf2c 100644 --- a/workflows/genome_polishing/remove_ambiguous_bases.sh +++ b/workflows/genome_polishing/remove_ambiguous_bases.sh @@ -6,8 +6,8 @@ # N specific entries can in a last step be removed. To reconstruct, the saved order can be used to integrate the N chromosomes again (which record the number of Ns removed) # Change these two parameters -INPUT=reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.nospace.underscore.fa -OUTDIR=data/NextPolish/input/ +INPUT=studies/additional_data/resources/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.nospace.underscore.fa +OUTDIR=runs/NextPolish/input/ mkdir -p $OUTDIR @@ -37,9 +37,8 @@ grep ">" "$OUTDIR"tmp2.txt > "$OUTDIR"out.headers.txt echo "headers saved" # remove headers without sequence (Can be caused by stretch of Ns at the start of a Scaffold (see Scaffold 10)) -sed -r 'N; /(>)[^\n]*\n\1/ s/[^\n]*//; P; D' "$OUTDIR"tmp2.txt | grep . | grep -i -B 1 --no-group-separator '[ATGC]' > "$OUTDIR"data/NextPolish/input/Ahypochondriacus_split.fasta +sed -r 'N; /(>)[^\n]*\n\1/ s/[^\n]*//; P; D' "$OUTDIR"tmp2.txt | grep . | grep -i -B 1 --no-group-separator '[ATGC]' > "$OUTDIR"Ahypochondriacus_split.fasta # remove and rename temporary files rm "$OUTDIR"tmp.txt mv "$OUTDIR"tmp2.txt "$OUTDIR"out.prefiltered.txt - diff --git a/workflows/genome_polishing/run_nextpolish.sh b/workflows/genome_polishing/run_nextpolish.sh index cbf057986b68d5b7db03b06925d898ff70ff3143..a086c0bb8d88e19d99648ad6b209d431b08c6307 100644 --- a/workflows/genome_polishing/run_nextpolish.sh +++ b/workflows/genome_polishing/run_nextpolish.sh @@ -19,15 +19,15 @@ module load samtools mkdir -p /scratch/twinkle1/nextpolish/ # set output directory for saving the polished genome to: -OUTDIR=data/NextPolish/output/ +OUTDIR=runs/NextPolish/output/ ### Prepare input files: # remove all reads containing ambiguous bases from the input using bbduk /home/twinkle1/tools/bbmap/bbduk.sh maxns=0 \ - in=/projects/ag-stetter/twinkle/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq.gz \ - in2=/projects/ag-stetter/twinkle/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq.gz \ + in=/studies/additional_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq.gz \ + in2=/studies/additional_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq.gz \ out=/scratch/twinkle1/SRR2106212_1.cleaned.fq \ out2=/scratch/twinkle1/SRR2106212_2.cleaned.fq \ -Xmx16g @@ -49,7 +49,7 @@ round=2 threads=20 read1=/scratch/twinkle1/SRR2106212_1.cleaned.repair.fq read2=/scratch/twinkle1/SRR2106212_2.cleaned.repair.fq -input=/home/twinkle1/master_thesis/data/NextPolish/input/Ahypochondriacus_split.fasta +input=/runs/NextPolish/input/Ahypochondriacus_split.fasta for ((i=1; i<=${round};i++)); do diff --git a/workflows/genome_polishing/unpackSRA.sh b/workflows/genome_polishing/unpackSRA.sh index 7467f476e46a5d21632b6feaac7c4b95f805f971..6cdf885e2a49251f79856dff73ce59f34dbda4a8 100644 --- a/workflows/genome_polishing/unpackSRA.sh +++ b/workflows/genome_polishing/unpackSRA.sh @@ -14,29 +14,29 @@ # download file, show progress, increase default max size so that the download starts # tools/sratoolkit.2.11.2-centos_linux64/bin/prefetch -p -O /projects/ag-stetter/twinkle/lightfoot_WGS_short_reads/ --max-size 30G SRR2106212 -QCOUT=data/NextPolish/QC +QCOUT=runs/NextPolish/QC # set working directory -cd /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/raw_data/lightfoot_WGS_short_reads/ +cd /studies/additional_data/lightfoot_WGS_short_reads/ # before running the fastq-dump command, switch off "Enable Remote Access" by running sratoolskit/bin/vdb-config -i # split into fastq files /home/twinkle1/tools/sratoolkit.2.11.2-centos_linux64/bin/fastq-dump --split-3 --verbose SRR2106212.sra # set working directory -cd /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/ +cd /projects/ag-stetter/twinkle/ARC_projects/betalain_regulation_amaranth # gzip the resulting fastq files # Even though there is an option to gzip it directly using the fastq-dump command, the option is deprecated and should no longer be used -gzip raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq -gzip raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq +gzip /studies/additional_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq +gzip /studies/additional_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq # remove sra file afterwards -rm raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212.sra +rm /studies/additional_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212.sra # quality control: module load fastqc/0.11.9 -fastqc -o raw_data/lightfoot_WGS_short_reads/QC/ -t 8 \ - raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq.gz \ - raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq.gz +fastqc -o /studies/additional_data/lightfoot_WGS_short_reads/QC/ -t 8 \ + /studies/additional_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq.gz \ + /studies/additional_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq.gz