diff --git a/workflows/BSA/RNAseq/QC.sh b/workflows/BSA/RNAseq/QC.sh
new file mode 100644
index 0000000000000000000000000000000000000000..64db67753b630dbf68ec8969414684f3d63565bf
--- /dev/null
+++ b/workflows/BSA/RNAseq/QC.sh
@@ -0,0 +1,93 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 10:00:00
+#SBATCH -J QC
+#SBATCH -o /home/twinkle1/projects/Ahyp_v2_2/logs/flower_color_mapping/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=10
+#SBATCH --mem=8gb
+
+# Use featurecounts to create the countmatrix from STAR mappings, also perform different quality control measures
+
+# load necessary modules
+
+source $CONDA_PREFIX/etc/profile.d/conda.sh
+conda activate featurecounts
+
+module load fastqc/0.11.9
+
+GTFIN=polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf
+
+
+
+# Quality control
+# Initialize the output directory
+QCIN=raw_data/BSA_rnaseq/
+QCOUT=raw_data/BSA_rnaseq/fastqc
+
+mkdir -p $QCOUT
+# run quality control fastqc
+fastqc -t 10 -o $QCOUT "$QCIN"*P.fq.gz
+
+
+# Quality control
+# Initialize the output directory
+QCIN=raw_data/BSA_rnaseq/
+QCOUT=raw_data/BSA_rnaseq/fastqc
+
+mkdir -p $QCOUT
+# run quality control fastqc
+fastqc -t 10 -o $QCOUT "$QCIN"*P.fq.gz
+
+cp -r $QCOUT data/BSA/RNAseq/STAR_flower_mappings/QC/
+
+module load samtools/1.13
+
+
+# run quality control qualimap on the generated bam files
+# run rnaseq mode for each file, qualimap takes as input a bam sorted by name
+# -p = strand specific protocol, -pe = paired-end sequencing data, -s = file is sorted by name
+
+# prepare input gtf file
+GTFQM=/scratch/twinkle1/temp.gtf
+sed 's/CDS/exon/' $GTFIN > $GTFQM
+
+# define input files
+QMIN=data/BSA/RNAseq/STAR_flower_mappings/
+
+
+# define output directory
+QMOUT=data/BSA/RNAseq/STAR_flower_mappings/QC/qualimap
+
+# create main output directory
+mkdir -p $QMOUT
+
+# run qualimap
+for file in "$QMIN"*out.bam
+do
+	# basename of each sample
+	QMBASE="$(basename -s .sortedByCoord.out.bam $file)"
+	# make output directory
+	mkdir "$QMOUT"/"$QMBASE"
+	# sort by name for qualimap
+	samtools sort -n -T /scratch/twinkle1/ -@ 8 $file -O bam > "$QMOUT"/"$QMBASE"/"$QMBASE".name_sorted.bam
+
+	# run qualimap
+	qualimap rnaseq -bam "$QMOUT"/"$QMBASE"/"$QMBASE".name_sorted.bam \
+		-outdir "$QMOUT"/"$QMBASE" \
+		-gtf $GTFQM \
+		-p strand-specific-reverse \
+		-pe \
+		-s \
+		--java-mem-size=4G
+done
+
+rm $GTFQM
+
+# run multiqc to combine the results from fastqc and qualimap into a single report
+MULTIQCOUT=data/BSA/RNAseq/STAR_flower_mappings/multiqc
+MULTIQCIN=data/BSA/RNAseq/STAR_flower_mappings/QC/
+
+mkdir -p $MULTIQCOUT
+
+multiqc -o $MULTIQCOUT $MULTIQCIN
diff --git a/workflows/BSA/RNAseq/adapter_trimming.sh b/workflows/BSA/RNAseq/adapter_trimming.sh
new file mode 100644
index 0000000000000000000000000000000000000000..15d5ab7bef2f182a0afe15ae0caadef289200a67
--- /dev/null
+++ b/workflows/BSA/RNAseq/adapter_trimming.sh
@@ -0,0 +1,35 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 4:00:00
+#SBATCH -J trimmomatic
+#SBATCH -o /home/twinkle1/projects/Ahyp_v2_2/logs/flower_color_mapping/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=6
+#SBATCH --mem=8gb
+#SBATCH --array 0-3
+#SBATCH --mail-user=twinkle1@smail.uni-koeln.de
+#SBATCH --mail-type=ALL
+
+# Adapter trimming of the bulk segregant RNAseq reads with trimmomatic (paired-end mode).
+# Submitted as an array job: each of the 4 tasks (array index 0-3) trims one sample.
+
+module load trimmomatic/0.39
+
+### MAIN
+
+# forward read files (R1 only), the array task id selects the sample
+READ_DIR=raw_data/BSA_rnaseq/
+FORWARD_READS=("$READ_DIR"/*R1.fastq.gz)
+FORWARD="${FORWARD_READS["${SLURM_ARRAY_TASK_ID}"]}"
+
+# the reverse read file and the output base name are derived from the forward read file name
+REVERSE="${FORWARD/R1.fastq.gz/R2.fastq.gz}"
+TRIMMED_BASE="${FORWARD/R1.fastq.gz/trimmed.fq.gz}"
+
+# run trimmomatic with 6 threads, -baseout names the output files
+# adapters: custom fasta file with the sequences sent by the sequencing center,
+# forward and reverse read adapters carry the /1 and /2 suffixes in that file
+java -jar $TRIMMOMATIC/trimmomatic.jar PE -threads 6 \
+	"$FORWARD" "$REVERSE" \
+	-baseout "$TRIMMED_BASE" \
+	ILLUMINACLIP:raw_data/BSA_rnaseq/adapters/custom_adapters.fa:2:30:10
diff --git a/workflows/BSA/RNAseq/index_STAR.sh b/workflows/BSA/RNAseq/index_STAR.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cb8606a419e341318008238f456a836ae017649b
--- /dev/null
+++ b/workflows/BSA/RNAseq/index_STAR.sh
@@ -0,0 +1,33 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 1:00:00
+#SBATCH -J STAR
+#SBATCH -o /home/twinkle1/projects/Ahyp_v2_2/logs/flower_color_mapping/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=8
+#SBATCH --mem=32gb
+#SBATCH --job-name="index_STAR"
+
+module load star/2.7.8a
+
+# Index the reference genome
+# only run once per reference genome
+
+# 8 threads, genome generation mode
+# genomeSAindexNbases settings specific for the amaranth reference assembly
+# more specific settings: use the polished, softmasked reference assembly
+# sjdbOverhang dependend on input read length
+# as SJDB file, use the newly generated braker2 protein gtf file
+REFGENOME=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta
+SJDBFILE=polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf
+
+mkdir -p data/BSA/RNAseq/STAR_flower_index
+
+STAR --runThreadN 8 \
+	--runMode genomeGenerate \
+	--genomeDir /scratch/twinkle1/STAR_flower_index \
+	--sjdbOverhang 100 \
+	--genomeSAindexNbases 13 \
+	--genomeFastaFiles "$REFGENOME" \
+	--sjdbGTFfeatureExon CDS \
+	--sjdbGTFfile "$SJDBFILE"
diff --git a/workflows/BSA/RNAseq/run_STAR.sh b/workflows/BSA/RNAseq/run_STAR.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1064306650a392d552fdf51ac9ccde1a1fc2c725
--- /dev/null
+++ b/workflows/BSA/RNAseq/run_STAR.sh
@@ -0,0 +1,34 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 1:00:00
+#SBATCH -J STAR
+#SBATCH -o /home/twinkle1/projects/Ahyp_v2_2/logs/flower_color_mapping/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=8
+#SBATCH --mem=8gb
+#SBATCH --array 0-3
+
+# Map the trimmed, paired reads of one sample to the STAR index (array job, one sample per task)
+module load star/2.7.8a
+
+# forward read files (R1 only), the array task id selects the sample
+SOURCE_DIR=raw_data/BSA_rnaseq/
+OUTPUTDIR=data/BSA/RNAseq/STAR_flower_mappings
+FILES=("$SOURCE_DIR"/*_1P.fq.gz)
+FORWARD="${FILES["${SLURM_ARRAY_TASK_ID}"]}"
+REVERSE="${FORWARD/_1P.fq.gz/_2P.fq.gz}"
+
+# output prefix: swap the input directory for the output directory, then strip the file suffix
+# (the glob yields "<SOURCE_DIR>/<file>", the slash left over after the swap separates directory and file)
+OUTPREFIX="${FORWARD/$SOURCE_DIR/$OUTPUTDIR}"
+OUTPREFIX="${OUTPREFIX/trimmed_1P.fq.gz/}"
+
+# run STAR after genome index creation
+mkdir -p "$OUTPUTDIR"
+STAR --runMode alignReads \
+	--runThreadN 8 \
+	--genomeDir /scratch/twinkle1/STAR_flower_index \
+	--readFilesCommand zcat \
+	--readFilesIn "$FORWARD" "$REVERSE" \
+	--outSAMtype BAM SortedByCoordinate \
+	--outFileNamePrefix "$OUTPREFIX"
diff --git a/workflows/BSA/RNAseq/run_kallisto.sh b/workflows/BSA/RNAseq/run_kallisto.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ae124fb57e4ca3d234851d6c12be5104ed6cde41
--- /dev/null
+++ b/workflows/BSA/RNAseq/run_kallisto.sh
@@ -0,0 +1,55 @@
+#!/bin/bash -l
+
+# Quantify the paired reads of the four AM_* samples in raw_data/flower_color_mapping/ with kallisto
+
+# Beforehand:
+#Short read data downloaded from SRA using the following accession/run numbers:
+
+#SRA Accession numbers:
+#Floral tissue: SRX722058 SRR1598911
+#Leaf tissue: SRX722059 SRR1598912
+#Root tissue: SRX722060 SRR1598913
+#Stem tissue: SRX722057 SRR1598910
+#Water stressed tissue sample: SRX722061 SRR1598914
+#Immature seeds: SRX722056 SRR1598909
+#Mature seeds: SRX722063 SRR1598916
+#Green Cotyledone: SRX722062 SRR1598915
+# NOTE(review): the samples quantified below are AM_* files, confirm that this list still applies
+
+KALLISTO=/home/tom/Documents/tools/kallisto/kallisto
+
+# location of the kallisto index; it has to be defined even though the indexing
+# step below is commented out, because the quantification reads "$KALINDEX"/index
+KALINDEX=data/gene_expression_quantification/kallisto_quant/index
+
+# index the transcriptome (only needed once, uncomment to rebuild the index)
+#mkdir -p "$KALINDEX"
+#"$KALLISTO" index -i "$KALINDEX"/index polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.cds.fasta
+
+if [[ ! -f "$KALINDEX"/index ]]; then
+	echo "kallisto index not found: $KALINDEX/index (run the indexing step first)" >&2
+	exit 1
+fi
+
+##### Perform quantification
+# create array of read fastq files (R1 only):
+SOURCE_DIR=raw_data/flower_color_mapping/
+FILES=("$SOURCE_DIR"AM*_1P.fq.gz)
+OUTDIR=data/flower_color_mapping/kallisto_quant/
+# output directory names, assigned to FILES by position (glob order)
+TISSUE_NAMES=("AM_00331_gf" "AM_00331_rf" "AM_00332_gf" "AM_00332_rf")
+
+# every name needs a read file; this also catches a glob that matched nothing
+if (( ${#FILES[@]} < ${#TISSUE_NAMES[@]} )); then
+	echo "expected at least ${#TISSUE_NAMES[@]} files matching ${SOURCE_DIR}AM*_1P.fq.gz" >&2
+	exit 1
+fi
+
+mkdir -p "$OUTDIR"
+
+# kallisto after indexing
+for i in "${!TISSUE_NAMES[@]}"
+do
+	"$KALLISTO" quant -i "$KALINDEX"/index -o "$OUTDIR""${TISSUE_NAMES[$i]}" --bias --plaintext -t 6 --verbose \
+		"${FILES[$i]}" "${FILES[$i]/_1P.fq.gz/_2P.fq.gz}"
+done
diff --git a/workflows/BSA/WGS/combined_filter.sh b/workflows/BSA/WGS/combined_filter.sh
new file mode 100644
index 0000000000000000000000000000000000000000..707a9bd4b68212b56b80b324d637e9a69f9c326b
--- /dev/null
+++ b/workflows/BSA/WGS/combined_filter.sh
@@ -0,0 +1,65 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -o /projects/ag-stetter/markus/bsa_sterility_color/logs/callingLog-%j.txt
+#SBATCH -t 14:00:00
+#SBATCH -J map_reads
+#SBATCH --nodes=1-1
+#SBATCH --ntasks 5
+#SBATCH --mem 48g
+
+#module load bwa
+#module load java/1.8
+module load samtools/1.13
+
+
+REFERENCE=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta
+
+PROVIDER=CCG
+OUTDIR=data/BSA/wgs/vcf
+mkdir -p $OUTDIR
+
+ALLSAMP=$(for i in data/BSA/wgs/bam_files/gvcf/AM_00*.vcf; do echo -V $i;done)
+
+$MYUTIL/tools/gatk-4.1.7.0/gatk --java-options "-Xmx48G" \
+	CombineGVCFs \
+   -R $REFERENCE \
+   $ALLSAMP \
+   -O $OUTDIR/cohort.g.vcf.gz
+
+
+$MYUTIL/tools/gatk-4.1.7.0/gatk --java-options "-Xmx48G" \
+GenotypeGVCFs \
+-R $REFERENCE \
+-V $OUTDIR/cohort.g.vcf.gz \
+-O $OUTDIR/raw_snps_all.g.vcf \
+--sample-ploidy 50  # this is for pool data pools are approx 25 ind
+
+
+$MYUTIL/tools/gatk-4.1.7.0/gatk --java-options "-Xmx48G" \
+VariantFiltration \
+-R $REFERENCE \
+-V $OUTDIR/raw_snps_all.g.vcf \
+--filter-expression "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" \
+--filter-name "my_snp_filter" \
+--output $OUTDIR/raw_variants_gatk.vcf
+
+$MYUTIL/tools/gatk-4.1.7.0/gatk --java-options "-Xmx48G" \
+SelectVariants \
+-R $REFERENCE \
+-V $OUTDIR/raw_variants_gatk.vcf  \
+--select-type-to-include SNP \
+--output $OUTDIR/filtered_snps_gatk.vcf
+
+vcftools --vcf $OUTDIR/filtered_snps_gatk.vcf \
+--remove-filtered-all --min-alleles 2 --max-alleles 2 --max-missing 0.95 --recode \
+--out $OUTDIR/gatk_filter_maxmissing05_biallelic
+
+mv $OUTDIR/gatk_filter_maxmissing05_biallelic.recode.vcf $OUTDIR/gatk_filter_maxmissing05_biallelic.vcf
+
+$MYUTIL/tools/gatk-4.1.7.0/gatk --java-options "-Xmx40G" \
+VariantsToTable \
+-R $REFERENCE \
+-V $OUTDIR/gatk_filter_maxmissing05_biallelic.vcf \
+-F CHROM -F POS -F REF -F ALT \
+-GF AD -GF DP -GF GQ -GF PL \
+--output $OUTDIR/bulk_snps05.table
diff --git a/workflows/BSA/WGS/map_reads.sh b/workflows/BSA/WGS/map_reads.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ad2bcf23142c774d731e1471b5639fca1422593d
--- /dev/null
+++ b/workflows/BSA/WGS/map_reads.sh
@@ -0,0 +1,81 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -o /home/twinkle1/projects/Ahyp_v2_2/logs/bsa/mappingLog-%j.txt
+#SBATCH -t 11-00:00:00
+#SBATCH -J map_reads
+#SBATCH --array=0-9
+#SBATCH --nodes=1-1
+#SBATCH --ntasks 8
+#SBATCH --mem 48g
+
+####SLURM_ARRAY_TASK_ID=0
+
+
+
+module use /opt/rrzk/modules/experimental
+module load bwamem2/2.0_gnu
+module load samtools/1.13
+
+
+REFERENCE=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta
+#bwa-mem2 index $REFERENCE
+
+
+PROVIDER=CCG
+
+
+INPUTPATH=raw_data/BSA_wgs/ #
+OUTPUTPATH=data/BSA/wgs/bam_files/ # Change this for different generations
+
+mkdir -p $OUTPUTPATH
+mkdir -p ${OUTPUTPATH}/metrics/
+
+#readarray -t FASTQFILESR1 < ${INPUTPATH}/R1read_list.txt
+#readarray -t FASTQFILESR2 < ${INPUTPATH}/R2read_list.txt
+
+FASTQFILESR1=($(ls -d $INPUTPATH/*_1.fq.gz))
+FASTQFILESR2=($(ls -d $INPUTPATH/*_2.fq.gz))
+
+
+INFILE_R1="${FASTQFILESR1[$SLURM_ARRAY_TASK_ID]}" #INFILE_R1=${INPUTPATH}/"${FASTQFILESR1[0]}"
+INFILE_R2="${FASTQFILESR2[$SLURM_ARRAY_TASK_ID]}" #INFILE_R2=${INPUTPATH}/"${FASTQFILESR2[0]}"
+
+echo $INFILE_R1
+echo $INFILE_R2
+INDNAME=$(basename $INFILE_R1 _1.fq.gz)
+
+#INDNAME=synDH_$(basename $INFILE |awk -F_ '{print $2}')
+#INDNAME=$(basename "$INFILE_R1" .fastq.gz|awk -F_ 'BEGIN{OFS="_";};{print $1;}') #For DH lines with UNIMO in their name ->fastqList2.txt
+
+
+echo maping reads of $INDNAME
+SORTED_NAME=${OUTPUTPATH}/sorted_${INDNAME}.bam
+echo $SORTED_NAME
+
+
+bwa-mem2 mem -t 8 -R '@RG\tID:'${INDNAME}'\tSM:'${INDNAME}'\tCN:'${PROVIDER}'\tPL:illumina' $REFERENCE $INFILE_R1 $INFILE_R2 | samtools sort -O bam -o ${SORTED_NAME}
+
+
+echo mark duplicates
+DEDUP_NAME=${OUTPUTPATH}/${INDNAME}.bam
+METRICS_FILE=${OUTPUTPATH}/metrics/${INDNAME}.txt
+java -Xmx40g -jar /projects/mstette2/tools/picard.jar MarkDuplicates INPUT=${SORTED_NAME} OUTPUT=${DEDUP_NAME} METRICS_FILE=${METRICS_FILE}
+samtools index $DEDUP_NAME
+
+echo calculate samtools flagstat
+samtools flagstat $DEDUP_NAME > ${OUTPUTPATH}/metrics/${INDNAME}.flagstat
+
+echo removing sorted bam
+#rm $SORTED_NAME
+
+
+mkdir -p ${OUTPUTPATH}/gvcf
+GVCFFILE=${OUTPUTPATH}/gvcf/${INDNAME}.g.vcf
+
+$MYUTIL/tools/gatk-4.1.7.0/gatk --java-options "-Xmx16G" CreateSequenceDictionary -R $REFERENCE
+
+$MYUTIL/tools/gatk-4.1.7.0/gatk --java-options "-Xmx48G" HaplotypeCaller \
+-R $REFERENCE \
+-I $DEDUP_NAME \
+-ERC GVCF \
+--output ${GVCFFILE}
diff --git a/workflows/BSA/betalain_quantification.Rmd b/workflows/BSA/betalain_quantification.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..8ab62650b42d6d117839495a3a60751e804a7235
--- /dev/null
+++ b/workflows/BSA/betalain_quantification.Rmd
@@ -0,0 +1,846 @@
+---
+title: "Betalain_quantification"
+author: "twinkle1"
+date: "2023-11-15"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(readxl)
+library(reshape2)
+library(patchwork)
+library(agricolae)
+library(ggpubr)
+library(cowplot)
+library(ggbeeswarm)
+```
+
+Create output directory for plots
+
+```{bash}
+mkdir -p ../../plots/betalain_quantification # -p: no error when the directory already exists (re-knitting)
+```
+
+
+Read in measured data
+
+```{r}
+# read the photometric measurements (absorbance at 480 and 536 nm) and the sample metadata
+betalain_quantification <- read_excel(path = "../../raw_data/betalain_quantification/Betalain_quantification_summary.xlsx",
+                                      sheet = "Photometric_quantification")
+
+# normalize to the same input weight (currently disabled)
+#betalain_quantification <- betalain_quantification %>%
+#  mutate(A536 = ifelse(tissue == "leaf", A536 * (0.0167/0.037), A536),
+#         A480 = ifelse(tissue == "leaf", A480 * (0.0167/0.037), A480))
+
+# correct for betacyanin absorption at 480 nm by subtracting A536/3.1 from A480
+betalain_quantification <- betalain_quantification %>%
+  mutate(A480 = A480 - (A536/3.1))
+
+# betacyanin content: concentration from A536 (divided by 60000, presumably the molar extinction coefficient - TODO confirm),
+# amount via the factor 0.000275 (presumably the extract volume in l - TODO confirm), mass via the molar mass,
+# then scaled to 1 g fresh weight, multiplied by 1000 (g to mg according to the column name) and by the cuvette dilution
+betalain_quantification <- betalain_quantification %>%
+  mutate(cBetacyanin = (A536/60000)*2, # dilution of factor 2 through added MetOH
+         nBetacyanin = cBetacyanin * 0.000275,
+         mBetacyanin = nBetacyanin * 726.6, # molar mass amaranthin
+         factor_to_1g = 1/fresh_weight,
+         mBetacyanin_in_1g = mBetacyanin * factor_to_1g,
+         mBetacyanin_in_1g_in_mg = mBetacyanin_in_1g * 1000) %>%
+  mutate(mBetacyanin_in_1g_in_mg = mBetacyanin_in_1g_in_mg * cuvette_dilution) # correct for dilution in cuvette
+
+head(betalain_quantification)
+# same calculation for betaxanthins from the corrected A480; afterwards only the sample metadata and both content columns are kept
+betalain_quantification <- betalain_quantification %>%
+  mutate(cBetaxanthin = (A480/48000)*2,
+         nBetaxanthin = cBetaxanthin * 0.000275,
+         mBetaxanthin = nBetaxanthin * 324.333, # vulgaxanthin IV
+         factor_to_1g = 1/fresh_weight,
+         mBetaxanthin_in_1g = mBetaxanthin * factor_to_1g,
+         mBetaxanthin_in_1g_in_mg = mBetaxanthin_in_1g * 1000) %>%
+  mutate(mBetaxanthin_in_1g_in_mg = mBetaxanthin_in_1g_in_mg * cuvette_dilution) %>%
+  select(tube_nr, line, batch, individual, accession, tissue, mBetacyanin_in_1g_in_mg, mBetaxanthin_in_1g_in_mg)
+
+# exclude sample due to irregular betacyanin content
+betalain_quantification <- betalain_quantification %>%
+  filter(tube_nr != 65)
+
+
+# melt dataframe into long format, one row per sample and content column
+melted_quant <- melt(betalain_quantification,
+                     id.vars = c("tube_nr", "line", "batch", "individual", "accession", "tissue"))
+colnames(melted_quant)[7:8] <- c("metabolite", "content")
+
+# add unique identifier to each individual (line, batch, individual) and fix the order of the accessions
+melted_quant <- melted_quant %>%
+  mutate(uniq_ind = paste0(line, "_", batch, "_", individual),
+         accession = factor(accession, levels = c("PI 576485","PI 538323","PI 686465")),
+         tissue = as.factor(tissue),
+         batch = as.factor(batch))
+
+# mean betalain content per accession, tissue and metabolite
+mean_expression <- melted_quant %>%
+  group_by(accession, tissue, metabolite) %>%
+  summarise(mean_content = mean(content))
+```
+
+Conduct statistical analysis
+
+```{r}
+# two-way anova (accession * batch) between BSA parents, separately for each tissue and pigment class
+betalains_leaf <- betalain_quantification %>%
+  filter(tissue == "leaf")
+betalains_flower <- betalain_quantification %>%
+  filter(tissue == "flower")
+
+# anova for leaf betacyanin content
+leaf_bc_aov <- aov(mBetacyanin_in_1g_in_mg ~ accession * batch, data = betalains_leaf)
+summary(leaf_bc_aov)
+# tukey post hoc test between accessions
+leaf_bc_post_hoc <- HSD.test(leaf_bc_aov, trt = "accession")
+leaf_bc_post_hoc
+# extract groups for leaf, rownames (accessions) as factor in the order used on the plots
+leaf_bc_groups <- leaf_bc_post_hoc$groups
+leaf_bc_groups$factors <- factor(rownames(leaf_bc_groups), levels = c("PI 576485","PI 538323","PI 686465"))
+
+
+# anova for leaf betaxanthin content
+leaf_bx_aov <- aov(mBetaxanthin_in_1g_in_mg ~ accession * batch, data = betalains_leaf)
+summary(leaf_bx_aov)
+# tukey post hoc test (disabled for the leaf betaxanthin content)
+#leaf_bx_post_hoc <- HSD.test(leaf_bx_aov, trt = "batch")
+#leaf_bx_post_hoc
+# extract groups for leaf
+#leaf_bx_groups <- leaf_bx_post_hoc$groups
+#leaf_bx_groups$factors <- as.factor(rownames(leaf_bx_groups))
+
+
+# anova for flower betacyanin content
+flower_bc_aov <- aov(mBetacyanin_in_1g_in_mg ~ accession * batch, data = betalains_flower)
+summary(flower_bc_aov)
+# tukey post hoc test between accessions
+flower_bc_post_hoc <- HSD.test(flower_bc_aov, trt = "accession")
+flower_bc_post_hoc
+# extract groups for flower, rownames (accessions) as factor in the order used on the plots
+flower_bc_groups <- flower_bc_post_hoc$groups
+flower_bc_groups$factors <- factor(rownames(flower_bc_groups), levels = c("PI 576485","PI 538323","PI 686465"))
+
+
+# anova for flower betaxanthin content
+flower_bx_aov <- aov(mBetaxanthin_in_1g_in_mg ~ accession * batch, data = betalains_flower)
+summary(flower_bx_aov)
+# tukey post hoc test between accessions
+flower_bx_post_hoc <- HSD.test(flower_bx_aov, trt = "accession")
+flower_bx_post_hoc
+# extract groups for flower, rownames (accessions) as factor in the order used on the plots
+flower_bx_groups <- flower_bx_post_hoc$groups
+flower_bx_groups$factors <- factor(rownames(flower_bx_groups), levels = c("PI 576485","PI 538323","PI 686465"))
+```
+
+```{r}
+# from: https://stackoverflow.com/questions/54672468/ggplot2-how-to-nudge-the-position-of-points-in-geom-beeswarm
+position_nudge_any <- function(x = 0, y = 0, position) { # nudge by x/y on top of another position object
+  ggproto(NULL, PositionNudgeAny,
+          nudge = ggplot2::position_nudge(x, y),
+          position = position
+  )
+}
+
+
+#' Internal class doing the actual nudging on top of the other operation
+#' @keywords internal
+PositionNudgeAny <- ggplot2::ggproto("PositionNudgeAny", ggplot2::Position,
+  nudge = NULL,
+  nudge_params = NULL,
+  position = NULL,
+  position_params = NULL,
+  # collect the parameters of the nudge and of the wrapped position
+  setup_params = function(self, data) {
+   list(nudge = self$nudge,
+        nudge_params = self$nudge$setup_params(data),
+        position = self$position,
+        position_params = self$position$setup_params(data))
+  },
+  # data setup of the wrapped position first, then of the nudge
+  setup_data = function(self, data, params) {
+   data <- params$position$setup_data(data, params$position_params)
+   params$nudge$setup_data(data, params$nudge_params)
+  },
+  # compute the wrapped position first, then nudge its result
+  compute_layer = function(self, data, params, layout) {
+   data <- params$position$compute_layer(data, params$position_params, layout)
+   params$nudge$compute_layer(data, params$nudge_params, layout)
+  }
+)
+```
+
+Plot data
+
+```{r}
+# plot data: single measurements as beeswarm points, mean per accession as horizontal segment
+# plot betalain quantification from leaf (betacyanins nudged to the left, betaxanthins to the right)
+p_leaf <- ggplot(data = melted_quant %>% filter(tissue == "leaf")) +
+  # geom_boxplot(aes(x = accession,
+  #                y = content,
+  #                fill = metabolite),
+  #              outlier.shape = NA,
+  #            color = "black",
+  #            size = 0.4) +
+  geom_point(data = melted_quant %>% filter(tissue == "leaf", metabolite == "mBetacyanin_in_1g_in_mg"),
+             aes(x = accession, y = content),
+             size = 1.8,
+             color = "red3",
+             position = position_nudge_any(x = -0.2,
+                                           y = 0,
+                                           position_beeswarm(priority = "random",
+                                                             cex = 1.5))) +
+  geom_point(data = melted_quant %>% filter(tissue == "leaf", metabolite == "mBetaxanthin_in_1g_in_mg"),
+             aes(x = accession, y = content),
+             color = "#F0B327",
+             size = 1.8,
+             position = position_nudge_any(x = 0.2,
+                                           y = 0,
+                                           position_beeswarm(priority = "random",
+                                                             cex = 1.5))) +
+  geom_segment(data = mean_expression %>% filter(tissue == "leaf", metabolite == "mBetacyanin_in_1g_in_mg"), # mean betacyanin content
+               aes(x = as.numeric(accession) - 0.35,
+                   xend = as.numeric(accession) - 0.05,
+                   y = mean_content,
+                   yend = mean_content),
+               linewidth = 0.9,
+               color = "black") +
+    geom_segment(data = mean_expression %>% filter(tissue == "leaf", metabolite == "mBetaxanthin_in_1g_in_mg"), # mean betaxanthin content
+               aes(x = as.numeric(accession) + 0.05,
+                   xend = as.numeric(accession) + 0.35,
+                   y = mean_content,
+                   yend = mean_content),
+               linewidth = 0.9,
+               color = "black") +
+  geom_text(data = leaf_bc_groups, # tukey groups of the leaf betacyanin content
+          aes(x = as.numeric(factors) - 0.19,
+              y = 1.15,
+              label = groups), 
+          size=8, 
+          inherit.aes = F,
+          color = "red3") +  
+  scale_color_manual(values = c( "red3","#F0B327"), labels = c("Betacyanins", "Betaxanthins")) +
+  #scale_fill_manual(values = alpha(c("red3","#F0B327"), 0.2), labels = c("Betacyanins", "Betaxanthins")) +
+  scale_fill_manual(values = c("red3","#F0B327"), labels = c("Betacyanins", "Betaxanthins")) +
+  coord_cartesian(ylim = c(0, 1.2)) +
+  labs(y = "Betalain content\n [mg/g]", color = "Wavelength") +
+  theme_classic() +
+    theme(text = element_text(size = 24),
+        axis.title.x = element_blank(),
+        axis.title.y = element_text(size = 15),
+        axis.text.y = element_text(size = 14, color = "black"),
+        legend.title = element_blank(),
+        plot.margin = unit(c(0,0,0,0), "cm"),
+        #axis.text.x = element_text(size = 13, angle = 45, vjust = 1, hjust=1)) +
+        axis.text.x = element_blank()) +
+  guides(color = guide_legend(override.aes = list(size = 4)))
+
+p_leaf
+
+# plot betalain quantification from flower (betacyanins nudged to the left, betaxanthins to the right)
+p_flower <- ggplot(data = melted_quant %>% filter(tissue == "flower")) +
+  # geom_boxplot(aes(x = accession, 
+  #                y = content, 
+  #                fill = metabolite),
+  #            color = "black",
+  #            #color = "red4",
+  #            size = 0.4) +
+  geom_point(data = melted_quant %>% filter(tissue == "flower", metabolite == "mBetacyanin_in_1g_in_mg"),
+             aes(x = accession, y = content),
+             size = 1.8,
+             color = "red3",
+             position = position_nudge_any(x = -0.2,
+                                           y = 0,
+                                           position_beeswarm(priority = "random",
+                                                             cex = 1.2))) +
+  geom_point(data = melted_quant %>% filter(tissue == "flower", metabolite == "mBetaxanthin_in_1g_in_mg"),
+             aes(x = accession, y = content),
+             color = "#F0B327",
+             size = 1.8,
+             position = position_nudge_any(x = 0.2,
+                                           y = 0,
+                                           position_beeswarm(priority = "random",
+                                                             cex = 1.2))) +
+  geom_segment(data = mean_expression %>% filter(tissue == "flower", metabolite == "mBetacyanin_in_1g_in_mg"), # mean betacyanin content
+               aes(x = as.numeric(accession) - 0.35,
+                   xend = as.numeric(accession) - 0.05,
+                   y = mean_content,
+                   yend = mean_content),
+               linewidth = 0.9,
+               color = "black") +
+    geom_segment(data = mean_expression %>% filter(tissue == "flower", metabolite == "mBetaxanthin_in_1g_in_mg"), # mean betaxanthin content
+               aes(x = as.numeric(accession) + 0.05,
+                   xend = as.numeric(accession) + 0.35,
+                   y = mean_content,
+                   yend = mean_content),
+               linewidth = 0.9,
+               color = "black") +
+  geom_text(data = flower_bc_groups, # tukey groups of the flower betacyanin content
+          aes(x = as.numeric(factors) - 0.19,
+              y = 1.15,
+              label = groups), 
+          size=8, 
+          inherit.aes = F,
+          color = "red3") +  
+  geom_text(data = flower_bx_groups, # tukey groups of the flower betaxanthin content
+          aes(x = as.numeric(factors) + 0.19,
+              y = 1.15,
+              label = groups), 
+          size=8, 
+          inherit.aes = F,
+          color = "#F0B327") +    
+  scale_fill_manual(values = c( "red3","#F0B327"), labels = c("Betacyanins", "Betaxanthins")) +
+  coord_cartesian(ylim = c(0, 1.2)) +
+  labs(y = "Betalain content\n [mg/g]", color = "Wavelength") +
+  theme_classic() +
+  theme(text = element_text(size = 24),
+        axis.title.x = element_blank(),
+        #axis.title.y = element_text(size = 17),
+        legend.title = element_blank(),
+        plot.margin = unit(c(0,0,0,0), "cm"),
+        #axis.text.x = element_text(size = 13, angle = 45, vjust = 1, hjust=1)) +
+        axis.text.x = element_blank(),
+        axis.title.y = element_blank(),
+        axis.text.y = element_blank())
+
+p_flower
+
+patchplot_betalains <- p_flower / p_leaf + # combine the flower and the leaf panel with patchwork
+  plot_annotation(tag_levels = "A") +
+  plot_layout(guides = "collect", heights = c(1,1)) &
+  theme(legend.position = "bottom",
+        legend.direction = "vertical",
+        #plot.margin = unit(c(0,0,0,0.1), "cm"),
+        plot.tag = element_text(vjust = 2, size = 24))
+# show the combined figure
+patchplot_betalains
+# save the combined figure as png in the plot directory
+ggsave(filename = "../../plots/betalain_quantification/betalain_quant_photometric_content.png",
+       plot = patchplot_betalains,
+       bg = "white",
+       dpi = 450,
+       width = 7,
+       height = 9)
+```
+
+
+HPLC quantification
+
+```{r}
+# hplc quantification: read the LC-MS sheet (one *_ratio column per compound)
+hplc_quantification <- read_excel(path = "../../raw_data/betalain_quantification/Betalain_quantification_summary.xlsx",
+                                      sheet = "LC-MS_quantification")
+
+# get blank measurements (rows whose sample_name contains "Blank")
+blank_amaranthin <- hplc_quantification[grep("Blank", hplc_quantification$sample_name), "Amaranthin_ratio"]
+blank_betanin <- hplc_quantification[grep("Blank", hplc_quantification$sample_name), "Betanin_ratio"] # not detected
+blank_betalamic_acid <- hplc_quantification[grep("Blank", hplc_quantification$sample_name), "Betalamic_acid_ratio"] # not detected
+blank_vulgaxanthin_IV <- hplc_quantification[grep("Blank", hplc_quantification$sample_name), "Vulgaxanthin_IV_ratio"] # not detected
+
+# join with the photometric table (adds the sample metadata), replace NA with 0, add a unique identifier per individual
+hplc_quant_joined <- inner_join(hplc_quantification, betalain_quantification) %>%
+  mutate_all(~replace(., is.na(.), 0)) %>%
+  mutate(uniq_ind = paste0(accession, "_", line, "_", individual))
+
+# adjust measurements for different input weights (leaf samples only); every compound column is overwritten in place
+hplc_quant_joined <- hplc_quant_joined %>%
+  mutate(Amaranthin_ratio = ifelse(tissue == "leaf", Amaranthin_ratio * (0.0167/0.037), Amaranthin_ratio),
+         Betanin_ratio = ifelse(tissue == "leaf", Betanin_ratio * (0.0167/0.037), Betanin_ratio),
+         Betalamic_acid_ratio = ifelse(tissue == "leaf", Betalamic_acid_ratio * (0.0167/0.037), Betalamic_acid_ratio),
+         Vulgaxanthin_IV_ratio = ifelse(tissue == "leaf", Vulgaxanthin_IV_ratio * (0.0167/0.037), Vulgaxanthin_IV_ratio),
+         Betalamic_ratio = Betalamic_acid_ratio) # same values under the shorter name, kept for code that may still use it
+# subtract the amaranthin blank from the actual measurements (the other compounds were not detected in the blank), fix the accession order
+hplc_quant_normalised <- hplc_quant_joined %>%
+  mutate(Amaranthin_ratio = Amaranthin_ratio - blank_amaranthin[[1]],
+         accession = factor(accession, levels = c("PI 576485","PI 538323","PI 686465")))
+
+# replace negative values with 0
+hplc_quant_normalised[hplc_quant_normalised < 0] <- 0
+
+
+# AUC means per tissue and accession
+hplc_AUC_means <- hplc_quant_normalised %>%
+  group_by(tissue, accession) %>%
+  summarise(amaranthin_mean = mean(Amaranthin_ratio),
+            betanin_mean = mean(Betanin_ratio),
+            betalamic_acid_mean = mean(Betalamic_acid_ratio),
+            vulgaxanthin_IV_mean = mean(Vulgaxanthin_IV_ratio)) %>%
+  unique()
+
+
+```
+
+
+Statistical analysis
+
+```{r}
+# conduct anova between BSA parents, separately for each tissue: split the normalised data first
+hplc_leaf <- hplc_quant_normalised %>%
+  filter(tissue == "leaf")
+hplc_flower <- hplc_quant_normalised %>%
+  filter(tissue == "flower")
+
+
+anova_and_tukey <- function(data, column){
+  # Fit a one-way anova of `column` on accession and run a tukey post hoc test.
+  # data: data frame with an accession column, column: name of the response column
+  # anova: <column> ~ accession
+  aov_out <- aov(reformulate("accession", response = column), data = data)
+  # tukey post hoc test between accessions
+  tukey <- HSD.test(aov_out, trt = "accession")
+  # grouping table of the post hoc test, rownames (accessions) as factor in plotting order
+  letter_groups <- tukey$groups
+  letter_groups$factors <- factor(rownames(letter_groups),
+                                  levels = c("PI 576485","PI 538323","PI 686465"))
+  # unnamed list: [[1]] anova fit, [[2]] post hoc result, [[3]] grouping table
+  list(aov_out, tukey, letter_groups)
+}
+
+# amaranthin leaf: print the anova table ([[1]]) and the post hoc result ([2])
+amaranthin_leaf <- anova_and_tukey(data = hplc_leaf, column = "Amaranthin_ratio")
+summary(amaranthin_leaf[[1]])
+amaranthin_leaf[2]
+
+# amaranthin flower
+amaranthin_flower <- anova_and_tukey(data = hplc_flower, column = "Amaranthin_ratio")
+summary(amaranthin_flower[[1]])
+amaranthin_flower[2]
+
+# betanin leaf
+betanin_leaf <- anova_and_tukey(data = hplc_leaf, column = "Betanin_ratio")
+summary(betanin_leaf[[1]])
+betanin_leaf[2]
+
+# betanin flower
+betanin_flower <- anova_and_tukey(data = hplc_flower, column = "Betanin_ratio")
+summary(betanin_flower[[1]])
+betanin_flower[2]
+
+# betalamic acid leaf
+ba_leaf <- anova_and_tukey(data = hplc_leaf, column = "Betalamic_acid_ratio")
+summary(ba_leaf[[1]])
+ba_leaf[2]
+
+# betalamic acid flower
+ba_flower <- anova_and_tukey(data = hplc_flower, column = "Betalamic_acid_ratio")
+summary(ba_flower[[1]])
+ba_flower[2]
+
+# vulgaxanthin leaf (only the anova table is printed)
+vulgaxanthin_leaf <- anova_and_tukey(data = hplc_leaf, column = "Vulgaxanthin_IV_ratio")
+summary(vulgaxanthin_leaf[[1]]) # no significant difference
+
+# vulgaxanthin flower (only the anova table is printed)
+vulgaxanthin_flower <- anova_and_tukey(data = hplc_flower, column = "Vulgaxanthin_IV_ratio")
+summary(vulgaxanthin_flower[[1]]) # no significant difference
+```
+
+Plot results amaranthin
+
+```{r}
+# plot the amaranthin ratio in leaf tissue: beeswarm points, mean per accession as segment, tukey groups as text
+pa_leaf <- ggplot(data = hplc_quant_normalised %>% filter(tissue == "leaf")) +
+  # geom_point(aes(x = accession, 
+  #                y = Amaranthin_ratio, 
+  #                group = uniq_ind),
+  #            color = "red3",
+  #            position = position_dodge(width = 0.75),
+  #            size = 2.8) +
+  geom_beeswarm(aes(x = accession, 
+                 y = Amaranthin_ratio, 
+                 group = uniq_ind),
+             color = "red3",
+             cex = 1.5,
+             size = 2.8) +
+  geom_segment(data = hplc_AUC_means %>% filter(tissue == "leaf"), # mean amaranthin ratio per accession
+                aes(x = as.numeric(accession)-0.2, xend = as.numeric(accession)+0.2,
+                    y = amaranthin_mean, yend = amaranthin_mean),
+                color = "black",
+                linewidth = 0.9) +
+  geom_text(data = amaranthin_leaf[[3]], # grouping table returned by anova_and_tukey
+          aes(x = as.numeric(factors),
+              y = 280,
+              label = groups),
+          size=8,
+          inherit.aes = F,
+          color = "red3") +
+  coord_cartesian(ylim = c(0, 300)) +
+  labs(y = "Amaranthin\n relative area", color = "Wavelength") +
+  scale_color_manual(values = c("red3", "red3","black","#F0B327")) +
+  theme_classic() +
+  theme(text = element_text(size = 24),
+        axis.title.x = element_blank(),
+        axis.title.y = element_text(size = 15),
+        axis.text.y = element_text(size = 14, color = "black"),
+        legend.title = element_blank(),
+        plot.margin = unit(c(0,0,0,0), "cm"),
+        #axis.text.x = element_text(size = 13, angle = 45, vjust = 1, hjust=1),
+        axis.text.x = element_blank(),
+        legend.position = "none") +
+  guides(color = guide_legend(override.aes = list(size = 4)))
+
+
+
+# Flower amaranthin panel: same layers as the leaf panel but for
+# tissue == "flower"; y-axis title and text are blanked as well.
+pa_flower <- ggplot(data = filter(hplc_quant_normalised, tissue == "flower")) +
+  geom_beeswarm(
+    mapping = aes(x = accession, y = Amaranthin_ratio, group = uniq_ind),
+    color = "red3",
+    cex = 1.5,
+    size = 2.8
+  ) +
+  # bar reaches 0.2 x-units to either side of the accession position
+  geom_segment(
+    data = filter(hplc_AUC_means, tissue == "flower"),
+    mapping = aes(
+      x = as.numeric(accession) - 0.2,
+      xend = as.numeric(accession) + 0.2,
+      y = amaranthin_mean,
+      yend = amaranthin_mean
+    ),
+    color = "black",
+    linewidth = 0.9
+  ) +
+  # `groups` labels taken from the third element of the flower test result
+  geom_text(
+    data = amaranthin_flower[[3]],
+    mapping = aes(x = as.numeric(factors), y = 280, label = groups),
+    size = 8,
+    inherit.aes = FALSE,
+    color = "red3"
+  ) +
+  coord_cartesian(ylim = c(0, 300)) +
+  labs(y = "Amaranthin\n relative area", color = "Wavelength") +
+  scale_color_manual(values = c("red3", "red3", "black", "#F0B327")) +
+  theme_classic() +
+  theme(
+    text = element_text(size = 24),
+    axis.title.x = element_blank(),
+    axis.title.y = element_blank(),
+    axis.text.y = element_blank(),
+    legend.title = element_blank(),
+    plot.margin = unit(c(0, 0, 0, 0), "cm"),
+    axis.text.x = element_blank(),
+    legend.position = "none"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 4)))
+
+```
+
+Plot results betanin
+
+```{r}
+# plot
+# Leaf betanin panel: beeswarm of Betanin_ratio per accession, a black bar at
+# betanin_mean, and the `groups` labels of betanin_leaf[[3]] at y = 280.
+pb_leaf <- ggplot(data = filter(hplc_quant_normalised, tissue == "leaf")) +
+  geom_beeswarm(
+    mapping = aes(x = accession, y = Betanin_ratio, group = uniq_ind),
+    color = "red3",
+    cex = 1.5,
+    size = 2.8
+  ) +
+  # bar reaches 0.2 x-units to either side of the accession position
+  geom_segment(
+    data = filter(hplc_AUC_means, tissue == "leaf"),
+    mapping = aes(
+      x = as.numeric(accession) - 0.2,
+      xend = as.numeric(accession) + 0.2,
+      y = betanin_mean,
+      yend = betanin_mean
+    ),
+    color = "black",
+    linewidth = 0.9
+  ) +
+  geom_text(
+    data = betanin_leaf[[3]],
+    mapping = aes(x = as.numeric(factors), y = 280, label = groups),
+    size = 8,
+    inherit.aes = FALSE,
+    color = "red3"
+  ) +
+  coord_cartesian(ylim = c(0, 300)) +
+  labs(y = "Betanin\n relative area", color = "Wavelength") +
+  scale_color_manual(values = c("red3", "red3", "black", "#F0B327")) +
+  theme_classic() +
+  # x-axis text is hidden in this panel; the legend is switched off
+  theme(
+    text = element_text(size = 24),
+    axis.title.x = element_blank(),
+    axis.title.y = element_text(size = 15),
+    axis.text.y = element_text(size = 14, color = "black"),
+    legend.title = element_blank(),
+    plot.margin = unit(c(0, 0, 0, 0), "cm"),
+    axis.text.x = element_blank(),
+    legend.position = "none"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 4)))
+
+
+
+# Flower betanin panel: same layers as the leaf panel but for
+# tissue == "flower"; y-axis title and text are blanked as well.
+pb_flower <- ggplot(data = filter(hplc_quant_normalised, tissue == "flower")) +
+  geom_beeswarm(
+    mapping = aes(x = accession, y = Betanin_ratio, group = uniq_ind),
+    color = "red3",
+    cex = 1.5,
+    size = 2.8
+  ) +
+  # bar reaches 0.2 x-units to either side of the accession position
+  geom_segment(
+    data = filter(hplc_AUC_means, tissue == "flower"),
+    mapping = aes(
+      x = as.numeric(accession) - 0.2,
+      xend = as.numeric(accession) + 0.2,
+      y = betanin_mean,
+      yend = betanin_mean
+    ),
+    color = "black",
+    linewidth = 0.9
+  ) +
+  # `groups` labels taken from the third element of the flower test result
+  geom_text(
+    data = betanin_flower[[3]],
+    mapping = aes(x = as.numeric(factors), y = 280, label = groups),
+    size = 8,
+    inherit.aes = FALSE,
+    color = "red3"
+  ) +
+  coord_cartesian(ylim = c(0, 300)) +
+  labs(y = "Betanin\n relative area", color = "Wavelength") +
+  scale_color_manual(values = c("red3", "red3", "black", "#F0B327")) +
+  theme_classic() +
+  theme(
+    text = element_text(size = 24),
+    axis.title.x = element_blank(),
+    axis.title.y = element_blank(),
+    axis.text.y = element_blank(),
+    legend.title = element_blank(),
+    plot.margin = unit(c(0, 0, 0, 0), "cm"),
+    axis.text.x = element_blank(),
+    legend.position = "none"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 4)))
+```
+
+Plot results betalamic acid
+
+```{r}
+# plot
+# Leaf betalamic acid panel: grey beeswarm of Betalamic_acid_ratio per
+# accession, a black bar at betalamic_acid_mean, and the `groups` labels of
+# ba_leaf[[3]] at y = 280.
+pba_leaf <- ggplot(data = filter(hplc_quant_normalised, tissue == "leaf")) +
+  geom_beeswarm(
+    mapping = aes(x = accession, y = Betalamic_acid_ratio, group = uniq_ind),
+    color = "grey40",
+    cex = 1.5,
+    size = 2.8
+  ) +
+  # bar reaches 0.2 x-units to either side of the accession position
+  geom_segment(
+    data = filter(hplc_AUC_means, tissue == "leaf"),
+    mapping = aes(
+      x = as.numeric(accession) - 0.2,
+      xend = as.numeric(accession) + 0.2,
+      y = betalamic_acid_mean,
+      yend = betalamic_acid_mean
+    ),
+    color = "black",
+    linewidth = 0.9
+  ) +
+  geom_text(
+    data = ba_leaf[[3]],
+    mapping = aes(x = as.numeric(factors), y = 280, label = groups),
+    size = 8,
+    inherit.aes = FALSE,
+    color = "grey40"
+  ) +
+  coord_cartesian(ylim = c(0, 300)) +
+  labs(y = "Betalamic acid\n relative area", color = "Wavelength") +
+  scale_color_manual(values = c("red3", "red3", "black", "#F0B327")) +
+  theme_classic() +
+  # x-axis text is hidden in this panel; the legend is switched off
+  theme(
+    text = element_text(size = 24),
+    axis.title.x = element_blank(),
+    axis.title.y = element_text(size = 15),
+    axis.text.y = element_text(size = 14, color = "black"),
+    legend.title = element_blank(),
+    plot.margin = unit(c(0, 0, 0, 0), "cm"),
+    axis.text.x = element_blank(),
+    legend.position = "none"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 4)))
+
+
+
+# Flower betalamic acid panel: same layers as the leaf panel but for
+# tissue == "flower"; y-axis title and text are blanked as well.
+pba_flower <- ggplot(data = filter(hplc_quant_normalised, tissue == "flower")) +
+  geom_beeswarm(
+    mapping = aes(x = accession, y = Betalamic_acid_ratio, group = uniq_ind),
+    color = "grey40",
+    cex = 1.5,
+    size = 2.8
+  ) +
+  # bar reaches 0.2 x-units to either side of the accession position
+  geom_segment(
+    data = filter(hplc_AUC_means, tissue == "flower"),
+    mapping = aes(
+      x = as.numeric(accession) - 0.2,
+      xend = as.numeric(accession) + 0.2,
+      y = betalamic_acid_mean,
+      yend = betalamic_acid_mean
+    ),
+    color = "black",
+    linewidth = 0.9
+  ) +
+  # `groups` labels taken from the third element of the flower test result
+  geom_text(
+    data = ba_flower[[3]],
+    mapping = aes(x = as.numeric(factors), y = 280, label = groups),
+    size = 8,
+    inherit.aes = FALSE,
+    color = "grey40"
+  ) +
+  coord_cartesian(ylim = c(0, 300)) +
+  labs(y = "Betalamic acid\n relative area", color = "Wavelength") +
+  scale_color_manual(values = c("red3", "red3", "black", "#F0B327")) +
+  theme_classic() +
+  theme(
+    text = element_text(size = 24),
+    axis.title.x = element_blank(),
+    axis.title.y = element_blank(),
+    axis.text.y = element_blank(),
+    legend.title = element_blank(),
+    plot.margin = unit(c(0, 0, 0, 0), "cm"),
+    axis.text.x = element_blank(),
+    legend.position = "none"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 4)))
+```
+
+Plot results vulgaxanthin
+
+```{r}
+# plot
+# Leaf vulgaxanthin IV panel: yellow beeswarm of Vulgaxanthin_IV_ratio per
+# accession plus a black bar at vulgaxanthin_IV_mean. No text labels are drawn
+# here, and the x axis carries the accession names with a "leaf" suffix.
+pv_leaf <- ggplot(data = filter(hplc_quant_normalised, tissue == "leaf")) +
+  geom_beeswarm(
+    mapping = aes(x = accession, y = Vulgaxanthin_IV_ratio, group = uniq_ind),
+    color = "#F0B327",
+    method = "swarm",
+    cex = 1.5,
+    size = 2.8
+  ) +
+  # bar reaches 0.2 x-units to either side of the accession position
+  geom_segment(
+    data = filter(hplc_AUC_means, tissue == "leaf"),
+    mapping = aes(
+      x = as.numeric(accession) - 0.2,
+      xend = as.numeric(accession) + 0.2,
+      y = vulgaxanthin_IV_mean,
+      yend = vulgaxanthin_IV_mean
+    ),
+    color = "black",
+    linewidth = 0.9
+  ) +
+  # limits are set on the scale via ylim() here, not via coord_cartesian()
+  ylim(c(0, 300)) +
+  labs(y = "Vulgaxanthin IV\n relative area", color = "Wavelength") +
+  theme_classic() +
+  scale_x_discrete(
+    labels = c("PI 576485\n leaf", "PI 538323\n leaf", "PI 686465\n leaf")
+  ) +
+  theme(
+    text = element_text(size = 24),
+    axis.title.x = element_blank(),
+    axis.title.y = element_text(size = 15),
+    axis.text.y = element_text(size = 14, color = "black"),
+    axis.text.x = element_text(size = 15, color = "black"),
+    legend.title = element_blank(),
+    plot.margin = unit(c(0, 0, 0, 0), "cm"),
+    legend.position = "none"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 4)))
+
+
+
+# Flower vulgaxanthin IV panel: same layers as the leaf panel but for
+# tissue == "flower"; y-axis title and text are blanked and the x axis carries
+# the accession names with a "flower" suffix.
+pv_flower <- ggplot(data = filter(hplc_quant_normalised, tissue == "flower")) +
+  geom_beeswarm(
+    mapping = aes(x = accession, y = Vulgaxanthin_IV_ratio, group = uniq_ind),
+    color = "#F0B327",
+    method = "swarm",
+    cex = 1.5,
+    size = 2.8
+  ) +
+  # bar reaches 0.2 x-units to either side of the accession position
+  geom_segment(
+    data = filter(hplc_AUC_means, tissue == "flower"),
+    mapping = aes(
+      x = as.numeric(accession) - 0.2,
+      xend = as.numeric(accession) + 0.2,
+      y = vulgaxanthin_IV_mean,
+      yend = vulgaxanthin_IV_mean
+    ),
+    color = "black",
+    linewidth = 0.9
+  ) +
+  # limits are set on the scale via ylim() here, not via coord_cartesian()
+  ylim(c(0, 300)) +
+  labs(y = "Vulgaxanthin IV\n relative area", color = "Wavelength") +
+  theme_classic() +
+  scale_x_discrete(
+    labels = c("PI 576485\n flower", "PI 538323\n flower", "PI 686465\n flower")
+  ) +
+  theme(
+    text = element_text(size = 24),
+    axis.title.x = element_blank(),
+    legend.title = element_blank(),
+    plot.margin = unit(c(0, 0, 0, 0), "cm"),
+    axis.title.y = element_blank(),
+    axis.text.y = element_blank(),
+    axis.text.x = element_text(size = 15, color = "black"),
+    legend.position = "none"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 4)))
+```
+
+
+
+Overview of all quantified metabolites, including the photometric quantification
+
+```{r}
+# Assemble all ten panels into a two-column grid, filled row by row (leaf on
+# the left, flower on the right), tag them "(a)", "(b)", ... and collect the
+# legends. The `&` applies the theme settings to every panel of the patchwork.
+overview_patchplot <-
+  p_leaf + p_flower +
+  pa_leaf + pa_flower +
+  pb_leaf + pb_flower +
+  pba_leaf + pba_flower +
+  pv_leaf + pv_flower +
+  plot_annotation(tag_levels = "a", tag_prefix = "(", tag_suffix = ")") +
+  plot_layout(ncol = 2, byrow = TRUE, guides = "collect") &
+  theme(
+    legend.position = "bottom",
+    legend.direction = "vertical",
+    plot.tag = element_text(vjust = 2, size = 17, face = "bold")
+  )
+
+# write the overview figure (10 x 13 at 450 dpi, white background)
+ggsave(
+  filename = "../../plots/betalain_quantification/overview_betalain_quantification.png",
+  plot = overview_patchplot,
+  width = 10,
+  height = 13,
+  dpi = 450,
+  bg = "white"
+)
+```
+
+
+Combine specific plots
+
+
+```{r}
+# Figure layout: an empty spacer row on top, followed by three rows holding the
+# leaf/flower pairs of p_*, pa_* and pv_*. Tags are supplied explicitly and
+# start at "(b)".
+# NOTE(review): the spacer presumably reserves room for a picture tagged "(a)"
+# that is added outside R -- confirm.
+photo_hplc_patchplot <- plot_spacer() /
+  (p_leaf + p_flower) /
+  (pa_leaf + pa_flower) /
+  (pv_leaf + pv_flower) +
+  plot_annotation(
+    tag_levels = list(c("(b)", "(c)", "(d)", "(e)", "(f)", "(g)"))
+  ) +
+  plot_layout(widths = c(0.5, 0.5), guides = "collect") &
+  theme(
+    legend.position = "bottom",
+    plot.tag = element_text(size = 17, face = "bold")
+  )
+
+# write the combined figure (10 x 10 at 450 dpi, white background)
+ggsave(
+  filename = "../../plots/betalain_quantification/quantification_without_picture.png",
+  plot = photo_hplc_patchplot,
+  width = 10,
+  height = 10,
+  dpi = 450,
+  bg = "white"
+)
+```
+
+
+Analyse HPLC results from amaranth roots:
+
+```{r}
+# Read the root quantification sheet and build a sample_id factor of the form
+# "plate_<plate>_<root_type>" with a fixed level order.
+root_quant <- read_excel(
+  path = "../../raw_data/betalain_quantification/Betalain_quantification_summary.xlsx",
+  sheet = "transgenic_roots_LC-MS_quantification"
+) %>%
+  mutate(
+    sample_id = factor(
+      paste0("plate_", plate, "_", root_type),
+      levels = c("plate_3_white", "plate_3_red", "plate_4_white", "plate_4_red")
+    )
+  )
+
+# Normalisation for solvent volume and input weight:
+# initial solvent volume 75 instead of 275, input weight 8.2 mg instead of 16.7 mg
+norm_factor <- (75/275) * (0.0167/0.0082)
+root_quant_norm <- mutate(
+  root_quant,
+  Amaranthin_ratio = Amaranthin_ratio * norm_factor,
+  plant_replicate = factor(plant_replicate)
+)
+
+
+# Bar chart of the normalised amaranthin ratio per plant replicate, one bar per
+# root type, with the rounded value printed above each bar.
+root_quant_plot <- ggplot(data = root_quant_norm) +
+  geom_col(
+    mapping = aes(x = plant_replicate, y = Amaranthin_ratio, fill = root_type),
+    color = "black",
+    position = position_dodge2(reverse = TRUE)
+  ) +
+  # labels are dodged like the bars so each value sits over its own bar
+  geom_text(
+    mapping = aes(
+      x = plant_replicate,
+      y = Amaranthin_ratio,
+      label = round(Amaranthin_ratio, 2),
+      group = root_type
+    ),
+    position = position_dodge2(width = 0.9, reverse = TRUE),
+    vjust = -0.2,
+    size = 5
+  ) +
+  scale_x_discrete(labels = c("Ind 1", "Ind 2")) +
+  coord_cartesian(ylim = c(0, 450)) +
+  labs(y = "Amaranthin\n relative area", x = "") +
+  scale_fill_manual(values = c("red3", "beige")) +
+  theme_classic() +
+  theme(
+    text = element_text(size = 24),
+    legend.position = "none"
+  )
+root_quant_plot
+
+# write the figure (7 x 5 at 450 dpi, white background)
+ggsave(
+  filename = "../../plots/betalain_quantification/root_quantification.png",
+  plot = root_quant_plot,
+  width = 7,
+  height = 5,
+  dpi = 450,
+  bg = "white"
+)
+
+
+```
+
+
diff --git a/workflows/BSA/bsa_and_plotting.R b/workflows/BSA/bsa_and_plotting.R
new file mode 100644
index 0000000000000000000000000000000000000000..4f9f2667b3ec478e80153811738a96efea580f7c
--- /dev/null
+++ b/workflows/BSA/bsa_and_plotting.R
@@ -0,0 +1,362 @@
+
+setwd("/home/tom/Documents/projects/Ahyp_v2_2_publication/")
+
+library(QTLseqr)
+library(tidyverse)
+library(cowplot)
+theme_set(theme_cowplot())
+library(gggenes)
+library(ggtranscript)
+library(ape)
+library(patchwork)
+
+
+# Run the bulk segregant analysis for one pair of bulks with QTLseqr.
+#
+# Arguments:
+#   rawData   file handed to importFromGATK() (`file`)
+#   HighBulk  name of the high bulk sample in that table (`highBulk`)
+#   LowBulk   name of the low bulk sample in that table (`lowBulk`)
+#   Chroms    chromosome names to import (`chromList`)
+#   nhigh     size of the high bulk, first entry of `bulkSize`
+#   nlow      size of the low bulk, second entry of `bulkSize`
+#
+# Returns the filtered SNP table after runGprimeAnalysis() and
+# runQTLseqAnalysis() have been applied to it.
+bsa_analysis <- function(rawData,HighBulk,LowBulk,Chroms,nhigh,nlow){
+  df <- importFromGATK(file = rawData,
+                       highBulk = HighBulk,
+                       lowBulk = LowBulk,
+                       chromList = Chroms)
+  
+  # keep the columns listed below; CHROM becomes a factor of the scaffold number
+  # (the "Scaffold_" prefix is stripped), POS is made numeric, and rows whose
+  # REF or ALT is '*' are dropped
+  df <- df %>% select(CHROM,
+                      POS,
+                      REF,
+                      ALT,
+                      AD_REF.LOW,
+                      AD_ALT.LOW,
+                      DP.LOW,
+                      GQ.LOW,
+                      PL.LOW,
+                      SNPindex.LOW,
+                      AD_REF.HIGH,
+                      AD_ALT.HIGH,
+                      DP.HIGH,
+                      GQ.HIGH,
+                      PL.HIGH,
+                      SNPindex.HIGH,
+                      REF_FRQ,
+                      deltaSNP) %>%
+    mutate(CHROM=as.factor(as.numeric(gsub("Scaffold_","",CHROM))),
+           POS=as.numeric(POS)) %>%
+    filter(REF!='*',ALT!='*')
+  
+  # SNP filtering; all thresholds are fixed here and not exposed as arguments
+  df_filt <-filterSNPs(SNPset = df,
+                       refAlleleFreq = 0.2,
+                       minTotalDepth = 50,
+                       maxTotalDepth = 100,
+                       minSampleDepth = 20,
+                       minGQ = 99,
+                       verbose = TRUE)
+  
+  # G' analysis and QTL-seq analysis, both with a 2e6 window and filter = 0.4;
+  # both results are accumulated in the same table
+  df_filt <- runGprimeAnalysis(
+    SNPset = df_filt,
+    windowSize = 2e6,
+    filter = 0.4,
+    outlierFilter = "deltaSNP")
+  df_filt <- runQTLseqAnalysis(
+    SNPset = df_filt,
+    windowSize = 2e6,
+    popStruc = "RIL",
+    bulkSize =  c(nhigh, nlow),
+    replications = 10000,
+    filter = 0.4,
+    intervals = c(95, 99)
+  )
+  return(df_filt)
+}
+
+# Leaf colour bulks of AM_00332: green leaves as the high bulk, red leaves as
+# the low bulk, 80 per bulk, on Scaffold_1 ... Scaffold_16.
+AM_00332_leaf_green_red <- bsa_analysis(
+  rawData = 'data/BSA/wgs/vcf/bulk_snps05.table',
+  HighBulk = "AM_00332_gl",
+  LowBulk = "AM_00332_rl",
+  Chroms = paste0("Scaffold_", 1:16),
+  nhigh = 80,
+  nlow = 80
+)
+# Flower colour bulks of AM_00331: red flowers as the high bulk, green flowers
+# as the low bulk, 68 per bulk, on the same 16 scaffolds.
+AM_00331_flower_red_green <- bsa_analysis(
+  rawData = 'data/BSA/wgs/vcf/bulk_snps05.table',
+  HighBulk = "AM_00331_rf",
+  LowBulk = "AM_00331_gf",
+  Chroms = paste0("Scaffold_", 1:16),
+  nhigh = 68,
+  nlow = 68
+)
+
+
+# plot all results
+# flower
+# Genome-wide G' profile with one facet per chromosome and a horizontal line at
+# G' = 3; the y axis is fixed to 0-10. This variant is the one applied to the
+# flower bulk (AM_00331_flower_red_green) further down in this script.
+#
+# Arguments:
+#   Gresults        data frame with at least POS, Gprime and CHROM columns
+#   betalain_genes  not used by this function; kept so existing calls stay valid
+# Returns the ggplot object.
+plotGresults <- function(Gresults,betalain_genes){
+  # fixed G' value at which the horizontal threshold line is drawn
+  gprime_threshold <- 3
+  
+  p1 <- ggplot()+
+    geom_line(data=Gresults,aes(POS/1e6,Gprime), linewidth=2) +
+    labs(x= 'Position (Mb)',y= "G' value") +
+    # NOTE(review): 'dashed' and the colour are given inside aes(), so they go
+    # through the default discrete scales instead of being applied literally.
+    # Left as is so the rendered figure does not change.
+    geom_hline(data=data.frame(yint=gprime_threshold),
+               aes(yintercept =yint,
+                   linetype ='dashed',
+                   color=alpha('red',0.6)), 
+               linewidth=1.7)+
+    facet_grid(.~CHROM, space = 'free_x',scales='free_x') +
+    ylim(0,10) +
+    theme(strip.background = element_rect(fill = alpha('lightblue',0.2)),
+          strip.text = element_text(size=30)) +
+    theme(legend.position="none",
+          axis.text.y = element_text(size=40),
+          axis.title.y = element_text(size=40),
+          axis.title.x = element_text(size=40),
+          axis.text.x = element_text(size = 20),
+          panel.spacing.x = unit(6, "mm"),
+          axis.line = element_line(linewidth = 2),
+          axis.ticks = element_line(linewidth = 1.5),
+          axis.ticks.length = unit(.25, "cm")) +
+    # Only x scale of the plot. An earlier scale_x_continuous(breaks =
+    # c(0,10,20,30)) in this chain was replaced by this call and never took
+    # effect, so it has been dropped; add `breaks` here if fixed breaks are wanted.
+    scale_x_continuous(guide = guide_axis(check.overlap = T))
+  
+  return(p1)
+}
+# leaf
+# Genome-wide G' profile with one facet per chromosome and a horizontal line at
+# G' = 3; the y axis is fixed to 0-8. This variant is the one applied to the
+# leaf bulk (AM_00332_leaf_green_red) further down in this script.
+#
+# Arguments:
+#   Gresults        data frame with at least POS, Gprime and CHROM columns
+#   betalain_genes  not used by this function; kept so existing calls stay valid
+# Returns the ggplot object.
+plotGresults1 <- function(Gresults,betalain_genes){
+  # fixed G' value at which the horizontal threshold line is drawn
+  gprime_threshold <- 3
+  
+  p1 <- ggplot()+
+    geom_line(data=Gresults,aes(POS/1e6,Gprime), linewidth=2) +
+    labs(x= 'Position (Mb)',y= "G' value") +
+    # NOTE(review): 'dashed' and the colour are given inside aes(), so they go
+    # through the default discrete scales instead of being applied literally.
+    # Left as is so the rendered figure does not change.
+    geom_hline(data=data.frame(yint=gprime_threshold),
+               aes(yintercept =yint,
+                   linetype ='dashed',
+                   color=alpha('red',0.6)), 
+               linewidth=1.7)+
+    facet_grid(.~CHROM,space = 'free_x',scales='free_x') +
+    ylim(0,8) +
+    theme( strip.background = element_rect(fill = alpha('lightblue',0.2)),
+           strip.text = element_text(size=30)) +
+    theme(legend.position="none",
+          axis.text.y = element_text(size=40),
+          axis.title = element_text(size=40),
+          axis.text.x = element_text(size = 20),
+          panel.spacing.x = unit(6, "mm"),
+          axis.line = element_line(linewidth = 2),
+          axis.ticks = element_line(linewidth = 1.5),
+          axis.ticks.length = unit(.25, "cm")) +
+    # Only x scale of the plot. An earlier scale_x_continuous(breaks =
+    # c(0,10,20,30)) in this chain was replaced by this call and never took
+    # effect, so it has been dropped; add `breaks` here if fixed breaks are wanted.
+    scale_x_continuous(guide = guide_axis(check.overlap = T))
+  
+  return(p1)
+}
+
+# plot individual chromosomes
+# flower
+# G' profile of a single chromosome with gene arrows drawn at y = 9.5 and a
+# horizontal line at G' = 3; the y axis is fixed to 0-10.
+#
+# Arguments:
+#   Gresults  data frame with at least POS, Gprime and CHROM columns
+#   chr       value of CHROM to plot
+#   genes     annotation table with CHROM, type, start, end and attributes
+#             columns; only rows with type == "transcript" on `chr` are drawn
+# Returns the ggplot object.
+plotGqtl <- function(Gresults,chr,genes){
+  
+  # fixed G' value at which the horizontal threshold line is drawn
+  gprime_threshold <- 3
+  
+  ggplot()+
+    geom_line(data=filter(Gresults,CHROM==chr),aes(POS/1e6,Gprime),linewidth=2) +
+    labs(x= 'Position (Mb)',y= "G' value") +
+    scale_x_continuous(breaks = c(0,10,20,30))+
+    # The colour of this line is mapped inside aes(), so it shares the manual
+    # colour scale at the end of the chain with the gene arrows.
+    geom_hline(data=data.frame(yint=gprime_threshold),
+               aes(yintercept = yint, 
+                   linetype ='dashed', 
+                   color=alpha('red',0.6)),
+               linewidth = 2) +
+    facet_grid(.~CHROM,space = 'free_x',scales='free_x') + 
+    theme(panel.spacing.x=unit(0.25, "lines")) +
+    ylim(0,10) +
+    theme( strip.background = element_rect(fill = alpha('lightblue',0.2)),
+           strip.text = element_text(size=30)) +
+    # y-axis text is removed and the y title is drawn in white
+    # (NOTE(review): presumably to keep the panel size aligned with the
+    # genome-wide plot it is placed next to -- confirm)
+    theme(legend.position="none",
+          axis.text.y = element_blank(),
+          axis.text.x = element_text(size=20),
+          axis.title.x = element_text(size=40),
+          axis.title.y = element_text(color = "white", size = 35),
+          axis.line = element_line(linewidth = 2),
+          axis.ticks = element_line(linewidth = 1.5),
+          axis.ticks.length = unit(.25, "cm")) +
+    geom_gene_arrow(data=filter(genes, CHROM==chr, type == "transcript") %>% droplevels(), 
+                    aes(xmin = start/1e6, xmax = end/1e6, y = 9.5, color = attributes), size=6) +
+    scale_color_manual(values = c(alpha('red',0.6), "black", "black","grey","grey"))
+}
+# leaf
+# Two-row figure for a single chromosome: a gene track on top (p2) and the G'
+# profile below (p1), stacked with plot_grid() at relative heights 0.3 / 0.7.
+#
+# Arguments:
+#   Gresults  data frame with at least POS, Gprime and CHROM columns
+#   chr       value of CHROM to plot
+#   genes     annotation table with CHROM, type, start, end and attributes
+#             columns; only the transcripts AHp023147.1 and AHp023148.1 are drawn
+# Returns the combined cowplot object.
+plotGqtl1 <- function(Gresults,chr,genes){
+  
+  # fixed G' value at which the horizontal threshold line is drawn
+  gprime_threshold <- 3
+  
+  p1 <- ggplot() +
+    geom_line(data=filter(Gresults,CHROM==chr),aes(POS/1e6,Gprime),linewidth=2) +
+    labs(x= 'Position (Mb)',y= "G' value") +
+    scale_x_continuous(breaks = c(0,5,10,15,20,30))+
+    # NOTE(review): 'dashed' and the colour are given inside aes(), so they go
+    # through the default discrete scales instead of being applied literally.
+    # Left as is so the rendered figure does not change.
+    geom_hline(data=data.frame(yint=gprime_threshold),
+               aes(yintercept =yint, linetype ='dashed', color=alpha('red',0.6)),
+               linewidth = 2) +
+    # two fixed positions (bp converted to Mb) marked by vertical lines
+    # NOTE(review): presumably the borders of the region shown in the gene
+    # track above -- confirm
+    geom_vline(aes(xintercept = 5231549/1e6), color = "black") +
+    geom_vline(aes(xintercept = 5305973/1e6), color = "black") +
+    facet_grid(.~CHROM,space = 'free_x',scales='free_x') +
+    # Only y scale of the plot. An earlier scale_y_continuous(breaks =
+    # c(0,2,4,8)) in this chain was replaced by ylim() and never took effect,
+    # so it has been dropped.
+    ylim(0,8) +
+    theme(panel.spacing.x=unit(0.25, "lines")) +
+    # facet strip is hidden in this variant
+    theme( strip.background = element_blank(),
+           strip.text = element_blank()) +
+    theme(legend.position="none",
+          axis.text.y = element_text(size = 30),
+          axis.text.x = element_text(size=20),
+          axis.title.x = element_text(size=40),
+          axis.title.y = element_text(color = "black", size = 35),
+          axis.line = element_line(linewidth = 2),
+          axis.ticks = element_line(linewidth = 1.5),
+          axis.ticks.length = unit(.25, "cm"))
+  
+  
+  # Gene track in bp coordinates (start/end are not rescaled to Mb here).
+  # `forward = c(F,T)` and the two text labels are fixed-length vectors, so the
+  # filter below has to return exactly two rows.
+  p2 <- ggplot() +
+    geom_gene_arrow(data=filter(genes, 
+                                CHROM==chr, 
+                                type == "transcript",
+                                attributes == "ID=AHp023147.1;geneID=AHp023147" | attributes == "ID=AHp023148.1;geneID=AHp023148") %>% droplevels(),
+                    aes(xmin = start, 
+                        xmax = end, 
+                        y = "chr16", 
+                        fill = attributes, 
+                        forward = c(F,T)),
+                    size = 1.5,
+                    color = "black",
+                    arrowhead_height = unit(12, "mm"), 
+                    arrowhead_width = unit(6, "mm"), 
+                    arrow_body_height = grid::unit(6, "mm")) +
+    geom_text(aes(x = c(5246000,5290000),
+                  y = "chr16",
+                  label = c("AhDODAα1","AhCYP76AD2")),
+              size = 11,
+              nudge_y = 2.5) +
+    coord_cartesian(ylim = c(0,4)) +
+    scale_x_continuous(breaks = c(5250000, 5275000)) +
+    theme(legend.position = "none",
+          plot.margin = unit(c(0, 2, 0.5, 2), "cm"),
+          axis.line = element_line(linewidth = 2),
+          axis.ticks = element_line(linewidth = 1.5),
+          axis.text.x = element_text(size=20),
+          panel.grid.major.y = ggplot2::element_line(colour = "grey", 
+                                                     linewidth = 1),
+          axis.ticks.length = unit(.25, "cm"),
+          axis.title.y = element_blank(),
+          axis.title.x = element_blank(),
+          axis.line.y = element_blank(),
+          axis.ticks.y = element_blank(),
+          axis.text.y = element_blank()) +
+    scale_fill_manual(values = c("chocolate2","cyan3",'red'))
+  
+  # combine plots:
+  out <- plot_grid(p2,p1,
+                   nrow = 2,
+                   rel_heights = c(0.3,0.7))
+  
+  out
+}
+
+# plot all results
+# plotGresults1() (y axis 0-8) is applied to the leaf bulk and plotGresults()
+# (y axis 0-10) to the flower bulk.
+# NOTE(review): `betalain_genes` is not assigned anywhere in this script, so it
+# has to exist in the session before these calls. plotGqtl() and plotGqtl1()
+# filter it by CHROM, type and attributes and read its start/end columns.
+plot_AM_00332_leaf_green_red <- plotGresults1(AM_00332_leaf_green_red,
+                                             betalain_genes = betalain_genes)
+plot_AM_00331_flower_red_green <- plotGresults(AM_00331_flower_red_green,
+                                               betalain_genes = betalain_genes)
+
+
+
+
+# single-chromosome views of chromosome 16 for both bulks
+plotleaf16 <- plotGqtl1(AM_00332_leaf_green_red,
+                        genes = betalain_genes,
+                        chr = 16)
+
+plotflower16 <- plotGqtl(AM_00331_flower_red_green,
+                         genes = betalain_genes,
+                         chr = 16)
+# print the flower chromosome 16 plot
+plotflower16
+
+
+# use the cowplot package
+# Flower figure: genome-wide G' plot (A, 70 % of the width) next to the
+# chromosome 16 plot (B, 30 %), aligned horizontally in a single row.
+cowplot_flower <- plot_grid(
+  plot_AM_00331_flower_red_green,
+  plotflower16,
+  nrow = 1,
+  align = "h",
+  rel_widths = c(0.7, 0.3),
+  labels = c("A", "B"),
+  label_size = 34
+)
+
+# written relative to the working directory (25 x 5 at 400 dpi)
+ggsave(
+  filename = "paper_grid_flower.png",
+  plot = cowplot_flower,
+  width = 25,
+  height = 5,
+  dpi = 400,
+  bg = "white"
+)
+
+# Leaf figure: genome-wide G' plot (A, 70 % of the width) next to the
+# chromosome 16 plot with gene track (B, 30 %), aligned horizontally in a row.
+cowplot_leaf <- plot_grid(
+  plot_AM_00332_leaf_green_red,
+  plotleaf16,
+  nrow = 1,
+  align = "h",
+  rel_widths = c(0.7, 0.3),
+  labels = c("A", "B"),
+  label_size = 30
+)
+
+# written relative to the working directory (25 x 5 at 400 dpi)
+ggsave(
+  filename = "paper_grid_leaf.png",
+  plot = cowplot_leaf,
+  width = 25,
+  height = 5,
+  dpi = 400,
+  bg = "white"
+)
+
+
+# Stack the flower BSA figure on top of two existing PNG figures that are
+# loaded from disk and wrapped with ggdraw() so they can sit in the grid.
+pathway_plot <- ggdraw() +
+  draw_image("plots/betalain_pathway_expression.png")
+
+alignment_plot <- ggdraw() +
+  draw_image("plots/AmMYB2_figure/S6_betalain_myb_R3.png")
+
+# three rows; the first keeps its own A/B labels, so only C and D are added here
+MYB_plot <- plot_grid(
+  cowplot_flower,
+  pathway_plot,
+  alignment_plot,
+  nrow = 3,
+  align = "v",
+  rel_heights = c(0.25, 0.75, 0.35),
+  labels = c("", "C", "D"),
+  label_size = 34
+)
+
+# write the combined figure (25 x 25 at 400 dpi, white background)
+ggsave(
+  filename = "plots/paper_myb_combined_alignment.png",
+  plot = MYB_plot,
+  width = 25,
+  height = 25,
+  dpi = 400,
+  bg = "white"
+)
+
+
+
+
+
+
diff --git a/workflows/BSA/read_count_analysis.Rmd b/workflows/BSA/read_count_analysis.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..4154c86c51ede7b0b6f006ccf5b84574c203946f
--- /dev/null
+++ b/workflows/BSA/read_count_analysis.Rmd
@@ -0,0 +1,296 @@
+---
+title: "betalain_gene_expression_flower_bulk"
+author: "twinkle1"
+date: '2022-10-07'
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(DESeq2)
+library(factoextra)
+library(patchwork)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publication/")
+```
+
+
+```{r}
+########################## Load the manually curated colour pathway genes
+# one row per gene; the three columns are renamed to pathway_gene, pathway and gene_id
+pathway_genes <- read.csv("data/manual_sheets/color_pathway_genes.csv", header = T)
+names(pathway_genes) <- c("pathway_gene", "pathway", "gene_id")
+
+# split the table by pathway
+betalain.genes <- filter(pathway_genes, pathway == "Betalain")
+flavonoid.genes <- filter(pathway_genes, pathway == "Flavonoid")
+
+```
+
+
+Transcript level gene expression quantification from kallisto:
+
+```{r}
+# vector of input directories: kallisto_quant/ holds one sub-directory per bulk
+quant_dir <- "data/flower_color_mapping/kallisto_quant/"
+sample_names <- dir(path = quant_dir)
+
+# read the abundance table of every bulk and stack them into one long table.
+# lapply() + bind_rows() avoids growing the table with rbind() inside a loop and
+# also copes with an empty directory (1:length() would iterate over c(1, 0)).
+kallisto_quant <- lapply(sample_names, function(sample_name) {
+  read_table(file = paste0(quant_dir, sample_name, "/abundance.tsv")) %>%
+    # keep relevant columns only; transmute() keeps one row per transcript
+    # (summarise() returning several rows per group is deprecated since dplyr 1.1.0)
+    transmute(transcript_id = target_id,
+              tpm = tpm,
+              bulk = sample_name)
+}) %>%
+  bind_rows()
+kallisto_quant$bulk <- as.factor(kallisto_quant$bulk)
+
+# quick quality control, that each bulk has the correct number of transcripts
+kallisto_quant %>%
+  group_by(bulk) %>%
+  tally()
+
+# add gene id column to table: the first nine characters of the transcript id
+kallisto_quant <- kallisto_quant %>%
+  mutate(gene_id = substr(transcript_id, 1, 9))
+
+# check gene expression only for betalain pathway genes
+betalain_quant <- kallisto_quant %>%
+  filter(gene_id %in% betalain.genes$gene_id)
+
+# add betalain_gene information
+betalain_quant <- left_join(betalain_quant, betalain.genes, by = "gene_id")
+
+# which genes are not expressed? Expression under 0.5 TPM in all samples
+# grouping by transcript and gene name gives exactly one summary row per transcript
+betalain_quant %>%
+  group_by(transcript_id, pathway_gene) %>%
+  summarise(max_tpm = max(tpm), .groups = "drop") %>%
+  mutate(expressed = ifelse(max_tpm >= 0.5, "expressed", "no expression"))
+```
+
+
+Plot each gene:
+
+```{r}
+# Create one TPM bar plot per transcript, with one bar per bulk.
+#
+# gene_ID_list:   character vector of transcript ids (matched against betalain_quant$transcript_id)
+# gene_name_list: character vector of gene names used in the plot titles
+# gene_ID_list and gene_name_list should be vectors with the same length
+#
+# Reads the global table betalain_quant. The hard-coded colours and x axis labels
+# assume four bulks in the order of the levels of betalain_quant$bulk.
+# Returns a list of ggplot objects, one per entry of gene_ID_list.
+plot_betalain_counts <- function(gene_ID_list, gene_name_list){
+  output <- list()
+  # seq_along() instead of 1:length(): an empty gene_ID_list gives an empty list, not an error
+  for (i in seq_along(gene_ID_list)){
+      # create dataframe with counts for a particular gene
+      transcript_tpm <- betalain_quant %>%
+        filter(transcript_id == gene_ID_list[i])
+      # plot counts
+      output[[i]] <- ggplot(data=transcript_tpm) +
+        geom_col(aes(x = bulk,
+                     y = tpm,
+                     fill = bulk)) +
+        theme_classic() +
+        # alternating green/red bars, no fill legend
+        scale_fill_manual(values = c("chartreuse3", "red3", "chartreuse3", "red3"), guide = "none") +
+        scale_shape_discrete(guide = guide_legend(override.aes = list(size=3),
+                                                  nrow = 2)) +
+        # the two bulks of each BSA share one label; colour tells them apart
+        scale_x_discrete(labels = c("Regulator BSA",
+                                    "Regulator BSA",
+                                    "Biosynthesis BSA",
+                                    "Biosynthesis BSA")) +
+        labs(title = paste0(gene_name_list[i], " (", gene_ID_list[i], ")"),
+             y = "TPM",
+             shape = "") +
+        theme(axis.title.y = element_text(size = 30),
+              axis.text.y = element_text(size = 25),
+              title = element_text(size = 25),
+              legend.position = c("bottom"),
+              axis.title.x = element_blank(),
+              axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 25))
+  }
+  return(output)
+}
+
+# save as list of plots
+# since the table is in long format (four bulks stacked), the first quarter of the rows
+# contains each transcript exactly once
+betalain_plots <- plot_betalain_counts(gene_ID_list = betalain_quant$transcript_id[1:(length(betalain_quant$transcript_id)/4)],
+                                       gene_name_list = betalain_quant$pathway_gene[1:(length(betalain_quant$transcript_id)/4)])
+
+
+# arrange the selected transcripts in three rows, tagged A, B, C, ...
+patchplot <- betalain_plots[[8]] + betalain_plots[[9]] + betalain_plots[[11]] +
+  betalain_plots[[10]] + betalain_plots[[5]] + betalain_plots[[4]] + betalain_plots[[2]] + betalain_plots[[3]] +
+  plot_layout(nrow = 3) +
+  plot_annotation(tag_levels = "A") &
+  theme(plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm"),
+        plot.tag = element_text(size = 35))
+
+# pass the combined figure explicitly instead of relying on last_plot()
+ggsave(filename = "plots/flower_mapping_expression/betalain_gene_kallisto.png",
+       plot = patchplot,
+       width = 28,
+       height = 20)
+
+
+```
+
+Produce plots of gene expression only for biosynthesis BSA:
+
+```{r}
+# gene names of the first quarter of the long table (each transcript once);
+# entries 8 and 9 are relabelled as the two AhMYB2 isoforms
+gene_names <- betalain_quant$pathway_gene[1:(nrow(betalain_quant)/4)]
+gene_names[8:9] <- c("AhMYB2.1", "AhMYB2.2")
+
+
+# Create one TPM bar plot per transcript for the two AM_00331 flower bulks only
+# (this redefines the plot_betalain_counts() function from the previous chunk).
+#
+# gene_ID_list:   character vector of transcript ids (matched against betalain_quant$transcript_id)
+# gene_name_list: character vector of plot titles, same length as gene_ID_list
+#
+# Reads the global table betalain_quant. Transcripts below the expression threshold
+# are reported and skipped, so their list entry stays empty (NULL, or missing at the
+# end of the list). Returns the list of ggplot objects.
+plot_betalain_counts <- function(gene_ID_list, gene_name_list){
+  output <- list()
+  for (i in seq_along(gene_ID_list)){
+      # create dataframe with counts for a particular gene
+      transcript_tpm <- betalain_quant %>%
+        filter(transcript_id == gene_ID_list[i],
+               bulk == "AM_00331_gf" | bulk == "AM_00331_rf")
+      # require at least 0.5 TPM in at least one sample: a transcript counts as
+      # not expressed only if even its highest TPM is below the threshold
+      # (min() would already skip it when a single bulk is below 0.5)
+      if (max(transcript_tpm$tpm) < 0.5) {
+        print(paste0(gene_name_list[i], " is not expressed!"))
+        next
+      }
+      # plot counts
+      output[[i]] <- ggplot(data=transcript_tpm) +
+        geom_col(aes(x = bulk,
+                     y = tpm,
+                     fill = bulk)) +
+        theme_classic() +
+        scale_fill_manual(values = c("chartreuse3", "red3", "chartreuse3", "red3"), guide = "none") +
+        scale_shape_discrete(guide = guide_legend(override.aes = list(size=3),
+                                                  nrow = 2)) +
+        labs(title = paste0(gene_name_list[i]),
+             y = "TPM",
+             shape = "") +
+        # large fonts and no x axis text
+        theme(axis.title.y = element_text(size = 43),
+              axis.text.y = element_text(size = 38),
+              title = element_text(size = 45),
+              legend.position = c("bottom"),
+              axis.title.x = element_blank(),
+              axis.text.x = element_blank())
+  }
+  return(output)
+}
+
+# save as list of plots
+# since the table is in long format, use each transcript only once
+betalain_plots <- plot_betalain_counts(gene_ID_list = betalain_quant$transcript_id[1:(length(betalain_quant$transcript_id)/4)],
+                                       gene_name_list = gene_names)
+
+# save all plots, one png per gene name
+for (i in seq_along(betalain_plots)){
+  # transcripts skipped by plot_betalain_counts() leave an empty (NULL) entry; nothing to save
+  if (is.null(betalain_plots[[i]])) {
+    next
+  }
+  ggsave(filename = paste0("plots/flower_mapping_expression/", gene_names[i], ".png"),
+       plot = betalain_plots[[i]],
+       height = 6,
+       width = 8)
+}
+
+```
+
+
+Plot gene expression as a matrix:
+
+```{r}
+# keep the two AM_00331 flower bulks, drop the listed genes and label every
+# transcript as gene name + isoform suffix (characters 10-11 of the transcript id)
+regulator_quant <- betalain_quant %>%
+  filter(bulk %in% c("AM_00331_gf", "AM_00331_rf"),
+         !(pathway_gene %in% c("AhMYB3", "AhMYB4", "AhDODAα2","AhCYP76AD5"))) %>%
+  mutate(label = paste0(pathway_gene, substr(transcript_id, 10,11)))
+
+# order of the rows in the heat map (first level is drawn at the bottom of the y axis)
+regulator_label_order <- c("AhBetanidin6GT.1","AhBetanidin5GT.1",
+                           "AhcDOPA5GT.1","AhDODAα1.1",
+                           "AhCYP76AD2.1","AhMYB2.2","AhMYB2.1")
+regulator_quant$label <- factor(regulator_quant$label, levels = regulator_label_order)
+
+# heat map of TPM (log10 colour scale): transcripts in rows, green vs. red flower bulk in columns
+regulator_heatmap <- ggplot(data=regulator_quant) +
+  geom_tile(aes(x = bulk, y = label, fill = tpm)) +
+  scale_fill_distiller(palette = "RdYlBu", trans = "log10", breaks = c(3,30,300)) +
+  labs(fill = "TPM") +
+  scale_x_discrete(labels = c("Green flower", "Red flower")) +
+  theme_classic() +
+  theme(text = element_text(size=22),
+        axis.title.x = element_blank(),
+        axis.title.y = element_blank(),
+        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
+        legend.position = "bottom",
+        legend.direction = "vertical")
+regulator_heatmap
+
+ggsave(filename = "plots/flower_mapping_expression/flower_expression_grid.png",
+       plot = regulator_heatmap,
+       width = 4,
+       height = 8,
+       dpi = 500)
+
+
+```
+
+Investigate flavonoid pathway gene expression.
+
+```{r}
+# check gene expression only for flavonoid pathway genes
+flavonoid_quant <- kallisto_quant %>%
+  filter(gene_id %in% flavonoid.genes$gene_id)
+
+# add flavonoid_gene information
+flavonoid_quant <- left_join(flavonoid_quant, flavonoid.genes, by = "gene_id")
+
+# which genes are not expressed? Expression under 0.5 TPM in all samples
+# grouping by transcript and gene name gives exactly one summary row per transcript
+# (summarise() returning several rows per group is deprecated since dplyr 1.1.0)
+flavonoid_quant %>%
+  group_by(transcript_id, pathway_gene) %>%
+  summarise(max_tpm = max(tpm), .groups = "drop") %>%
+  mutate(expressed = ifelse(max_tpm >= 0.5, "expressed", "no expression"))
+
+# flavonoid pathway gene expression
+# table for the heat map: drop AmMYBl1 and build a row label "gene name (transcript id)"
+flavonoid_grid <- flavonoid_quant %>%
+  filter(!(pathway_gene %in% c("AmMYBl1"))) %>%
+  mutate(label = paste0(pathway_gene, " (", transcript_id, ")"))
+  # alternative, shorter label (gene name + isoform suffix), currently disabled:
+  #mutate(label = paste0(pathway_gene, substr(transcript_id, 10,11)))
+
+# level order matching the disabled short labels above, kept for reference:
+# flavonoid_grid$label <- factor(flavonoid_grid$label, levels = rev(c("PAL_1.1","PAL_2.1",
+#                                                                   "C4H_1.1","C4H_1.2",
+#                                                                   "C4H_2.1","C4H_3.1","C4H_3.2",
+#                                                                  "4CL_1.1","4CL_2.1","CHS.1","CHS.2",
+#                                                                  "CHI1.1","F3-H_1.1","F3-H_2.1",
+#                                                                  "F3-H_3.1","F3-H_3.2","F3-H_4.1",
+#                                                                  "F3-H_5.1","F3H.1", "FLS.1",
+#                                                                  "FLS.2","DFR.1","ANS.1","LAR.1")))
+# fix the row order of the heat map; rev() puts the first listed transcript (PAL_1) last
+# in the level order. Labels that are not listed here become NA.
+flavonoid_grid$label <- factor(flavonoid_grid$label, levels = rev(c("PAL_1 (AHp012752.1)","PAL_2 (AHp021980.1)",
+                                                                  "C4H_1 (AHp013217.1)","C4H_1 (AHp013217.2)",
+                                                                  "C4H_2 (AHp022384.1)","C4H_3 (AHp022382.1)",
+                                                                  "C4H_3 (AHp022382.2)","4CL_1 (AHp014409.1)",
+                                                                  "4CL_2 (AHp020962.1)","CHS (AHp004305.1)",
+                                                                  "CHS (AHp004305.2)","CHI1 (AHp009962.1)",
+                                                                  "F3-H_1 (AHp017497.1)","F3-H_2 (AHp022122.1)",
+                                                                 "F3-H_3 (AHp003152.1)","F3-H_3 (AHp003152.2)",
+                                                                 "F3-H_4 (AHp022120.1)", "F3-H_5 (AHp022123.1)",
+                                                                 "F3H (AHp006454.1)", "FLS (AHp008991.1)",
+                                                                 "FLS (AHp008991.2)","DFR (AHp009303.1)",
+                                                                 "LAR (AHp017409.1)","ANS (AHp021795.1)")))
+
+
+# heat map of flavonoid transcript expression: one column per bulk, TPM as fill
+# colour (log10 scale) and additionally printed in every tile with two decimals
+flavonoid_heatmap <- ggplot(data=flavonoid_grid) +
+  geom_tile(aes(x = bulk, y = label, fill = tpm)) +
+  scale_fill_distiller(palette = "RdYlBu", trans = "log10", breaks = c(3,30,300)) +
+  geom_text(aes(x = bulk, y = label, label = sprintf("%0.2f", round(tpm, digits = 2))),
+            size = 3.5) +
+  labs(fill = "TPM") +
+  scale_x_discrete(labels = c("Regulator_green", "Regulator_red", "Biosynthesis_green", "Biosynthesis_red")) +
+  theme_classic() +
+  # make sure the colour scale reaches 300 TPM
+  expand_limits(fill = 300) +
+  theme(text = element_text(size=22),
+        axis.title.x = element_blank(),
+        axis.title.y = element_blank(),
+        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
+        legend.position = "right",
+        legend.direction = "vertical")
+flavonoid_heatmap
+
+ggsave(filename = "plots/flower_mapping_expression/flavonoid_expression_grid.png",
+       plot = flavonoid_heatmap,
+       width = 8,
+       height = 8,
+       dpi = 500)
+```
diff --git a/workflows/BSA/read_count_analysis_from_bam.Rmd b/workflows/BSA/read_count_analysis_from_bam.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..a3d44236380fe6efe5f4547dc8fb470af44d4dff
--- /dev/null
+++ b/workflows/BSA/read_count_analysis_from_bam.Rmd
@@ -0,0 +1,208 @@
+---
+title: "BSA_read_variant_plotting"
+author: "twinkle1"
+date: "2023-01-20"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(chromstaR)
+library(ggtranscript)
+library(RColorBrewer)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2/")
+```
+
+Extract the sequencing reads from the bam files which overlap both the two non-synonymous SNPs and the stop-gained SNP. Samtools view can be used to extract reads covering a particular position (and their pairs). Non-primary alignments can be discarded. Reads should cover the position of the stop-gained variant (Scaffold 16, 5305851, C->T) and the position of the right non-synonymous variant (Scaffold 16, 5305727, A->T).
+
+```{bash}
+mkdir -p data/BSA/RNAseq/phased_reads/
+
+# green flower
+# index and extract everything overlapping the right non-synonymous variant position
+samtools index data/BSA/RNAseq/STAR_flower_mappings/AM_00332_gf_Aligned.sortedByCoord.out.bam
+samtools view -b -h -F 256 -P data/BSA/RNAseq/STAR_flower_mappings/AM_00332_gf_Aligned.sortedByCoord.out.bam Scaffold_16:5305727-5305727 > data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_mismatch.bam
+
+# index and extract everything overlapping the stop-gained variant position
+samtools index data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_mismatch.bam
+samtools view -b -h -P data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_mismatch.bam Scaffold_16:5305851-5305851 > data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam
+samtools index data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam
+
+# save as tsv file using sam2tsv from jvarkit
+java -jar /home/tom/Documents/tools/jvarkit/dist/sam2tsv.jar -R polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam > data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam.tsv
+
+
+# red flower
+# index and extract everything overlapping the right non-synonymous variant position
+samtools index data/BSA/RNAseq/STAR_flower_mappings/AM_00332_rf_Aligned.sortedByCoord.out.bam
+samtools view -b -h -F 256 -P data/BSA/RNAseq/STAR_flower_mappings/AM_00332_rf_Aligned.sortedByCoord.out.bam Scaffold_16:5305727-5305727 > data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_mismatch.bam
+
+# index and extract everything overlapping the stop-gained variant position
+samtools index data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_mismatch.bam
+samtools view -b -h -P data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_mismatch.bam Scaffold_16:5305851-5305851 > data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam
+samtools index data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam
+
+# save as tsv file using sam2tsv from jvarkit
+java -jar /home/tom/Documents/tools/jvarkit/dist/sam2tsv.jar -R polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam > data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam.tsv
+```
+
+Wrap the loading and filtering of the data (tsv table and aligned reads) in separate functions:
+
+```{r}
+# Load a sam2tsv table and keep the base calls at the three SNP positions of interest
+# for reads (pairs) that cover the stop-gained SNP and at least one further SNP.
+# A base call has to exist at these positions, a gap is not sufficient.
+#
+# filename: path to the tsv file written by sam2tsv; its ten columns are renamed below
+# Returns a data frame with one row per read name and covered SNP position, with
+#   snp_group: 1 for the stop-gained position 5305851, 0 for the other two positions
+#   allele:    "alt" if the read carries the alternative base, otherwise "ref"
+read_bam.tsv <- function(filename){
+  # read in data
+  bam.tsv <- as.data.frame(read_tsv(file = filename))
+  colnames(bam.tsv) <- c("read_name", "Flag", "MAPQ", "CHROM", "read_pos", "read_base", "read_qual", "ref_pos",
+                       "ref_base", "CIGAR_op")
+  # limit the analysis to the region of interest
+  # snps of interest are at the positions 5305722, 5305727 and 5305851
+  snp.tsv <- bam.tsv %>%
+    filter(CHROM == "Scaffold_16",
+           ref_pos == 5305722 | ref_pos == 5305727 | ref_pos == 5305851,
+           read_base != ".") %>% # "." marks a position without base call in the read
+    mutate(snp_group = if_else(ref_pos == 5305851, 1, 0))
+  # check that there are at least 2 snps and the stop snp is included
+  snp.tsv <- snp.tsv %>%
+    group_by(read_name) %>% # group by read name (both mates of a pair share the name)
+    filter(any(snp_group == 1),
+           # count distinct reference positions, not offsets within the read: two mates
+           # can cover different snps at the same read offset, and overlapping mates
+           # can cover the same snp at different offsets
+           n_distinct(ref_pos) > 1) %>% # at least 2 snps and at least the stop snp covered
+    arrange(read_name) %>%
+    ungroup() %>%
+    # alternative alleles: A at 5305722, T at 5305727 and T at 5305851
+    mutate(allele = case_when(ref_pos == 5305722 & read_base == "A" ~ "alt",
+                              ref_pos == 5305727 & read_base == "T" ~ "alt",
+                              ref_pos == 5305851 & read_base == "T" ~ "alt",
+                              TRUE ~ "ref"))
+  return(snp.tsv)
+}
+
+# Load a bam file and return the alignment ranges of the reads kept in the tsv table.
+#
+# filename: path to the (indexed) bam file
+# tsv:      table returned by read_bam.tsv(); only its read_name column is used
+# Returns a data frame with the columns of the ranges (start, end, width) plus qname,
+# restricted to reads whose name occurs in tsv$read_name.
+read_bam_snps_as_df <- function(filename, tsv){
+  # read in data, keeping the read name; no mapping quality filter
+  snp.bam <- readBamFileAsGRanges(bamfile = filename,
+                                min.mapq = 0,
+                                what = "qname")
+  # convert to dataframe:
+  snp.bam.df <- as.data.frame(snp.bam@ranges)
+  snp.bam.df$qname <- snp.bam@elementMetadata@listData$qname
+  # NOTE(review): start and end are both shifted by one position, presumably to
+  # match the coordinate convention of the plotted tsv positions - confirm
+  snp.bam.df <- snp.bam.df %>%
+    mutate(start = start-1,
+           end = end-1)
+  # filter only the reads kept in the tsv table
+  snp.bam.df <- snp.bam.df %>%
+    filter(qname %in% tsv$read_name)
+  # explicit return: the result used to be returned only invisibly as the value
+  # of the final assignment
+  return(snp.bam.df)
+}
+
+
+# plot individual reads after loading in data:
+# df:    read ranges (columns start, end, qname), e.g. from read_bam_snps_as_df()
+# tsv:   per-read SNP calls (columns read_name, ref_pos, allele, snp_group), e.g. from read_bam.tsv()
+# title: currently unused, the corresponding labs() entry is commented out
+# Returns a ggplot object with one row per read name: the aligned ranges drawn with
+# geom_range() and a point at every position where the read carries an alternative allele.
+plot_reads <- function(df, tsv, title){
+  # maybe rather join the two tables?
+  # per read name: number of alternative alleles (n) and sum of snp_group over them,
+  # i.e. how many of these alternative calls are at the stop-gained position
+  number_alt_alleles <- tsv %>%
+    filter(allele == "alt") %>%
+    group_by(read_name) %>% 
+    summarise(n = n(),
+              snp_grouped = sum(snp_group))
+  # join the table detailing the number of alt alleles
+  # reads without any alternative allele get NA in n and snp_grouped
+  snp.bam.df <- left_join(df, number_alt_alleles, by = c("qname" = "read_name"))
+  
+  # first plot, showing specific reads
+  # reads are sorted on the y axis by n, then snp_grouped, then read name
+  snpplot <- ggplot() +
+    geom_range(data = snp.bam.df, 
+               aes(xstart = start, xend = end, y = factor(qname, levels=unique(qname[order(n,snp_grouped,qname)]), ordered=TRUE)))
+  
+  # second plot showing alternative allele positions
+  # point colour distinguishes the stop-gained position (snp_group 1) from the other SNPs (0)
+  snpplot2 <- snpplot +
+    geom_point(data = tsv %>% filter(allele == "alt"), 
+               aes(x = ref_pos, 
+                   y = read_name, 
+                   color = as.factor(snp_group)),
+               size = 2.2) +
+    theme_classic() +
+    scale_color_manual(values = viridis::viridis(n = 4, direction = -1)[3:4]) +
+    labs(#title = title,
+         x = "Position Scaffold 16 (bp)",
+         y = "Read pair",
+         color = "Allele") +
+    #coord_cartesian(xlim = c(5305600, 5305950)) +
+    # hide everything related to the y axis and the legend
+    theme(axis.text.y = element_blank(),
+          legend.position = "none",
+          #text = element_text(size = 21),
+          axis.text.x = element_text(size=25),
+          axis.title.x = element_text(size=40),
+          axis.line = element_line(linewidth = 2),
+          axis.ticks = element_line(linewidth = 1.5),
+          axis.ticks.length = unit(.25, "cm"),
+          axis.title.y = element_blank(),
+          axis.line.y = element_blank(),
+          axis.ticks.y = element_blank())
+  return(snpplot2)
+}
+
+```
+
+Load in data for the red and green flower bulks of the BSA on AM_00332:
+
+```{r}
+# green flower bulk: filtered SNP calls and the ranges of the corresponding reads
+snp_tsv.gf <- read_bam.tsv("data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam.tsv")
+snp_df.gf <- read_bam_snps_as_df("data/BSA/RNAseq/phased_reads/AM_00332_gf_Aligned.sortedByCoord.out.covering_both.bam",
+                                 tsv = snp_tsv.gf)
+
+# how many reads after filtering? (one row per read name)
+dplyr::count(snp_tsv.gf, read_name) # 99 reads
+
+# red flower bulk: filtered SNP calls and the ranges of the corresponding reads
+snp_tsv.rf <- read_bam.tsv("data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam.tsv")
+snp_df.rf <- read_bam_snps_as_df("data/BSA/RNAseq/phased_reads/AM_00332_rf_Aligned.sortedByCoord.out.covering_both.bam",
+                                 tsv = snp_tsv.rf)
+
+# how many reads after filtering? (one row per read name)
+dplyr::count(snp_tsv.rf, read_name) # 38 reads
+```
+
+Plot individual reads and alternative variants:
+
+```{r}
+# green flower: draw the read plot and write it to disk
+plot.gf <- plot_reads(snp_df.gf, snp_tsv.gf, title = "RNAseq reads green flower")
+print(plot.gf)
+ggsave("plots/rna_seq_reads_gf.png", plot = plot.gf, width = 10, height = 8)
+
+
+# red flower: draw the read plot and write it to disk
+plot.rf <- plot_reads(snp_df.rf, snp_tsv.rf, title = "RNAseq reads red flower")
+print(plot.rf)
+ggsave("plots/rna_seq_reads_rf.png", plot = plot.rf, width = 10, height = 8)
+```
+
+
+
+
+
+
+
+
+
diff --git a/workflows/BSA/readme.txt b/workflows/BSA/readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1c81ccdc62112abefd1eb757dfce03d9db65600c
--- /dev/null
+++ b/workflows/BSA/readme.txt
@@ -0,0 +1,45 @@
+## Bulk segregant analysis
+
+Code for bulk segregant analysis, haplotype analysis and quantification of flower gene expression.
+
+
+### Script order
+
+RNAseq:
+
+- workflows/BSA/RNAseq/adapter_trimming.sh
+trimming of adapter sequences using Trimmomatic
+
+- workflows/BSA/RNAseq/index_STAR.sh
+index the genome with STAR using the genome annotation v2.2
+
+- workflows/BSA/RNAseq/run_STAR.sh
+map reads to the genome using STAR
+
+- workflows/BSA/RNAseq/QC.sh
+quality control of adapter trimming and read mapping
+
+- workflows/BSA/RNAseq/run_kallisto.sh
+quantification of gene expression using kallisto
+
+WGS:
+
+- workflows/BSA/WGS/map_reads.sh
+map WGS reads to the genome
+
+- workflows/BSA/WGS/combined_filter.sh
+call and filter variants
+
+Analysis:
+
+- workflows/BSA/read_count_analysis.Rmd
+Analysis of gene expression data in pooled flower tissue
+
+- workflows/BSA/bsa_and_plotting.R
+Conduct bulk segregant analysis
+
+- workflows/BSA/snpEff_analysis.Rmd
+Analyse BSA variants using snpEff
+
+- workflows/BSA/read_count_analysis_from_bam.Rmd
+Investigate support for AhCYP76AD2 variants by RNA-seq data
diff --git a/workflows/BSA/snpEff_analysis.Rmd b/workflows/BSA/snpEff_analysis.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..f74a6b0d82f193bc9cedfa9a4c5920b764b79e30
--- /dev/null
+++ b/workflows/BSA/snpEff_analysis.Rmd
@@ -0,0 +1,635 @@
+---
+title: "snpEff_database_creation"
+author: "twinkle1"
+date: '2022-09-08'
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(data.table)
+library(QTLseqr)
+library(ggtranscript)
+library(reshape2)
+library(cowplot)
+library(patchwork)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publication/")
+```
+
+## Database creation
+
+Run snpEff database creation on the fixed annotation files. Copy the fixed files to the snpeff directory:
+
+```{bash}
+# directory with the input files of the snpEff database AHv2.2
+db_dir=data/annotation_analysis/snpEff/databases/AHv2.2
+# common prefix of the manually corrected annotation files
+annotation=data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected
+
+mkdir -p "${db_dir}/"
+
+# snpEff analysis
+# genome sequence
+cp polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta "${db_dir}/sequences.fa"
+# annotation, coding and protein sequences
+cp "${annotation}.gff" "${db_dir}/genes.gff"
+cp "${annotation}.cds.fasta" "${db_dir}/cds.fa"
+cp "${annotation}.prot.fasta" "${db_dir}/protein.fa"
+
+# create database:
+java -jar /home/tom/Documents/tools/snpEff/snpEff.jar build -v AHv2.2
+```
+
+
+## snpEff run
+
+Run snpEff on the variants called from Markus' color/sterility mapping bulks:
+
+```{bash}
+mkdir -p data/annotation_analysis/snpEff/bsa_sterility_color/analysis
+
+# get data and run snpEff on the example data:
+java -jar /home/tom/Documents/tools/snpEff/snpEff.jar -csvStats data/annotation_analysis/snpEff/bsa_sterility_color/output.stats.csv -v AHv2.2 data/annotation_analysis/snpEff/bsa_sterility_color/gatk_filter_maxmissing05_biallelic.vcf.gz > data/annotation_analysis/snpEff/bsa_sterility_color/output.snpeff.vcf
+
+
+# two files are not saved in the output directory but in the current working directory
+mv snpEff_* data/annotation_analysis/snpEff/bsa_sterility_color/
+
+# it is challenging to process the snpEff output for downstream analysis
+# snpsift is a software package distributed with snpEff that eases processing
+cat data/annotation_analysis/snpEff/bsa_sterility_color/output.snpeff.chr16.vcf | /home/tom/Documents/tools/snpEff/scripts/vcfEffOnePerLine.pl | java -jar /home/tom/Documents/tools/snpEff/SnpSift.jar extractFields - CHROM POS "ANN[*].GENEID" "ANN[*].EFFECT" > data/annotation_analysis/snpEff/bsa_sterility_color/output.snpeff.chr16.snpsift.txt
+```
+
+
+## Analysis
+
+Analyze the snpEff test run and check for high impact variants that can be manually analysed in genes of the betalain and flavonoid pathways.
+
+```{r}
+# load in snpEff summary file
+# the first line is skipped; the header line starts with "#", so comment.char is
+# disabled and the first column is renamed afterwards
+snpEff.tab <- read.table("data/annotation_analysis/snpEff/bsa_sterility_color/snpEff_genes.txt", skip = 1, header = T, comment.char = "")
+colnames(snpEff.tab)[1] <- "GeneName"
+
+# load in list of betalain and flavonoid pathway genes
+color_pathways <- read.csv("data/manual_sheets/color_pathway_genes.csv", header = T)
+snpEff.tab <- left_join(snpEff.tab, color_pathways, by = c("GeneId" = "Gene_id"))
+
+# check color pathway genes for high impact variants, moderate and in theory also modifier might also be relevant
+# select() keeps one row per transcript; summarise() returning several rows per
+# group is deprecated since dplyr 1.1.0
+snpEff.tab %>%
+  filter(GeneName %in% color_pathways$Gene_id) %>%
+  #filter(variants_impact_HIGH > 0) %>%
+  select(GeneId,
+         TranscriptId,
+         Gene,
+         Pathway,
+         variants_impact_HIGH,
+         variants_impact_MODERATE,
+         variants_impact_MODIFIER)
+# high impact variant in one of the five F3-H candidate genes, but also in CYP76AD candidate gene
+```
+
+Subset a more detailed table of betalain genes and their respective positions in the genome:
+
+```{r}
+# pathway gene list, restricted to the three betalain gene ids on chromosome 16
+color_pathways <- read.csv("data/manual_sheets/color_pathway_genes.csv", header = T)
+betalain_chr16 <- filter(color_pathways,
+                         Gene_id %in% c("AHp022773", "AHp023148", "AHp023147"))
+# add BvMYB1like gene (written into the second row of the table)
+betalain_chr16[2,] <- c("BvMYB1like", "Betalain", "AHp022773")
+
+# store the table as unquoted text for the later chunks
+write.table(betalain_chr16,
+            quote = F,
+            file = "data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.txt")
+
+# Read a gtf file into a data.table and add gene_id and transcript_id columns.
+# based on: https://www.biostars.org/p/272889/
+#
+# file: path to the gtf file (nine columns)
+# Returns the table with the columns chr, source, type, start, end, score, strand,
+# phase, attributes, gene_id and transcript_id.
+read.gtf <- function(file){
+  gff <- fread(file)
+  setnames(gff, names(gff), c("chr","source","type","start","end","score","strand","phase","attributes"))
+
+  # value of the first attribute whose text matches att_of_interest, NA if none matches
+  extract_attributes <- function(gtf_column, att_of_interest){
+    # split "key value; key value;" into single "key value" entries without quotes and semicolons
+    fields <- unlist(strsplit(gtf_column, "; "))
+    fields <- gsub(";", "", gsub("\"", "", fields))
+    # tokens of all matching entries; the second token is the value of the first match
+    tokens <- unlist(strsplit(fields[grep(att_of_interest, fields)], " "))
+    if (is.null(tokens)) {
+      return(NA)
+    }
+    return(tokens[2])
+  }
+
+  # apply row by row to the attribute column
+  gff$gene_id <- unlist(lapply(gff$attributes, extract_attributes, "gene"))
+  gff$transcript_id <- unlist(lapply(gff$attributes, extract_attributes, "transcript"))
+  return(gff)
+}
+
+# load the full annotation and keep the records of the chromosome 16 betalain genes
+annotation.gtf <- read.gtf("polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf")
+betalain_chr16.gtf <- filter(annotation.gtf, gene_id %in% betalain_chr16$Gene_id)
+
+# stored as an R object (RDS format) despite the .gtf file extension
+saveRDS(betalain_chr16.gtf, file = "data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.gtf")
+
+
+
+#################### generate BED file of relevant positions
+betalain_chr16.gtf <- readRDS("data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.gtf")
+# snpeff by default uses the 5000 positions before and after a gene
+# create a bed file that can be used to subset the vcf file into relevant variants
+betalain_chr16.bed <- betalain_chr16.gtf %>%
+  filter(type == "transcript") %>%
+  mutate(chrom = chr,
+         # clamp at 0: a transcript within 5000 bp of the scaffold start would
+         # otherwise get a negative start coordinate
+         chromStart = pmax(start - 5000, 0),
+         chromEnd = end + 5000) %>%
+  select(chrom, chromStart, chromEnd) %>%
+  unique()
+
+# tab-separated with a header line (chrom, chromStart, chromEnd)
+write_tsv(betalain_chr16.bed, file = "data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.bed")  
+```
+
+Subset the vcf file to the regions around the betalain genes and extract the per-sample allele depths (FORMAT field `AD`) using vcftools:
+
+```{bash}
+# to extract the format field
+vcftools --gzvcf data/BSA/wgs/vcf/gatk_filter_maxmissing05_biallelic.vcf.gz --bed data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.bed --indv AM_00331_gf --indv AM_00331_rf --indv AM_00332_gl --indv AM_00332_rl --extract-FORMAT-info AD --out data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16
+# also subset the vcf file to include only the variants around the betalain genes
+vcftools --gzvcf data/BSA/wgs/vcf/gatk_filter_maxmissing05_biallelic.vcf.gz --bed data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.bed --indv AM_00331_gf --indv AM_00331_rf --indv AM_00332_gl --indv AM_00332_rl --recode --recode-INFO-all --out data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16
+```
+
+
+Plot the annotated variants in betalain genes. In general, it could be interesting to continue analysis based on genes which are not already on our list which show knockout variants. Load required data
+
+```{r}
+# betalain genes on chromosome 16 (table written in an earlier chunk)
+betalain_chr16 <- read.table("data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.txt")
+
+# allele depth of the extracted SNPs: each sample column holds "ref,alt" read counts
+# (reference allele first, only reads which were involved in allele calling).
+# Add an id for merging and split every sample column into a ref and an alt column.
+allele_depth.tab <- read.table(file = "data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.AD.FORMAT", header = T) %>%
+  mutate(customid = paste0("16_", POS)) %>%
+  separate(col = "AM_00331_gf", sep = ",", into = c("AM00331_gf_ref", "AM00331_gf_alt")) %>%
+  separate(col = "AM_00331_rf", sep = ",", into = c("AM00331_rf_ref", "AM00331_rf_alt")) %>%
+  separate(col = "AM_00332_gl", sep = ",", into = c("AM00332_gl_ref", "AM00332_gl_alt")) %>%
+  separate(col = "AM_00332_rl", sep = ",", into = c("AM00332_rl_ref", "AM00332_rl_alt"))
+
+# snpsift output, restricted to the betalain genes; the customid column enables
+# merging with the allele depth table
+snpsift.tab <- read.table("data/annotation_analysis/snpEff/bsa_sterility_color/output.snpeff.chr16.snpsift.txt", header = T) %>%
+  filter(ANN....GENEID %in% betalain_chr16$Gene_id) %>%
+  mutate(customid = paste0("16_", POS)) %>%
+  unique()
+
+
+# annotation records of the betalain genes on chromosome 16
+betalain_chr16.gtf <- readRDS("data/annotation_analysis/snpEff/bsa_sterility_color/analysis/betalain_chr16.gtf")
+
+# annotated variants together with their allele depths
+joined.df <- left_join(snpsift.tab, allele_depth.tab, by = c("CHROM", "POS", "customid"))
+```
+
+Set up plotting functions:
+
+```{r}
+# annotation is a loaded gtf file
+# variants is the dataframe of all variants and allele depths
+# gene is the character string of the gene that is supposed to be plotted
+# transcript is the transcript character string of the gene that is to be plotted
+# bulk1 and bulk2 denote the two bulks (character strings) which are to be plotted
+
+# Extract the variants of one gene and turn the allele depths of two bulks into
+# allele frequencies.
+#
+# variants:  data frame of annotated variants with allele depths (needs the
+#            columns ANN....GENEID, CHROM, POS, ANN....EFFECT and the four
+#            depth columns named below)
+# gene:      gene id (character) whose variants are kept
+# bulk1_ref, bulk1_alt, bulk2_ref, bulk2_alt:
+#            names (character) of the columns holding the reference and
+#            alternative allele depth of bulk 1 and bulk 2
+#
+# Returns a data frame with CHROM, POS, ANN....EFFECT and the four bulk columns
+# (in the order given above), which now hold reference and alternative allele
+# frequencies. Variants at which both bulks only carry the alternative allele
+# are removed.
+filter_variants <- function(variants, gene, bulk1_ref, bulk1_alt, bulk2_ref, bulk2_alt){
+  depth_cols <- c(bulk1_ref, bulk1_alt, bulk2_ref, bulk2_alt)
+  # prepare data by only keeping the relevant gene variants and samples;
+  # all_of() states explicitly that the bulk arguments are column names stored
+  # in variables (bare character variables inside select() are deprecated)
+  dat <- variants %>%
+    filter(ANN....GENEID == gene) %>%
+    select(CHROM, POS, ANN....EFFECT, all_of(depth_cols))
+  # NOTE(review): the previous version continued with
+  #   filter(bulk1_alt != 0 & bulk2_alt != 0)
+  # Inside filter() bulk1_alt/bulk2_alt are the column *names* (strings), so
+  # the comparison was always TRUE and no row was ever removed. The no-op is
+  # dropped here, results are unchanged. TODO: decide whether variants without
+  # alternative reads should be removed, e.g. with
+  #   filter(.data[[bulk1_alt]] != 0 | .data[[bulk2_alt]] != 0)
+  # depths arrive as character, convert to numeric
+  dat[depth_cols] <- lapply(dat[depth_cols], as.numeric)
+  # convert to ratios: alt = alt / (ref + alt), ref = 1 - alt
+  dat[[bulk1_alt]] <- dat[[bulk1_alt]] / (dat[[bulk1_ref]] + dat[[bulk1_alt]])
+  dat[[bulk1_ref]] <- 1 - dat[[bulk1_alt]]
+  dat[[bulk2_alt]] <- dat[[bulk2_alt]] / (dat[[bulk2_ref]] + dat[[bulk2_alt]])
+  dat[[bulk2_ref]] <- 1 - dat[[bulk2_alt]]
+  # keep a variant unless both bulks have an alternative allele frequency of 1
+  dat <- dat[(dat[[bulk1_alt]] != 1) | (dat[[bulk2_alt]] != 1),]
+  return(dat)
+}
+
+# Plot the CDS structure of one transcript with its annotated variants drawn as
+# coloured stripes, and return it next to its legend.
+#
+# annotation:        loaded gtf (needs transcript_id, type, start, end, strand)
+# trans_id:          transcript id (character) of the transcript to draw
+# filtered_variants: variant table as returned by filter_variants()
+# gene_label:        gene name written above the gene model
+# label_x, label_y:  position of the gene name; the defaults reproduce the
+#                    AhCYP76AD2 figure, pass other values for other genes
+#
+# Returns a cowplot grid with the gene model on the left and the legend on the
+# right.
+plot_bulk_comparison <- function(annotation, trans_id, filtered_variants,
+                                 gene_label = "AhCYP76AD2",
+                                 label_x = 5303500,
+                                 label_y = 1.4){
+
+  # filter annotation to only include the CDS of the transcript in question
+  annotation.filtered <- annotation %>%
+    filter(transcript_id == trans_id,
+           type == "CDS")
+  min_pos <- min(annotation.filtered$start)
+  max_pos <- max(annotation.filtered$end)
+  # only keep variants between the first and the last CDS of the transcript
+  filtered_variants <- filtered_variants[filtered_variants$POS >= min_pos & filtered_variants$POS <= max_pos,]
+  # effect classes in plotting order: names are the values of ANN....EFFECT,
+  # values the legend text. The labels are passed to the scale as a *named*
+  # vector so that they are matched by effect; an unnamed vector is assigned by
+  # position and mislabels the legend when a gene lacks one of the classes.
+  effect_labels <- c(intron_variant = "Intron variant",
+                     synonymous_variant = "Synonymous variant",
+                     missense_variant = "Missense variant",
+                     stop_gained = "Stop gained")
+  filtered_variants$ANN....EFFECT <- factor(filtered_variants$ANN....EFFECT,
+                                            levels = names(effect_labels))
+
+  # plot the gene with variants
+  p2 <- ggplot() +
+    geom_range(data = annotation.filtered,
+               aes(xstart = start, xend = end, y = transcript_id),
+               fill = "grey90") +
+    geom_intron(data = to_intron(annotation.filtered),
+                aes(xstart = start, xend = end, y = transcript_id, strand = strand)) +
+    # annotate variants on the gene as stripes
+    geom_rect(data = filtered_variants,
+              aes(xmin = POS-8, xmax = POS+8, ymin=0.75, ymax = 1.25, fill = ANN....EFFECT)) +
+    # gene name above the gene model
+    geom_text(aes(x = label_x,
+                  y = label_y,
+                  label = gene_label),
+              size = 13) +
+    coord_cartesian(xlim = c(min_pos, max_pos)) +
+    labs(y = "",
+         fill = "Variant effect",
+         x = "Position Scaffold 16 (bp)") +
+    theme_classic() +
+    scale_fill_viridis_d(direction = -1,
+                         labels = effect_labels) +
+    theme(text = element_text(size = 21),
+          legend.position = "right",
+          plot.margin = unit(c(2, 0, 2, 0), "cm"),
+          axis.line.y = element_blank(),
+          axis.ticks.y = element_blank(),
+          axis.text.x = element_text(size=25),
+          axis.title.x = element_text(size=40),
+          axis.line = element_line(linewidth = 2),
+          legend.text = element_text(size = 30),
+          legend.title = element_text(size = 40),
+          axis.ticks = element_line(linewidth = 1.5),
+          axis.ticks.length = unit(.25, "cm"),
+          axis.text.y = element_blank())
+  # take the legend out of the plot and place it in its own grid cell, so that
+  # the gene model keeps a fixed share of the width
+  legend <- get_legend(p2)
+  p2 <- p2 + theme(legend.position = "none")
+  out_plot <- plot_grid(p2, legend,
+                        nrow = 1,
+                        rel_widths = c(0.75, 0.25))
+  return(out_plot)
+}
+```
+
+
+
+Plot for CYP76AD2:
+
+```{r}
+# plot for one gene
+# keep only the variants of AHp023148 and convert the allele depths of the two
+# AM00332 bulks into allele frequencies
+dat.filtered <- filter_variants(variants = joined.df,
+                  gene = "AHp023148",
+                  bulk1_ref = "AM00332_gl_ref",
+                  bulk1_alt = "AM00332_gl_alt",
+                  bulk2_ref = "AM00332_rl_ref",
+                  bulk2_alt = "AM00332_rl_alt")
+
+# gene model of transcript AHp023148.1 with its variants and a separate legend
+AM00332_CYP76AD <- plot_bulk_comparison(annotation = betalain_chr16.gtf,
+                                        trans_id = "AHp023148.1",
+                                        filtered_variants = dat.filtered)
+
+# show the plot
+AM00332_CYP76AD
+
+# name the plot explicitly so that the saved file does not depend on which
+# plot happened to be created or displayed last
+ggsave(filename = "plots/CYP76AD_AHp023148_bsa_snpeff.png",
+       plot = AM00332_CYP76AD,
+       width = 14,
+       height = 6)
+```
+
+
+Function to make the same plot for all of the betalain pathway genes
+
+```{r}
+# Create the gene model plot with annotated variants for several genes.
+#
+# gene_id:       character vector of gene ids
+# transcript_id: character vector of transcript ids; transcript_id[i] is
+#                plotted with the variants of gene_id[i]
+# bulk:          which pair of bulks to use, "AM00332" (gl/rl columns) or
+#                "AM00331" (gf/rf columns)
+#
+# Reads the global objects joined.df and betalain_chr16.gtf.
+# Returns a list with one plot per transcript; stops with an error if bulk is
+# not one of the two supported values (previously an error string was returned
+# in place of the plot list, which only failed later when saving).
+plot_all_genes <- function(gene_id, transcript_id, bulk){
+  # column prefixes of bulk 1 and bulk 2 for every supported bulk pair
+  bulk_prefixes <- list(AM00332 = c("AM00332_gl", "AM00332_rl"),
+                        AM00331 = c("AM00331_gf", "AM00331_rf"))
+  if (length(bulk) != 1 || !(bulk %in% names(bulk_prefixes))){
+    stop("Error: check bulk variable")
+  }
+  prefixes <- bulk_prefixes[[bulk]]
+  # create a list in the beginning to save all plots in, use this for saving afterwards
+  output <- list()
+  # seq_along() instead of 1:length() so that empty input yields an empty list
+  for (i in seq_along(transcript_id)){
+      # filter to only include the specific gene and the chosen bulks
+      dat.filtered <- filter_variants(variants = joined.df,
+                                      gene = gene_id[i],
+                                      bulk1_ref = paste0(prefixes[1], "_ref"),
+                                      bulk1_alt = paste0(prefixes[1], "_alt"),
+                                      bulk2_ref = paste0(prefixes[2], "_ref"),
+                                      bulk2_alt = paste0(prefixes[2], "_alt"))
+      # plot and save in a list
+      output[[i]] <- plot_bulk_comparison(annotation = betalain_chr16.gtf,
+                                            trans_id = transcript_id[i],
+                                            filtered_variants = dat.filtered)
+  }
+  return(output)
+}
+
+gene_id <- unique(betalain_chr16.gtf$gene_id)
+# NOTE(review): the second transcript is dropped, presumably so that exactly
+# one transcript per gene remains and transcript_id[i] belongs to gene_id[i];
+# this depends on the row order of the gtf - confirm
+transcript_id <- unique(betalain_chr16.gtf$transcript_id)[-2]
+
+# plot for color loss bulk
+bulk_AM00332_plot_list <- plot_all_genes(gene_id = gene_id,
+                                         transcript_id = transcript_id,
+                                         bulk = "AM00332")
+
+# seq_along() instead of 1:length() so that an empty list saves nothing
+for (i in seq_along(bulk_AM00332_plot_list)){
+  ggsave(filename = paste0("plots/BSA/AM00332_color_loss_", gene_id[i], ".png"),
+         plot = bulk_AM00332_plot_list[[i]],
+         width = 14,
+         height = 6)
+}
+
+# plot for regulator bulk
+bulk_AM00331_plot_list <- plot_all_genes(gene_id = gene_id,
+                                         transcript_id = transcript_id,
+                                         bulk = "AM00331")
+
+# NOTE(review): the file names of the regulator bulk also contain
+# "color_loss" - confirm that this is intended
+for (i in seq_along(bulk_AM00331_plot_list)){
+  ggsave(filename = paste0("plots/BSA/AM00331_color_loss_", gene_id[i], ".png"),
+         plot = bulk_AM00331_plot_list[[i]],
+         width = 14,
+         height = 6)
+}
+```
+
+
+
+Zoom in on a specific portion of the plot, centered around the three SNPs of interest:
+
+```{r}
+# Zoomed version of the gene model plot: the gene model with its variants is
+# shown between the allele frequency bars of the two bulks.
+#
+# annotation:        loaded gtf (needs transcript_id, type, start, end, strand)
+# trans_id:          transcript id (character) of the transcript to draw
+# filtered_variants: variant table as returned by filter_variants(); columns
+#                    4:5 belong to bulk 1 and columns 6:7 to bulk 2
+# zoom_xlim:         c(from, to) window that is shown on the x axis; the
+#                    default is the part of exon 2 of AHp023148.1 that holds
+#                    the SNPs of interest
+#
+# Returns a cowplot grid: legend on top, below it the bulk 2 bars, the gene
+# model and the bulk 1 bars.
+plot_bulk_comparison_zoom <- function(annotation, trans_id, filtered_variants,
+                                      zoom_xlim = c(5305700, 5305870)){
+  # filter annotation to only include the CDS of the transcript in question
+  annotation.filtered <- annotation %>%
+    filter(transcript_id == trans_id,
+           type == "CDS")
+  # first and last coding position of the transcript
+  gene_range <- c(min(annotation.filtered$start), max(annotation.filtered$end))
+  filtered_variants <- filtered_variants[filtered_variants$POS >= gene_range[1] & filtered_variants$POS <= gene_range[2],]
+  filtered_variants$ANN....EFFECT <- factor(filtered_variants$ANN....EFFECT, levels = c("intron_variant",
+                                                                                        "synonymous_variant",
+                                                                                        "missense_variant",
+                                                                                        "stop_gained"))
+  # plot the gene with variants
+  p2 <- ggplot() +
+    geom_range(data = annotation.filtered,
+               aes(xstart = start, xend = end, y = transcript_id),
+               fill = "grey90") +
+    geom_intron(data = to_intron(annotation.filtered),
+                aes(xstart = start, xend = end, y = transcript_id, strand = strand)) +
+    # annotate variants on the gene as stripes
+    geom_rect(data = filtered_variants,
+              aes(xmin = POS-1, xmax = POS+1, ymin=0.75, ymax = 1.25, fill = ANN....EFFECT)) +
+    labs(y = "",
+         fill = "Variant effect") +
+    theme_classic() +
+    scale_fill_viridis_d(direction = -1) +
+    coord_cartesian(xlim = zoom_xlim) +
+    theme(text = element_text(size = 18),
+          axis.line.y = element_blank(),
+          axis.ticks.y = element_blank(),
+          axis.text.y = element_blank(),
+          axis.ticks.x = element_blank(),
+          axis.line.x = element_blank(),
+          axis.text.x = element_blank(),
+          legend.position = "none")
+
+  # rearrange data: dat1 keeps the bulk 2 columns, dat2 the bulk 1 columns
+  dat1 <- filtered_variants[,-(4:5)]
+  dat2 <- filtered_variants[,-(6:7)]
+  dat1.melt <- melt(dat1, id.vars = c("CHROM", "POS", "ANN....EFFECT"))
+  dat2.melt <- melt(dat2, id.vars = c("CHROM", "POS", "ANN....EFFECT"))
+
+  # plot relative allele frequency of bulk 2 (top panel)
+  p1 <- ggplot() +
+    geom_col(data = dat1.melt,
+           aes(x = POS, y = as.numeric(value), fill = variable), position = "stack", width = 2) +
+    xlim(gene_range) +
+    labs(y = "Red bulk\n allele depth",
+         fill = "red_bulk",
+         x = "") +
+    theme_classic() +
+    scale_fill_brewer(palette = "Set1",
+                      labels = c("Reference allele", "Alternative allele"),
+                      direction = -1,
+                      guide = guide_legend(override.aes = list(alpha = 0))) + # make legend invisible
+    coord_cartesian(xlim = zoom_xlim) +
+    theme(axis.text = element_text(size=25),
+          axis.ticks = element_line(linewidth = 1.5),
+          axis.ticks.length = unit(.25, "cm"),
+          axis.line = element_line(linewidth = 2),
+          axis.title.y = element_text(size=30),
+          axis.line.x = element_blank(),
+          axis.ticks.x = element_blank(),
+          legend.position = "none",
+          axis.text.x = element_blank(),
+          axis.title.x = element_blank(),
+          legend.title = element_text(color = "transparent"),
+          legend.text = element_text(color = "transparent"))
+
+  # one bar layer per position for the bulk 1 panel
+  bars <- map(unique(dat2.melt$POS)
+            , ~geom_col(position = "stack",
+                        width = 2
+                       , data = dat2.melt %>% filter(POS == .x)))
+
+  # NOTE(review): the fill levels are reordered by value and the legend labels
+  # are then assigned by position - confirm that "Reference allele" and
+  # "Alternative allele" end up on the intended colours
+  p3 <- ggplot(data = dat2.melt,
+               aes(x=POS,
+                   y=as.numeric(value),
+                   fill=reorder(variable, as.numeric(value)))) +
+    bars +
+    xlim(gene_range) +
+    labs(x = "Position Scaffold 16 (bp)",
+         fill = "",
+         y = "Green bulk\n allele depth") +
+    theme_classic() +
+    scale_fill_brewer(palette = "Set1",
+                      labels = c("Reference allele", "Alternative allele"),
+                      direction = -1) +
+    coord_cartesian(xlim = zoom_xlim) +
+    theme(axis.text = element_text(size=25),
+          axis.title.x = element_text(size=40),
+          axis.title.y = element_text(size=30),
+          axis.line = element_line(linewidth = 2),
+          legend.text = element_text(size = 30),
+          legend.title = element_text(size = 30),
+          axis.ticks = element_line(linewidth = 1.5),
+          axis.ticks.length = unit(.25, "cm"))
+  # create a plot with only the legend
+  legend <- get_legend(p3)
+  p3 <- p3 + theme(legend.position = "none")
+  # combine the three plots
+  allplots <- plot_grid(p1, p2, p3,
+                        ncol = 1,
+                        align = "v",
+                        rel_heights = c(0.3, 0.25, 0.45))
+
+  # combine other plots with legend
+  allplots <- plot_grid(legend, allplots,
+                        ncol = 1,
+                        rel_heights = c(0.2,0.8))
+  return(allplots)
+}
+
+# zoomed panel for transcript AHp023148.1, using the variants in dat.filtered
+AM00332_CYP76AD_zoom <- plot_bulk_comparison_zoom(annotation = betalain_chr16.gtf,
+                             trans_id = "AHp023148.1",
+                             filtered_variants = dat.filtered)
+AM00332_CYP76AD_zoom
+
+# name the plot explicitly so that the saved file does not depend on which
+# plot happened to be created or displayed last
+ggsave(filename = "plots/CYP76AD_AHp023148_bsa_snpeff_zoom.png",
+       plot = AM00332_CYP76AD_zoom)
+```
+
+
+
+
+Combine with the output from the read_count_analysis_from_bam.Rmd script:
+
+```{r}
+# assemble the figure with cowplot's plot_grid
+# bottom row: zoomed variant panel (D) next to plot.gf (E)
+bottom_row <- plot_grid(AM00332_CYP76AD_zoom, plot.gf,
+                        nrow = 1,
+                        rel_widths = c(0.35, 0.6),
+                        align = "v",
+                        axis = "b",
+                        labels = c("D", "E"),
+                        label_size = 30)
+
+bottom_row
+
+# gene model panel (C) on top of the bottom row
+complete_grid <- plot_grid(AM00332_CYP76AD, bottom_row,
+                           nrow = 2,
+                           rel_heights = c(0.3, 0.7),
+                           labels = c("C", NA),
+                           label_size = 30)
+
+complete_grid
+
+
+# cowplot_leaf goes on top of the combined panels
+grid_with_BSA <- plot_grid(cowplot_leaf, complete_grid,
+                           nrow = 2,
+                           rel_heights = c(0.3, 0.7))
+
+# save plot
+ggsave(filename = "plots/BSA_with_grid.png",
+       plot = grid_with_BSA,
+       bg = "white",
+       width = 25,
+       height = 20,
+       dpi = 500)
+```
+
+
+
+
+
+
diff --git a/workflows/annotation_analysis/R2R3_analysis_reannotation.Rmd b/workflows/annotation_analysis/R2R3_analysis_reannotation.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..a35b46aa656b5298a7c755b2a10131a4ddc0fb11
--- /dev/null
+++ b/workflows/annotation_analysis/R2R3_analysis_reannotation.Rmd
@@ -0,0 +1,525 @@
+---
+title: "MYB_identification_braker2"
+author: "twinkle1"
+date: "2/2/2022"
+output: html_document
+---
+
+```{r setup}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(data.table)
+library(seqinr)
+library(ape)
+library(ggtree)
+library(treeio)
+library(poppr)
+library(ggmsa)
+library(Biostrings)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2/")
+```
+
+## Introduction
+
+This script takes the braker2 output and runs an HMMscan using the MYB DNA-binding domain. It uses the filtering script from the bachelor thesis to subset the R2R3 MYBs and uses those to create a multiple sequence alignment with ClustalOmega and a phylogenetic NJ-tree using ClustalW2.
+
+
+## HMMscan using the MYB DNA-binding domain:
+
+```{bash}
+# directory that receives the scan result
+HMM_OUT=data/annotation_analysis/myb_annotation/hmmscan
+mkdir -p "$HMM_OUT"
+
+# scan the annotated proteins with the MYB DNA-binding profile and write the
+# per-domain table
+hmmscan --domtblout "$HMM_OUT/out.txt" \
+  data/annotation_analysis/myb_annotation/myb_profile/Myb_DNA-binding.hmm \
+  polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta
+```
+
+
+## MYB filtering
+
+Takes domtblout output of the HMMscan using the MYB DNA-binding domain HMM profile. Filters identified MYB proteins and classifies them into subgroups based on adjacent repeats of the MYB DNA-binding domain.
+
+
+```{r}
+# Read hmmscan output file into variable data (domtblout format)
+data <- read.table('data/annotation_analysis/myb_annotation/hmmscan/out.txt', row.names = NULL, fill = T)
+## Name the columns of the table
+colnames(data) <- c('target_name','target_accession','tlen','query_name','accession','qlen','E-value','score_all',
+                    'bias_all','nr_domain','total_domains','c-Evalue',
+                    'i-Evalue','score_domain','bias_domain','from_hmm_coord','to_hmm_coord','from_ali_coord','to_ali_coord','from_env_coord',
+                    'to_enc_coord','acc','description','of','target')
+## Append the length of the alignment on the profile (end minus start on the HMM)
+data <- data %>%
+  mutate(alignment_length = to_hmm_coord - from_hmm_coord)
+
+# filtering function:
+# Classify the proteins of an hmmscan domain table by their number of adjacent
+# MYB repeats.
+#
+# data: domain table with named columns (needs query_name, score_domain,
+#       alignment_length, acc, from_ali_coord and to_ali_coord)
+#
+# Returns a data frame with one row per protein: query_name and Freq, the
+# number of retained domains. Proteins in which two consecutive domains are 15
+# or more residues apart are appended with Freq = 1 (R-R-type MYB-like).
+filter_domains <- function(data){
+  # Filter based on domain_score, alignment_length and acc:
+  # domain score above 25, alignment length above 20 (discards domains that
+  # are not aligned properly) and acc above 0.8
+  data_adjacent <- data %>%
+    filter(score_domain > 25) %>%
+    filter(alignment_length > 20) %>%
+    filter(acc > 0.8) %>%
+    # Sort by protein and by domain start, so that neighbouring rows of one
+    # protein are neighbouring domains. The table used to be sorted by protein
+    # only, which relied on the domain order of the input file.
+    arrange(query_name, from_ali_coord)
+  ## Filter for adjacent domains
+  ### Flag a domain with 1 if the next domain of the same protein starts 15 or
+  ### more residues after this domain ends, otherwise with 0. The last domain
+  ### of a protein (and of the table) always gets 0. This way a checksum can
+  ### be created for each protein: if it is unequal to 0, one or more of its
+  ### domains are too far apart. Vectorised, so tables with zero or one row no
+  ### longer break an index loop over seq(1, n-1).
+  next_query <- dplyr::lead(data_adjacent$query_name)
+  next_start <- dplyr::lead(data_adjacent$from_ali_coord)
+  same_protein <- !is.na(next_query) & data_adjacent$query_name == next_query
+  data_adjacent$adjacent <- as.numeric(same_protein &
+                                         (next_start - data_adjacent$to_ali_coord >= 15))
+  ### Group by query_name, sum the checksum and join it back into the
+  ### filtered data.
+  data_checksum <- data_adjacent %>%
+    group_by(query_name) %>%
+    summarize(checksum = sum(adjacent)) %>%
+    as.data.frame() %>%
+    right_join(data_adjacent, by = "query_name")
+  data_filtered <- data_checksum %>%
+    filter(checksum == 0)
+  ## Count the number of domains for each protein
+  domains_per_protein <- data_filtered %>%
+    select(query_name) %>%
+    table(dnn = 'query_name')
+  domains_per_protein <- as.data.frame(domains_per_protein)
+  domains_per_protein <- domains_per_protein %>%
+    filter(Freq > 0)
+  ## Add back the filtered out R-R-type MYB-like proteins
+  ### Select the query_name of domains with checksum unequal to 0, only keep unique query_names
+  query_name <- data_checksum %>%
+    filter(checksum != 0) %>%
+    select(query_name)
+  query_name <- unique(query_name)
+  ### Manually create a vector named Freq with set Freq of 1, convert unique query names and frequency to a dataframe
+  Freq <- c(rep(1,nrow(query_name)))
+  R_R_type <- data.frame(query_name, Freq)
+  ### Combine the filtered dataframe and the manually set R-R-type MYB-like proteins
+  domains_per_protein <- rbind(domains_per_protein, R_R_type)
+  return(domains_per_protein)
+}
+
+# perform filtering
+filtered_mybs <- filter_domains(data)
+
+# add gene names to the dataframe: the transcript id up to the first dot
+filtered_mybs <- filtered_mybs %>%
+  mutate(gene_name = gsub("\\..*","", query_name))
+
+# e.g. number of R2R3 myb isoforms and genes; R2R3 MYBs carry two repeats
+# (Freq == 2). The isoform count used Freq == 4, which counts the 4R MYBs.
+nrow(filtered_mybs[filtered_mybs$Freq == 2,])
+length(unique(filtered_mybs[filtered_mybs$Freq == 2,]$gene_name))
+```
+
+
+## Subsetting the fasta file using the name list
+
+Extracting the MYBs with at least two adjacent MYB domains (R2R3, 3R and 4R); the MYB-like proteins with a single counted domain are removed.
+
+```{r}
+# remove MYB-like genes: keep the proteins with more than one counted MYB domain
+filtered_mybs <- filtered_mybs[filtered_mybs$Freq > 1,]
+
+# write the transcript names to file, one id per line and without a header.
+# The file is handed to `seqkit faidx -l`, which reads one region/sequence id
+# per line; the csv written before (header plus query_name,Freq,gene_name)
+# does not contain plain ids.
+writeLines(as.character(filtered_mybs$query_name),
+           con = "data/annotation_analysis/myb_annotation/myb_names.txt")
+```
+
+## Basic stats
+
+Calculate basic stats for identified MYB genes using the genome annotation
+
+```{r}
+# Read a gtf file and add gene_id and transcript_id columns.
+# based on: https://www.biostars.org/p/272889/
+#
+# file: path of the gtf file
+#
+# Returns the table read by fread() with the nine columns named chr, source,
+# type, start, end, score, strand, phase and attributes, plus gene_id and
+# transcript_id extracted from the attributes column (NA where not found).
+read.gtf <- function(file){
+  gtf <- fread(file)
+  setnames(gtf, names(gtf), c("chr","source","type","start","end","score","strand","phase","attributes"))
+  # Value of the first attribute whose text matches `key`: the attribute
+  # string is split at "; ", quotes and semicolons are removed, and the second
+  # space separated token of the match is returned.
+  attribute_value <- function(attribute_string, key){
+    fields <- unlist(strsplit(attribute_string, "; "))
+    fields <- gsub(";", "", gsub("\"", "", fields))
+    tokens <- unlist(strsplit(fields[grep(key, fields)], " "))
+    if (is.null(tokens)) {
+      return(NA)
+    }
+    return(tokens[2])
+  }
+  # apply the helper row by row for the gene and the transcript attribute
+  gtf$gene_id <- unlist(lapply(gtf$attributes, attribute_value, "gene"))
+  gtf$transcript_id <- unlist(lapply(gtf$attributes, attribute_value, "transcript"))
+  return(gtf)
+}
+
+# read annotation and filter for MYB genes
+annotation <- read.gtf(file = "polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf")
+annotation <- annotation %>%
+  filter(transcript_id %in% filtered_mybs$query_name)
+
+# read in MYB subgroup and function assignment, missing functions are shown as "-"
+myb_function <- read_csv(file = "data/manual_sheets/MYB_with_subgroups_and_function.csv")
+myb_function <- myb_function %>%
+  mutate(Function = replace_na(Function, "-"))
+
+# count number of exons (number of CDS features per transcript)
+exon_count <- annotation %>%
+  filter(type == "CDS") %>%
+  group_by(transcript_id) %>%
+  summarise(exon_count = n())
+
+# calculate CDS and protein length
+myb_length <- annotation %>%
+  filter(type == "CDS") %>%
+  mutate(feature_length = (end - start)+1 ) %>%
+  group_by(transcript_id) %>%
+  summarise(cds_length = sum(feature_length)) %>%
+  mutate(aa_length = cds_length / 3)
+
+# record chromosome and position
+myb_position <- annotation %>%
+  filter(type == "transcript") %>%
+  select(transcript_id, chr, start, end, strand)
+
+# join tables and save
+myb_stats <- left_join(filtered_mybs, myb_length, by = c("query_name" = "transcript_id"))
+myb_stats <- left_join(myb_stats, exon_count, by = c("query_name" = "transcript_id"))
+myb_stats <- left_join(myb_stats, myb_position, by = c("query_name" = "transcript_id"))
+myb_stats <- left_join(myb_stats, myb_function, by = c("query_name" = "transcript_id"))
+
+# format correctly: subfamily from the number of repeats, "-" as placeholder gene name
+myb_stats <- myb_stats %>%
+  mutate(subfamily = ifelse(Freq == 2, "R2R3", ifelse(Freq == 3, "3R", "4R")),
+         transcript_id = query_name,
+         gene_id = gene_name) %>%
+  group_by(gene_id) %>%
+  #mutate(gene_name = paste0("AmMYB", cur_group_id())) %>%
+  mutate(gene_name = paste0("-")) %>%
+  ungroup() %>%
+  select(gene_name, gene_id, transcript_id, chr, start, end, strand, subfamily, aa_length, exon_count, Subgroup, Function) %>%
+  as.data.frame()
+
+# adjust name for selected genes. The rows are matched on the transcript id,
+# so that the names stay on the right transcripts when the number or order of
+# identified MYBs changes.
+# NOTE(review): the ids are the ones relabelled in the phylogeny chunk of this
+# document; the names used to be written to rows 52, 93:94, 62 and 63 -
+# confirm that these are the same entries
+selected_names <- c("AHp014591.1" = "AmMYBl1",
+                    "AHp022773.1" = "AmMYB2",
+                    "AHp022773.2" = "AmMYB2",
+                    "AHp016530.1" = "AmMYB3",
+                    "AHp016531.1" = "AmMYB4")
+selected_rows <- as.character(myb_stats$transcript_id) %in% names(selected_names)
+myb_stats$gene_name[selected_rows] <- unname(selected_names[as.character(myb_stats$transcript_id[selected_rows])])
+
+# write output file
+write.csv(myb_stats, file = "data/annotation_analysis/myb_annotation/myb_stats.csv",
+            quote = T,
+            row.names = F)
+```
+
+
+Analyse basic statistics:
+
+```{r}
+# load the MYB statistics table written in the previous chunk
+myb_stats <- read.csv(file = "data/annotation_analysis/myb_annotation/myb_stats.csv")
+
+# R2R3 MYBs: number and percentage of transcripts per exon count
+myb_stats %>%
+  filter(subfamily == "R2R3") %>%
+  group_by(exon_count) %>%
+  summarise(count = length(exon_count)) %>%
+  mutate(percentage = (count / sum(count)) * 100)
+```
+
+
+
+Extract myb protein fasta based on the identified MYB names.
+
+```{bash}
+# extract the proteins listed in myb_names.txt from the protein fasta
+MYB_DIR=data/annotation_analysis/myb_annotation
+seqkit faidx -l "$MYB_DIR/myb_names.txt" \
+  polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \
+  > "$MYB_DIR/AH_myb.fasta"
+```
+
+## Alignment and Phylogeny
+
+Creating the alignment with clustalOmega, the phylogeny using the phylogeny function of clustalw2. Downloaded MYB sequences of Beta vulgaris and Arabidopsis thaliana:
+
+Myb data from B. vulgaris and A. thaliana obtained from:
+https://static-content.springer.com/esm/art%3A10.1186%2Fs12870-014-0249-8/MediaObjects/12870_2014_249_MOESM4_ESM.txt
+Supplements of: Stracke, R., Holtgräwe, D., Schneider, J., Pucker, B., Rosleff Sörensen, T., & Weisshaar, B. (2014). Genome-wide identification and characterisation of R2R3-MYB genes in sugar beet (Beta vulgaris). BMC plant biology, 14(1), 1-17.
+
+```{bash}
+# locations of the data and of the two clustal binaries
+MYB_DIR=data/annotation_analysis/myb_annotation
+CLUSTALO=/home/tom/Documents/tools/clustalo-1.2.4-Ubuntu-x86_64
+CLUSTALW2=/home/tom/Documents/tools/clustalw-2.1-linux-x86_64-libcppstatic/clustalw2
+
+# align MYBs using clustalOmega: both fasta files are concatenated and read from stdin
+cat "$MYB_DIR/AH_myb.fasta" "$MYB_DIR/bvulgaris_athaliana_myb.fasta" \
+  | "$CLUSTALO" --threads=6 --in=- --outfile="$MYB_DIR/all_myb.aln" --force
+
+# create phylogeny from alignment using clustalw2 with 1000 bootstraps
+"$CLUSTALW2" -bootstrap=1000 -infile="$MYB_DIR/all_myb.aln" -outfile="$MYB_DIR/all_myb.phb"
+```
+
+## Plot phylogeny
+
+I will try to use the same functions as I did for my bachelor thesis.
+
+```{r}
+# read in the bootstrapped phylogeny
+tree <- read.raxml("data/annotation_analysis/myb_annotation/all_myb.phb")
+
+# circular tree without branch lengths, decorated further below
+t <- ggtree(tree, branch.length = 'none', layout='circular', size=0.2)
+
+# dataframe used for coloring: the first two characters of each label are
+# used as species code
+df <- na.omit(t$data[,3]) %>%
+  mutate(species = substr(label, 1, 2))
+df$species <- as.factor(df$species)
+
+
+# color palette for plotting:
+pal2 <- c("#FF0000", "#00A08A", "#F2AD00", "#F98400",
+"#D55E00", "#E69F00", "#56B4E9", "#0072B2")
+
+
+# attach the species codes to the tree, color the tip labels by species and
+# write the bootstrap values next to the nodes
+t_species <- t %<+% df +
+  geom_tiplab(aes(angle = angle, color = species,
+                  hjust= -0.06),
+              size=2) +
+  geom_nodelab(aes(label=bootstrap, angle=angle),
+            size=1,
+            nudge_x = 0.5) +
+  scale_shape_discrete(solid=F) +
+  scale_color_manual(values = pal2) +
+  theme(legend.position = 'none',
+        legend.direction = 'vertical',
+        plot.title = element_text(size=25, face='bold'),
+        legend.title = element_text(size = 20, face='bold'),
+        legend.text = element_text(size = 14),
+        legend.key.size = unit(1,'cm'))
+
+# exchange specific tip labels with their respective gene names; all other
+# labels (and missing labels) stay as they are
+t_species$data$label <- dplyr::recode(t_species$data$label,
+                                      # BvMYB1
+                                      "Bv_jkkr" = "BvMYB1",
+                                      # AmMYBl1
+                                      "AHp014591.1" = "AmMYBl1",
+                                      # betalain MYBs
+                                      "AHp022773.1" = "AhMYB2.1",
+                                      "AHp022773.2" = "AhMYB2.2",
+                                      "AHp016530.1" = "AhMYB3.1",
+                                      "AHp016531.1" = "AhMYB4.1")
+
+t_species
+
+
+```
+
+Annotate the subgroups within the phylogenetic tree, based on the A. thaliana and B. vulgaris assignments:
+
+```{r}
+plot_clade <- function(node, label){
+  # angle of the requested node, looked up in the plotted tree data
+  node_angle <- t_species$data$angle[node]
+  # clade bar plus a single text label for the subgroup at `node`
+  geom_cladelab(node = node,
+                label = label,
+                angle = node_angle,
+                barsize = 0.5,
+                fontsize = 4.5,
+                offset = 4,
+                offset.text = 0.2,
+                extend = 0.1)
+}
+
+# function to create a large clade label plus a smaller secondary label
+plot_clade2 <- function(node, label, label2 = ""){
+  # angle of the requested node, looked up in the plotted tree data
+  node_angle <- t_species$data$angle[node]
+  # the secondary label is rotated by one degree relative to the main label;
+  # the direction depends on the angle range the node falls into
+  sub_angle <- ifelse(node_angle <= 90 | node_angle >= 270, node_angle - 1, node_angle + 1)
+  # main label (large font) and secondary label (small font) share one clade bar
+  geom_cladelab(node = node,
+                label = c(label, label2),
+                angle = c(node_angle, sub_angle),
+                barsize = 0.5,
+                fontsize = c(4.5, 2.8),
+                vjust = c(0, 1.5),
+                offset = 4,
+                offset.text = 0.2,
+                extend = 0.1)
+}
+
+# label clades according to their clade number
+t_species +
+  plot_clade2(331,'S22', "Defense, stress response") +
+  plot_clade(342,'S23') +
+  plot_clade(347,'3R') +
+  plot_clade2(357,'S21', "Cell wall, lignin") +
+  plot_clade(379,'4R') +
+  plot_clade2(385,'S25', "Embryogenesis") +
+  plot_clade2(397,'S18', "Anther development") +
+  plot_clade(422,'S19+S20') +
+  plot_clade2(308,"S16", "Photomorphogenesis") +
+  plot_clade(492,'S10') +
+  plot_clade(487,'S24') +
+  plot_clade2(501,'S11', "Defense, stress response") +
+  plot_clade2(528,'S12', "Glucosinolate biosynthesis") +
+  plot_clade2(513,'S9', "Development") +
+  plot_clade2(460,'S13', "Mucilage, lignin") +
+  plot_clade2(446,'S4', "Phenylpropanoid biosynthesis") +
+  plot_clade2(568,'S14', "Axillary meristem, root growth") +
+  plot_clade2(542,'S3', "Lignin biosynthesis") +
+  plot_clade2(537,'S2', "Abiotic stress response") +
+  plot_clade2(548,'S7', "Flavonol biosynthesis") +
+  plot_clade2(603,'S6', "Anthocyanin biosynthesis") +
+  plot_clade2(606,'S15', "Development, cell fate") +
+  plot_clade2(592,'S5', "Proanthocyanidin biosynthesis") +
+  plot_clade2(476,'S8', "Lignin biosynthesis") +
+  plot_clade2(554,'S1', "Stress response") +
+  plot_clade(415,'S17') +
+  plot_clade(419,'S17') +
+  plot_clade(439,'S17') +
+  plot_clade2(598,'BvMYB1', "Betalain biosynthesis") +
+  plot_clade2(588,'AtMYB5', "Development, flavonoid biosynthesis") +
+  theme(plot.margin = margin(1,2,1,1, "cm"))
+
+ggsave(filename = "plots/MYB_phylogenetic_tree.png",
+       width = 14,
+       height = 14,
+       dpi = 600)
+
+ggsave(filename = "plots/MYB_phylogenetic_tree.pdf",
+       device = "pdf",
+       width = 12,
+       height = 12,
+       dpi = 600)
+```
+
+
+## Alignment of betalain and anthocyanin MYBs
+
+Create and plot alignment of MYBs of S6 and BvMYB1-like clades.
+
+```{bash}
+mkdir -p plots/AmMYB2_figure/
+
+# align using clustalomega; --force overwrites an existing alignment so the chunk can be re-run
+/home/tom/Documents/tools/clustalo-1.2.4-Ubuntu-x86_64 --in=data/annotation_analysis/myb_annotation/S6_betalain_myb_alignment/manual_S6.fasta --outfile=data/annotation_analysis/myb_annotation/S6_betalain_myb_alignment/manual_S6.aln --force
+```
+
+Import and plot alignment:
+
+```{r}
+# read in alignment
+S6_align <- readAAMultipleAlignment("data/annotation_analysis/myb_annotation/S6_betalain_myb_alignment/manual_S6.aln",
+                                    format = "fasta")
+S6_align@unmasked@ranges@NAMES <- c("AtMYB75 (PAP1)", "AtMYB90 (PAP2)", "AtMYB113", "AtMYB114", "AhMYB2.1", "AhMYB2.2", "BvMYB1", "Bv_ralf",
+                                    "AhMYB3.1","AhMYB4.1")
+
+# plot R2 and R3 domains of the sequence alignment
+ggmsa(S6_align,
+      start = 20,
+      end = 140,
+      seq_name = T,
+      by_conservation = F,
+      border = "black",
+      color = "Clustal") +
+  facet_msa(60) +
+  theme(text = element_text(size = 19),
+        plot.margin = unit(c(1,1,1,1), "cm"))
+
+# save the alignment
+ggsave(filename = "plots/AmMYB2_figure/S6_betalain_myb_align.png",
+       width = 9,
+       height = 6,
+       dpi = 320,
+       bg = "white")
+
+
+# plot only R3 domain in a single row
+ggmsa(S6_align,
+      start = 80,
+      end = 136,
+      seq_name = T,
+      by_conservation = F,
+      border = F,
+      color = "Chemistry_AA") +
+  theme(text = element_text(size = 11),
+        plot.margin = unit(c(0,0.5,0,0.5), "cm"))
+
+# save the alignment
+ggsave(filename = "plots/AmMYB2_figure/S6_betalain_myb_S6.png",
+       width = 9,
+       height = 2.5,
+       dpi = 400,
+       bg = "white")
+```
+
+Add matrix visualisation of important residues:
+
+```{r}
+residue <- 80:136
+# interaction motif (subtracting 79 maps residue 80, the first plotted residue, to vector index 1)
+interaction_motif <- rep(0, length(residue))
+interact_residues <- c(92,93,96,100,111,115)
+interaction_motif[interact_residues-79] <- 2
+# additional zimmermann residues
+zimmermann <- rep(0, length(residue))
+zimmermann_residues <- c(88,89,92,95,97,99,100,102,108,110,111,117,123,124,128)
+zimmermann[zimmermann_residues-79] <- 1
+# sakuta residues
+sakuta <- rep(0, length(residue))
+sakuta_residues <- c(93,96,101,114)
+sakuta[sakuta_residues-79] <- 1
+# hatlestad residues
+hatlestad <- rep(0, length(residue))
+hatlestad_residues <- c(92,93,96,97,99,100,111,115)
+hatlestad[hatlestad_residues-79] <- 1
+
+# prepare dataframe
+df <- data.frame(residue, interaction_motif, hatlestad, zimmermann, sakuta)
+df_melt <- melt(df, id.vars = "residue")
+df_melt$value <- as.factor(df_melt$value)
+
+residue_raster <- ggplot(data = df_melt) +
+  geom_tile(aes(x = residue,
+                y = variable,
+                fill = value),
+            width = 0.7,
+            height = 0.7,
+            linewidth = 0.5,
+            color = "black") +
+  theme_classic() +
+  scale_fill_manual(values = c("white", "black","red3")) +
+  labs(x = "",
+       y = "") +
+  theme(axis.line = element_blank(),
+        axis.ticks = element_blank(),
+        axis.text = element_blank(),
+        legend.position = "none")
+residue_raster
+
+
+ggsave(filename = "plots/AmMYB2_figure/residue_raster.png",
+       width = 9,
+       height = 1.5,
+       dpi = 400)
+```
diff --git a/workflows/annotation_analysis/betalain_and_flavonoid_identification.sh b/workflows/annotation_analysis/betalain_and_flavonoid_identification.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d11067692f19afdb04f2d6d738790d484a9aed17
--- /dev/null
+++ b/workflows/annotation_analysis/betalain_and_flavonoid_identification.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Script is used for the identification of betalain and flavonoid pathway genes in the A. hypochondriacus genome annotation version 2.2
+
+# identification of betalains by blast
+# create blast database of annotated protein sequences
+# source of betalain pathway protein sequences is described in manuscript
+mkdir -p data/annotation_analysis/betalains/DB
+mkdir -p data/annotation_analysis/flavonoids
+
+makeblastdb -in polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \
+	-out data/annotation_analysis/betalains/DB/blast_db \
+	-logfile data/annotation_analysis/betalains/blast_db.log \
+	-dbtype prot
+
+# run protein blast of described betalain pathway genes against blast database
+blastp -query data/annotation_analysis/betalains/pathway.fasta \
+	-db data/annotation_analysis/betalains/DB/blast_db \
+	-outfmt 7 \
+	-out data/annotation_analysis/betalains/pathway_against_protein.out \
+	-qcov_hsp_perc 80
+
+
+# identification of flavonoids using KIPEs
+# run KIPEs:
+python /home/tom/Documents/tools/KIPEs/KIPEs3.py --baits /home/tom/Documents/tools/KIPEs/flavonoid_baits/ \
+	--positions /home/tom/Documents/tools/KIPEs/flavonoid_residues/ \
+	--out data/annotation_analysis/flavonoids/ \
+	--subject polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \
+	--seqtype pep \
+	--cpus 6
+
+# create blast db
+mkdir data/annotation_analysis/flavonoids/blast_db
+makeblastdb -in polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta -out data/annotation_analysis/flavonoids/blast_db/db -dbtype nucl
+
+# blast searches for unidentified candidate genes using KIPEs bait sequences
+tblastn -query data/annotation_analysis/flavonoids/KIPEs/blast_query/ANS.fasta -db data/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > data/annotation_analysis/flavonoids/ANS_blast.out
+tblastn -query data/annotation_analysis/flavonoids/KIPEs/blast_query/ANR.fasta -db data/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > data/annotation_analysis/flavonoids/ANR_blast.out
+tblastn -query data/annotation_analysis/flavonoids/KIPEs/blast_query/F3-5-H.fasta -db data/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > data/annotation_analysis/flavonoids/F3-5-H_blast.out
+tblastn -query data/annotation_analysis/flavonoids/KIPEs/blast_query/FNS1.fasta -db data/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > data/annotation_analysis/flavonoids/FNS1_blast.out
+tblastn -query data/annotation_analysis/flavonoids/KIPEs/blast_query/CHI2.fasta -db data/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > data/annotation_analysis/flavonoids/CHI2_blast.out
+
+# exonerate protein alignments
+# prepare fasta file of KIPEs bait sequences (strip the first "%_" on each line before passing the file to exonerate)
+sed 's/%_//' data/annotation_analysis/flavonoids/KIPEs/blast_query/ANS.fasta > data/annotation_analysis/flavonoids/ANS_fixed.fasta
+# run exonerate
+exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query data/annotation_analysis/flavonoids/ANS_fixed.fasta --target polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > data/annotation_analysis/flavonoids/ANS_exonerate.gff
+
+sed 's/%_//' data/annotation_analysis/flavonoids/KIPEs/blast_query/ANR.fasta > data/annotation_analysis/flavonoids/ANR_fixed.fasta
+exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query data/annotation_analysis/flavonoids/ANR_fixed.fasta --target polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > data/annotation_analysis/flavonoids/ANR_exonerate.gff
+
+sed 's/%_//' data/annotation_analysis/flavonoids/KIPEs/blast_query/F3-5-H.fasta > data/annotation_analysis/flavonoids/F3-5-H_fixed.fasta
+exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query data/annotation_analysis/flavonoids/F3-5-H_fixed.fasta --target polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > data/annotation_analysis/flavonoids/F3-5-H_exonerate.gff
+
+sed 's/%_//' data/annotation_analysis/flavonoids/KIPEs/blast_query/FNS1.fasta > data/annotation_analysis/flavonoids/FNS1_fixed.fasta
+exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query data/annotation_analysis/flavonoids/FNS1_fixed.fasta --target polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > data/annotation_analysis/flavonoids/FNS1_exonerate.gff
+
+sed 's/%_//' data/annotation_analysis/flavonoids/KIPEs/blast_query/CHI2.fasta > data/annotation_analysis/flavonoids/CHI2_fixed.fasta
+exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query data/annotation_analysis/flavonoids/CHI2_fixed.fasta --target polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > data/annotation_analysis/flavonoids/CHI2_exonerate.gff
diff --git a/workflows/annotation_analysis/betalain_phylogenetic_analysis.Rmd b/workflows/annotation_analysis/betalain_phylogenetic_analysis.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..881844d0c47f5ddc341cfd0a4eb59415c56f1d01
--- /dev/null
+++ b/workflows/annotation_analysis/betalain_phylogenetic_analysis.Rmd
@@ -0,0 +1,92 @@
+---
+title: "Betalain_phylogenetic_analysis"
+author: "twinkle1"
+date: "2023-01-18"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(ggtree)
+library(ggrepel)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2/")
+```
+
+## Phylogenetic analysis of betalain pathway genes
+
+Perform phylogenetic analysis of betalain pathway genes. Searched for protein fasta sequences of known betalain pathway genes. Aligned the fasta sequences using ClustalOmega with default settings. Created a neighbour joining tree with 1000 bootstrap replicates using ClustalW. Used seaview to correctly root the phylogeny. Perform alignment and tree construction:
+
+```{bash}
+mkdir -p data/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD
+mkdir -p data/annotation_analysis/betalains/phylogenetic_analysis/DODA
+
+# CYP76AD
+### perform alignment
+/home/tom/Documents/tools/clustalo-1.2.4-Ubuntu-x86_64 --threads=6 --in=data/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD.fasta --outfile=data/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD.aln --force
+
+### create phylogeny, 1000 bootstrap replicates
+/home/tom/Documents/tools/clustalw-2.1-linux-x86_64-libcppstatic/clustalw2 -bootstrap=1000 -infile=data/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD.aln -outfile=data/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD.phb
+
+# DODA
+### perform alignment
+/home/tom/Documents/tools/clustalo-1.2.4-Ubuntu-x86_64 --threads=6 --in=data/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA.fasta --outfile=data/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA.aln --force
+
+### create phylogeny, 1000 bootstrap replicates
+/home/tom/Documents/tools/clustalw-2.1-linux-x86_64-libcppstatic/clustalw2 -bootstrap=1000 -infile=data/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA.aln -outfile=data/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA.phb
+```
+
+## Tree plotting
+
+After rerooting to the outgroup using seaview:
+
+```{r}
+# read in the CYP76AD1 tree
+tree.CYP <- read.tree(file = "data/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD_rooted.phb")
+
+# adjust names
+tree.CYP$tip.label <- gsub("\\_.*", "", tree.CYP$tip.label)
+tree.CYP$tip.label[12] <- "AhCYP76AD2"
+tree.CYP$tip.label[15] <- "AhCYP76AD5"
+
+# plot tree
+ggtree(tree.CYP, layout = "rectangular") +
+  geom_nodelab(hjust = -0.1, geom = "text") +
+  geom_tiplab(align = T) +
+  xlim_tree(0.4)
+
+# save tree
+ggsave(filename = "plots/CYP76AD_rooted.png", width = 10)
+ggsave(filename = "plots/CYP76AD_rooted.pdf", width = 10)
+
+
+# read in the DODA tree
+tree.DODA <- read.tree(file = "data/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA_rooted.phb")
+
+# adjust names
+tree.DODA$tip.label <- gsub("\\_.*", "", tree.DODA$tip.label)
+tree.DODA$tip.label[1] <- "BvDODAβ"
+tree.DODA$tip.label[7] <- "AhDODAα1"
+tree.DODA$tip.label[4] <- "AhDODAα2"
+tree.DODA$tip.label[6] <- "CqDODA-1"
+tree.DODA$tip.label[2] <- "McDODAβ"
+tree.DODA$tip.label[9] <- "McDODAα1"
+tree.DODA$tip.label[3] <- "BvDODAα2"
+tree.DODA$tip.label[5] <- "BvDODAα1"
+
+# plot tree
+ggtree(tree.DODA, layout = "rectangular") +
+  geom_nodelab(hjust = -0.1, geom = "text") +
+  geom_tiplab(align = T) +
+  xlim_tree(0.37)
+
+ggsave(filename = "plots/DODA_rooted.png", width = 10)
+ggsave(filename = "plots/DODA_rooted.pdf", width = 10)
+```
+
+
+
+
+
+
+
diff --git a/workflows/annotation_analysis/circos_plotting.Rmd b/workflows/annotation_analysis/circos_plotting.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..d52f50b9bedb962396ad23a018d227b030ade41b
--- /dev/null
+++ b/workflows/annotation_analysis/circos_plotting.Rmd
@@ -0,0 +1,153 @@
+---
+title: "Circos_plot"
+author: "twinkle1"
+date: '2022-08-25'
+output: html_document
+---
+
+```{r setup, include=FALSE}
+library(tidyverse)
+library(circlize)
+library(data.table)
+library(GenomicRanges)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publication/")
+```
+
+
+## Setup
+
+```{r}
+read.gtf <- function(file){
+  # Read a GTF file into a data.table and add gene_id / transcript_id columns.
+  # based on: https://www.biostars.org/p/272889/
+  gff <- fread(file)
+  # name the nine columns of the table
+  setnames(gff, names(gff), c("chr","source","type","start","end","score","strand","phase","attributes"))
+  # helper: value of the first attribute matching `att_of_interest`
+  # in one attribute string, NA if no attribute matches
+  extract_attributes <- function(gtf_column, att_of_interest){
+    # split into single attributes, then drop quotes and semicolons
+    fields <- unlist(strsplit(gtf_column, "; "))
+    fields <- gsub(";", "", gsub("\"", "", fields))
+    # split the matching attributes at the space between key and value
+    tokens <- unlist(strsplit(fields[grep(att_of_interest, fields)], " "))
+    # the second token is the value that follows the first matching key
+    if(is.null(tokens)) NA else tokens[2]
+  }
+  # apply the helper row by row for the two ids of interest
+  gff$gene_id <- unlist(lapply(gff$attributes, extract_attributes, "gene"))
+  gff$transcript_id <- unlist(lapply(gff$attributes, extract_attributes, "transcript"))
+  gff
+}
+
+# Create function for converting gtf dataframe to genomic ranges object
+Granges_from_gtf <- function(gtf){
+  # requires the GenomicRanges and tidyverse packages
+  # collapse all features of a transcript into one span per transcript id
+  spans <- summarise(group_by(gtf, transcript_id),
+                     gene_start = min(start),
+                     gene_end = max(end),
+                     seqnames = unique(chr), # expected: one chromosome per transcript
+                     gene_strand = unique(strand))
+  # one range per transcript, named by its transcript id
+  transcript_ranges <- IRanges(start = spans$gene_start,
+                               end = spans$gene_end,
+                               names = spans$transcript_id)
+  GRanges(seqnames = spans$seqnames,
+          ranges = transcript_ranges,
+          strand = spans$gene_strand)
+}
+```
+
+
+
+## Circos plot
+
+Load in data and perform necessary transformations.
+
+```{r}
+# for the circlize package, bed-like dataframes are required; they are built with transmute() because summarize() returning several rows is deprecated since dplyr 1.1.0
+# load in the indexed genome and create a bed-like format from the first 16 sequences of the index
+genome.bed <- read.table("polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta.fai")
+genome.bed <- genome.bed %>%
+  head(16) %>%
+  transmute(chr = V1,
+            start = 1,
+            end = V2)
+# define chromosome order (numeric suffix after the last underscore of the sequence name):
+order <- as.numeric(gsub(".*_", "", genome.bed$chr))
+genome.bed$chr <- as.factor(genome.bed$chr)
+# reorder based on the order vector
+genome.bed$chr <- reorder(genome.bed$chr, order)
+
+# load in genome annotation and keep the coordinates of all CDS features
+gene_annotation <- read.gtf("polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf")
+gene_bed <- gene_annotation %>%
+  filter(type=="CDS") %>%
+  transmute(chr = chr,
+            start = start,
+            end = end)
+
+# load in repetitive element annotation (gff columns 1, 4 and 5 = sequence, start, end)
+rep_annotation <- read.table("data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.out.gff")
+rep_bed <- rep_annotation %>%
+  transmute(chr = V1,
+            start = V4,
+            end = V5)
+
+# prepare MYB transcription factor and color pathway gene annotation in genome
+#myb_genes <- read.csv("data/manual_sheets/MYB_with_subgroups.csv")
+#myb_annotation <- gene_annotation %>%
+#  filter(transcript_id %in% myb_genes$query_name,
+#         type == "transcript") %>%
+#  group_by(gene_id) %>%
+#  summarise(chr = chr,
+#            start = min(start),
+#            end = max(end)) %>%
+#  unique()
+
+
+# prepare plot:
+# color scheme for chromosomes: 16 colors interpolated from white (#FFFFFF) to #71196E
+genome.bed$clr <- colorRampPalette(c("#FFFFFF", "#71196E"))(16)
+
+png(filename = "plots/circos.png", width = 4800, height = 4800, res = 1200) # 4800 px at 1200 ppi, i.e. a 4 x 4 inch figure
+circos.clear()
+circos.par("start.degree" = 90)
+circos.genomicInitialize(data=genome.bed, 
+                         tickLabelsStartFromZero = F, 
+                         axis.labels.cex = 0.3,
+                         labels.cex = 0.5)
+
+# this track adds grey outlines of the chromosomes
+# keep it commented out, since it does not add information to the figure
+#circos.track(ylim = c(0, 1), panel.fun = function(x, y) {
+#    chr = CELL_META$sector.index
+#    xlim = CELL_META$xlim
+#    ylim = CELL_META$ylim
+#    circos.rect(xlim[1], 0, xlim[2], 1, col = "lightgrey")
+#}, track.height = 0.10, bg.border = NA)
+
+# gene density
+circos.genomicDensity(gene_bed, 
+                      track.height=0.15,
+                      window.size = 1000000,
+                      col="dodgerblue3")
+
+# repetitive element density
+circos.genomicDensity(rep_bed,
+                      track.height=0.15,
+                      window.size = 1000000,
+                      col="forestgreen")
+
+
+dev.off()
+
+
+#circos.initialize(sectors = genome.bed$chr, sector.width = genome.bed$end, x=genome.bed$end)
+
+
+```
+
+
+
diff --git a/workflows/annotation_analysis/readme.txt b/workflows/annotation_analysis/readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ab1f357eb170404b986c307f4da1e8098f65d0e4
--- /dev/null
+++ b/workflows/annotation_analysis/readme.txt
@@ -0,0 +1,17 @@
+## Annotation analysis
+
+Identify betalain and flavonoid pathway genes, as well as MYB transcription factor genes in the new genome annotation v2.2.
+
+### Script order:
+
+- workflows/annotation_analysis/circos_plotting.Rmd
+Generate circos plot based on gene and repetitive element annotation
+
+- workflows/annotation_analysis/betalain_and_flavonoid_identification.sh
+Identify candidate betalain genes by BLAST, candidate flavonoid genes by KIPEs
+
+- workflows/annotation_analysis/betalain_phylogenetic_analysis.Rmd
+Phylogenetic analysis of betalain pathway genes
+
+- workflows/annotation_analysis/R2R3_analysis_reannotation.Rmd
+Identify MYB transcription factor genes by HMMscan, analyse putative function by phylogenetic assessment
diff --git a/workflows/braker2/braker2_prot.sh b/workflows/braker2/braker2_prot.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1ee42414ffc235c337b7c0b6682ae97d880df99f
--- /dev/null
+++ b/workflows/braker2/braker2_prot.sh
@@ -0,0 +1,48 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 100:00:00
+#SBATCH -J braker
+#SBATCH -o logs/braker/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=8
+#SBATCH --mem=84gb
+#SBATCH --mail-user=twinkle1@smail.uni-koeln.de
+#SBATCH --mail-type=ALL
+
+#Braker2 protein input using the plant sequences from orthoDB (downloaded from: https://v100.orthodb.org/download/odb10_plants_fasta.tar.gz) as well as the protein sequences from amaranthus cruentus (removed asterisks and space in fasta header)
+#The downloaded dataset contains sequences from 117 embryophyte species.
+
+#wget https://v100.orthodb.org/download/odb10_plants_fasta.tar.gz
+#tar -xvf odb10_plants_fasta.tar.gz
+# write all into the same file:
+#cat plants/Rawdata/* > protein_db.fasta
+
+# also add the Cruentus sequences:
+#cat /projects/ag-stetter/twinkle/Amaranthus_cruentus/Amacr_pep_nospace.fa >> protein_db.fasta
+# removed asterisk
+#sed 's/\*//' protein_db.fasta > protein_db.fa
+
+#-> in total 3536219 plant protein sequences
+#Since I added the Cruentus sequences, the total number of species is now 118.
+
+
+# run on cheops1
+# this script is used to run braker2 in protein mode only (--epmode, no RNA-seq evidence)
+
+source "$CONDA_PREFIX/etc/profile.d/conda.sh"  # quoted so a prefix containing spaces still resolves
+conda activate /opt/rrzk/software/conda-envs/braker2
+
+module load samtools/1.13
+
+mkdir -p data/braker2/polished_prot
+
+# run braker in protein mode (protein database as only evidence):
+
+braker.pl --AUGUSTUS_CONFIG_PATH=/home/twinkle1/tools/config/ \
+	--epmode \
+	--prot_seq=data/braker2/input/protein_db.fasta \
+	--genome=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \
+	--softmasking \
+	--species=polished_prot \
+	--cores=8 \
+	--workingdir=data/braker2/polished_prot
diff --git a/workflows/braker2/braker2_prot_rna.sh b/workflows/braker2/braker2_prot_rna.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ca0de631d378fbf3d694addff56804f7ad35bec4
--- /dev/null
+++ b/workflows/braker2/braker2_prot_rna.sh
@@ -0,0 +1,43 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 55:00:00
+#SBATCH -J braker
+#SBATCH -o logs/braker/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=8
+#SBATCH --mem=84gb
+
+# run on cheops1
+# this script is used to run braker2 in both RNAseq and protein mode
+
+source "$CONDA_PREFIX/etc/profile.d/conda.sh"  # quoted so a prefix containing spaces still resolves
+conda activate /opt/rrzk/software/conda-envs/braker2
+
+module load samtools/1.13  # pinned to the version the other workflow scripts load
+
+# it is recommended to merge the separate bam files beforehand, as specifying many files can cause issues with braker
+samtools merge --threads 7 data/braker2/STAR_mappings/clouse_reads_merged.bam \
+	data/braker2/STAR_mappings/SRR_0_Aligned.sortedByCoord.out.bam \
+	data/braker2/STAR_mappings/SRR_1_Aligned.sortedByCoord.out.bam \
+	data/braker2/STAR_mappings/SRR_2_Aligned.sortedByCoord.out.bam \
+	data/braker2/STAR_mappings/SRR_3_Aligned.sortedByCoord.out.bam \
+	data/braker2/STAR_mappings/SRR_4_Aligned.sortedByCoord.out.bam \
+	data/braker2/STAR_mappings/SRR_5_Aligned.sortedByCoord.out.bam \
+	data/braker2/STAR_mappings/SRR_6_Aligned.sortedByCoord.out.bam \
+	data/braker2/STAR_mappings/SRR_7_Aligned.sortedByCoord.out.bam
+
+# it is unnecessary, however, to filter the bam files beforehand
+# see (https://github.com/Gaius-Augustus/BRAKER/issues/241)
+
+# run braker in RNAseq and protein mode:
+mkdir -p data/braker2/polished_prot_rna
+
+braker.pl --AUGUSTUS_CONFIG_PATH=/home/twinkle1/tools/config/ \
+	--etpmode \
+	--prot_seq=data/braker2/input/protein_db.fasta \
+	--genome=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \
+	--softmasking \
+	--species=polished_prot_rna \
+	--bam=data/braker2/STAR_mappings/clouse_reads_merged.bam \
+	--cores=8 \
+	--workingdir=data/braker2/polished_prot_rna
diff --git a/workflows/braker2/index_STAR.sh b/workflows/braker2/index_STAR.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6eb95f9002aa4084551caed185c57d0fd6b30603
--- /dev/null
+++ b/workflows/braker2/index_STAR.sh
@@ -0,0 +1,30 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 1:00:00
+#SBATCH -J STAR
+#SBATCH -o logs/STAR/braker/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=8
+#SBATCH --mem=32gb
+#SBATCH --job-name="index_STAR"
+
+module load star/2.7.8a
+
+# Index the reference genome
+# only run once per reference genome
+
+# 8 threads, genome generation mode
+# sjdbOverhang and genomeSAindexNbases settings specific for the amaranth reference assembly v2.1
+# more specific settings: use the polished, softmasked reference assembly
+# as SJDB file, use the braker2 protein-mode gtf file (NOTE(review): braker2_prot.sh writes to data/braker2/polished_prot/ - confirm the prot_run path used below)
+
+
+mkdir -p data/braker2/STAR_index/
+
+STAR --runThreadN 8 \
+	--runMode genomeGenerate \
+	--genomeDir data/braker2/STAR_index/ \
+	--sjdbOverhang 89 \
+	--genomeSAindexNbases 13 \
+	--genomeFastaFiles polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \
+	--sjdbGTFfile data/braker2/prot_run/braker.gtf
diff --git a/workflows/braker2/readme.txt b/workflows/braker2/readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7dee01eb4eba54b1cab58a45205063e4ea31da60
--- /dev/null
+++ b/workflows/braker2/readme.txt
@@ -0,0 +1,17 @@
+## Computational annotation
+
+Create computational genome annotation using BRAKER2.
+ 
+### Script order:
+
+- workflows/braker2/braker2_prot.sh
+initial run of BRAKER2 using only the protein database as evidence
+
+- workflows/braker2/index_STAR.sh
+index the polished reference genome for use with STAR
+
+- workflows/braker2/run_STAR.sh
+map all short reads from Clouse et al. to the polished reference genome for use with BRAKER2
+
+- workflows/braker2/braker2_prot_rna.sh
+run BRAKER2 with the mapped RNA-seq reads as well as the protein database as input
diff --git a/workflows/braker2/run_STAR.sh b/workflows/braker2/run_STAR.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c552c8f4089e6bc54b2299b31a8f1e7308b0f210
--- /dev/null
+++ b/workflows/braker2/run_STAR.sh
@@ -0,0 +1,50 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 1:00:00
+#SBATCH -J STAR
+#SBATCH -o logs/STAR/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=8
+#SBATCH --mem=8gb
+#SBATCH --array 0-7
+
+
+# Beforehand:
+#Short read data downloaded from SRA using the following accession/run numbers:
+
+#SRA Accession numbers:
+#Floral tissue: SRX722058 SRR1598911
+#Leaf tissue: SRX722059 SRR1598912
+#Root tissue: SRX722060 SRR1598913
+#Stem tissue: SRX722057 SRR1598910
+#Water stressed tissue sample: SRX722061 SRR1598914
+#Immature seeds: SRX722056 SRR1598909
+#Mature seeds: SRX722063 SRR1598916
+#Green cotyledon: SRX722062 SRR1598915
+
+#using the following commands:
+#Download (show progress):
+#/home/twinkle1/tools/sratoolkit.2.11.2-centos_linux64/bin/prefetch -p -O Clouse_short_reads/ SRR1598916
+#Converted to fastq (reads separated into two files, with an additional file for unpaired reads):
+#/home/twinkle1/tools/sratoolkit.2.11.2-centos_linux64/bin/fastq-dump --split-3 --outdir /scratch/twinkle1/Clouse_short_reads/ Clouse_short_reads/SRR1598909
+
+
+module load star/2.7.8a
+
+# create array of read fastq files (R1 only):
+SOURCE_DIR=raw_data/Clouse_short_reads
+FILES=("$SOURCE_DIR"/SRR*_1.fastq)
+
+# R1 file of this array task; stop early if it is missing (unmatched glob or task id beyond the file list)
+: "${SLURM_ARRAY_TASK_ID:?run this script as a SLURM array job}"; R1="${FILES[SLURM_ARRAY_TASK_ID]:-}"
+[[ -f "$R1" ]] || { echo "no R1 fastq file for array task ${SLURM_ARRAY_TASK_ID} in ${SOURCE_DIR}" >&2; exit 1; }
+
+# run STAR after genome index creation; the R2 file name is derived from R1 by swapping the _1.fastq suffix
+mkdir -p data/braker2/STAR_mappings/
+
+STAR --runThreadN 8 \
+	--runMode alignReads \
+	--outSAMtype BAM SortedByCoordinate \
+	--genomeDir data/braker2/STAR_index \
+	--outFileNamePrefix data/braker2/STAR_mappings/SRR_"${SLURM_ARRAY_TASK_ID}"_ \
+	--readFilesIn "$R1" "${R1%_1.fastq}_2.fastq"
diff --git a/workflows/functional_annotation/analyse_functional_annotation.Rmd b/workflows/functional_annotation/analyse_functional_annotation.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..41c76f578a0b7c01a6d7474b50ef547192f0df52
--- /dev/null
+++ b/workflows/functional_annotation/analyse_functional_annotation.Rmd
@@ -0,0 +1,83 @@
+---
+title: "analyse_functional_annotation"
+author: "twinkle1"
+date: "2023-06-19"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(ggVennDiagram)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publication/")
+```
+
+Prepare input files:
+
+```{bash}
+mkdir data/functional_annotation/analysis
+
+# prepare interproscan annotations (Pfam, PANTHER and CDD)
+awk '{if (($4 == "Pfam") || ($4 == "PANTHER") || ($4 == "CDD")) {print $1}}' data/functional_annotation/interproscan/Ahypochondriacus_2.2_polished_corrected.prot.fasta.tsv | sort | uniq > data/functional_annotation/analysis/interpro_genes.txt
+# prepare eggnog annotation
+tail -n +6 data/functional_annotation/eggnog_mapper/MM_8wexw920.emapper.annotations.tsv | head -n -3 | awk '{print $1}' > data/functional_annotation/analysis/eggnog_genes.txt
+# prepare mercator annotations
+grep ">" data/functional_annotation/mercator_v4/Ahyp2.fa | grep -v "not classified" | sed 's/ .*//' | sed 's/>//' > data/functional_annotation/analysis/mercator_genes.txt
+```
+
+Load in gene names and analyse overlap:
+
+```{r}
+# Read a list of annotated transcripts and tag it with its origin.
+#   file:   path of the text file, read into a single column called "transcript_id"
+#   source: name of the annotation program, stored in the column "source"
+# Returns the table with the additional columns gene_id (the first 9 characters
+# of transcript_id) and source.
+annotation_reader <- function(file, source){
+  read_table(file = file, col_names = "transcript_id") %>%
+    mutate(gene_id = substr(transcript_id, 1, 9),
+           source = source)
+}
+
+# load in genes
+ips_genes <- annotation_reader(file = "data/functional_annotation/analysis/interpro_genes.txt",
+                               source = "Interproscan")
+eggnog_genes <- annotation_reader(file = "data/functional_annotation/analysis/eggnog_genes.txt",
+                               source = "eggNOG")
+mercator_genes <- annotation_reader(file = "data/functional_annotation/analysis/mercator_genes.txt",
+                               source = "Mercator")
+
+# create input list
+gene_list <- list(Interproscan = unique(ips_genes$gene_id),
+                  eggNOG_mapper = unique(eggnog_genes$gene_id),
+                  Mercator = unique(mercator_genes$gene_id))
+
+# plot Venn Diagram
+p1 <- ggVennDiagram(gene_list,
+                    label = "count") +
+  scale_x_continuous(expand = expansion(mult = .15)) +
+  scale_fill_distiller(palette = "RdBu") +
+  labs(fill = "Number of genes")
+p1
+
+ggsave(filename = "plots/functional_annotation_venn.png",
+       height = 5,
+       width = 7,
+       bg = "white",
+       dpi = 400)
+
+
+# Print the number of elements in every intersection of a list of sets.
+#   lsvenn: named list of vectors; the default is four random samples of 15 letters
+#           and only serves as example input
+# The intersections are computed with gplots::venn() without drawing the plot.
+# The last expression is print(), which returns its argument invisibly, so the value
+# of the function is the vector of intersection sizes.
+getVennOverlap <- function(lsvenn = list(A = sort(sample(LETTERS, 15)),
+                                     B = sort(sample(LETTERS, 15)),
+                                     C = sort(sample(LETTERS, 15)),
+                                     D = sort(sample(LETTERS, 15)))
+                           ) {
+  
+  ItemsList <- gplots::venn(lsvenn, show.plot = FALSE)
+  print(lengths(attributes(ItemsList)$intersections))
+  #return(attributes(ItemsList)$intersections)
+}
+
+# total over all intersection sizes of the three gene lists
+sum(getVennOverlap(lsvenn = gene_list))
+```
+
diff --git a/workflows/functional_annotation/readme.txt b/workflows/functional_annotation/readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2c9dbeb6d6020a54eab763608a3bcda0c6d39cd2
--- /dev/null
+++ b/workflows/functional_annotation/readme.txt
@@ -0,0 +1,13 @@
+## functional annotation
+
+Functional annotation and analysis
+
+### Script order:
+
+- code/functional_annotation/run_interproscan.sh
+Run Interproscan on protein sequences for functional annotation
+
+Functional annotation using eggNOG-mapper and Mercator was done using online submission and not run locally.
+
+- code/functional_annotation/analyse_functional_annotation.Rmd
+Analyse the number of annotated genes etc. for all three functional annotation programs
diff --git a/workflows/functional_annotation/run_interproscan.sh b/workflows/functional_annotation/run_interproscan.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1c858364662d21143f32ff28b467a65736d15590
--- /dev/null
+++ b/workflows/functional_annotation/run_interproscan.sh
@@ -0,0 +1,42 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 168:00:00
+#SBATCH -J interpro
+#SBATCH -o logs/functional_annotation/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=20
+#SBATCH --mem=32gb
+#SBATCH --mail-type=ALL
+
+# run on cheops1
+# this script is used to run interproscan on the protein output of the manually corrected amaranth annotation
+# installed on scratch due to the size of the installation
+# installed interproscan using the following commands based on https://interproscan-docs.readthedocs.io/en/latest/HowToDownload.html
+# downloaded with: wget https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.56-89.0/interproscan-5.56-89.0-64-bit.tar.gz
+# checked md5sum after download
+# extracted tarball and indexed hmm models before the first run using the following command: python3 initial_setup.py
+
+# load required modules
+module load openjdk/11.0.2
+
+# set variables
+# project directory: the job starts in the directory given by "#SBATCH -D"; it is stored as an
+# absolute path because the working directory is changed to the interproscan installation below
+# and relative input/output paths would then no longer point into the project
+PROJECTDIR="$PWD"
+# interpro directory with executable shell script
+INTERPRODIR=/scratch/twinkle1/interproscan/interproscan-5.56-89.0/
+# output directory for the annotation run
+INTERPROOUT="$PROJECTDIR"/data/functional_annotation/interproscan/
+# input amino acid fasta file
+INTERPROIN="$PROJECTDIR"/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta
+
+# main
+# -p: also create missing parent directories and do not fail if the directory already exists
+mkdir -p "$INTERPROOUT"
+# stop if the installation directory is missing instead of running ./interproscan.sh somewhere else
+cd "$INTERPRODIR" || exit 1
+
+# dp:disables online lookup, f:output formats, iprlookup: for goterms and pa(thway) matching, 18 cpus as the main application also needs always 1 thread
+./interproscan.sh -i "$INTERPROIN" \
+	-dp \
+	-f tsv,xml,gff3 \
+	-d "$INTERPROOUT" \
+	-iprlookup \
+	-goterms \
+	-pa \
+	-cpu 18
diff --git a/workflows/gene_expression_quantification/isoseq_expression_support.Rmd b/workflows/gene_expression_quantification/isoseq_expression_support.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..955383247301777b226847cfdeb7c7c446a93efd
--- /dev/null
+++ b/workflows/gene_expression_quantification/isoseq_expression_support.Rmd
@@ -0,0 +1,85 @@
+---
+title: "isoseq_support_for_annotation"
+author: "twinkle1"
+date: "2023-02-08"
+output: html_document
+---
+
+```{r setup}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(data.table)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publication/")
+```
+
+
+This script assesses the isoseq support for each annotated transcript.
+
+Function to read in gtf file:
+
+```{r}
+# set up function for reading in a gtf file
+# file: path to a gtf file with the nine standard columns
+# Returns the table read by fread() with the columns renamed and two additional columns,
+# gene_id and transcript_id, extracted from the attributes column (NA if not present).
+read.gtf <- function(file){
+  # based on: https://www.biostars.org/p/272889/
+  # read in the gtf file:
+  gff <- fread(file)
+  setnames(gff, names(gff), c("chr","source","type","start","end","score","strand","phase","attributes"))
+  # subset attribute column into the gene and transcript id columns
+  # function for extracting the two attributes
+  # gtf_column: attribute string of a single gtf line
+  # att_of_interest: pattern passed to grep(), so every attribute whose text contains it is matched
+  extract_attributes <- function(gtf_column, att_of_interest){
+    # split into single attributes and remove quotes and remaining semicolons
+    att <- strsplit(gtf_column, "; ")
+    att <- gsub("\"","",unlist(att))
+    att <- gsub(";","",unlist(att))
+    # the matching attributes are split at spaces; the second token is the value of the first match
+    if(!is.null(unlist(strsplit(att[grep(att_of_interest, att)], " ")))){
+      return( unlist(strsplit(att[grep(att_of_interest, att)], " "))[2])
+    }else{
+      # no attribute matched
+      return(NA)
+    }
+  }
+  # using the function to subset gene and transcript id:
+  gff$gene_id <- unlist(lapply(gff$attributes, extract_attributes, "gene"))
+  gff$transcript_id <- unlist(lapply(gff$attributes, extract_attributes, "transcript"))
+  return(gff)
+}
+```
+
+Bash script to run gffcompare for the isoseq to reference annotation comparison.
+
+```{bash}
+# -p: create missing parent directories and do not fail if the directory already exists
+mkdir -p data/gene_expression_quantification/isoseq/
+
+# run gffcompare
+# -r: the isoseq transcripts are used as reference, the genome annotation is the query
+/home/tom/Documents/tools/gffcompare/gffcompare -r data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.gtf polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf
+# move to output directory
+# gffcompare writes its gffcmp.* output files into the current directory
+mv gffcmp.* data/gene_expression_quantification/isoseq/
+```
+
+
+It might be easier to use the output of gffcompare instead of sqanti:
+
+
+```{r}
+# read in tracking file
+gff_tracking <- read.table(file = "data/gene_expression_quantification/isoseq/gffcmp.tracking")
+gff_tracking$V4 <- as.factor(gff_tracking$V4)
+colnames(gff_tracking) <- c("locus", "xlocus", "isoseq_transcript", "code", "reference_transcript")
+
+# overview: number of transcripts per gffcompare class code
+code_counts <- summary(gff_tracking$code)
+code_counts
+
+# matching exactly the intron chain of isoseq transcripts
+# contained "c" reference transcripts and equal transcripts "="
+# NOTE(review): the classes are selected by their position in the factor levels, this assumes
+# that "=" and "c" are the first two levels - confirm with the overview printed above
+sum(code_counts[1:2])
+sum(code_counts[1:2])/nrow(gff_tracking)
+
+# multi-exon with at least one splice junction match
+# the three classes are selected in one index vector; adding code_counts[5] to code_counts[1:2]
+# would recycle it and count the fifth class twice
+# NOTE(review): the fifth level is presumably class code "j" - confirm with the overview
+sum(code_counts[c(1, 2, 5)])
+sum(code_counts[c(1, 2, 5)])/nrow(gff_tracking)
+
+# add single column reference transcript
+# every entry is split at "|" into 7 fields, the second field is kept
+gff_tracking$reference_transcript_short <- matrix(unlist(strsplit(x = gff_tracking$reference_transcript, split = "\\|")), ncol = 7, byrow = T)[,2]
+# all transcripts there?
+length(unique(gff_tracking$reference_transcript_short)) #28074
+```
+
+
diff --git a/workflows/gene_expression_quantification/plot_expression_quantification_kallisto.Rmd b/workflows/gene_expression_quantification/plot_expression_quantification_kallisto.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..a384ececc24257d47eef82f4c64b3a84574c02a6
--- /dev/null
+++ b/workflows/gene_expression_quantification/plot_expression_quantification_kallisto.Rmd
@@ -0,0 +1,271 @@
+---
+title: "Clouse_kallisto_gene_expression_quantification"
+author: "twinkle1"
+date: '2022-10-11'
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(pheatmap)
+library(reshape2)
+library(scales)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publication/")
+```
+
+After gene expression quantification using kallisto (of fastq files against the indexed transcriptome (!)), create plots to visualize the gene expression levels of different genes in different tissues.
+
+```{r}
+##### Setup, loading of data, preprocessing
+
+# vector of input directories
+# list.dirs() returns the base directory first, followed by its subdirectories
+indirs <- list.dirs(path = "data/gene_expression_quantification/kallisto_quant")
+indirs <- indirs[-1] # remove base directory
+indirs <- indirs[grep("index", indirs, invert = T)] # remove index directory
+
+# read in abundance file for each input directory
+abundances <- list()
+for (i in 1:length(indirs)){
+  abundances[[i]] <- read.table(file = paste0(indirs[i], "/abundance.tsv"), header = T)
+}
+
+# save as single dataframe with the tpm for each tissue, likely first in wide format
+# exactly 8 abundance tables are expected; the target ids are taken from the first table
+# NOTE(review): assumes all abundance files list the same targets in the same order - confirm
+tpm.df <- data.frame(abundances[[1]]$target_id,
+                     abundances[[1]]$tpm,
+                     abundances[[2]]$tpm,
+                     abundances[[3]]$tpm,
+                     abundances[[4]]$tpm,
+                     abundances[[5]]$tpm,
+                     abundances[[6]]$tpm,
+                     abundances[[7]]$tpm,
+                     abundances[[8]]$tpm)
+# column names derived from the directory names ...
+colnames(tpm.df) <- c("GeneID", gsub("data/gene_expression_quantification/kallisto_quant/", "", indirs))
+# ... are immediately replaced by hard-coded display names
+# NOTE(review): the display names are assigned by position and presumably follow the order of
+# the directories in indirs - verify against the directory names before changing the samples
+colnames(tpm.df) <- c("GeneID", "Cotyledones", "Flower", "Leaf", "Mature seed", "Root", "Stem", "Water-stressed", "Developing seed")
+
+# melt dataframe for plotting into long format:
+# one row per transcript and tissue
+melted.df <- melt(tpm.df, id.vars = "GeneID")
+colnames(melted.df) <- c("GeneID", "tissue", "tpm")
+
+# save wide and long table
+write_csv(tpm.df,
+          file = "data/gene_expression_quantification/kallisto_quant/all_tissue_expression.csv")
+write_csv(melted.df,
+          file = "data/gene_expression_quantification/kallisto_quant/all_tissue_expression_long.csv")
+```
+
+Create dataframe of the betalain genes and the myb genes which can be later used for plotting the gene expression levels for the respective genes.
+
+```{r}
+# load object with names of all betalain and flavonoid genes
+pathway_genes <- read.csv(file = "data/manual_sheets/color_pathway_genes.csv", header=T)
+
+# split by the value of the Pathway column
+betalain.genes <- pathway_genes %>%
+  filter(Pathway == "Betalain")
+
+flavonoid.genes <- pathway_genes %>%
+  filter(Pathway == "Flavonoid")
+
+# read in the myb genes and subset the betalain mybs
+myb_genes <- read.csv(file = "data/manual_sheets/MYB_with_subgroups.csv", header=T)
+#myb_genes <- read_csv(file = "data/annotation_analysis/myb_annotation/myb_stats.csv")
+# the four columns are renamed by position
+colnames(myb_genes) <- c("Transcript_id", "Freq", "Gene_id", "Subgroup")
+
+```
+
+Plot expression levels in different tissues for the betalain genes of amaranth.
+
+```{r}
+# subset the melted tpm dataframe and plot gene expression
+# the transcript suffix (everything from the first ".") is removed to compare with the gene ids
+betalain.df <- melted.df %>%
+  filter(gsub("\\..*", "", GeneID) %in% betalain.genes$Gene_id) %>%
+  mutate(joining = gsub("\\..*", "", GeneID))
+
+# join to obtain pathway gene names
+betalain_plotting <- left_join(x = betalain.df, y = betalain.genes, by = c("joining" = "Gene_id")) 
+betalain_plotting <- betalain_plotting %>%
+  select(GeneID, tissue, tpm, Gene) %>%
+  mutate(label = paste0(Gene, " (", GeneID, ")"))
+# fix the order of the rows in the plot
+# the levels have to be identical to the labels created above, labels that are not listed become NA
+betalain_plotting$label <- factor(betalain_plotting$label, levels = c("AhCYP76AD2 (AHp023148.1)",
+                                                                      "AhCYP76AD5 (AHp000674.1)",
+                                                                      "AhDODAα1 (AHp023147.1)",
+                                                                      "AhDODAα2 (AHp010386.1)",
+                                                                      "AhBetanidin5GT (AHp001663.1)",
+                                                                      "AhBetanidin6GT (AHp005940.1)",
+                                                                      "AhcDOPA5GT (AHp007219.1)",
+                                                                      "AhMYB2 (AHp022773.1)",
+                                                                      "AhMYB2 (AHp022773.2)",
+                                                                      "AhMYB3 (AHp016530.1)",
+                                                                      "AhMYB4 (AHp016531.1)"))
+
+# fix the order of the tissues on the x axis
+betalain_plotting$tissue <- factor(betalain_plotting$tissue, levels = c("Root",
+                                                                          "Cotyledones",
+                                                                          "Flower",
+                                                                          "Leaf",
+                                                                          "Stem",
+                                                                          "Developing seed",
+                                                                          "Mature seed",
+                                                                          "Water-stressed"))
+
+
+# new plotting of betalain gene expression per tissue
+# tiles are colored by tpm + 1 on a log10 scale (the + 1 keeps a tpm of 0 plottable),
+# the text inside the tiles shows the tpm rounded to two digits
+ggplot(data = betalain_plotting) +
+  geom_tile(aes(x = tissue, y = label, fill = tpm + 1)) +
+  geom_text(aes(x = tissue, y = label, label = sprintf("%0.2f", round(tpm, digits = 2))),
+            size = 3.5) +
+  scale_fill_distiller(palette = "RdYlBu", trans = "log10", labels = comma) +
+  # reverse the y axis so that the first factor level is at the top
+  scale_y_discrete(limits = rev) +
+  #scale_fill_viridis_c(alpha = 0.7, trans = "log10") +
+  labs(fill = "TPM") +
+  theme_classic() +
+  # the fill scale always includes the value 600
+  expand_limits(fill = 600) +
+  theme(text = element_text(size=22),
+        axis.title.x = element_blank(),
+        axis.title.y = element_blank(),
+        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
+# no plot object is passed, ggsave() saves the last plot
+ggsave(filename = "plots/gene_expression_quantification/betalain_expression.png", width = 10, height = 7,
+       dpi = 400)
+
+
+```
+
+
+Recreate betalain gene expression inference for the MYB transcription factor genes, could also try clustering the genes based on gene expression.
+
+```{r}
+# subset the expression dataframe based on MYB transcription factor information
+myb.df <- melted.df %>%
+  filter(GeneID %in% myb_genes$Transcript_id)
+
+# join the myb.df object with the subgroup information of the myb_genes dataframe
+myb.df <- left_join(myb.df, myb_genes, by = c("GeneID" = "Transcript_id"))
+
+# create label for plot
+# NOTE(review): the labels follow the row order of myb_genes, while the matrix rows below follow
+# the transcript order of myb.df - assumes both orders are identical, confirm
+label <- paste0(myb_genes$Transcript_id, " (", myb_genes$Subgroup, ")")
+
+# replace transcript id for particular genes
+# NOTE(review): the positions are hard-coded and depend on the row order of the MYB table
+label[52] <- "AmMYBl1.1 (S5)"
+label[62] <- "AhMYB3.1 (BvMYB1-like)"
+label[63] <- "AhMYB4.1 (BvMYB1-like)"
+label[93] <- "AhMYB2.1 (BvMYB1-like)"
+label[94] <- "AhMYB2.2 (BvMYB1-like)"
+
+# create matrices for the pheatmap function
+# the tpm vector is filled column by column into 8 columns
+# NOTE(review): assumes myb.df is ordered tissue by tissue so that each column is one tissue
+# myb.mat_label holds the rounded tpm shown as numbers in the cells,
+# myb.mat holds log10(tpm + 1) used for coloring and clustering
+myb.mat_label <- matrix(data = round(myb.df$tpm, digits = 2), 
+                  ncol = 8,
+                  byrow = F)
+myb.mat <- matrix(data = log10(myb.df$tpm+1), 
+                  ncol = 8,
+                  byrow = F)
+rownames(myb.mat) <- label
+colnames(myb.mat) <- unique(myb.df$tissue)
+rownames(myb.mat_label) <- label
+colnames(myb.mat_label) <- unique(myb.df$tissue)
+
+# define column order
+col_order <- c("Root",
+          "Cotyledones",
+          "Flower",
+          "Leaf",
+          "Stem",
+          "Developing seed",
+          "Mature seed",
+          "Water-stressed")
+
+# change column order
+# columns are selected by name
+myb.mat <- myb.mat[,col_order]
+myb.mat_label <- myb.mat_label[,col_order]
+
+# create clustered heatmap
+# rows (transcripts) are clustered, the tissue columns keep the order defined above
+# the legend is labelled 0, 10 and 100 at the breaks 0, 1 and 2 of the log10 scale
+pheatmap(mat = myb.mat,
+         cluster_rows = T,
+         cluster_cols = F,
+         angle_col = 45,
+         filename = "plots/gene_expression_quantification/myb_expression.png",
+         width = 10,
+         height = 15,
+         display_numbers = myb.mat_label,
+         number_color = "black",
+         fontsize_number = 9,
+         legend_breaks = c(0,1,2),
+         legend_labels = c(0, 10^1, 10^2))
+
+```
+
+
+Also check the flavonoid pathway for expression differences, huge expression perhaps not expected since the plant produces white seeds:
+
+```{r}
+# flavonoid pathway plot
+# subset the melted tpm dataframe and plot gene expression
+# the transcript suffix (everything from the first ".") is removed to compare with the gene ids
+flavonoid.df <- melted.df %>%
+  filter(gsub("\\..*", "", GeneID) %in% flavonoid.genes$Gene_id) %>%
+  mutate(joining = gsub("\\..*", "", GeneID))
+
+# join to obtain pathway gene names
+flavonoid_plotting <- left_join(x = flavonoid.df, y = flavonoid.genes, by = c("joining" = "Gene_id")) 
+flavonoid_plotting <- flavonoid_plotting %>%
+  select(GeneID, tissue, tpm, Gene) %>%
+  mutate(label = paste0(Gene, " (", GeneID, ")"))
+
+# fix the order of the tissues on the x axis
+flavonoid_plotting$tissue <- factor(flavonoid_plotting$tissue, levels = c("Root",
+                                                                          "Cotyledones",
+                                                                          "Flower",
+                                                                          "Leaf",
+                                                                          "Stem",
+                                                                          "Developing seed",
+                                                                          "Mature seed",
+                                                                          "Water-stressed"))
+
+# fix the order of the rows in the plot
+# the levels have to be identical to the labels created above, labels that are not listed become NA
+flavonoid_plotting$label <- factor(flavonoid_plotting$label, levels = c("PAL_1 (AHp012752.1)",
+                                                                         "PAL_2 (AHp021980.1)",
+                                                                         "C4H_1 (AHp013217.1)",
+                                                                         "C4H_1 (AHp013217.2)",
+                                                                         "C4H_2 (AHp022384.1)",
+                                                                         "C4H_3 (AHp022382.1)",
+                                                                         "C4H_3 (AHp022382.2)",
+                                                                         "4CL_1 (AHp014409.1)",
+                                                                         "4CL_2 (AHp020962.1)",
+                                                                         "CHS (AHp004305.1)",
+                                                                         "CHS (AHp004305.2)",
+                                                                         "CHI1 (AHp009962.1)",
+                                                                         "F3-H_1 (AHp017497.1)",
+                                                                         "F3-H_2 (AHp022122.1)",
+                                                                         "F3-H_3 (AHp003152.1)",
+                                                                         "F3-H_3 (AHp003152.2)",
+                                                                         "F3-H_4 (AHp022120.1)",
+                                                                         "F3-H_5 (AHp022123.1)",
+                                                                         "DFR (AHp009303.1)",
+                                                                         "F3H (AHp006454.1)",
+                                                                         "FLS (AHp008991.1)",
+                                                                         "FLS (AHp008991.2)",
+                                                                         "LAR (AHp017409.1)",
+                                                                         "ANR (AHp001635.1)"))
+
+
+# new plotting of flavonoid gene expression per tissue
+# tiles are colored by tpm + 1 on a log10 scale (the + 1 keeps a tpm of 0 plottable),
+# the text inside the tiles shows the tpm rounded to two digits
+ggplot(data = flavonoid_plotting) +
+  geom_tile(aes(x = tissue, y = label, fill = tpm+1)) +
+  geom_text(aes(x = tissue, y = label, label = sprintf("%0.2f", round(tpm, digits = 2))),
+            size = 3.5) +
+  scale_fill_distiller(palette = "RdYlBu", trans = "log10") +
+  # reverse the y axis so that the first factor level is at the top
+  scale_y_discrete(limits = rev) +
+  #scale_fill_viridis_c(alpha = 0.7, trans = "log10") +
+  labs(fill = "TPM") +
+  theme_classic() +
+  # the fill scale always includes the value 600
+  expand_limits(fill = 600) +
+  theme(text = element_text(size=22),
+        axis.title.x = element_blank(),
+        axis.title.y = element_blank(),
+        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
+# no plot object is passed, ggsave() saves the last plot
+ggsave(filename = "plots/gene_expression_quantification/flavonoid_pathway_expression.png", width = 10, height = 10)
+#ggsave(filename = "plots/gene_expression_quantification/flavonoid_pathway_expression.png", width = 10, height = 12)
+```
+
+
+
+
+
+
+
+
+
diff --git a/workflows/gene_expression_quantification/readme.txt b/workflows/gene_expression_quantification/readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6ecc87b7a9c252b71236101605bd45344dd2c58c
--- /dev/null
+++ b/workflows/gene_expression_quantification/readme.txt
@@ -0,0 +1,13 @@
+## Gene expression quantification
+
+Quantify gene expression levels using short-read RNA-seq data from different tissues and the long-read sequencing data.
+
+### Script order:
+- code/gene_expression_quantification/run_kallisto.sh
+Run Kallisto to quantify gene expression using short-read RNA-seq data
+
+- code/gene_expression_quantification/plot_expression_quantification_kallisto.Rmd
+Generate plots from assessed gene expression levels
+
+- code/gene_expression_quantification/isoseq_expression_support.Rmd
+Assess support of Iso-Seq transcripts for annotated genes
diff --git a/workflows/gene_expression_quantification/run_kallisto.sh b/workflows/gene_expression_quantification/run_kallisto.sh
new file mode 100644
index 0000000000000000000000000000000000000000..dd5393b365190368a9e04e374ea28bfb1e07440b
--- /dev/null
+++ b/workflows/gene_expression_quantification/run_kallisto.sh
@@ -0,0 +1,35 @@
+#!/bin/bash -l
+
+# Beforehand:
+#Short read data downloaded from SRA using the following accession/run numbers:
+
+#SRA Accession numbers:
+#Floral tissue: SRX722058 SRR1598911
+#Leaf tissue: SRX722059 SRR1598912
+#Root tissue: SRX722060 SRR1598913
+#Stem tissue: SRX722057 SRR1598910
+#Water stressed tissue sample: SRX722061 SRR1598914
+#Immature seeds: SRX722056 SRR1598909
+#Mature seeds: SRX722063 SRR1598916
+#Green Cotyledone: SRX722062 SRR1598915
+
+
+# index the transcriptome
+# path of the kallisto executable, used for indexing and quantification
+KALLISTO=/home/tom/Documents/tools/kallisto/kallisto
+KALINDEX=data/gene_expression_quantification/kallisto_quant/index
+mkdir -p "$KALINDEX"
+
+"$KALLISTO" index -i "$KALINDEX"/index polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.cds.fasta
+
+
+##### Perform quantification
+# create array of read fastq files (R1 only):
+SOURCE_DIR=raw_data/Clouse_short_reads/
+FILES=("$SOURCE_DIR"SRR*_1.fastq.gz)
+OUTDIR=data/gene_expression_quantification/kallisto_quant/
+# tissue of each fastq file, in the order in which the glob returns the run accessions
+# (SRR1598909 to SRR1598916, see the accession list at the top of this script)
+TISSUE_NAMES=("young_seed" "stem" "flower" "leaf" "root" "water_stressed" "cotyledones" "mature_seed")
+
+# the tissue names are assigned by position: stop if the number of fastq files differs,
+# as a missing or additional file would shift the names and mislabel the samples
+# (an unmatched glob stays in the array as literal pattern, therefore also test the first file)
+if [[ "${#FILES[@]}" -ne "${#TISSUE_NAMES[@]}" || ! -e "${FILES[0]}" ]]; then
+	echo "Expected ${#TISSUE_NAMES[@]} R1 fastq files in $SOURCE_DIR but found: ${FILES[*]}" >&2
+	exit 1
+fi
+
+# kallisto after indexing
+# the R2 file name is derived from the R1 file name
+for i in "${!FILES[@]}"
+do
+	"$KALLISTO" quant -i "$KALINDEX"/index -o "$OUTDIR""${TISSUE_NAMES[$i]}" --bias --plaintext -t 6 --verbose "${FILES[$i]}" "${FILES[$i]/_1.fastq.gz/_2.fastq.gz}"
+done
diff --git a/workflows/genome_polishing/helper_script.R b/workflows/genome_polishing/helper_script.R
new file mode 100644
index 0000000000000000000000000000000000000000..2a936488bd83484a3b9b39ff8ca99e7401ab80c0
--- /dev/null
+++ b/workflows/genome_polishing/helper_script.R
@@ -0,0 +1,20 @@
+# Writes all fasta headers that have no sequence in the prefiltered fasta file
+# to data/NextPolish/processed/header_without_sequence.fa
+
+# set working directory
+setwd("/home/tom/Documents/projects/Ahyp_v2_2_publication/")
+
+# all headers in the correct order, without the leading ">" and without the
+# underscore after "quiver"
+header_table <- read.table("data/NextPolish/input/out.headers.txt")
+header_names <- gsub("quiver_", "quiver", gsub(">", "", header_table$V1))
+
+# names of the sequences in the prefiltered fasta file (first column of the fasta index)
+fasta_index <- read.table("data/NextPolish/input/out.prefiltered.renamed.txt.fai")
+indexed_names <- fasta_index[, 1]
+
+# every header that is not in the fasta index has no sequence,
+# add the ">" again to obtain fasta header lines
+missing_headers <- header_names[!(header_names %in% indexed_names)]
+missing_headers <- sub("", ">", missing_headers)
+
+# one header per line, without quotes, row names or column names
+write.table(missing_headers,
+            file = "data/NextPolish/processed/header_without_sequence.fa",
+            quote = F,
+            row.names = F,
+            col.names = F)
diff --git a/workflows/genome_polishing/process_nextpolish_output.sh b/workflows/genome_polishing/process_nextpolish_output.sh
new file mode 100644
index 0000000000000000000000000000000000000000..65ab4586b98162e5bf5109c16f8a7c8a97d992bd
--- /dev/null
+++ b/workflows/genome_polishing/process_nextpolish_output.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# This script takes the output of NextPolish and reproduces the complete genome file, in the same order as it was before (adding back ambiguous N bases).
+# (see master_thesis/code/process_nextpolish_output.sh for more information about the input file preparation)
+
+# Setup
+NPOUT=data/NextPolish/output/
+NPPROCESSED=data/NextPolish/processed/
+
+mkdir -p "$NPPROCESSED"
+
+
+
+# rename the nextpolish genome fasta
+cut -f1,2 -d'_' "$NPOUT"genome.nextpolish.fa > "$NPPROCESSED"genome.nextpolish.renamed.fa
+
+# index the prefiltered fasta file for use in R
+samtools faidx data/NextPolish/input/out.prefiltered.renamed.txt
+
+# filter the prefiltered file for everything that is not in Nextpolish file:
+/home/tom/Documents/tools/bbmap/filterbyname.sh in=data/NextPolish/input/out.prefiltered.renamed.txt \
+	names="$NPPROCESSED"genome.nextpolish.renamed.fa \
+	out="$NPPROCESSED"prefilter_not_in_Nextpolish.fa
+
+# use the helper Rscript to get the remaining headers without fasta sequences:
+Rscript code/genome_polishing/helper_script.R
+
+# Concatenate everything including the R output
+cat "$NPPROCESSED"genome.nextpolish.renamed.fa \
+	"$NPPROCESSED"prefilter_not_in_Nextpolish.fa \
+	"$NPPROCESSED"header_without_sequence.fa > "$NPPROCESSED"combined.fa
+
+# Linearize fasta file, this also adds a line after the empty fasta sequences in the end:
+LC_ALL=C awk -v RS=">" -v FS="\n" -v ORS="\n" -v OFS="" '$0 {$1=">"$1"\n"; print}' "$NPPROCESSED"combined.fa > "$NPPROCESSED"combined.linear.fa
+
+# rename header file by removing trailing underscore character of Contigs:
+sed 's/quiver_/quiver/' data/NextPolish/input/out.headers.txt > "$NPPROCESSED"out.header.renamed.txt
+
+# order file:
+# ORDER: headers in the original order, SORT: linearized fasta file, OUT: reordered fasta file
+ORDER="$NPPROCESSED"out.header.renamed.txt
+SORT="$NPPROCESSED"combined.linear.fa
+OUT="$NPPROCESSED"combined.linear.sorted.fa
+
+# for every header, in the original order, print the header line and the following sequence line
+# read -r: do not interpret backslashes in the header
+# grep -F: the header is searched as literal text and not as regular expression
+# grep -w: only whole-word matches, a header is not found inside a longer header
+while read -r ID; do grep -F -w -A1 -- "$ID" "$SORT"; done < "$ORDER" > "$OUT"
+
+# last step: remove "spl" lines and then all empty lines:
+sed '/spl/d' "$NPPROCESSED"combined.linear.sorted.fa | awk 'NF' > "$NPPROCESSED"combined.linear.sorted.nosplit.fa
+
+# normalize sequence length per line:
+/home/tom/Documents/tools/gatk-4.2.5.0/gatk NormalizeFasta -I "$NPPROCESSED"combined.linear.sorted.nosplit.fa -O "$NPPROCESSED"combined.linear.sorted.nosplit.normalized.fa
+
+# final output is data/NextPolish/processed/Ahypochondriacus_2.2_polished.fasta file
+mv "$NPPROCESSED"combined.linear.sorted.nosplit.normalized.fa "$NPPROCESSED"Ahypochondriacus_2.2_polished.fasta
+
+# remove intermediate output files:
+rm "$NPPROCESSED"combined*
+rm "$NPPROCESSED"genome*
+rm "$NPPROCESSED"header*
+rm "$NPPROCESSED"out*
+rm "$NPPROCESSED"prefilter*
+
+# copy to final output directory
+mkdir -p polished_reference_genome/polished_genome_annotation/assembly/
+cp "$NPPROCESSED"Ahypochondriacus_2.2_polished.fasta polished_reference_genome/polished_genome_annotation/assembly/
diff --git a/workflows/genome_polishing/readme.txt b/workflows/genome_polishing/readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e5aef947118edbe27449e45a5dd00f4313ba79fa
--- /dev/null
+++ b/workflows/genome_polishing/readme.txt
@@ -0,0 +1,20 @@
+## Genome polishing
+
+Polish the previously published reference genome of A. hypochondriacus.
+
+### Script order:
+
+- code/genome_polishing/remove_ambiguous_bases.sh
+prepare the reference genome v2.1 for genome polishing by removing all ambiguous bases
+
+- code/genome_polishing/unpackSRA.sh 
+unpack the WGS short read SRA file from the Lightfoot genome assembly
+
+- code/genome_polishing/run_nextpolish.sh
+Prepare input files and run NextPolish to polish the reference assembly.
+
+- code/genome_polishing/process_nextpolish_output.sh
+return all ambiguous bases into the polished reference genome and restore the same chromosome order
+
+
+The helper_script.R is called by other scripts and does not have to be manually run.
diff --git a/workflows/genome_polishing/remove_ambiguous_bases.sh b/workflows/genome_polishing/remove_ambiguous_bases.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c810a8a1099eb86902fb3ee23a0a589b6ce2d095
--- /dev/null
+++ b/workflows/genome_polishing/remove_ambiguous_bases.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Used to remove ambiguous bases from the reference genome to prepare as input for NextPolish
+
+# do not remove Ns but write them into a new entry instead, creating N specific chromosomes. All entries and their order is saved
+# N specific entries can in a last step be removed. To reconstruct, the saved order can be used to integrate the N chromosomes again (which record the number of Ns removed)
+
+# Change these two parameters
+INPUT=reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.nospace.underscore.fa
+OUTDIR=data/NextPolish/input/
+
+mkdir -p $OUTDIR
+
+# split fasta into new entrys based on gap character N
+# print into a single line; replace stretch of Ns with new "split" entry; in the end create newline after first fasta entry
+LC_ALL=C awk -v RS=">" -v FS="\n" -v ORS="\n" -v OFS="" '$0 {$1=">"$1"\n"; print}' $INPUT | sed 's/N*N/\n>spl\n&\n>spl\n/g' > "$OUTDIR"tmp.txt
+
+echo "splits generated"
+
+# start with 0
+i=0
+
+# for each line, if it is a newly created header, increase value of i by 1 and add _i to the header name
+for j in $(cat "$OUTDIR"tmp.txt); do
+	if [[ $j =~ .sp* ]] ; then
+                i=$((i+1))
+        fi
+	echo $j | sed "s/>spl/>spl_"$i"/"
+done > "$OUTDIR"tmp2.txt
+
+echo "In total: "$i" splits renamed"
+
+
+# save the correct order of all headers in a file to be able to restore the order later
+grep ">" "$OUTDIR"tmp2.txt > "$OUTDIR"out.headers.txt
+
+echo "headers saved"
+
+# remove headers without sequence (Can be caused by stretch of Ns at the start of a Scaffold (see Scaffold 10))
+sed -r 'N; /(>)[^\n]*\n\1/ s/[^\n]*//; P; D' "$OUTDIR"tmp2.txt | grep . | grep -i -B 1 --no-group-separator  '[ATGC]'  > "$OUTDIR"data/NextPolish/input/Ahypochondriacus_split.fasta
+
+# remove and rename temporary files
+rm "$OUTDIR"tmp.txt
+mv "$OUTDIR"tmp2.txt "$OUTDIR"out.prefiltered.txt
+
diff --git a/workflows/genome_polishing/run_nextpolish.sh b/workflows/genome_polishing/run_nextpolish.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cbf057986b68d5b7db03b06925d898ff70ff3143
--- /dev/null
+++ b/workflows/genome_polishing/run_nextpolish.sh
@@ -0,0 +1,79 @@
+#!/bin/bash -l
+#SBATCH -D /scratch/twinkle1/nextpolish/
+#SBATCH -t 40:00:00
+#SBATCH -J nextpolish
+#SBATCH -o /home/twinkle1/master_thesis/logs/nextpolish/mappingLog-%j.txt
+#SBATCH --error /home/twinkle1/master_thesis/logs/nextpolish/errorLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=21
+#SBATCH --mem=100gb
+
+
+# Run nextpolish to error correct a reference assembly using short or long reads. Adjust parameters under "Set input and parameters"
+
+
+# setup
+module load bwamem2/2.2.1
+module load samtools
+
+mkdir -p /scratch/twinkle1/nextpolish/
+
+# set output directory for saving the polished genome to:
+OUTDIR=data/NextPolish/output/
+
+
+
+### Prepare input files:
+# remove all reads containing ambiguous bases from the input using bbduk
+/home/twinkle1/tools/bbmap/bbduk.sh maxns=0 \
+	in=/projects/ag-stetter/twinkle/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq.gz \
+	in2=/projects/ag-stetter/twinkle/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq.gz \
+	out=/scratch/twinkle1/SRR2106212_1.cleaned.fq \
+	out2=/scratch/twinkle1/SRR2106212_2.cleaned.fq \
+	-Xmx16g
+
+# make sure all reads are still paired and the files are in the correct order by running repair from the bbmap suite:
+/home/twinkle1/tools/bbmap/repair.sh \
+	in=/scratch/twinkle1/SRR2106212_1.cleaned.fq \
+	in2=/scratch/twinkle1/SRR2106212_2.cleaned.fq \
+	out=/scratch/twinkle1/SRR2106212_1.cleaned.repair.fq \
+	out2=/scratch/twinkle1/SRR2106212_2.cleaned.repair.fq \
+	-Xmx16g
+
+
+
+### Adapted from the NextPolish tutorial: https://nextpolish.readthedocs.io/en/latest/TUTORIAL.html
+
+#Set input and parameters
+# round:   number of polishing rounds, each round runs both nextpolish tasks (1 and 2)
+# threads: threads used by bwa-mem2, samtools index and nextpolish
+# read1/2: cleaned and re-paired short reads
+# input:   genome to polish; replaced by the freshly polished genome after every step
+round=2
+threads=20
+read1=/scratch/twinkle1/SRR2106212_1.cleaned.repair.fq
+read2=/scratch/twinkle1/SRR2106212_2.cleaned.repair.fq
+input=/home/twinkle1/master_thesis/data/NextPolish/input/Ahypochondriacus_split.fasta
+
+
+# polish_pass <task> <outfile>
+# Run one polishing step on the current ${input}:
+#   $1 - value passed to nextpolish1.py as -t (1 or 2)
+#   $2 - file the polished genome is written to
+# Reads the globals input, threads, read1, read2; overwrites sgs.sort.bam in the working directory
+# and sets input to the polished genome so that the next step continues from it.
+polish_pass() {
+   local task=$1
+   local polished=$2
+   #index the genome file and do alignment
+   bwa-mem2 index "${input}"
+   bwa-mem2 mem -t "${threads}" "${input}" "${read1}" "${read2}" \
+      | samtools view --threads 19 -F 0x4 -b - \
+      | samtools fixmate -m --threads 19 - - \
+      | samtools sort -m 2g --threads 20 - \
+      | samtools markdup --threads 19 -r - sgs.sort.bam
+   #index bam and genome files
+   samtools index -@ "${threads}" sgs.sort.bam
+   samtools faidx "${input}"
+   #polish genome file
+   python /home/twinkle1/tools/NextPolish/lib/nextpolish1.py -g "${input}" -t "${task}" -p "${threads}" -s sgs.sort.bam -debug > "${polished}"
+   input=${polished}
+}
+
+for ((i=1; i<=${round};i++)); do
+   #step 1:
+   polish_pass 1 genome.polishtemp.fa
+   #step 2:
+   polish_pass 2 genome.nextpolish.fa
+done;
+#Finally polished genome file: genome.nextpolish.fa
+
+# make sure the output directory exists, otherwise cp fails and the results stay on scratch only
+# NOTE(review): OUTDIR is a relative path - confirm that it points to the intended project directory
+mkdir -p "$OUTDIR"
+cp /scratch/twinkle1/nextpolish/* "$OUTDIR"
diff --git a/workflows/genome_polishing/unpackSRA.sh b/workflows/genome_polishing/unpackSRA.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7467f476e46a5d21632b6feaac7c4b95f805f971
--- /dev/null
+++ b/workflows/genome_polishing/unpackSRA.sh
@@ -0,0 +1,42 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 16:00:00
+#SBATCH -J SRA
+#SBATCH -o logs/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=8
+#SBATCH --mem=64gb
+#SBATCH --job-name="SRA_unpack"
+
+# requires cheops1 for newer library versions
+# prefetch command has to be run on the headnode before, as it requires internet access
+# Commands used:
+# download file, show progress, increase default max size so that the download starts
+# tools/sratoolkit.2.11.2-centos_linux64/bin/prefetch -p -O /projects/ag-stetter/twinkle/lightfoot_WGS_short_reads/ --max-size 30G SRR2106212
+
+QCOUT=data/NextPolish/QC
+
+# set working directory
+cd /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/raw_data/lightfoot_WGS_short_reads/
+
+# before running the fastq-dump command, switch off "Enable Remote Access" by running sratoolskit/bin/vdb-config -i
+# split into fastq files
+/home/twinkle1/tools/sratoolkit.2.11.2-centos_linux64/bin/fastq-dump --split-3 --verbose SRR2106212.sra
+
+# set working directory
+cd /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+
+# gzip the resulting fastq files
+# Even though there is an option to gzip it directly using the fastq-dump command, the option is deprecated and should no longer be used
+gzip raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq
+gzip raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq
+
+# remove sra file afterwards
+rm raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212.sra
+
+# quality control:
+module load fastqc/0.11.9
+
+# fastqc does not create its output directory, so create it before the run
+FASTQCOUT=raw_data/lightfoot_WGS_short_reads/QC/
+mkdir -p "$FASTQCOUT"
+
+# run fastqc on both gzipped read files using 8 threads
+fastqc -o "$FASTQCOUT" -t 8 \
+        raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq.gz \
+        raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq.gz
diff --git a/workflows/isoseq_assembly/combined_isoseq3_pipeline.sh b/workflows/isoseq_assembly/combined_isoseq3_pipeline.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e64fc14a3873024264c134d8312959fe7b36e018
--- /dev/null
+++ b/workflows/isoseq_assembly/combined_isoseq3_pipeline.sh
@@ -0,0 +1,34 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 12:00:00
+#SBATCH -J isoseq3
+#SBATCH -o logs/isoseq3/mappingLog-%j.txt
+#SBATCH --nodes=1-1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64gb
+
+
+source $CONDA_PREFIX/etc/profile.d/conda.sh
+conda activate isoseq3
+
+module load samtools/1.13
+
+ISOSEQOUT=raw_data/isoseq_raw_reads/processed/
+mkdir -p $ISOSEQOUT
+
+# flnc reads were created using the following command, removing artificial concatemers and filtering out all genes without Poly-A tail
+# using respective primers of the different datasets
+#isoseq3 refine $REFINEIN $PRIMERS $REFINEOUT --require-polya
+
+
+# merge all flnc reads into a single file
+samtools merge "$ISOSEQOUT"combined.merged_flnc.bam "$ISOSEQOUT"*bam
+
+
+# clustering of identical reads
+
+IN="$ISOSEQOUT"combined.merged_flnc.bam
+OUT="$ISOSEQOUT"combined.merged_clustered.bam
+
+isoseq3 cluster	"$IN" "$OUT" --verbose --use-qvs
diff --git a/workflows/isoseq_assembly/combined_mapping_and_collapse.sh b/workflows/isoseq_assembly/combined_mapping_and_collapse.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0ad606a18e7e727dd68b0c006eb8215d428df667
--- /dev/null
+++ b/workflows/isoseq_assembly/combined_mapping_and_collapse.sh
@@ -0,0 +1,71 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 02:00:00
+#SBATCH -J collapse
+#SBATCH -o logs/mapping_and_collapse/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=8
+#SBATCH --mem=42gb
+
+
+# Map the clustered Iso-Seq sequences onto the polished reference genome, collapse them into
+# isoforms and filter the collapsed isoforms by read count and 5' degradation
+
+# environment used for the mapping
+source $CONDA_PREFIX/etc/profile.d/conda.sh
+conda activate isoseq
+
+# every file produced by this script is written below this directory
+MAPPINGOUT=data/isoseq/mapping_and_collapse/
+mkdir -p "$MAPPINGOUT"
+
+
+### MAPPING
+
+REFERENCE=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta
+INPUT=raw_data/isoseq_raw_reads/processed/combined.merged_clustered.hq.fasta
+ALIGNED="${MAPPINGOUT}combined_aln.sam"
+SORTED="${MAPPINGOUT}combined_aln_sorted.sam"
+
+# align the sequences to the reference genome
+minimap2 -t 8 -ax splice:hq -uf --secondary=no -a "$REFERENCE" "$INPUT" -o "$ALIGNED"
+
+# the alignments have to be sorted before collapsing:
+# sort by the third column, then numerically by the fourth column of the sam file
+sort -k 3,3 -k 4,4n "$ALIGNED" > "$SORTED"
+
+
+
+### COLLAPSE
+
+# switch to the environment providing the collapse and filter scripts
+conda activate /projects/ag-stetter/twinkle/sqanti_env/sqanti
+
+# collapse the sorted alignments into isoforms, output files start with the prefix "combined"
+collapse_isoforms_by_sam.py --input "$INPUT" -s "$SORTED" -o "${MAPPINGOUT}combined" -c 0.95 -i 0.9 --max_3_diff 1000
+
+
+
+# Cupcake support scripts, run on the collapsed isoforms
+COLLAPSED="${MAPPINGOUT}combined.collapsed"
+
+# 1. obtain the count information from the cluster report
+get_abundance_post_collapse.py "$COLLAPSED" raw_data/isoseq_raw_reads/processed/combined.merged_clustered.cluster_report.csv
+
+# 2. require a minimum read count of 2
+filter_by_count.py --min_count 2 --dun_use_group_count "$COLLAPSED"
+
+# 3. filter away 5' degraded isoforms, starting from the count-filtered results
+filter_away_subset.py "${MAPPINGOUT}combined.collapsed.min_fl_2"
diff --git a/workflows/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh b/workflows/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh
new file mode 100644
index 0000000000000000000000000000000000000000..64563ac26068b3ef6cbc6fccb7e1c6a480d2f52f
--- /dev/null
+++ b/workflows/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh
@@ -0,0 +1,73 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 02:00:00
+#SBATCH -J collapse
+#SBATCH -o logs/mapping_and_collapse/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=8
+#SBATCH --mem=42gb
+
+
+# Map the clustered Iso-Seq sequences onto the old (unpolished) reference genome, collapse them into
+# isoforms and filter the collapsed isoforms by read count and 5' degradation
+
+# load necessary modules
+source $CONDA_PREFIX/etc/profile.d/conda.sh
+conda activate isoseq
+
+# create output directory
+# all files produced by this script are written below MAPPINGOUT
+MAPPINGOUT=data/isoseq/mapping_and_collapse_old_genome/
+mkdir -p "$MAPPINGOUT"
+
+
+### MAPPING
+# map reads onto the old reference genome
+# NOTE(review): the reference is an absolute path below /home/tom/, all other paths are relative to the
+# working directory set with #SBATCH -D - confirm that the file is available where the job runs
+REFERENCE=/home/tom/Documents/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa
+
+# Align sequences to reference genome
+INPUT=raw_data/isoseq_raw_reads/processed/combined.merged_clustered.hq.fasta
+OUTPUTMM2="$MAPPINGOUT"combined_aln.sam
+
+minimap2 -t 8 -ax splice:hq -uf --secondary=no -a $REFERENCE $INPUT -o $OUTPUTMM2
+
+
+
+# Before collapsing isoforms, sequences have to be sorted
+OUTPUTSORT="$MAPPINGOUT"combined_aln_sorted.sam
+
+# sort the sam file by its third column, then numerically by its fourth column
+# (no records are removed here, the file is only sorted)
+sort -k 3,3 -k 4,4n $OUTPUTMM2 > $OUTPUTSORT
+
+
+
+### COLLAPSE
+
+# activate conda environment
+conda activate /projects/ag-stetter/twinkle/sqanti_env/sqanti
+
+
+# Collapsing isoforms
+# input is the sorted sam file together with the fasta that was aligned; OUTPUTCOLLAPSE is the prefix of the output files
+
+INPUTSORTED="$MAPPINGOUT"combined_aln_sorted.sam
+OUTPUTCOLLAPSE="$MAPPINGOUT"combined
+
+collapse_isoforms_by_sam.py --input $INPUT -s $INPUTSORTED -o $OUTPUTCOLLAPSE -c 0.95 -i 0.9 --max_3_diff 1000
+
+
+
+# Cupcake support scripts after collapse
+# First obtain associated count information
+
+INPUTABUNDANCE="$MAPPINGOUT"combined.collapsed
+CLUSTERREPORT=raw_data/isoseq_raw_reads/processed/combined.merged_clustered.cluster_report.csv
+
+get_abundance_post_collapse.py $INPUTABUNDANCE $CLUSTERREPORT
+
+# add minimum read count of 2
+
+filter_by_count.py --min_count 2 --dun_use_group_count $INPUTABUNDANCE
+
+
+# filter away 5' degraded isoforms
+# use filter by count results
+
+OUTPUTFILTERED="$MAPPINGOUT"combined.collapsed.min_fl_2
+
+filter_away_subset.py $OUTPUTFILTERED
diff --git a/workflows/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd b/workflows/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..8dcddd830ecbe4d1e6d346c10c244107aa05690d
--- /dev/null
+++ b/workflows/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd
@@ -0,0 +1,216 @@
+---
+title: "genome_polishing_comparison"
+author: "twinkle1"
+date: '2022-07-27'
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(data.table)
+library(GenomicRanges)
+library(seqinr)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publication/")
+```
+
+## Genome polishing comparison
+
+Compare the effect of genome polishing on the coding sequence prediction by CPC2. Start by running coding sequence prediction for both reference genome corrected isoseq sequence sets, the one corrected with the unpolished reference genome and the one corrected with the polished reference genome.
+
+Run CPC2 CDS prediction on both sequence sets:
+
+```{bash}
+mkdir data/isoseq/comparison_genome_polishing
+
+# unpolished genome
+/hom/tom/Documents/tools/CPC2_standalone-1.0.1/bin/CPC2.py -i data/isoseq/sqanti/output_old_genome/combined.collapsed.min_fl_2.filtered.underscore_corrected.fasta --ORF -o data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2
+
+/hom/tom/Documents/tools/CPC2_standalone-1.0.1/bin/CPC2.py -i data/isoseq/mapping_and_collapse_old_genome/combined.collapsed.min_fl_2.filtered.rep.fa --ORF -o data/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2
+
+# polished genome
+/hom/tom/Documents/tools/CPC2_standalone-1.0.1/bin/CPC2.py -i data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.fasta --ORF -o data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2
+
+/hom/tom/Documents/tools/CPC2_standalone-1.0.1/bin/CPC2.py -i data/isoseq/mapping_and_collapse/combined.collapsed.min_fl_2.filtered.rep.fa --ORF -o data/isoseq/comparison_genome_polishing/polished_isoseq_cpc2
+```
+
+
+Compare the CDS prediction results from the two files:
+
+```{r}
+# Compare the CPC2 predictions of the collapsed sequences with those of the SQANTI-corrected sequences,
+# first for the unpolished and then for the polished reference genome.
+# Columns with the suffix "_s" belong to the SQANTI-corrected sequence set.
+# Columns are picked and renamed with transmute(), which returns one row per input row;
+# summarise() is meant to return one row per group and using it for per-row results is deprecated in dplyr.
+
+# load in cpc2 files for the unpolished genome
+unpolished_cpc2 <- read.table("data/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2.txt")
+unpolished_sqanti_cpc2 <- read.table("data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2.txt")
+
+# prepare for merge
+unpolished_cpc2 <- unpolished_cpc2 %>%
+  transmute(transcript_id = V1,
+            transcript_length = V2,
+            peptide_length = V3,
+            label = V9)
+# remove everything from the first "|" on, so that the ids match the ids of the sqanti output
+unpolished_cpc2$transcript_id <- gsub("\\|.*","", unpolished_cpc2$transcript_id)
+
+unpolished_sqanti_cpc2 <- unpolished_sqanti_cpc2 %>%
+  transmute(transcript_id = V1,
+            transcript_length_s = V2,
+            peptide_length_s = V3,
+            label_s = V9)
+
+# merge tables
+merged_unpolished <- left_join(unpolished_cpc2, unpolished_sqanti_cpc2, by="transcript_id")
+
+# for comparison, keep only transcripts which are predicted as coding in at least one dataset
+merged_unpolished <- merged_unpolished %>%
+  filter(label == "coding" | label_s == "coding") %>%
+  mutate(length_diff = peptide_length - peptide_length_s)
+# positive values indicate longer peptide length in the uncorrected sequences, negative values indicate longer corrected peptide length
+
+# how many sequences have the same annotated peptide length?
+sum(merged_unpolished$peptide_length == merged_unpolished$peptide_length_s)
+sum(merged_unpolished$peptide_length == merged_unpolished$peptide_length_s)/nrow(merged_unpolished) # fraction of all compared sequences
+# how many sequences differ in annotated peptide length?
+sum(merged_unpolished$peptide_length != merged_unpolished$peptide_length_s)
+sum(merged_unpolished$peptide_length != merged_unpolished$peptide_length_s)/nrow(merged_unpolished) # fraction of all compared sequences
+
+sum(merged_unpolished$length_diff > 0) # how many longer before correction
+sum(merged_unpolished$length_diff < 0) # how many longer after correction
+
+# mean ORF length difference for transcripts with different ORF length predictions
+mean(abs(merged_unpolished[merged_unpolished$length_diff != 0,]$length_diff))
+
+# for how many sequences does coding prediction change by doing genome correction?
+table(interaction(as.factor(merged_unpolished$label), as.factor(merged_unpolished$label_s)))
+
+
+############# comparison with polished genome
+polished_cpc2 <- read.table("data/isoseq/comparison_genome_polishing/polished_isoseq_cpc2.txt")
+polished_sqanti_cpc2 <- read.table("data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2.txt")
+
+# prepare for merge
+polished_cpc2 <- polished_cpc2 %>%
+  transmute(transcript_id = V1,
+            transcript_length = V2,
+            peptide_length = V3,
+            label = V9)
+# remove everything from the first "|" on, so that the ids match the ids of the sqanti output
+polished_cpc2$transcript_id <- gsub("\\|.*","", polished_cpc2$transcript_id)
+
+polished_sqanti_cpc2 <- polished_sqanti_cpc2 %>%
+  transmute(transcript_id = V1,
+            transcript_length_s = V2,
+            peptide_length_s = V3,
+            label_s = V9)
+
+# merge tables
+merged_polished <- left_join(polished_cpc2, polished_sqanti_cpc2, by="transcript_id")
+# keep only transcripts which are predicted as coding in at least one dataset
+merged_polished <- merged_polished %>%
+  filter(label == "coding" | label_s == "coding") %>%
+  mutate(length_diff = peptide_length - peptide_length_s)
+
+# how many sequences have the same annotated peptide length?
+sum(merged_polished$peptide_length == merged_polished$peptide_length_s)
+sum(merged_polished$peptide_length == merged_polished$peptide_length_s)/nrow(merged_polished)
+# how many sequences differ in annotated peptide length?
+sum(merged_polished$peptide_length != merged_polished$peptide_length_s)
+sum(merged_polished$peptide_length != merged_polished$peptide_length_s)/nrow(merged_polished)
+
+# mean ORF length difference for transcripts with different ORF length predictions
+mean(abs(merged_polished[merged_polished$length_diff != 0,]$length_diff))
+
+# for how many sequences does coding prediction change by doing genome correction?
+table(interaction(as.factor(merged_polished$label), as.factor(merged_polished$label_s)))
+```
+
+
+## BUSCO score of polished and unpolished sequences
+
+Create a bed file based on the cpc2 output which can be used to subset the coding sequence from the cpc2 input fasta files. Convert the CDS to protein sequence and use BUSCO afterwards to assess the completeness of the sequence set.
+
+```{r}
+# extract part of the fasta file based on bed positions from cpc2
+# create_bed_from_cpc2: read a cpc2 result table and return the ORF positions of all coding transcripts
+#   cpc2_output: path to the cpc2 result file (read with read.table, columns are addressed as V1..V9)
+#   returns:     data frame with the columns ID, start and end (one row per retained transcript)
+create_bed_from_cpc2 <- function(cpc2_output){
+  # read in the cpc2 input file, subset all coding sequences with intact ORF, then convert to bed
+  cpc2 <- read.table(cpc2_output)
+  # filter for coding transcripts with an intact ORF
+  # transmute() returns one row per transcript; summarise() is meant for one row per group
+  # and using it for per-row results is deprecated in dplyr
+  cpc2 <- cpc2 %>%
+    filter(V9 == "coding" & V6 == 1) %>%
+    # start: ORF start (V7) shifted by -1 for bed; end: start plus three bases per residue of the peptide length (V3)
+    transmute(ID=V1, start=V7-1, end=V7+(V3*3)-1)
+  return(cpc2)
+}
+
+# write cpc2 output as bed file
+
+### isoseq unpolished
+isoseq_unpolished_bed <- create_bed_from_cpc2("data/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2.txt") 
+write_tsv(isoseq_unpolished_bed, 
+          file = "data/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2.bed",
+          col_names = F)
+
+### isoseq sqanti unpolished
+isoseq_unpolished_sqanti_bed <- create_bed_from_cpc2("data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2.txt") 
+write_tsv(isoseq_unpolished_sqanti_bed, 
+          file = "data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2.bed",
+          col_names = F)
+
+### isoseq polished
+isoseq_polished_bed <- create_bed_from_cpc2("data/isoseq/comparison_genome_polishing/polished_isoseq_cpc2.txt") 
+write_tsv(isoseq_polished_bed, 
+          file = "data/isoseq/comparison_genome_polishing/polished_isoseq_cpc2.bed",
+          col_names = F)
+
+### isoseq sqanti polished
+isoseq_polished_sqanti_bed <- create_bed_from_cpc2("data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2.txt") 
+write_tsv(isoseq_polished_sqanti_bed, 
+          file = "data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2.bed",
+          col_names = F)
+```
+
+Extract predicted protein sequence and run busco:
+
+```{bash}
+mkdir -p data/annotation_analysis/busco
+
+### ISOSEQ UNPOLISHED
+# Extract the coding sequence from the fasta file
+tools/bedtools getfasta -fi data/isoseq/mapping_and_collapse_old_genome/combined.collapsed.min_fl_2.filtered.rep.fa -fo data/isoseq/comparison_genome_polishing/unpolished_isoseq_cds.fasta -bed data/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2.bed
+# translate into protein sequence
+seqkit translate data/isoseq/comparison_genome_polishing/unpolished_isoseq_cds.fasta > data/isoseq/comparison_genome_polishing/unpolished_isoseq_cds.faa
+# to prepare for busco, trim the fasta header
+sed 's/|.*//' data/isoseq/comparison_genome_polishing/unpolished_isoseq_cds.faa > data/isoseq/comparison_genome_polishing/unpolished_isoseq_cds_fixed.faa
+# run busco
+busco -m protein -i data/isoseq/comparison_genome_polishing/unpolished_isoseq_cds_fixed.faa -o unpolished_isoseq_cds -l embryophyta_odb10 --out_path data/annotation_analysis/busco/ --download_path data/busco/datasets/ -c 7 -f
+
+### ISOSEQ SQANTI UNPOLISHED
+# Extract the coding sequence from the fasta file
+tools/bedtools getfasta -fi data/isoseq/sqanti/output_old_genome/combined.collapsed.min_fl_2.filtered.underscore_corrected.fasta -fo data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cds.fasta -bed data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2.bed
+# translate into protein sequence
+seqkit translate data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cds.fasta > data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cds.faa
+# run busco
+busco -m protein -i data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cds.faa -o unpolished_isoseq_sqanti_cds -l embryophyta_odb10 --out_path data/annotation_analysis/busco/ --download_path data/busco/datasets/ -c 7 -f
+
+### ISOSEQ POLISHED
+# Extract the coding sequence from the fasta file
+tools/bedtools getfasta -fi data/isoseq/mapping_and_collapse/combined.collapsed.min_fl_2.filtered.rep.fa -fo data/isoseq/comparison_genome_polishing/polished_isoseq_cds.fasta -bed data/isoseq/comparison_genome_polishing/polished_isoseq_cpc2.bed
+# translate into protein sequence
+seqkit translate data/isoseq/comparison_genome_polishing/polished_isoseq_cds.fasta > data/isoseq/comparison_genome_polishing/polished_isoseq_cds.faa
+# to prepare for busco, trim the fasta header
+sed 's/|.*//' data/isoseq/comparison_genome_polishing/polished_isoseq_cds.faa > data/isoseq/comparison_genome_polishing/polished_isoseq_cds_fixed.faa
+# run busco
+busco -m protein -i data/isoseq/comparison_genome_polishing/polished_isoseq_cds_fixed.faa -o polished_isoseq_cds -l embryophyta_odb10 --out_path data/annotation_analysis/busco/ --download_path data/busco/datasets/ -c 7 -f
+
+### ISOSEQ SQANTI POLISHED
+# Extract the coding sequence from the fasta file
+tools/bedtools getfasta -fi data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.fasta -fo data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cds.fasta -bed data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2.bed
+# translate into protein sequence
+seqkit translate data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cds.fasta > data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cds.faa
+# run busco
+busco -m protein -i data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cds.faa -o polished_isoseq_sqanti_cds -l embryophyta_odb10 --out_path data/annotation_analysis/busco/ --download_path data/busco/datasets/ -c 7 -f
+```
+
+
+
+
+
+
+
+
+
diff --git a/workflows/isoseq_assembly/isoseq3_pipeline.sh b/workflows/isoseq_assembly/isoseq3_pipeline.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e7cebe34829325bcb5bb65da9cd4cb7d0b66fd53
--- /dev/null
+++ b/workflows/isoseq_assembly/isoseq3_pipeline.sh
@@ -0,0 +1,52 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 12:00:00
+#SBATCH -J isoseq3
+#SBATCH -o logs/isoseq3/mappingLog-%j.txt
+#SBATCH --nodes=1-1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64gb
+#SBATCH --array 0-6
+
+
+source $CONDA_PREFIX/etc/profile.d/conda.sh
+conda activate isoseq3
+
+
+echo "$SLURM_ARRAY_TASK_ID"
+
+# following https://github.com/PacificBiosciences/IsoSeq/blob/master/isoseq-clustering.md from step 4 on using ccs.bam and primer.fa
+# removal of primers and barcodes already done
+# step 4, removal of polyA tail and artificial concatemers
+
+PROCESSING=raw_data/isoseq_raw_reads/processed/
+mkdir -p "PROCESSING"
+
+SOURCE_DIR=raw_data/isoseq_raw_reads/reads
+FILES=("$SOURCE_DIR"/*bam)
+
+PRIMER_SOURCE=raw_data/isoseq_raw_reads/primers
+PRIMER_FILES=("$PRIMER_SOURCE"/*fasta)
+
+REFINEIN="$SOURCE_DIR"/"${FILES["${SLURM_ARRAY_TASK_ID}"]}"
+PRIMERS="$PRIMER_SOURCE"/"${PRIMER_FILES["${SLURM_ARRAY_TASK_ID}"]}"
+REFINEOUT="$PROCESSING""${FILES["${SLURM_ARRAY_TASK_ID}"]/.bam/flnc.bam}"
+
+isoseq3 refine $REFINEIN $PRIMERS $REFINEOUT --require-polya
+
+# flnc reads were created using the following command, removing artificial concatemers and filtering out all genes without Poly-A tail
+# using respective primers of the different datasets
+#isoseq3 refine $REFINEIN $PRIMERS $REFINEOUT --require-polya
+
+
+# merge all flnc reads into a single file
+samtools merge "$ISOSEQOUT"combined.merged_flnc.bam /home/twinkle1/isoseq_raw_reads/*bam
+
+
+# clustering of identical reads
+
+IN="data/isoseq3_pipeline/combined.merged_flnc.bam"
+OUT="data/isoseq3_pipeline/combined.merged_clustered.bam"
+
+isoseq3 cluster	"$IN" "$OUT" --verbose --use-qvs
diff --git a/workflows/isoseq_assembly/readme.txt b/workflows/isoseq_assembly/readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..85aefb6c87ae724447cee891f106d4f45d99e4d7
--- /dev/null
+++ b/workflows/isoseq_assembly/readme.txt
@@ -0,0 +1,23 @@
+## Iso-Seq assembly
+
+Assembly of full-length transcript sequencing data.
+
+### Script order:
+
+- code/isoseq_assembly/isoseq3_pipeline.sh
+assemble FLNC reads from CCS files
+
+- code/isoseq_assembly/combined_isoseq3_pipeline.sh
+combine FLNC reads and cluster identical reads
+
+- code/isoseq_assembly/combined_mapping_and_collapse.sh
+collapse clustered reads into unique full-length transcripts using the polished reference genome
+
+- code/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh
+collapse clustered reads into unique full-length transcripts using the unpolished reference genome
+
+- code/isoseq_assembly/run_sqanti.sh
+run SQANTI in order to correct possible sequencing errors in the full-length transcripts using both reference genomes
+
+- code/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd
+compare effect of genome polishing on error correction of full-length transcript sequences
diff --git a/workflows/isoseq_assembly/run_sqanti.sh b/workflows/isoseq_assembly/run_sqanti.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8b29756f693351f7bd9864b15c7cd2923f57ed3f
--- /dev/null
+++ b/workflows/isoseq_assembly/run_sqanti.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Run the SQANTI3 quality control script on the collapsed and filtered isoforms, once against the
+# polished and once against the old reference genome.
+# The preliminary genome annotation is only used as comparison, which only affects the pdf output.
+
+SQANTI=/home/tom/Documents/tools/SQANTI3-4.2/sqanti3_qc.py
+ANNOTATION=data/braker2/polished_prot_rna/braker.gtf
+
+# polished reference genome
+OUT_POLISHED=data/isoseq/sqanti/output_polished
+mkdir -p "$OUT_POLISHED"
+
+"$SQANTI" \
+        data/isoseq/mapping_and_collapse/combined.collapsed.min_fl_2.filtered.gff \
+        "$ANNOTATION" \
+        polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \
+        -d "$OUT_POLISHED"/ \
+        -n 6
+
+# old reference genome
+OUT_OLD=data/isoseq/sqanti/output_old_genome
+mkdir -p "$OUT_OLD"
+
+"$SQANTI" \
+        data/isoseq/mapping_and_collapse_old_genome/combined.collapsed.min_fl_2.filtered.gff \
+        "$ANNOTATION" \
+        /home/tom/Documents/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa \
+        -d "$OUT_OLD"/ \
+        -n 6
diff --git a/workflows/merge_annotation/.Rhistory b/workflows/merge_annotation/.Rhistory
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/workflows/merge_annotation/Braker2_subsets_and_merge.Rmd b/workflows/merge_annotation/Braker2_subsets_and_merge.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..c56fa6e9f03ad064504612db1aaede31286aca52
--- /dev/null
+++ b/workflows/merge_annotation/Braker2_subsets_and_merge.Rmd
@@ -0,0 +1,251 @@
+---
+title: "BRAKER2_subsets_and_merge"
+author: "twinkle1"
+date: "2/18/2022"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(data.table)
+library(GenomicRanges)
+library(seqinr)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publication/")
+```
+
+The following script is used to create new files of differently supported subsets of braker2 predictions. Full and partially supported gene predictions are combined with the Iso-Seq data using TSEBRA.
+
+
+## Setup
+
+```{r}
+# Read a gtf file into a table and add gene_id and transcript_id columns
+read.gtf <- function(file){
+  # based on: https://www.biostars.org/p/272889/
+  # read the file and name its columns after the nine gtf fields
+  annotation <- fread(file)
+  setnames(annotation, names(annotation), c("chr","source","type","start","end","score","strand","phase","attributes"))
+  # helper: value of the first attribute field that matches att_of_interest, NA if no field matches
+  extract_attributes <- function(gtf_column, att_of_interest){
+    # split the attribute string into its fields and strip quotes and semicolons
+    fields <- unlist(strsplit(gtf_column, "; "))
+    fields <- gsub(";", "", gsub("\"", "", fields))
+    # space separated tokens (key, value) of all matching fields
+    tokens <- unlist(strsplit(fields[grep(att_of_interest, fields)], " "))
+    if (is.null(tokens)) {
+      return(NA)
+    }
+    return(tokens[2])
+  }
+  # extract gene and transcript id of every record
+  annotation$gene_id <- unlist(lapply(annotation$attributes, extract_attributes, "gene"))
+  annotation$transcript_id <- unlist(lapply(annotation$attributes, extract_attributes, "transcript"))
+  return(annotation)
+}
+
+# Create function for converting a gtf dataframe to a genomic ranges object
+Granges_from_gtf <- function(gtf){
+  # requires the GenomicRanges and tidyverse packages
+  # collapse all records of a transcript into a single span
+  per_transcript <- summarise(group_by(gtf, transcript_id), # group by transcript id
+                              gene_start = min(start),
+                              gene_end = max(end),
+                              seqnames = unique(chr), # all sequences should be on the same chromosome
+                              gene_strand = unique(strand))
+  # one range per transcript, named by the transcript id
+  spans <- IRanges(start = per_transcript$gene_start,
+                   end = per_transcript$gene_end,
+                   names = per_transcript$transcript_id)
+  return(GRanges(seqnames = per_transcript$seqnames,
+                 ranges = spans,
+                 strand = per_transcript$gene_strand))
+}
+
+
+# Create a function to report all transcripts within a read.gtf object, that have both a start and stop codons annotated
+report_both_codons <- function(gtf_object){
+  # count the records per transcript and feature type
+  type_counts <- gtf_object %>%
+      group_by(transcript_id, type) %>%
+      summarise(count = n())
+  # ids of the transcripts with exactly one start codon record and with exactly one stop codon record
+  single_record <- type_counts$count == 1
+  ids_with_start <- type_counts$transcript_id[type_counts$type == "start_codon" & single_record]
+  ids_with_stop <- type_counts$transcript_id[type_counts$type == "stop_codon" & single_record]
+  # ids with both codons are returned
+  return(ids_with_start[ids_with_start %in% ids_with_stop])
+}
+```
+
+
+## Main
+
+### Generate supported subsets
+
+Create differently supported subsets using the available script from augustus/braker2:
+
+```{bash}
+mkdir -p data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets
+# activate busco environment for the Augustus script; the chunk runs in a non-interactive bash, so the conda shell functions are loaded first
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate busco
+
+# extract braker aa and cds, also generate the bad_genes.lst file
+/home/tom/Documents/tools/Augustus/scripts/getAnnoFastaFromJoingenes.py -g polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta -f data/braker2/polished_prot_rna/braker.gtf -s FILTER -o data/braker2/polished_prot_rna/braker_extracted
+
+# select differently supported subsets based on the amount of support
+tools/BRAKER/scripts/predictionAnalysis/selectSupportedSubsets.py --noSupport data/braker_analysis/external_evidence/polished_prot_rna/no_support.txt --fullSupport data/braker_analysis/external_evidence/polished_prot_rna/full_support.txt --anySupport data/braker_analysis/external_evidence/polished_prot_rna/any_support.txt data/braker2/polished_prot_rna/braker.gtf data/braker2/polished_prot_rna/hintsfile.gff
+
+# remove all lines that contain "#" characters in the beginning (cause issues at later steps)
+sed '/^#/d' data/braker_analysis/external_evidence/polished_prot_rna/full_support.txt > data/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf
+sed '/^#/d' data/braker_analysis/external_evidence/polished_prot_rna/any_support.txt > data/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf
+sed '/^#/d' data/braker_analysis/external_evidence/polished_prot_rna/no_support.txt > data/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf
+```
+
+
+Generate a dataframe with the transcript ids of the differently supported subsets. The dataframe is saved as an RDS object so that it can be loaded in subsequent runs. The "fixed" gtf files are the subset files with all lines starting with "#" (e.g. the "###" separator lines between the different genes) removed.
+
+```{r}
+# read the gtf files of the three support subsets
+full.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf")
+any.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf")
+no.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf")
+
+# genes with internal stop codons, as detected by the getAnnoFastaFromJoingenes script (see braker2_results folder readme.txt)
+badgenes <- read.table("data/braker_analysis/external_evidence/polished_prot_rna/bad_genes.lst")
+
+# transcript ids of the any and no support subsets with both, annotated start and annotated stop codons
+codons_any <- report_both_codons(any.support)
+codons_no <- report_both_codons(no.support)
+codons <- c(codons_any, codons_no)
+
+
+### Create dataframe for later filtering steps
+# transcript ids per subset, partial support = any support without full support
+full.ids <- unique(full.support$transcript_id) #18405
+any.ids <- unique(any.support$transcript_id) #32230
+partial.ids <- any.ids[!(any.ids %in% full.ids)] #13825
+no.ids <- unique(no.support$transcript_id) #7443
+
+# one row per transcript id, labelled with its support category
+all.ids <- c(full.ids, partial.ids, no.ids)
+support <- rep(c("full", "partial", "no"),
+               times = c(length(full.ids), length(partial.ids), length(no.ids)))
+support.df <- data.frame(all.ids, support)
+
+
+# remove the ids listed in bad_genes.lst (internal stop codons)
+has_internal_stop <- support.df$all.ids %in% badgenes$V1
+support.df <- support.df[!has_internal_stop,]
+# exclude all annotated genes that do not have annotated start and stop codons
+fixed_support.df <- support.df[support.df$all.ids %in% codons,]
+
+
+##################################
+# save the filtered dataframe as rds object:
+saveRDS(fixed_support.df, file="data/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
+
+```
+
+
+Filter out predicted genes with internal stop codons as well as predicted genes without start and stop codons. Write the filtered gtf files in the end.
+
+```{r}
+# read the gtf files of the three support subsets
+full.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf")
+any.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf")
+no.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf")
+# transcript ids stored in the annotation dataframe:
+support.df <- readRDS(file="data/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
+retained.ids <- support.df$all.ids
+
+# partial support: records of the any support set whose transcript is not part of the full support set
+in.full <- any.support$transcript_id %in% full.support$transcript_id
+partial.support <- any.support[!in.full,]
+# subset based on the support dataframe:
+full.support.filtered <- full.support[full.support$transcript_id %in% retained.ids,] # no transcript is excluded
+#length(unique(full.support[!full.support$transcript_id %in% support.df$all.ids,]$transcript_id))
+partial.support.filtered <- partial.support[partial.support$transcript_id %in% retained.ids,] # 73 transcripts are excluded
+#length(unique(partial.support[!partial.support$transcript_id %in% support.df$all.ids,]$transcript_id))
+no.support.filtered <- no.support[no.support$transcript_id %in% retained.ids,] # 567 transcripts are excluded
+#length(unique(no.support[!no.support$transcript_id %in% support.df$all.ids,]$transcript_id))
+
+# total number of transcripts
+#length(unique(full.support.filtered$transcript_id)) # 22012 remain
+#length(unique(partial.support.filtered$transcript_id)) # 7013 remain
+#length(unique(no.support.filtered$transcript_id)) # 3579 remain
+
+### Write the filtered gtf files (the nine gtf columns only, tab separated, without header)
+write.table(full.support.filtered[,1:9],
+            file = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.gtf",
+            sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
+write.table(partial.support.filtered[,1:9],
+            file = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.gtf",
+            sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
+write.table(no.support.filtered[,1:9],
+            file = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.gtf",
+            sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
+```
+
+After filtering the gtf files, use the filtered files in order to subset the codingseq and amino acid fasta files. Load the coding sequence and amino acid fasta files and filter them using the support dataframe. 
+
+```{r}
+library(seqinr)
+
+# load in the support dataframe
+support.df <- readRDS(file="data/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
+support.df$all.ids <- as.character(support.df$all.ids)
+
+# transcript ids (first column of the support dataframe) of all retained genes and per support category
+ids.all <- support.df[,1]
+ids.full <- support.df[support.df$support == "full",1]
+ids.partial <- support.df[support.df$support == "partial",1]
+ids.no <- support.df[support.df$support == "no",1]
+
+# read the coding sequences and look up their names once
+braker_dna <- read.fasta("data/braker2/polished_prot_rna/braker_extracted.codingseq",
+                            seqtype = "DNA")
+dna.names <- getName(braker_dna)
+
+# filter the different subsets
+braker_dna.filtered <- braker_dna[dna.names %in% ids.all] # 39141 sequences
+braker_dna.full.filtered <- braker_dna[dna.names %in% ids.full] # 18404 sequences
+braker_dna.partial.filtered <- braker_dna[dna.names %in% ids.partial] # 13723 sequences
+braker_dna.no.filtered <- braker_dna[dna.names %in% ids.no] # 7014 sequences
+
+# write subsets
+write.fasta(sequences = braker_dna.filtered, names = names(braker_dna.filtered),
+            file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.fasta")
+write.fasta(sequences = braker_dna.full.filtered, names = names(braker_dna.full.filtered),
+            file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.fasta")
+write.fasta(sequences = braker_dna.partial.filtered, names = names(braker_dna.partial.filtered),
+            file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.fasta")
+write.fasta(sequences = braker_dna.no.filtered, names = names(braker_dna.no.filtered),
+            file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.fasta")
+
+### repeat for the AA fasta files:
+# read the amino acid sequences and look up their names once
+braker_aa <- read.fasta("data/braker2/polished_prot_rna/braker_extracted.aa",
+                            seqtype = "AA")
+aa.names <- getName(braker_aa)
+
+braker_aa.filtered <- braker_aa[aa.names %in% ids.all] # 39141 sequences
+braker_aa.full.filtered <- braker_aa[aa.names %in% ids.full] # 18404 sequences
+braker_aa.partial.filtered <- braker_aa[aa.names %in% ids.partial] # 13723 sequences
+braker_aa.no.filtered <- braker_aa[aa.names %in% ids.no] # 7014 sequences
+
+# write subsets
+write.fasta(sequences = braker_aa.filtered, names = names(braker_aa.filtered),
+            file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.faa")
+write.fasta(sequences = braker_aa.full.filtered, names = names(braker_aa.full.filtered),
+            file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.faa")
+write.fasta(sequences = braker_aa.partial.filtered, names = names(braker_aa.partial.filtered),
+            file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.faa")
+write.fasta(sequences = braker_aa.no.filtered, names = names(braker_aa.no.filtered),
+            file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.faa")
+```
+
+
+
+
diff --git a/workflows/merge_annotation/readme.txt b/workflows/merge_annotation/readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..07756106ab59e572559a654025fa157186cd1b72
--- /dev/null
+++ b/workflows/merge_annotation/readme.txt
@@ -0,0 +1,11 @@
+## Merge of computational annotation and Iso-Seq transcripts
+
+Combine the computational annotation with full-length transcript sequencing data using TSEBRA.
+
+### Script order:
+
+- code/merge_annotation/Braker2_subsets_and_merge.Rmd
+Prepare BRAKER2 input, use only predicted genes supported by external evidence for the merge
+
+- code/merge_annotation/reannotation_correction.Rmd
+Merge BRAKER2 and full-length transcript sequencing data using TSEBRA, deduplicate, rename genes and compare annotation completeness using BUSCO
diff --git a/workflows/merge_annotation/reannotation_correction.Rmd b/workflows/merge_annotation/reannotation_correction.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..d056ed1422893d7094c2a76959383ee851965b2c
--- /dev/null
+++ b/workflows/merge_annotation/reannotation_correction.Rmd
@@ -0,0 +1,578 @@
+---
+title: "Reannotation_correction"
+author: "twinkle1"
+date: "2/18/2022"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(data.table)
+library(GenomicRanges)
+library(seqinr)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publication/")
+```
+
+
+## Introduction
+
+Run TSEBRA to combine the computational annotation with Iso-Seq data. Check the generated annotation (based on the merge of Isoseq data with the braker2 predictions) for redundancies. Remove redundant gene structures and rename all genes using a unified rule set.
+
+## Setup
+
+```{r}
+# Read a gtf file into a table and add gene_id and transcript_id columns
+read.gtf <- function(file){
+  # based on: https://www.biostars.org/p/272889/
+  # read the file and name its columns after the nine gtf fields
+  annotation <- fread(file)
+  setnames(annotation, names(annotation), c("chr","source","type","start","end","score","strand","phase","attributes"))
+  # helper: value of the first attribute field that matches att_of_interest, NA if no field matches
+  extract_attributes <- function(gtf_column, att_of_interest){
+    # split the attribute string into its fields and strip quotes and semicolons
+    fields <- unlist(strsplit(gtf_column, "; "))
+    fields <- gsub(";", "", gsub("\"", "", fields))
+    # space separated tokens (key, value) of all matching fields
+    tokens <- unlist(strsplit(fields[grep(att_of_interest, fields)], " "))
+    if (is.null(tokens)) {
+      return(NA)
+    }
+    return(tokens[2])
+  }
+  # extract gene and transcript id of every record
+  annotation$gene_id <- unlist(lapply(annotation$attributes, extract_attributes, "gene"))
+  annotation$transcript_id <- unlist(lapply(annotation$attributes, extract_attributes, "transcript"))
+  return(annotation)
+}
+
+# Create function for converting a gtf dataframe to a genomic ranges object
+Granges_from_gtf <- function(gtf){
+  # requires the GenomicRanges and tidyverse packages
+  # collapse all records of a transcript into a single span
+  per_transcript <- summarise(group_by(gtf, transcript_id), # group by transcript id
+                              gene_start = min(start),
+                              gene_end = max(end),
+                              seqnames = unique(chr), # all sequences should be on the same chromosome
+                              gene_strand = unique(strand))
+  # one range per transcript, named by the transcript id
+  spans <- IRanges(start = per_transcript$gene_start,
+                   end = per_transcript$gene_end,
+                   names = per_transcript$transcript_id)
+  return(GRanges(seqnames = per_transcript$seqnames,
+                 ranges = spans,
+                 strand = per_transcript$gene_strand))
+}
+
+
+# Return the records of subject_gtf whose transcript is not overlapped by any range of the query
+# query_granges and subject_granges are GRanges objects, subject_gtf is the gtf table to be filtered
+extract_nonoverlaps <- function(query_granges, subject_granges, subject_gtf){
+  # find overlaps, including the strand information
+  hits <- findOverlaps(query = query_granges, subject = subject_granges)
+  # names of the subject ranges that are hit by at least one query range
+  hit_ids <- names(subject_granges)[subjectHits(hits)]
+  # keep all records whose transcript id is not among the hit subject ranges
+  remaining_gtf <- subject_gtf[!subject_gtf$transcript_id %in% hit_ids]
+  return(remaining_gtf)
+}
+
+# Create a function to report all transcripts within a read.gtf object, that have both a start and stop codons annotated
+report_both_codons <- function(gtf_object){
+  # count the records per transcript and feature type
+  type_counts <- gtf_object %>%
+      group_by(transcript_id, type) %>%
+      summarise(count = n())
+  # ids of the transcripts with exactly one start codon record and with exactly one stop codon record
+  single_record <- type_counts$count == 1
+  ids_with_start <- type_counts$transcript_id[type_counts$type == "start_codon" & single_record]
+  ids_with_stop <- type_counts$transcript_id[type_counts$type == "stop_codon" & single_record]
+  # ids with both codons are returned
+  return(ids_with_start[ids_with_start %in% ids_with_stop])
+}
+
+
+# extract all overlapping sequences in lists with their respective other overlapping sequences
+lists.of.overlaps <- function(overlaps){
+  # overlaps: object with the parallel integer slots @from and @to, one pair per overlap
+  # (presumably the Hits object of a findOverlaps() self comparison - TODO confirm against the caller)
+  # returns a list holding one vector of @to indices per group of overlapping sequences
+  # get all ids of the selfoverlapping sequences (only once for each id)
+  selfoverlap <- unique(overlaps@from[duplicated(overlaps@from)])
+  # initialize
+  list.out <- list()
+  i <- 1
+  
+  # while there are still selfoverlapping sequences
+  while (length(selfoverlap) > 0){
+    # the first remaining id is processed in this round
+    current <- selfoverlap[1]
+    # collect the overlap partners (to, not from) of the current sequence in one vectorised step,
+    # this replaces the inner loop over every single overlap
+    vec <- overlaps@to[overlaps@from == current]
+    # add vector to list, use i as iteration counter for accessing list position
+    list.out[[i]] <- vec
+    # prune sequences in vector from selfoverlap id vector so as not to create duplicated objects in list
+    # the current id is pruned explicitly as well: without a self hit (current -> current) among the
+    # overlaps it would never be removed and the loop would not terminate
+    selfoverlap <- selfoverlap[!selfoverlap %in% c(current, vec)]
+    # add iteration count
+    i <- i+1
+  }
+  return(list.out)
+}
+
+# function to get the duplicated elements from list output (every repeated occurrence of a value over all list entries)
+get_dups <- function(list){
+  flat <- unlist(list) # the former Reduce() over a one-element list returned exactly this vector unchanged
+  return(flat[duplicated(flat)])
+}
+
+# function for concatenating list elements with (at least partially) similar entries
+cat.elements <- function(list){
+  # list: list of vectors; returns the list with every element that shares a value with an earlier
+  # element merged into that earlier element and removed from its own position
+  # with fewer than two elements there is nothing to merge (2:length(list) would count downwards)
+  if (length(list) < 2) return(list)
+  vec <- c() # positions of the elements that were merged into an earlier element
+  for (i in 2:length(list)){
+    for (j in seq_along(list[[i]])){
+      # only search for the matching element(s) if the value occurred in the list before
+      if (list[[i]][j] %in% unlist(list[1:(i-1)])){
+        for (k in 1:(i-1)){
+          for (l in seq_along(list[[k]])){
+            if (list[[i]][j] == list[[k]][l]){
+              list[[k]] <- unique(c(list[[i]], list[[k]])) # add both into one entry
+              vec <- c(vec, i) # collect rows to remove
+            }
+          }
+        }
+      }
+    }
+  }
+  # a negative index built from an empty vec selects nothing, so the list is returned unchanged
+  # when no element was merged (previously an empty list was returned in that case)
+  if (length(vec) == 0) return(list)
+  return(list[-vec])
+}
+
+# part of the read.gtf function: value of the first attribute field matching att_of_interest, NA if none matches
+extract_attributes <- function(gtf_column, att_of_interest){
+    # split the attribute string into its fields and strip quotes and semicolons
+    fields <- unlist(strsplit(gtf_column, "; "))
+    fields <- gsub(";", "", gsub("\"", "", fields))
+    tokens <- unlist(strsplit(fields[grep(att_of_interest, fields)], " "))
+    if (is.null(tokens)) {
+      return(NA)
+    }
+    return(tokens[2])
+}
+```
+
+
+## Main
+
+Combine computational annotation and Isoseq transcripts into a single genome annotation. Predict ORFs in the transcript sequences using CPC2.
+
+```{bash}
+mkdir -p data/isoseq/cpc2/
+# the directory created above is the one used by the -o prefix below
+# Predict coding sequence (ORFs) in the SQANTI corrected Iso-Seq transcripts for TSEBRA
+tools/CPC2_standalone-1.0.1/bin/CPC2.py \
+	-i data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.fasta \
+	-o data/isoseq/cpc2/cpc2_from_sqanti \
+	--ORF
+```
+
+Convert CPC2 output into a gtf file of predicted transcript coding sequences. CPC2 output annotates the CDS based on transcript internal coordinates. In order to generate the gtf file, the transcript internal coordinates have to be converted into genome coordinates, while keeping track of exon boundaries and strandedness.
+
+```{r}
+# prepare a bed file from the results of cpc2
+cpc2 <- read.table("data/isoseq/cpc2/cpc2_from_sqanti.txt")
+# filter for coding transcripts with an intact ORF, then derive one bed interval per transcript
+# transmute() does the row-wise conversion: summarise() is deprecated for results with more than one row
+cpc2 <- cpc2 %>%
+  filter(V9 == "coding" & V6 == 1) %>%
+  transmute(ID=V1, start=V7-1, end=V7+(V3*3)-1)
+# write as bed file for extraction of CDS
+write_tsv(cpc2, file="data/isoseq/cpc2/cpc2_extraction.bed",
+          quote = "none",
+          col_names = F)
+
+# Convert cpc2 output to gtf file:
+# read in gtf file of isoseq transcript data, only the exon records are needed
+isoseq.gtf <- read.gtf("data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.gtf")
+isoseq.gtf <- isoseq.gtf[isoseq.gtf$type == "exon",]
+
+# read in cpc2 output file, the bed file used for extraction does suffice (it is written to data/isoseq/cpc2/ above)
+cpc2.bed <- read.table("data/isoseq/cpc2/cpc2_extraction.bed")
+colnames(cpc2.bed) <- c("ID", "CDS_start", "CDS_end")
+# bed format is 0 based, convert back to a 1 based format, end position does not have to be converted
+cpc2.bed$CDS_start <- cpc2.bed$CDS_start+1
+
+# create vector of all ids to loop through
+ids <- as.character(unique(cpc2.bed$ID))
+
+# initialize: stranded holds the strand of the current transcript, output collects the CDS records of all transcripts
+stranded <- c()
+output <- c()
+
+# for each transcript id
+for (i in 1:length(ids)){
+  # exon records of the current transcript, sorted by ascending start position
+  gtf.subset <- isoseq.gtf[isoseq.gtf$transcript_id == ids[i],]
+  gtf.subset <- gtf.subset %>%
+      arrange(start)
+  # create vector of genomic positions from the exon features (columns 4 and 5 are start and end), one entry per exonic base
+  position <- c()
+  for (j in 1:nrow(gtf.subset)){
+    position <- c(position, gtf.subset[[j,4]]:gtf.subset[[j,5]])
+  }
+  # include strand information (column 7), taken from the first exon record of the transcript
+  stranded <- gtf.subset[[1,7]]
+  # reverse order of elements for minus strand, so that the vector always runs in transcript direction
+  if (stranded == "-"){
+    position <- rev(position)
+  }
+  # get the positions that are supposed to be extracted: the transcript internal CDS coordinates index into the position vector
+  cpc2.subset <- cpc2.bed[cpc2.bed$ID == ids[i],]
+  cds_coordinates <- position[cpc2.subset$CDS_start:cpc2.subset$CDS_end]
+  # to calculate the exon ends, shift the cds_coordinate vector by one position, create a pseudovalue for the last entry (so that the last position never counts as a boundary)
+  shifted <- cds_coordinates[-1]
+  shifted[length(shifted)+1] <- shifted[length(shifted)]+1
+  # which values change by more than 1 after one shift? These values represent exon boundaries (index of the last CDS position before each jump)
+  boundaries <- which(abs(cds_coordinates - shifted) != 1)
+  
+  # create matrix from which to construct the start and end positions of the gtf file (one row per CDS segment, column 1 = first and column 2 = last position in transcript direction)
+  # create matrices for single exon genes also, as they are converted to vectors in a later step otherwise
+  if (length(boundaries) > 0){
+    mat <- matrix(data=NA, nrow = 1+length(boundaries), ncol=2)
+    mat[1,1] <- cds_coordinates[1]
+    mat[length(boundaries)+1,2] <- cds_coordinates[length(cds_coordinates)]
+    for (j in 1:length(boundaries)){
+      mat[j,2] <- cds_coordinates[boundaries[j]]
+      mat[j+1,1] <- cds_coordinates[boundaries[j]+1]
+    }
+  } else if (stranded == "+") {
+    mat <- matrix(data=NA, nrow = 1, ncol=2)
+    mat[1,1] <- cds_coordinates[1]
+    mat[1,2] <- cds_coordinates[length(cds_coordinates)]
+  } else {
+    mat <- matrix(data=NA, nrow = 1, ncol=2)
+    mat[1,2] <- cds_coordinates[1]
+    mat[1,1] <- cds_coordinates[length(cds_coordinates)]
+  }
+  
+  # invert matrix for minus strand in order to start with the lowest number (the nested apply reverses both the row order and the two columns)
+  # only if there are multiple exons
+  if (stranded == "-" & nrow(mat) != 1){
+    mat <- apply(apply(mat, 1, rev), 1, rev)
+  }
+  
+  # use the previously created gtf structure to store the CDS coordinates (the first nrow(mat) exon records are overwritten)
+  # the number of CDS rows is always <= number of exon rows
+  out.subset <- gtf.subset[1:nrow(mat),]
+  for (j in 1:nrow(mat)){
+    out.subset[j,4] <- mat[j,1]
+    out.subset[j,5] <- mat[j,2]
+    out.subset[j,3] <- "CDS"
+  }
+  output <- rbind(output, out.subset)
+  print(i) # progress: index of the transcript that was just processed
+}
+# output represents all CDS records
+
+# write gtf file (only the nine gtf columns, without the gene_id and transcript_id columns added by read.gtf)
+write.table(output[,1:9], 
+            "data/isoseq/cpc2/cpc2_extracted_cds.gtf",
+            col.names = F, row.names = F, quote=F, sep ="\t")
+```
+
+Run TSEBRA to combine computational prediction with Iso-Seq transcripts, using gffread for deduplication:
+
+```{bash}
+mkdir -p data/braker_analysis/TSEBRA
+mkdir -p data/reannotation_correction/computational
+mkdir -p data/reannotation_correction/manual
+# NOTE(review): the -g path lacks the polished_prot_rna/ level that Braker2_subsets_and_merge.Rmd uses for fixed_subsets/, and no step visible here writes full_partial_support.gtf - confirm the path and how the file is produced
+# -g specifies the braker gtf, -c long read config file, -e braker hint file -l isoseq cpc2 extracted gtf file
+/home/tom/Documents/tools/TSEBRA/bin/tsebra.py \
+	-g data/braker_analysis/external_evidence/fixed_subsets/full_partial_support.gtf \
+	-c /home/tom/Documents/tools/TSEBRA/config/long_reads.cfg \
+	-e data/braker2/polished_prot_rna/hintsfile.gff \
+	-l data/isoseq/cpc2/cpc2_extracted_cds.gtf \
+	-o data/braker_analysis/TSEBRA/tsebra.gtf
+
+
+# To remove any duplicated entries and to cluster the predicted transcripts into loci from the tsebra gtf file I used gffread.
+# -M option merges identical entries, -K option causes stricter merge, -T outputs as gtf (did also output as gff file)
+/home/tom/Documents/tools/gffread-0.12.7/gffread -M -K -T \
+	-d data/braker_analysis/TSEBRA/duplication_info.txt \
+	data/braker_analysis/TSEBRA/tsebra.gtf > \
+	data/braker_analysis/TSEBRA/tsebra_dedup.gtf
+```
+
+
+After the finalisation of the genome prediction using TSEBRA and the deduplication using gffread the generated gtf file is sorted and gene names are unified. Sort the file and add the locus as gene id:
+
+```{r}
+# read in the deduplicated annotation written by gffread
+tsebra.gtf <- read.gtf("data/braker_analysis/TSEBRA/tsebra_dedup.gtf")
+
+# create a mapping from transcript id to locus, the locus is only found in rows of the "transcript" type
+transcript_locus_mapping <- tsebra.gtf %>%
+  filter(type == "transcript")
+transcript_locus_mapping$locus <- unlist(lapply(transcript_locus_mapping$attributes, extract_attributes, "locus"))
+transcript_locus_mapping <- transcript_locus_mapping %>%
+  select(transcript_id, locus)
+
+# join locus information into the tsebra dataframe (no "by" is given, the only shared column is transcript_id)
+tsebra.gtf <- left_join(tsebra.gtf, transcript_locus_mapping)
+
+# save information as RDS
+output <- tsebra.gtf %>%
+  select(source, gene_id, transcript_id, locus)
+saveRDS(output, "data/reannotation_correction/computational/locus.RDS")
+
+
+# create a dataframe indicating the order of the scaffolds and contigs
+names <- sort(unique(tsebra.gtf$chr))
+
+# sort names by their length first and alphabetically second, then add number indicating the order
+names <- names[order(nchar(names), names)]
+order <- 1:(length(names))
+
+# join the created dataframe with the gtf dataframe for sorting, order2 stores the current record order
+order.df <- data.frame(names, order)
+tsebra.gtf <- left_join(tsebra.gtf, order.df, by= c("chr" = "names"))
+tsebra.gtf$order2 <- 1:nrow(tsebra.gtf)
+
+# group by transcript id, then sort all records first by chromosome (order column) and then by their
+# previous position in the file (order2); arrange() is called without .by_group, so it does not sort per group
+tsebra.gtf.sorted <- tsebra.gtf %>%
+  group_by(transcript_id) %>%
+  arrange(order, order2)
+
+# use only one record per transcript, group by locus and add the running number (occ) of the transcript at that locus
+# to later create the transcript names
+transcript_id <- tsebra.gtf.sorted %>% 
+  filter(type == "transcript") %>% 
+  group_by(locus) %>% 
+  mutate(occ = 1:n()) %>%
+  ungroup() %>%
+  select(transcript_id, occ)
+tsebra.gtf.sorted <- left_join(tsebra.gtf.sorted, transcript_id, by=c("transcript_id" = "transcript_id"))
+
+
+
+# write temporary RDS file (read again by the renaming chunk):
+saveRDS(tsebra.gtf.sorted, 
+          "data/reannotation_correction/computational/tsebra_temp3.RDS")
+```
+
+Rename the transcripts and save again as a renamed gtf file:
+
+```{r}
+# read in RDS file
+tsebra.gtf.sorted <- readRDS("data/reannotation_correction/computational/tsebra_temp3.RDS")
+
+# rename the locus and transcript ID records to match
+# add column indicating the gene order
+ids <- unique(tsebra.gtf.sorted$locus)
+# create vector of gene order (seq_along also copes with an empty id vector, unlike 1:length(ids))
+order3 <- seq_along(ids)
+# fill up the order so that every number includes 6 figures
+# sprintf() pads the whole vector at once and replaces the former element-wise loop;
+# that loop stopped with an error in rep() as soon as a number had more than 6 digits,
+# whereas sprintf() returns such numbers unpadded
+order_filled <- sprintf("%06d", order3)
+# add characters to the beginning of the number to create the gene identifier
+gene_identifier <- paste("AHp", order_filled, sep="")
+
+# add the new gene identifier to the tsebra dataframe
+gene.id.df <- data.frame(ids, order3, order_filled, gene_identifier)
+tsebra.gtf.sorted <- left_join(tsebra.gtf.sorted, gene.id.df, by = c("locus" = "ids"))
+
+# create the transcript identifier based on gene identifier and the number of the transcript at its locus
+tsebra.gtf.sorted <- tsebra.gtf.sorted %>%
+  mutate(transcript_identifier = paste0(gene_identifier, ".", occ))
+
+# include new attribute column
+tsebra.gtf.sorted$new_attributes <- paste0("gene_id \"", 
+                                            tsebra.gtf.sorted$gene_identifier, 
+                                            "\"; transcript_id \"", 
+                                            tsebra.gtf.sorted$transcript_identifier, 
+                                            "\";")
+# create mapping file of old and new names:
+mapping <- tsebra.gtf.sorted %>% select(gene_id, transcript_id, locus, gene_identifier, transcript_identifier)
+saveRDS(mapping, "data/reannotation_correction/computational/mapping.RDS")
+
+# create final gtf file in the correct column order:
+output <- tsebra.gtf.sorted %>%
+  ungroup() %>%
+  select(chr, source, type, start, end, score, strand, phase, new_attributes)
+
+# write gtf
+write.table(output[,1:9], 
+          "data/reannotation_correction/computational/tsebra_renamed.gtf",
+          col.names = F, row.names = F, quote=F, sep ="\t")
+```
+
+Based on evidence from sequencing data, the sequence of the AmMYBl1 gene (AHp014591) was manually adjusted. The resulting file was saved as data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf. Gffread was used to extract cds and protein fasta files of the annotated genes and convert the gtf file to the gff3 format, representing the genome annotation v2.2.
+
+More manual adjustment is required. First, the phase field in the annotation file is incorrect (likely an issue with TSEBRA). Secondly, seemingly a bug in TSEBRA causes the annotated sequence to be changed in the case that the stop codon is split up by an intron. I will save the annotation with the manually corrected AmMYBl1 as a gtf file under a different name and fix the phasing issue. The new manually corrected input file is data/reannotation_correction/manual/manually_MYBl1_corrected.gtf.
+
+As a first step, the genes with wrong values in their last exons are manually adjusted. This is the list of identifiers with an incorrect last exon, the genes were corrected based on their annotated cpc2 cds predictions.
+ERROR: Proteins do not match for transcript AHp001894.1	Strand:+	Exons: 2 checked, corrected
+ERROR: Proteins do not match for transcript AHp012461.3	Strand:+	Exons: 8 checked, corrected
+ERROR: Proteins do not match for transcript AHp002432.2	Strand:+	Exons: 9 checked, corrected
+ERROR: Proteins do not match for transcript AHp016935.2	Strand:+	Exons: 5 checked, corrected
+ERROR: Proteins do not match for transcript AHp002702.1	Strand:-	Exons: 15 checked, corrected
+ERROR: Proteins do not match for transcript AHp001365.4	Strand:+	Exons: 11 checked, corrected
+ERROR: Proteins do not match for transcript AHp001365.3	Strand:+	Exons: 12 checked, corrected
+ERROR: Proteins do not match for transcript AHp011962.1	Strand:-	Exons: 16 checked, corrected
+ERROR: Proteins do not match for transcript AHp011962.2	Strand:-	Exons: 17 checked, corrected
+ERROR: Proteins do not match for transcript AHp013889.1	Strand:-	Exons: 4 checked, corrected
+ERROR: Proteins do not match for transcript AHp008007.4	Strand:+	Exons: 6 checked, corrected
+ERROR: Proteins do not match for transcript AHp008007.3	Strand:+	Exons: 6 checked, corrected
+ERROR: Proteins do not match for transcript AHp020302.2	Strand:-	Exons: 2 checked, corrected
+ERROR: Proteins do not match for transcript AHp022997.1	Strand:-	Exons: 8 checked, corrected
+ERROR: Proteins do not match for transcript AHp015536.2	Strand:+	Exons: 28 checked, corrected
+ERROR: Proteins do not match for transcript AHp003461.2	Strand:-	Exons: 14 checked, corrected
+ERROR: Proteins do not match for transcript AHp004263.2	Strand:+	Exons: 5 checked, corrected
+ERROR: Proteins do not match for transcript AHp017585.1	Strand:-	Exons: 2 checked, corrected
+ERROR: Proteins do not match for transcript AHp016151.2	Strand:-	Exons: 4 checked, corrected
+ERROR: Proteins do not match for transcript AHp015188.1	Strand:+	Exons: 6 checked, corrected
+
+Furthermore, I found 10 additional genes for which the CDS length was not a multiple of 3; those genes had similar issues to the ones listed above:
+names(annotation.cds.fasta[which((width(annotation.cds.fasta) %% 3) != 0)])
+ [1] "AHp003199.2" "AHp007694.1" "AHp008244.1" "AHp011701.3" "AHp012360.1" "AHp013978.2" "AHp013978.3" "AHp013978.4" "AHp014078.1" "AHp023614.1"
+
+
+```{r}
+annotation.gtf <- read.gtf("data/reannotation_correction/manual/manually_MYBl1_corrected.gtf")
+# Row indices were identified with grep -n on the identifiers listed above, so they are only valid for this exact input file. Column 4 is overwritten for plus-strand entries and column 5 for minus-strand entries (presumably the GTF start and end columns — confirm against read.gtf).
+annotation.gtf[14134,4] <- 31584484
+annotation.gtf[93858,4] <- 340181
+annotation.gtf[18171,4] <- 37391386
+annotation.gtf[127438,4] <- 10148031
+annotation.gtf[20207,5] <- 2036532 # minus strand
+annotation.gtf[10269,4] <- 24730250
+annotation.gtf[10257,4] <- 24730250
+annotation.gtf[90180,5] <- 18259575 # minus strand
+annotation.gtf[90197,5] <- 18259575 # minus strand
+annotation.gtf[104739,5] <- 21390275 # minus strand
+annotation.gtf[60607,4] <- 25416738
+annotation.gtf[60600,4] <- 25416631
+annotation.gtf[153532,5] <- 18012458 # minus strand, only adjust position by 1
+annotation.gtf[173743,5] <- 3280465 # minus strand
+annotation.gtf[116728,4] <- 6429565
+annotation.gtf[25743,5] <- 15898919 # minus strand
+annotation.gtf[31372,4] <- 31118543
+annotation.gtf[132945,5] <- 18214148 # minus strand
+annotation.gtf[121835,5] <- 17488847 # minus strand
+annotation.gtf[114530,4] <- 21203168
+## additional genes (CDS length not a multiple of 3):
+annotation.gtf[23976,5] <- 7441696 # minus strand, only adjust by 1
+annotation.gtf[57807,4] <- 21820533 # only adjust by 1
+annotation.gtf[62477,4] <- 27688979 # only adjust by 1
+annotation.gtf[88092,4] <- 13431774 # only adjust by 1
+annotation.gtf[93145,5] <- 23666180 # minus strand, only adjust by 1
+annotation.gtf[105257,5] <- 693429 # minus strand, only adjust by 1
+annotation.gtf[105269,5] <- 693429 # minus strand, only adjust by 1
+annotation.gtf[105281,5] <- 693429 # minus strand, only adjust by 1
+annotation.gtf[105888,5] <- 3528579 # minus strand, only adjust by 1
+annotation.gtf[178221,5] <- 807 # minus strand, only adjust by 1
+
+
+# write the first nine columns of the corrected table, tab-separated, without header, row names or quoting
+write.table(annotation.gtf[,1:9], 
+          "data/reannotation_correction/manual/manually_genes_corrected.gtf",
+          col.names = F, row.names = F, quote=F, sep ="\t")
+```
+
+After manual correction of genes, fix the phase attribute.
+
+```{bash}
+# Add a gene line and convert the corrected gtf to gff with gffread. The phase field is corrected by the gt gff3 -tidy call below.
+/home/tom/Documents/tools/gffread-0.12.7/gffread -E data/reannotation_correction/manual/manually_genes_corrected.gtf --keep-genes > data/reannotation_correction/manual/manually_genes_corrected.gene.gff
+
+# the genes with wrong coordinates should be fixed at this step, so that they can be assigned the correct phasing attribute
+gt gff3 -tidy -force -retainids -addids no -o data/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff data/reannotation_correction/manual/manually_genes_corrected.gene.gff
+
+# remove the "###" separator lines from the gff file; the pattern is anchored so that only lines starting with "###" are dropped
+grep -v '^###' data/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff > data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff
+```
+
+Convert to other formats using gffread.
+
+```{bash}
+# gffread binary and common path prefix of all annotation files handled in this chunk
+GFFREAD=/home/tom/Documents/tools/gffread-0.12.7/gffread
+ANNOT=data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected
+# convert the gff file to gtf format (-T)
+"$GFFREAD" -v -T --keep-genes "$ANNOT".gff > "$ANNOT".gtf
+# extract cds (-x) and prot (-y) fasta files using the softmasked assembly (-g)
+"$GFFREAD" -x "$ANNOT".cds.fasta \
+	-y "$ANNOT".prot.fasta \
+	-g polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \
+	"$ANNOT".gtf
+# conversion of the gtf back to gff3 format is left disabled:
+#"$GFFREAD" "$ANNOT".gtf > "$ANNOT".gff
+
+# copy all files starting with "A" to the annotation directory:
+cp data/reannotation_correction/manual/A* polished_genome_annotation/annotation/
+```
+
+Perform quality control of the annotated genes:
+
+```{r}
+# load in cds fasta
+annotation.cds.fasta <- readBStringSet(filepath = "data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta")
+
+# are there genes with length != a multiple of 3? (prints the indices of such entries)
+which((width(annotation.cds.fasta) %% 3) != 0)
+
+# do all genes start with start codon and end with a stop codon?
+# create table of uppercase letters using the last three annotated bases of each fasta entry
+# the table lists every observed triplet; it should contain only the three stop codons TAA, TAG and TGA
+table(toupper(subseq(annotation.cds.fasta, start = width(annotation.cds.fasta)-2, end = width(annotation.cds.fasta))))
+# create table of uppercase letters using the first three annotated bases of each fasta entry
+table(toupper(subseq(annotation.cds.fasta, start=1, end=3)))
+# result noted by the author: all coding sequences are a multiple of 3, start with a start codon and end with a stop codon
+```
+
+
+Compare annotation completeness of genome annotation v2.2 with previously published A. hypochondriacus genome annotation and A. cruentus annotation using BUSCO.
+
+```{bash}
+# genome annotation v2.2
+# compare against the orthoDBv10 Embryophyta dataset, using 7 threads
+busco -m protein \
+  -i polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \
+  -o genome_annotation_v2.2 \
+  -l embryophyta_odb10 \
+  --out_path data/annotation_analysis/busco/ \
+  --download_path data/annotation_analysis/busco/datasets/ \
+  -c 6
+  
+# genome annotation v2.1
+busco -m protein \
+  -i /home/tom/Documents/reference_genomes/Ahypochondriacus/annotation/Ahypochondriacus_459_v2.1.protein.fa \
+  -o genome_annotation_v2.1 \
+  -l embryophyta_odb10 \
+  --out_path data/annotation_analysis/busco/ \
+  --download_path data/annotation_analysis/busco/datasets/ \
+  -c 6
+  
+# A. cruentus annotation
+busco -m protein \
+  -i /home/tom/Documents/reference_genomes/Acruentus/annotation/Amacr_pep_20210312.tfa \
+  -o Acruentus_annotation \
+  -l embryophyta_odb10 \
+  --out_path data/annotation_analysis/busco/ \
+  --download_path data/annotation_analysis/busco/datasets/ \
+  -c 6
+```
+
+
diff --git a/workflows/other_inferences/infer_coverage_along_genome.sh b/workflows/other_inferences/infer_coverage_along_genome.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f534a07f001a3ebfbb676bd2d448a4e45c08cccb
--- /dev/null
+++ b/workflows/other_inferences/infer_coverage_along_genome.sh
@@ -0,0 +1,62 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2
+#SBATCH -t 30:00:00
+#SBATCH -J genome_coverage
+#SBATCH -o logs/other_inferences/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=10
+#SBATCH --mem=48gb
+
+
+# Run bwa to map WGS to the reference genome, calculate read depth at each position using samtools
+
+# setup
+module load bwamem2/2.2.1
+module load samtools/1.13
+
+
+#Set input and parameters
+read1=raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq.gz
+read2=raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq.gz
+input=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta
+outdir=data/mapping_bias_inference/genome_coverage/
+outbam=SRR2106212.sorted.bam
+outdedup=SRR2106212.sorted.dedup.bam
+outcov=SRR2106212.coverage
+
+mkdir -p outdir
+
+#index the genome file and map
+bwa-mem2 index ${input}
+# map using 10 threads, output in bam format with header, fixmates, sort and mark and remove duplicates
+bwa-mem2 mem -t 10 ${input} ${read1} ${read2}  | samtools sort -O bam -o "$outdir""$outbam"
+
+echo "mapping complete"
+
+# load correct java version
+module load openjdk/1.8.0_60
+
+echo mark duplicates
+
+java -Xmx40g -jar /home/twinkle1/tools/picard/picard.jar MarkDuplicates --TMP_DIR /scratch/twinkle1 --INPUT "$outdir""$outbam" --OUTPUT "$outdir""$outdedup" --METRICS_FILE "$outdir"SRR2106212.sorted.dedup.metrics
+
+
+samtools index -@ 9 "$outdir""$outdedup"
+
+echo "marking duplicates complete"
+
+#index bam and genome files
+#samtools index -@ 9 $outbam
+
+# get flagstat report
+samtools flagstat -@ 9 "$outdir""$outbam" > "$outdir""$outbam".flagstat.txt
+
+echo "indexing complete"
+
+# calculate depth
+samtools depth "$outdir""$outdedup" > "$outdir"SRR2106212.sorted.dedup.depth
+
+# subset scaffold 10
+grep "Scaffold_10" "$outdir"SRR2106212.sorted.dedup.depth > "$outdir"SRR2106212.sorted.dedup.10.depth
+
+echo "finished"
diff --git a/workflows/other_inferences/mapping_bias_inference_analysis.Rmd b/workflows/other_inferences/mapping_bias_inference_analysis.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..5de4884a74dc617ac05bf6f467e3afcb015b986f
--- /dev/null
+++ b/workflows/other_inferences/mapping_bias_inference_analysis.Rmd
@@ -0,0 +1,265 @@
+---
+title: "read_mapping_bias_analysis"
+author: "twinkle1"
+date: "2023-02-15"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(data.table)
+library(pafr)
+library(GenomicRanges)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publication/")
+```
+
+Analyse the bias in read mapping observed in ATAC and whole-genome sequencing. A large number of reads map to regions on Scaffold 10. Visualize the bias in read mapping by plotting Scaffold 10:
+
+```{r}
+# read in the per-position depth of Scaffold 10 (V2 = position, V3 = depth are used below)
+coverage <- read.table(file = "data/mapping_bias_inference/genome_coverage/SRR2106212.sorted.dedup.10.depth")
+
+# calculate coverage in windows with fixed window size (in bp)
+windowsize <- 5000
+
+# add window assignment
+coverage <- coverage %>%
+  mutate(window = ceiling(V2/windowsize))
+
+# sum the depth per window and add the window boundaries
+window_coverage <- coverage %>%
+  group_by(window) %>%
+  summarise(total_cov = sum(V3)) %>%
+  mutate(start = ((window-1)*windowsize),
+         end = start+windowsize)
+
+# plot coverage across Scaffold 10 in windows; the axis label is derived from windowsize so that it cannot disagree with the data
+ggplot(data = window_coverage) +
+  geom_rect(aes(xmin = start,
+                xmax = end,
+                ymin = 0,
+                ymax = total_cov)) +
+  theme_bw() +
+  labs(y = paste0("Total read depth in ", windowsize/1000, " kb window"),
+       x = "Position Scaffold 10") +
+  theme(text = element_text(size = 22))
+
+# save plot (the file name is kept unchanged although the windows are 5 kb wide)
+ggsave(filename = "plots/mapping_bias_inference/50k_window_read_coverage.png",
+       width = 14, height = 8)
+```
+
+Investigate if genomic features correlate with the observed mapping bias. Calculate summary statistics including GC content using bedtools:
+
+```{bash}
+# create genome file for bedtools
+awk -F'\t' 'BEGIN {OFS = FS} {print $1,$2}' /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta.fai > data/mapping_bias_inference/bedtools_genome.txt
+# make windows using bedtools, windowsize 5k, include only Scaffolds
+bedtools makewindows -g data/mapping_bias_inference/bedtools_genome.txt -w 5000 | grep "Scaffold" > data/mapping_bias_inference/all_genome_windows.bed
+# calculate statistics in windows
+bedtools nuc -fi /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta -bed data/mapping_bias_inference/genome_windows.bed > data/mapping_bias_inference/window_stats.txt
+```
+
+
+Plot Scaffold 10, with read depth and plastid genome content
+
+```{r}
+# read in paf files from minimap2 and convert them to data frames
+chloroplast.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome.paf")
+mito.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome.paf")
+mito.df <- as.data.frame(mito.paf)
+chloro.df <- as.data.frame(chloroplast.paf)
+
+# read in mapping files for cruentus
+chloroplast.cruentus.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome_cruentus.paf")
+mito.cruentus.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome_cruentus.paf")
+mito.cruentus.df <- as.data.frame(mito.cruentus.paf)
+chloro.cruentus.df <- as.data.frame(chloroplast.cruentus.paf)
+
+
+# plot the windowed read depth (window_coverage) together with the mitochondrial and chloroplast sequences mapped to Scaffold 10; no GC outlier layer is drawn here, although the fill scale still lists that break
+# only alignments with mapq != 0 are shown
+ggplot(data = window_coverage) +
+  geom_rect(aes(xmin = start,
+                xmax = end,
+                ymin = 0,
+                ymax = total_cov)) +
+  geom_rect(data = mito.df %>% filter(tname == "Scaffold_10",
+                                      mapq != 0),
+            aes(xmin = tstart,
+                xmax = tend,
+                ymin = -1000000,
+                ymax = -10000,
+                fill = "Mitochondrial")) +
+  geom_rect(data = chloro.df %>% filter(tname == "Scaffold_10",
+                                        mapq != 0),
+            aes(xmin = tstart,
+                xmax = tend,
+                ymin = -1200000,
+                ymax = -2200000,
+                fill = "Chloroplast")) +
+  coord_cartesian(xlim = c(5300000, 6500000)) +
+  #coord_cartesian(xlim = c(5835000,6390000)) +
+  theme_classic() +
+  labs(y = "Total read depth in 5 kb windows",
+       x = "Position on Scaffold 10",
+       fill = "Annotation") +
+  scale_fill_manual(values = c("red", "blue", "darkgreen"),
+                    breaks = c("5% GC content outlier","Mitochondrial","Chloroplast")) +
+  theme(text = element_text(size = 22))
+
+
+ggsave(filename = "plots/mapping_bias_inference/5kb_outlier_windows_zoom.png",
+       width = 10, height = 6, bg = "white")
+```
+
+How much of the plastid genomes can be found in the reference genome and where?
+
+```{r}
+# Summarise how much of a query sequence was aligned to each target sequence.
+#
+# paf_dataframe: alignment table created from a paf file (needs the columns
+#                qlen, mapq, tname and alen)
+# Alignments with mapq == 0 are ignored. percent_aligned is the summed
+# alignment length per target relative to the query length (qlen) found in
+# the first row of the same table, so every comparison is normalised by its
+# own query. The result is sorted by decreasing percent_aligned.
+# NOTE(review): overlapping alignments are summed, so values can exceed the
+# fraction of the query that is actually covered.
+summarise_alignment <- function(paf_dataframe){
+  query_length <- paf_dataframe$qlen[1]
+  paf_dataframe %>%
+    filter(mapq != 0) %>%
+    group_by(tname) %>%
+    summarize(total_align_length = sum(alen)) %>%
+    mutate(percent_aligned = (total_align_length/query_length)*100) %>%
+    arrange(desc(percent_aligned))
+}
+
+# how much mitochondrium was mapped?
+summarise_alignment(mito.df)
+
+# how much chloroplast was mapped?
+summarise_alignment(chloro.df)
+
+# how much mitochondrium was mapped to cruentus?
+summarise_alignment(mito.cruentus.df)
+
+# how much chloroplast was mapped to cruentus?
+summarise_alignment(chloro.cruentus.df)
+
+
+# Share of the query sequence that is covered by at least one alignment; the value is returned as a fraction and is not multiplied by 100.
+# A query base that maps to multiple target bases is counted only once.
+calculate_percent_covered <- function(paf_dataframe){
+  # drop alignments with a mapping quality of 0 (presumably secondary/ambiguous alignments — confirm)
+  paf_dataframe <- paf_dataframe %>%
+    filter(mapq != 0)
+  # create GRanges object of the aligned query intervals; NOTE(review): qstart/qend are used as-is, confirm their coordinate convention
+  paf_granges <- GRanges(seqnames = "query",
+                   ranges = IRanges(start = paf_dataframe$qstart,
+                                    end = paf_dataframe$qend))
+  # sum all covered positions, no matter how often they mapped, and divide by the value in row 1, column 2 (presumably the query length — confirm)
+  paf_cov <- coverage(paf_granges)
+  above_zero <- paf_cov@listData$query@values > 0
+  return(sum(paf_cov@listData$query@lengths[above_zero]) / paf_dataframe[1,2])
+}
+
+```
+
+There are structural differences in GC content between the different genomes. While the Beta vulgaris mitochondrium has 43.86 % GC, the chloroplast genome has 36.61 % and the Scaffolds of the nuclear genome have GC content between 32.3 % and 33.2 %.
+
+```{bash}
+# GC content of different genomes
+seqkit fx2tab --name --gc data/mapping_bias_inference/plastid_to_genome/Ah_chloroplast.fasta
+seqkit fx2tab --name --gc data/mapping_bias_inference/plastid_to_genome/Bv_mitochondrium.fasta
+seqkit fx2tab --name --gc polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta | grep "Scaffold"
+```
+
+
+Extract a list of genes overlapping the mapped plastid genome positions to investigate enrichment of specific functions.
+
+```{r}
+# Read a gtf file into a table and add gene_id and transcript_id columns parsed from the attributes column.
+read.gtf <- function(file){
+  # based on: https://www.biostars.org/p/272889/
+  # read in the gtf file and name its nine columns:
+  gff <- fread(file)
+  setnames(gff, names(gff), c("chr","source","type","start","end","score","strand","phase","attributes"))
+  # Return the value of the attribute whose key is exactly att_of_interest (NA if absent). Only the key at the start of each "key value" pair is compared, so a value that merely contains the key text cannot be picked up.
+  extract_attributes <- function(gtf_column, att_of_interest){
+    att <- strsplit(gtf_column, "; ")
+    att <- gsub("\"","",unlist(att))
+    att <- gsub(";","",unlist(att))
+    hit <- att[startsWith(att, paste0(att_of_interest, " "))]
+    if(length(hit) > 0){
+      return(unlist(strsplit(hit[1], " "))[2])
+    }else{
+      return(NA)
+    }
+  }
+  # using the function to subset gene and transcript id:
+  gff$gene_id <- unlist(lapply(gff$attributes, extract_attributes, "gene_id"))
+  gff$transcript_id <- unlist(lapply(gff$attributes, extract_attributes, "transcript_id"))
+  return(gff)
+}
+
+# annotated transcripts, used to find genes located in the organelle-derived regions
+annotation <- filter(
+  read.gtf("polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf"),
+  type == "transcript")
+
+
+# genomic ranges of all transcripts, named by transcript id
+annotation_ranges <- GRanges(
+  seqnames = annotation$chr,
+  ranges = IRanges(start = annotation$start,
+                   end = annotation$end,
+                   names = annotation$transcript_id),
+  strand = annotation$strand)
+
+# mitochondrial regions
+# keep only alignments to Scaffold 10 with mapq != 0
+mito.df <- filter(mito.df, mapq != 0 & tname == "Scaffold_10")
+mito_ranges <- GRanges(
+  seqnames = mito.df$tname,
+  ranges = IRanges(start = mito.df$tstart,
+                   end = mito.df$tend))
+
+# column 11 of the annotation rows that overlap a mitochondrial region
+mito_overlap <- annotation[queryHits(findOverlaps(annotation_ranges, mito_ranges)),11]
+
+# chloroplast regions
+# keep only alignments to Scaffold 10 with mapq != 0
+chloro.df <- filter(chloro.df, mapq != 0 & tname == "Scaffold_10")
+chloro_ranges <- GRanges(
+  seqnames = chloro.df$tname,
+  ranges = IRanges(start = chloro.df$tstart,
+                   end = chloro.df$tend))
+
+# column 11 of the annotation rows that overlap a chloroplast region
+chloro_overlap <- annotation[queryHits(findOverlaps(annotation_ranges, chloro_ranges)),11]
+```
+
+
+Compare with functional annotation:
+
+```{r}
+# read the eggNOG-mapper table; its second row holds the column names, so rows 1 and 2 are dropped afterwards
+functional_annotation <- readxl::read_xlsx(
+  path = "data/functional_annotation/eggnog_mapper/MM_8wexw920.emapper.annotations.xlsx")
+colnames(functional_annotation) <- functional_annotation[2,]
+functional_annotation <- functional_annotation[-c(1, 2),]
+
+# functional annotation of the transcripts overlapping mitochondrial / chloroplast regions
+mito_functions <- filter(functional_annotation,
+                         query %in% mito_overlap$transcript_id)
+chloro_functions <- filter(functional_annotation,
+                           query %in% chloro_overlap$transcript_id)
+```
+
+
+
+
+
diff --git a/workflows/other_inferences/plastid_to_genome_mapping.sh b/workflows/other_inferences/plastid_to_genome_mapping.sh
new file mode 100644
index 0000000000000000000000000000000000000000..279e7c7d1e95e65d840836c43c53adcf0858d999
--- /dev/null
+++ b/workflows/other_inferences/plastid_to_genome_mapping.sh
@@ -0,0 +1,35 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 01:00:00
+#SBATCH -J minimap
+#SBATCH -o /home/twinkle1/projects/Ahyp_v2_2/logs/other_inferences/mappingLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=8
+#SBATCH --mem=20gb
+
+
+# load necessary modules
+source "$CONDA_PREFIX"/etc/profile.d/conda.sh
+conda activate isoseq
+
+# create output directory (the input fasta files are read from the same directory)
+MAPPINGOUT=data/mapping_bias_inference/plastid_to_genome/
+mkdir -p "$MAPPINGOUT"
+
+### MAPPING
+# reference genome onto which the organelle sequences are mapped
+REFERENCE=/projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta
+
+# Align sequences to reference genome
+# sequences obtained from (Beta vulgaris, mitochondrium): https://www.ncbi.nlm.nih.gov/nuccore/BA000009.3
+# (Amaranthus hypochondriacus, chloroplast): https://www.ncbi.nlm.nih.gov/nuccore/KX279888.1
+
+INPUTMITO="$MAPPINGOUT"Bv_mitochondrium.fasta
+INPUTCHLORO="$MAPPINGOUT"Ah_chloroplast.fasta
+OUTPUTMITO="$MAPPINGOUT"mitochondrium_to_genome.paf
+OUTPUTCHLORO="$MAPPINGOUT"chloroplast_to_genome.paf
+
+
+# asm5 for intra species/genus chloroplast assembly, asm10 for cross-species mitochondrial alignment; output in paf format
+minimap2 -t 8 -cx asm10 "$REFERENCE" "$INPUTMITO" > "$OUTPUTMITO"
+minimap2 -t 8 -cx asm5 "$REFERENCE" "$INPUTCHLORO" > "$OUTPUTCHLORO"
diff --git a/workflows/other_inferences/readme.txt b/workflows/other_inferences/readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9b3f718dbe1354c1915fb386295cd3c12e209776
--- /dev/null
+++ b/workflows/other_inferences/readme.txt
@@ -0,0 +1,14 @@
+## Other inferences
+
+Mapping bias and plastid content investigation
+
+### Script order:
+
+- code/other_inferences/infer_coverage_along_genome.sh
+Map WGS reads against the reference genome to investigate mapping bias
+
+- code/other_inferences/plastid_to_genome_mapping.sh
+Map chloroplast and mitochondria sequences to the reference genome
+
+- code/other_inferences/mapping_bias_inference_analysis.Rmd
+Plot mapping bias with annotated chloroplast and mitochondrial sequences
diff --git a/workflows/readme.txt b/workflows/readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..665da22f3863d059cfb0b6ac3f70fd0bd3be8701
--- /dev/null
+++ b/workflows/readme.txt
@@ -0,0 +1,29 @@
+## Code
+
+All code used for polishing, masking and reannotation of the A. hypochondriacus reference genome.
+
+### Order of usage:
+
+- code/genome_polishing/
+polish the previously published A. hypochondriacus reference genome
+
+- code/repeat_masking/
+to prepare computational annotation, softmask repetitive elements in the polished genome
+
+- code/braker2/
+perform computational annotation of the softmasked reference genome using BRAKER2
+
+- code/isoseq_assembly/
+assemble long-read transcript sequencing data and compare the effects of genome polishing on reported completeness
+
+- code/merge_annotation/
+merge computational annotation with long-read transcript sequencing data and process output into genome annotation v2.2
+
+- code/annotation_analysis/
+identify flavonoid and betalain pathway genes, as well as MYB transcription factors in the genome annotation v2.2
+
+- code/other_inferences/
+mapping of chloroplast and mitochondrial sequences to the reference genome, analysis of mapping coverage bias
+
+- code/BSA/
+bulk segregant analysis and analysis of RNA-seq data from pooled flower tissue
diff --git a/workflows/repeat_masking/analyse_repetitive_elements.Rmd b/workflows/repeat_masking/analyse_repetitive_elements.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..149b81c0d133112fa45d751176ceba893326bb0d
--- /dev/null
+++ b/workflows/repeat_masking/analyse_repetitive_elements.Rmd
@@ -0,0 +1,54 @@
+---
+title: "Analyse_repetitive_elements"
+author: "twinkle1"
+date: "2023-06-19"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publication/")
+```
+
+
+Prepare input file for processing using R:
+
+```{bash}
+# process build summary output
+tail -n +41 data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary | head -n -907 > data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv
+```
+
+
+Analyse repetitive element content and composition:
+
+```{r}
+# read in buildSummary and keep its first four columns
+buildSummary <- read_table(
+  file = "data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv",
+  col_names = c("repeat", "count", "length", "percent","drop"))[,1:4]
+
+# read in repeat classification (family and classification separated by "#")
+new_TE_classification <- read_delim(
+  file = "data/repeatmasking/reclassification/classified.updated.txt",
+  delim = "#", col_names = c("TE_family", "classification"))
+
+# add the classification; repeats matching "rich" are low complexity, remaining unclassified ones are simple repeats
+buildSummary <- left_join(buildSummary, new_TE_classification, by = c("repeat" = "TE_family"))
+buildSummary$classification[grepl("rich", buildSummary$`repeat`)] <- "Low complexity"
+buildSummary <- mutate(buildSummary,
+                       classification = replace_na(classification, "Simple_repeat"))
+
+# create TE summary table
+# percentage is relative to 403994491 (presumably the assembly size in bp — confirm)
+summary_table <- summarise(group_by(buildSummary, classification),
+                           total_length = sum(length),
+                           percentage = round((total_length/403994491)*100, 2))
+
+summarize(summary_table,
+          sum_total_length = sum(total_length),
+          sum_percentage = sum(percentage))
+
+write.csv(summary_table, file = "data/repeatmasking/reclassification/reclassified.output.tbl",
+          quote = FALSE, row.names = FALSE)
+```
diff --git a/workflows/repeat_masking/readme.txt b/workflows/repeat_masking/readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0fa63432df4359bd85b9c9d507b8363b471fd415
--- /dev/null
+++ b/workflows/repeat_masking/readme.txt
@@ -0,0 +1,14 @@
+## Masking of repetitive elements
+
+The polished reference genome is masked for repetitive elements in order to prepare the computational annotation.
+
+### Script order:
+
+- code/repeat_masking/run_repeatmodeler.sh
+run Repeatmodeler to identify repetitive elements in the polished reference genome
+
+- code/repeat_masking/run_repeatmasker.sh
+run Repeatmasker on the Repeatmodeler output to classify identified elements and mask the polished reference genome
+
+- code/repeat_masking/analyse_repetitive_elements.Rmd
+analyse the repetitive element composition
diff --git a/workflows/repeat_masking/run_repeatmasker.sh b/workflows/repeat_masking/run_repeatmasker.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d848d5e77107ff63567b624be9cdda4c7ccfdf8e
--- /dev/null
+++ b/workflows/repeat_masking/run_repeatmasker.sh
@@ -0,0 +1,53 @@
+#!/bin/bash -l
+#SBATCH -D /projects/ag-stetter/twinkle/projects/Ahyp_v2_2_publication/
+#SBATCH -t 160:00:00
+#SBATCH -J rmodeler
+#SBATCH -o /projects/ag-stetter/twinkle/projects/Ahyp_v2_2/logs/repeatmasker/mappingLog-%j.txt
+#SBATCH --error /projects/ag-stetter/twinkle/projects/Ahyp_v2_2/logs/repeatmasker/errorLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=20
+#SBATCH --mem=42gb
+#SBATCH --mail-type=ALL
+
+
+# run on cheops1
+# this script is used to run repeatmasker usuing the generated repeatmodeler repeatdatabase
+
+# load modules
+module load repeatmasker/4.1.1
+
+# create database directory
+RMOUT=data/repeatmasking/repeatmasker
+
+mkdir -p $RMOUT
+
+### Main
+# watch out with the processor setting, each rmblast job will take 4 threads
+# for 20 threads, the pa setting while using rmblast should be 5
+
+# Converted the polished reference genome fasta to all capital letters as preparation to repeatmasking:
+awk '/^>/ {print($0)}; /^[^>]/ {print(toupper($0))}' data/NextPolish/processed/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta \
+	> "$RMOUT"/Ahypochondriacus_2.2_polished.capital.fasta
+
+### run RepeatMasker
+# lib = repeat database created with repeatmodeler
+RepeatMasker -lib data/repeatmasking/repeatmodeler/consensi.fa.classified \
+	-pa 5 \
+	-small \
+	-e rmblast \
+	-gff \
+	-dir "$RMOUT" \
+	"$RMOUT"/Ahypochondriacus_2.2_polished.capital.fasta
+
+mkdir "$RMOUT"/output
+
+# convert to softmasked fasta
+module load bedtools/2.29.2
+bedtools maskfasta \
+	-fi "$RMOUT"/Ahypochondriacus_2.2_polished.capital.fasta \
+	-bed "$RMOUT"/Ahypochondriacus_2.2_polished.capital.fasta.out.gff \
+	-soft \
+	-fo "$RMOUT"/output/Ahypochondriacus_2.2_polished.softmasked.fasta
+
+# more detailed summary:
+buildSummary.pl data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.out > data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary
diff --git a/workflows/repeat_masking/run_repeatmodeler.sh b/workflows/repeat_masking/run_repeatmodeler.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c33ef68b266852defd7cd38671bed273e9532dfd
--- /dev/null
+++ b/workflows/repeat_masking/run_repeatmodeler.sh
@@ -0,0 +1,41 @@
+#!/bin/bash -l
+#SBATCH -D /scratch/twinkle1/repeatmodeler/
+#SBATCH -t 200:00:00
+#SBATCH -J rmodeler
+#SBATCH -o /home/twinkle1/master_thesis/logs/repeatmodeler/mappingLog-%j.txt
+#SBATCH --error /home/twinkle1/master_thesis/logs/repeatmodeler/errorLog-%j.txt
+#SBATCH --nodes=1
+#SBATCH --ntasks=20
+#SBATCH --mem=42gb
+#SBATCH --mail-user=twinkle1@smail.uni-koeln.de
+#SBATCH --mail-type=ALL
+
+
+# run on cheops1
+# this script is used to run repeatmodeler on the newly polished reference assembly
+
+
+# load modules
+module load repeatmodeler/2.0.1
+
+# create database directory
+mkdir -p data/repeatmodeler/database/
+
+### Main
+# increased number of tasks, each rmblast job will take 4 threads
+# for 20 threads, the pa setting while using rmblast should be 5
+
+# Create database
+BuildDatabase -name data/repeatmodeler/database/polished \
+	data/NextPolish/processed/Ahypochondriacus_2.2_polished.fasta
+
+# run Repeatmodeler
+RepeatModeler -database data/repeatmodeler/database/polished \
+	-pa 5 \
+	-LTRStruct
+
+# reclassify identified repeats
+mkdir -p data/repeatmasking/reclassification/
+
+# repeatmasker version 4.1.5
+RepeatClassifier -consensi data/repeatmasking/reclassification/consensi.fa -stockholm data/repeatmasking/reclassification/families.stk