Compare revisions: betalain_regulation_amaranth

Showing 196 additions and 207 deletions
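Most of the changes in this comparison are path renames: code/ becomes workflows/, data/ becomes runs/, and raw input data moves under assays/ and studies/. A purely illustrative sketch of applying such a rename in bulk is shown below; the file globs and sed expressions are assumptions and not taken from the repository (GNU sed assumed, result to be reviewed with git diff).

```bash
# illustrative only (not taken from the repository): bulk-apply the directory renames
# \b keeps the data/ rule from also matching raw_data/; other raw_data/ subfolders would need their own rules
git ls-files '*.sh' '*.Rmd' '*.md' | xargs sed -i \
  -e 's|\bcode/|workflows/|g' \
  -e 's|\bdata/|runs/|g' \
  -e 's|\braw_data/isoseq_raw_reads/|assays/isoform_sequencing/dataset/|g'
```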
......@@ -20,13 +20,13 @@ echo "$SLURM_ARRAY_TASK_ID"
# removal of primers and barcodes already done
# step 4, removal of polyA tail and artificial concatemers
PROCESSING=raw_data/isoseq_raw_reads/processed/
PROCESSING=assays/isoform_sequencing/dataset/isoseq_raw_reads/processed/
mkdir -p "PROCESSING"
SOURCE_DIR=raw_data/isoseq_raw_reads/reads
SOURCE_DIR=assays/isoform_sequencing/dataset/reads
FILES=("$SOURCE_DIR"/*bam)
PRIMER_SOURCE=raw_data/isoseq_raw_reads/primers
PRIMER_SOURCE=assays/isoform_sequencing/dataset/primers
PRIMER_FILES=("$PRIMER_SOURCE"/*fasta)
REFINEIN="$SOURCE_DIR"/"${FILES["${SLURM_ARRAY_TASK_ID}"]}"
......@@ -46,7 +46,7 @@ samtools merge "$ISOSEQOUT"combined.merged_flnc.bam /home/twinkle1/isoseq_raw_re
# clustering of identical reads
IN="data/isoseq3_pipeline/combined.merged_flnc.bam"
OUT="data/isoseq3_pipeline/combined.merged_clustered.bam"
IN="runs/isoseq3_pipeline/combined.merged_flnc.bam"
OUT="runs/isoseq3_pipeline/combined.merged_clustered.bam"
isoseq3 cluster "$IN" "$OUT" --verbose --use-qvs
......@@ -4,20 +4,20 @@ Assembly full-length transcript sequencing data.
### Script order:
- code/isoseq_assembly/isoseq3_pipeline.sh
- workflows/isoseq_assembly/isoseq3_pipeline.sh
assemble FLNC reads from CCS files
- code/isoseq_assembly/combined_isoseq3_pipeline.sh
- workflows/isoseq_assembly/combined_isoseq3_pipeline.sh
combine FLNC reads and cluster identical reads
- code/isoseq_assembly/combined_mapping_and_collapse.sh
- workflows/isoseq_assembly/combined_mapping_and_collapse.sh
collapse clustered reads into unique full-length transcripts using the polished reference genome
- code/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh
- workflows/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh
collapse clustered reads into unique full-length transcripts using the unpolished reference genome
- code/isoseq_assembly/run_sqanti.sh
- workflows/isoseq_assembly/run_sqanti.sh
run SQANTI in order to correct possible sequencing errors in the full-length transcripts using both reference genomes
- code/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd
- workflows/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd
compare effect of genome polishing on error correction of full-length transcript sequences
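
A sketch of the invocation order for the scripts listed above (SLURM submission, the array size of the first job and local rendering of the Rmd report are assumptions, not taken from the repository):

```bash
# hypothetical invocation order; adjust array size and scheduler options to your setup
sbatch --array=0-3 workflows/isoseq_assembly/isoseq3_pipeline.sh
sbatch workflows/isoseq_assembly/combined_isoseq3_pipeline.sh
sbatch workflows/isoseq_assembly/combined_mapping_and_collapse.sh
sbatch workflows/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh
sbatch workflows/isoseq_assembly/run_sqanti.sh
Rscript -e 'rmarkdown::render("workflows/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd")'
```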
......@@ -3,20 +3,20 @@
# run sqanti correction and filtering
# the preliminary genome annotation is used to compare against, but this only affects the pdf output
mkdir -p data/isoseq/sqanti/output_polished
mkdir -p runs/isoseq/sqanti/output_polished
/home/tom/Documents/tools/SQANTI3-4.2/sqanti3_qc.py \
data/isoseq/mapping_and_collapse/combined.collapsed.min_fl_2.filtered.gff \
data/braker2/polished_prot_rna/braker.gtf \
polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \
-d data/isoseq/sqanti/output_polished/ \
runs/isoseq/mapping_and_collapse/combined.collapsed.min_fl_2.filtered.gff \
runs/braker2/polished_prot_rna/braker.gtf \
runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \
-d runs/isoseq/sqanti/output_polished/ \
-n 6
mkdir -p data/isoseq/sqanti/output_old_genome
/home/tom/Documents/tools/SQANTI3-4.2/sqanti3_qc.py \
data/isoseq/mapping_and_collapse_old_genome/combined.collapsed.min_fl_2.filtered.gff \
data/braker2/polished_prot_rna/braker.gtf \
/home/tom/Documents/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa \
-d data/isoseq/sqanti/output_old_genome/ \
runs/isoseq/mapping_and_collapse_old_genome/combined.collapsed.min_fl_2.filtered.gff \
runs/braker2/polished_prot_rna/braker.gtf \
studies/additional_data/resources/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa \
-d runs/isoseq/sqanti/output_old_genome/ \
-n 6
......@@ -54,10 +54,10 @@ Granges_from_gtf <- function(gtf){
seqnames = unique(chr), # all sequences should be on the same chromosome
gene_strand = unique(strand))
# use the gene_structures object to create the genomic ranges object
gene_ranges <- GRanges(seqnames = gene_structures$seqnames,
ranges = IRanges(start=gene_structures$gene_start,
gene_ranges <- GRanges(seqnames = gene_structures$seqnames,
ranges = IRanges(start=gene_structures$gene_start,
end=gene_structures$gene_end,
names = gene_structures$transcript_id),
names = gene_structures$transcript_id),
strand = gene_structures$gene_strand)
return(gene_ranges)
}
......@@ -86,21 +86,21 @@ report_both_codons <- function(gtf_object){
Create differently supported subsets using the available script from augustus/braker2:
```{bash}
mkdir -p data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets
mkdir -p runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets
# activate busco environment for the Augustus script
conda activate busco
# extract braker aa and cds, also generate the bad_genes.lst file
/home/tom/Documents/tools/Augustus/scripts/getAnnoFastaFromJoingenes.py -g polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta -f data/braker2/polished_prot_rna/braker.gtf -s FILTER -o data/braker2/polished_prot_rna/braker_extracted
/home/tom/Documents/tools/Augustus/scripts/getAnnoFastaFromJoingenes.py -g runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta -f runs/braker2/polished_prot_rna/braker.gtf -s FILTER -o runs/braker2/polished_prot_rna/braker_extracted
# select differently supported subsets based on the amount of support
tools/BRAKER/scripts/predictionAnalysis/selectSupportedSubsets.py --noSupport data/braker_analysis/external_evidence/polished_prot_rna/no_support.txt --fullSupport data/braker_analysis/external_evidence/polished_prot_rna/full_support.txt --anySupport data/braker_analysis/external_evidence/polished_prot_rna/any_support.txt data/braker2/polished_prot_rna/braker.gtf data/braker2/polished_prot_rna/hintsfile.gff
tools/BRAKER/scripts/predictionAnalysis/selectSupportedSubsets.py --noSupport runs/braker_analysis/external_evidence/polished_prot_rna/no_support.txt --fullSupport runs/braker_analysis/external_evidence/polished_prot_rna/full_support.txt --anySupport runs/braker_analysis/external_evidence/polished_prot_rna/any_support.txt runs/braker2/polished_prot_rna/braker.gtf runs/braker2/polished_prot_rna/hintsfile.gff
# remove all lines that start with a "#" character (they cause issues at later steps)
sed '/^#/d' data/braker_analysis/external_evidence/polished_prot_rna/full_support.txt > data/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf
sed '/^#/d' data/braker_analysis/external_evidence/polished_prot_rna/any_support.txt > data/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf
sed '/^#/d' data/braker_analysis/external_evidence/polished_prot_rna/no_support.txt > data/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf
sed '/^#/d' runs/braker_analysis/external_evidence/polished_prot_rna/full_support.txt > runs/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf
sed '/^#/d' runs/braker_analysis/external_evidence/polished_prot_rna/any_support.txt > runs/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf
sed '/^#/d' runs/braker_analysis/external_evidence/polished_prot_rna/no_support.txt > runs/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf
```
......@@ -108,12 +108,12 @@ Generate a dataframe with the transcript ids based on the differently supported
```{r}
# load in the support set gtfs
full.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf")
any.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf")
no.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf")
full.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf")
any.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf")
no.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf")
# load in the genes with internal stop codons, as detected by the getAnnoFastaFromJoingenes script (see braker2_results folder readme.txt)
badgenes <- read.table("data/braker_analysis/external_evidence/polished_prot_rna/bad_genes.lst")
badgenes <- read.table("runs/braker_analysis/external_evidence/polished_prot_rna/bad_genes.lst")
# extract the gene names with both annotated start and annotated stop codons
codons_any <- report_both_codons(any.support)
......@@ -130,21 +130,21 @@ no.ids <- unique(no.support$transcript_id) #7443
# create dataframe with the subset transcript ids and the support category
all.ids <- c(full.ids, partial.ids, no.ids)
support <- c(rep("full", length(full.ids)),
rep("partial", length(partial.ids)),
support <- c(rep("full", length(full.ids)),
rep("partial", length(partial.ids)),
rep("no", length(no.ids)))
support.df <- data.frame(all.ids, support)
# filter out internal stop codons
support.df <- support.df[!support.df$all.ids %in% badgenes$V1,]
support.df <- support.df[!support.df$all.ids %in% badgenes$V1,]
# exclude all annotated genes that do not have annotated start and stop codons
fixed_support.df <- support.df[support.df$all.ids %in% codons,]
##################################
# write created dataframe as rds object:
saveRDS(fixed_support.df, file="data/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
saveRDS(fixed_support.df, file="runs/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
```
......@@ -153,12 +153,12 @@ Filter out predicted genes with internal stop codons as well as predicted genes
```{r}
# load in the gtf files to subset
full.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf")
any.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf")
no.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf")
full.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf")
any.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf")
no.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf")
# load in the annotation dataframe:
support.df <- readRDS(file="data/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
support.df <- readRDS(file="runs/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
# create partial support dataframe:
partial.support <- any.support[!any.support$transcript_id %in% full.support$transcript_id,]
......@@ -177,28 +177,28 @@ no.support.filtered <- no.support[no.support$transcript_id %in% support.df$all.i
#length(unique(no.support.filtered$transcript_id)) # 3579 remain
### Write the filtered gtf files
write.table(full.support.filtered[,1:9],
"data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.gtf",
write.table(full.support.filtered[,1:9],
"runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.gtf",
col.names = F, row.names = F, quote=F, sep ="\t")
write.table(partial.support.filtered[,1:9],
"data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.gtf",
write.table(partial.support.filtered[,1:9],
"runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.gtf",
col.names = F, row.names = F, quote=F, sep ="\t")
write.table(no.support.filtered[,1:9],
"data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.gtf",
write.table(no.support.filtered[,1:9],
"runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.gtf",
col.names = F, row.names = F, quote=F, sep ="\t")
```
After filtering the gtf files, use them to subset the coding sequence and amino acid fasta files: load both fasta files and filter them using the support dataframe.
```{r}
library(seqinr)
# load in the support dataframe
support.df <- readRDS(file="data/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
support.df <- readRDS(file="runs/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
support.df$all.ids <- as.character(support.df$all.ids)
# subset fasta files using the support dataframe
braker_dna <- read.fasta("data/braker2/polished_prot_rna/braker_extracted.codingseq",
braker_dna <- read.fasta("runs/braker2/polished_prot_rna/braker_extracted.codingseq",
seqtype = "DNA")
# filter the different subsets
......@@ -208,22 +208,22 @@ braker_dna.partial.filtered <- braker_dna[getName(braker_dna) %in% support.df[su
braker_dna.no.filtered <- braker_dna[getName(braker_dna) %in% support.df[support.df$support == "no",1]] # 7014 sequences
# write subsets
write.fasta(sequences=braker_dna.filtered,
names=names(braker_dna.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.fasta")
write.fasta(sequences=braker_dna.full.filtered,
names=names(braker_dna.full.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.fasta")
write.fasta(sequences=braker_dna.partial.filtered,
names=names(braker_dna.partial.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.fasta")
write.fasta(sequences=braker_dna.no.filtered,
names=names(braker_dna.no.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.fasta")
write.fasta(sequences=braker_dna.filtered,
names=names(braker_dna.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.fasta")
write.fasta(sequences=braker_dna.full.filtered,
names=names(braker_dna.full.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.fasta")
write.fasta(sequences=braker_dna.partial.filtered,
names=names(braker_dna.partial.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.fasta")
write.fasta(sequences=braker_dna.no.filtered,
names=names(braker_dna.no.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.fasta")
### repeat for the AA fasta files:
# subset fasta files using the support dataframe
braker_aa <- read.fasta("data/braker2/polished_prot_rna/braker_extracted.aa",
braker_aa <- read.fasta("runs/braker2/polished_prot_rna/braker_extracted.aa",
seqtype = "AA")
braker_aa.filtered <- braker_aa[getName(braker_aa) %in% support.df[,1]] # 39141 sequences
......@@ -232,20 +232,16 @@ braker_aa.partial.filtered <- braker_aa[getName(braker_aa) %in% support.df[suppo
braker_aa.no.filtered <- braker_aa[getName(braker_aa) %in% support.df[support.df$support == "no",1]] # 7014 sequences
# write subsets
write.fasta(sequences=braker_aa.filtered,
names=names(braker_aa.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.faa")
write.fasta(sequences=braker_aa.full.filtered,
names=names(braker_aa.full.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.faa")
write.fasta(sequences=braker_aa.partial.filtered,
names=names(braker_aa.partial.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.faa")
write.fasta(sequences=braker_aa.no.filtered,
names=names(braker_aa.no.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.faa")
write.fasta(sequences=braker_aa.filtered,
names=names(braker_aa.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.faa")
write.fasta(sequences=braker_aa.full.filtered,
names=names(braker_aa.full.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.faa")
write.fasta(sequences=braker_aa.partial.filtered,
names=names(braker_aa.partial.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.faa")
write.fasta(sequences=braker_aa.no.filtered,
names=names(braker_aa.no.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.faa")
```
......@@ -4,8 +4,8 @@ Combine the computational annotation with full-length transcript sequencing data
### Script order:
- code/merge_annotation/Braker2_subsets_and_merge.Rmd
- workflows/merge_annotation/Braker2_subsets_and_merge.Rmd
Prepare BRAKER2 input, use only predicted genes supported by external evidence for the merge
- code/merge_annotation/reannotation_correction.Rmd
- workflows/merge_annotation/reannotation_correction.Rmd
Merge BRAKER2 and full-length transcript sequencing data using TSEBRA, deduplicate, rename genes and compare annotation completeness using BUSCO
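
Both steps are R Markdown notebooks; a minimal sketch of running them in order (assuming the rmarkdown package is available and the repository root as working directory):

```bash
# hypothetical rendering order (sketch)
Rscript -e 'rmarkdown::render("workflows/merge_annotation/Braker2_subsets_and_merge.Rmd")'
Rscript -e 'rmarkdown::render("workflows/merge_annotation/reannotation_correction.Rmd")'
```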
......@@ -56,10 +56,10 @@ Granges_from_gtf <- function(gtf){
seqnames = unique(chr), # all sequences should be on the same chromosome
gene_strand = unique(strand))
# use the gene_structures object to create the genomic ranges object
gene_ranges <- GRanges(seqnames = gene_structures$seqnames,
ranges = IRanges(start=gene_structures$gene_start,
gene_ranges <- GRanges(seqnames = gene_structures$seqnames,
ranges = IRanges(start=gene_structures$gene_start,
end=gene_structures$gene_end,
names = gene_structures$transcript_id),
names = gene_structures$transcript_id),
strand = gene_structures$gene_strand)
return(gene_ranges)
}
......@@ -101,7 +101,7 @@ lists.of.overlaps <- function(overlaps){
list.out <- list()
vec <- c()
i <- 1
# while there are still selfoverlapping sequences
while (length(selfoverlap) > 0){
# check each overlap if it belongs to a selfoverlapping sequence
......@@ -175,12 +175,12 @@ extract_attributes <- function(gtf_column, att_of_interest){
Combine computational annotation and Isoseq transcripts into a single genome annotation. Predict ORFs in the transcript sequences using CPC2.
```{bash}
mkdir data/isoseq/cpc/
mkdir -p runs/isoseq/cpc2/
# Predict coding sequences in the SQANTI-corrected Iso-Seq transcripts for TSEBRA
tools/CPC2_standalone-1.0.1/bin/CPC2.py \
-i data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.fasta \
-o data/isoseq/cpc2/cpc2_from_sqanti \
-i runs/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.fasta \
-o runs/isoseq/cpc2/cpc2_from_sqanti \
--ORF
```
......@@ -188,24 +188,24 @@ Convert CPC2 output into a gtf file of predicted transcript coding sequences. CP
```{r}
# prepare a bed file from the results of cpc2
cpc2 <- read.table("data/isoseq/cpc2/cpc2_from_sqanti.txt")
cpc2 <- read.table("runs/isoseq/cpc2/cpc2_from_sqanti.txt")
# filter for coding transcripts with an intact ORF
cpc2 <- cpc2 %>%
filter(V9 == "coding" & V6 == 1) %>%
summarise(ID=V1, start=V7-1, end=V7+(V3*3)-1)
# write as bed file for extraction of CDS
write_tsv(cpc2,
file="data/isoseq/cpc2/cpc2_extraction.bed",
write_tsv(cpc2,
file="runs/isoseq/cpc2/cpc2_extraction.bed",
quote = "none",
col_names = F)
# Convert cpc2 output to gtf file:
# read in gtf file of isoseq transcript data
isoseq.gtf <- read.gtf("data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.gtf")
isoseq.gtf <- read.gtf("runs/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.gtf")
isoseq.gtf <- isoseq.gtf[isoseq.gtf$type == "exon",]
# read in the cpc2 output file; the bed file used for extraction is sufficient
cpc2.bed <- read.table("data/isoseq_data/cpc2/cpc2_extraction.bed")
cpc2.bed <- read.table("runs/isoseq_runs/cpc2/cpc2_extraction.bed")
colnames(cpc2.bed) <- c("ID", "CDS_start", "CDS_end")
# bed format is 0 based, convert back to a 1 based format; the end position does not have to be converted
cpc2.bed$CDS_start <- cpc2.bed$CDS_start+1
......@@ -242,7 +242,7 @@ for (i in 1:length(ids)){
shifted[length(shifted)+1] <- shifted[length(shifted)]+1
# which values change by more than 1 after one shift? These values represent exon boundaries
boundaries <- which(abs(cds_coordinates - shifted) != 1)
# create matrix from which to construct the start and end positions of the gtf file
# create matrices for single exon genes also, as they are converted to vectors in a later step otherwise
if (length(boundaries) > 0){
......@@ -262,13 +262,13 @@ for (i in 1:length(ids)){
mat[1,2] <- cds_coordinates[1]
mat[1,1] <- cds_coordinates[length(cds_coordinates)]
}
# invert matrix for minus strand in order to start with the lowest number
# only if there are multiple exons
if (stranded == "-" & nrow(mat) != 1){
mat <- apply(apply(mat, 1, rev), 1, rev)
}
# use the previously created gtf structure to store the CDS coordinates
# the number of CDS rows is always <= number of exon rows
out.subset <- gtf.subset[1:nrow(mat),]
......@@ -283,33 +283,33 @@ for (i in 1:length(ids)){
# output represents all CDS records
# write gtf file
write.table(output[,1:9],
"data/isoseq/cpc2/cpc2_extracted_cds.gtf",
write.table(output[,1:9],
"runs/isoseq/cpc2/cpc2_extracted_cds.gtf",
col.names = F, row.names = F, quote=F, sep ="\t")
```
Run TSEBRA to combine computational prediction with Iso-Seq transcripts, using gffread for deduplication:
```{bash}
mkdir -p data/braker_analysis/TSEBRA
mkdir -p data/reannotation_correction/computational
mkdir -p data/reannotation_correction/manual
mkdir -p runs/braker_analysis/TSEBRA
mkdir -p runs/reannotation_correction/computational
mkdir -p runs/reannotation_correction/manual
# -g specifies the braker gtf, -c the long-read config file, -e the braker hints file, -l the cpc2-extracted isoseq gtf file
/home/tom/Documents/tools/TSEBRA/bin/tsebra.py \
-g data/braker_analysis/external_evidence/fixed_subsets/full_partial_support.gtf \
-g runs/braker_analysis/external_evidence/fixed_subsets/full_partial_support.gtf \
-c /home/tom/Documents/tools/TSEBRA/config/long_reads.cfg \
-e data/braker2/polished_prot_rna/hintsfile.gff \
-l data/isoseq/cpc2/cpc2_extracted_cds.gtf \
-o data/braker_analysis/TSEBRA/tsebra.gtf
-e runs/braker2/polished_prot_rna/hintsfile.gff \
-l runs/isoseq/cpc2/cpc2_extracted_cds.gtf \
-o runs/braker_analysis/TSEBRA/tsebra.gtf
# gffread is used to remove duplicated entries from the tsebra gtf file and to cluster the predicted transcripts into loci.
# -M merges identical entries, -K makes the merge stricter, -T writes gtf output (a gff version was also written)
/home/tom/Documents/tools/gffread-0.12.7/gffread -M -K -T \
-d data/braker_analysis/TSEBRA/duplication_info.txt \
data/braker_analysis/TSEBRA/tsebra.gtf > \
data/braker_analysis/TSEBRA/tsebra_dedup.gtf
-d runs/braker_analysis/TSEBRA/duplication_info.txt \
runs/braker_analysis/TSEBRA/tsebra.gtf > \
runs/braker_analysis/TSEBRA/tsebra_dedup.gtf
```
......@@ -317,7 +317,7 @@ After the finalisation of the genome prediction using TSEBRA and the deduplicati
```{r}
# read in the annotation
tsebra.gtf <- read.gtf("data/braker_analysis/TSEBRA/tsebra_dedup.gtf")
tsebra.gtf <- read.gtf("runs/braker_analysis/TSEBRA/tsebra_dedup.gtf")
# create a mapping from locus to gene id; the locus is only found in rows of the "transcript" type
transcript_locus_mapping <- tsebra.gtf %>%
......@@ -332,7 +332,7 @@ tsebra.gtf <- left_join(tsebra.gtf, transcript_locus_mapping)
# save information as RDS
output <- tsebra.gtf %>%
select(source, gene_id, transcript_id, locus)
saveRDS(output, "data/reannotation_correction/computational/locus.RDS")
saveRDS(output, "runs/reannotation_correction/computational/locus.RDS")
# create a dataframe indicating the order of the scaffolds and contigs
......@@ -355,9 +355,9 @@ tsebra.gtf.sorted <- tsebra.gtf %>%
# keep only one record per transcript, sort by locus and add a counter giving the number of the transcript at that locus
# to later create the transcript names
transcript_id <- tsebra.gtf.sorted %>%
filter(type == "transcript") %>%
group_by(locus) %>%
transcript_id <- tsebra.gtf.sorted %>%
filter(type == "transcript") %>%
group_by(locus) %>%
mutate(occ = 1:n()) %>%
ungroup() %>%
select(transcript_id, occ)
......@@ -366,15 +366,15 @@ tsebra.gtf.sorted <- left_join(tsebra.gtf.sorted, transcript_id, by=c("transcrip
# write temporary RDS file:
saveRDS(tsebra.gtf.sorted,
"data/reannotation_correction/computational/tsebra_temp3.RDS")
saveRDS(tsebra.gtf.sorted,
"runs/reannotation_correction/computational/tsebra_temp3.RDS")
```
Rename the transcripts and save again as a renamed gtf file:
```{r}
# read in RDS file
tsebra.gtf.sorted <- readRDS("data/reannotation_correction/computational/tsebra_temp3.RDS")
tsebra.gtf.sorted <- readRDS("runs/reannotation_correction/computational/tsebra_temp3.RDS")
# rename the locus and transcript ID records to match
# add column indicating the gene order
......@@ -393,19 +393,19 @@ gene_identifier <- paste("AHp", order_filled, sep="")
gene.id.df <- data.frame(ids, order3, order_filled, gene_identifier)
tsebra.gtf.sorted <- left_join(tsebra.gtf.sorted, gene.id.df, by = c("locus" = "ids"))
# create the transcript identifier based on gene identifier
# create the transcript identifier based on gene identifier
tsebra.gtf.sorted <- tsebra.gtf.sorted %>%
mutate(transcript_identifier = paste0(gene_identifier, ".", occ))
# include new attribute column
tsebra.gtf.sorted$new_attributes <- paste0("gene_id \"",
tsebra.gtf.sorted$gene_identifier,
"\"; transcript_id \"",
tsebra.gtf.sorted$transcript_identifier,
tsebra.gtf.sorted$new_attributes <- paste0("gene_id \"",
tsebra.gtf.sorted$gene_identifier,
"\"; transcript_id \"",
tsebra.gtf.sorted$transcript_identifier,
"\";")
# create mapping file of old and new names:
mapping <- tsebra.gtf.sorted %>% select(gene_id, transcript_id, locus, gene_identifier, transcript_identifier)
saveRDS(mapping, "data/reannotation_correction/computational/mapping.RDS")
saveRDS(mapping, "runs/reannotation_correction/computational/mapping.RDS")
# create final gtf file in the correct column order:
output <- tsebra.gtf.sorted %>%
......@@ -413,14 +413,14 @@ output <- tsebra.gtf.sorted %>%
select(chr, source, type, start, end, score, strand, phase, new_attributes)
# write gtf
write.table(output[,1:9],
"data/reannotation_correction/computational/tsebra_renamed.gtf",
write.table(output[,1:9],
"runs/reannotation_correction/computational/tsebra_renamed.gtf",
col.names = F, row.names = F, quote=F, sep ="\t")
```
Based on evidence from sequencing data, the sequence of the AmMYBl1 gene (AHp014591) was manually adjusted. The resulting file was saved as data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf. Gffread was used to extract cds and protein fasta files of the annotated genes and convert the gtf file to the gff3 format, representing the genome annotation v2.2.
Based on evidence from sequencing data, the sequence of the AmMYBl1 gene (AHp014591) was manually adjusted. The resulting file was saved as runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf. Gffread was used to extract cds and protein fasta files of the annotated genes and convert the gtf file to the gff3 format, representing the genome annotation v2.2.
More manual adjustment is required. First, the phase field in the annotation file is incorrect (likely an issue with TSEBRA). Secondly, seemingly a bug in TSEBRA causes the annotated sequence to be changed in the case that the stop codon is split up by an intron. I will save the annotation with the manually corrected AmMYBl1 as a gtf file under a different name and fix the phasing issue. The new manually corrected input file is data/reannotation_correction/manual/manually_MYBl1_corrected.gtf.
More manual adjustment is required. First, the phase field in the annotation file is incorrect (likely an issue with TSEBRA). Second, a bug in TSEBRA appears to alter the annotated sequence when the stop codon is split by an intron. I will save the annotation with the manually corrected AmMYBl1 as a gtf file under a different name and fix the phasing issue. The new manually corrected input file is runs/reannotation_correction/manual/manually_MYBl1_corrected.gtf.
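
For reference, the phase of a CDS row can be recomputed from the cumulative length of the preceding CDS rows of the same transcript: phase = (3 - cumulative_length mod 3) mod 3, with the first CDS at phase 0. A minimal sketch, assuming the CDS rows of a single plus-strand transcript, sorted 5' to 3', in a hypothetical file one_transcript.gtf and a coding sequence that starts at the first CDS base:

```{bash}
# recompute column 8 (phase) for the CDS rows of one plus-strand transcript; only CDS rows are printed (sketch only)
awk 'BEGIN{FS=OFS="\t"} $3=="CDS" {
  $8 = (3 - (cum % 3)) % 3   # phase of this CDS segment
  cum += $5 - $4 + 1         # CDS length seen so far
  print
}' one_transcript.gtf
```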
As a first step, the genes with wrong values in their last exons are manually adjusted. This is the list of identifiers with an incorrect last exon; the genes were corrected based on their annotated cpc2 cds predictions.
ERROR: Proteins do not match for transcript AHp001894.1 Strand:+ Exons: 2 checked, corrected
......@@ -450,7 +450,7 @@ names(annotation.cds.fasta[which((width(annotation.cds.fasta) %% 3) != 0)])
```{r}
annotation.gtf <- read.gtf("data/reannotation_correction/manual/manually_MYBl1_corrected.gtf")
annotation.gtf <- read.gtf("runs/reannotation_correction/manual/manually_MYBl1_corrected.gtf")
# searched the above list in command line using grep -n to identify the lines which need to be corrected
annotation.gtf[14134,4] <- 31584484
annotation.gtf[93858,4] <- 340181
......@@ -486,8 +486,8 @@ annotation.gtf[178221,5] <- 807 # minus strand, only adjust by 1
# write corrected table
write.table(annotation.gtf[,1:9],
"data/reannotation_correction/manual/manually_genes_corrected.gtf",
write.table(annotation.gtf[,1:9],
"runs/reannotation_correction/manual/manually_genes_corrected.gtf",
col.names = F, row.names = F, quote=F, sep ="\t")
```
......@@ -495,40 +495,40 @@ After manual correction of genes, fix the phase attribute.
```{bash}
# Add a gene line and convert to gff for gffvalidator. Correct the phasing field using gffvalidator.
/home/tom/Documents/tools/gffread-0.12.7/gffread -E data/reannotation_correction/manual/manually_genes_corrected.gtf --keep-genes > data/reannotation_correction/manual/manually_genes_corrected.gene.gff
/home/tom/Documents/tools/gffread-0.12.7/gffread -E runs/reannotation_correction/manual/manually_genes_corrected.gtf --keep-genes > runs/reannotation_correction/manual/manually_genes_corrected.gene.gff
# the genes with wrong coordinates should be fixed at this step, so that they can be assigned the correct phasing attribute
gt gff3 -tidy -force -retainids -addids no -o data/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff data/reannotation_correction/manual/manually_genes_corrected.gene.gff
gt gff3 -tidy -force -retainids -addids no -o runs/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff runs/reannotation_correction/manual/manually_genes_corrected.gene.gff
# remove the "###" separator lines from the gff file
grep -v "###" data/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff > data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff
grep -v "###" runs/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff > runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff
```
Convert to other formats using gffread.
```{bash}
# convert to gtf format
/home/tom/Documents/tools/gffread-0.12.7/gffread -v -T --keep-genes data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff > data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf
/home/tom/Documents/tools/gffread-0.12.7/gffread -v -T --keep-genes runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff > runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf
# extract prot (-y) and cds (-x) fasta files
/home/tom/Documents/tools/gffread-0.12.7/gffread -x data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta \
-y data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.prot.fasta \
-g polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \
data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf
/home/tom/Documents/tools/gffread-0.12.7/gffread -x runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta \
-y runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.prot.fasta \
-g runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \
runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf
# convert to gff3 format
#/home/tom/Documents/tools/gffread-0.12.7/gffread data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf > \
# data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff
#/home/tom/Documents/tools/gffread-0.12.7/gffread runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf > \
# runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff
# copy to the annotation directory:
cp data/reannotation_correction/manual/A* polished_genome_annotation/annotation/
cp runs/reannotation_correction/manual/A* runs/polished_genome_annotation/annotation/
```
Perform quality control of the annotated genes:
```{r}
# load in cds fasta
annotation.cds.fasta <- readBStringSet(filepath = "data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta")
annotation.cds.fasta <- readBStringSet(filepath = "runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta")
# are there genes with length != a multiple of 3?
which((width(annotation.cds.fasta) %% 3) != 0)
......@@ -549,30 +549,28 @@ Compare annotation completeness of genome annotation v2.2 with previously publis
# genome annotation v2.2
# compare against the orthoDBv10 Embryophyta dataset, using 7 threads
busco -m protein \
-i polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \
-i runs/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \
-o genome_annotation_v2.2 \
-l embryophyta_odb10 \
--out_path data/annotation_analysis/busco/ \
--download_path data/annotation_analysis/busco/datasets/ \
--out_path runs/annotation_analysis/busco/ \
--download_path runs/annotation_analysis/busco/datasets/ \
-c 6
# genome annotation v2.1
busco -m protein \
-i /home/tom/Documents/reference_genomes/Ahypochondriacus/annotation/Ahypochondriacus_459_v2.1.protein.fa \
-o genome_annotation_v2.1 \
-l embryophyta_odb10 \
--out_path data/annotation_analysis/busco/ \
--download_path data/annotation_analysis/busco/datasets/ \
--out_path runs/annotation_analysis/busco/ \
--download_path runs/annotation_analysis/busco/datasets/ \
-c 6
# A. cruentus annotation
busco -m protein \
-i /home/tom/Documents/reference_genomes/Acruentus/annotation/Amacr_pep_20210312.tfa \
-o Acruentus_annotation \
-l embryophyta_odb10 \
--out_path data/annotation_analysis/busco/ \
--download_path data/annotation_analysis/busco/datasets/ \
--out_path runs/annotation_analysis/busco/ \
--download_path runs/annotation_analysis/busco/datasets/ \
-c 6
```
......@@ -16,10 +16,10 @@ module load samtools/1.13
#Set input and parameters
read1=raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq.gz
read2=raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq.gz
input=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta
outdir=data/mapping_bias_inference/genome_coverage/
read1=studies/additional_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq.gz
read2=studies/additional_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq.gz
input=runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta
outdir=runs/mapping_bias_inference/genome_coverage/
outbam=SRR2106212.sorted.bam
outdedup=SRR2106212.sorted.dedup.bam
outcov=SRR2106212.coverage
......
......@@ -18,7 +18,7 @@ Analyse the observed bias in read mapping observed in ATAC and WG sequencing. A
```{r}
# read in data
coverage <- read.table(file = "data/mapping_bias_inference/genome_coverage/SRR2106212.sorted.dedup.10.depth")
coverage <- read.table(file = "runs/mapping_bias_inference/genome_coverage/SRR2106212.sorted.dedup.10.depth")
# calculate coverage in windows with fixed window size
windowsize <- 5000
......@@ -46,7 +46,7 @@ ggplot(data = window_coverage) +
theme(text = element_text(size = 22))
# save plot
ggsave(filename = "plots/mapping_bias_inference/50k_window_read_coverage.png",
ggsave(filename = "runs/plots/mapping_bias_inference/50k_window_read_coverage.png",
width = 14, height = 8)
```
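
The windowed summary used above is computed in R (the relevant lines are collapsed in this diff); an equivalent command-line sketch, assuming the samtools depth output has the three columns chromosome, position and depth, and noting that positions with zero coverage are absent unless depth was run with -a, so the mean is over covered positions only:

```{bash}
# mean read depth in fixed 5 kb windows (sketch)
awk -v w=5000 'BEGIN{FS=OFS="\t"}
  { win = int(($2 - 1) / w); key = $1 OFS (win * w + 1); sum[key] += $3; n[key]++ }
  END{ for (k in sum) print k, sum[k] / n[k] }' \
  runs/mapping_bias_inference/genome_coverage/SRR2106212.sorted.dedup.10.depth | sort -k1,1 -k2,2n
```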
......@@ -54,11 +54,11 @@ Investigate if genomic features correlate with the observed mapping bias. Calcul
```{bash}
# create genome file for bedtools
awk -F'\t' 'BEGIN {OFS = FS} {print $1,$2}' /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta.fai > data/mapping_bias_inference/bedtools_genome.txt
awk -F'\t' 'BEGIN {OFS = FS} {print $1,$2}' /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta.fai > runs/mapping_bias_inference/bedtools_genome.txt
# make windows using bedtools, windowsize 5k, include only Scaffolds
bedtools makewindows -g data/mapping_bias_inference/bedtools_genome.txt -w 5000 | grep "Scaffold" > data/mapping_bias_inference/all_genome_windows.bed
bedtools makewindows -g runs/mapping_bias_inference/bedtools_genome.txt -w 5000 | grep "Scaffold" > runs/mapping_bias_inference/all_genome_windows.bed
# calculate statistics in windows
bedtools nuc -fi /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta -bed data/mapping_bias_inference/genome_windows.bed > data/mapping_bias_inference/window_stats.txt
bedtools nuc -fi /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta -bed runs/mapping_bias_inference/genome_windows.bed > runs/mapping_bias_inference/window_stats.txt
```
......@@ -66,14 +66,14 @@ Plot Scaffold 10, with read depth and plastid genome content
```{r}
# read in paf files from minimap2
chloroplast.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome.paf")
mito.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome.paf")
chloroplast.paf <- read_paf("runs/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome.paf")
mito.paf <- read_paf("runs/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome.paf")
mito.df <- as.data.frame(mito.paf)
chloro.df <- as.data.frame(chloroplast.paf)
# read in mapping files for cruentus
chloroplast.cruentus.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome_cruentus.paf")
mito.cruentus.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome_cruentus.paf")
chloroplast.cruentus.paf <- read_paf("runs/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome_cruentus.paf")
mito.cruentus.paf <- read_paf("runs/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome_cruentus.paf")
mito.cruentus.df <- as.data.frame(mito.cruentus.paf)
chloro.cruentus.df <- as.data.frame(chloroplast.cruentus.paf)
......@@ -110,7 +110,7 @@ ggplot(data = window_coverage) +
theme(text = element_text(size = 22))
ggsave(filename = "plots/mapping_bias_inference/5kb_outlier_windows_zoom.png",
ggsave(filename = "runs/plots/mapping_bias_inference/5kb_outlier_windows_zoom.png",
width = 10, height = 6, bg = "white")
```
......@@ -123,7 +123,7 @@ mito.df %>%
group_by(tname) %>%
summarize(total_align_length = sum(alen)) %>%
mutate(percent_aligned = (total_align_length/mito.df$qlen[1])*100) %>%
arrange(desc(percent_aligned))
arrange(desc(percent_aligned))
# how much chloroplast was mapped?
chloro.df %>%
......@@ -172,8 +172,8 @@ There are structural differences in GC content between the different genomes. Wh
```{bash}
# GC content of different genomes
seqkit fx2tab --name --gc data/mapping_bias_inference/plastid_to_genome/Ah_chloroplast.fasta
seqkit fx2tab --name --gc data/mapping_bias_inference/plastid_to_genome/Bv_mitochondrium.fasta
seqkit fx2tab --name --gc runs/mapping_bias_inference/plastid_to_genome/Ah_chloroplast.fasta
seqkit fx2tab --name --gc runs/mapping_bias_inference/plastid_to_genome/Bv_mitochondrium.fasta
seqkit fx2tab --name --gc polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta | grep "Scaffold"
```
......@@ -247,7 +247,7 @@ Compare with functional annotation:
```{r}
# load functional annotation
functional_annotation <- readxl::read_xlsx(path = "data/functional_annotation/eggnog_mapper/MM_8wexw920.emapper.annotations.xlsx")
functional_annotation <- readxl::read_xlsx(path = "runs/functional_annotation/eggnog_mapper/MM_8wexw920.emapper.annotations.xlsx")
colnames(functional_annotation) <- functional_annotation[2,]
functional_annotation <- functional_annotation[-(1:2),]
......@@ -258,8 +258,3 @@ mito_functions <- functional_annotation %>%
chloro_functions <- functional_annotation %>%
filter(query %in% chloro_overlap$transcript_id)
```
......@@ -13,12 +13,12 @@ source $CONDA_PREFIX/etc/profile.d/conda.sh
conda activate isoseq
# create output directory
MAPPINGOUT=data/mapping_bias_inference/plastid_to_genome/
MAPPINGOUT=runs/mapping_bias_inference/plastid_to_genome/
mkdir -p "$MAPPINGOUT"
### MAPPING
# map reads onto the reference genome
REFERENCE=/projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta
REFERENCE=runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta
# Align sequences to reference genome
# sequences obtained from (Beta vulgaris, mitochondrion): https://www.ncbi.nlm.nih.gov/nuccore/BA000009.3
......
......@@ -4,11 +4,11 @@ Mapping bias and plastid content investigation
### Script order:
- code/other_inferences/infer_coverage_along_genome.sh
- workflows/other_inferences/infer_coverage_along_genome.sh
Map WGS reads against the reference genome to investigate mapping bias
- code/other_inferences/plastid_to_genome_mapping.sh
- workflows/other_inferences/plastid_to_genome_mapping.sh
Map chloroplast and mitochondria sequences to the reference genome
- code/other_inferences/mapping_bias_inference_analysis.Rmd
- workflows/other_inferences/mapping_bias_inference_analysis.Rmd
Plot mapping bias with annotated chloroplast and mitochondrial sequences
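
A sketch of running the above in order (SLURM submission for the shell scripts and local rendering of the notebook are assumptions):

```bash
sbatch workflows/other_inferences/infer_coverage_along_genome.sh
sbatch workflows/other_inferences/plastid_to_genome_mapping.sh
Rscript -e 'rmarkdown::render("workflows/other_inferences/mapping_bias_inference_analysis.Rmd")'
```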
......@@ -4,26 +4,26 @@ All code used for polishing, masking and reannotation of the A. hypochondriacus
### Order of usage:
- code/genome_polishing/
- workflows/genome_polishing/
polish the previously published A. hypochondriacus reference genome
- code/repeat_masking/
- workflows/repeat_masking/
to prepare computational annotation, softmask repetitive elements in the polished genome
- code/braker2/
- workflows/braker2/
perform computational annotation of the softmasked reference genome using BRAKER2
- code/isoseq_assembly/
- workflows/isoseq_assembly/
assemble long-read transcript sequencing data and compare the effect of genome polishing on reported completeness
- code/merge_annotation/
- workflows/merge_annotation/
merge computational annotation with long-read transcript sequencing data and process output into genome annotation v2.2
- code/annotation_analysis/
- workflows/annotation_analysis/
identify flavonoid and betalain pathway genes, as well as MYB transcription factors in the genome annotation v2.2
- code/other_inferences/
- workflows/other_inferences/
mapping of chloroplast and mitochondrion sequences to the reference genome, analysis of mapping coverage bias
- code/BSA/
- workflows/BSA/
bulk segregant analysis and analysis of RNA-seq data from pooled flower tissue
......@@ -16,7 +16,7 @@ Prepare input file for processing using R:
```{bash}
# process build summary output
tail -n +41 data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary | head -n -907 > data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv
tail -n +41 runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary | head -n -907 > runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv
```
......@@ -24,12 +24,12 @@ Analyse repetitive element content and composition:
```{r}
# read in buildSummary
buildSummary <- read_table(file = "data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv",
buildSummary <- read_table(file = "runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv",
col_names = c("repeat", "count", "length", "percent","drop"))
buildSummary <- buildSummary[,1:4]
# read in repeat classification
new_TE_classification <- read_delim(file = "data/repeatmasking/reclassification/classified.updated.txt",
new_TE_classification <- read_delim(file = "runs/repeatmasking/reclassification/classified.updated.txt",
delim = "#",
col_names = c("TE_family", "classification"))
......@@ -49,6 +49,6 @@ summary_table %>%
summarize(sum_total_length = sum(total_length),
sum_percentage = sum(percentage))
write.csv(summary_table, file = "data/repeatmasking/reclassification/reclassified.output.tbl",
write.csv(summary_table, file = "runs/repeatmasking/reclassification/reclassified.output.tbl",
quote = F, row.names = F)
```
......@@ -4,11 +4,11 @@ The polished reference genome is masked for repetitive elements in order to prep
### Script order:
- code/repeat_masking/run_repeatmodeler.sh
- workflows/repeat_masking/run_repeatmodeler.sh
run Repeatmodeler to identify repetitive elements in the polished reference genome
- code/repeat_masking/run_repeatmasker.sh
- workflows/repeat_masking/run_repeatmasker.sh
run Repeatmasker on the Repeatmodeler output to classify identified elements and mask the polished reference genome
- code/repeat_masking/analyse_repetitive_elements.Rmd
- workflows/repeat_masking/analyse_repetitive_elements.Rmd
analyse the repetitive element composition
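
As with the other workflow folders, the scripts can be run in the listed order; a sketch (scheduler usage and local rendering are assumptions):

```bash
sbatch workflows/repeat_masking/run_repeatmodeler.sh
sbatch workflows/repeat_masking/run_repeatmasker.sh
Rscript -e 'rmarkdown::render("workflows/repeat_masking/analyse_repetitive_elements.Rmd")'
```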
......@@ -17,7 +17,7 @@
module load repeatmasker/4.1.1
# create database directory
RMOUT=data/repeatmasking/repeatmasker
RMOUT=runs/repeatmasking/repeatmasker
mkdir -p $RMOUT
......@@ -26,12 +26,12 @@ mkdir -p $RMOUT
# for 20 threads, the pa setting while using rmblast should be 5
# Convert the polished reference genome fasta to all capital letters in preparation for repeat masking:
awk '/^>/ {print($0)}; /^[^>]/ {print(toupper($0))}' data/NextPolish/processed/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta \
awk '/^>/ {print($0)}; /^[^>]/ {print(toupper($0))}' runs/NextPolish/processed/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta \
> "$RMOUT"/Ahypochondriacus_2.2_polished.capital.fasta
### run RepeatMasker
# lib = repeat database created with repeatmodeler
RepeatMasker -lib data/repeatmasking/repeatmodeler/consensi.fa.classified \
RepeatMasker -lib runs/repeatmasking/repeatmodeler/consensi.fa.classified \
-pa 5 \
-small \
-e rmblast \
......@@ -50,4 +50,4 @@ bedtools maskfasta \
-fo "$RMOUT"/output/Ahypochondriacus_2.2_polished.softmasked.fasta
# more detailed summary:
buildSummary.pl data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.out > data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary
buildSummary.pl runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.out > runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary
......@@ -19,23 +19,23 @@
module load repeatmodeler/2.0.1
# create database directory
mkdir -p data/repeatmodeler/database/
mkdir -p runs/repeatmodeler/database/
### Main
# increased number of tasks, each rmblast job will take 4 threads
# for 20 threads, the pa setting while using rmblast should be 5
# Create database
BuildDatabase -name data/repeatmodeler/database/polished \
data/NextPolish/processed/Ahypochondriacus_2.2_polished.fasta
BuildDatabase -name runs/repeatmodeler/database/polished \
runs/NextPolish/processed/Ahypochondriacus_2.2_polished.fasta
# run Repeatmodeler
RepeatModeler -database data/repeatmodeler/database/polished \
RepeatModeler -database runs/repeatmodeler/database/polished \
-pa 5 \
-LTRStruct
# reclassify identified repeats
mkdir -p data/repeatmasking/reclassification/
mkdir -p runs/repeatmasking/reclassification/
# repeatmasker version 4.1.5
RepeatClassifier -consensi data/repeatmasking/reclassification/consensi.fa -stockholm data/repeatmasking/reclassification/families.stk
RepeatClassifier -consensi runs/repeatmasking/reclassification/consensi.fa -stockholm runs/repeatmasking/reclassification/families.stk