Compare revisions: betalain_regulation_amaranth

Showing 196 additions and 207 deletions
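Most of the changes in this comparison are path renames: code/ becomes workflows/, data/ becomes runs/, and raw input data moves under assays/ and studies/. A purely illustrative sketch of applying such a rename in bulk is shown below; the file globs and sed expressions are assumptions and not taken from the repository (GNU sed assumed, result to be reviewed with git diff).

```bash
# illustrative only (not taken from the repository): bulk-apply the directory renames
# \b keeps the data/ rule from also matching raw_data/; other raw_data/ subfolders would need their own rules
git ls-files '*.sh' '*.Rmd' '*.md' | xargs sed -i \
  -e 's|\bcode/|workflows/|g' \
  -e 's|\bdata/|runs/|g' \
  -e 's|\braw_data/isoseq_raw_reads/|assays/isoform_sequencing/dataset/|g'
```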
......@@ -20,13 +20,13 @@ echo "$SLURM_ARRAY_TASK_ID"
# removal of primers and barcodes already done
# step 4, removal of polyA tail and artificial concatemers
PROCESSING=raw_data/isoseq_raw_reads/processed/
PROCESSING=assays/isoform_sequencing/dataset/isoseq_raw_reads/processed/
mkdir -p "PROCESSING"
SOURCE_DIR=raw_data/isoseq_raw_reads/reads
SOURCE_DIR=assays/isoform_sequencing/dataset/reads
FILES=("$SOURCE_DIR"/*bam)
PRIMER_SOURCE=raw_data/isoseq_raw_reads/primers
PRIMER_SOURCE=assays/isoform_sequencing/dataset/primers
PRIMER_FILES=("$PRIMER_SOURCE"/*fasta)
REFINEIN="$SOURCE_DIR"/"${FILES["${SLURM_ARRAY_TASK_ID}"]}"
......@@ -46,7 +46,7 @@ samtools merge "$ISOSEQOUT"combined.merged_flnc.bam /home/twinkle1/isoseq_raw_re
# clustering of identical reads
IN="data/isoseq3_pipeline/combined.merged_flnc.bam"
OUT="data/isoseq3_pipeline/combined.merged_clustered.bam"
IN="runs/isoseq3_pipeline/combined.merged_flnc.bam"
OUT="runs/isoseq3_pipeline/combined.merged_clustered.bam"
isoseq3 cluster "$IN" "$OUT" --verbose --use-qvs
......@@ -4,20 +4,20 @@ Assembly full-length transcript sequencing data.
### Script order:
- code/isoseq_assembly/isoseq3_pipeline.sh
- workflows/isoseq_assembly/isoseq3_pipeline.sh
assemble FLNC reads from CCS files
- code/isoseq_assembly/combined_isoseq3_pipeline.sh
- workflows/isoseq_assembly/combined_isoseq3_pipeline.sh
combine FLNC reads and cluster identical reads
- code/isoseq_assembly/combined_mapping_and_collapse.sh
- workflows/isoseq_assembly/combined_mapping_and_collapse.sh
collapse clustered reads into unique full-length transcripts using the polished reference genome
- code/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh
- workflows/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh
collapse clustered reads into unique full-length transcripts using the unpolished reference genome
- code/isoseq_assembly/run_sqanti.sh
- workflows/isoseq_assembly/run_sqanti.sh
run SQANTI in order to correct possible sequencing errors in the full-length transcripts using both reference genomes
- code/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd
- workflows/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd
compare effect of genome polishing on error correction of full-length transcript sequences
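
A sketch of the invocation order for the scripts listed above (SLURM submission, the array size of the first job and local rendering of the Rmd report are assumptions, not taken from the repository):

```bash
# hypothetical invocation order; adjust array size and scheduler options to your setup
sbatch --array=0-3 workflows/isoseq_assembly/isoseq3_pipeline.sh
sbatch workflows/isoseq_assembly/combined_isoseq3_pipeline.sh
sbatch workflows/isoseq_assembly/combined_mapping_and_collapse.sh
sbatch workflows/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh
sbatch workflows/isoseq_assembly/run_sqanti.sh
Rscript -e 'rmarkdown::render("workflows/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd")'
```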
......@@ -3,20 +3,20 @@
# run sqanti correction and filtering
# the preliminary genome annotation is used to compare against, but this only affects the pdf output
mkdir -p data/isoseq/sqanti/output_polished
mkdir -p runs/isoseq/sqanti/output_polished
/home/tom/Documents/tools/SQANTI3-4.2/sqanti3_qc.py \
data/isoseq/mapping_and_collapse/combined.collapsed.min_fl_2.filtered.gff \
data/braker2/polished_prot_rna/braker.gtf \
polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \
-d data/isoseq/sqanti/output_polished/ \
runs/isoseq/mapping_and_collapse/combined.collapsed.min_fl_2.filtered.gff \
runs/braker2/polished_prot_rna/braker.gtf \
runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \
-d runs/isoseq/sqanti/output_polished/ \
-n 6
mkdir -p data/isoseq/sqanti/output_old_genome
/home/tom/Documents/tools/SQANTI3-4.2/sqanti3_qc.py \
data/isoseq/mapping_and_collapse_old_genome/combined.collapsed.min_fl_2.filtered.gff \
data/braker2/polished_prot_rna/braker.gtf \
/home/tom/Documents/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa \
-d data/isoseq/sqanti/output_old_genome/ \
runs/isoseq/mapping_and_collapse_old_genome/combined.collapsed.min_fl_2.filtered.gff \
runs/braker2/polished_prot_rna/braker.gtf \
studies/additional_data/resources/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa \
-d runs/isoseq/sqanti/output_old_genome/ \
-n 6
......@@ -54,10 +54,10 @@ Granges_from_gtf <- function(gtf){
seqnames = unique(chr), # all sequences should be on the same chromosome
gene_strand = unique(strand))
# use the gene_structures object to create the genomic ranges object
gene_ranges <- GRanges(seqnames = gene_structures$seqnames,
ranges = IRanges(start=gene_structures$gene_start,
gene_ranges <- GRanges(seqnames = gene_structures$seqnames,
ranges = IRanges(start=gene_structures$gene_start,
end=gene_structures$gene_end,
names = gene_structures$transcript_id),
names = gene_structures$transcript_id),
strand = gene_structures$gene_strand)
return(gene_ranges)
}
......@@ -86,21 +86,21 @@ report_both_codons <- function(gtf_object){
Create differently supported subsets using the available script from augustus/braker2:
```{bash}
mkdir -p data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets
mkdir -p runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets
# activate busco environment for the Augustus script
conda activate busco
# extract braker aa and cds, also generate the bad_genes.lst file
/home/tom/Documents/tools/Augustus/scripts/getAnnoFastaFromJoingenes.py -g polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta -f data/braker2/polished_prot_rna/braker.gtf -s FILTER -o data/braker2/polished_prot_rna/braker_extracted
/home/tom/Documents/tools/Augustus/scripts/getAnnoFastaFromJoingenes.py -g runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta -f runs/braker2/polished_prot_rna/braker.gtf -s FILTER -o runs/braker2/polished_prot_rna/braker_extracted
# select differently supported subsets based on the amount of support
tools/BRAKER/scripts/predictionAnalysis/selectSupportedSubsets.py --noSupport data/braker_analysis/external_evidence/polished_prot_rna/no_support.txt --fullSupport data/braker_analysis/external_evidence/polished_prot_rna/full_support.txt --anySupport data/braker_analysis/external_evidence/polished_prot_rna/any_support.txt data/braker2/polished_prot_rna/braker.gtf data/braker2/polished_prot_rna/hintsfile.gff
tools/BRAKER/scripts/predictionAnalysis/selectSupportedSubsets.py --noSupport runs/braker_analysis/external_evidence/polished_prot_rna/no_support.txt --fullSupport runs/braker_analysis/external_evidence/polished_prot_rna/full_support.txt --anySupport runs/braker_analysis/external_evidence/polished_prot_rna/any_support.txt runs/braker2/polished_prot_rna/braker.gtf runs/braker2/polished_prot_rna/hintsfile.gff
# remove all lines that start with a "#" character (they cause issues at later steps)
sed '/^#/d' data/braker_analysis/external_evidence/polished_prot_rna/full_support.txt > data/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf
sed '/^#/d' data/braker_analysis/external_evidence/polished_prot_rna/any_support.txt > data/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf
sed '/^#/d' data/braker_analysis/external_evidence/polished_prot_rna/no_support.txt > data/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf
sed '/^#/d' runs/braker_analysis/external_evidence/polished_prot_rna/full_support.txt > runs/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf
sed '/^#/d' runs/braker_analysis/external_evidence/polished_prot_rna/any_support.txt > runs/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf
sed '/^#/d' runs/braker_analysis/external_evidence/polished_prot_rna/no_support.txt > runs/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf
```
......@@ -108,12 +108,12 @@ Generate a dataframe with the transcript ids based on the differently supported
```{r}
# load in the support set gtfs
full.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf")
any.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf")
no.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf")
full.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf")
any.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf")
no.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf")
# load in the genes with internal stop codons, as detected by the getAnnoFastaFromJoingenes script (see braker2_results folder readme.txt)
badgenes <- read.table("data/braker_analysis/external_evidence/polished_prot_rna/bad_genes.lst")
badgenes <- read.table("runs/braker_analysis/external_evidence/polished_prot_rna/bad_genes.lst")
# extract the gene names with both annotated start and annotated stop codons
codons_any <- report_both_codons(any.support)
......@@ -130,21 +130,21 @@ no.ids <- unique(no.support$transcript_id) #7443
# create dataframe with the subset transcript ids and the support category
all.ids <- c(full.ids, partial.ids, no.ids)
support <- c(rep("full", length(full.ids)),
rep("partial", length(partial.ids)),
support <- c(rep("full", length(full.ids)),
rep("partial", length(partial.ids)),
rep("no", length(no.ids)))
support.df <- data.frame(all.ids, support)
# filter out internal stop codons
support.df <- support.df[!support.df$all.ids %in% badgenes$V1,]
support.df <- support.df[!support.df$all.ids %in% badgenes$V1,]
# exclude all annotated genes that do not have annotated start and stop codons
fixed_support.df <- support.df[support.df$all.ids %in% codons,]
##################################
# write created dataframe as rds object:
saveRDS(fixed_support.df, file="data/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
saveRDS(fixed_support.df, file="runs/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
```
......@@ -153,12 +153,12 @@ Filter out predicted genes with internal stop codons as well as predicted genes
```{r}
# load in the gtf files to subset
full.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf")
any.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf")
no.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf")
full.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf")
any.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf")
no.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf")
# load in the annotation dataframe:
support.df <- readRDS(file="data/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
support.df <- readRDS(file="runs/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
# create partial support dataframe:
partial.support <- any.support[!any.support$transcript_id %in% full.support$transcript_id,]
......@@ -177,28 +177,28 @@ no.support.filtered <- no.support[no.support$transcript_id %in% support.df$all.i
#length(unique(no.support.filtered$transcript_id)) # 3579 remain
### Write the filtered gtf files
write.table(full.support.filtered[,1:9],
"data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.gtf",
write.table(full.support.filtered[,1:9],
"runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.gtf",
col.names = F, row.names = F, quote=F, sep ="\t")
write.table(partial.support.filtered[,1:9],
"data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.gtf",
write.table(partial.support.filtered[,1:9],
"runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.gtf",
col.names = F, row.names = F, quote=F, sep ="\t")
write.table(no.support.filtered[,1:9],
"data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.gtf",
write.table(no.support.filtered[,1:9],
"runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.gtf",
col.names = F, row.names = F, quote=F, sep ="\t")
```
After filtering the gtf files, use them to subset the coding sequence and amino acid fasta files: load both fasta files and filter them using the support dataframe.
```{r}
library(seqinr)
# load in the support dataframe
support.df <- readRDS(file="data/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
support.df <- readRDS(file="runs/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS")
support.df$all.ids <- as.character(support.df$all.ids)
# subset fasta files using the support dataframe
braker_dna <- read.fasta("data/braker2/polished_prot_rna/braker_extracted.codingseq",
braker_dna <- read.fasta("runs/braker2/polished_prot_rna/braker_extracted.codingseq",
seqtype = "DNA")
# filter the different subsets
......@@ -208,22 +208,22 @@ braker_dna.partial.filtered <- braker_dna[getName(braker_dna) %in% support.df[su
braker_dna.no.filtered <- braker_dna[getName(braker_dna) %in% support.df[support.df$support == "no",1]] # 7014 sequences
# write subsets
write.fasta(sequences=braker_dna.filtered,
names=names(braker_dna.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.fasta")
write.fasta(sequences=braker_dna.full.filtered,
names=names(braker_dna.full.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.fasta")
write.fasta(sequences=braker_dna.partial.filtered,
names=names(braker_dna.partial.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.fasta")
write.fasta(sequences=braker_dna.no.filtered,
names=names(braker_dna.no.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.fasta")
write.fasta(sequences=braker_dna.filtered,
names=names(braker_dna.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.fasta")
write.fasta(sequences=braker_dna.full.filtered,
names=names(braker_dna.full.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.fasta")
write.fasta(sequences=braker_dna.partial.filtered,
names=names(braker_dna.partial.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.fasta")
write.fasta(sequences=braker_dna.no.filtered,
names=names(braker_dna.no.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.fasta")
### repeat for the AA fasta files:
# subset fasta files using the support dataframe
braker_aa <- read.fasta("data/braker2/polished_prot_rna/braker_extracted.aa",
braker_aa <- read.fasta("runs/braker2/polished_prot_rna/braker_extracted.aa",
seqtype = "AA")
braker_aa.filtered <- braker_aa[getName(braker_aa) %in% support.df[,1]] # 39141 sequences
......@@ -232,20 +232,16 @@ braker_aa.partial.filtered <- braker_aa[getName(braker_aa) %in% support.df[suppo
braker_aa.no.filtered <- braker_aa[getName(braker_aa) %in% support.df[support.df$support == "no",1]] # 7014 sequences
# write subsets
write.fasta(sequences=braker_aa.filtered,
names=names(braker_aa.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.faa")
write.fasta(sequences=braker_aa.full.filtered,
names=names(braker_aa.full.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.faa")
write.fasta(sequences=braker_aa.partial.filtered,
names=names(braker_aa.partial.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.faa")
write.fasta(sequences=braker_aa.no.filtered,
names=names(braker_aa.no.filtered),
file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.faa")
write.fasta(sequences=braker_aa.filtered,
names=names(braker_aa.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.faa")
write.fasta(sequences=braker_aa.full.filtered,
names=names(braker_aa.full.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.faa")
write.fasta(sequences=braker_aa.partial.filtered,
names=names(braker_aa.partial.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.faa")
write.fasta(sequences=braker_aa.no.filtered,
names=names(braker_aa.no.filtered),
file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.faa")
```
......@@ -4,8 +4,8 @@ Combine the computational annotation with full-length transcript sequencing data
### Script order:
- code/merge_annotation/Braker2_subsets_and_merge.Rmd
- workflows/merge_annotation/Braker2_subsets_and_merge.Rmd
Prepare BRAKER2 input, use only predicted genes supported by external evidence for the merge
- code/merge_annotation/reannotation_correction.Rmd
- workflows/merge_annotation/reannotation_correction.Rmd
Merge BRAKER2 and full-length transcript sequencing data using TSEBRA, deduplicate, rename genes and compare annotation completeness using BUSCO
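
Both steps are R Markdown notebooks; a minimal sketch of running them in order (assuming the rmarkdown package is available and the repository root as working directory):

```bash
# hypothetical rendering order (sketch)
Rscript -e 'rmarkdown::render("workflows/merge_annotation/Braker2_subsets_and_merge.Rmd")'
Rscript -e 'rmarkdown::render("workflows/merge_annotation/reannotation_correction.Rmd")'
```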
......@@ -56,10 +56,10 @@ Granges_from_gtf <- function(gtf){
seqnames = unique(chr), # all sequences should be on the same chromosome
gene_strand = unique(strand))
# use the gene_structures object to create the genomic ranges object
gene_ranges <- GRanges(seqnames = gene_structures$seqnames,
ranges = IRanges(start=gene_structures$gene_start,
gene_ranges <- GRanges(seqnames = gene_structures$seqnames,
ranges = IRanges(start=gene_structures$gene_start,
end=gene_structures$gene_end,
names = gene_structures$transcript_id),
names = gene_structures$transcript_id),
strand = gene_structures$gene_strand)
return(gene_ranges)
}
......@@ -101,7 +101,7 @@ lists.of.overlaps <- function(overlaps){
list.out <- list()
vec <- c()
i <- 1
# while there are still selfoverlapping sequences
while (length(selfoverlap) > 0){
# check each overlap if it belongs to a selfoverlapping sequence
......@@ -175,12 +175,12 @@ extract_attributes <- function(gtf_column, att_of_interest){
Combine computational annotation and Isoseq transcripts into a single genome annotation. Predict ORFs in the transcript sequences using CPC2.
```{bash}
mkdir data/isoseq/cpc/
mkdir -p runs/isoseq/cpc2/
# Predict coding sequences in the SQANTI-corrected Iso-Seq transcripts for TSEBRA
tools/CPC2_standalone-1.0.1/bin/CPC2.py \
-i data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.fasta \
-o data/isoseq/cpc2/cpc2_from_sqanti \
-i runs/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.fasta \
-o runs/isoseq/cpc2/cpc2_from_sqanti \
--ORF
```
......@@ -188,24 +188,24 @@ Convert CPC2 output into a gtf file of predicted transcript coding sequences. CP
```{r}
# prepare a bed file from the results of cpc2
cpc2 <- read.table("data/isoseq/cpc2/cpc2_from_sqanti.txt")
cpc2 <- read.table("runs/isoseq/cpc2/cpc2_from_sqanti.txt")
# filter for coding transcripts with an intact ORF
cpc2 <- cpc2 %>%
filter(V9 == "coding" & V6 == 1) %>%
summarise(ID=V1, start=V7-1, end=V7+(V3*3)-1)
# write as bed file for extraction of CDS
write_tsv(cpc2,
file="data/isoseq/cpc2/cpc2_extraction.bed",
write_tsv(cpc2,
file="runs/isoseq/cpc2/cpc2_extraction.bed",
quote = "none",
col_names = F)
# Convert cpc2 output to gtf file:
# read in gtf file of isoseq transcript data
isoseq.gtf <- read.gtf("data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.gtf")
isoseq.gtf <- read.gtf("runs/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.gtf")
isoseq.gtf <- isoseq.gtf[isoseq.gtf$type == "exon",]
# read in the cpc2 output file; the bed file used for extraction is sufficient
cpc2.bed <- read.table("data/isoseq_data/cpc2/cpc2_extraction.bed")
cpc2.bed <- read.table("runs/isoseq_runs/cpc2/cpc2_extraction.bed")
colnames(cpc2.bed) <- c("ID", "CDS_start", "CDS_end")
# bed format is 0 based, convert back to a 1 based format; the end position does not have to be converted
cpc2.bed$CDS_start <- cpc2.bed$CDS_start+1
......@@ -242,7 +242,7 @@ for (i in 1:length(ids)){
shifted[length(shifted)+1] <- shifted[length(shifted)]+1
# which values change by more than 1 after one shift? These values represent exon boundaries
boundaries <- which(abs(cds_coordinates - shifted) != 1)
# create matrix from which to construct the start and end positions of the gtf file
# create matrices for single exon genes also, as they are converted to vectors in a later step otherwise
if (length(boundaries) > 0){
......@@ -262,13 +262,13 @@ for (i in 1:length(ids)){
mat[1,2] <- cds_coordinates[1]
mat[1,1] <- cds_coordinates[length(cds_coordinates)]
}
# invert matrix for minus strand in order to start with the lowest number
# only if there are multiple exons
if (stranded == "-" & nrow(mat) != 1){
mat <- apply(apply(mat, 1, rev), 1, rev)
}
# use the previously created gtf structure to store the CDS coordinates
# the number of CDS rows is always <= number of exon rows
out.subset <- gtf.subset[1:nrow(mat),]
......@@ -283,33 +283,33 @@ for (i in 1:length(ids)){
# output represents all CDS records
# write gtf file
write.table(output[,1:9],
"data/isoseq/cpc2/cpc2_extracted_cds.gtf",
write.table(output[,1:9],
"runs/isoseq/cpc2/cpc2_extracted_cds.gtf",
col.names = F, row.names = F, quote=F, sep ="\t")
```
Run TSEBRA to combine computational prediction with Iso-Seq transcripts, using gffread for deduplication:
```{bash}
mkdir -p data/braker_analysis/TSEBRA
mkdir -p data/reannotation_correction/computational
mkdir -p data/reannotation_correction/manual
mkdir -p runs/braker_analysis/TSEBRA
mkdir -p runs/reannotation_correction/computational
mkdir -p runs/reannotation_correction/manual
# -g specifies the braker gtf, -c the long-read config file, -e the braker hints file, -l the cpc2-extracted isoseq gtf file
/home/tom/Documents/tools/TSEBRA/bin/tsebra.py \
-g data/braker_analysis/external_evidence/fixed_subsets/full_partial_support.gtf \
-g runs/braker_analysis/external_evidence/fixed_subsets/full_partial_support.gtf \
-c /home/tom/Documents/tools/TSEBRA/config/long_reads.cfg \
-e data/braker2/polished_prot_rna/hintsfile.gff \
-l data/isoseq/cpc2/cpc2_extracted_cds.gtf \
-o data/braker_analysis/TSEBRA/tsebra.gtf
-e runs/braker2/polished_prot_rna/hintsfile.gff \
-l runs/isoseq/cpc2/cpc2_extracted_cds.gtf \
-o runs/braker_analysis/TSEBRA/tsebra.gtf
# gffread is used to remove duplicated entries from the tsebra gtf file and to cluster the predicted transcripts into loci.
# -M merges identical entries, -K makes the merge stricter, -T writes gtf output (a gff version was also written)
/home/tom/Documents/tools/gffread-0.12.7/gffread -M -K -T \
-d data/braker_analysis/TSEBRA/duplication_info.txt \
data/braker_analysis/TSEBRA/tsebra.gtf > \
data/braker_analysis/TSEBRA/tsebra_dedup.gtf
-d runs/braker_analysis/TSEBRA/duplication_info.txt \
runs/braker_analysis/TSEBRA/tsebra.gtf > \
runs/braker_analysis/TSEBRA/tsebra_dedup.gtf
```
......@@ -317,7 +317,7 @@ After the finalisation of the genome prediction using TSEBRA and the deduplicati
```{r}
# read in the annotation
tsebra.gtf <- read.gtf("data/braker_analysis/TSEBRA/tsebra_dedup.gtf")
tsebra.gtf <- read.gtf("runs/braker_analysis/TSEBRA/tsebra_dedup.gtf")
# create a mapping from locus to gene id; the locus is only found in rows of the "transcript" type
transcript_locus_mapping <- tsebra.gtf %>%
......@@ -332,7 +332,7 @@ tsebra.gtf <- left_join(tsebra.gtf, transcript_locus_mapping)
# save information as RDS
output <- tsebra.gtf %>%
select(source, gene_id, transcript_id, locus)
saveRDS(output, "data/reannotation_correction/computational/locus.RDS")
saveRDS(output, "runs/reannotation_correction/computational/locus.RDS")
# create a dataframe indicating the order of the scaffolds and contigs
......@@ -355,9 +355,9 @@ tsebra.gtf.sorted <- tsebra.gtf %>%
# keep only one record per transcript, sort by locus and add a counter giving the number of the transcript at that locus
# to later create the transcript names
transcript_id <- tsebra.gtf.sorted %>%
filter(type == "transcript") %>%
group_by(locus) %>%
transcript_id <- tsebra.gtf.sorted %>%
filter(type == "transcript") %>%
group_by(locus) %>%
mutate(occ = 1:n()) %>%
ungroup() %>%
select(transcript_id, occ)
......@@ -366,15 +366,15 @@ tsebra.gtf.sorted <- left_join(tsebra.gtf.sorted, transcript_id, by=c("transcrip
# write temporary RDS file:
saveRDS(tsebra.gtf.sorted,
"data/reannotation_correction/computational/tsebra_temp3.RDS")
saveRDS(tsebra.gtf.sorted,
"runs/reannotation_correction/computational/tsebra_temp3.RDS")
```
Rename the transcripts and save again as a renamed gtf file:
```{r}
# read in RDS file
tsebra.gtf.sorted <- readRDS("data/reannotation_correction/computational/tsebra_temp3.RDS")
tsebra.gtf.sorted <- readRDS("runs/reannotation_correction/computational/tsebra_temp3.RDS")
# rename the locus and transcript ID records to match
# add column indicating the gene order
......@@ -393,19 +393,19 @@ gene_identifier <- paste("AHp", order_filled, sep="")
gene.id.df <- data.frame(ids, order3, order_filled, gene_identifier)
tsebra.gtf.sorted <- left_join(tsebra.gtf.sorted, gene.id.df, by = c("locus" = "ids"))
# create the transcript identifier based on gene identifier
# create the transcript identifier based on gene identifier
tsebra.gtf.sorted <- tsebra.gtf.sorted %>%
mutate(transcript_identifier = paste0(gene_identifier, ".", occ))
# include new attribute column
tsebra.gtf.sorted$new_attributes <- paste0("gene_id \"",
tsebra.gtf.sorted$gene_identifier,
"\"; transcript_id \"",
tsebra.gtf.sorted$transcript_identifier,
tsebra.gtf.sorted$new_attributes <- paste0("gene_id \"",
tsebra.gtf.sorted$gene_identifier,
"\"; transcript_id \"",
tsebra.gtf.sorted$transcript_identifier,
"\";")
# create mapping file of old and new names:
mapping <- tsebra.gtf.sorted %>% select(gene_id, transcript_id, locus, gene_identifier, transcript_identifier)
saveRDS(mapping, "data/reannotation_correction/computational/mapping.RDS")
saveRDS(mapping, "runs/reannotation_correction/computational/mapping.RDS")
# create final gtf file in the correct column order:
output <- tsebra.gtf.sorted %>%
......@@ -413,14 +413,14 @@ output <- tsebra.gtf.sorted %>%
select(chr, source, type, start, end, score, strand, phase, new_attributes)
# write gtf
write.table(output[,1:9],
"data/reannotation_correction/computational/tsebra_renamed.gtf",
write.table(output[,1:9],
"runs/reannotation_correction/computational/tsebra_renamed.gtf",
col.names = F, row.names = F, quote=F, sep ="\t")
```
Based on evidence from sequencing data, the sequence of the AmMYBl1 gene (AHp014591) was manually adjusted. The resulting file was saved as data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf. Gffread was used to extract cds and protein fasta files of the annotated genes and convert the gtf file to the gff3 format, representing the genome annotation v2.2.
Based on evidence from sequencing data, the sequence of the AmMYBl1 gene (AHp014591) was manually adjusted. The resulting file was saved as runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf. Gffread was used to extract cds and protein fasta files of the annotated genes and convert the gtf file to the gff3 format, representing the genome annotation v2.2.
More manual adjustment is required. First, the phase field in the annotation file is incorrect (likely an issue with TSEBRA). Secondly, seemingly a bug in TSEBRA causes the annotated sequence to be changed in the case that the stop codon is split up by an intron. I will save the annotation with the manually corrected AmMYBl1 as a gtf file under a different name and fix the phasing issue. The new manually corrected input file is data/reannotation_correction/manual/manually_MYBl1_corrected.gtf.
More manual adjustment is required. First, the phase field in the annotation file is incorrect (likely an issue with TSEBRA). Second, a bug in TSEBRA appears to alter the annotated sequence when the stop codon is split by an intron. I will save the annotation with the manually corrected AmMYBl1 as a gtf file under a different name and fix the phasing issue. The new manually corrected input file is runs/reannotation_correction/manual/manually_MYBl1_corrected.gtf.
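
For reference, the phase of a CDS row can be recomputed from the cumulative length of the preceding CDS rows of the same transcript: phase = (3 - cumulative_length mod 3) mod 3, with the first CDS at phase 0. A minimal sketch, assuming the CDS rows of a single plus-strand transcript, sorted 5' to 3', in a hypothetical file one_transcript.gtf and a coding sequence that starts at the first CDS base:

```{bash}
# recompute column 8 (phase) for the CDS rows of one plus-strand transcript; only CDS rows are printed (sketch only)
awk 'BEGIN{FS=OFS="\t"} $3=="CDS" {
  $8 = (3 - (cum % 3)) % 3   # phase of this CDS segment
  cum += $5 - $4 + 1         # CDS length seen so far
  print
}' one_transcript.gtf
```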
As a first step, the genes with wrong values in their last exons are manually adjusted. This is the list of identifiers with an incorrect last exon; the genes were corrected based on their annotated cpc2 cds predictions.
ERROR: Proteins do not match for transcript AHp001894.1 Strand:+ Exons: 2 checked, corrected
......@@ -450,7 +450,7 @@ names(annotation.cds.fasta[which((width(annotation.cds.fasta) %% 3) != 0)])
```{r}
annotation.gtf <- read.gtf("data/reannotation_correction/manual/manually_MYBl1_corrected.gtf")
annotation.gtf <- read.gtf("runs/reannotation_correction/manual/manually_MYBl1_corrected.gtf")
# searched the above list in command line using grep -n to identify the lines which need to be corrected
annotation.gtf[14134,4] <- 31584484
annotation.gtf[93858,4] <- 340181
......@@ -486,8 +486,8 @@ annotation.gtf[178221,5] <- 807 # minus strand, only adjust by 1
# write corrected table
write.table(annotation.gtf[,1:9],
"data/reannotation_correction/manual/manually_genes_corrected.gtf",
write.table(annotation.gtf[,1:9],
"runs/reannotation_correction/manual/manually_genes_corrected.gtf",
col.names = F, row.names = F, quote=F, sep ="\t")
```
......@@ -495,40 +495,40 @@ After manual correction of genes, fix the phase attribute.
```{bash}
# Add a gene line and convert to gff for gffvalidator. Correct the phasing field using gffvalidator.
/home/tom/Documents/tools/gffread-0.12.7/gffread -E data/reannotation_correction/manual/manually_genes_corrected.gtf --keep-genes > data/reannotation_correction/manual/manually_genes_corrected.gene.gff
/home/tom/Documents/tools/gffread-0.12.7/gffread -E runs/reannotation_correction/manual/manually_genes_corrected.gtf --keep-genes > runs/reannotation_correction/manual/manually_genes_corrected.gene.gff
# the genes with wrong coordinates should be fixed at this step, so that they can be assigned the correct phasing attribute
gt gff3 -tidy -force -retainids -addids no -o data/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff data/reannotation_correction/manual/manually_genes_corrected.gene.gff
gt gff3 -tidy -force -retainids -addids no -o runs/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff runs/reannotation_correction/manual/manually_genes_corrected.gene.gff
# remove the "###" separator lines from the gff file
grep -v "###" data/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff > data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff
grep -v "###" runs/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff > runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff
```
Convert to other formats using gffread.
```{bash}
# convert to gtf format
/home/tom/Documents/tools/gffread-0.12.7/gffread -v -T --keep-genes data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff > data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf
/home/tom/Documents/tools/gffread-0.12.7/gffread -v -T --keep-genes runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff > runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf
# extract prot (-y) and cds (-x) fasta files
/home/tom/Documents/tools/gffread-0.12.7/gffread -x data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta \
-y data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.prot.fasta \
-g polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \
data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf
/home/tom/Documents/tools/gffread-0.12.7/gffread -x runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta \
-y runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.prot.fasta \
-g runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \
runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf
# convert to gff3 format
#/home/tom/Documents/tools/gffread-0.12.7/gffread data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf > \
# data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff
#/home/tom/Documents/tools/gffread-0.12.7/gffread runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf > \
# runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff
# copy to the annotation directory:
cp data/reannotation_correction/manual/A* polished_genome_annotation/annotation/
cp runs/reannotation_correction/manual/A* runs/polished_genome_annotation/annotation/
```
Perform quality control of the annotated genes:
```{r}
# load in cds fasta
annotation.cds.fasta <- readBStringSet(filepath = "data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta")
annotation.cds.fasta <- readBStringSet(filepath = "runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta")
# are there genes with length != a multiple of 3?
which((width(annotation.cds.fasta) %% 3) != 0)
......@@ -549,30 +549,28 @@ Compare annotation completeness of genome annotation v2.2 with previously publis
# genome annotation v2.2
# compare against the orthoDBv10 Embryophyta dataset, using 7 threads
busco -m protein \
-i polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \
-i runs/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \
-o genome_annotation_v2.2 \
-l embryophyta_odb10 \
--out_path data/annotation_analysis/busco/ \
--download_path data/annotation_analysis/busco/datasets/ \
--out_path runs/annotation_analysis/busco/ \
--download_path runs/annotation_analysis/busco/datasets/ \
-c 6
# genome annotation v2.1
busco -m protein \
-i /home/tom/Documents/reference_genomes/Ahypochondriacus/annotation/Ahypochondriacus_459_v2.1.protein.fa \
-o genome_annotation_v2.1 \
-l embryophyta_odb10 \
--out_path data/annotation_analysis/busco/ \
--download_path data/annotation_analysis/busco/datasets/ \
--out_path runs/annotation_analysis/busco/ \
--download_path runs/annotation_analysis/busco/datasets/ \
-c 6
# A. cruentus annotation
busco -m protein \
-i /home/tom/Documents/reference_genomes/Acruentus/annotation/Amacr_pep_20210312.tfa \
-o Acruentus_annotation \
-l embryophyta_odb10 \
--out_path data/annotation_analysis/busco/ \
--download_path data/annotation_analysis/busco/datasets/ \
--out_path runs/annotation_analysis/busco/ \
--download_path runs/annotation_analysis/busco/datasets/ \
-c 6
```
......@@ -16,10 +16,10 @@ module load samtools/1.13
#Set input and parameters
read1=raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq.gz
read2=raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq.gz
input=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta
outdir=data/mapping_bias_inference/genome_coverage/
read1=studies/additional_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq.gz
read2=studies/additional_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq.gz
input=runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta
outdir=runs/mapping_bias_inference/genome_coverage/
outbam=SRR2106212.sorted.bam
outdedup=SRR2106212.sorted.dedup.bam
outcov=SRR2106212.coverage
......
......@@ -18,7 +18,7 @@ Analyse the observed bias in read mapping observed in ATAC and WG sequencing. A
```{r}
# read in data
coverage <- read.table(file = "data/mapping_bias_inference/genome_coverage/SRR2106212.sorted.dedup.10.depth")
coverage <- read.table(file = "runs/mapping_bias_inference/genome_coverage/SRR2106212.sorted.dedup.10.depth")
# calculate coverage in windows with fixed window size
windowsize <- 5000
......@@ -46,7 +46,7 @@ ggplot(data = window_coverage) +
theme(text = element_text(size = 22))
# save plot
ggsave(filename = "plots/mapping_bias_inference/50k_window_read_coverage.png",
ggsave(filename = "runs/plots/mapping_bias_inference/50k_window_read_coverage.png",
width = 14, height = 8)
```
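
The windowed summary used above is computed in R (the relevant lines are collapsed in this diff); an equivalent command-line sketch, assuming the samtools depth output has the three columns chromosome, position and depth, and noting that positions with zero coverage are absent unless depth was run with -a, so the mean is over covered positions only:

```{bash}
# mean read depth in fixed 5 kb windows (sketch)
awk -v w=5000 'BEGIN{FS=OFS="\t"}
  { win = int(($2 - 1) / w); key = $1 OFS (win * w + 1); sum[key] += $3; n[key]++ }
  END{ for (k in sum) print k, sum[k] / n[k] }' \
  runs/mapping_bias_inference/genome_coverage/SRR2106212.sorted.dedup.10.depth | sort -k1,1 -k2,2n
```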
......@@ -54,11 +54,11 @@ Investigate if genomic features correlate with the observed mapping bias. Calcul
```{bash}
# create genome file for bedtools
awk -F'\t' 'BEGIN {OFS = FS} {print $1,$2}' /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta.fai > data/mapping_bias_inference/bedtools_genome.txt
awk -F'\t' 'BEGIN {OFS = FS} {print $1,$2}' /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta.fai > runs/mapping_bias_inference/bedtools_genome.txt
# make windows using bedtools, windowsize 5k, include only Scaffolds
bedtools makewindows -g data/mapping_bias_inference/bedtools_genome.txt -w 5000 | grep "Scaffold" > data/mapping_bias_inference/all_genome_windows.bed
bedtools makewindows -g runs/mapping_bias_inference/bedtools_genome.txt -w 5000 | grep "Scaffold" > runs/mapping_bias_inference/all_genome_windows.bed
# calculate statistics in windows
bedtools nuc -fi /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta -bed data/mapping_bias_inference/genome_windows.bed > data/mapping_bias_inference/window_stats.txt
bedtools nuc -fi /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta -bed runs/mapping_bias_inference/genome_windows.bed > runs/mapping_bias_inference/window_stats.txt
```
......@@ -66,14 +66,14 @@ Plot Scaffold 10, with read depth and plastid genome content
```{r}
# read in paf files from minimap2
chloroplast.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome.paf")
mito.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome.paf")
chloroplast.paf <- read_paf("runs/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome.paf")
mito.paf <- read_paf("runs/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome.paf")
mito.df <- as.data.frame(mito.paf)
chloro.df <- as.data.frame(chloroplast.paf)
# read in mapping files for cruentus
chloroplast.cruentus.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome_cruentus.paf")
mito.cruentus.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome_cruentus.paf")
chloroplast.cruentus.paf <- read_paf("runs/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome_cruentus.paf")
mito.cruentus.paf <- read_paf("runs/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome_cruentus.paf")
mito.cruentus.df <- as.data.frame(mito.cruentus.paf)
chloro.cruentus.df <- as.data.frame(chloroplast.cruentus.paf)
......@@ -110,7 +110,7 @@ ggplot(data = window_coverage) +
theme(text = element_text(size = 22))
ggsave(filename = "plots/mapping_bias_inference/5kb_outlier_windows_zoom.png",
ggsave(filename = "runs/plots/mapping_bias_inference/5kb_outlier_windows_zoom.png",
width = 10, height = 6, bg = "white")
```
......@@ -123,7 +123,7 @@ mito.df %>%
group_by(tname) %>%
summarize(total_align_length = sum(alen)) %>%
mutate(percent_aligned = (total_align_length/mito.df$qlen[1])*100) %>%
arrange(desc(percent_aligned))
arrange(desc(percent_aligned))
# how much chloroplast was mapped?
chloro.df %>%
......@@ -172,8 +172,8 @@ There are structural differences in GC content between the different genomes. Wh
```{bash}
# GC content of different genomes
seqkit fx2tab --name --gc data/mapping_bias_inference/plastid_to_genome/Ah_chloroplast.fasta
seqkit fx2tab --name --gc data/mapping_bias_inference/plastid_to_genome/Bv_mitochondrium.fasta
seqkit fx2tab --name --gc runs/mapping_bias_inference/plastid_to_genome/Ah_chloroplast.fasta
seqkit fx2tab --name --gc runs/mapping_bias_inference/plastid_to_genome/Bv_mitochondrium.fasta
seqkit fx2tab --name --gc polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta | grep "Scaffold"
```
......@@ -247,7 +247,7 @@ Compare with functional annotation:
```{r}
# load functional annotation
functional_annotation <- readxl::read_xlsx(path = "data/functional_annotation/eggnog_mapper/MM_8wexw920.emapper.annotations.xlsx")
functional_annotation <- readxl::read_xlsx(path = "runs/functional_annotation/eggnog_mapper/MM_8wexw920.emapper.annotations.xlsx")
colnames(functional_annotation) <- functional_annotation[2,]
functional_annotation <- functional_annotation[-(1:2),]
......@@ -258,8 +258,3 @@ mito_functions <- functional_annotation %>%
chloro_functions <- functional_annotation %>%
filter(query %in% chloro_overlap$transcript_id)
```
......@@ -13,12 +13,12 @@ source $CONDA_PREFIX/etc/profile.d/conda.sh
conda activate isoseq
# create output directory
MAPPINGOUT=data/mapping_bias_inference/plastid_to_genome/
MAPPINGOUT=runs/mapping_bias_inference/plastid_to_genome/
mkdir -p "$MAPPINGOUT"
### MAPPING
# map reads onto the reference genome
REFERENCE=/projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta
REFERENCE=runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta
# Align sequences to reference genome
# sequences obtained from (Beta vulgaris, mitochondrion): https://www.ncbi.nlm.nih.gov/nuccore/BA000009.3
......
......@@ -4,11 +4,11 @@ Mapping bias and plastid content investigation
### Script order:
- code/other_inferences/infer_coverage_along_genome.sh
- workflows/other_inferences/infer_coverage_along_genome.sh
Map WGS reads against the reference genome to investigate mapping bias
- code/other_inferences/plastid_to_genome_mapping.sh
- workflows/other_inferences/plastid_to_genome_mapping.sh
Map chloroplast and mitochondria sequences to the reference genome
- code/other_inferences/mapping_bias_inference_analysis.Rmd
- workflows/other_inferences/mapping_bias_inference_analysis.Rmd
Plot mapping bias with annotated chloroplast and mitochondrial sequences
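
A sketch of running the above in order (SLURM submission for the shell scripts and local rendering of the notebook are assumptions):

```bash
sbatch workflows/other_inferences/infer_coverage_along_genome.sh
sbatch workflows/other_inferences/plastid_to_genome_mapping.sh
Rscript -e 'rmarkdown::render("workflows/other_inferences/mapping_bias_inference_analysis.Rmd")'
```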
......@@ -4,26 +4,26 @@ All code used for polishing, masking and reannotation of the A. hypochondriacus
### Order of usage:
- code/genome_polishing/
- workflows/genome_polishing/
polish the previously published A. hypochondriacus reference genome
- code/repeat_masking/
- workflows/repeat_masking/
to prepare computational annotation, softmask repetitive elements in the polished genome
- code/braker2/
- workflows/braker2/
perform computational annotation of the softmasked reference genome using BRAKER2
- code/isoseq_assembly/
- workflows/isoseq_assembly/
assemble long-read transcript sequencing data and compare the effect of genome polishing on reported completeness
- code/merge_annotation/
- workflows/merge_annotation/
merge computational annotation with long-read transcript sequencing data and process output into genome annotation v2.2
- code/annotation_analysis/
- workflows/annotation_analysis/
identify flavonoid and betalain pathway genes, as well as MYB transcription factors in the genome annotation v2.2
- code/other_inferences/
- workflows/other_inferences/
mapping of chloroplast and mitochondrion sequences to the reference genome, analysis of mapping coverage bias
- code/BSA/
- workflows/BSA/
bulk segregant analysis and analysis of RNA-seq data from pooled flower tissue
......@@ -16,7 +16,7 @@ Prepare input file for processing using R:
```{bash}
# process build summary output
tail -n +41 data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary | head -n -907 > data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv
tail -n +41 runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary | head -n -907 > runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv
```
......@@ -24,12 +24,12 @@ Analyse repetitive element content and composition:
```{r}
# read in buildSummary
buildSummary <- read_table(file = "data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv",
buildSummary <- read_table(file = "runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv",
col_names = c("repeat", "count", "length", "percent","drop"))
buildSummary <- buildSummary[,1:4]
# read in repeat classification
new_TE_classification <- read_delim(file = "data/repeatmasking/reclassification/classified.updated.txt",
new_TE_classification <- read_delim(file = "runs/repeatmasking/reclassification/classified.updated.txt",
delim = "#",
col_names = c("TE_family", "classification"))
......@@ -49,6 +49,6 @@ summary_table %>%
summarize(sum_total_length = sum(total_length),
sum_percentage = sum(percentage))
write.csv(summary_table, file = "data/repeatmasking/reclassification/reclassified.output.tbl",
write.csv(summary_table, file = "runs/repeatmasking/reclassification/reclassified.output.tbl",
quote = F, row.names = F)
```
......@@ -4,11 +4,11 @@ The polished reference genome is masked for repetitive elements in order to prep
### Script order:
- code/repeat_masking/run_repeatmodeler.sh
- workflows/repeat_masking/run_repeatmodeler.sh
run Repeatmodeler to identify repetitive elements in the polished reference genome
- code/repeat_masking/run_repeatmasker.sh
- workflows/repeat_masking/run_repeatmasker.sh
run Repeatmasker on the Repeatmodeler output to classify identified elements and mask the polished reference genome
- code/repeat_masking/analyse_repetitive_elements.Rmd
- workflows/repeat_masking/analyse_repetitive_elements.Rmd
analyse the repetitive element composition
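
As with the other workflow folders, the scripts can be run in the listed order; a sketch (scheduler usage and local rendering are assumptions):

```bash
sbatch workflows/repeat_masking/run_repeatmodeler.sh
sbatch workflows/repeat_masking/run_repeatmasker.sh
Rscript -e 'rmarkdown::render("workflows/repeat_masking/analyse_repetitive_elements.Rmd")'
```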
......@@ -17,7 +17,7 @@
module load repeatmasker/4.1.1
# create database directory
RMOUT=data/repeatmasking/repeatmasker
RMOUT=runs/repeatmasking/repeatmasker
mkdir -p $RMOUT
......@@ -26,12 +26,12 @@ mkdir -p $RMOUT
# for 20 threads, the pa setting while using rmblast should be 5
# Convert the polished reference genome fasta to all capital letters in preparation for repeat masking:
awk '/^>/ {print($0)}; /^[^>]/ {print(toupper($0))}' data/NextPolish/processed/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta \
awk '/^>/ {print($0)}; /^[^>]/ {print(toupper($0))}' runs/NextPolish/processed/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta \
> "$RMOUT"/Ahypochondriacus_2.2_polished.capital.fasta
### run RepeatMasker
# lib = repeat database created with repeatmodeler
RepeatMasker -lib data/repeatmasking/repeatmodeler/consensi.fa.classified \
RepeatMasker -lib runs/repeatmasking/repeatmodeler/consensi.fa.classified \
-pa 5 \
-small \
-e rmblast \
......@@ -50,4 +50,4 @@ bedtools maskfasta \
-fo "$RMOUT"/output/Ahypochondriacus_2.2_polished.softmasked.fasta
# more detailed summary:
buildSummary.pl data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.out > data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary
buildSummary.pl runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.out > runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary
......@@ -19,23 +19,23 @@
module load repeatmodeler/2.0.1
# create database directory
mkdir -p data/repeatmodeler/database/
mkdir -p runs/repeatmodeler/database/
### Main
# increased number of tasks, each rmblast job will take 4 threads
# for 20 threads, the pa setting while using rmblast should be 5
# Create database
BuildDatabase -name data/repeatmodeler/database/polished \
data/NextPolish/processed/Ahypochondriacus_2.2_polished.fasta
BuildDatabase -name runs/repeatmodeler/database/polished \
runs/NextPolish/processed/Ahypochondriacus_2.2_polished.fasta
# run Repeatmodeler
RepeatModeler -database data/repeatmodeler/database/polished \
RepeatModeler -database runs/repeatmodeler/database/polished \
-pa 5 \
-LTRStruct
# reclassify identified repeats
mkdir -p data/repeatmasking/reclassification/
mkdir -p runs/repeatmasking/reclassification/
# repeatmasker version 4.1.5
RepeatClassifier -consensi data/repeatmasking/reclassification/consensi.fa -stockholm data/repeatmasking/reclassification/families.stk
RepeatClassifier -consensi runs/repeatmasking/reclassification/consensi.fa -stockholm runs/repeatmasking/reclassification/families.stk