From c2250f0e02f70ec6f4af29d73c237890f7561e48 Mon Sep 17 00:00:00 2001 From: alisandra <alisandra.denton@hhu.de> Date: Wed, 24 Aug 2022 01:09:54 +0200 Subject: [PATCH] end for today --- _reader/sections/04_section_longreads.tex | 33 ++++++++++++++++------- workflows/maindocker/python_installs.sh | 15 ++++++++--- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/_reader/sections/04_section_longreads.tex b/_reader/sections/04_section_longreads.tex index 440b478..955928d 100644 --- a/_reader/sections/04_section_longreads.tex +++ b/_reader/sections/04_section_longreads.tex @@ -428,7 +428,7 @@ collapse_isoforms_by_sam.py --input runs/isoseq/flnc/m16.flnc.fa \ -o runs/isoseq/collapsed/m16.flnc.to_genome \ --flnc_coverage 2 -ls collapsed +ls runs/isoseq/collapsed \end{lstlisting} To understand everything going on and the parameters for the previous step, @@ -542,7 +542,7 @@ Let's just try it and look at the result. \begin{lstlisting} # convert to gff3 (bc the hints script takes only gff3 correctly) -pfx=collapsed/m16.flnc.to_genome.collapsed. +pfx=runs/isoseq/collapsed/m16.flnc.to_genome.collapsed. gffread ${pfx}bad.gff -o ${pfx}bad.gff3 gffread ${pfx}good.gff -o ${pfx}good.gff3 # feel free to look at output with 'less' to see differences between gff and gff3 @@ -608,7 +608,7 @@ cut it smaller than your chromosomes/scaffolds. 
# feel free to just type in place official_gtf=studies/AthalianaReferences/resources/Athaliana_167_TAIR10.gene_exons.gtf -subset_genome_related.py --fasta studies/AthalianaReferences/resources/Athaliana_167_TAIR9.fa \ +subset_genome_related.py --fasta studies/AthalianaReferences/resources/Athaliana_167_TAIR9.fa \ -sChr1 -f1 -t300000 --gff ${pfx}good.gff3,${pfx}bad.gff3,${pfx}hints.gff3,$official_gtf \ --bam runs/isoseq/mapped/m16.flnc.sorted.bam @@ -620,19 +620,20 @@ Now we can run augustus \begin{lstlisting} # I don't like typing the same things over and over again where=Chr1_1-300000 -mkdir gene_models +mkdir runs/isoseq/gene_models augustus --hintsfile=${pfx}hints__${where}.gff3 --species=arabidopsis \ - --alternatives-from-evidence=true --extrinsicCfgFile=extrinsic.E.cfg \ + --alternatives-from-evidence=true \ + --extrinsicCfgFile=assays/Zhu2017_IsoSeq/dataset/extrinsic.E.cfg \ --UTR=on --allow_hinted_splicesites=atac \ studies/AthalianaReferences/resources/Athaliana_167_TAIR9__${where}.fa \ - > gene_models/flnc.${where}.augustus + --softmasking=off > runs/isoseq/gene_models/flnc.${where}.augustus less ${pfx}augustus # the augustus output has the hints, commented protein sequence, explanation, # and, what we are after, gtf lines with 'AUGUSTUS', the gene models. # let's subset it to have just these. -less gene_models/flnc.${where}.augustus | grep AUGUSTUS > \ - gene_models/flnc.${where}.augustus.gtf +less runs/isoseq/gene_models/flnc.${where}.augustus | grep AUGUSTUS > \ + runs/isoseq/gene_models/flnc.${where}.augustus.gtf \end{lstlisting} %# copying the commands for the full runs here so as to be able to provide %# comparative results @@ -652,8 +653,8 @@ may a) open multiple tablet instances (think teamwork) or b) adjust the 'feature column of the gff to have a different name. 
See below for an example \begin{lstlisting} -less gene_models/flnc.${where}.augustus.gtf | awk 'BEGIN {OFS = FS = "\t"} \ - { sub(/^/, "flnc.", $3) }1' > gene_models/flnc.${where}.augustus.gtf.tablet +less runs/isoseq/gene_models/flnc.${where}.augustus.gtf | awk 'BEGIN {OFS = FS = "\t"} \ + { sub(/^/, "flnc.", $3) }1' > runs/isoseq/gene_models/flnc.${where}.augustus.gtf.tablet less gene_models/flnc.${where}.augustus.gtf.tablet # check results # don't worry about understanding all of the 'awk' command. The first bit # is to tell it to use only tabs as a column separator. The second bit says @@ -669,6 +670,18 @@ to see In any case, congratulations for getting all the way to our first full gene models! +\fbox{\begin{minipage}{45em} +END OF WORKING LONG READ SECTION + +If you've gotten this far, know that everything beyond has not been updated since 2018, +nor has the install been tested, so it is almost certainly not working. +Also, the easier genomes become to sequence, the less +relevant genome-less analyses become. + +Read through anyways for an overview of the concepts, or try Dominik's challenge assignment :-) +\end{minipage}} + + \paragraph{Mapping high quality isoforms} We're now going to run (nearly) all the same steps for mapping high quality isoforms that we ran for mapping flnc reads above. As it is essentially diff --git a/workflows/maindocker/python_installs.sh b/workflows/maindocker/python_installs.sh index 5da35de..9b81259 100755 --- a/workflows/maindocker/python_installs.sh +++ b/workflows/maindocker/python_installs.sh @@ -11,14 +11,21 @@ pip3 install scikit-learn oldpwd=`pwd` +#dustdas +cd repos +git clone https://github.com/janinamass/dustdas.git +cd dustdas +pip install . 
+cd $oldpwd + +# cupcake mkdir repos/alisandra cd repos/alisandra -# cDNAcupcake (originally from Magdoll of course, but patched) -git clone https://github.com/alisandra/cDNA_Cupcake.git +git clone https://github.com/Magdoll/cDNA_Cupcake.git cd cDNA_Cupcake/ +git checkout v28.0.0 pip3 install -r requirements.txt -python3 setup.py build -python3 setup.py install +pip install . cd $oldpwd #mkdir Magdoll -- GitLab