From c2250f0e02f70ec6f4af29d73c237890f7561e48 Mon Sep 17 00:00:00 2001
From: alisandra <alisandra.denton@hhu.de>
Date: Wed, 24 Aug 2022 01:09:54 +0200
Subject: [PATCH] end for today

---
 _reader/sections/04_section_longreads.tex | 33 ++++++++++++++++-------
 workflows/maindocker/python_installs.sh   | 15 ++++++++---
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/_reader/sections/04_section_longreads.tex b/_reader/sections/04_section_longreads.tex
index 440b478..955928d 100644
--- a/_reader/sections/04_section_longreads.tex
+++ b/_reader/sections/04_section_longreads.tex
@@ -428,7 +428,7 @@ collapse_isoforms_by_sam.py --input runs/isoseq/flnc/m16.flnc.fa \
   -o runs/isoseq/collapsed/m16.flnc.to_genome \
   --flnc_coverage 2
 
-ls collapsed
+ls runs/isoseq/collapsed
 \end{lstlisting}
 
 To understand everything going on and the parameters for the previous step, 
@@ -542,7 +542,7 @@ Let's just try it and look at the result.
 
 \begin{lstlisting}
 # convert to gff3 (bc the hints script takes only gff3 correctly)
-pfx=collapsed/m16.flnc.to_genome.collapsed.
+pfx=runs/isoseq/collapsed/m16.flnc.to_genome.collapsed.
 gffread ${pfx}bad.gff -o ${pfx}bad.gff3
 gffread ${pfx}good.gff -o ${pfx}good.gff3
 # feel free to look at output with 'less' to see differences between gff and gff3
@@ -608,7 +608,7 @@ cut it smaller than your chromosomes/scaffolds.
 # feel free to just type in place
 official_gtf=studies/AthalianaReferences/resources/Athaliana_167_TAIR10.gene_exons.gtf
 
-subset_genome_related.py --fasta studies/AthalianaReferences/resources/Athaliana_167_TAIR9.fa \ 
+subset_genome_related.py --fasta studies/AthalianaReferences/resources/Athaliana_167_TAIR9.fa \
   -sChr1 -f1 -t300000 --gff ${pfx}good.gff3,${pfx}bad.gff3,${pfx}hints.gff3,$official_gtf \
   --bam runs/isoseq/mapped/m16.flnc.sorted.bam
 
@@ -620,19 +620,20 @@ Now we can run augustus
 \begin{lstlisting}
 # I don't like typing the same things over and over again
 where=Chr1_1-300000
-mkdir gene_models
+mkdir runs/isoseq/gene_models
 augustus --hintsfile=${pfx}hints__${where}.gff3 --species=arabidopsis \
-  --alternatives-from-evidence=true --extrinsicCfgFile=extrinsic.E.cfg \
+  --alternatives-from-evidence=true \
+  --extrinsicCfgFile=assays/Zhu2017_IsoSeq/dataset/extrinsic.E.cfg \
   --UTR=on --allow_hinted_splicesites=atac \
-  studies/AthalianaReferences/resources/Athaliana_167_TAIR9__${where}.fa \ 
-  > gene_models/flnc.${where}.augustus
+  studies/AthalianaReferences/resources/Athaliana_167_TAIR9__${where}.fa \
+  --softmasking=off > runs/isoseq/gene_models/flnc.${where}.augustus
 
-less ${pfx}augustus
+less runs/isoseq/gene_models/flnc.${where}.augustus
 # the augustus output has the hints, commented protein sequence, explanation,
 # and, what we are after, gtf lines with 'AUGUSTUS', the gene models.
 # let's subset it to have just these.
-less gene_models/flnc.${where}.augustus | grep AUGUSTUS > \
-  gene_models/flnc.${where}.augustus.gtf
+less runs/isoseq/gene_models/flnc.${where}.augustus | grep AUGUSTUS > \
+  runs/isoseq/gene_models/flnc.${where}.augustus.gtf
 \end{lstlisting}
 %# copying the commands for the full runs here so as to be able to provide
 %# comparative results
@@ -652,8 +653,8 @@ may a) open multiple tablet instances (think teamwork) or b) adjust the 'feature
 column of the gff to have a different name. See below for an example
 
 \begin{lstlisting}
-less gene_models/flnc.${where}.augustus.gtf | awk 'BEGIN {OFS = FS = "\t"} \
-  { sub(/^/, "flnc.", $3) }1' > gene_models/flnc.${where}.augustus.gtf.tablet
-less gene_models/flnc.${where}.augustus.gtf.tablet  # check results 
+less runs/isoseq/gene_models/flnc.${where}.augustus.gtf | awk 'BEGIN {OFS = FS = "\t"} \
+  { sub(/^/, "flnc.", $3) }1' > runs/isoseq/gene_models/flnc.${where}.augustus.gtf.tablet
+less runs/isoseq/gene_models/flnc.${where}.augustus.gtf.tablet  # check results
 # don't worry about understanding all of the 'awk' command. The first bit
 # is to tell it to use only tabs as a column separator. The second bit says
@@ -669,6 +670,18 @@ to see
 In any case, congratulations for getting all the way to our first full
 gene models!
 
+\fbox{\begin{minipage}{45em}
+END OF WORKING LONG READ SECTION
+
+If you've gotten this far, know that everything beyond this point has not been updated since 2018,
+nor has the install been tested; it is almost certainly not working.
+Also, the easier genomes become to sequence, the less
+relevant genome-less analyses become. 
+
+Read through anyway for an overview of the concepts, or try Dominik's challenge assignment :-)
+\end{minipage}}
+
+
 \paragraph{Mapping high quality isoforms}
 We're now going to run (nearly) all the same steps for mapping high quality
 isoforms that we ran for mapping flnc reads above. As it is essentially
diff --git a/workflows/maindocker/python_installs.sh b/workflows/maindocker/python_installs.sh
index 5da35de..9b81259 100755
--- a/workflows/maindocker/python_installs.sh
+++ b/workflows/maindocker/python_installs.sh
@@ -11,14 +11,21 @@ pip3 install scikit-learn
 
 oldpwd=`pwd`
 
+#dustdas
+cd repos
+git clone https://github.com/janinamass/dustdas.git
+cd dustdas
+pip3 install .
+cd $oldpwd
+
+# cupcake
 mkdir repos/alisandra
 cd repos/alisandra
-# cDNAcupcake (originally from Magdoll of course, but patched)
-git clone https://github.com/alisandra/cDNA_Cupcake.git
+git clone https://github.com/Magdoll/cDNA_Cupcake.git
 cd cDNA_Cupcake/
+git checkout v28.0.0
 pip3 install -r requirements.txt
-python3 setup.py build
-python3 setup.py install
+pip3 install .
 cd $oldpwd
 
 #mkdir Magdoll
-- 
GitLab