From 867b707ce533015930fe14e84154becd385ee35d Mon Sep 17 00:00:00 2001 From: alisandra <alisandra.denton@hhu.de> Date: Tue, 23 Aug 2022 19:50:34 +0200 Subject: [PATCH] slightly less broken isoseq --- _reader/sections/04_section_longreads.tex | 35 ++++++------ workflows/maindocker/Dockerfile | 70 +++++++++++------------ 2 files changed, 52 insertions(+), 53 deletions(-) diff --git a/_reader/sections/04_section_longreads.tex b/_reader/sections/04_section_longreads.tex index 7fd0c0a..0828439 100644 --- a/_reader/sections/04_section_longreads.tex +++ b/_reader/sections/04_section_longreads.tex @@ -216,10 +216,9 @@ multiplexed libraries, we could also perform demultiplexing with \il{lima}. \begin{lstlisting} mkdir -p runs/isoseq/trimmed/ lima runs/isoseq/ccs/m16.ccs.bam workflows/pacbiosciences_lima/clonetech_SMARTer.fa \ - runs/isoseq/trimmed/m16.ccs.bam --no-pbi --isoseq + runs/isoseq/trimmed/m16.ccs.bam --isoseq # The first three arguments are obvious from the usage # Usage: lima [options] INPUT BARCODES OUTPUT -# --no-pbi since we won't need the .pbi output # --isoseq is necessary to tell it what to expect for the primers # let's check the output @@ -236,7 +235,12 @@ less runs/isoseq/trimmed/head_trimmed.sam # _most_ of the poly-A tails are at the end of the sequence now \end{lstlisting} -\subsubsection{Clustering} +\subsubsection{Refining} + +\emph{This has changed a lot: clustering and polishing are now called +refining and are run in one step; polishing may be skipped. +Please pardon any inaccuracies in the description.} + So at this point we have some reads that are \emph{in some ways} similar to that which we had after running Trimmomatic on the Illumina data. But not quite. We just saw the remaining poly-As and adapters that were in the @@ -252,23 +256,22 @@ So if we were using more samples, we'd want to run \il{dataset create} first, which would link files in a way that they could all be processed together. 
\begin{lstlisting} -mkdir -p runs/isoseq/unpolished/ +mkdir -p runs/isoseq/polished/ # from the help function: -# isoseq3 cluster [options] input output -isoseq3 cluster runs/isoseq/trimmed/m16.ccs.primer_5p--primer_3p.bam \ - runs/isoseq/unpolished/m16.unpolished.bam --verbose --require-polya +# isoseq refine [options] <ccs.demux.bam|xml> <primer.fasta|xml> <flnc.bam|xml> +isoseq3 refine runs/isoseq/trimmed/m16.ccs.primer_5p--primer_3p.bam \ + workflows/pacbiosciences_lima/clonetech_SMARTer.fa \ + runs/isoseq/polished/m16.polished.bam --verbose --require-polya # again, we have a lot of output -ls -sh runs/isoseq/unpolished/ +ls -sh runs/isoseq/polished/ # there's two files we really care about, and both are .bam -# *.unpolished.flnc.bam contains the fully cleaned, AKA +# *.polished.flnc.bam contains the fully cleaned, AKA # "full-length non-chimeric" reads. Finally. -# *.unpolished.bam has draft reconstructed transcripts +# *.polished.bam has draft reconstructed transcripts # we can also get a report on how many flnc reads were used # for each draft transcript -isoseq3 summarize runs/isoseq/unpolished/m16.unpolished.bam \ - runs/isoseq/unpolished/m16.summary.csv -less runs/isoseq/unpolished/m16.summary.csv +less runs/isoseq/polished/m16.polished.report.csv \end{lstlisting} After this, the pipelines we're looking at today start to diverge, @@ -309,11 +312,7 @@ a ccs read with just a single subread. %TODO : running into error: Missing .pbi \begin{lstlisting} -mkdir -p runs/isoseq/polished/ -# isoseq3 polish -h # uncomment for explanation -isoseq3 polish runs/isoseq/unpolished/m16.unpolished.bam \ - runs/isoseq/subreads/m16*.subreads.bam \ - runs/isoseq/polished/m16.polished.bam +# isoseq polish is no longer a thing... \end{lstlisting} OK, this step is going to take a long time (maybe an hour?). 
It's working with 5GB of reads after all diff --git a/workflows/maindocker/Dockerfile b/workflows/maindocker/Dockerfile index d48e6cd..79e8ee8 100644 --- a/workflows/maindocker/Dockerfile +++ b/workflows/maindocker/Dockerfile @@ -48,48 +48,46 @@ RUN apt install hisat2 \ samtools \ minimap2 \ mash \ - cd-hit -y + cd-hit tar bzip2 \ + libhdf5-dev m4 -y +# last ones are for kallisto # --- used to be conda, now binaries... --- # -RUN pip install HTSeq -RUN apt install wget tar bzip2 -y -RUN wget https://anaconda.org/bioconda/isoseq3/3.7.0/download/linux-64/isoseq3-3.7.0-h9ee0642_0.tar.bz2 -RUN tar xvf isoseq3-3.7.0-h9ee0642_0.tar.bz2 -RUN wget https://anaconda.org/bioconda/lima/2.6.0/download/linux-64/lima-2.6.0-h9ee0642_0.tar.bz2 -RUN tar xvf lima-2.6.0-h9ee0642_0.tar.bz2 -RUN wget https://anaconda.org/bioconda/pbccs/6.4.0/download/linux-64/pbccs-6.4.0-h9ee0642_0.tar.bz2 -RUN tar xvf pbccs-6.4.0-h9ee0642_0.tar.bz2 -RUN wget https://anaconda.org/bioconda/bax2bam/0.0.11/download/linux-64/bax2bam-0.0.11-0.tar.bz2 -RUN tar xvf bax2bam-0.0.11-0.tar.bz2 +# for virtualenv intro +ENV PATH="/home/$DOCKER_USER/.local/bin:${PATH}" +RUN pip install HTSeq virtualenv +RUN wget https://anaconda.org/bioconda/isoseq3/3.7.0/download/linux-64/isoseq3-3.7.0-h9ee0642_0.tar.bz2 && \ + tar xvf isoseq3-3.7.0-h9ee0642_0.tar.bz2 && \ + wget https://anaconda.org/bioconda/lima/2.6.0/download/linux-64/lima-2.6.0-h9ee0642_0.tar.bz2 && \ + tar xvf lima-2.6.0-h9ee0642_0.tar.bz2 && \ + wget https://anaconda.org/bioconda/pbccs/6.4.0/download/linux-64/pbccs-6.4.0-h9ee0642_0.tar.bz2 && \ + tar xvf pbccs-6.4.0-h9ee0642_0.tar.bz2 && \ + wget https://anaconda.org/bioconda/bax2bam/0.0.11/download/linux-64/bax2bam-0.0.11-0.tar.bz2 && \ + tar xvf bax2bam-0.0.11-0.tar.bz2 # kallisto -RUN apt install libhdf5-dev m4 -y -WORKDIR /home/$DOCKER_USER/repos -RUN curl -O -L http://ftpmirror.gnu.org/autoconf/autoconf-2.69.tar.gz -RUN tar -xzf autoconf-2.69.tar.gz -WORKDIR /home/$DOCKER_USER/repos/autoconf-2.69 -RUN 
./configure && make && make install -WORKDIR /home/$DOCKER_USER/repos -RUN git clone https://github.com/pachterlab/kallisto.git && mkdir kallisto/build -WORKDIR /home/$DOCKER_USER/repos/kallisto/build ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib/x86_64-linux-gnu/hdf5/serial/lib -RUN cmake -DCMAKE_INSTALL_PREFIX=/home/$DOCKER_USER/ -DUSE_HDF5=ON .. && make && make install +RUN cd /home/$DOCKER_USER/repos && \ + curl -O -L http://ftpmirror.gnu.org/autoconf/autoconf-2.69.tar.gz && \ + tar -xzf autoconf-2.69.tar.gz && cd /home/$DOCKER_USER/repos/autoconf-2.69 && \ + ./configure && make && make install && cd /home/$DOCKER_USER/repos && \ + git clone https://github.com/pachterlab/kallisto.git && \ + mkdir kallisto/build && \ + cd /home/$DOCKER_USER/repos/kallisto/build && \ + cmake -DCMAKE_INSTALL_PREFIX=/home/$DOCKER_USER/ -DUSE_HDF5=ON .. && make && make install # python COPY python_installs.sh ./ RUN ./python_installs.sh && rm python_installs.sh -# for virtualenv intro -RUN pip install virtualenv -ENV PATH="/home/$DOCKER_USER/.local/bin:${PATH}" # jars -RUN mkdir /home/$DOCKER_USER/sw -WORKDIR /home/$DOCKER_USER/sw -RUN wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.39.zip && \ - apt install unzip -y && \ - unzip Trimmomatic-0.39.zip && \ - rm Trimmomatic-0.39.zip +RUN mkdir /home/$DOCKER_USER/sw && \ + cd /home/$DOCKER_USER/sw && \ + wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.39.zip && \ + apt install unzip -y && \ + unzip Trimmomatic-0.39.zip && \ + rm Trimmomatic-0.39.zip # cleanup WORKDIR /home/$DOCKER_USER/ @@ -99,11 +97,11 @@ RUN rm *.bz2 && rm -r info # shared folder # rnaseq-workshop folder RUN wget https://github.com/git-lfs/git-lfs/releases/download/v3.2.0/git-lfs-linux-amd64-v3.2.0.tar.gz && \ - mv git-lfs-linux-amd64-v3.2.0.tar.gz sw/ -WORKDIR /home/$DOCKER_USER/sw/ -RUN tar xvf git-lfs-linux-amd64-v3.2.0.tar.gz -WORKDIR /home/$DOCKER_USER/sw/git-lfs-3.2.0/ -RUN ./install.sh && \ + 
mv git-lfs-linux-amd64-v3.2.0.tar.gz sw/ && \ + cd /home/$DOCKER_USER/sw/ && \ + tar xvf git-lfs-linux-amd64-v3.2.0.tar.gz && \ + cd /home/$DOCKER_USER/sw/git-lfs-3.2.0/ && \ + ./install.sh && \ rm ../git-lfs-linux-amd64-v3.2.0.tar.gz WORKDIR /home/$DOCKER_USER/ @@ -118,6 +116,8 @@ EXPOSE 8889 COPY ./first.sh /home/$DOCKER_USER/ RUN chown $DOCKER_USER:$DOCKER_USER /home/$DOCKER_USER/first.sh + +ENV PATH="/home/$DOCKER_USER/bin:${PATH}" USER $DOCKER_USER RUN git lfs install -- GitLab