From 4fe37915364f4b2850b89e79a255254bfefca600 Mon Sep 17 00:00:00 2001 From: Saskia Hiltemann <zazkia@gmail.com> Date: Fri, 3 Nov 2023 18:13:19 +0100 Subject: [PATCH] add workflows and results for second experiment --- runs/Experiment2/mapping_stats_mp_at.xlsx | Bin 0 -> 11149 bytes .../3d_pca_plot_schippers_20211021.r | 175 ++++ workflows/Experiment1/SNP_pipe.sh | 478 +++++++++ workflows/Experiment1/SNP_pipe_mp.sh | 481 +++++++++ ...our_methods_noiseqbio_schippers_20211021.r | 923 ++++++++++++++++++ .../3d_pca_plot_schippers_second_20211209.r | 176 ++++ ...s_noiseqbio_schippers_second_at_20211209.r | 923 ++++++++++++++++++ ...s_noiseqbio_schippers_second_mp_20211209.r | 922 +++++++++++++++++ workflows/Experiment2/snp_pipe_at.sge | 13 + workflows/Experiment2/snp_pipe_at.sh | 469 +++++++++ workflows/Experiment2/snp_pipe_mp.sge | 13 + workflows/Experiment2/snp_pipe_mp.sh | 480 +++++++++ 12 files changed, 5053 insertions(+) create mode 100644 runs/Experiment2/mapping_stats_mp_at.xlsx create mode 100644 workflows/Experiment1/3d_pca_plot_schippers_20211021.r create mode 100644 workflows/Experiment1/SNP_pipe.sh create mode 100644 workflows/Experiment1/SNP_pipe_mp.sh create mode 100644 workflows/Experiment1/new_deg_pipe_four_methods_noiseqbio_schippers_20211021.r create mode 100644 workflows/Experiment2/3d_pca_plot_schippers_second_20211209.r create mode 100644 workflows/Experiment2/new_deg_pipe_four_methods_noiseqbio_schippers_second_at_20211209.r create mode 100644 workflows/Experiment2/new_deg_pipe_four_methods_noiseqbio_schippers_second_mp_20211209.r create mode 100644 workflows/Experiment2/snp_pipe_at.sge create mode 100644 workflows/Experiment2/snp_pipe_at.sh create mode 100644 workflows/Experiment2/snp_pipe_mp.sge create mode 100644 workflows/Experiment2/snp_pipe_mp.sh diff --git a/runs/Experiment2/mapping_stats_mp_at.xlsx b/runs/Experiment2/mapping_stats_mp_at.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..de5eb4d7dde707a922acb4c175e73806d1725931 GIT binary patch literal 11149 zcmaKS1ymf{(k|{A++7BOySqCC8!WiHySozt1h?QC+?^o7-Q7L7KRD-K@8q5L-&?b$ zXLrx8-Q88SzpAzBQ<MRRKnH<=fdR2GOHu*(L*U=WdJe``jtung$MS?RX>cZ#&@=Cd zI8WDl3{j(+wy^OuNpHWDUoZL`-xzV6y*&eAfW_aTW2Jm9M>s@MS3a0rN2E(ch})I` zX`^DEus!p#kZzCg+AP&a$VSzq$&n3*N~U|Qu80$CK!4yd&TA`mYv^)se*TW%6l%0$ z<QZhGqX+(Kw0{#;e6wn{`WUSidK;8nReU5K>bk89CeAgnsh%50cS|kKz|xAUFea*^ znvE;pV#VNk?<o&MJ*!o{9%i4>_^Bo;rNf!GDbyiD0BtX*;>D4x5k@@rr8PnBVAyP? zbArlD07Ov+8pfm~pke*(wRylm-p>D%VUXWuxV_!O*1^)i*4C2V&DttjWzc4c5vA>x z7Gl+!PS*l230yrU4rB$TaPV7#IHqL;`oKj}?&~V5@pr(Y2r@et{95Zbci?Ke<pDvL zMvm&QZv9SP#U}co#lGVuL+%u&F;aENWT4P5`p*j^hbzDHNBE~a&L%vK4b@24;ny16 zA=w&4R{U}$i{(b*Wos_24gIj{!8(BtN+`0=sqlT=LBaP#2s%lmq0ytrY}Chs2$C_q zt7^H`R&^hPg9<B1VmU@kb<s|OhSTp$)fe@mlP|n{&6I~?GzY&!TGUP!GAy_b#B*;d zGgq#|Z17^=0(=F{@?~j6MrSX`*NuFCYE6->nv?0)3ez_<O(^#xjBEg-!*T~+F&63r zJ+{A{WbB#dz<EdDl<)uE^>LUpRr)LjpB`hfwdx9OpsxSRWP{g$@b_^J!KYu8pnSB! 
z)%y{g=<*r({)cs#Y}*|N11Dtyd^=Ewx*>F`+-LA5$4&w0+;FtaQc)wqik-_?@gm3i zMJwsM3@8XYR1eu<!t)NyiawK&LS?WgxV}20Gpl~0^5mgHDD_o#{jr!U?y3pXPOma@ z%0^J!&zu0Jv(KWxd|I=_r;OO6p(eQT5xdV_nC}>I6`&5RQzzFG<d_wF6Uy)p1AL?; z)^c-fw?T(ag4H}v$?IJby@n272v|DQdk?YRV<ys#<>VF=1jGjXzr_sV-!bFp<Zfl` z_#QDQ+H1C(oG3mW9i-CEwytu-aS3QuV2kNDJGIRx(p2TAUF1-?&c_4xuKKSJHHHJo zRV}QFbbRS0(#%`m?w^eDm&OmySGdJy)g>>3*5g;2oUP$E!+K$LV~!PKH_pJ9hHyHW z1R;9j+x(lJZJvf1P4OTEDu|6RZ~ZRg+YCNF_SpY6gH(xIOF%HF&_qcE>$g3a9AtqL zN{7!w=hB(G&Oj%hCq%wb!7cdMK>9mqVc<qOo<iZwFlOXzA<2n@j4AXtRc{=a5zkQn zvx(vDth4V7>vfu(txrpny_kwAyd|$TigwOXV|`*&=)LO^tzfWZBNQO<7A;lsGkhGS z8Cmd3cOk2l{;6)ty*3TDcYv@fDF&5iq?+F@FW&cs;U@_QREvonFQd?nGR{%-o8xCr zyz{w>5;48>UP);b8i~oyq<{-X@F<}O5g$YAN+J#%qW5i{pO(9V3}gVzJ2gLVAvy#k zXk=@Q9%h{02TGZeF8@l4c<Eu?=S6?9<X4DN<H|VtoX)i9^sw%J8)0_a78)+V=U6** zpbfnhH9-ia?>ea@3jfU5c$UEh+;I{B+MDHJQ1N5#2K9UG((Pe*QvKM3_y`NNufqNV z3uI@UUhH-oZx!~Z2n1GCXwIMm#b}ff7@~eu{*!?znV;m?4kR*>P!NBF`BRoycn<Au z{q?5ZVqYJk9s$hplFWg*C-@HpYhluKbOud83Z@)xF06DCXRs%tfMWOO-UwAm5<=vz zikv!9*85SN$mAgho})-Hco|ZXgy%XssJUa?ZojK;Cht6A&+YDnUWlJ!pDTvdS-bQI zrBV_t<W<zVt5Cw*JKdlizV<HH&Q5AF+@v_rW^!A*FK#YVmC#qP)X4-kfEQSHnswF^ zYC%PT8g^ncLdoJW@|`7^4C(IWj$hHk$=xQ5LkQqRspz#2_jy?poH;L5Y|TZA>^lom z8TeX8KAb>@htwxestwocifP@#ERJ$0uCiKs_W<E%y?PeS$);J#2zUsvXXSs@W$KrU zRp87nZUfh<W=ZD|ECmGSSl1GG6@a~vakE@YQuK)|YevAx3v&(2FP5$$b(l>i8a&){ zr@EF{ix%`|5~&E+L+SSzrr<S>wq2=@Klt`leaVYo$mhmqX_jIZf_2%f53N{g>xSwL zT|TMHrExDISeO4eNzs{6ohle*7tt!U;xW;4o~bRa85ws0eVlkDvaDmm#hwLjr+!0` z&p^=@St7I^y%}4w5To76YQA@ovj``xi+@h0-V<fp-Bo}!fK758&FPBD2h7)<UEx7x z&UQW=zi8p!Ba-QvUx@;GCkkqtEa$H>Ys`CbS&vUYxeyU!tlcG23iXI44X7H~M84c< z@;0+YnJ#HHr*lZBEpMXCKBkuJA1LP4*y~)B&cbBS#~}HSiVw`B`qv9o@Qhmo$$u}V zx&LH8%`RLq`{{df=&BPtsK7KIBG}laH-)V+!4e4=fo=O)Gp=n8Qd*Vbt;Che?Y2|v zK~_D#Nah&0bNU{O=$wi-<)KH(UUNy#^4zEEm{0{REQ!Nu7^fyOXPW2%Kwb3S!H))- z=TChT(I>>?=A!f^j?=d@Mc{f*MOm>niJ$r!Y4F&<#-_}B&897<VS+eTR8S-?<kviX zd0K9pRi1x_T=>2NB-psB>n;Qc$c@l{14!(DlB%Pbv9XgQ!(Ts4?^xNUqv?ppmEg0c zmowhOd!i_~c^c0*A*aV(Z0SOJ6P-jIIq{*G1klpr`)rZ`q5AdPcSS1=C#PCE>UhZP z^Bqtx!IRg^*wDv}v%Cc_-i+3zI=-3~mQ-y8+x{=MXD+YK*9T|ruBPqS@-`v2Q-k{} zr&)RTrvhuQb1$(6$G7(@X0`J|kYDawR$BTtFOu8Z?ypz&^*PVvh$PUSMoye=7`4}9 zYivC)e=eB0)s$V?bwKy`wuNrE{b-pyA3kQ|E@)`jI+_SQxSr}U`=J5&7JV8DZ}swI z^Tb)X_|oY9wLNX&9NML>zHp)!xpd9yN|WJ=FRIdX>i)`Qq9v2RERE7r^G3Qf3wf^6 ze7?u-=5cCkP3eNn<@D&zr}fTpp?~G}QD(=qCS&R1*?^Vj+v>V!6T#}3#9hnJjfROG zvx~}~J7-wxQ!F2gI#xY9IvOgTd|pkWL?kP{P1&2h8y-uacZNJaXi7hgxm_8q-rbHq z**t8av7aCKa8^Df1!tLVrEOgp1$~cPm})*Zc*qlIdkrA-S1G-Zd?nb3n+aIBTOwe; zaC2&zFVaTn(2Y5jn2Vf{t65z)w~s|`db->=y7I|eH*w<eYSvRkJGY<Gq<f8_^J(D8 zxoDO6Iwj#D>g(jI@VvazM7aPyP4s*^syB7evvYk;C;>?8k1ZTtEQ=Grbi3QTGUuX( z!4B-8_~?}(?^{SZX|Yd0_b8b6$Vt9jwp7)^(mQ#Ur>cb{e7UDOlX35xbhbtO<ba*I zhJ9f3($b!$Fw7Om;K_&1iSSCe_VKjJydF!tibQxG0lhtmfehE_iR}ty(Rbshh~fx~ zQ78K9V*hcUa3|T?B54CnO=sDXZT*=)=Fscj7kBM(V=H}i9r)%{FD7q|2CM6s(|!() zyzqzl9TuVx-_NucXBS5Iw_X=iZDhD-UsJ~&<DvHV_)%vSAa=rMZ#!R}B$}_uGJZ?i zwcSC*UL8qzlqVOcukQ&kml5IAN)+gx3%thLKMxe~F?UEYOg+R)5zRNG@Vh?`lqnFo z1^Uw+Nnn;IV~Zwcy-He^02tFk7e_=K$J&KBPWqqBZuZK#fpl@fC(gC0#H9<_4TolI z!n3xOc{6aDGYD-;5~Tz`>^%v}Lk3Ec_M^Fha`7WeM`m?mE|L8(B}@UyQ4^~)q`>cJ zJ$>iTFIEqRruFDLeC`;$)`U0s$<=CQ`Fb$ZNpfh<JvLKY)Y}<vwkzZPIeB(A^0Mk` zu@2m9_!?v>IbScTCoQP88JK8i9UiOP-9W)F6+WJH_mRRtYjaq-n}b|gYx6S&D(;6S zI{U_iFvRf5Bu=1I{%%;c5T9=NwhaXpZTRG;?*3;d3Ss$5nc>fL25A=a!(2-P$K^7k za}*<*`U~(a8pDmJp?Vc!sXf*UT5Z5#2-Ok70E0jTH&~ns)9$elsIt^quc;)trZrX^ z_JfC1gfthTx6uW^1+&$F0D!r3cXco6m#D~9a-F0_7$-c_x6akIE+8`pg`6Eo<^iv^ zOi~EB2^!!bp$34W45hYXrV1&m)g;HoIJi}pzcUm)OoC^tGH_g%URElo)dV168l0-j zHyjd?uJ9Y<hg@CoFIiqc$0>M{RVRdtfj5ek;;zEoJHl<hf2l$xEWN_#OoLCuz&gUs 
zxsFnl?Vl(V*H6}jrk_&<et`&O5nr7ttuwNqjU7z|DIjM=Y?%S8U1HB(Df#YhxbNc# zr`_nPVg%vEhJ&Oh7t*6{i5yz_<=hzaIZ|akvIbpQKbqSP&P%6DYOiU@k=qM=lwFh_ zL>`HqMnc}dQp7#^+jjNT243q=xWNrxuv4Op0LD)?B1Q>?72xfDMgB$~Uw)=9396o2 z1@#`UeXGDPm%HjRiu8jmtJ#z9VptTYO=&VU09)s4w4?C}q+KvMs(KDlZ*qe3sR=wr zNc*TR-^Y+`%@W_d;blnE0CL=b_?ZFZ7^AZa=>`J_^s`(DxW<<aklDo^&|?a}g{Q|& z43IR4eN;}rHVDQ<FaQ?23UV9LXD5BiJT2j~UWuIn8IU}Oqq4n=vN_P>R*u1dK$vAf zhFdrjIx}G#4|vraXq4M8Y?ke9zAAm3q9Gj@GeT@X=N%Ft`mGTg{Lf1D^R)3$4@-|h z11-)?hSrF+u-z^$K`qn|Rs4D$kR7OaJK$ehf7AH14UV>815jkRy?W|5@F2Vb!LUMt z+eyjRO9$=lEXFpu3MhMOei3gB_v~eNCA(;o9xV%vucxu3y3F?VWcP3w`iX^*gloE@ zvtI3rR+Ea@j!&db6k{nzp_q_4e_-|7@ta`z!i&i5QN7dU7PYPOVWOsFnc~WM`T1~& zOe6h;lA==@zg;M>5^;Wb@)9#po1Yg;D3sx0Dy8~5d{7Sxu_esuz%wvHXOG2Q*Q0pW zjiA+jX+0XORS6V+%=^u>x^f|b>YkX;m6SbQcoc5#=lFHY99zqyR;h%dbhjhzZZ<tg zd2p*KAHS#VGc$r3s>b9dJ(`4`hDBf4f11D}^N<MS2e(W5z06^80C<?}Bsegi1`PoM zTI<*w2UdIDBEq2CUVne)tKp&<6IC6oH%}JKo|2r8J&fGGelio~dY9v;iO6~qeJtt; z)T-7f;BP!$6Vfs+btOnU#@VH?Mf)d$eGvI2D7z0J@^GKAIUggUK&mKvK+E_d|B4Cu z51qVj@~OAMX8xW7f%)!w#J62FYx&^VCoGf+*rDR6b%Ycn&jw&7XlXe=rJ(JEzjJ*v zf_tEXTF-Zv{3VaZp<MWO^>@lO6a-Sp1h3m{Vji8)cnyE%!4P==C}m+n^ew4}Je(hy z@nFg`@E4u30dkY1$4#U@a*RUGcv>Zm9wlTaC|%74Jh8gen9oEx_)B<ya)_}FD5tG^ zAtn4%L_$0DJ+U-@M1n%@b`ACY4|+gaLZEh+iVTt?q!b<35C(YNHs@TEu8h1xhIQRG z>zt;l43zM$>_GY+ceTZ!SOr>PlJ+h)gT<g~#hGtx+v*SNM{V?@pC(7PsR%Wc-eTe! zR&pU)Z(jyivy+Ws0y-Q%J(`HVej|?DU<1q+Y7%w9BFdkHo*LZ3^tR&_20?21ee}`; zVzf8x)zw_|1SIOPpy1vJQ>1IgJOJrWK>&SWyq=FP6Ljo!QoZ{Z0_!er*#L>*Oj11r zo~f^f-AdGhv`NBPCe@Mkx;Ty(^f%+TLRV$|VoG%EsV*?6L`ruiA`_Uj=C<8TYd)%< z9HZ%w{ccu_j_pF53alqjz&r-UEwqlEmEak@c5m27{6DLPiD+PKV$TLage?hsY<@Cc zWJDt2<o}LTVutq<w1=!6)mcNVubIL`0cowpzuQ$i_HdvfK1wP{!<45Y*I?&^*>^NU zakwX(M9U2JBye$m|B<;V8k$-9=&sKYECkfNGz3M5Or`mJY$3Zy*PIDVMv^3yvclNu zmINWDztR~k?E8{C3ns+MNzP?kO6d(#g2KrHoyw>BS%HYhj6F^mJK6PSxJ;qWeZmJC z-L(vp#;Xw1Y5|Kh*_ik^Vv+$sv=7;D4_|~q0&77lH1egGW(utC5|o8zJ-Z&R2=s!; zT>`nZ2!u`iGUe9O>oSJHbT*#D`kH#!F>{M%NGKXs<Tp<IUT=4|R5`fy^{7P8oM;I( zG#fiqydU4W?)Ik=4=peg)!;y&dMu8D=4l7?l~bpXHJhTAY^m&l*wr-UxpO=)m?0{g zua6L2qI5haZ_%O%>UV2+-eO774ffCMlVSgab|gjHi~~cV0HFRPXM0T%tdwF$$Wk8v zsJ)`4aAx2G##JWGloyAfr!FNnl11=3cvvp__rlZ9cPL^eO<%+<J@BXD`k-fc)bIG` z>@hI1**`1aU~W0J$NDNC)h08e6vZ@b*G4!f=5Y=AH=^}l(*5$UAT)K^W#1|j@%EEl z5{~bu{QW%vSlFhzQ5H>ktu0F13Gm0Ng9CR)!p5FrR8r?GU@JlHznVDj>nx=iO6JMC ztVIbq2s{JQRco0(YDnEy!*zrdrg+Oq20XAu5nbDZO{d0zT+>Pls1}?QhrjldRRq#3 z-b~X!rn}+lh;bD~3+;iWfam&RJ*h|E)hvfd#smUQUp|wbl#$h-^6fojOmlZm<5jnL z0L9$tbwFx{@d*|wtG#ndel`YS0=SBJ<^}eP<T^jD0DF>=MlCby3k#mu?sS1Use!<~ zS0lmOOcnMR2jUouuOQoK_9C1m&QOYWpk6{f5l}47O^4+wer4Mrn!!$6cjEb!<Vw%D zxN#4h{uXkQ*$)`9#HC99b}!;%diVaSje_(>(m7xf&d@jm+>20SNK>rsGtNdB>~VQw z*XZ}ZQ%<tMaTbd&amF|7Ih+hIhLB2J=Lt7TcwmG?5kta4h_PN?pqo85mvKe``-()o zbf3R#BS6@&j+P6<fgKU+FHf=bYa)1Yl{l*@N^h6)QEYybfJj!$fz*Hc(q;d9=j+Kj z<i>P$bz=aIfRi2kapbS;2F{=#`cQ1+-P~78HRC2NBADKBVnoIhRGL?snBK)=(Q?Qw zB*#`Rd=^TEOL=Nm$DY%<{rwVj7RO$1gW1qT?k~!FN2Iq;y^%CC^DjkW(a*6!at^`c zSpcP)VZb>WRw_z^s<	on<AAy}_15ot#0PSYBq>3}&z==?9i*FEBF!KM5gd)x>HA z#<ji3xKP!^v*0^3_ySsr#9uhH5^8%{q6TU9mTN+SB)fLNMVV^9l1l}zAH2o9<+;28 zj$Ha`Ul<MQLTVbEuR~lCgUaV-@~$avU<<r7A-?X_$@2s_j;5ibV5HDH2rr41Pu5>g zDBA6ieIHmJ)S0ERDuyJJIUT7ip=P-rgbGx`=k(|x_=w)(j2rQl7rhik;11;tyI{@b zzWCMltE0xv4p4(l!r_VK8{-o6O>TgkZXlIV^T>+r+4Pc#X2c~R1moXrGQ~5J(y3qr zoHk(t`Q!nmxukwQvz)~3GTYGK<SjDT54}iAu%Vry0g%@H%o)`V_oms!&_PravYbWE z*Zg?B{v3Wn=^6qvW&oUWzl-r64eT1R_Cd{3J6Xfy5KY=eFle!|EQ?-r(tdsH5utJx z6~+NJe&xg%g#*rHeZBmGR@fp6E@PePkX#Kx)gbaBHaW2Ah&YHkQgWdTiXt(!Bn(AE zm(S_imV<CbsDk{zzx#`%IV)00Jc-L%lyTxZ8P$LthS3k$Yv3=H1#-vZG4~6jgqB&U zF%mEd$U3NymuAcK;)>`=n+kCTK{b2S@c!;&DlbLew@hAY=E7*_^|YyhU_;^0YuGL_ 
zANvx>ZJCXPgr>&*1j)SESgYB^x;tjJ1zAuNVXpE6nrK23&A$?SN^6g7ft0EI+?!XR zG6z;l)-n~MfXySm5mOND0|$6(K#&m|<_Dv|OnIvaGgfjf1C+G;P_;c03fXgOm&{#Q zS8)k7GE$`!BwioBdTum;dm7q9|F*-r)+LpiA^gOL{Ea#EbYKR35Q(^5%#CyUh9trM zA(LI-C%-qiCDN|P9KW5sfU5@NPR27wEKc^&7$om@mpG9JY31`VaA!ez$z4sBB<9cH zqCnS;N=e^^%FkfnFU(hugK6!N`)`x`cfW2#RkTx*3zl1K{*@(ENstcf;|RUgzp)pA zQt==oy{Eb97&Th1qS7J_ez|s(Ya+YV-$BhWUU<{=?p+gaISzlCeZGpf2WB&|N2+{E zoZ#fq5!@Rn)+b+>Bgv{9dhgIFsHy;6*1J(&gLIvOH3dp9F<j{7eA7CspGN9rEPSBP zt0NjoZ-EhG!R_jwr^igWeaw_xCyzHXkG~#2ri!ss<YK1f-8ny^u@tY;Gb<%UEG3nV z;>WvoI2}Y@@)jpExNDNxZ;7tR2ND3ws;u0PT{Q^M%mb29pw>-a#t;fa0|4W@`0<`y zm$QWa;?mNnE2ILx-UtNw<h7!H`XTV>f_9%Q%W}{sI?q?KRQ8D}+@nAZ3evzxV<n(V zJY)&CVe8+X9a~3Zv5CBy9+`Dih=f9)avh#O7doZ?H)13FIG2>uD33=sU=d=J6(aZx z#J)=nagDYRG*x&nnfMK*=^CnUPh@^R$2=P7pj3jGx8{`)rgx%PmeGW%<`pEnXwC?d z<QqQuz2Os*<jl-G=o^X=!~v)O9-si6DZh_eDs4TzgV^7b=n{B{tDHyS%&<DoMTk>T zT%F&NPpXUD7sivyrbcn=#zr@3{pzr^!s!;vZ1Nt%$+!H|-r8?iGAfeULqj9$NfYMI z4G*1u?FDgo&6Y9^ZST__0XId+45d11h~C4&{<i6F2v{=df1!YC%^|X<s2b<$fUflr z7A=Bm8Qzm~9*q<3m%)m1!<gP5b5}Yq($el<ML8NN$b9%|XNe(ia3MP6jG6&l551(f z%($71sFSKft0<wSsvd8lOA3?AVS!)b4u>QUheYp&$ufP<oWLn^!RORrE7rGKCAgRL z2jew=T3BT~gBiY`umNa9Q$}MvoYMyxa&v_JgM8Eu`aKx*5AfsqAL6oSr9Q^rpq71K zsrxl|t01WVE!!jA*Q;e4-6=^b|Kp%hwxcO86aN089CZ?rzQB><87guNF{;W2hjV(M zut84G^#mk08XnW-wuFcvMYX6QuW`I+5On6TPQcs0zqnx9?=kktF`b}>TduV8w>k)l zRv)g}iO(89cZ6~HV;@uFMC%HqvVkqo5^slX@oZqzj<}Kj5F5uTX1ea?ya|SN?joLO z$Y(M_HKiDfKpgLG(f)vWHJ%bO+r`5g*MKrbF|HE1C`v!FA7o=B3Kj%rr9WF0(^txJ zH@PLa)q{l)xOv(gna8IMA5wI-x7=oPR+mtic;KpyHr&Q6Gh8t1@$siF3qbZLbf<BD zsp4@fsL6IWtQ%n#XnTnKVg;jtuJbrTyh&CwNOsr2(gLlQmkOX}DxV0sBe9Z`G<aZx zAXh~zOe+yiw^_tIgy~=dO9)ino{3kjl_UMhfy8UbRhHYW0v%|JuahtsU;~tBmm>|` zW)Zlu(-sRZEa-!>lJoCuFjlb0aPdO`{UA{GR<;BeA~}7a9UuARMP#2{Qs{^mPjt)o zW_oj9Y90iA=?$0lHw18%oLq@-QimxrX!>MK9K1uPjTD$AYULJV{xx9Rt6MS45LieI zDUyf|i~@{1GegwN3xP=qfd!xTB*#9a#z9<LQcIRlg1*%)eh(Pv`7~3zacFYU4KLXd z^<3L22yh9v`WP5o&Y}uJN(G48&q@UM?4jxG1Ds=*jS&&)xPk2mPBnH(!{2cs0ogd` z2BtL;5j)BV0D>lEX=$z^)$7mN^qXpXF=_T-AW%5|6`7kHqUm$o))LWb{mtVn0O*6F z>r(p&7Iw`@L(w3&x7suSf-(<wYj622Id?2_ahnpXG#Wb?$}}Lpm0WRx%5j1@k9LIC zdT0sRK8TP~GjGrf3#-@>5CDM-Y3?2fhRld8m`Cj1tQ&YXKolS^B<$sJh1LtaYORyV zR*K{0ZXWoC_%mB2E^Oq;F`JKPn4?%;kPP--uqjZ3dP+(d^&Gag=^%2a2(Rdnz(Yi6 z@ZI446$$lDm313AV_atV<d)XI+K&*Z_VM%66AGuMQo^(dRrCUcg>x0afD})&Qj(yC zh|O43%4KuGhgedQ0={TQAhb5YH3rtJ*6sqk%jFzgMl!m~6mEe+_NPr0NA*n=^=*x{ z_n6j~xz_Ea)>{0=tad`RbWU|AeWI6ii<n|d2I@<?QORq$)>5@}&UMzq$(xAUnAXEO z)=Tvl4q~<CPPHQ)k`SJO_3k}OvmebHtcSDmaouPaO<ES~jTsovi5F*gu%7Cl1F(F$ zT<;+}tUK0&vF0tk9*^}|AM2k+Pw?4z<%$qnq9<q6!=P`HYgC_2FK!=WYc7nK+_ZhR z-S=!*^D9-xI1rz1o<I$B9k_s2?MLhx(XXlUm_3g#W<Rh;aP|Bj`BUlo=YC^(mW~0k z5KPzl@75o8Oe5rClT>Ec=zU)7nY_IkTFBE2M~Lv(E2ki1if+<cb{gWdFqY*DBm2iZ z(-gP__)d9|h$L!;FZA+eusGC-Xmx!&?p%1&820me3l)&{iknw-yl<L(Al#m}c1t^2 zc=BBswfG-R9t<6vIZ_eqAkM|c`Z+;{70f%9ywgW9Yc|~cbsC9uUtf;4ch_9E`gUa| zc{pnHxAI2j?lK>ypdTnX+Jgt30*n<dC#Y7rUugJwV-9#r-F=|PDCVYqUKGg}&X{=v zV<tvN?rt#Gri`kmUzhWNyp2uwchBy?j<u}S<uZP;=NH?a(4W3n$*-?VzL^D)zS_}5 z;+Gjy6R{bQ2R(Mb*Rb+z6>fVL4wGfL;D<YWR@OY#L~zxk?_bUHnHDr(9<7R9$}N2> z7ZjfCFTY;jzCyn@PMid?H7?)!BE{bSt#N|;cjM%5rpBL~guhuE@p6_uj0hoT&luvj zeNxbdQJig2LY2>Vp#4gl<Gt(&vvsdN^mtlu#>eAd#^)~EC9e-zqYuM+MG&A$av_4d z`dnbz{GU%xHgMENa#z>QC}OLKK9shi&HG_VF#u~WIW_4Gy5o3s@&~2mi{}QL15cC~ zhJQ;<rGt)OsFZ7g*V@o3wm5dbRvFGABI)IMW-$&R!j3o)#<NT)#~Q5DGB875S>NyQ z?Ff((%##gO24*R<yTZ=snGl|Nz}vzul`HxjKapSlZullvUb(vB1@>+kh4Xul$+!2W z_^&(-^#AF(o9R0k8>u)snA@2C)q4M$^iI}57rT9i3lQ{oAsNIyF7PiCbcTkY{?aYa z@Ap;rW9}wOg8XI&Lx#(Qs(o9c)2h?j_xzhfv9aTDFU8q1_iqfGWGwSa-=0&~*1SjP zIpLL3l}2aBQ;7iQrKfUu!bX7UAhs+6r(`kIknmcBCTF9`FVIJIQ3sP}k?XvJOtPX% 
zV`6n0WpH#~H>|3l(Nq=+HjY0}Z~3$Po_?fVtrK!SvHUu`z8x1)$m;)uc2O~AxQbbA z8Hsj9WJgt(7QP}WhvIv5J6V)Xaz`JH!S)Si0?$$$<I5<@^G;v9y4`QOCQU?U0~MPc zmZeO!?o~dwCIMC0v#$8Bv#i3$T44|Rdxo<jMRBA4bzkO1e6l`%tzpS+HQDMKNvW$q z^mMlL+n<5Z_YTz8efgB<EEGK5n@*Qq^us8R*WlR)vt`MNU+&7KpVxWHl*faYwm{Ed z9<E1PHe{x^-PT1Dw%ZpPT&wfPjWMLXD)?eSD@ao;wz|IZ4aV*q>rb3is)vsgwBzC! zyJYb^$B4uhR*}LF`U#XYgSE1Hc?vgaL130QvRCjtRk;;HNLt^Dm}J6nYZA>EWx}DB zl`lcJJGqk9nUk|ehkc#sCtsCs@!mtCFA%C?;*Ex34g23V3+h`)7}**sI@sDdG8o!A z82?4yQ684>VMJ-Y#E3r&--4JJl2(RQRQM!pff|-p-B3(o|8dFtnNNN)AJSvJzJ?{! zn=6grkY?V$`iBx1sZtb}Vk@fhS1u3DksbhCnW1!5iBeo1nFcGZ4RZy+7J)ZSJkb7w z)p~T|ckMm1t@HwHt#<#^g#p?`s=4UmZ<phV8Z0k;(w$QHYa6)uXup`2aYH<uM5$!D zBxKsyR85IFL*OhFM-4updd+e(*0nuk8`3A%IY}U@e+TC(K~{_De)b_cnUX(zq`I@d zs67@({BEJ%2a2&!j5>tK2u*qpRm;G^Sb5O1Wf2oWE-xvWz$m3g{@qA;YS1&px6Kyu zv5`q^IMY8CQo|<S<t2pgo@~Ru?^Dw?JcD2`!>~Ma!J>-VujXKZ2Vg*>fJZECQ%>GP z;^UrV1FO6_a~rlf<30`E{NT+QMm{ZfcfDoH^Bu(LL%jI9A(alKw+GDY)6dPnJhbJ! z(P{YRp{4)2X;J?4kiMPWyMr>~6y8Cs@9Yu1>nfEKk%YF-egT^Yqh5sF1wOO9mhdIC zJn?F2Y$^N1or90>;>*>%O?3gMknCbhW?5+}`g*?lB7w(mcHU$>WpZ?R*e!yqZG`aS zI|9#WenC;h;sJ<c4+l{ME$J`30T{(Y^~0mvc^E0iyov&H!0s=}HISjT#YZvS0qf{R z%-Vk2v5kBizBxm~S&x&Cc1YiTE`YFO`*-5BB?U_8Ls|P*IzE!li@=D>_;aDQ&t;+N z`lydrSd>1NXr9!iC0`g$MeArSBShT?GsH<Qd0cZ^<_(2RXkq8gP2X|6N3z45^~b|E zD|o!w>OYgv(B5WfiP+jW8QVDNeR8)mcGUT+RsM-b;VqU|A8AP|sa=NsP^1_Af5LpF zW?{adV2URCu`>DMISTls3JVAe8~kzBo~+=)zEni18F)FlpDw$fLY@szmS5rUyS7n= zVbF?u2vZB5zf1STQ91N;CHAElVuYu~N3*iTegL7}5sis^^)^P;<;7h9hB)*%31WPp zYz-dueF^N~2~@0WGYr+>bp?a;PLS5pKmhc|8V%zNPUz?}(26=vhh!hlyf7<IFS>Gp zN8PT&LrKAsl#iLNWV&=tB8y-Kgy+o}=yAWA;BU3{!98Q7un9O7k|zuFZTso;qBJL8 zEYB2Ki~;Z{1sZ@LMOy;94NJdl+{KDd_QeYOk5$&4$mOdEFU0Qf=VdyS;A!)hS?ASw zw;T8`=2z{zNWF%3ckR$wog}VV$h~73)r7uK&>b8X-*|FWB1P8&j)|UiQHx*GaW}QF zh}$h<nP+_xr(EAX6MM76uKDH_b;x(mfP$fe{KK65lREg$ocw>~4<6;e760Uqy)z5{ z5uCSM{flS#Z{<I^74H<Ye?;z$j`pVf58>?Js()4&@8pYr#0KX7-uM5bV*GoRKQr-r zCGw9HlKi#Gf2d3Tz0RNT{hqh~kuK7|U*z9e{NF452`}$3|Bp<Q|1Y5bZ}mTS-8%;T tBQBJGsQ(Wl{agFb$avqG|41g)|GG~VWgwy6C&9jb1l|CV6Yzfae*o!*W2pcD literal 0 HcmV?d00001 diff --git a/workflows/Experiment1/3d_pca_plot_schippers_20211021.r b/workflows/Experiment1/3d_pca_plot_schippers_20211021.r new file mode 100644 index 0000000..2408e58 --- /dev/null +++ b/workflows/Experiment1/3d_pca_plot_schippers_20211021.r @@ -0,0 +1,175 @@ +# 3d R plot +#saveat <- "/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Schippers" +saveat <- "R:/haasf/madland_RNA-seq_Schippers" + +#file.rpkm <- '/mnt/Teamdrive/projects/MAdLand/cooperations/Schippers_Lab_RNA-seq_analyses/Mp.p.sort.wotRNA.rpkm' +file.rpkm <- "S:/projects/MAdLand/cooperations/Schippers_Lab_RNA-seq_analyses/Mp.p.sort.wotRNA.rpkm" +# file.rpkm <- "S:/projects/MAdLand/cooperations/Schippers_Lab_RNA-seq_analyses/31315_all.p.sort.rpkm" + +data.rpkm <- read.table(file.rpkm, header=T, sep="\t", row.names=1) + + +# sort by colnames +data.rpkm <- data.rpkm[,order(colnames(data.rpkm))] + + +librariesName <- list( + control_6h = c("control_6h", "red"), + control_24h = c("control_24h", "blue"), + LCS_6h = c("LCS_6h", "green"), + LCS_24h = c("LCS_24h", "yellow") +) + +header.ori <- c("MF82", "MF81", "MF80", "MF76", "MF75", "MF74", "MF68", "MF67", "MF66", "MF62", "MF61", "MF60") +header.new <- c("LCS_24h.2", "LCS_24h.1", "LCS_24h", "control_24h.2", "control_24h.1", "control_24h", "control_6h.2", "LCS_6h.1", "LCS_6h", "LCS_6h.2", "control_6h.1", "control_6h") +# header.ori <- c("P97", "P96", "P95", "P91", "P90", "P89", "P111", "P110", "P109", "P105", "P104", "P103") +# header.new <- c("LCS_6h.2", "LCS_6h.1", "LCS_6h", "control_6h.2", 
"control_6h.1", "control_6h", "LCS_24h.2", "LCS_24h.1", "LCS_24h", "control_24h.2", "control_24h.1", "control_24h") + + +header.new <- header.new[order(header.ori)] +header.ori <- header.ori[order(header.ori)] + +col.header <- header.new + +colnames(data.rpkm) <- col.header + +library("DESeq2") +library("ggplot2") +library("RColorBrewer") +library("pheatmap") +library("BiocGenerics") +library("rgl") +library("magick") +library("sjmisc") + + +################### running ###################### +### PCA RPKM ### + +set.seed(0) + +data.inv <- t(data.rpkm) +data.dist <-dist(data.inv, method="euclidean") # "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski" +data.dist.hc <- hclust(data.dist,method="ward.D2") +data.dist.pca <-princomp(data.dist,cor=T) + +pc1 <- data.dist.pca$scores[,1] +pc2 <- data.dist.pca$scores[,2] +pc3 <- data.dist.pca$scores[,3] + +# create data frame for pc1 pc2 pc3 +data.dist.pca.frame = data.frame(pc1,pc2,pc3) +rownames(data.dist.pca.frame) <- names(data.dist.pca$scale) +colnames(data.dist.pca.frame) <- c("pc1","pc2","pc3") + +condition.values <- c() +condition.values.color <- c() + + +for(a in colnames(data.rpkm)) { + + v <- substr(a, nchar(a), nchar(a)) + if(str_contains(v, c(1,2,3,4,5,6,7,8,9,0), logic = "or")) { + if (substr(substr(a, 1, nchar(a)-2), nchar(substr(a, 1, nchar(a)-2)), nchar(substr(a, 1, nchar(a)-2))) == ".") { + n <- substr(a, 1, nchar(a)-3) + } else { + n <- substr(a, 1, nchar(a)-2) + } + + } else { + n <- a + + } + condition.values <- c(condition.values, librariesName[n][[1]][1]) + condition.values.color <- c(condition.values.color, librariesName[n][[1]][2]) +} + + +data.dist.pca.frame["tissue"] <- condition.values +data.dist.pca.frame["color"] <- condition.values.color +data.dist.pca.frame["name"] <- names(data.dist.pca$scale) +attr(data.dist.pca.frame, "percentVar") <- (data.dist.pca$sdev)^2 / sum(data.dist.pca$sdev^2) # cumsum() + +# simple plot +png(filename=paste0(saveat, "/HC_RPKM_normalized.png")) +plot(data.dist.hc) # hc plot +dev.off() +png(filename=paste0(saveat, "/PCA_variance_RPKM_normalized.png")) +plot(data.dist.pca) # variances; var(data.dist.pca$sdev[1:9]) +dev.off() +# get the parcent variation +percentVar <- round(100 * attr(data.dist.pca.frame, "percentVar")) + +# 3d plot +plot3d(pc1, pc2, pc3, + type = "s", # p, s, l, h, n + #pch = c(1:3), + col = condition.values.color, + size = 1, + xlab = paste0("PC1: ",percentVar[1],"% variance"), + ylab = paste0("PC2: ",percentVar[2],"% variance"), + zlab = paste0("PC3: ",percentVar[3],"% variance"), + cex = 2, + main = "", # -> princomp", +) + +# shift <- matrix(4, 4, 4, byrow = TRUE) +# text3d(shift,texts=1:3) +grid3d(c("x", "y", "z")) + +## add legend +legend3d("right", unique(condition.values), pch = 19, col = unique(condition.values.color)) + +#### video ##### +M <- par3d("userMatrix") +play3d( par3dinterp( userMatrix=list(M, + rotate3d(M, pi/2, 1, 0, 0), + rotate3d(M, pi/2, 0, 1, 0) ) ), + duration=2 ) + +movie3d(spin3d(axis = c(1, 2, 1)), duration = 5, + dir = saveat) + +#### video end #### + +# pc1, pc2 +png(filename=paste0(saveat, "/PCA_RPKM_normalized.png")) +ggplot( + data.dist.pca.frame, + aes( + pc1, + pc2, + color=tissue) +) + + geom_point(size=2.5) + + xlab( + paste0("PC1: ",percentVar[1],"% variance") + ) + + ylab( + paste0("PC2: ",percentVar[2],"% variance") + ) + + #theme() + #, face="bold" + scale_colour_manual( + values= c("blue", "red", "yellow", "green") # dodgerblue3 + ) + + ggtitle("PCA of all samples (RPKM normalized)") + + theme( + plot.title = 
element_text(lineheight=.8), + panel.background = element_rect(fill = "gray95") + ) +dev.off() + + + + + +######## Some experiments ###### +# for(i in colnames(data)) { +# colnames(data)[colnames(data)==grep(i, names(data), value = TRUE)] <- librariesName[[i]] +# } +# +# for(i in colnames(data)) { +# colnames(data)[colnames(data)==grep(i, names(data), value = TRUE)] <- names(librariesName[librariesName==grep(i, librariesName, value = TRUE)]) +# } + + diff --git a/workflows/Experiment1/SNP_pipe.sh b/workflows/Experiment1/SNP_pipe.sh new file mode 100644 index 0000000..1daee78 --- /dev/null +++ b/workflows/Experiment1/SNP_pipe.sh @@ -0,0 +1,478 @@ +#!/bin/bash + +#. /etc/profile.d/modules.sh + +export PATH=$PATH:/vol/biotools/bin +export MANPATH=$MANPATH:/vol/biotools/man +export PATH=$PATH:/vol/software/bin + + +# install important R package - lummerland cluster does not have it ... +#echo "install.packages(\"gplots\", repos=\"https://cran.rstudio.com\")" | R --vanilla --no-save +#R CMD INSTALL /vol/agrensing/tools/R_packages/bitops_1.0-6.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/caTools_1.17.1.1.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/gtools_3.8.1.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/gdata_2.18.0.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/gplots_3.0.1.tar.gz + +#functions + +startle () +{ + number_row_reads=`wc -l $fw | awk '{print $1/4}'` + echo "## Start log ##" + echo "Number of row reads: "$number_row_reads +} + +trimmo () + { + # Remove leading low quality or N bases (below quality 3) (LEADING:3) + # Remove trailing low quality or N bases (below quality 3) (TRAILING:3) + # Scan the read with a 4-base wide sliding window, cutting when the average quality per base drops below 15 (SLIDINGWINDOW:4:15) + # HEADCROP: Cut the specified Number of bases from the start of the read + # MINLEN was first set to 50 bp. But there are libraries with read start length of 50 bp. + + if [ -e $rv ]; then + java $java_ram -jar /nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads $cpu -phred33 -summary $trimmo_out".trimmo.stats" \ + $fw $rv $trimmo_out".PE.fw" $trimmo_out".SE.fw" $trimmo_out".PE.rv" $trimmo_out".SE.rv" \ + ILLUMINACLIP:/nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/adapters/TruSeq-PE_all.fna:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 HEADCROP:10 MINLEN:30 + + # stats + echo "## Trimmo statistics ##" + printf "trim\t"$trimmo_out".PE.fw\t"; grep -c "^\+$" $trimmo_out".PE.fw" + # fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" $trimmo_out".PE."* + fi + if [ ! -e $rv ]; then + java $java_ram -jar /nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/trimmomatic-0.39.jar SE -threads $cpu -phred33 -summary $trimmo_out".trimmo.stats" \ + $fw $trimmo_out".SE" \ + ILLUMINACLIP:/nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/adapters/TruSeq-PE_all.fna:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 HEADCROP:10 MINLEN:30 + + # stats + echo "## Trimmo statistics ##" + # printf "trim\t"$trimmo_out".SE\t"; grep -c "^\+$" $trimmo_out".SE" + printf "trim\t"$trimmo_out".SE\t"; wc -l $trimmo_out".SE" | awk '{print $1/4}' + # fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" $trimmo_out".SE" + fi + + } + +polyA () + { + # min_len was first set to 50 bp. But there are libraries with read start length of 50 bp. 
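+ # Optional pre-check (sketch only, not part of this pipeline run): the actual read length of a
+ # library can be confirmed before settling on the min_len cutoff. Assumes $fw (set at the bottom of
+ # this script) points to an uncompressed FASTQ file, where every 4th line starting at line 2 is a sequence:
+ #   awk 'NR % 4 == 2 {if (min == "" || length($0) < min) min = length($0); if (length($0) > max) max = length($0)} END {print "read length min/max: " min "/" max}' $fw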
+ # If you specify any statistic option, no other ouput will be generated. To preprocess data, do not specify a statistics option. + # -stats_dupl -stats_all + + inpA=$polyA_out".polyA.passed" + infA=$polyA_out".polyA.failed" + + if [ -e $rv ]; then + /nfs/agrensing/tools/prinseq/prinseq-lite-0.20.4/./prinseq-lite.pl -verbose -no_qual_header -fastq $trimmo_out".PE.fw" -fastq2 $trimmo_out".PE.rv" \ + -trim_tail_left 5 -trim_tail_right 5 -min_len 30 -out_good $inpA -out_bad $infA \ + -out_format 3 -log $polyA_out"_polyA.log" + + # stats + echo "## PolyA statistics ##" + printf "polyA\t"$inpA"_1.fastq\t"; wc -l $inpA"_1.fastq" | awk '{print $1/4}' + #fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" $inpA"_"[12]".fastq" + fi + if [ ! -e $rv ]; then + /nfs/agrensing/tools/prinseq/prinseq-lite-0.20.4/./prinseq-lite.pl -verbose -no_qual_header -fastq $trimmo_out".SE" \ + -trim_tail_left 5 -trim_tail_right 5 -min_len 30 -out_good $inpA -out_bad $infA \ + -out_format 3 -log $polyA_out"_polyA.log" + + # stats + echo "## PolyA statistics ##" + printf "polyA\t"$inpA".fastq\t"; grep -c "^\+$" $inpA".fastq" + #fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" $inpA".fastq" + fi + } + +duple () +{ + # number of 100% identical duplicat reads. + echo "Duplication:" + + if [ -e $rv ]; then + + #SE 1 + IN=$polyA_out".polyA.passed" + + num_dup_se1=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_1_dupl_histogram"}}' $IN"_1.fastq" | wc -l` + printf $num_dup_se1"\t"; printf %.5f\\t "$(( 100000 * num_dup_se1 / number_row_reads ))e-5" + #SE 2 + num_dup_se2=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_2_dupl_histogram"}}' $IN"_2.fastq" | wc -l` + printf $num_dup_se2"\t"; printf %.5f\\t "$(( 100000 * num_dup_se2 / number_row_reads ))e-5" + + + #PE + num_dup_pe=`awk -v nof=$IN 'NR % 4 == 1 || NR % 4 == 2 {if(n==1){if(foo[name]==""){c=0}; foo[name]=foo[name]$1; if(c==1){seq[foo[name]]++} else {c=1}; n=0} else if($1~/^@/){split($1,w,"/"); name=w[1]; foo[name]; n=1}} END {for(a in seq){print seq[a]; p[seq[a]]++} for(l in p){print l"\t"p[l] >> nof"_1_2_dupl_histogram"}}' $IN"_1.fastq" $IN"_2.fastq" | wc -l` + printf $num_dup_pe"\t"; printf %.5f\\n "$(( 100000 * num_dup_pe / number_row_reads ))e-5" + + + fi + if [ ! -e $rv ]; then + for IN in $polyA_out".polyA.passed"*; + do + num_dup_se1=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_dupl_histogram"}}' $IN | wc -l` + printf $IN"\t"$num_dup_se1"\t"; printf %.5f\\n "$(( 100000 * num_dup_se1 / number_row_reads ))e-5" + done + fi + + +} + +mappi() +{ + if [ -e $rv ]; then + /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -N 1 -n 7 -A sam --npaths=5 \ + --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \ + --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \ + $polyA_out".polyA.passed_1.fastq" $polyA_out".polyA.passed_2.fastq" 2> $mappi_out".mappi.err" + + unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped.1"` + + fi + if [ ! 
-e $rv ]; then
+ /ceph/agrensing/tools/gmap-gsnap/gmap-2021-03-08/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -N 1 -n 5 -A sam --npaths=5 \
+ --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \
+ --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \
+ $polyA_out".polyA.passed.fastq" 2> $mappi_out".mappi.err"
+
+ unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped"`
+ fi
+
+
+
+ # mapping statistics
+ echo "## Pp genome version 3 ##"
+ echo "Unmapped reads: "$unmapped_reads
+
+}
+
+organelli()
+{
+ if [ -e $rv ]; then
+ for IN in $mappi_out".genome.concordant_"*;
+ do
+ echo "Fragments:"
+ printf $IN"\t";
+ with=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Ei "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
+ wo=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Eiv "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
+ awk -v with=$with -v wo=$wo 'BEGIN {print with"\t"wo"\t"with/(wo+with)*100" %"}' ;
+ done
+ fi
+ if [ ! -e $rv ]; then
+ for IN in $mappi_out".genome"*"uniq"*;
+ do
+ echo "SE reads:"
+ printf $IN"\t";
+ with=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Ei "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
+ wo=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Eiv "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
+ awk -v with=$with -v wo=$wo 'BEGIN {print with"\t"wo"\t"with/(wo+with)*100" %"}' ;
+ done
+
+ fi
+
+
+}
+
+mappi_gDNA()
+{
+ if [ -e $rv ]; then
+ /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -n 7 -A sam --npaths=5 \
+ --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \
+ --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \
+ $polyA_out".polyA.passed_1.fastq" $polyA_out".polyA.passed_2.fastq" 2> $mappi_out".mappi.err"
+
+ unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped.1"`
+ fi
+
+ if [ ! 
-e $rv ]; then + /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -n 5 -A sam --npaths=5 \ + --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \ + --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \ + $polyA_out".polyA.passed.fastq" 2> $mappi_out".mappi.err" + + unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped"` + fi + + # mapping statistics + echo "## Pp genome version 3 ##" + echo "Unmapped reads: "$unmapped_reads +} + + +sample () +{ + # if PE use ".genome"*[tcqg] + for IN in $mappi_out".genome"*[tcqg] + do + printf $IN"\tsort.bam\n" >&2 + /vol/agrensing/tools/samtools/samtools-1.12/samtools view -@ $cpu -bhS $IN | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G - -o $IN".sort.bam" + + printf $IN"\trmdup.bam\n" >&2 + grep -vE "chloro|ribo|mito|ERCC" $IN | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G -n -O BAM - | \ + /vol/agrensing/tools/samtools/samtools-1.12/samtools fixmate -@ $cpu -m - - | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G -O BAM - | \ + /vol/agrensing/tools/samtools/samtools-1.12/samtools markdup -@ $cpu -s -r - \ + $IN".filter.fixmate.sort.markdup_r.bam" + + # samtools statistics + v_tmp=`awk '$1!~"@" {print $1}' $IN | sort | uniq | wc -l` + printf "Number of mapped reads: "$IN"\t"$v_tmp"\n" + + # compress the sam text files + # gzip $IN + # rm sam file + rm $IN + + + # bam index + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $IN*"_r.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $IN".sort.bam" + + done + + ls -l $mappi_out*"bam" >&2 + + # merge all unique bam files + echo "merging all .uniq bam files" >&2 + echo "merging all .uniq bam files" + /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.bam" $mappi_out*"uniq"*".sort.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.bam" + + /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.rmdup.bam" $mappi_out*"uniq"*"_r.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.rmdup.bam" +} + +counti() +{ + if [ -e $rv ]; then + # read count v3.3 and ERCC combined + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \ + -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.pB.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.pB.log" + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \ + -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.p.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.p.log" + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \ + -t exon -g Parent -o $counti_out"_v3.3_31315_ercc.exon.pB.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_31315_ercc.exon.pB.log" + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \ + -t exon -g Parent -o $counti_out"_v3.3_31315_ercc.exon.p.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_31315_ercc.exon.p.log" + + + 
/vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \ + -t exon -g Parent -o $counti_out"_all_v3.3_31315_ercc.exon.pB.txt" $mappi_out".uniq.merged.bam" 2> $counti_out"_all_v3.3_31315_ercc.exon.pB.log" + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \ + -t exon -g Parent -o $counti_out"_all_v3.3_31315_ercc.exon.p.txt" $mappi_out".uniq.merged.bam" 2> $counti_out"_all_v3.3_31315_ercc.exon.p.log" + + # statistics + printf "\n### FeatureCount statisitcs v3.3 ###\n" + # 32458 + echo "Fragment counts on gene models exon PE 32458:" + grep "Successfully assigned fragments" $counti_out"_v3.3_32458_ercc.exon.pB.log" + echo "Fragment counts on gene models exon PE+SE 32458:" + grep "Successfully assigned fragments" $counti_out"_v3.3_32458_ercc.exon.p.log" + # 31315 + echo "Fragment counts on gene models exon PE 31315:" + grep "Successfully assigned fragments" $counti_out"_v3.3_31315_ercc.exon.pB.log" + echo "Fragment counts on gene models exon PE+SE 31315:" + grep "Successfully assigned fragments" $counti_out"_v3.3_31315_ercc.exon.p.log" + # 31315 no rmdup + echo "Fragment counts on gene models exon PE 31315 (no rmdup):" + grep "Successfully assigned fragments" $counti_out"_all_v3.3_31315_ercc.exon.pB.log" + echo "Fragment counts on gene models exon PE+SE 31315 (no rmdup):" + grep "Successfully assigned fragments" $counti_out"_all_v3.3_31315_ercc.exon.p.log" + + fi + if [ ! -e $rv ]; then + + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \ + -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.SE.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.SE.log" + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \ + -t exon -g Parent -o $counti_out"_v3.3_31315_ercc.exon.SE.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_31315_ercc.exon.SE.log" + + # statistics + printf "\n### FeatureCount statisitcs v3.3 ###" + # 32458 + echo "Fragment counts on gene models exon SE 32458:" + grep "Successfully assigned " $counti_out"_v3.3_32458_ercc.exon.SE.log" + # 31315 + echo "Fragment counts on gene models exon SE 31315:" + grep "Successfully assigned " $counti_out"_v3.3_31315_ercc.exon.SE.log" + + fi +} + +depthi() +{ + /vol/agrensing/tools/samtools/samtools-1.12/samtools depth -aa $mappi_out".uniq.merged.rmdup.bam" > $mappi_out".uniq.merged.rmdup.depth" +} + + +snpi() +{ + gatkrun='/vol/agrensing/tools/gatk/gatk-4.2.0.0/gatk --java-options -Xmx50G' + threading_option='--native-pair-hmm-threads 8' + # V3 + # start the SNP detection + $gatkrun HaplotypeCaller $threading_option -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -ploidy 1 -O $snpi_out".HaplotypeCaller.vcf" 2> $snpi_out".HaplotypeCaller.err" + + $gatkrun SelectVariants -R $reffna --variant $snpi_out".HaplotypeCaller.vcf" -select-type INDEL -O $snpi_out".SelectVariants.vcf" 2> $snpi_out".SelectVariants.err" + + # HQ SNPs + python2.7 /vol/agrensing/fhaas/GeneAtlas/SNP/GetHighQualVcfs.py -i $snpi_out".HaplotypeCaller.vcf" -o $output_dir"/05_SNP_calling/"$species"/"$prefix"/" --ploidy 1 --percentile 90 --GQ 90 2> $snpi_out".GetHighQualVcfs.HQ.err" + +# mv $snpi_out".GetHighQualVcfs.HQ.vcf" 
$snpi_out".GetHighQualVcfs.V3.HQ.vcf" + + $gatkrun IndexFeatureFile -I $snpi_out".GetHighQualVcfs.HQ.vcf" 2> $snpi_out".GetHighQualVcfs.HQ.idx.err" + + $gatkrun BaseRecalibrator -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.table" 2> $snpi_out".BaseRecalibrator.err" + + $gatkrun ApplyBQSR -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" --bqsr-recal-file $snpi_out".BaseRecalibrator.table" -O $snpi_out".BaseRecalibrator.ApplyBQSR.bam" \ + 2> $snpi_out".BaseRecalibrator.ApplyBQSR.err" + + $gatkrun BaseRecalibrator -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.recal.table" \ + 2> $snpi_out".BaseRecalibrator.recal.err" + + $gatkrun AnalyzeCovariates -before $snpi_out".BaseRecalibrator.table" -after $snpi_out".BaseRecalibrator.recal.table" -verbosity DEBUG -csv $snpi_out".AnalyzeCovariates.calibration-report.csv" \ + -plots $snpi_out".AnalyzeCovariates.calibration-report.QC.pdf" 2> $snpi_out".AnalyzeCovariates.calibration-report.err" + + $gatkrun PrintReads -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -O $snpi_out".PrintReads.bam" 2> $snpi_out".PrintReads.err" + + $gatkrun PrintReads -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -O $snpi_out".PrintReads.ApplyBQSR.bam" 2> $snpi_out".PrintReads.ApplyBQSR.err" + + # HaplotypeCaller and VariantFiltration w/o ApplyBQSR BAM file + $gatkrun HaplotypeCaller $threading_option -R $reffna -I $snpi_out".PrintReads.bam" -ploidy 1 -O $snpi_out".PrintReads.HaplotypeCaller.vcf" 2> $snpi_out".PrintReads.HaplotypeCaller.err" + + $gatkrun HaplotypeCaller $threading_option -R $reffna -I $snpi_out".PrintReads.ApplyBQSR.bam" -ploidy 1 -O $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.vcf" \ + 2> $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.err" + + # filter flags ## NOTE: it is important to look for float and integer numbers (e.g. 
FS must be 60.0 not 60) ## + # --filter-expression "FS > 60" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" + # --filter-name "LowMQ" --filter-expression "QUAL > 30.0 && QUAL < 60.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" + # + $gatkrun VariantFiltration -R $reffna --variant $snpi_out".PrintReads.HaplotypeCaller.vcf" \ + --filter-expression "FS > 60.0" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" \ + --filter-name "LowMQ" --filter-expression "QUAL > 30.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" \ + -O $snpi_out".VariantFiltration.vcf" 2> $snpi_out".VariantFiltration.err" + + $gatkrun VariantFiltration -R $reffna --variant $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.vcf" \ + --filter-expression "FS > 60.0" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" \ + --filter-name "LowMQ" --filter-expression "QUAL > 30.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" \ + -O $snpi_out".ApplyBQSR.VariantFiltration.vcf" 2> $snpi_out".ApplyBQSR.VariantFiltration.err" +} + +corri() +{ + for IN in $mappi_out*.gz + do + gunzip $IN + file=`echo $IN | awk '{print substr($1, 1, length($1)-3)}'` + /vol/agrensing/tools/samtools/samtools-1.12/samtools view -@ $cpu -bhS $file | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G - -o $file".sort.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $file".sort.bam" + gzip $file + + done + /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.bam" $mappi_out*"uniq"*".sort.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.bam" + + + + gatkrun='/ceph/agrensing/tools/gatk/gatk-4.2.0.0/gatk --java-options -Xmx50G' + threading_option='--native-pair-hmm-threads 8' + $gatkrun BaseRecalibrator -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.recal.table" \ + 2> $snpi_out".BaseRecalibrator.recal.err" + + $gatkrun AnalyzeCovariates -before $snpi_out".BaseRecalibrator.table" -after $snpi_out".BaseRecalibrator.recal.table" -verbosity DEBUG -csv $snpi_out".AnalyzeCovariates.calibration-report.csv" \ + -plots $snpi_out".AnalyzeCovariates.calibration-report.QC.pdf" 2> $snpi_out".AnalyzeCovariates.calibration-report.err" + + +} + + +# script input variables + +cpu=$1 +output_dir=$2 +prefix=$3 +species=$4 +infile_path=$5 + +# set variables +java_ram="-Xmx50G -Xms25G" +id=$prefix + +fw=$infile_path"/fw."$prefix +rv=$infile_path"/rv."$prefix + +db_path_V3="/vol/agrensing/fhaas/DB/Pp_core_chloro_mito_ribo_ercc_sNone/Pp_core_chloro_mito_ribo_ercc_sNone" +db_name_V3="Pp_core_chloro_mito_ribo_ercc_sNone" + +reffna="/vol/agrensing/fhaas/DB/Pp_core_chloro_mito_ribo_ercc/Pp_core_chloro_mito_ribo_ercc.fna" + + + +# outdir variables +# output=$output_dir"/"$prefix +trimmo_out=$output_dir"/02_trimmomatic/"$species"/"$prefix"/"$prefix +polyA_out=$output_dir"/03_poly-A/"$species"/"$prefix"/"$prefix +mappi_out=$output_dir"/04_genome_mapping/"$species"/"$prefix"/"$prefix +snpi_out=$output_dir"/05_SNP_calling/"$species"/"$prefix"/"$prefix +counti_out=$output_dir"/06_counts/"$species"/"$prefix"/"$prefix + +# create result folder +mkdir -p $output_dir"/02_trimmomatic/"$species"/"$prefix +mkdir -p 
$output_dir"/03_poly-A/"$species"/"$prefix +mkdir -p $output_dir"/04_genome_mapping/"$species"/"$prefix +mkdir -p $output_dir"/05_SNP_calling/"$species"/"$prefix +mkdir -p $output_dir"/06_counts/"$species"/"$prefix + +mkdir -p $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" +mkdir -p $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" + + + + + +# print running machine and start date +hostname +date + +# print all variables +printf $cpu"\n"$output_dir"\n"$prefix"\n"$species"\n"$fw"\n" + +if [ -e $rv ]; then + printf $rv"\n" +fi + +printf "\'$java_ram\'""\n"$id"\n"$db_path_V3"\n"$db_name_V3"\n" + +echo "" + +echo "Phase 0 assamble data" +# startle + +date +echo "Phase 1 run trimming" +# trimmo + +date +echo "Phase 2 run polyA" +# polyA +# duple +date +echo "Phase 3 run mapping" +# mappi +#-# mappi_gDNA +# organelli +# sample +# counti +# depthi +date +echo "Phase 4 run SNP calling" +snpi +#-# corri +date + diff --git a/workflows/Experiment1/SNP_pipe_mp.sh b/workflows/Experiment1/SNP_pipe_mp.sh new file mode 100644 index 0000000..a01f28b --- /dev/null +++ b/workflows/Experiment1/SNP_pipe_mp.sh @@ -0,0 +1,481 @@ +#!/bin/bash + +#. /etc/profile.d/modules.sh + +export PATH=$PATH:/vol/biotools/bin +export MANPATH=$MANPATH:/vol/biotools/man +export PATH=$PATH:/vol/software/bin + + +# install important R package - lummerland cluster does not have it ... +#echo "install.packages(\"gplots\", repos=\"https://cran.rstudio.com\")" | R --vanilla --no-save +#R CMD INSTALL /vol/agrensing/tools/R_packages/bitops_1.0-6.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/caTools_1.17.1.1.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/gtools_3.8.1.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/gdata_2.18.0.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/gplots_3.0.1.tar.gz + +#functions + +startle () +{ + number_row_reads=`wc -l $fw | awk '{print $1/4}'` + echo "## Start log ##" + echo "Number of row reads: "$number_row_reads +} + +trimmo () + { + # Remove leading low quality or N bases (below quality 3) (LEADING:3) + # Remove trailing low quality or N bases (below quality 3) (TRAILING:3) + # Scan the read with a 4-base wide sliding window, cutting when the average quality per base drops below 15 (SLIDINGWINDOW:4:15) + # HEADCROP: Cut the specified Number of bases from the start of the read + # MINLEN was first set to 50 bp. But there are libraries with read start length of 50 bp. + + if [ -e $rv ]; then + java $java_ram -jar /nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads $cpu -phred33 -summary $trimmo_out".trimmo.stats" \ + $fw $rv $trimmo_out".PE.fw" $trimmo_out".SE.fw" $trimmo_out".PE.rv" $trimmo_out".SE.rv" \ + ILLUMINACLIP:/nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/adapters/TruSeq-PE_all.fna:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 HEADCROP:10 MINLEN:30 + + # stats + echo "## Trimmo statistics ##" + printf "trim\t"$trimmo_out".PE.fw\t"; grep -c "^\+$" $trimmo_out".PE.fw" + # fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" $trimmo_out".PE."* + fi + if [ ! 
-e $rv ]; then + java $java_ram -jar /nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/trimmomatic-0.39.jar SE -threads $cpu -phred33 -summary $trimmo_out".trimmo.stats" \ + $fw $trimmo_out".SE" \ + ILLUMINACLIP:/nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/adapters/TruSeq-PE_all.fna:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 HEADCROP:10 MINLEN:30 + + # stats + echo "## Trimmo statistics ##" + # printf "trim\t"$trimmo_out".SE\t"; grep -c "^\+$" $trimmo_out".SE" + printf "trim\t"$trimmo_out".SE\t"; wc -l $trimmo_out".SE" | awk '{print $1/4}' + # fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" $trimmo_out".SE" + fi + + } + +polyA () + { + # min_len was first set to 50 bp. But there are libraries with read start length of 50 bp. + # If you specify any statistic option, no other ouput will be generated. To preprocess data, do not specify a statistics option. + # -stats_dupl -stats_all + + inpA=$polyA_out".polyA.passed" + infA=$polyA_out".polyA.failed" + + if [ -e $rv ]; then + /nfs/agrensing/tools/prinseq/prinseq-lite-0.20.4/./prinseq-lite.pl -verbose -no_qual_header -fastq $trimmo_out".PE.fw" -fastq2 $trimmo_out".PE.rv" \ + -trim_tail_left 5 -trim_tail_right 5 -min_len 30 -out_good $inpA -out_bad $infA \ + -out_format 3 -log $polyA_out"_polyA.log" + + # stats + echo "## PolyA statistics ##" + printf "polyA\t"$inpA"_1.fastq\t"; wc -l $inpA"_1.fastq" | awk '{print $1/4}' + #fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" $inpA"_"[12]".fastq" + fi + if [ ! -e $rv ]; then + /nfs/agrensing/tools/prinseq/prinseq-lite-0.20.4/./prinseq-lite.pl -verbose -no_qual_header -fastq $trimmo_out".SE" \ + -trim_tail_left 5 -trim_tail_right 5 -min_len 30 -out_good $inpA -out_bad $infA \ + -out_format 3 -log $polyA_out"_polyA.log" + + # stats + echo "## PolyA statistics ##" + printf "polyA\t"$inpA".fastq\t"; grep -c "^\+$" $inpA".fastq" + #fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" $inpA".fastq" + fi + } + +duple () +{ + # number of 100% identical duplicat reads. + echo "Duplication:" + + if [ -e $rv ]; then + + #SE 1 + IN=$polyA_out".polyA.passed" + + num_dup_se1=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_1_dupl_histogram"}}' $IN"_1.fastq" | wc -l` + printf $num_dup_se1"\t"; printf %.5f\\t "$(( 100000 * num_dup_se1 / number_row_reads ))e-5" + #SE 2 + num_dup_se2=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_2_dupl_histogram"}}' $IN"_2.fastq" | wc -l` + printf $num_dup_se2"\t"; printf %.5f\\t "$(( 100000 * num_dup_se2 / number_row_reads ))e-5" + + + #PE + num_dup_pe=`awk -v nof=$IN 'NR % 4 == 1 || NR % 4 == 2 {if(n==1){if(foo[name]==""){c=0}; foo[name]=foo[name]$1; if(c==1){seq[foo[name]]++} else {c=1}; n=0} else if($1~/^@/){split($1,w,"/"); name=w[1]; foo[name]; n=1}} END {for(a in seq){print seq[a]; p[seq[a]]++} for(l in p){print l"\t"p[l] >> nof"_1_2_dupl_histogram"}}' $IN"_1.fastq" $IN"_2.fastq" | wc -l` + printf $num_dup_pe"\t"; printf %.5f\\n "$(( 100000 * num_dup_pe / number_row_reads ))e-5" + + + fi + if [ ! 
-e $rv ]; then
+ for IN in $polyA_out".polyA.passed"*;
+ do
+ num_dup_se1=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_dupl_histogram"}}' $IN | wc -l`
+ printf $IN"\t"$num_dup_se1"\t"; printf %.5f\\n "$(( 100000 * num_dup_se1 / number_row_reads ))e-5"
+ done
+ fi
+
+
+}
+
+mappi()
+{
+ if [ -e $rv ]; then
+ /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -N 1 -n 7 -A sam --npaths=5 \
+ --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \
+ --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \
+ $polyA_out".polyA.passed_1.fastq" $polyA_out".polyA.passed_2.fastq" 2> $mappi_out".mappi.err"
+
+ unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped.1"`
+
+ fi
+ if [ ! -e $rv ]; then
+ /ceph/agrensing/tools/gmap-gsnap/gmap-2021-03-08/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -N 1 -n 5 -A sam --npaths=5 \
+ --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \
+ --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \
+ $polyA_out".polyA.passed.fastq" 2> $mappi_out".mappi.err"
+
+ unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped"`
+ fi
+
+
+
+ # mapping statistics
+ echo "## Pp genome version 3 ##"
+ echo "Unmapped reads: "$unmapped_reads
+
+}
+
+organelli()
+{
+ if [ -e $rv ]; then
+ for IN in $mappi_out".genome.concordant_"*;
+ do
+ echo "Fragments:"
+ printf $IN"\t";
+ with=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Ei "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
+ wo=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Eiv "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
+ awk -v with=$with -v wo=$wo 'BEGIN {print with"\t"wo"\t"with/(wo+with)*100" %"}' ;
+ done
+ fi
+ if [ ! -e $rv ]; then
+ for IN in $mappi_out".genome"*"uniq"*;
+ do
+ echo "SE reads:"
+ printf $IN"\t";
+ with=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Ei "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
+ wo=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Eiv "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`;
+ awk -v with=$with -v wo=$wo 'BEGIN {print with"\t"wo"\t"with/(wo+with)*100" %"}' ;
+ done
+
+ fi
+
+
+}
+
+mappi_gDNA()
+{
+ if [ -e $rv ]; then
+ /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -n 7 -A sam --npaths=5 \
+ --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \
+ --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \
+ $polyA_out".polyA.passed_1.fastq" $polyA_out".polyA.passed_2.fastq" 2> $mappi_out".mappi.err"
+
+ unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped.1"`
+ fi
+
+ if [ ! 
-e $rv ]; then + /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -n 5 -A sam --npaths=5 \ + --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \ + --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \ + $polyA_out".polyA.passed.fastq" 2> $mappi_out".mappi.err" + + unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped"` + fi + + # mapping statistics + echo "## Pp genome version 3 ##" + echo "Unmapped reads: "$unmapped_reads +} + + +sample () +{ + # if PE use ".genome"*[tcqg] + for IN in $mappi_out".genome"*[tcqg] + do + printf $IN"\tsort.bam\n" >&2 + /vol/agrensing/tools/samtools/samtools-1.12/samtools view -@ $cpu -bhS $IN | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G - -o $IN".sort.bam" + + printf $IN"\trmdup.bam\n" >&2 + grep -vE "chloro|ribo|mito|ERCC" $IN | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G -n -O BAM - | \ + /vol/agrensing/tools/samtools/samtools-1.12/samtools fixmate -@ $cpu -m - - | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G -O BAM - | \ + /vol/agrensing/tools/samtools/samtools-1.12/samtools markdup -@ $cpu -s -r - \ + $IN".filter.fixmate.sort.markdup_r.bam" + + # samtools statistics + v_tmp=`awk '$1!~"@" {print $1}' $IN | sort | uniq | wc -l` + printf "Number of mapped reads: "$IN"\t"$v_tmp"\n" + + # compress the sam text files + # gzip $IN + # rm sam file + rm $IN + + + # bam index + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $IN*"_r.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $IN".sort.bam" + + done + + ls -l $mappi_out*"bam" >&2 + + # merge all unique bam files + echo "merging all .uniq bam files" >&2 + echo "merging all .uniq bam files" + /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.bam" $mappi_out*"uniq"*".sort.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.bam" + + /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.rmdup.bam" $mappi_out*"uniq"*"_r.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.rmdup.bam" +} + +counti() +{ + if [ -e $rv ]; then + # read count v3.3 and ERCC combined +# /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \ +# -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.pB.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.pB.log" +# /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \ +# -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.p.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.p.log" + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/DB/Marchantia_polymorpha/MpTak_v6.1r1_iso1.gff \ + -t exon -g Parent -o $counti_out"_v6.1.exon.pB.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v6.1.exon.pB.log" + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/DB/Marchantia_polymorpha/MpTak_v6.1r1_iso1.gff \ + -t exon -g Parent -o $counti_out"_v6.1.exon.p.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v6.1.exon.p.log" + + + 
/vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/DB/Marchantia_polymorpha/MpTak_v6.1r1_iso1.gff \ + -t exon -g Parent -o $counti_out"_all_v6.1.exon.pB.txt" $mappi_out".uniq.merged.bam" 2> $counti_out"_all_v6.1.exon.pB.log" + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/DB/Marchantia_polymorpha/MpTak_v6.1r1_iso1.gff \ + -t exon -g Parent -o $counti_out"_all_v6.1.exon.p.txt" $mappi_out".uniq.merged.bam" 2> $counti_out"_all_v6.1.exon.p.log" + + + + # statistics + printf "\n### FeatureCount statisitcs v6.1 ###\n" + # 32458 +# echo "Fragment counts on gene models exon PE 32458:" +# grep "Successfully assigned fragments" $counti_out"_v3.3_32458_ercc.exon.pB.log" +# echo "Fragment counts on gene models exon PE+SE 32458:" +# grep "Successfully assigned fragments" $counti_out"_v3.3_32458_ercc.exon.p.log" + # results + echo "Fragment counts on gene models exon PE:" + grep "Successfully assigned fragments" $counti_out"_v6.1.exon.pB.log" + echo "Fragment counts on gene models exon PE+SE:" + grep "Successfully assigned fragments" $counti_out"_v6.1.exon.p.log" + # results no rmdup + echo "Fragment counts on gene models exon PE (no rmdup):" + grep "Successfully assigned fragments" $counti_out"_all_v6.1.exon.pB.log" + echo "Fragment counts on gene models exon PE+SE (no rmdup):" + grep "Successfully assigned fragments" $counti_out"_all_v6.1.exon.p.log" + + fi + if [ ! -e $rv ]; then + echo "This part is not working yet" +# /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \ +# -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.SE.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.SE.log" +# /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \ +# -t exon -g Parent -o $counti_out"_v3.3_31315_ercc.exon.SE.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_31315_ercc.exon.SE.log" +# +# # statistics +# printf "\n### FeatureCount statisitcs v3.3 ###" +# # 32458 +# echo "Fragment counts on gene models exon SE 32458:" +# grep "Successfully assigned " $counti_out"_v3.3_32458_ercc.exon.SE.log" +# # 31315 +# echo "Fragment counts on gene models exon SE 31315:" +# grep "Successfully assigned " $counti_out"_v3.3_31315_ercc.exon.SE.log" + + fi +} + +depthi() +{ + /vol/agrensing/tools/samtools/samtools-1.12/samtools depth -aa $mappi_out".uniq.merged.rmdup.bam" > $mappi_out".uniq.merged.rmdup.depth" +} + + +snpi() +{ + gatkrun='/vol/agrensing/tools/gatk/gatk-4.2.0.0/gatk --java-options -Xmx50G' + threading_option='--native-pair-hmm-threads 8' + # V3 + # start the SNP detection + $gatkrun HaplotypeCaller $threading_option -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -ploidy 1 -O $snpi_out".HaplotypeCaller.vcf" 2> $snpi_out".HaplotypeCaller.err" + + $gatkrun SelectVariants -R $reffna --variant $snpi_out".HaplotypeCaller.vcf" -select-type INDEL -O $snpi_out".SelectVariants.vcf" 2> $snpi_out".SelectVariants.err" + + # HQ SNPs + python2.7 /vol/agrensing/fhaas/GeneAtlas/SNP/GetHighQualVcfs.py -i $snpi_out".HaplotypeCaller.vcf" -o $output_dir"/05_SNP_calling/"$species"/"$prefix"/" --ploidy 1 --percentile 90 --GQ 90 2> $snpi_out".GetHighQualVcfs.HQ.err" + +# mv $snpi_out".GetHighQualVcfs.HQ.vcf" $snpi_out".GetHighQualVcfs.V3.HQ.vcf" + + $gatkrun IndexFeatureFile -I 
$snpi_out".GetHighQualVcfs.HQ.vcf" 2> $snpi_out".GetHighQualVcfs.HQ.idx.err" + + $gatkrun BaseRecalibrator -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.table" 2> $snpi_out".BaseRecalibrator.err" + + $gatkrun ApplyBQSR -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" --bqsr-recal-file $snpi_out".BaseRecalibrator.table" -O $snpi_out".BaseRecalibrator.ApplyBQSR.bam" \ + 2> $snpi_out".BaseRecalibrator.ApplyBQSR.err" + + $gatkrun BaseRecalibrator -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.recal.table" \ + 2> $snpi_out".BaseRecalibrator.recal.err" + + $gatkrun AnalyzeCovariates -before $snpi_out".BaseRecalibrator.table" -after $snpi_out".BaseRecalibrator.recal.table" -verbosity DEBUG -csv $snpi_out".AnalyzeCovariates.calibration-report.csv" \ + -plots $snpi_out".AnalyzeCovariates.calibration-report.QC.pdf" 2> $snpi_out".AnalyzeCovariates.calibration-report.err" + + $gatkrun PrintReads -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -O $snpi_out".PrintReads.bam" 2> $snpi_out".PrintReads.err" + + $gatkrun PrintReads -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -O $snpi_out".PrintReads.ApplyBQSR.bam" 2> $snpi_out".PrintReads.ApplyBQSR.err" + + # HaplotypeCaller and VariantFiltration w/o ApplyBQSR BAM file + $gatkrun HaplotypeCaller $threading_option -R $reffna -I $snpi_out".PrintReads.bam" -ploidy 1 -O $snpi_out".PrintReads.HaplotypeCaller.vcf" 2> $snpi_out".PrintReads.HaplotypeCaller.err" + + $gatkrun HaplotypeCaller $threading_option -R $reffna -I $snpi_out".PrintReads.ApplyBQSR.bam" -ploidy 1 -O $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.vcf" \ + 2> $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.err" + + # filter flags ## NOTE: it is important to look for float and integer numbers (e.g. 
FS must be 60.0 not 60) ## + # --filter-expression "FS > 60" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" + # --filter-name "LowMQ" --filter-expression "QUAL > 30.0 && QUAL < 60.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" + # + $gatkrun VariantFiltration -R $reffna --variant $snpi_out".PrintReads.HaplotypeCaller.vcf" \ + --filter-expression "FS > 60.0" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" \ + --filter-name "LowMQ" --filter-expression "QUAL > 30.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" \ + -O $snpi_out".VariantFiltration.vcf" 2> $snpi_out".VariantFiltration.err" + + $gatkrun VariantFiltration -R $reffna --variant $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.vcf" \ + --filter-expression "FS > 60.0" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" \ + --filter-name "LowMQ" --filter-expression "QUAL > 30.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" \ + -O $snpi_out".ApplyBQSR.VariantFiltration.vcf" 2> $snpi_out".ApplyBQSR.VariantFiltration.err" +} + +corri() +{ + for IN in $mappi_out*.gz + do + gunzip $IN + file=`echo $IN | awk '{print substr($1, 1, length($1)-3)}'` + /vol/agrensing/tools/samtools/samtools-1.12/samtools view -@ $cpu -bhS $file | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G - -o $file".sort.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $file".sort.bam" + gzip $file + + done + /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.bam" $mappi_out*"uniq"*".sort.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.bam" + + + + gatkrun='/ceph/agrensing/tools/gatk/gatk-4.2.0.0/gatk --java-options -Xmx50G' + threading_option='--native-pair-hmm-threads 8' + $gatkrun BaseRecalibrator -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.recal.table" \ + 2> $snpi_out".BaseRecalibrator.recal.err" + + $gatkrun AnalyzeCovariates -before $snpi_out".BaseRecalibrator.table" -after $snpi_out".BaseRecalibrator.recal.table" -verbosity DEBUG -csv $snpi_out".AnalyzeCovariates.calibration-report.csv" \ + -plots $snpi_out".AnalyzeCovariates.calibration-report.QC.pdf" 2> $snpi_out".AnalyzeCovariates.calibration-report.err" + + +} + + +# script input variables + +cpu=$1 +output_dir=$2 +prefix=$3 +species=$4 +infile_path=$5 + +# set variables +java_ram="-Xmx50G -Xms25G" +id=$prefix + +fw=$infile_path"/fw."$prefix +rv=$infile_path"/rv."$prefix + +db_path_V3="/ceph/agrensing/fhaas/DB/Marchantia_polymorpha/Marchantia_v6.1_maingenome_mito_ribo_chloro/Marchantia_v6.1_maingenome_mito_ribo_chloro" +db_name_V3="Marchantia_v6.1_maingenome_mito_ribo_chloro" + +reffna="/ceph/agrensing/fhaas/DB/Marchantia_polymorpha/Marchantia_v6.1.fna" + + + +# outdir variables +# output=$output_dir"/"$prefix +trimmo_out=$output_dir"/02_trimmomatic/"$species"/"$prefix"/"$prefix +polyA_out=$output_dir"/03_poly-A/"$species"/"$prefix"/"$prefix +mappi_out=$output_dir"/04_genome_mapping/"$species"/"$prefix"/"$prefix +snpi_out=$output_dir"/05_SNP_calling/"$species"/"$prefix"/"$prefix +counti_out=$output_dir"/06_counts/"$species"/"$prefix"/"$prefix + +# create result folder +mkdir -p 
$output_dir"/02_trimmomatic/"$species"/"$prefix +mkdir -p $output_dir"/03_poly-A/"$species"/"$prefix +mkdir -p $output_dir"/04_genome_mapping/"$species"/"$prefix +mkdir -p $output_dir"/05_SNP_calling/"$species"/"$prefix +mkdir -p $output_dir"/06_counts/"$species"/"$prefix + +mkdir -p $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" +mkdir -p $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" + + + + + +# print running machine and start date +hostname +date + +# print all variables +printf $cpu"\n"$output_dir"\n"$prefix"\n"$species"\n"$fw"\n" + +if [ -e $rv ]; then + printf $rv"\n" +fi + +printf "\'$java_ram\'""\n"$id"\n"$db_path_V3"\n"$db_name_V3"\n" + +echo "" + +echo "Phase 0 assamble data" +startle + +date +echo "Phase 1 run trimming" +trimmo + +date +echo "Phase 2 run polyA" +polyA +duple +date +echo "Phase 3 run mapping" +mappi +#-# mappi_gDNA +organelli +sample +counti +depthi +date +echo "Phase 4 run SNP calling" +snpi +#-# corri +date + + diff --git a/workflows/Experiment1/new_deg_pipe_four_methods_noiseqbio_schippers_20211021.r b/workflows/Experiment1/new_deg_pipe_four_methods_noiseqbio_schippers_20211021.r new file mode 100644 index 0000000..1703784 --- /dev/null +++ b/workflows/Experiment1/new_deg_pipe_four_methods_noiseqbio_schippers_20211021.r @@ -0,0 +1,923 @@ +############################################################### +## DEG pipeline with edgeR, DESeq2 and NOISeq # +## Modified from the template of Dr. Kristian Ullrich 2014 # +## @fabianhaas, AG Rensing, Marburg, Jan 2016 # +############################################################### +# The pipeline expects sample triplicates. +# If you don't have three samples of one condition, use face files with zero counts. +# RPKM calculation is optional. +# Additional program to sum up the tsv files: /mnt/nas_temp/home/haasf/Program/sumup_htseq_counts.pl folder file_prefix + +########################################### +## please define your variables # +########################################### + +saveat <- "/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Schippers/Mp_all_alternative_DEGs_0.9" + +file <- '/mnt/Teamdrive/projects/MAdLand/cooperations/Schippers_Lab_RNA-seq_analyses/Mp_all.p.sort.wotRNA.counts' +# file <- '/mnt/Teamdrive/projects/MAdLand/cooperations/Schippers_Lab_RNA-seq_analyses/31315_all.p.sort.counts' + +# Table with the length and the GC content +LenGCTable <- "/mnt/Teamdrive/projects/MAdLand/cooperations/Schippers_Lab_RNA-seq_analyses/MpTak_v6.1r1.mrna.fasta.GC.length" +# LenGCTable <- "/mnt/NAS_coruscant_datashare/haasf/GeneAtlas_RNASeq/DEG_2nd_round/cosmoss_V3.3.release_2016-07-22.IsoformV31.fasta.length_GC_Contig_Daten_for_34840_1nd.csv" + +# Column position of the gene name and the width/gc content (first column is gene name!) +myLenPos <- 1 +myGCPos <- 2 + +# +# hoo <- c("","") +# sampleLists <- c() +# for(a in hoo) { +# for(b in hoo) { +# if(a!=b) { +# tmp <- c(a,b) +# sampleLists <- c(sampleLists, list(tmp)) +# } +# } +# } + +sampleLists <- list(c("control_6h", "control_24h"), + c("control_6h", "LCS_6h"), + c("control_6h", "LCS_24h"), + c("control_24h", "LCS_6h"), + c("control_24h", "LCS_24h"), + c("LCS_6h", "LCS_24h") +) + +# Do you want a RPKM value table (1/0) +# Be warned, it takes a while to do this. approx. 10 minutes for 35000 genes and 6 samples. +rpkm <- 0 +# Are the data original HTSeq files? 
yes <- 1 ; no <- 0 +htseq <- 0 + +# threshold settings for the R packages +#edgeR +pvtedgeR <- 0.001 +#DESeq2 +pvtDESeq2 <- 0.001 +#NOISeq +pvtNOISeq <- 0.9 + +# Output folder names +edgeRDir <- "edgeR" +DESeq2Dir <- "DESeq2" +NOISeqDir <- "NOISeq" + +######################### +## stop editing here!! ## +######################### + + +########################################### +## executiv function # +########################################### + +workflow <- function(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq) +{ + # setwd(workingFolder) # set workspace + + sampleNameI <- SampleNames[1] + sampleNameII <- SampleNames[2] + + # Comparison name + cName <- paste(sampleNameI, sampleNameII, sep = "_vs_") + + filename <- strsplit(file, "/") + filename <- filename[[1]][length(filename[[1]])] # long but it's working + sampleName <- gsub(", ","_",toString(SampleNames)) + samples <- SampleNames + # create output dir + dir.create(saveat, showWarnings = FALSE) + outdir <- paste0(saveat,"/",sampleName) + dir.create(outdir) + + # Start the log file + sink(paste0(outdir, "/DEGpipe_",cName, ".log")) + + # print parameter into the log file + print("set parameters:") + print(paste0("Counts file:", file)) + print(paste0("Results stored at:", saveat)) + print(paste0("SampleNames: ",SampleNames)) + print(paste0("LenGCTable: ",LenGCTable)) + print(paste0("myLenPos: ",myLenPos)) + print(paste0("myGCPos: ",myGCPos)) + print(paste0("rpkm: ",rpkm)) + print(paste0("htseq: ",htseq)) + print(paste0("pvtedgeR: ",pvtedgeR)) + print(paste0("pvtDESeq2: ",pvtDESeq2)) + print(paste0("pvtNOISeq: ",pvtNOISeq)) + print(paste0("edgeRDir: ",edgeRDir)) + print(paste0("DESeq2Dir: ",DESeq2Dir)) + print(paste0("NOISeqDir: ",NOISeqDir)) + # session and package infos for the log file + print(sessionInfo()) + print(Sys.info()) + + # load spike-in count tables + data0 <- read.table(file, header=T, sep="\t", row.names=1) + # just if the gene ids not ending with .1 (it is nessessery for NOISeqbio iteration step) + #row.names(data0) <- paste0(row.names(data0), ".1") + + # remove HTSeq rows like "__no_feature" + data0 <- data0[ with(data0, !grepl('__', rownames(data0))) , ] + data0 <- data0[ with(data0, !grepl('ERCC', rownames(data0))) , ] + #data0 <- data0[ , with(data0, !grepl('X.1', colnames(data0)))] + #rename col names (library name to roman number) + + librariesName <- c("MF82", "MF81", "MF80", "MF76", "MF75", "MF74", "MF68", "MF67", "MF66", "MF62", "MF61", "MF60") +# librariesName <- c("P97", "P96", "P95", "P91", "P90", "P89", "P111", "P110", "P109", "P105", "P104", "P103") + romanNumber <- c("LCS_24h.2", "LCS_24h.1", "LCS_24h", "control_24h.2", "control_24h.1", "control_24h", "control_6h.2", "LCS_6h.1", "LCS_6h", "LCS_6h.2", "control_6h.1", "control_6h") +# romanNumber <- c("LCS_6h.2", "LCS_6h.1", "LCS_6h", "control_6h.2", "control_6h.1", "control_6h", "LCS_24h.2", "LCS_24h.1", "LCS_24h", "control_24h.2", "control_24h.1", "control_24h") + + for(i in 1:length(librariesName)) { + colnames(data0)[colnames(data0)==grep(librariesName[i], names(data0), value = TRUE)] <- romanNumber[i] + } + + # creats a list of the column names/samples you would like to compare + SampleNames <- makeSampleList(sampleNameI, length(grep(paste0("^",sampleNameI,"[^A-Z]"), colnames(data0))) + 1, + sampleNameII, length(grep(paste0("^",sampleNameII,"[^A-Z]"), colnames(data0))) + 1) + print(SampleNames) + + ## mal anschauen als ersatz für das darüber ## + ## newenw <- 
table.new[, grepl(paste(LIST,collapse="|"), colnames(table.new))] + + + # exctract the samples from the data0 if needed + if(length(SampleNames) > 0) { + RTableInFiles <- data0[,SampleNames] + data <- data0[,SampleNames] + } + print(head(RTableInFiles)) + + # sort by column name + # data <- data[ , order(names(data))] # dangeros, the order of query and referenz might be not the same + # see how many rows and columns are in the tables + print(paste("Raw data row number: ",nrow(data))) + print(paste("Raw data column number: ",ncol(data))) + + # 2) A typical differential expression analysis workflow + # 2.1) Filtering and exploratory data analysis + # filter: One single gene/spike-in must have at least 5 counts in minimum 2 libraries + filter <- apply(data, 1, function(x) length(x[x>5])>=2) + dataFiltered <- data[filter,] + # see how many rows and columns are left + print(paste("Filtered data row number: ",nrow(dataFiltered))) + print(paste("Filtered data row number: ",ncol(dataFiltered))) + # filter gene names + genes <- rownames(dataFiltered)[grep("^Pp3", rownames(dataFiltered))] + # filter spike-in names + spikes <- rownames(dataFiltered)[grep("^ERCC", rownames(dataFiltered))] + # get the column name list as.factors + xtmp <- colnames(dataFiltered) + x <- split2list(xtmp) + + sampleNameList <- gsub(".[1234567890]+$", "", SampleNames) + columnPos <- makeIndexList(SampleNames) + + # run ... + ### edgeR ### + print(paste0("edgeR is running with ", cName)) + packageDescription("edgeR") + edgeRFunction(outdir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList) + + #### DESeq2 #### + print(paste0("DESeq2 is running with ", cName)) + packageDescription("DESeq2") + DESeq2Function(outdir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList) + + ### NOISeq ### + print(paste0("NOISeq is running with ", cName)) + packageDescription("NOISeq") + NOISeqFunction(outdir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames) + + ### Overlap of all three methods ### + # intersection with DEG direction check => 1, without => 0 + intersection_4.file <- intersection(outdir,1,4) + write.table(intersection_4.file, file = paste0(outdir, "/", cName,"_intersection_four_methods_",nrow(intersection_4.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # edgeR, DESeq2, NOISeq + intersection_3.file <- intersection(outdir,1,3) + write.table(intersection_3.file, file = paste0(outdir, "/", cName,"_intersection_three_methods_",nrow(intersection_3.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + sink() # End of log file + +} + + + + + +#################### +## main functions # +#################### + +plotGraphics <- function(data, librariesName, cName, outdir, romanNumber) +{ + library("DESeq2") + library("ggplot2") + library("RColorBrewer") + library("pheatmap") + library("genefilter") + + # create sample annotation lists + # sample.name.list <- gsub(".[12]", "", librariesName[colnames(data)]) + sample.name.list <- gsub(".[12]", "", romanNumber) + sample.name.uniq <- unique(unlist(sample.name.list, use.names = FALSE)) + + col.data.condition = data.frame(row.names = librariesName, condition = sample.name.list) + + # run DESeq2 functions + dds.matrix <- DESeqDataSetFromMatrix(countData = data, colData = col.data.condition, design = ~condition) # data must be integer! 
(Ganze Zahlen) + colData(dds.matrix)$condition <- factor(colData(dds.matrix)$condition, levels = sample.name.uniq) + dds.matrix.deseq <- DESeq(dds.matrix) + dds.matrix.res <- results(dds.matrix.deseq) + dds.matrix.res <- dds.matrix.res[order(dds.matrix.res$padj), ] + dds.matrix.res.fdr <- dds.matrix.res[!is.na(dds.matrix.res$padj), ] + dds.matrix.res.fdr <- dds.matrix.res.fdr[dds.matrix.res.fdr$padj < 0.0001, ] + dds.matrix.rld <- rlogTransformation(dds.matrix, blind = TRUE) + + # data for PCA plot + pcaData <- plotPCA(dds.matrix.rld, intgroup = "condition", returnData=TRUE) + percentVar <- round(100 * attr(pcaData, "percentVar")) + # color for samples + sample.colors <- c("gray60","#D73027","#4575B4", "purple", "orange","#113FF7","green", "black") + + # plot the PCA + # https://cran.r-project.org/web/packages/ggfortify/vignettes/plot_pca.html + png(filename=paste0(outdir,"PCA_all_samples_DEseq2_normalized.png")) + ggplot( + pcaData, + aes( + PC1, + PC2, + color=condition, + shape=condition) + ) + + scale_shape_manual(values = c(4, 1, 2, 3, 0, 5, 6, 7)) + + geom_point(size=2.5) + + xlab( + paste0("PC1: ",percentVar[1],"% variance") + ) + + ylab( + paste0("PC2: ",percentVar[2],"% variance") + ) + + theme() + #, face="bold" + scale_colour_manual( + values=sample.colors # dodgerblue3 + ) + + ggtitle("PCA of all samples (DESeq2 normalized)") + + theme( + plot.title = element_text(lineheight=.8), + panel.background = element_rect(fill = "gray95") + ) + dev.off() + + # how many genes will be involved in the heatmap calculation + numbGenes <- 100 + # pheatmap data + topVarGenes <- head(order(rowVars(assay(dds.matrix.rld)),decreasing=TRUE),numbGenes) + mat <- assay(dds.matrix.rld)[ topVarGenes, ] + mat <- mat - rowMeans(mat) + # pheatmap data annotation + anno <-as.data.frame(colData(dds.matrix.rld)[,c("condition")]) + rownames(anno) <- colnames(data) # check out the row names of annotation + colnames(anno) <- "condition" + # pheatmap data annotation color + condition <- sample.colors + names(condition) <- sample.name.uniq + anno_colors <- list(condition = condition) + + #heatmap(mat) + #pheatmap(mat) + # ftp://cran.r-project.org/pub/R/web/packages/pheatmap/pheatmap.pdf + png(filename=paste0(outdir,"HC_heatmap_all_samples_DESeq2_normalized.png")) + pheatmap(mat , + annotation = anno, + annotation_colors = anno_colors, + fontsize = 10, + fontsize_row = 6, + fontsize_col = 10, + main = paste0("Heatmap of all samples (DESeq2 normalized) ", numbGenes," genes\n"), + annotation_names_col = F, + show_rownames = F, + cluster_rows = F, + border_color = F, + treeheight_col = 20 + ) + dev.off() +} + + +edgeRFunction <- function(odir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList) +{ + library(edgeR) + + outdir <- paste0(odir, "/",edgeRDir,"/") # set outdir + + dir.create(outdir, showWarnings = FALSE) # create output folder + dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" + + comparison <- cName + counts <- RTableInFiles + + pvt <- pvtedgeR + + group <- factor(sampleNameList) + + # edgeR 'normal' + + y.normal <- DGEList(counts = counts, group = group) + + y.normal[,1] + + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotMDS"))) + plotMDS(y.normal, method = "bcv", main = paste0(cName, "\nedgeR - plotMDS")) + dev.off() + + y.normal <- calcNormFactors(y.normal) + y.normal <- estimateCommonDisp(y.normal) + y.normal <- estimateTagwiseDisp(y.normal) + + + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV"))) 
+ plotBCV(y.normal, main = paste0(cName, "\nedgeR - plotBCV")) + dev.off() + + et.normal <- exactTest(y.normal, pair = c(sampleNameI, sampleNameII)) + + topTags(et.normal) + # adjust.method ==> character string specifying p-value adjustment method. Possible values are "none", "BH", "fdr" (equivalent to "BH"), "BY" and "holm". See p.adjust for details. + summary(de.normal <- decideTestsDGE(et.normal, p = pvt, adjust = "BH")) + et.normal.fdr <- cbind(et.normal$table, p.adjust(et.normal$table[, 3], method = "BH")) + et.normal.fdr <- et.normal.fdr[et.normal.fdr[, 4] < pvt, ] + + write.table(et.normal.fdr, file = paste0(outdir, cName, "_", nrow(et.normal.fdr), "_edgeR_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + detags.normal <- rownames(de.normal)[as.logical(de.normal)] + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotSmear"))) + plotSmear(et.normal, de.normal, tags = detags.normal, main = paste0(cName, "\nedgeR - plotSmear")) + abline(h = c(-2, 2), col = "blue") + dev.off() + + # edgeR 'GLM' + + y.glm <- DGEList(counts = counts, group = group) + + design <- model.matrix(~0 + group, data = y.glm$samples) + design + + y.glm <- estimateGLMCommonDisp(y.glm, design) + y.glm <- estimateGLMTrendedDisp(y.glm, design) + + ## Loading required package: splines + y.glm <- estimateGLMTagwiseDisp(y.glm, design) + + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV - GLM"))) + plotBCV(y.glm, main = paste0(cName, "\nedgeR - plotBCV - GLM")) + dev.off() + + y.fit <- glmFit(y.glm, design) + y.lrt <- glmLRT(y.fit) + + FDR <- p.adjust(y.lrt$table$PValue, method = "BH") + sum(FDR < pvt) + + topTags(y.lrt) + + summary(de.glm <- decideTestsDGE(y.lrt, p = pvt, adjust = "BH")) + +} + + +DESeq2Function <- function(odir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList) +{ + library(DESeq2) + library(RColorBrewer) + library(gplots) + library(vsn) + + outdir <- paste0(odir, "/",DESeq2Dir,"/") # set outdir + + dir.create(outdir, showWarnings = FALSE) + dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" + + + pvt <- pvtDESeq2 + + + + # DESeqDataSet(se, design, ignoreRank = FALSE) + # DESeqDataSetFromMatrix(countData, colData, design, tidy = FALSE, ignoreRank = FALSE, ...) + # DESeqDataSetFromHTSeqCount(sampleTable, directory = ".", design, ignoreRank = FALSE, ...) + # DESeqDataSetFromTximport(txi, colData, design, ...) 
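+    # coldata (next line) maps every count column to its condition via sampleNameList, and the explicit
+    # factor levels c(sampleNameI, sampleNameII) a few lines down make sampleNameI the reference level,
+    # so the log2FoldChange reported by results() is sampleNameII relative to sampleNameI.
+    # Minimal sketch of the same pattern on made-up toy data (illustrative only, not executed):
+    #   toy.counts  <- matrix(rpois(600, 10), nrow = 100,
+    #                         dimnames = list(paste0("g", 1:100), paste0("s", 1:6)))
+    #   toy.coldata <- data.frame(row.names = colnames(toy.counts),
+    #                             condition = factor(rep(c("ctrl", "treat"), each = 3)))
+    #   toy.dds <- DESeqDataSetFromMatrix(toy.counts, toy.coldata, design = ~ condition)
+    #   results(DESeq(toy.dds))   # log2FC of "treat" vs the reference level "ctrl"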
+ coldata = data.frame(row.names = colnames(RTableInFiles), condition = sampleNameList) + + ddsHTSeq <- DESeqDataSetFromMatrix(countData = RTableInFiles, colData = coldata, design = ~ condition) + + colData(ddsHTSeq)$condition <- factor(colData(ddsHTSeq)$condition, levels = c(sampleNameI, sampleNameII)) + + ddsHTSeq <- DESeq(ddsHTSeq) + + ddsHTSeq.res <- results(ddsHTSeq) + ddsHTSeq.res <- ddsHTSeq.res[order(ddsHTSeq.res$padj), ] + head(ddsHTSeq.res) + + # png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - plotMA"))) + # plotMA(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotMA"), ylim = c(-2, 2)) + # dev.off() + + ddsHTSeq.res.fdr <- ddsHTSeq.res[!is.na(ddsHTSeq.res$padj), ] + ddsHTSeq.res.fdr <- ddsHTSeq.res.fdr[ddsHTSeq.res.fdr$padj < pvt, ] + + write.table(as.matrix(ddsHTSeq.res.fdr), file = paste0(outdir, cName, "_", nrow(as.matrix(ddsHTSeq.res.fdr)), "_DESeq2_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + ddsHTSeq.rld <- rlogTransformation(ddsHTSeq, blind = TRUE) + ddsHTSeq.vsd <- varianceStabilizingTransformation(ddsHTSeq, blind = TRUE) + + par(mfrow = c(1, 3)) + notAllZero <- (rowSums(counts(ddsHTSeq)) > 0) + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.1"))) + meanSdPlot(log2(counts(ddsHTSeq, normalized = TRUE)[notAllZero, ] + 1)) # , ylim = c(0, 2.5) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.2"))) # , ylim = c(0, 2.5) + meanSdPlot(assay(ddsHTSeq.rld[notAllZero, ])) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.3"))) # , ylim = c(0, 2.5) + meanSdPlot(assay(ddsHTSeq.vsd[notAllZero, ])) + dev.off() + + hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100) + distRL <- dist(t(assay(ddsHTSeq.rld))) + mat <- as.matrix(distRL) + rownames(mat) <- colnames(mat) <- with(colData(ddsHTSeq), paste0(condition)) + + # png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - heatmap"))) + # heatmap(mat, trace = "none", col = rev(hmcol), margin = c(13, 13)) + # dev.off() + + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - PCA"))) + print(plotPCA(ddsHTSeq.rld, intgroup = "condition")) + dev.off() + + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - DispEsts"))) + plotDispEsts(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotDispEsts")) + dev.off() +} + +NOISeqFunction <- function(odir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames) +{ + library(NOISeq) + + outdir <- paste0(odir, "/",NOISeqDir,"/") # set outdir + + dir.create(outdir, showWarnings = FALSE) + dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" + + pvt <- pvtNOISeq + + counts <- RTableInFiles[, columnPos] + head(counts) + + # must be changed if there are no triplicates + # !! Please change this setting if needed !! 
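+    # "Treatment" holds the condition label shared by the replicate columns and "TreatmentRun" keeps each
+    # library distinct; filtered.data() and the noiseq()/noiseqbio() calls below compare the two Treatment
+    # levels. Minimal sketch for a two-replicate design with made-up column names (illustrative only,
+    # not executed):
+    #   myfactors <- data.frame(Treatment    = c("ctrl", "ctrl", "treat", "treat"),
+    #                           TreatmentRun = c("ctrl", "ctrl.1", "treat", "treat.1"))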
+ myfactors <- data.frame(Treatment = sampleNameList, TreatmentRun = SampleNames) + # sort counts by row names + mycounts <- counts[order(rownames(counts)), ] + # filter out low counts + mycounts.filter <- filtered.data(mycounts, factor = myfactors$Treatment, norm = FALSE, depth = NULL, method = 1, cv.cutoff = 100, cpm = 1) + mycounts.rowname <- rownames(mycounts.filter) + + # Extra file with length and GC content + myLengthTable <- read.table(LenGCTable, sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names=1) + myLengthTable <- myLengthTable[order(rownames(myLengthTable)),] # sort by rownames + myLengthTable.filter <- myLengthTable[mycounts.rowname,] + + mylength <- myLengthTable.filter[,myLenPos] + names(mylength) <- rownames(myLengthTable.filter) + + mygc <- myLengthTable.filter[,myGCPos] + names(mygc) <- rownames(myLengthTable.filter) + + + + + + mydata <- readData(data = mycounts.filter, length = mylength, gc = mygc, factors = myfactors) + mydata + + mydata.GCbias <- dat(mydata, type = "GCbias") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.1"))) +#-# explo.plot(mydata.GCbias) +#-# dev.off() + + mydata.GCbias.group <- dat(mydata, factor = "Treatment", type = "GCbias") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.2"))) +#-# explo.plot(mydata.GCbias.group) +#-# dev.off() + + show(mydata.GCbias.group) + + mydata.lengthbias <- dat(mydata, type = "lengthbias") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.1"))) +#-# explo.plot(mydata.lengthbias) +#-# dev.off() + + mydata.lengthbias.group <- dat(mydata, factor = "Treatment", type = "lengthbias") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.2"))) +#-# explo.plot(mydata.lengthbias.group) +#-# dev.off() + + show(mydata.lengthbias.group) + + # looks different + mydata.cd <- dat(mydata, type = "cd") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - cd"))) +#-# explo.plot(mydata.cd) +#-# dev.off() + mylength + + myRPKM <- NOISeq::rpkm(assayData(mydata)$exprs) + myUQUA <- NOISeq::uqua(assayData(mydata)$exprs, long = mylength, lc = 0.5, k = 0) + myTMM <- NOISeq::tmm(assayData(mydata)$exprs, long = 1000, lc = 0) + + mynoiseq.rpkm <- noiseq(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") + mynoiseq.nnorm <- noiseq(mydata, k = 0.5, norm = "n", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") + mynoiseq.tmm <- noiseq(mydata, k = 0.5, norm = "tmm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") + + # RPKM + mynoiseq.rpkm.prob <- mynoiseq.rpkm@results[[1]][mynoiseq.rpkm@results[[1]]$prob > pvt, ] + mynoiseq.rpkm.prob <- mynoiseq.rpkm.prob[!is.na(mynoiseq.rpkm.prob$prob), ] + write.table(mynoiseq.rpkm.prob, file = paste0(outdir, cName, "_", nrow(mynoiseq.rpkm.prob), "_NOISeq_rpkm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # none + mynoiseq.nnorm.prob <- mynoiseq.nnorm@results[[1]][mynoiseq.nnorm@results[[1]]$prob > pvt, ] + mynoiseq.nnorm.prob <- mynoiseq.nnorm.prob[!is.na(mynoiseq.nnorm.prob$prob), ] + write.table(mynoiseq.nnorm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.nnorm.prob), "_NOISeq_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # TMM + mynoiseq.tmm.prob <- mynoiseq.tmm@results[[1]][mynoiseq.tmm@results[[1]]$prob > pvt, ] + mynoiseq.tmm.prob <- 
mynoiseq.tmm.prob[!is.na(mynoiseq.tmm.prob$prob), ] + write.table(mynoiseq.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.tmm.prob), "_NOISeq_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + ############ NOISeq bio ################ + + pvtnsb <- 0.995 + a0per_value <- 0.99 + + iterations <- 5 + x5 <- c(sample(10000:99999, iterations, replace=F), 12345) + x5 + + collector.rpkm <- NULL + collector.tmm <- NULL + + + # for (iter in x5) { + # # RPKM + # mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE) + # mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ] + # collector.rpkm <- rbind(collector.rpkm, mynoiseqbio.rpkm.prob) + # #collector <- rbind(collector, mynoiseqbio.rpkm.prob[rownames(mynoiseqbio.rpkm.prob) %in% rownames(collector),]) + # # TMM + # mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE) + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] + # collector.tmm <- rbind(collector.tmm, mynoiseqbio.tmm.prob) + # } + # + # # RPKM + # print(sort(row.names(collector.rpkm))) + # library(stringr) + # mynoiseqbio.rpkm.prob.overlap <- collector.rpkm[str_split_fixed(row.names(collector.rpkm), "\\.",2)[,2]>=iterations+10,] + # row.names(mynoiseqbio.rpkm.prob.overlap) <- substr(row.names(mynoiseqbio.rpkm.prob.overlap),1,str_length(row.names(mynoiseqbio.rpkm.prob.overlap))-1) + # + # write.table(mynoiseqbio.rpkm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob.overlap),"_NOISeqbio_rpkm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # + # #TMM + # print(sort(row.names(collector.tmm))) + # library(stringr) + # mynoiseqbio.tmm.prob.overlap <- collector.tmm[str_split_fixed(row.names(collector.tmm), "\\.",2)[,2]>=iterations+10,] + # row.names(mynoiseqbio.tmm.prob.overlap) <- substr(row.names(mynoiseqbio.tmm.prob.overlap),1,str_length(row.names(mynoiseqbio.tmm.prob.overlap))-1) + # + # write.table(mynoiseqbio.tmm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob.overlap),"_NOISeqbio_tmm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + # # none + # mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm@results[[1]][mynoiseqbio.nnorm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm.prob[!is.na(mynoiseqbio.nnorm.prob$prob), ] + # write.table(mynoiseqbio.nnorm.prob, file = paste0(outdir, cName, "_NOISeqbio_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # # TMM + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] + # write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName, "_NOISeqbio_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + + + + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiorpkm"))) + mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm 
= "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) + dev.off() + + mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ] + mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ] + write.table(mynoiseqbio.rpkm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob),"_NOISeqbio_rpkm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiontmm"))) + mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) + dev.off() + + mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] + mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] + write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob),"_NOISeqbio_tmm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbionnorm"))) + mynoiseqbio.nnorm <- noiseqbio(mydata, k = 0.5, norm = "n", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) + dev.off() + + head(mynoiseqbio.rpkm@results[[1]]) + head(mynoiseqbio.nnorm@results[[1]]) + + mynoiseqbio.rpkm.deg <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = NULL) + + mynoiseqbio.rpkm.deg.up <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "up") + mynoiseqbio.rpkm.deg.down <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "down") + mynoiseqbio.nnorm.deg <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = NULL) + mynoiseqbio.nnorm.deg.up <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "up") + mynoiseqbio.nnorm.deg.down <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "down") + + par(mfrow = c(3, 2)) + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9"))) + DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.8"))) + DE.plot(mynoiseq.rpkm, q = 0.8, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.9"))) + DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.8"))) + DE.plot(mynoiseq.nnorm, q = 0.8, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.9"))) + DE.plot(mynoiseq.tmm, q = 0.9, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.8"))) + DE.plot(mynoiseq.tmm, q = 0.8, graphic = "expr", log.scale = TRUE) + dev.off() + + par(mfrow = c(2, 2)) + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvt))) + DE.plot(mynoiseqbio.rpkm, q = pvt, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvtnsb))) + DE.plot(mynoiseqbio.rpkm, q = pvtnsb, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvt))) + DE.plot(mynoiseqbio.nnorm, q = pvt, graphic = "expr", log.scale = TRUE) + dev.off() + 
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvtnsb))) + DE.plot(mynoiseqbio.nnorm, q = pvtnsb, graphic = "expr", log.scale = TRUE) + dev.off() + + par(mfrow = c(1, 2)) + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9_MD"))) + DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "MD", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorm_0.9_MD"))) + DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "MD", log.scale = TRUE) + dev.off() +} + +########################################### +## additional function for more usability # +########################################### + +calcRPKM <- function(c,n,l) +{ + RPKM <- (10^9* as.numeric(c)) / (as.numeric(n) * as.numeric(l)) + return(RPKM) +} + +RPKMMatrix <- function(InTable,directory,LenGCTable,myLenPos) +{ + myLengthTable <- read.table(paste0(directory,LenGCTable), sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names = 1) + + newtable <- matrix(data = c(1), nrow = nrow(InTable), ncol(InTable), byrow= TRUE) + colnames(newtable) <- colnames(InTable) + rownames(newtable) <- rownames(InTable) + + sumCol <- apply(InTable,2,sum) # list of the all col sums + + for(s in colnames(InTable)) { + for(r in rownames(InTable)) { + w <- InTable[r, s] + len <- myLengthTable[r, myLenPos] + sum <- as.numeric(sumCol[s]) + + newtable[r, s] <- calcRPKM(w, sum, len) + } + } + return(newtable) +} + + +# rmHTSeqLines removes the last five rows +rmHTSeqLines <- function(counts) +{ + counts <- head(counts, -5) + return(counts) +} + + +############### +## functions ## +############### + +# split list to a factor list +split2list <- function(tmp) +{ + xx <- c() + for(i in 1:length(tmp)) { + if(grepl("\\.", tmp[i])) { + foo <- strsplit(tmp[i], '\\.')[[1]] + fooo <- strsplit(foo, " ")[[1]] + xx <- c(xx, fooo) + } else { + xx <- c(xx, tmp[i]) + } + } + return(as.factor(xx)) +} + +# makeGroups +# https://github.com/drisso/RUVSeq/blob/master/R/helper.R +makeGroups <- function(xs) +{ + xs <- factor(xs) + groups <- matrix(-1, nrow = length(levels(xs)), ncol = max(table(xs))) + for (i in 1:length(levels(xs))) { + idxs <- which(xs == levels(xs)[i]) + groups[i,1:length(idxs)] <- idxs + } + return(groups) +} + +# The function makeSampleList creates a list of the columns(samples) you would like to compare +makeSampleList <- function(A, Al, B, Bl) +{ + l <- c() + if(exists("Al", mode = "numeric")) { + l <- makeSampleList.helper(A, Al, l) + l <- makeSampleList.helper(B, Bl, l) + } + return(l) +} +makeSampleList.helper <- function(v,k,l) +{ + for(h in 1:k) { + if(h == 1) { + l <- c(l, v) + } else { + l <- c(l, paste0(v,".",h-1)) + } + } + return(l) +} + +# function creates a list of numbers out of another list. The List is the index of the given list. 
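+# (For the non-empty sample lists used here the result is the same as seq_along(l).)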
+makeIndexList <- function(l) +{ + n <- 1 + jj <- c() + for(v in l) { + jj <- c(jj, n) + n <- n + 1 + } + return(jj) +} + +intersection <- function(l,check,t) +{ + l.list <- list.files(path = l, pattern = "*(rpkm|edgeR|DESeq2).*tsv", recursive = TRUE) + if(grepl("bio", l.list[4],fixed = TRUE)) { + l.list.noiseq <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1) + l.list.noiseqbio <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1) + } else { + l.list.noiseq <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1) + l.list.noiseqbio <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1) + } + l.list.edgeR <- read.table(paste0(l,"/",l.list[2]), header=T, sep="\t", row.names=1) + l.list.deseq2 <- read.table(paste0(l,"/",l.list[1]), header=T, sep="\t", row.names=1) + + + print("DEGs edgeR") + print(nrow(l.list.edgeR)) + print("DEGs deseq2") + print(nrow(l.list.deseq2)) + print("DEGs noiseq") + print(nrow(l.list.noiseq)) + print("DEGs noiseqbio") + print(nrow(l.list.noiseqbio)) + + interect.list <- "" + interect.list.direct <- "" + + if(t==4) { + # List of overlaps without direction check + interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), intersect(rownames(l.list.noiseqbio), rownames(l.list.deseq2)))) + + print("DEGs intersection without direction check (four methods)") + print(length(interect.list)) + print(interect.list) + print("DEGs intersection with direction check") + + # List of overlaps with direction check + if(length(interect.list)!=0) { + interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, l.list.noiseqbio, interect.list) + print(length(interect.list.direct)) + print(interect.list.direct) + } else { + print("0") + } + } + if(t==3) { + # List of overlaps without direction check + interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), rownames(l.list.deseq2))) + + print("DEGs intersection without direction check (three methods)") + print(length(interect.list)) + print(interect.list) + print("DEGs intersection with direction check") + + # List of overlaps with direction check + if(length(interect.list)!=0) { + # fake data frame with 0 rows => nrow == 0 + df_fake <- data.frame(Date=as.Date(character()), + File=character(), + User=character(), + stringsAsFactors=FALSE) + + interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, df_fake, interect.list) + print(length(interect.list.direct)) + print(interect.list.direct) + } else { + print("0") + } + } + + + if(check == 1){ + return(l.list.noiseq[interect.list.direct,]) + } else { + return(l.list.noiseq[interect.list,]) + } +} + +interDirect <- function(e,d,n,nb,i) +{ + lil <- length(i) + + if(nrow(nb) == 0) { # use noiseq twice to simulate noiseqbio + nbx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) + nby <- paste0(nbx[,"M"],rownames(nbx)) + } else { + nbx <- apply(nb[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) + nby <- paste0(nbx[,"log2FC"],rownames(nbx)) + } + + ex <- apply(e[i,], 2, function(x) {ifelse(x < 0, "-", "+")}) #-#-#-# hier < und > + dx <- apply(d[i,], 2, function(x) {ifelse(x < 0, "-", "+")}) #-#-#-# hier < und > + nx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) # twisted, because edgeR and DESeq2 are calculating vice versa + + ey <- paste0(ex[,"logFC"],rownames(ex)) + dy <- paste0(dx[,"log2FoldChange"],rownames(dx)) + ny <- paste0(nx[,"M"],rownames(nx)) + + interect.list.direct <- intersect(ey, 
intersect(dy, intersect(ny, nby))) + + if(lil>0 && length(interect.list.direct)<=0) { + ex <- apply(e[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) #-#-#-# hier < und > + dx <- apply(d[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) #-#-#-# hier < und > + + ey <- paste0(ex[,"logFC"],rownames(ex)) + dy <- paste0(dx[,"log2FoldChange"],rownames(dx)) + + interect.list.direct <- intersect(ey, intersect(dy, intersect(ny, nby))) + } + + return(substr(interect.list.direct,2,100)) +} + +########################################### +## executiv part of the script # +########################################### +for(SampleNames in sampleLists) { + workflow(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq) +} +print("Thank you for using the DEG pipeline :) \n") + +# fin + +### compare DEGs analysis ### +# used venny and Excel + + diff --git a/workflows/Experiment2/3d_pca_plot_schippers_second_20211209.r b/workflows/Experiment2/3d_pca_plot_schippers_second_20211209.r new file mode 100644 index 0000000..c4aeea7 --- /dev/null +++ b/workflows/Experiment2/3d_pca_plot_schippers_second_20211209.r @@ -0,0 +1,176 @@ +# 3d R plot +saveat <- "/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Schippers/second_experiment/At_all_DEGs_0.9" +# saveat <- "R:/haasf/madland_RNA-seq_Schippers" + +file.rpkm <- '/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Schippers/second_experiment/At_all.p.sort.wo_ncRNA_miRNA.rpkm' +# file.rpkm <- "S:/projects/MAdLand/cooperations/Schippers_Lab_RNA-seq_analyses/Mp.p.sort.wotRNA.rpkm" +# file.rpkm <- "S:/projects/MAdLand/cooperations/Schippers_Lab_RNA-seq_analyses/31315_all.p.sort.rpkm" + +data.rpkm <- read.table(file.rpkm, header=T, sep="\t", row.names=1) + + +# sort by colnames +data.rpkm <- data.rpkm[,order(colnames(data.rpkm))] + + +librariesName <- list( + control_6h = c("control_6h", "red"), + control_24h = c("control_24h", "blue"), + LCS_6h = c("LCS_6h", "green"), + LCS_24h = c("LCS_24h", "yellow") +) + +# header.ori <- c("M54.fq.M54.fq_all.p.count", "M53.fq.M53.fq_all.p.count", "M52.fq.M52.fq_all.p.count", "M48.fq.M48.fq_all.p.count", "M47.fq.M47.fq_all.p.count", "M46.fq.M46.fq_all.p.count", "M40.fq.M40.fq_all.p.count", "M39.fq.M39.fq_all.p.count", "M38.fq.M38.fq_all.p.count", "M34.fq.M34.fq_all.p.count", "M33.fq.M33.fq_all.p.count", "M32.fq.M32.fq_all.p.count") +# header.new <- c("LCS_24h.2", "LCS_24h.1", "LCS_24h", "control_24h.2", "control_24h.1", "control_24h", "LCS_6h.2", "LCS_6h.1", "LCS_6h", "control_6h.2", "control_6h.1", "control_6h") +# header.ori <- c("MF82", "MF81", "MF80", "MF76", "MF75", "MF74", "MF68", "MF67", "MF66", "MF62", "MF61", "MF60") +# header.new <- c("LCS_24h.2", "LCS_24h.1", "LCS_24h", "control_24h.2", "control_24h.1", "control_24h", "control_6h.2", "LCS_6h.1", "LCS_6h", "LCS_6h.2", "control_6h.1", "control_6h") +header.ori <- c("A6.fq.A6.fq.single.p.count", "A5.fq.A5.fq.single.p.count", "A4.fq.A4.fq.single.p.count", "A26.fq.A26.fq.single.p.count", "A25.fq.A25.fq.single.p.count", "A24.fq.A24.fq.single.p.count", "A20.fq.A20.fq.single.p.count", "A19.fq.A19.fq.single.p.count", "A18.fq.A18.fq.single.p.count", "A12.fq.A12.fq.single.p.count", "A11.fq.A11.fq.single.p.count", "A10.fq.A10.fq.single.p.count") +header.new <- c("control_6h.2", "control_6h.1", "control_6h", "LCS_24h.2", "LCS_24h.1", "LCS_24h", "control_24h.2", "control_24h.1", "control_24h", "LCS_6h.2", "LCS_6h.1", "LCS_6h") + + +header.new <- header.new[order(header.ori)] +header.ori <- 
header.ori[order(header.ori)] + +col.header <- header.new + +colnames(data.rpkm) <- col.header + +library("DESeq2") +library("ggplot2") +library("RColorBrewer") +library("pheatmap") +library("BiocGenerics") +library("rgl") +library("magick") +library("sjmisc") + + +################### running ###################### +### PCA RPKM ### + +set.seed(0) + +data.inv <- t(data.rpkm) +data.dist <-dist(data.inv, method="euclidean") # "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski" +data.dist.hc <- hclust(data.dist,method="ward.D2") +data.dist.pca <-princomp(data.dist,cor=T) + +pc1 <- data.dist.pca$scores[,1] +pc2 <- data.dist.pca$scores[,2] +pc3 <- data.dist.pca$scores[,3] + +# create data frame for pc1 pc2 pc3 +data.dist.pca.frame = data.frame(pc1,pc2,pc3) +rownames(data.dist.pca.frame) <- names(data.dist.pca$scale) +colnames(data.dist.pca.frame) <- c("pc1","pc2","pc3") + +condition.values <- c() +condition.values.color <- c() + + +for(a in colnames(data.rpkm)) { + + v <- substr(a, nchar(a), nchar(a)) + if(str_contains(v, c(1,2,3,4,5,6,7,8,9,0), logic = "or")) { + if (substr(substr(a, 1, nchar(a)-2), nchar(substr(a, 1, nchar(a)-2)), nchar(substr(a, 1, nchar(a)-2))) == ".") { + n <- substr(a, 1, nchar(a)-3) + } else { + n <- substr(a, 1, nchar(a)-2) + } + + } else { + n <- a + + } + condition.values <- c(condition.values, librariesName[n][[1]][1]) + condition.values.color <- c(condition.values.color, librariesName[n][[1]][2]) +} + + +data.dist.pca.frame["tissue"] <- condition.values +data.dist.pca.frame["color"] <- condition.values.color +data.dist.pca.frame["name"] <- names(data.dist.pca$scale) +attr(data.dist.pca.frame, "percentVar") <- (data.dist.pca$sdev)^2 / sum(data.dist.pca$sdev^2) # cumsum() + +# simple plot +png(filename=paste0(saveat, "/HC_RPKM_normalized.png")) +plot(data.dist.hc) # hc plot +dev.off() +png(filename=paste0(saveat, "/PCA_variance_RPKM_normalized.png")) +plot(data.dist.pca) # variances; var(data.dist.pca$sdev[1:9]) +dev.off() +# get the parcent variation +percentVar <- round(100 * attr(data.dist.pca.frame, "percentVar")) + +# 3d plot +plot3d(pc1, pc2, pc3, + type = "s", # p, s, l, h, n + #pch = c(1:3), + col = condition.values.color, + size = 1, + xlab = paste0("PC1: ",percentVar[1],"% variance"), + ylab = paste0("PC2: ",percentVar[2],"% variance"), + zlab = paste0("PC3: ",percentVar[3],"% variance"), + cex = 2, + main = "", # -> princomp", +) + +# shift <- matrix(4, 4, 4, byrow = TRUE) +# text3d(shift,texts=1:3) +grid3d(c("x", "y", "z")) + +## add legend +legend3d("right", unique(condition.values), pch = 19, col = unique(condition.values.color)) + +#### video ##### +M <- par3d("userMatrix") +play3d( par3dinterp( userMatrix=list(M, + rotate3d(M, pi/2, 1, 0, 0), + rotate3d(M, pi/2, 0, 1, 0) ) ), + duration=2 ) + +movie3d(spin3d(axis = c(1, 2, 1)), duration = 5, + dir = saveat) + +#### video end #### + +# pc1, pc2 +png(filename=paste0(saveat, "/PCA_RPKM_normalized.png")) +ggplot( + data.dist.pca.frame, + aes( + pc1, + pc2, + color=tissue) +) + + geom_point(size=2.5) + + xlab( + paste0("PC1: ",percentVar[1],"% variance") + ) + + ylab( + paste0("PC2: ",percentVar[2],"% variance") + ) + + #theme() + #, face="bold" + scale_colour_manual( + values= c("blue", "red", "yellow", "green") # dodgerblue3 + ) + + ggtitle("PCA of all samples (RPKM normalized)") + + theme( + plot.title = element_text(lineheight=.8), + panel.background = element_rect(fill = "gray95") + ) +dev.off() + + + + + +######## Some experiments ###### +# for(i in colnames(data)) { +# 
colnames(data)[colnames(data)==grep(i, names(data), value = TRUE)] <- librariesName[[i]] +# } +# +# for(i in colnames(data)) { +# colnames(data)[colnames(data)==grep(i, names(data), value = TRUE)] <- names(librariesName[librariesName==grep(i, librariesName, value = TRUE)]) +# } + diff --git a/workflows/Experiment2/new_deg_pipe_four_methods_noiseqbio_schippers_second_at_20211209.r b/workflows/Experiment2/new_deg_pipe_four_methods_noiseqbio_schippers_second_at_20211209.r new file mode 100644 index 0000000..a57bbb7 --- /dev/null +++ b/workflows/Experiment2/new_deg_pipe_four_methods_noiseqbio_schippers_second_at_20211209.r @@ -0,0 +1,923 @@ +############################################################### +## DEG pipeline with edgeR, DESeq2 and NOISeq # +## Modified from the template of Dr. Kristian Ullrich 2014 # +## @fabianhaas, AG Rensing, Marburg, Jan 2016 # +############################################################### +# The pipeline expects sample triplicates. +# If you don't have three samples of one condition, use face files with zero counts. +# RPKM calculation is optional. +# Additional program to sum up the tsv files: /mnt/nas_temp/home/haasf/Program/sumup_htseq_counts.pl folder file_prefix + +########################################### +## please define your variables # +########################################### + +saveat <- "/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Schippers/second_experiment/At_all_DEGs_0.9" + +file <- '/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Schippers/second_experiment/At_all.p.sort.wo_ncRNA_miRNA.counts' +# file <- '/mnt/Teamdrive/projects/MAdLand/cooperations/Schippers_Lab_RNA-seq_analyses/31315_all.p.sort.counts' + +# Table with the length and the GC content +LenGCTable <- "/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Schippers/second_experiment/Arabidopsis_thaliana.TAIR10.cdna.all.GC.length" +# LenGCTable <- "/mnt/NAS_coruscant_datashare/haasf/GeneAtlas_RNASeq/DEG_2nd_round/cosmoss_V3.3.release_2016-07-22.IsoformV31.fasta.length_GC_Contig_Daten_for_34840_1nd.csv" + +# Column position of the gene name and the width/gc content (first column is gene name!) +myLenPos <- 1 +myGCPos <- 2 + +# +# hoo <- c("","") +# sampleLists <- c() +# for(a in hoo) { +# for(b in hoo) { +# if(a!=b) { +# tmp <- c(a,b) +# sampleLists <- c(sampleLists, list(tmp)) +# } +# } +# } + +sampleLists <- list(c("LCS_24h", "control_24h"), + c("LCS_24h", "control_6h"), + c("LCS_24h", "LCS_6h"), + c("control_24h", "LCS_6h"), + c("control_24h", "control_6h"), + c("LCS_6h", "control_6h") +) + +# Do you want a RPKM value table (1/0) +# Be warned, it takes a while to do this. approx. 10 minutes for 35000 genes and 6 samples. +rpkm <- 0 +# Are the data original HTSeq files? yes <- 1 ; no <- 0 +htseq <- 0 + +# threshold settings for the R packages +#edgeR +pvtedgeR <- 0.001 +#DESeq2 +pvtDESeq2 <- 0.001 +#NOISeq +pvtNOISeq <- 0.9 + +# Output folder names +edgeRDir <- "edgeR" +DESeq2Dir <- "DESeq2" +NOISeqDir <- "NOISeq" + +######################### +## stop editing here!! 
## +######################### + + +########################################### +## executiv function # +########################################### + +workflow <- function(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq) +{ + # setwd(workingFolder) # set workspace + + sampleNameI <- SampleNames[1] + sampleNameII <- SampleNames[2] + + # Comparison name + cName <- paste(sampleNameI, sampleNameII, sep = "_vs_") + + filename <- strsplit(file, "/") + filename <- filename[[1]][length(filename[[1]])] # long but it's working + sampleName <- gsub(", ","_",toString(SampleNames)) + samples <- SampleNames + # create output dir + dir.create(saveat, showWarnings = FALSE) + outdir <- paste0(saveat,"/",sampleName) + dir.create(outdir) + + # Start the log file + sink(paste0(outdir, "/DEGpipe_",cName, ".log")) + + # print parameter into the log file + print("set parameters:") + print(paste0("Counts file:", file)) + print(paste0("Results stored at:", saveat)) + print(paste0("SampleNames: ",SampleNames)) + print(paste0("LenGCTable: ",LenGCTable)) + print(paste0("myLenPos: ",myLenPos)) + print(paste0("myGCPos: ",myGCPos)) + print(paste0("rpkm: ",rpkm)) + print(paste0("htseq: ",htseq)) + print(paste0("pvtedgeR: ",pvtedgeR)) + print(paste0("pvtDESeq2: ",pvtDESeq2)) + print(paste0("pvtNOISeq: ",pvtNOISeq)) + print(paste0("edgeRDir: ",edgeRDir)) + print(paste0("DESeq2Dir: ",DESeq2Dir)) + print(paste0("NOISeqDir: ",NOISeqDir)) + # session and package infos for the log file + print(sessionInfo()) + print(Sys.info()) + + # load spike-in count tables + data0 <- read.table(file, header=T, sep="\t", row.names=1) + # just if the gene ids not ending with .1 (it is nessessery for NOISeqbio iteration step) + #row.names(data0) <- paste0(row.names(data0), ".1") + + # remove HTSeq rows like "__no_feature" + data0 <- data0[ with(data0, !grepl('__', rownames(data0))) , ] + data0 <- data0[ with(data0, !grepl('ERCC', rownames(data0))) , ] + #data0 <- data0[ , with(data0, !grepl('X.1', colnames(data0)))] + #rename col names (library name to roman number) + + # librariesName <- c("M54.fq.M54.fq.single.p.count", "M53.fq.M53.fq.single.p.count", "M52.fq.M52.fq.single.p.count", "M48.fq.M48.fq.single.p.count", "M47.fq.M47.fq.single.p.count", "M46.fq.M46.fq.single.p.count", "M40.fq.M40.fq.single.p.count", "M39.fq.M39.fq.single.p.count", "M38.fq.M38.fq.single.p.count", "M34.fq.M34.fq.single.p.count", "M33.fq.M33.fq.single.p.count", "M32.fq.M32.fq.single.p.count") + # romanNumber <- c("LCS_24h.2", "LCS_24h.1", "LCS_24h", "control_24h.2", "control_24h.1", "control_24h", "LCS_6h.2", "LCS_6h.1", "LCS_6h", "control_6h.2", "control_6h.1", "control_6h") + librariesName <- c("A6.fq.A6.fq.all.p.count", "A5.fq.A5.fq.all.p.count", "A4.fq.A4.fq.all.p.count", "A26.fq.A26.fq.all.p.count", "A25.fq.A25.fq.all.p.count", "A24.fq.A24.fq.all.p.count", "A20.fq.A20.fq.all.p.count", "A19.fq.A19.fq.all.p.count", "A18.fq.A18.fq.all.p.count", "A12.fq.A12.fq.all.p.count", "A11.fq.A11.fq.all.p.count", "A10.fq.A10.fq.all.p.count") + romanNumber <- c("control_6h.2", "control_6h.1", "control_6h", "LCS_24h.2", "LCS_24h.1", "LCS_24h", "control_24h.2", "control_24h.1", "control_24h", "LCS_6h.2", "LCS_6h.1", "LCS_6h") + + + for(i in 1:length(librariesName)) { + colnames(data0)[colnames(data0)==grep(librariesName[i], names(data0), value = TRUE)] <- romanNumber[i] + } + + # creats a list of the column names/samples you would like to compare + SampleNames <- makeSampleList(sampleNameI, 
length(grep(paste0("^",sampleNameI,"[^A-Z]"), colnames(data0))) + 1, + sampleNameII, length(grep(paste0("^",sampleNameII,"[^A-Z]"), colnames(data0))) + 1) + print(SampleNames) + + ## mal anschauen als ersatz für das darüber ## + ## newenw <- table.new[, grepl(paste(LIST,collapse="|"), colnames(table.new))] + + + # exctract the samples from the data0 if needed + if(length(SampleNames) > 0) { + RTableInFiles <- data0[,SampleNames] + data <- data0[,SampleNames] + } + print(head(RTableInFiles)) + + # sort by column name + # data <- data[ , order(names(data))] # dangeros, the order of query and referenz might be not the same + # see how many rows and columns are in the tables + print(paste("Raw data row number: ",nrow(data))) + print(paste("Raw data column number: ",ncol(data))) + + # 2) A typical differential expression analysis workflow + # 2.1) Filtering and exploratory data analysis + # filter: One single gene/spike-in must have at least 5 counts in minimum 2 libraries + filter <- apply(data, 1, function(x) length(x[x>5])>=2) + dataFiltered <- data[filter,] + # see how many rows and columns are left + print(paste("Filtered data row number: ",nrow(dataFiltered))) + print(paste("Filtered data row number: ",ncol(dataFiltered))) + # filter gene names + genes <- rownames(dataFiltered)[grep("^Pp3", rownames(dataFiltered))] + # filter spike-in names + spikes <- rownames(dataFiltered)[grep("^ERCC", rownames(dataFiltered))] + # get the column name list as.factors + xtmp <- colnames(dataFiltered) + x <- split2list(xtmp) + + sampleNameList <- gsub(".[1234567890]+$", "", SampleNames) + columnPos <- makeIndexList(SampleNames) + + # run ... + ### edgeR ### + print(paste0("edgeR is running with ", cName)) + packageDescription("edgeR") + edgeRFunction(outdir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList) + + #### DESeq2 #### + print(paste0("DESeq2 is running with ", cName)) + packageDescription("DESeq2") + DESeq2Function(outdir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList) + + ### NOISeq ### + print(paste0("NOISeq is running with ", cName)) + packageDescription("NOISeq") + NOISeqFunction(outdir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames) + + ### Overlap of all three methods ### + # intersection with DEG direction check => 1, without => 0 + intersection_4.file <- intersection(outdir,1,4) + write.table(intersection_4.file, file = paste0(outdir, "/", cName,"_intersection_four_methods_",nrow(intersection_4.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # edgeR, DESeq2, NOISeq + intersection_3.file <- intersection(outdir,1,3) + write.table(intersection_3.file, file = paste0(outdir, "/", cName,"_intersection_three_methods_",nrow(intersection_3.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + sink() # End of log file + +} + + + + + +#################### +## main functions # +#################### + +plotGraphics <- function(data, librariesName, cName, outdir, romanNumber) +{ + library("DESeq2") + library("ggplot2") + library("RColorBrewer") + library("pheatmap") + library("genefilter") + + # create sample annotation lists + # sample.name.list <- gsub(".[12]", "", librariesName[colnames(data)]) + sample.name.list <- gsub(".[12]", "", romanNumber) + sample.name.uniq <- unique(unlist(sample.name.list, use.names = FALSE)) + + col.data.condition = data.frame(row.names = 
librariesName, condition = sample.name.list) + + # run DESeq2 functions + dds.matrix <- DESeqDataSetFromMatrix(countData = data, colData = col.data.condition, design = ~condition) # data must be integer! (Ganze Zahlen) + colData(dds.matrix)$condition <- factor(colData(dds.matrix)$condition, levels = sample.name.uniq) + dds.matrix.deseq <- DESeq(dds.matrix) + dds.matrix.res <- results(dds.matrix.deseq) + dds.matrix.res <- dds.matrix.res[order(dds.matrix.res$padj), ] + dds.matrix.res.fdr <- dds.matrix.res[!is.na(dds.matrix.res$padj), ] + dds.matrix.res.fdr <- dds.matrix.res.fdr[dds.matrix.res.fdr$padj < 0.0001, ] + dds.matrix.rld <- rlogTransformation(dds.matrix, blind = TRUE) + + # data for PCA plot + pcaData <- plotPCA(dds.matrix.rld, intgroup = "condition", returnData=TRUE) + percentVar <- round(100 * attr(pcaData, "percentVar")) + # color for samples + sample.colors <- c("gray60","#D73027","#4575B4", "purple", "orange","#113FF7","green", "black") + + # plot the PCA + # https://cran.r-project.org/web/packages/ggfortify/vignettes/plot_pca.html + png(filename=paste0(outdir,"PCA_all_samples_DEseq2_normalized.png")) + ggplot( + pcaData, + aes( + PC1, + PC2, + color=condition, + shape=condition) + ) + + scale_shape_manual(values = c(4, 1, 2, 3, 0, 5, 6, 7)) + + geom_point(size=2.5) + + xlab( + paste0("PC1: ",percentVar[1],"% variance") + ) + + ylab( + paste0("PC2: ",percentVar[2],"% variance") + ) + + theme() + #, face="bold" + scale_colour_manual( + values=sample.colors # dodgerblue3 + ) + + ggtitle("PCA of all samples (DESeq2 normalized)") + + theme( + plot.title = element_text(lineheight=.8), + panel.background = element_rect(fill = "gray95") + ) + dev.off() + + # how many genes will be involved in the heatmap calculation + numbGenes <- 100 + # pheatmap data + topVarGenes <- head(order(rowVars(assay(dds.matrix.rld)),decreasing=TRUE),numbGenes) + mat <- assay(dds.matrix.rld)[ topVarGenes, ] + mat <- mat - rowMeans(mat) + # pheatmap data annotation + anno <-as.data.frame(colData(dds.matrix.rld)[,c("condition")]) + rownames(anno) <- colnames(data) # check out the row names of annotation + colnames(anno) <- "condition" + # pheatmap data annotation color + condition <- sample.colors + names(condition) <- sample.name.uniq + anno_colors <- list(condition = condition) + + #heatmap(mat) + #pheatmap(mat) + # ftp://cran.r-project.org/pub/R/web/packages/pheatmap/pheatmap.pdf + png(filename=paste0(outdir,"HC_heatmap_all_samples_DESeq2_normalized.png")) + pheatmap(mat , + annotation = anno, + annotation_colors = anno_colors, + fontsize = 10, + fontsize_row = 6, + fontsize_col = 10, + main = paste0("Heatmap of all samples (DESeq2 normalized) ", numbGenes," genes\n"), + annotation_names_col = F, + show_rownames = F, + cluster_rows = F, + border_color = F, + treeheight_col = 20 + ) + dev.off() +} + + +edgeRFunction <- function(odir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList) +{ + library(edgeR) + + outdir <- paste0(odir, "/",edgeRDir,"/") # set outdir + + dir.create(outdir, showWarnings = FALSE) # create output folder + dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" + + comparison <- cName + counts <- RTableInFiles + + pvt <- pvtedgeR + + group <- factor(sampleNameList) + + # edgeR 'normal' + + y.normal <- DGEList(counts = counts, group = group) + + y.normal[,1] + + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotMDS"))) + plotMDS(y.normal, method = "bcv", main = paste0(cName, "\nedgeR - plotMDS")) + dev.off() + + 
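As an aside for readers of the edgeR "classic" branch above and below: DGEList() expects an integer count matrix (genes in rows, one column per sample) plus a group factor of length ncol(counts). A minimal self-contained sketch with simulated toy data (not part of this pipeline; the sample names merely mimic the renamed libraries) illustrates the call sequence that continues below:

library(edgeR)
set.seed(1)
# 20 toy genes x 6 toy samples, negative-binomial counts
toy.counts <- matrix(rnbinom(120, mu = 50, size = 10), nrow = 20,
                     dimnames = list(paste0("gene", 1:20),
                                     c("LCS_6h", "LCS_6h.1", "LCS_6h.2",
                                       "control_6h", "control_6h.1", "control_6h.2")))
toy.group <- factor(rep(c("LCS_6h", "control_6h"), each = 3))
toy.y <- DGEList(counts = toy.counts, group = toy.group)  # same constructor as y.normal
toy.y <- calcNormFactors(toy.y)                           # TMM normalisation factors
toy.y <- estimateCommonDisp(toy.y)
toy.y <- estimateTagwiseDisp(toy.y)
toy.et <- exactTest(toy.y, pair = c("LCS_6h", "control_6h"))
topTags(toy.et)                                           # columns: logFC, logCPM, PValue, FDR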
y.normal <- calcNormFactors(y.normal) + y.normal <- estimateCommonDisp(y.normal) + y.normal <- estimateTagwiseDisp(y.normal) + + + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV"))) + plotBCV(y.normal, main = paste0(cName, "\nedgeR - plotBCV")) + dev.off() + + et.normal <- exactTest(y.normal, pair = c(sampleNameI, sampleNameII)) + + topTags(et.normal) + # adjust.method ==> character string specifying p-value adjustment method. Possible values are "none", "BH", "fdr" (equivalent to "BH"), "BY" and "holm". See p.adjust for details. + summary(de.normal <- decideTestsDGE(et.normal, p = pvt, adjust = "BH")) + et.normal.fdr <- cbind(et.normal$table, p.adjust(et.normal$table[, 3], method = "BH")) + et.normal.fdr <- et.normal.fdr[et.normal.fdr[, 4] < pvt, ] + + write.table(et.normal.fdr, file = paste0(outdir, cName, "_", nrow(et.normal.fdr), "_edgeR_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + detags.normal <- rownames(de.normal)[as.logical(de.normal)] + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotSmear"))) + plotSmear(et.normal, de.normal, tags = detags.normal, main = paste0(cName, "\nedgeR - plotSmear")) + abline(h = c(-2, 2), col = "blue") + dev.off() + + # edgeR 'GLM' + + y.glm <- DGEList(counts = counts, group = group) + + design <- model.matrix(~0 + group, data = y.glm$samples) + design + + y.glm <- estimateGLMCommonDisp(y.glm, design) + y.glm <- estimateGLMTrendedDisp(y.glm, design) + + ## Loading required package: splines + y.glm <- estimateGLMTagwiseDisp(y.glm, design) + + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV - GLM"))) + plotBCV(y.glm, main = paste0(cName, "\nedgeR - plotBCV - GLM")) + dev.off() + + y.fit <- glmFit(y.glm, design) + y.lrt <- glmLRT(y.fit) + + FDR <- p.adjust(y.lrt$table$PValue, method = "BH") + sum(FDR < pvt) + + topTags(y.lrt) + + summary(de.glm <- decideTestsDGE(y.lrt, p = pvt, adjust = "BH")) + +} + + +DESeq2Function <- function(odir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList) +{ + library(DESeq2) + library(RColorBrewer) + library(gplots) + library(vsn) + + outdir <- paste0(odir, "/",DESeq2Dir,"/") # set outdir + + dir.create(outdir, showWarnings = FALSE) + dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" + + + pvt <- pvtDESeq2 + + + + # DESeqDataSet(se, design, ignoreRank = FALSE) + # DESeqDataSetFromMatrix(countData, colData, design, tidy = FALSE, ignoreRank = FALSE, ...) + # DESeqDataSetFromHTSeqCount(sampleTable, directory = ".", design, ignoreRank = FALSE, ...) + # DESeqDataSetFromTximport(txi, colData, design, ...) 
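The commented constructor list above shows the alternatives to DESeqDataSetFromMatrix(); the matrix constructor is used below because the per-library HTSeq counts were already merged into a single table. For orientation only, a hedged sketch of the DESeqDataSetFromHTSeqCount() route that could be used if the individual HTSeq output files were kept (all file and directory names here are hypothetical, not files from this repository):

library(DESeq2)
# hypothetical sample sheet: column 1 = sample name, column 2 = HTSeq count file,
# remaining columns become colData (here only 'condition')
sampleTable <- data.frame(
  sampleName = c("LCS_6h", "LCS_6h.1", "control_6h", "control_6h.1"),
  fileName   = c("LCS_6h.counts", "LCS_6h.1.counts", "control_6h.counts", "control_6h.1.counts"),
  condition  = factor(c("LCS_6h", "LCS_6h", "control_6h", "control_6h"))
)
# ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
#                                        directory = "path/to/htseq/output",
#                                        design = ~ condition)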
+ coldata = data.frame(row.names = colnames(RTableInFiles), condition = sampleNameList) + + ddsHTSeq <- DESeqDataSetFromMatrix(countData = RTableInFiles, colData = coldata, design = ~ condition) + + colData(ddsHTSeq)$condition <- factor(colData(ddsHTSeq)$condition, levels = c(sampleNameI, sampleNameII)) + + ddsHTSeq <- DESeq(ddsHTSeq) + + ddsHTSeq.res <- results(ddsHTSeq) + ddsHTSeq.res <- ddsHTSeq.res[order(ddsHTSeq.res$padj), ] + head(ddsHTSeq.res) + + # png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - plotMA"))) + # plotMA(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotMA"), ylim = c(-2, 2)) + # dev.off() + + ddsHTSeq.res.fdr <- ddsHTSeq.res[!is.na(ddsHTSeq.res$padj), ] + ddsHTSeq.res.fdr <- ddsHTSeq.res.fdr[ddsHTSeq.res.fdr$padj < pvt, ] + + write.table(as.matrix(ddsHTSeq.res.fdr), file = paste0(outdir, cName, "_", nrow(as.matrix(ddsHTSeq.res.fdr)), "_DESeq2_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + ddsHTSeq.rld <- rlogTransformation(ddsHTSeq, blind = TRUE) + ddsHTSeq.vsd <- varianceStabilizingTransformation(ddsHTSeq, blind = TRUE) + + par(mfrow = c(1, 3)) + notAllZero <- (rowSums(counts(ddsHTSeq)) > 0) + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.1"))) + meanSdPlot(log2(counts(ddsHTSeq, normalized = TRUE)[notAllZero, ] + 1)) # , ylim = c(0, 2.5) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.2"))) # , ylim = c(0, 2.5) + meanSdPlot(assay(ddsHTSeq.rld[notAllZero, ])) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.3"))) # , ylim = c(0, 2.5) + meanSdPlot(assay(ddsHTSeq.vsd[notAllZero, ])) + dev.off() + + hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100) + distRL <- dist(t(assay(ddsHTSeq.rld))) + mat <- as.matrix(distRL) + rownames(mat) <- colnames(mat) <- with(colData(ddsHTSeq), paste0(condition)) + + # png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - heatmap"))) + # heatmap(mat, trace = "none", col = rev(hmcol), margin = c(13, 13)) + # dev.off() + + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - PCA"))) + print(plotPCA(ddsHTSeq.rld, intgroup = "condition")) + dev.off() + + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - DispEsts"))) + plotDispEsts(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotDispEsts")) + dev.off() +} + +NOISeqFunction <- function(odir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames) +{ + library(NOISeq) + + outdir <- paste0(odir, "/",NOISeqDir,"/") # set outdir + + dir.create(outdir, showWarnings = FALSE) + dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" + + pvt <- pvtNOISeq + + counts <- RTableInFiles[, columnPos] + head(counts) + + # must be changed if there are no triplicates + # !! Please change this setting if needed !! 
+ myfactors <- data.frame(Treatment = sampleNameList, TreatmentRun = SampleNames) + # sort counts by row names + mycounts <- counts[order(rownames(counts)), ] + # filter out low counts + mycounts.filter <- filtered.data(mycounts, factor = myfactors$Treatment, norm = FALSE, depth = NULL, method = 1, cv.cutoff = 100, cpm = 1) + mycounts.rowname <- rownames(mycounts.filter) + + # Extra file with length and GC content + myLengthTable <- read.table(LenGCTable, sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names=1) + myLengthTable <- myLengthTable[order(rownames(myLengthTable)),] # sort by rownames + myLengthTable.filter <- myLengthTable[mycounts.rowname,] + + mylength <- myLengthTable.filter[,myLenPos] + names(mylength) <- rownames(myLengthTable.filter) + + mygc <- myLengthTable.filter[,myGCPos] + names(mygc) <- rownames(myLengthTable.filter) + + + + + + mydata <- readData(data = mycounts.filter, length = mylength, gc = mygc, factors = myfactors) + mydata + + mydata.GCbias <- dat(mydata, type = "GCbias") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.1"))) +#-# explo.plot(mydata.GCbias) +#-# dev.off() + + mydata.GCbias.group <- dat(mydata, factor = "Treatment", type = "GCbias") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.2"))) +#-# explo.plot(mydata.GCbias.group) +#-# dev.off() + + show(mydata.GCbias.group) + + mydata.lengthbias <- dat(mydata, type = "lengthbias") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.1"))) +#-# explo.plot(mydata.lengthbias) +#-# dev.off() + + mydata.lengthbias.group <- dat(mydata, factor = "Treatment", type = "lengthbias") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.2"))) +#-# explo.plot(mydata.lengthbias.group) +#-# dev.off() + + show(mydata.lengthbias.group) + + # looks different + mydata.cd <- dat(mydata, type = "cd") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - cd"))) +#-# explo.plot(mydata.cd) +#-# dev.off() + mylength + + myRPKM <- NOISeq::rpkm(assayData(mydata)$exprs) + myUQUA <- NOISeq::uqua(assayData(mydata)$exprs, long = mylength, lc = 0.5, k = 0) + myTMM <- NOISeq::tmm(assayData(mydata)$exprs, long = 1000, lc = 0) + + mynoiseq.rpkm <- noiseq(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") + mynoiseq.nnorm <- noiseq(mydata, k = 0.5, norm = "n", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") + mynoiseq.tmm <- noiseq(mydata, k = 0.5, norm = "tmm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") + + # RPKM + mynoiseq.rpkm.prob <- mynoiseq.rpkm@results[[1]][mynoiseq.rpkm@results[[1]]$prob > pvt, ] + mynoiseq.rpkm.prob <- mynoiseq.rpkm.prob[!is.na(mynoiseq.rpkm.prob$prob), ] + write.table(mynoiseq.rpkm.prob, file = paste0(outdir, cName, "_", nrow(mynoiseq.rpkm.prob), "_NOISeq_rpkm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # none + mynoiseq.nnorm.prob <- mynoiseq.nnorm@results[[1]][mynoiseq.nnorm@results[[1]]$prob > pvt, ] + mynoiseq.nnorm.prob <- mynoiseq.nnorm.prob[!is.na(mynoiseq.nnorm.prob$prob), ] + write.table(mynoiseq.nnorm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.nnorm.prob), "_NOISeq_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # TMM + mynoiseq.tmm.prob <- mynoiseq.tmm@results[[1]][mynoiseq.tmm@results[[1]]$prob > pvt, ] + mynoiseq.tmm.prob <- 
mynoiseq.tmm.prob[!is.na(mynoiseq.tmm.prob$prob), ] + write.table(mynoiseq.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.tmm.prob), "_NOISeq_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + ############ NOISeq bio ################ + + pvtnsb <- 0.995 + a0per_value <- 0.99 + + iterations <- 5 + x5 <- c(sample(10000:99999, iterations, replace=F), 12345) + x5 + + collector.rpkm <- NULL + collector.tmm <- NULL + + + # for (iter in x5) { + # # RPKM + # mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE) + # mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ] + # collector.rpkm <- rbind(collector.rpkm, mynoiseqbio.rpkm.prob) + # #collector <- rbind(collector, mynoiseqbio.rpkm.prob[rownames(mynoiseqbio.rpkm.prob) %in% rownames(collector),]) + # # TMM + # mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE) + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] + # collector.tmm <- rbind(collector.tmm, mynoiseqbio.tmm.prob) + # } + # + # # RPKM + # print(sort(row.names(collector.rpkm))) + # library(stringr) + # mynoiseqbio.rpkm.prob.overlap <- collector.rpkm[str_split_fixed(row.names(collector.rpkm), "\\.",2)[,2]>=iterations+10,] + # row.names(mynoiseqbio.rpkm.prob.overlap) <- substr(row.names(mynoiseqbio.rpkm.prob.overlap),1,str_length(row.names(mynoiseqbio.rpkm.prob.overlap))-1) + # + # write.table(mynoiseqbio.rpkm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob.overlap),"_NOISeqbio_rpkm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # + # #TMM + # print(sort(row.names(collector.tmm))) + # library(stringr) + # mynoiseqbio.tmm.prob.overlap <- collector.tmm[str_split_fixed(row.names(collector.tmm), "\\.",2)[,2]>=iterations+10,] + # row.names(mynoiseqbio.tmm.prob.overlap) <- substr(row.names(mynoiseqbio.tmm.prob.overlap),1,str_length(row.names(mynoiseqbio.tmm.prob.overlap))-1) + # + # write.table(mynoiseqbio.tmm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob.overlap),"_NOISeqbio_tmm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + # # none + # mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm@results[[1]][mynoiseqbio.nnorm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm.prob[!is.na(mynoiseqbio.nnorm.prob$prob), ] + # write.table(mynoiseqbio.nnorm.prob, file = paste0(outdir, cName, "_NOISeqbio_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # # TMM + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] + # write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName, "_NOISeqbio_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + + + + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiorpkm"))) + mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm 
= "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) + dev.off() + + mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ] + mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ] + write.table(mynoiseqbio.rpkm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob),"_NOISeqbio_rpkm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiontmm"))) + mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) + dev.off() + + mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] + mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] + write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob),"_NOISeqbio_tmm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbionnorm"))) + mynoiseqbio.nnorm <- noiseqbio(mydata, k = 0.5, norm = "n", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) + dev.off() + + head(mynoiseqbio.rpkm@results[[1]]) + head(mynoiseqbio.nnorm@results[[1]]) + + mynoiseqbio.rpkm.deg <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = NULL) + + mynoiseqbio.rpkm.deg.up <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "up") + mynoiseqbio.rpkm.deg.down <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "down") + mynoiseqbio.nnorm.deg <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = NULL) + mynoiseqbio.nnorm.deg.up <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "up") + mynoiseqbio.nnorm.deg.down <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "down") + + par(mfrow = c(3, 2)) + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9"))) + DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.8"))) + DE.plot(mynoiseq.rpkm, q = 0.8, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.9"))) + DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.8"))) + DE.plot(mynoiseq.nnorm, q = 0.8, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.9"))) + DE.plot(mynoiseq.tmm, q = 0.9, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.8"))) + DE.plot(mynoiseq.tmm, q = 0.8, graphic = "expr", log.scale = TRUE) + dev.off() + + par(mfrow = c(2, 2)) + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvt))) + DE.plot(mynoiseqbio.rpkm, q = pvt, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvtnsb))) + DE.plot(mynoiseqbio.rpkm, q = pvtnsb, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvt))) + DE.plot(mynoiseqbio.nnorm, q = pvt, graphic = "expr", log.scale = TRUE) + dev.off() + 
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvtnsb))) + DE.plot(mynoiseqbio.nnorm, q = pvtnsb, graphic = "expr", log.scale = TRUE) + dev.off() + + par(mfrow = c(1, 2)) + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9_MD"))) + DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "MD", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorm_0.9_MD"))) + DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "MD", log.scale = TRUE) + dev.off() +} + +########################################### +## additional function for more usability # +########################################### + +calcRPKM <- function(c,n,l) +{ + RPKM <- (10^9* as.numeric(c)) / (as.numeric(n) * as.numeric(l)) + return(RPKM) +} + +RPKMMatrix <- function(InTable,directory,LenGCTable,myLenPos) +{ + myLengthTable <- read.table(paste0(directory,LenGCTable), sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names = 1) + + newtable <- matrix(data = c(1), nrow = nrow(InTable), ncol(InTable), byrow= TRUE) + colnames(newtable) <- colnames(InTable) + rownames(newtable) <- rownames(InTable) + + sumCol <- apply(InTable,2,sum) # list of the all col sums + + for(s in colnames(InTable)) { + for(r in rownames(InTable)) { + w <- InTable[r, s] + len <- myLengthTable[r, myLenPos] + sum <- as.numeric(sumCol[s]) + + newtable[r, s] <- calcRPKM(w, sum, len) + } + } + return(newtable) +} + + +# rmHTSeqLines removes the last five rows +rmHTSeqLines <- function(counts) +{ + counts <- head(counts, -5) + return(counts) +} + + +############### +## functions ## +############### + +# split list to a factor list +split2list <- function(tmp) +{ + xx <- c() + for(i in 1:length(tmp)) { + if(grepl("\\.", tmp[i])) { + foo <- strsplit(tmp[i], '\\.')[[1]] + fooo <- strsplit(foo, " ")[[1]] + xx <- c(xx, fooo) + } else { + xx <- c(xx, tmp[i]) + } + } + return(as.factor(xx)) +} + +# makeGroups +# https://github.com/drisso/RUVSeq/blob/master/R/helper.R +makeGroups <- function(xs) +{ + xs <- factor(xs) + groups <- matrix(-1, nrow = length(levels(xs)), ncol = max(table(xs))) + for (i in 1:length(levels(xs))) { + idxs <- which(xs == levels(xs)[i]) + groups[i,1:length(idxs)] <- idxs + } + return(groups) +} + +# The function makeSampleList creates a list of the columns(samples) you would like to compare +makeSampleList <- function(A, Al, B, Bl) +{ + l <- c() + if(exists("Al", mode = "numeric")) { + l <- makeSampleList.helper(A, Al, l) + l <- makeSampleList.helper(B, Bl, l) + } + return(l) +} +makeSampleList.helper <- function(v,k,l) +{ + for(h in 1:k) { + if(h == 1) { + l <- c(l, v) + } else { + l <- c(l, paste0(v,".",h-1)) + } + } + return(l) +} + +# function creates a list of numbers out of another list. The List is the index of the given list. 
+makeIndexList <- function(l) +{ + n <- 1 + jj <- c() + for(v in l) { + jj <- c(jj, n) + n <- n + 1 + } + return(jj) +} + +intersection <- function(l,check,t) +{ + l.list <- list.files(path = l, pattern = "*(rpkm|edgeR|DESeq2).*tsv", recursive = TRUE) + if(grepl("bio", l.list[4],fixed = TRUE)) { + l.list.noiseq <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1) + l.list.noiseqbio <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1) + } else { + l.list.noiseq <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1) + l.list.noiseqbio <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1) + } + l.list.edgeR <- read.table(paste0(l,"/",l.list[2]), header=T, sep="\t", row.names=1) + l.list.deseq2 <- read.table(paste0(l,"/",l.list[1]), header=T, sep="\t", row.names=1) + + + print("DEGs edgeR") + print(nrow(l.list.edgeR)) + print("DEGs deseq2") + print(nrow(l.list.deseq2)) + print("DEGs noiseq") + print(nrow(l.list.noiseq)) + print("DEGs noiseqbio") + print(nrow(l.list.noiseqbio)) + + interect.list <- "" + interect.list.direct <- "" + + if(t==4) { + # List of overlaps without direction check + interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), intersect(rownames(l.list.noiseqbio), rownames(l.list.deseq2)))) + + print("DEGs intersection without direction check (four methods)") + print(length(interect.list)) + print(interect.list) + print("DEGs intersection with direction check") + + # List of overlaps with direction check + if(length(interect.list)!=0) { + interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, l.list.noiseqbio, interect.list) + print(length(interect.list.direct)) + print(interect.list.direct) + } else { + print("0") + } + } + if(t==3) { + # List of overlaps without direction check + interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), rownames(l.list.deseq2))) + + print("DEGs intersection without direction check (three methods)") + print(length(interect.list)) + print(interect.list) + print("DEGs intersection with direction check") + + # List of overlaps with direction check + if(length(interect.list)!=0) { + # fake data frame with 0 rows => nrow == 0 + df_fake <- data.frame(Date=as.Date(character()), + File=character(), + User=character(), + stringsAsFactors=FALSE) + + interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, df_fake, interect.list) + print(length(interect.list.direct)) + print(interect.list.direct) + } else { + print("0") + } + } + + + if(check == 1){ + return(l.list.noiseq[interect.list.direct,]) + } else { + return(l.list.noiseq[interect.list,]) + } +} + +interDirect <- function(e,d,n,nb,i) +{ + lil <- length(i) + + if(nrow(nb) == 0) { # use noiseq twice to simulate noiseqbio + nbx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) + nby <- paste0(nbx[,"M"],rownames(nbx)) + } else { + nbx <- apply(nb[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) + nby <- paste0(nbx[,"log2FC"],rownames(nbx)) + } + + ex <- apply(e[i,], 2, function(x) {ifelse(x < 0, "-", "+")}) #-#-#-# hier < und > + dx <- apply(d[i,], 2, function(x) {ifelse(x < 0, "-", "+")}) #-#-#-# hier < und > + nx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) # twisted, because edgeR and DESeq2 are calculating vice versa + + ey <- paste0(ex[,"logFC"],rownames(ex)) + dy <- paste0(dx[,"log2FoldChange"],rownames(dx)) + ny <- paste0(nx[,"M"],rownames(nx)) + + interect.list.direct <- intersect(ey, 
intersect(dy, intersect(ny, nby))) + + if(lil>0 && length(interect.list.direct)<=0) { + ex <- apply(e[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) #-#-#-# hier < und > + dx <- apply(d[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) #-#-#-# hier < und > + + ey <- paste0(ex[,"logFC"],rownames(ex)) + dy <- paste0(dx[,"log2FoldChange"],rownames(dx)) + + interect.list.direct <- intersect(ey, intersect(dy, intersect(ny, nby))) + } + + return(substr(interect.list.direct,2,100)) +} + +########################################### +## executiv part of the script # +########################################### +for(SampleNames in sampleLists) { + workflow(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq) +} +print("Thank you for using the DEG pipeline :) \n") + +# fin + +### compare DEGs analysis ### +# used venny and Excel + diff --git a/workflows/Experiment2/new_deg_pipe_four_methods_noiseqbio_schippers_second_mp_20211209.r b/workflows/Experiment2/new_deg_pipe_four_methods_noiseqbio_schippers_second_mp_20211209.r new file mode 100644 index 0000000..086ba3c --- /dev/null +++ b/workflows/Experiment2/new_deg_pipe_four_methods_noiseqbio_schippers_second_mp_20211209.r @@ -0,0 +1,922 @@ +############################################################### +## DEG pipeline with edgeR, DESeq2 and NOISeq # +## Modified from the template of Dr. Kristian Ullrich 2014 # +## @fabianhaas, AG Rensing, Marburg, Jan 2016 # +############################################################### +# The pipeline expects sample triplicates. +# If you don't have three samples of one condition, use face files with zero counts. +# RPKM calculation is optional. +# Additional program to sum up the tsv files: /mnt/nas_temp/home/haasf/Program/sumup_htseq_counts.pl folder file_prefix + +########################################### +## please define your variables # +########################################### + +saveat <- "/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Schippers/second_experiment/Mp_rmdup_DEGs_0.9" + +file <- '/mnt/NAS_coruscant_datashare/haasf/madland_RNA-seq_Schippers/second_experiment/Mp.p.sort.wotRNA.counts' +# file <- '/mnt/Teamdrive/projects/MAdLand/cooperations/Schippers_Lab_RNA-seq_analyses/31315_all.p.sort.counts' + +# Table with the length and the GC content +LenGCTable <- "/mnt/Teamdrive/projects/MAdLand/cooperations/Schippers_Lab_RNA-seq_analyses/MpTak_v6.1r1.mrna.fasta.GC.length" +# LenGCTable <- "/mnt/NAS_coruscant_datashare/haasf/GeneAtlas_RNASeq/DEG_2nd_round/cosmoss_V3.3.release_2016-07-22.IsoformV31.fasta.length_GC_Contig_Daten_for_34840_1nd.csv" + +# Column position of the gene name and the width/gc content (first column is gene name!) +myLenPos <- 1 +myGCPos <- 2 + +# +# hoo <- c("","") +# sampleLists <- c() +# for(a in hoo) { +# for(b in hoo) { +# if(a!=b) { +# tmp <- c(a,b) +# sampleLists <- c(sampleLists, list(tmp)) +# } +# } +# } + +sampleLists <- list(c("LCS_24h", "control_24h"), + c("LCS_24h", "control_6h"), + c("LCS_24h", "LCS_6h"), + c("control_24h", "LCS_6h"), + c("control_24h", "control_6h"), + c("LCS_6h", "control_6h") +) + +# Do you want a RPKM value table (1/0) +# Be warned, it takes a while to do this. approx. 10 minutes for 35000 genes and 6 samples. +rpkm <- 0 +# Are the data original HTSeq files? 
yes <- 1 ; no <- 0 +htseq <- 0 + +# threshold settings for the R packages +#edgeR +pvtedgeR <- 0.001 +#DESeq2 +pvtDESeq2 <- 0.001 +#NOISeq +pvtNOISeq <- 0.9 + +# Output folder names +edgeRDir <- "edgeR" +DESeq2Dir <- "DESeq2" +NOISeqDir <- "NOISeq" + +######################### +## stop editing here!! ## +######################### + + +########################################### +## executiv function # +########################################### + +workflow <- function(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq) +{ + # setwd(workingFolder) # set workspace + + sampleNameI <- SampleNames[1] + sampleNameII <- SampleNames[2] + + # Comparison name + cName <- paste(sampleNameI, sampleNameII, sep = "_vs_") + + filename <- strsplit(file, "/") + filename <- filename[[1]][length(filename[[1]])] # long but it's working + sampleName <- gsub(", ","_",toString(SampleNames)) + samples <- SampleNames + # create output dir + dir.create(saveat, showWarnings = FALSE) + outdir <- paste0(saveat,"/",sampleName) + dir.create(outdir) + + # Start the log file + sink(paste0(outdir, "/DEGpipe_",cName, ".log")) + + # print parameter into the log file + print("set parameters:") + print(paste0("Counts file:", file)) + print(paste0("Results stored at:", saveat)) + print(paste0("SampleNames: ",SampleNames)) + print(paste0("LenGCTable: ",LenGCTable)) + print(paste0("myLenPos: ",myLenPos)) + print(paste0("myGCPos: ",myGCPos)) + print(paste0("rpkm: ",rpkm)) + print(paste0("htseq: ",htseq)) + print(paste0("pvtedgeR: ",pvtedgeR)) + print(paste0("pvtDESeq2: ",pvtDESeq2)) + print(paste0("pvtNOISeq: ",pvtNOISeq)) + print(paste0("edgeRDir: ",edgeRDir)) + print(paste0("DESeq2Dir: ",DESeq2Dir)) + print(paste0("NOISeqDir: ",NOISeqDir)) + # session and package infos for the log file + print(sessionInfo()) + print(Sys.info()) + + # load spike-in count tables + data0 <- read.table(file, header=T, sep="\t", row.names=1) + # just if the gene ids not ending with .1 (it is nessessery for NOISeqbio iteration step) + #row.names(data0) <- paste0(row.names(data0), ".1") + + # remove HTSeq rows like "__no_feature" + data0 <- data0[ with(data0, !grepl('__', rownames(data0))) , ] + data0 <- data0[ with(data0, !grepl('ERCC', rownames(data0))) , ] + #data0 <- data0[ , with(data0, !grepl('X.1', colnames(data0)))] + #rename col names (library name to roman number) + + librariesName <- c("M54.fq.M54.fq.single.p.count", "M53.fq.M53.fq.single.p.count", "M52.fq.M52.fq.single.p.count", "M48.fq.M48.fq.single.p.count", "M47.fq.M47.fq.single.p.count", "M46.fq.M46.fq.single.p.count", "M40.fq.M40.fq.single.p.count", "M39.fq.M39.fq.single.p.count", "M38.fq.M38.fq.single.p.count", "M34.fq.M34.fq.single.p.count", "M33.fq.M33.fq.single.p.count", "M32.fq.M32.fq.single.p.count") +# librariesName <- c("P97", "P96", "P95", "P91", "P90", "P89", "P111", "P110", "P109", "P105", "P104", "P103") + romanNumber <- c("LCS_24h.2", "LCS_24h.1", "LCS_24h", "control_24h.2", "control_24h.1", "control_24h", "LCS_6h.2", "LCS_6h.1", "LCS_6h", "control_6h.2", "control_6h.1", "control_6h") +# romanNumber <- c("24h_LCS.2", "24h_LCS.1", "24h_LCS", "DMSO24_control.2", "DMSO24_control.1", "DMSO24_control", "6h_LCS.2", "6h_LCS.1", "6h_LCS", "DMSO6_control.2", "DMSO6_control.1", "DMSO6_control") + + for(i in 1:length(librariesName)) { + colnames(data0)[colnames(data0)==grep(librariesName[i], names(data0), value = TRUE)] <- romanNumber[i] + } + + # creats a list of the column names/samples you would 
like to compare + SampleNames <- makeSampleList(sampleNameI, length(grep(paste0("^",sampleNameI,"[^A-Z]"), colnames(data0))) + 1, + sampleNameII, length(grep(paste0("^",sampleNameII,"[^A-Z]"), colnames(data0))) + 1) + print(SampleNames) + + ## mal anschauen als ersatz für das darüber ## + ## newenw <- table.new[, grepl(paste(LIST,collapse="|"), colnames(table.new))] + + + # exctract the samples from the data0 if needed + if(length(SampleNames) > 0) { + RTableInFiles <- data0[,SampleNames] + data <- data0[,SampleNames] + } + print(head(RTableInFiles)) + + # sort by column name + # data <- data[ , order(names(data))] # dangeros, the order of query and referenz might be not the same + # see how many rows and columns are in the tables + print(paste("Raw data row number: ",nrow(data))) + print(paste("Raw data column number: ",ncol(data))) + + # 2) A typical differential expression analysis workflow + # 2.1) Filtering and exploratory data analysis + # filter: One single gene/spike-in must have at least 5 counts in minimum 2 libraries + filter <- apply(data, 1, function(x) length(x[x>5])>=2) + dataFiltered <- data[filter,] + # see how many rows and columns are left + print(paste("Filtered data row number: ",nrow(dataFiltered))) + print(paste("Filtered data row number: ",ncol(dataFiltered))) + # filter gene names + genes <- rownames(dataFiltered)[grep("^Pp3", rownames(dataFiltered))] + # filter spike-in names + spikes <- rownames(dataFiltered)[grep("^ERCC", rownames(dataFiltered))] + # get the column name list as.factors + xtmp <- colnames(dataFiltered) + x <- split2list(xtmp) + + sampleNameList <- gsub(".[1234567890]+$", "", SampleNames) + columnPos <- makeIndexList(SampleNames) + + # run ... + ### edgeR ### + print(paste0("edgeR is running with ", cName)) + packageDescription("edgeR") + edgeRFunction(outdir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList) + + #### DESeq2 #### + print(paste0("DESeq2 is running with ", cName)) + packageDescription("DESeq2") + DESeq2Function(outdir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList) + + ### NOISeq ### + print(paste0("NOISeq is running with ", cName)) + packageDescription("NOISeq") + NOISeqFunction(outdir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames) + + ### Overlap of all three methods ### + # intersection with DEG direction check => 1, without => 0 + intersection_4.file <- intersection(outdir,1,4) + write.table(intersection_4.file, file = paste0(outdir, "/", cName,"_intersection_four_methods_",nrow(intersection_4.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # edgeR, DESeq2, NOISeq + intersection_3.file <- intersection(outdir,1,3) + write.table(intersection_3.file, file = paste0(outdir, "/", cName,"_intersection_three_methods_",nrow(intersection_3.file),".tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + sink() # End of log file + +} + + + + + +#################### +## main functions # +#################### + +plotGraphics <- function(data, librariesName, cName, outdir, romanNumber) +{ + library("DESeq2") + library("ggplot2") + library("RColorBrewer") + library("pheatmap") + library("genefilter") + + # create sample annotation lists + # sample.name.list <- gsub(".[12]", "", librariesName[colnames(data)]) + sample.name.list <- gsub(".[12]", "", romanNumber) + sample.name.uniq <- unique(unlist(sample.name.list, use.names = 
FALSE)) + + col.data.condition = data.frame(row.names = librariesName, condition = sample.name.list) + + # run DESeq2 functions + dds.matrix <- DESeqDataSetFromMatrix(countData = data, colData = col.data.condition, design = ~condition) # data must be integer! (Ganze Zahlen) + colData(dds.matrix)$condition <- factor(colData(dds.matrix)$condition, levels = sample.name.uniq) + dds.matrix.deseq <- DESeq(dds.matrix) + dds.matrix.res <- results(dds.matrix.deseq) + dds.matrix.res <- dds.matrix.res[order(dds.matrix.res$padj), ] + dds.matrix.res.fdr <- dds.matrix.res[!is.na(dds.matrix.res$padj), ] + dds.matrix.res.fdr <- dds.matrix.res.fdr[dds.matrix.res.fdr$padj < 0.0001, ] + dds.matrix.rld <- rlogTransformation(dds.matrix, blind = TRUE) + + # data for PCA plot + pcaData <- plotPCA(dds.matrix.rld, intgroup = "condition", returnData=TRUE) + percentVar <- round(100 * attr(pcaData, "percentVar")) + # color for samples + sample.colors <- c("gray60","#D73027","#4575B4", "purple", "orange","#113FF7","green", "black") + + # plot the PCA + # https://cran.r-project.org/web/packages/ggfortify/vignettes/plot_pca.html + png(filename=paste0(outdir,"PCA_all_samples_DEseq2_normalized.png")) + ggplot( + pcaData, + aes( + PC1, + PC2, + color=condition, + shape=condition) + ) + + scale_shape_manual(values = c(4, 1, 2, 3, 0, 5, 6, 7)) + + geom_point(size=2.5) + + xlab( + paste0("PC1: ",percentVar[1],"% variance") + ) + + ylab( + paste0("PC2: ",percentVar[2],"% variance") + ) + + theme() + #, face="bold" + scale_colour_manual( + values=sample.colors # dodgerblue3 + ) + + ggtitle("PCA of all samples (DESeq2 normalized)") + + theme( + plot.title = element_text(lineheight=.8), + panel.background = element_rect(fill = "gray95") + ) + dev.off() + + # how many genes will be involved in the heatmap calculation + numbGenes <- 100 + # pheatmap data + topVarGenes <- head(order(rowVars(assay(dds.matrix.rld)),decreasing=TRUE),numbGenes) + mat <- assay(dds.matrix.rld)[ topVarGenes, ] + mat <- mat - rowMeans(mat) + # pheatmap data annotation + anno <-as.data.frame(colData(dds.matrix.rld)[,c("condition")]) + rownames(anno) <- colnames(data) # check out the row names of annotation + colnames(anno) <- "condition" + # pheatmap data annotation color + condition <- sample.colors + names(condition) <- sample.name.uniq + anno_colors <- list(condition = condition) + + #heatmap(mat) + #pheatmap(mat) + # ftp://cran.r-project.org/pub/R/web/packages/pheatmap/pheatmap.pdf + png(filename=paste0(outdir,"HC_heatmap_all_samples_DESeq2_normalized.png")) + pheatmap(mat , + annotation = anno, + annotation_colors = anno_colors, + fontsize = 10, + fontsize_row = 6, + fontsize_col = 10, + main = paste0("Heatmap of all samples (DESeq2 normalized) ", numbGenes," genes\n"), + annotation_names_col = F, + show_rownames = F, + cluster_rows = F, + border_color = F, + treeheight_col = 20 + ) + dev.off() +} + + +edgeRFunction <- function(odir, edgeRDir, cName, RTableInFiles, pvtedgeR, sampleNameI, sampleNameII, sampleNameList) +{ + library(edgeR) + + outdir <- paste0(odir, "/",edgeRDir,"/") # set outdir + + dir.create(outdir, showWarnings = FALSE) # create output folder + dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" + + comparison <- cName + counts <- RTableInFiles + + pvt <- pvtedgeR + + group <- factor(sampleNameList) + + # edgeR 'normal' + + y.normal <- DGEList(counts = counts, group = group) + + y.normal[,1] + + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotMDS"))) + plotMDS(y.normal, method = "bcv", 
main = paste0(cName, "\nedgeR - plotMDS")) + dev.off() + + y.normal <- calcNormFactors(y.normal) + y.normal <- estimateCommonDisp(y.normal) + y.normal <- estimateTagwiseDisp(y.normal) + + + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV"))) + plotBCV(y.normal, main = paste0(cName, "\nedgeR - plotBCV")) + dev.off() + + et.normal <- exactTest(y.normal, pair = c(sampleNameI, sampleNameII)) + + topTags(et.normal) + # adjust.method ==> character string specifying p-value adjustment method. Possible values are "none", "BH", "fdr" (equivalent to "BH"), "BY" and "holm". See p.adjust for details. + summary(de.normal <- decideTestsDGE(et.normal, p = pvt, adjust = "BH")) + et.normal.fdr <- cbind(et.normal$table, p.adjust(et.normal$table[, 3], method = "BH")) + et.normal.fdr <- et.normal.fdr[et.normal.fdr[, 4] < pvt, ] + + write.table(et.normal.fdr, file = paste0(outdir, cName, "_", nrow(et.normal.fdr), "_edgeR_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + detags.normal <- rownames(de.normal)[as.logical(de.normal)] + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotSmear"))) + plotSmear(et.normal, de.normal, tags = detags.normal, main = paste0(cName, "\nedgeR - plotSmear")) + abline(h = c(-2, 2), col = "blue") + dev.off() + + # edgeR 'GLM' + + y.glm <- DGEList(counts = counts, group = group) + + design <- model.matrix(~0 + group, data = y.glm$samples) + design + + y.glm <- estimateGLMCommonDisp(y.glm, design) + y.glm <- estimateGLMTrendedDisp(y.glm, design) + + ## Loading required package: splines + y.glm <- estimateGLMTagwiseDisp(y.glm, design) + + png(filename=paste0(outdir,"plots/",paste0(cName, " edgeR - plotBCV - GLM"))) + plotBCV(y.glm, main = paste0(cName, "\nedgeR - plotBCV - GLM")) + dev.off() + + y.fit <- glmFit(y.glm, design) + y.lrt <- glmLRT(y.fit) + + FDR <- p.adjust(y.lrt$table$PValue, method = "BH") + sum(FDR < pvt) + + topTags(y.lrt) + + summary(de.glm <- decideTestsDGE(y.lrt, p = pvt, adjust = "BH")) + +} + + +DESeq2Function <- function(odir, DESeq2Dir, cName, RTableInFiles, pvtDESeq2, sampleNameI, sampleNameII, sampleNameList) +{ + library(DESeq2) + library(RColorBrewer) + library(gplots) + library(vsn) + + outdir <- paste0(odir, "/",DESeq2Dir,"/") # set outdir + + dir.create(outdir, showWarnings = FALSE) + dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" + + + pvt <- pvtDESeq2 + + + + # DESeqDataSet(se, design, ignoreRank = FALSE) + # DESeqDataSetFromMatrix(countData, colData, design, tidy = FALSE, ignoreRank = FALSE, ...) + # DESeqDataSetFromHTSeqCount(sampleTable, directory = ".", design, ignoreRank = FALSE, ...) + # DESeqDataSetFromTximport(txi, colData, design, ...) 
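Relating to the adjust.method note in the edgeR block above (the same correction is applied in this copy of the script as in the At one): the pipeline appends p.adjust(..., method = "BH") as a fourth column and keeps rows below pvt. A tiny worked example with made-up p-values shows what the Benjamini-Hochberg correction does, before the DESeq2 data set is built below:

# made-up raw p-values (not pipeline output)
pvals <- c(0.0001, 0.004, 0.03, 0.5)
p.adjust(pvals, method = "BH")
# [1] 0.0004 0.0080 0.0400 0.5000
# each p-value is multiplied by n/rank (4/1, 4/2, 4/3, 4/4) and made monotone,
# so thresholding the adjusted values at pvt controls the false discovery rate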
+ coldata = data.frame(row.names = colnames(RTableInFiles), condition = sampleNameList) + + ddsHTSeq <- DESeqDataSetFromMatrix(countData = RTableInFiles, colData = coldata, design = ~ condition) + + colData(ddsHTSeq)$condition <- factor(colData(ddsHTSeq)$condition, levels = c(sampleNameI, sampleNameII)) + + ddsHTSeq <- DESeq(ddsHTSeq) + + ddsHTSeq.res <- results(ddsHTSeq) + ddsHTSeq.res <- ddsHTSeq.res[order(ddsHTSeq.res$padj), ] + head(ddsHTSeq.res) + + # png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - plotMA"))) + # plotMA(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotMA"), ylim = c(-2, 2)) + # dev.off() + + ddsHTSeq.res.fdr <- ddsHTSeq.res[!is.na(ddsHTSeq.res$padj), ] + ddsHTSeq.res.fdr <- ddsHTSeq.res.fdr[ddsHTSeq.res.fdr$padj < pvt, ] + + write.table(as.matrix(ddsHTSeq.res.fdr), file = paste0(outdir, cName, "_", nrow(as.matrix(ddsHTSeq.res.fdr)), "_DESeq2_fdr.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + ddsHTSeq.rld <- rlogTransformation(ddsHTSeq, blind = TRUE) + ddsHTSeq.vsd <- varianceStabilizingTransformation(ddsHTSeq, blind = TRUE) + + par(mfrow = c(1, 3)) + notAllZero <- (rowSums(counts(ddsHTSeq)) > 0) + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.1"))) + meanSdPlot(log2(counts(ddsHTSeq, normalized = TRUE)[notAllZero, ] + 1)) # , ylim = c(0, 2.5) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.2"))) # , ylim = c(0, 2.5) + meanSdPlot(assay(ddsHTSeq.rld[notAllZero, ])) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - SdPlot.3"))) # , ylim = c(0, 2.5) + meanSdPlot(assay(ddsHTSeq.vsd[notAllZero, ])) + dev.off() + + hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100) + distRL <- dist(t(assay(ddsHTSeq.rld))) + mat <- as.matrix(distRL) + rownames(mat) <- colnames(mat) <- with(colData(ddsHTSeq), paste0(condition)) + + # png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - heatmap"))) + # heatmap(mat, trace = "none", col = rev(hmcol), margin = c(13, 13)) + # dev.off() + + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - PCA"))) + print(plotPCA(ddsHTSeq.rld, intgroup = "condition")) + dev.off() + + png(filename=paste0(outdir,"plots/",paste0(cName, " DESeq2 - DispEsts"))) + plotDispEsts(ddsHTSeq, main = paste0(cName, "\nDESeq2 - plotDispEsts")) + dev.off() +} + +NOISeqFunction <- function(odir, NOISeqDir, cName, RTableInFiles, pvtNOISeq, sampleNameI, sampleNameII, myLenPos, myGCPos, LenGCTable, columnPos, sampleNameList, SampleNames) +{ + library(NOISeq) + + outdir <- paste0(odir, "/",NOISeqDir,"/") # set outdir + + dir.create(outdir, showWarnings = FALSE) + dir.create(paste0(outdir,"plots"), showWarnings = FALSE) # create subfolder "plots" + + pvt <- pvtNOISeq + + counts <- RTableInFiles[, columnPos] + head(counts) + + # must be changed if there are no triplicates + # !! Please change this setting if needed !! 
+ myfactors <- data.frame(Treatment = sampleNameList, TreatmentRun = SampleNames) + # sort counts by row names + mycounts <- counts[order(rownames(counts)), ] + # filter out low counts + mycounts.filter <- filtered.data(mycounts, factor = myfactors$Treatment, norm = FALSE, depth = NULL, method = 1, cv.cutoff = 100, cpm = 1) + mycounts.rowname <- rownames(mycounts.filter) + + # Extra file with length and GC content + myLengthTable <- read.table(LenGCTable, sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names=1) + myLengthTable <- myLengthTable[order(rownames(myLengthTable)),] # sort by rownames + myLengthTable.filter <- myLengthTable[mycounts.rowname,] + + mylength <- myLengthTable.filter[,myLenPos] + names(mylength) <- rownames(myLengthTable.filter) + + mygc <- myLengthTable.filter[,myGCPos] + names(mygc) <- rownames(myLengthTable.filter) + + + + + + mydata <- readData(data = mycounts.filter, length = mylength, gc = mygc, factors = myfactors) + mydata + + mydata.GCbias <- dat(mydata, type = "GCbias") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.1"))) +#-# explo.plot(mydata.GCbias) +#-# dev.off() + + mydata.GCbias.group <- dat(mydata, factor = "Treatment", type = "GCbias") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - GCbias.2"))) +#-# explo.plot(mydata.GCbias.group) +#-# dev.off() + + show(mydata.GCbias.group) + + mydata.lengthbias <- dat(mydata, type = "lengthbias") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.1"))) +#-# explo.plot(mydata.lengthbias) +#-# dev.off() + + mydata.lengthbias.group <- dat(mydata, factor = "Treatment", type = "lengthbias") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - Lenbias.2"))) +#-# explo.plot(mydata.lengthbias.group) +#-# dev.off() + + show(mydata.lengthbias.group) + + # looks different + mydata.cd <- dat(mydata, type = "cd") +#-# png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - cd"))) +#-# explo.plot(mydata.cd) +#-# dev.off() + mylength + + myRPKM <- NOISeq::rpkm(assayData(mydata)$exprs) + myUQUA <- NOISeq::uqua(assayData(mydata)$exprs, long = mylength, lc = 0.5, k = 0) + myTMM <- NOISeq::tmm(assayData(mydata)$exprs, long = 1000, lc = 0) + + mynoiseq.rpkm <- noiseq(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") + mynoiseq.nnorm <- noiseq(mydata, k = 0.5, norm = "n", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") + mynoiseq.tmm <- noiseq(mydata, k = 0.5, norm = "tmm", factor = "Treatment", pnr = 0.2, nss = 5, v = 0.02, lc = 1, replicates = "biological") + + # RPKM + mynoiseq.rpkm.prob <- mynoiseq.rpkm@results[[1]][mynoiseq.rpkm@results[[1]]$prob > pvt, ] + mynoiseq.rpkm.prob <- mynoiseq.rpkm.prob[!is.na(mynoiseq.rpkm.prob$prob), ] + write.table(mynoiseq.rpkm.prob, file = paste0(outdir, cName, "_", nrow(mynoiseq.rpkm.prob), "_NOISeq_rpkm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # none + mynoiseq.nnorm.prob <- mynoiseq.nnorm@results[[1]][mynoiseq.nnorm@results[[1]]$prob > pvt, ] + mynoiseq.nnorm.prob <- mynoiseq.nnorm.prob[!is.na(mynoiseq.nnorm.prob$prob), ] + write.table(mynoiseq.nnorm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.nnorm.prob), "_NOISeq_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # TMM + mynoiseq.tmm.prob <- mynoiseq.tmm@results[[1]][mynoiseq.tmm@results[[1]]$prob > pvt, ] + mynoiseq.tmm.prob <- 
mynoiseq.tmm.prob[!is.na(mynoiseq.tmm.prob$prob), ] + write.table(mynoiseq.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseq.tmm.prob), "_NOISeq_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + ############ NOISeq bio ################ + + pvtnsb <- 0.995 + a0per_value <- 0.99 + + iterations <- 5 + x5 <- c(sample(10000:99999, iterations, replace=F), 12345) + x5 + + collector.rpkm <- NULL + collector.tmm <- NULL + + + # for (iter in x5) { + # # RPKM + # mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm = "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE) + # mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ] + # collector.rpkm <- rbind(collector.rpkm, mynoiseqbio.rpkm.prob) + # #collector <- rbind(collector, mynoiseqbio.rpkm.prob[rownames(mynoiseqbio.rpkm.prob) %in% rownames(collector),]) + # # TMM + # mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = iter, filter = 1)#, plot = TRUE) + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] + # collector.tmm <- rbind(collector.tmm, mynoiseqbio.tmm.prob) + # } + # + # # RPKM + # print(sort(row.names(collector.rpkm))) + # library(stringr) + # mynoiseqbio.rpkm.prob.overlap <- collector.rpkm[str_split_fixed(row.names(collector.rpkm), "\\.",2)[,2]>=iterations+10,] + # row.names(mynoiseqbio.rpkm.prob.overlap) <- substr(row.names(mynoiseqbio.rpkm.prob.overlap),1,str_length(row.names(mynoiseqbio.rpkm.prob.overlap))-1) + # + # write.table(mynoiseqbio.rpkm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob.overlap),"_NOISeqbio_rpkm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # + # #TMM + # print(sort(row.names(collector.tmm))) + # library(stringr) + # mynoiseqbio.tmm.prob.overlap <- collector.tmm[str_split_fixed(row.names(collector.tmm), "\\.",2)[,2]>=iterations+10,] + # row.names(mynoiseqbio.tmm.prob.overlap) <- substr(row.names(mynoiseqbio.tmm.prob.overlap),1,str_length(row.names(mynoiseqbio.tmm.prob.overlap))-1) + # + # write.table(mynoiseqbio.tmm.prob.overlap, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob.overlap),"_NOISeqbio_tmm_prob_seed_iteration_overlap.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + # # none + # mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm@results[[1]][mynoiseqbio.nnorm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.nnorm.prob <- mynoiseqbio.nnorm.prob[!is.na(mynoiseqbio.nnorm.prob$prob), ] + # write.table(mynoiseqbio.nnorm.prob, file = paste0(outdir, cName, "_NOISeqbio_nnorm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + # # TMM + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] + # mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] + # write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName, "_NOISeqbio_tmm_prob.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + + + + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiorpkm"))) + mynoiseqbio.rpkm <- noiseqbio(mydata, k = 0.5, norm 
= "rpkm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) + dev.off() + + mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm@results[[1]][mynoiseqbio.rpkm@results[[1]]$prob > pvtnsb, ] + mynoiseqbio.rpkm.prob <- mynoiseqbio.rpkm.prob[!is.na(mynoiseqbio.rpkm.prob$prob), ] + write.table(mynoiseqbio.rpkm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.rpkm.prob),"_NOISeqbio_rpkm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbiontmm"))) + mynoiseqbio.tmm <- noiseqbio(mydata, k = 0.5, norm = "tmm", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) + dev.off() + + mynoiseqbio.tmm.prob <- mynoiseqbio.tmm@results[[1]][mynoiseqbio.tmm@results[[1]]$prob > pvtnsb, ] + mynoiseqbio.tmm.prob <- mynoiseqbio.tmm.prob[!is.na(mynoiseqbio.tmm.prob$prob), ] + write.table(mynoiseqbio.tmm.prob, file = paste0(outdir, cName,"_", nrow(mynoiseqbio.tmm.prob),"_NOISeqbio_tmm_prob_seed_12345.tsv"), sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE) + + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - noiseqbionnorm"))) + mynoiseqbio.nnorm <- noiseqbio(mydata, k = 0.5, norm = "n", factor = "Treatment", lc = 1, r = 20, adj = 1.5, a0per = a0per_value, random.seed = 12345, filter = 1, plot = TRUE) + dev.off() + + head(mynoiseqbio.rpkm@results[[1]]) + head(mynoiseqbio.nnorm@results[[1]]) + + mynoiseqbio.rpkm.deg <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = NULL) + + mynoiseqbio.rpkm.deg.up <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "up") + mynoiseqbio.rpkm.deg.down <- degenes(mynoiseqbio.rpkm, q = pvtnsb, M = "down") + mynoiseqbio.nnorm.deg <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = NULL) + mynoiseqbio.nnorm.deg.up <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "up") + mynoiseqbio.nnorm.deg.down <- degenes(mynoiseqbio.nnorm, q = pvtnsb, M = "down") + + par(mfrow = c(3, 2)) + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9"))) + DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.8"))) + DE.plot(mynoiseq.rpkm, q = 0.8, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.9"))) + DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorn_0.8"))) + DE.plot(mynoiseq.nnorm, q = 0.8, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.9"))) + DE.plot(mynoiseq.tmm, q = 0.9, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.tmm_0.8"))) + DE.plot(mynoiseq.tmm, q = 0.8, graphic = "expr", log.scale = TRUE) + dev.off() + + par(mfrow = c(2, 2)) + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvt))) + DE.plot(mynoiseqbio.rpkm, q = pvt, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.rpkm_",pvtnsb))) + DE.plot(mynoiseqbio.rpkm, q = pvtnsb, graphic = "expr", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvt))) + DE.plot(mynoiseqbio.nnorm, q = pvt, graphic = "expr", log.scale = TRUE) + dev.off() + 
png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeqbio - DE.nnorm_",pvtnsb))) + DE.plot(mynoiseqbio.nnorm, q = pvtnsb, graphic = "expr", log.scale = TRUE) + dev.off() + + par(mfrow = c(1, 2)) + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.rpkm_0.9_MD"))) + DE.plot(mynoiseq.rpkm, q = 0.9, graphic = "MD", log.scale = TRUE) + dev.off() + png(filename=paste0(outdir,"plots/",paste0(cName, " NOISeq - DE.nnorm_0.9_MD"))) + DE.plot(mynoiseq.nnorm, q = 0.9, graphic = "MD", log.scale = TRUE) + dev.off() +} + +########################################### +## additional function for more usability # +########################################### + +calcRPKM <- function(c,n,l) +{ + RPKM <- (10^9* as.numeric(c)) / (as.numeric(n) * as.numeric(l)) + return(RPKM) +} + +RPKMMatrix <- function(InTable,directory,LenGCTable,myLenPos) +{ + myLengthTable <- read.table(paste0(directory,LenGCTable), sep = "\t", header = TRUE, stringsAsFactor = FALSE, row.names = 1) + + newtable <- matrix(data = c(1), nrow = nrow(InTable), ncol(InTable), byrow= TRUE) + colnames(newtable) <- colnames(InTable) + rownames(newtable) <- rownames(InTable) + + sumCol <- apply(InTable,2,sum) # list of the all col sums + + for(s in colnames(InTable)) { + for(r in rownames(InTable)) { + w <- InTable[r, s] + len <- myLengthTable[r, myLenPos] + sum <- as.numeric(sumCol[s]) + + newtable[r, s] <- calcRPKM(w, sum, len) + } + } + return(newtable) +} + + +# rmHTSeqLines removes the last five rows +rmHTSeqLines <- function(counts) +{ + counts <- head(counts, -5) + return(counts) +} + + +############### +## functions ## +############### + +# split list to a factor list +split2list <- function(tmp) +{ + xx <- c() + for(i in 1:length(tmp)) { + if(grepl("\\.", tmp[i])) { + foo <- strsplit(tmp[i], '\\.')[[1]] + fooo <- strsplit(foo, " ")[[1]] + xx <- c(xx, fooo) + } else { + xx <- c(xx, tmp[i]) + } + } + return(as.factor(xx)) +} + +# makeGroups +# https://github.com/drisso/RUVSeq/blob/master/R/helper.R +makeGroups <- function(xs) +{ + xs <- factor(xs) + groups <- matrix(-1, nrow = length(levels(xs)), ncol = max(table(xs))) + for (i in 1:length(levels(xs))) { + idxs <- which(xs == levels(xs)[i]) + groups[i,1:length(idxs)] <- idxs + } + return(groups) +} + +# The function makeSampleList creates a list of the columns(samples) you would like to compare +makeSampleList <- function(A, Al, B, Bl) +{ + l <- c() + if(exists("Al", mode = "numeric")) { + l <- makeSampleList.helper(A, Al, l) + l <- makeSampleList.helper(B, Bl, l) + } + return(l) +} +makeSampleList.helper <- function(v,k,l) +{ + for(h in 1:k) { + if(h == 1) { + l <- c(l, v) + } else { + l <- c(l, paste0(v,".",h-1)) + } + } + return(l) +} + +# function creates a list of numbers out of another list. The List is the index of the given list. 
+makeIndexList <- function(l) +{ + n <- 1 + jj <- c() + for(v in l) { + jj <- c(jj, n) + n <- n + 1 + } + return(jj) +} + +intersection <- function(l,check,t) +{ + l.list <- list.files(path = l, pattern = "*(rpkm|edgeR|DESeq2).*tsv", recursive = TRUE) + if(grepl("bio", l.list[4],fixed = TRUE)) { + l.list.noiseq <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1) + l.list.noiseqbio <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1) + } else { + l.list.noiseq <- read.table(paste0(l,"/",l.list[4]), header=T, sep="\t", row.names=1) + l.list.noiseqbio <- read.table(paste0(l,"/",l.list[3]), header=T, sep="\t", row.names=1) + } + l.list.edgeR <- read.table(paste0(l,"/",l.list[2]), header=T, sep="\t", row.names=1) + l.list.deseq2 <- read.table(paste0(l,"/",l.list[1]), header=T, sep="\t", row.names=1) + + + print("DEGs edgeR") + print(nrow(l.list.edgeR)) + print("DEGs deseq2") + print(nrow(l.list.deseq2)) + print("DEGs noiseq") + print(nrow(l.list.noiseq)) + print("DEGs noiseqbio") + print(nrow(l.list.noiseqbio)) + + interect.list <- "" + interect.list.direct <- "" + + if(t==4) { + # List of overlaps without direction check + interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), intersect(rownames(l.list.noiseqbio), rownames(l.list.deseq2)))) + + print("DEGs intersection without direction check (four methods)") + print(length(interect.list)) + print(interect.list) + print("DEGs intersection with direction check") + + # List of overlaps with direction check + if(length(interect.list)!=0) { + interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, l.list.noiseqbio, interect.list) + print(length(interect.list.direct)) + print(interect.list.direct) + } else { + print("0") + } + } + if(t==3) { + # List of overlaps without direction check + interect.list <- intersect(rownames(l.list.edgeR), intersect(rownames(l.list.noiseq), rownames(l.list.deseq2))) + + print("DEGs intersection without direction check (three methods)") + print(length(interect.list)) + print(interect.list) + print("DEGs intersection with direction check") + + # List of overlaps with direction check + if(length(interect.list)!=0) { + # fake data frame with 0 rows => nrow == 0 + df_fake <- data.frame(Date=as.Date(character()), + File=character(), + User=character(), + stringsAsFactors=FALSE) + + interect.list.direct <- interDirect(l.list.edgeR, l.list.deseq2, l.list.noiseq, df_fake, interect.list) + print(length(interect.list.direct)) + print(interect.list.direct) + } else { + print("0") + } + } + + + if(check == 1){ + return(l.list.noiseq[interect.list.direct,]) + } else { + return(l.list.noiseq[interect.list,]) + } +} + +interDirect <- function(e,d,n,nb,i) +{ + lil <- length(i) + + if(nrow(nb) == 0) { # use noiseq twice to simulate noiseqbio + nbx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) + nby <- paste0(nbx[,"M"],rownames(nbx)) + } else { + nbx <- apply(nb[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) + nby <- paste0(nbx[,"log2FC"],rownames(nbx)) + } + + ex <- apply(e[i,], 2, function(x) {ifelse(x < 0, "-", "+")}) #-#-#-# hier < und > + dx <- apply(d[i,], 2, function(x) {ifelse(x < 0, "-", "+")}) #-#-#-# hier < und > + nx <- apply(n[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) # twisted, because edgeR and DESeq2 are calculating vice versa + + ey <- paste0(ex[,"logFC"],rownames(ex)) + dy <- paste0(dx[,"log2FoldChange"],rownames(dx)) + ny <- paste0(nx[,"M"],rownames(nx)) + + interect.list.direct <- intersect(ey, 
intersect(dy, intersect(ny, nby))) + + if(lil>0 && length(interect.list.direct)<=0) { + ex <- apply(e[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) #-#-#-# hier < und > + dx <- apply(d[i,], 2, function(x) {ifelse(x > 0, "-", "+")}) #-#-#-# hier < und > + + ey <- paste0(ex[,"logFC"],rownames(ex)) + dy <- paste0(dx[,"log2FoldChange"],rownames(dx)) + + interect.list.direct <- intersect(ey, intersect(dy, intersect(ny, nby))) + } + + return(substr(interect.list.direct,2,100)) +} + +########################################### +## executiv part of the script # +########################################### +for(SampleNames in sampleLists) { + workflow(file, saveat, SampleNames, LenGCTable, myLenPos, myGCPos, rpkm, pvtedgeR, pvtDESeq2, pvtNOISeq, edgeRDir, DESeq2Dir, NOISeqDir, htseq) +} +print("Thank you for using the DEG pipeline :) \n") + +# fin + +### compare DEGs analysis ### +# used venny and Excel + diff --git a/workflows/Experiment2/snp_pipe_at.sge b/workflows/Experiment2/snp_pipe_at.sge new file mode 100644 index 0000000..c0c5cd5 --- /dev/null +++ b/workflows/Experiment2/snp_pipe_at.sge @@ -0,0 +1,13 @@ +#!/bin/sh + + +prefix=$1 +job_mem=$2 +out_path=$3 +nthreads=$4 +species=$5 +inpath=$6 + + + +qsub -N "run_"$prefix -l h_vmem=$job_mem -pe multislot $nthreads -S /bin/bash -e $out_path -o $out_path /vol/agrensing/fhaas/RNA-seq_Schippers/second_samples/no_backup/SNP_pipe_At.sh $nthreads $out_path $prefix $species $inpath diff --git a/workflows/Experiment2/snp_pipe_at.sh b/workflows/Experiment2/snp_pipe_at.sh new file mode 100644 index 0000000..1d09557 --- /dev/null +++ b/workflows/Experiment2/snp_pipe_at.sh @@ -0,0 +1,469 @@ +#!/bin/bash + +#. /etc/profile.d/modules.sh + +export PATH=$PATH:/vol/biotools/bin +export MANPATH=$MANPATH:/vol/biotools/man +export PATH=$PATH:/vol/software/bin + + +# install important R package - lummerland cluster does not have it ... +#echo "install.packages(\"gplots\", repos=\"https://cran.rstudio.com\")" | R --vanilla --no-save +#R CMD INSTALL /vol/agrensing/tools/R_packages/bitops_1.0-6.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/caTools_1.17.1.1.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/gtools_3.8.1.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/gdata_2.18.0.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/gplots_3.0.1.tar.gz + +#functions + +startle () +{ + number_row_reads=`wc -l $fw | awk '{print $1/4}'` + echo "## Start log ##" + echo "Number of row reads: "$number_row_reads +} + +trimmo () + { + # Remove leading low quality or N bases (below quality 3) (LEADING:3) + # Remove trailing low quality or N bases (below quality 3) (TRAILING:3) + # Scan the read with a 4-base wide sliding window, cutting when the average quality per base drops below 15 (SLIDINGWINDOW:4:15) + # HEADCROP: Cut the specified Number of bases from the start of the read + # MINLEN was first set to 50 bp. But there are libraries with read start length of 50 bp. 
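+    # The branches below run Trimmomatic in PE mode when a reverse-read file ($rv) exists and in SE mode otherwise;
+    # surviving read counts are reported and FastQC is run on the trimmed output.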
+ + if [ -e $rv ]; then + java $java_ram -jar /nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads $cpu -phred33 -summary $trimmo_out".trimmo.stats" \ + $fw $rv $trimmo_out".PE.fw" $trimmo_out".SE.fw" $trimmo_out".PE.rv" $trimmo_out".SE.rv" \ + ILLUMINACLIP:/nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/adapters/TruSeq-PE_all.fna:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 HEADCROP:10 MINLEN:30 + + # stats + echo "## Trimmo statistics ##" + printf "trim\t"$trimmo_out".PE.fw\t"; grep -c "^\+$" $trimmo_out".PE.fw" + # fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" $trimmo_out".PE."* + fi + if [ ! -e $rv ]; then + java $java_ram -jar /nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/trimmomatic-0.39.jar SE -threads $cpu -phred33 -summary $trimmo_out".trimmo.stats" \ + $fw $trimmo_out".SE" \ + ILLUMINACLIP:/nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/adapters/TruSeq-PE_all.fna:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 HEADCROP:10 MINLEN:30 + + # stats + echo "## Trimmo statistics ##" + # printf "trim\t"$trimmo_out".SE\t"; grep -c "^\+$" $trimmo_out".SE" + printf "trim\t"$trimmo_out".SE\t"; wc -l $trimmo_out".SE" | awk '{print $1/4}' + # fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" $trimmo_out".SE" + fi + + } + +polyA () + { + # min_len was first set to 50 bp. But there are libraries with read start length of 50 bp. + # If you specify any statistic option, no other ouput will be generated. To preprocess data, do not specify a statistics option. + # -stats_dupl -stats_all + + inpA=$polyA_out".polyA.passed" + infA=$polyA_out".polyA.failed" + + if [ -e $rv ]; then + /nfs/agrensing/tools/prinseq/prinseq-lite-0.20.4/./prinseq-lite.pl -verbose -no_qual_header -fastq $trimmo_out".PE.fw" -fastq2 $trimmo_out".PE.rv" \ + -trim_tail_left 5 -trim_tail_right 5 -min_len 30 -out_good $inpA -out_bad $infA \ + -out_format 3 -log $polyA_out"_polyA.log" + + # stats + echo "## PolyA statistics ##" + printf "polyA\t"$inpA"_1.fastq\t"; wc -l $inpA"_1.fastq" | awk '{print $1/4}' + #fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" $inpA"_"[12]".fastq" + fi + if [ ! -e $rv ]; then + /nfs/agrensing/tools/prinseq/prinseq-lite-0.20.4/./prinseq-lite.pl -verbose -no_qual_header -fastq $trimmo_out".SE" \ + -trim_tail_left 5 -trim_tail_right 5 -min_len 30 -out_good $inpA -out_bad $infA \ + -out_format 3 -log $polyA_out"_polyA.log" + + # stats + echo "## PolyA statistics ##" + printf "polyA\t"$inpA".fastq\t"; grep -c "^\+$" $inpA".fastq" + #fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" $inpA".fastq" + fi + } + +duple () +{ + # number of 100% identical duplicat reads. 
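+    # Each awk call tallies identical sequence lines (NR % 4 == 2), appends a duplication histogram next to
+    # the input FASTQ and prints one line per distinct sequence, so `wc -l` gives the distinct-sequence count;
+    # the PE variant first joins the two mates of a pair (matched by read name) and counts identical pairs.
+    # $(( 100000 * n / number_row_reads ))e-5 does the division in integer arithmetic; printf %.5f then
+    # reports it as a fraction of the raw read count.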
+ echo "Duplication:" + + if [ -e $rv ]; then + + #SE 1 + IN=$polyA_out".polyA.passed" + + num_dup_se1=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_1_dupl_histogram"}}' $IN"_1.fastq" | wc -l` + printf $num_dup_se1"\t"; printf %.5f\\t "$(( 100000 * num_dup_se1 / number_row_reads ))e-5" + #SE 2 + num_dup_se2=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_2_dupl_histogram"}}' $IN"_2.fastq" | wc -l` + printf $num_dup_se2"\t"; printf %.5f\\t "$(( 100000 * num_dup_se2 / number_row_reads ))e-5" + + + #PE + num_dup_pe=`awk -v nof=$IN 'NR % 4 == 1 || NR % 4 == 2 {if(n==1){if(foo[name]==""){c=0}; foo[name]=foo[name]$1; if(c==1){seq[foo[name]]++} else {c=1}; n=0} else if($1~/^@/){split($1,w,"/"); name=w[1]; foo[name]; n=1}} END {for(a in seq){print seq[a]; p[seq[a]]++} for(l in p){print l"\t"p[l] >> nof"_1_2_dupl_histogram"}}' $IN"_1.fastq" $IN"_2.fastq" | wc -l` + printf $num_dup_pe"\t"; printf %.5f\\n "$(( 100000 * num_dup_pe / number_row_reads ))e-5" + + + fi + if [ ! -e $rv ]; then + for IN in $polyA_out".polyA.passed"*; + do + num_dup_se1=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_dupl_histogram"}}' $IN | wc -l` + printf $IN"\t"$num_dup_se1"\t"; printf %.5f\\n "$(( 100000 * num_dup_se1 / number_row_reads ))e-5" + done + fi + + +} + +mappi() +{ + if [ -e $rv ]; then + /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -N 1 -n 7 -A sam --npaths=5 \ + --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \ + --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \ + $polyA_out".polyA.passed_1.fastq" $polyA_out".polyA.passed_2.fastq" 2> $mappi_out".mappi.err" + + unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped.1"` + + fi + if [ ! -e $rv ]; then + /ceph/agrensing/tools/gmap-gsnap/gmap-2021-03-08/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -N 1 -n 5 -A sam --npaths=5 \ + --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \ + --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \ + $polyA_out".polyA.passed.fastq" 2> $mappi_out".mappi.err" + + unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped"` + fi + + + + # mapping statistics + echo "## Pp genome version 3 ##" + echo "Unmapped reads: "$unmapped_reads + +} + +organelli() +{ + if [ -e $rv ]; then + for IN in $mappi_out".genome.concordant_"*; + do + print "Fragments:" + printf $IN"\t"; + with=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Ei "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`; + wo=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Eiv "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`; + awk -v with=$with -v wo=$wo 'BEGIN {print with"\t"wo"\t"with/(wo+with)*100" %"}' ; + done + fi + if [ ! 
-e $rv ]; then + for IN in $mappi_out".genome"*"uniq"*; + do + print "SE reads:" + printf $IN"\t"; + with=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Ei "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`; + wo=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Eiv "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`; + awk -v with=$with -v wo=$wo 'BEGIN {print with"\t"wo"\t"with/(wo+with)*100" %"}' ; + done + + fi + + +} + +mappi_gDNA() +{ + if [ -e $rv ]; then + /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -n 7 -A sam --npaths=5 \ + --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \ + --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \ + $polyA_out".polyA.passed_1.fastq" $polyA_out".polyA.passed_2.fastq" 2> $mappi_out".mappi.err" + + unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped.1"` + fi + + if [ ! -e $rv ]; then + /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -n 5 -A sam --npaths=5 \ + --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \ + --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \ + $polyA_out".polyA.passed.fastq" 2> $mappi_out".mappi.err" + + unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped"` + fi + + # mapping statistics + echo "## Pp genome version 3 ##" + echo "Unmapped reads: "$unmapped_reads +} + + +sample () +{ + # if PE use ".genome"*[tcqg] + for IN in $mappi_out".genome"*[tcqg] + do + printf $IN"\tsort.bam\n" >&2 + /vol/agrensing/tools/samtools/samtools-1.12/samtools view -@ $cpu -bhS $IN | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G - -o $IN".sort.bam" + + printf $IN"\trmdup.bam\n" >&2 + grep -vE "chloro|ribo|mito|ERCC" $IN | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G -n -O BAM - | \ + /vol/agrensing/tools/samtools/samtools-1.12/samtools fixmate -@ $cpu -m - - | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G -O BAM - | \ + /vol/agrensing/tools/samtools/samtools-1.12/samtools markdup -@ $cpu -s -r - \ + $IN".filter.fixmate.sort.markdup_r.bam" + + # samtools statistics + v_tmp=`awk '$1!~"@" {print $1}' $IN | sort | uniq | wc -l` + printf "Number of mapped reads: "$IN"\t"$v_tmp"\n" + + # compress the sam text files + # gzip $IN + # rm sam file + rm $IN + + + # bam index + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $IN*"_r.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $IN".sort.bam" + + done + + ls -l $mappi_out*"bam" >&2 + + # merge all unique bam files + echo "merging all .uniq bam files" >&2 + echo "merging all .uniq bam files" + /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.bam" $mappi_out*"uniq"*".sort.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.bam" + + /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.rmdup.bam" $mappi_out*"uniq"*"_r.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.rmdup.bam" +} + +counti() +{ + if [ -e $rv ]; then + # read count v3.3 and ERCC combined + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a $gff_file \ + -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.pB.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v51_rmdup.exon.pB.log" + 
/vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a $gff_file \ + -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.p.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v51_rmdup.exon.p.log" + + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a $gff_file \ + -t exon -g Parent -o $counti_out"_all_v3.3_31315_ercc.exon.pB.txt" $mappi_out".uniq.merged.bam" 2> $counti_out"_v51_all.exon.pB.log" + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a $gff_file \ + -t exon -g Parent -o $counti_out"_all_v3.3_31315_ercc.exon.p.txt" $mappi_out".uniq.merged.bam" 2> $counti_out"_v51_all.exon.p.log" + + # statistics + printf "\n### FeatureCount statisitcs v3.3 ###\n" + # 32458 + echo "Fragment counts on gene models exon PE rmdup:" + grep "Successfully assigned fragments" $counti_out"_v51_rmdup.exon.pB.log" + echo "Fragment counts on gene models exon PE+SE rmdup:" + grep "Successfully assigned fragments" $counti_out"_v51_rmdup.exon.p.log" + + # 31315 no rmdup + echo "Fragment counts on gene models exon PE (no rmdup):" + grep "Successfully assigned fragments" $counti_out"_v51_all.exon.pB.log" + echo "Fragment counts on gene models exon PE+SE (no rmdup):" + grep "Successfully assigned fragments" $counti_out"_v51_all.exon.p.log" + + fi + if [ ! -e $rv ]; then + echo "This part is not working yet" +# /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \ +# -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.SE.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.SE.log" +# /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \ +# -t exon -g Parent -o $counti_out"_v3.3_31315_ercc.exon.SE.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_31315_ercc.exon.SE.log" +# +# # statistics +# printf "\n### FeatureCount statisitcs v3.3 ###" +# # 32458 +# echo "Fragment counts on gene models exon SE 32458:" +# grep "Successfully assigned " $counti_out"_v3.3_32458_ercc.exon.SE.log" +# # 31315 +# echo "Fragment counts on gene models exon SE 31315:" +# grep "Successfully assigned " $counti_out"_v3.3_31315_ercc.exon.SE.log" + + fi +} + +depthi() +{ + /vol/agrensing/tools/samtools/samtools-1.12/samtools depth -aa $mappi_out".uniq.merged.rmdup.bam" > $mappi_out".uniq.merged.rmdup.depth" +} + + +snpi() +{ + gatkrun='/vol/agrensing/tools/gatk/gatk-4.2.0.0/gatk --java-options -Xmx50G' + threading_option='--native-pair-hmm-threads 8' + # V3 + # start the SNP detection + $gatkrun HaplotypeCaller $threading_option -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -ploidy 1 -O $snpi_out".HaplotypeCaller.vcf" 2> $snpi_out".HaplotypeCaller.err" + + $gatkrun SelectVariants -R $reffna --variant $snpi_out".HaplotypeCaller.vcf" -select-type INDEL -O $snpi_out".SelectVariants.vcf" 2> $snpi_out".SelectVariants.err" + + # HQ SNPs + python2.7 /vol/agrensing/fhaas/GeneAtlas/SNP/GetHighQualVcfs.py -i $snpi_out".HaplotypeCaller.vcf" -o $output_dir"/05_SNP_calling/"$species"/"$prefix"/" --ploidy 1 --percentile 90 --GQ 90 2> $snpi_out".GetHighQualVcfs.HQ.err" + +# mv $snpi_out".GetHighQualVcfs.HQ.vcf" $snpi_out".GetHighQualVcfs.V3.HQ.vcf" + + $gatkrun IndexFeatureFile -I $snpi_out".GetHighQualVcfs.HQ.vcf" 2> $snpi_out".GetHighQualVcfs.HQ.idx.err" + + $gatkrun BaseRecalibrator -R $reffna -I 
$mappi_out".uniq.merged.rmdup.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.table" 2> $snpi_out".BaseRecalibrator.err" + + $gatkrun ApplyBQSR -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" --bqsr-recal-file $snpi_out".BaseRecalibrator.table" -O $snpi_out".BaseRecalibrator.ApplyBQSR.bam" \ + 2> $snpi_out".BaseRecalibrator.ApplyBQSR.err" + + $gatkrun BaseRecalibrator -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.recal.table" \ + 2> $snpi_out".BaseRecalibrator.recal.err" + + $gatkrun AnalyzeCovariates -before $snpi_out".BaseRecalibrator.table" -after $snpi_out".BaseRecalibrator.recal.table" -verbosity DEBUG -csv $snpi_out".AnalyzeCovariates.calibration-report.csv" \ + -plots $snpi_out".AnalyzeCovariates.calibration-report.QC.pdf" 2> $snpi_out".AnalyzeCovariates.calibration-report.err" + + $gatkrun PrintReads -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -O $snpi_out".PrintReads.bam" 2> $snpi_out".PrintReads.err" + + $gatkrun PrintReads -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -O $snpi_out".PrintReads.ApplyBQSR.bam" 2> $snpi_out".PrintReads.ApplyBQSR.err" + + # HaplotypeCaller and VariantFiltration w/o ApplyBQSR BAM file + $gatkrun HaplotypeCaller $threading_option -R $reffna -I $snpi_out".PrintReads.bam" -ploidy 1 -O $snpi_out".PrintReads.HaplotypeCaller.vcf" 2> $snpi_out".PrintReads.HaplotypeCaller.err" + + $gatkrun HaplotypeCaller $threading_option -R $reffna -I $snpi_out".PrintReads.ApplyBQSR.bam" -ploidy 1 -O $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.vcf" \ + 2> $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.err" + + # filter flags ## NOTE: it is important to look for float and integer numbers (e.g. 
FS must be 60.0 not 60) ## + # --filter-expression "FS > 60" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" + # --filter-name "LowMQ" --filter-expression "QUAL > 30.0 && QUAL < 60.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" + # + $gatkrun VariantFiltration -R $reffna --variant $snpi_out".PrintReads.HaplotypeCaller.vcf" \ + --filter-expression "FS > 60.0" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" \ + --filter-name "LowMQ" --filter-expression "QUAL > 30.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" \ + -O $snpi_out".VariantFiltration.vcf" 2> $snpi_out".VariantFiltration.err" + + $gatkrun VariantFiltration -R $reffna --variant $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.vcf" \ + --filter-expression "FS > 60.0" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" \ + --filter-name "LowMQ" --filter-expression "QUAL > 30.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" \ + -O $snpi_out".ApplyBQSR.VariantFiltration.vcf" 2> $snpi_out".ApplyBQSR.VariantFiltration.err" +} + +corri() +{ + for IN in $mappi_out*.gz + do + gunzip $IN + file=`echo $IN | awk '{print substr($1, 1, length($1)-3)}'` + /vol/agrensing/tools/samtools/samtools-1.12/samtools view -@ $cpu -bhS $file | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G - -o $file".sort.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $file".sort.bam" + gzip $file + + done + /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.bam" $mappi_out*"uniq"*".sort.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.bam" + + + + gatkrun='/ceph/agrensing/tools/gatk/gatk-4.2.0.0/gatk --java-options -Xmx50G' + threading_option='--native-pair-hmm-threads 8' + $gatkrun BaseRecalibrator -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.recal.table" \ + 2> $snpi_out".BaseRecalibrator.recal.err" + + $gatkrun AnalyzeCovariates -before $snpi_out".BaseRecalibrator.table" -after $snpi_out".BaseRecalibrator.recal.table" -verbosity DEBUG -csv $snpi_out".AnalyzeCovariates.calibration-report.csv" \ + -plots $snpi_out".AnalyzeCovariates.calibration-report.QC.pdf" 2> $snpi_out".AnalyzeCovariates.calibration-report.err" + + +} + + +# script input variables + +cpu=$1 +output_dir=$2 +prefix=$3 +species=$4 +infile_path=$5 + +# set variables +java_ram="-Xmx50G -Xms25G" +id=$prefix + +fw=$infile_path"/fw."$prefix +rv=$infile_path"/rv."$prefix + +db_path_V3="/vol/agrensing/fhaas/DB/Arabidopsis_TAIR10_chloro_mito/Arabidopsis_TAIR10_chloro_mito" +db_name_V3="Arabidopsis_TAIR10_chloro_mito" + +reffna="/ceph/agrensing/fhaas/DB/Arabidopsis_TAIR10_chloro_mito/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa" +gff_file="/ceph/agrensing/fhaas/DB/Arabidopsis_TAIR10_chloro_mito/Arabidopsis_thaliana.TAIR10.51.rmiso.gff3" + + +# outdir variables +# output=$output_dir"/"$prefix +trimmo_out=$output_dir"/02_trimmomatic/"$species"/"$prefix"/"$prefix +polyA_out=$output_dir"/03_poly-A/"$species"/"$prefix"/"$prefix +mappi_out=$output_dir"/04_genome_mapping/"$species"/"$prefix"/"$prefix +snpi_out=$output_dir"/05_SNP_calling/"$species"/"$prefix"/"$prefix 
+counti_out=$output_dir"/06_counts/"$species"/"$prefix"/"$prefix + +# create result folder +mkdir -p $output_dir"/02_trimmomatic/"$species"/"$prefix +mkdir -p $output_dir"/03_poly-A/"$species"/"$prefix +mkdir -p $output_dir"/04_genome_mapping/"$species"/"$prefix +mkdir -p $output_dir"/05_SNP_calling/"$species"/"$prefix +mkdir -p $output_dir"/06_counts/"$species"/"$prefix + +mkdir -p $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" +mkdir -p $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" + + + + + +# print running machine and start date +hostname +date + +# print all variables +printf $cpu"\n"$output_dir"\n"$prefix"\n"$species"\n"$fw"\n" + +if [ -e $rv ]; then + printf $rv"\n" +fi + +printf "\'$java_ram\'""\n"$id"\n"$db_path_V3"\n"$db_name_V3"\n" + +echo "" + +echo "Phase 0 assamble data" +startle + +date +echo "Phase 1 run trimming" +trimmo + +date +echo "Phase 2 run polyA" +polyA +duple +date +echo "Phase 3 run mapping" +mappi +#-# mappi_gDNA +organelli +sample +counti +depthi +date +echo "Phase 4 run SNP calling" +snpi +#-# corri +date + diff --git a/workflows/Experiment2/snp_pipe_mp.sge b/workflows/Experiment2/snp_pipe_mp.sge new file mode 100644 index 0000000..9bb7813 --- /dev/null +++ b/workflows/Experiment2/snp_pipe_mp.sge @@ -0,0 +1,13 @@ +#!/bin/sh + + +prefix=$1 +job_mem=$2 +out_path=$3 +nthreads=$4 +species=$5 +inpath=$6 + + + +qsub -N "run_"$prefix -l h_vmem=$job_mem -pe multislot $nthreads -S /bin/bash -e $out_path -o $out_path /vol/agrensing/fhaas/RNA-seq_Schippers/second_samples/no_backup/SNP_pipe_Mp.sh $nthreads $out_path $prefix $species $inpath diff --git a/workflows/Experiment2/snp_pipe_mp.sh b/workflows/Experiment2/snp_pipe_mp.sh new file mode 100644 index 0000000..8fda4f6 --- /dev/null +++ b/workflows/Experiment2/snp_pipe_mp.sh @@ -0,0 +1,480 @@ +#!/bin/bash + +#. /etc/profile.d/modules.sh + +export PATH=$PATH:/vol/biotools/bin +export MANPATH=$MANPATH:/vol/biotools/man +export PATH=$PATH:/vol/software/bin + + +# install important R package - lummerland cluster does not have it ... +#echo "install.packages(\"gplots\", repos=\"https://cran.rstudio.com\")" | R --vanilla --no-save +#R CMD INSTALL /vol/agrensing/tools/R_packages/bitops_1.0-6.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/caTools_1.17.1.1.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/gtools_3.8.1.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/gdata_2.18.0.tar.gz +#R CMD INSTALL /vol/agrensing/tools/R_packages/gplots_3.0.1.tar.gz + +#functions + +startle () +{ + number_row_reads=`wc -l $fw | awk '{print $1/4}'` + echo "## Start log ##" + echo "Number of row reads: "$number_row_reads +} + +trimmo () + { + # Remove leading low quality or N bases (below quality 3) (LEADING:3) + # Remove trailing low quality or N bases (below quality 3) (TRAILING:3) + # Scan the read with a 4-base wide sliding window, cutting when the average quality per base drops below 15 (SLIDINGWINDOW:4:15) + # HEADCROP: Cut the specified Number of bases from the start of the read + # MINLEN was first set to 50 bp. But there are libraries with read start length of 50 bp. 
+ + if [ -e $rv ]; then + java $java_ram -jar /nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads $cpu -phred33 -summary $trimmo_out".trimmo.stats" \ + $fw $rv $trimmo_out".PE.fw" $trimmo_out".SE.fw" $trimmo_out".PE.rv" $trimmo_out".SE.rv" \ + ILLUMINACLIP:/nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/adapters/TruSeq-PE_all.fna:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 HEADCROP:10 MINLEN:30 + + # stats + echo "## Trimmo statistics ##" + printf "trim\t"$trimmo_out".PE.fw\t"; grep -c "^\+$" $trimmo_out".PE.fw" + # fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" $trimmo_out".PE."* + fi + if [ ! -e $rv ]; then + java $java_ram -jar /nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/trimmomatic-0.39.jar SE -threads $cpu -phred33 -summary $trimmo_out".trimmo.stats" \ + $fw $trimmo_out".SE" \ + ILLUMINACLIP:/nfs/agrensing/tools/trimmomatic/Trimmomatic-0.39/adapters/TruSeq-PE_all.fna:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 HEADCROP:10 MINLEN:30 + + # stats + echo "## Trimmo statistics ##" + # printf "trim\t"$trimmo_out".SE\t"; grep -c "^\+$" $trimmo_out".SE" + printf "trim\t"$trimmo_out".SE\t"; wc -l $trimmo_out".SE" | awk '{print $1/4}' + # fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" $trimmo_out".SE" + fi + + } + +polyA () + { + # min_len was first set to 50 bp. But there are libraries with read start length of 50 bp. + # If you specify any statistic option, no other ouput will be generated. To preprocess data, do not specify a statistics option. + # -stats_dupl -stats_all + + inpA=$polyA_out".polyA.passed" + infA=$polyA_out".polyA.failed" + + if [ -e $rv ]; then + /nfs/agrensing/tools/prinseq/prinseq-lite-0.20.4/./prinseq-lite.pl -verbose -no_qual_header -fastq $trimmo_out".PE.fw" -fastq2 $trimmo_out".PE.rv" \ + -trim_tail_left 5 -trim_tail_right 5 -min_len 30 -out_good $inpA -out_bad $infA \ + -out_format 3 -log $polyA_out"_polyA.log" + + # stats + echo "## PolyA statistics ##" + printf "polyA\t"$inpA"_1.fastq\t"; wc -l $inpA"_1.fastq" | awk '{print $1/4}' + #fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" $inpA"_"[12]".fastq" + fi + if [ ! -e $rv ]; then + /nfs/agrensing/tools/prinseq/prinseq-lite-0.20.4/./prinseq-lite.pl -verbose -no_qual_header -fastq $trimmo_out".SE" \ + -trim_tail_left 5 -trim_tail_right 5 -min_len 30 -out_good $inpA -out_bad $infA \ + -out_format 3 -log $polyA_out"_polyA.log" + + # stats + echo "## PolyA statistics ##" + printf "polyA\t"$inpA".fastq\t"; grep -c "^\+$" $inpA".fastq" + #fastqc + /nfs/agrensing/tools/FastQC/FastQC_v0.11.7/fastqc -f fastq -t $cpu -o $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" $inpA".fastq" + fi + } + +duple () +{ + # number of 100% identical duplicat reads. 
+ echo "Duplication:" + + if [ -e $rv ]; then + + #SE 1 + IN=$polyA_out".polyA.passed" + + num_dup_se1=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_1_dupl_histogram"}}' $IN"_1.fastq" | wc -l` + printf $num_dup_se1"\t"; printf %.5f\\t "$(( 100000 * num_dup_se1 / number_row_reads ))e-5" + #SE 2 + num_dup_se2=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_2_dupl_histogram"}}' $IN"_2.fastq" | wc -l` + printf $num_dup_se2"\t"; printf %.5f\\t "$(( 100000 * num_dup_se2 / number_row_reads ))e-5" + + + #PE + num_dup_pe=`awk -v nof=$IN 'NR % 4 == 1 || NR % 4 == 2 {if(n==1){if(foo[name]==""){c=0}; foo[name]=foo[name]$1; if(c==1){seq[foo[name]]++} else {c=1}; n=0} else if($1~/^@/){split($1,w,"/"); name=w[1]; foo[name]; n=1}} END {for(a in seq){print seq[a]; p[seq[a]]++} for(l in p){print l"\t"p[l] >> nof"_1_2_dupl_histogram"}}' $IN"_1.fastq" $IN"_2.fastq" | wc -l` + printf $num_dup_pe"\t"; printf %.5f\\n "$(( 100000 * num_dup_pe / number_row_reads ))e-5" + + + fi + if [ ! -e $rv ]; then + for IN in $polyA_out".polyA.passed"*; + do + num_dup_se1=`awk -v name=$IN 'NR % 4 == 2 {foo[$1]++} END {for(o in foo){print o"\t"foo[o]; p[foo[o]]++} for(l in p){print l"\t"p[l] >> name"_dupl_histogram"}}' $IN | wc -l` + printf $IN"\t"$num_dup_se1"\t"; printf %.5f\\n "$(( 100000 * num_dup_se1 / number_row_reads ))e-5" + done + fi + + +} + +mappi() +{ + if [ -e $rv ]; then + /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -N 1 -n 7 -A sam --npaths=5 \ + --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \ + --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \ + $polyA_out".polyA.passed_1.fastq" $polyA_out".polyA.passed_2.fastq" 2> $mappi_out".mappi.err" + + unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped.1"` + + fi + if [ ! -e $rv ]; then + /ceph/agrensing/tools/gmap-gsnap/gmap-2021-03-08/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -N 1 -n 5 -A sam --npaths=5 \ + --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \ + --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \ + $polyA_out".polyA.passed.fastq" 2> $mappi_out".mappi.err" + + unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped"` + fi + + + + # mapping statistics + echo "## Pp genome version 3 ##" + echo "Unmapped reads: "$unmapped_reads + +} + +organelli() +{ + if [ -e $rv ]; then + for IN in $mappi_out".genome.concordant_"*; + do + print "Fragments:" + printf $IN"\t"; + with=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Ei "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`; + wo=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Eiv "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`; + awk -v with=$with -v wo=$wo 'BEGIN {print with"\t"wo"\t"with/(wo+with)*100" %"}' ; + done + fi + if [ ! 
-e $rv ]; then + for IN in $mappi_out".genome"*"uniq"*; + do + print "SE reads:" + printf $IN"\t"; + with=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Ei "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`; + wo=`/ceph/agrensing/tools/samtools/samtools-1.12/samtools view $IN | grep -Eiv "chloro|mito|ribo" | awk '{print $1}' | sort | uniq | wc -l`; + awk -v with=$with -v wo=$wo 'BEGIN {print with"\t"wo"\t"with/(wo+with)*100" %"}' ; + done + + fi + + +} + +mappi_gDNA() +{ + if [ -e $rv ]; then + /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -n 7 -A sam --npaths=5 \ + --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \ + --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \ + $polyA_out".polyA.passed_1.fastq" $polyA_out".polyA.passed_2.fastq" 2> $mappi_out".mappi.err" + + unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped.1"` + fi + + if [ ! -e $rv ]; then + /homes/fhaas/src/gsnap -D $db_path_V3 -d $db_name_V3 -t $cpu -n 5 -A sam --npaths=5 \ + --read-group-platform=illumina --read-group-id=RGUNKNOWN --read-group-name=$id --read-group-library=LBUNKNOWN \ + --failed-input=$mappi_out".unmapped" --split-output=$mappi_out".genome" \ + $polyA_out".polyA.passed.fastq" 2> $mappi_out".mappi.err" + + unmapped_reads=`grep -c "^\+$" $mappi_out".unmapped"` + fi + + # mapping statistics + echo "## Pp genome version 3 ##" + echo "Unmapped reads: "$unmapped_reads +} + + +sample () +{ + # if PE use ".genome"*[tcqg] + for IN in $mappi_out".genome"*[tcqg] + do + printf $IN"\tsort.bam\n" >&2 + /vol/agrensing/tools/samtools/samtools-1.12/samtools view -@ $cpu -bhS $IN | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G - -o $IN".sort.bam" + + printf $IN"\trmdup.bam\n" >&2 + grep -vE "chloro|ribo|mito|ERCC" $IN | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G -n -O BAM - | \ + /vol/agrensing/tools/samtools/samtools-1.12/samtools fixmate -@ $cpu -m - - | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G -O BAM - | \ + /vol/agrensing/tools/samtools/samtools-1.12/samtools markdup -@ $cpu -s -r - \ + $IN".filter.fixmate.sort.markdup_r.bam" + + # samtools statistics + v_tmp=`awk '$1!~"@" {print $1}' $IN | sort | uniq | wc -l` + printf "Number of mapped reads: "$IN"\t"$v_tmp"\n" + + # compress the sam text files + # gzip $IN + # rm sam file + rm $IN + + + # bam index + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $IN*"_r.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $IN".sort.bam" + + done + + ls -l $mappi_out*"bam" >&2 + + # merge all unique bam files + echo "merging all .uniq bam files" >&2 + echo "merging all .uniq bam files" + /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.bam" $mappi_out*"uniq"*".sort.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.bam" + + /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.rmdup.bam" $mappi_out*"uniq"*"_r.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.rmdup.bam" +} + +counti() +{ + if [ -e $rv ]; then + # read count v3.3 and ERCC combined +# /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \ +# -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.pB.txt" 
$mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.pB.log" +# /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \ +# -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.p.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.p.log" + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/DB/Marchantia_polymorpha/MpTak_v6.1r1_iso1.gff \ + -t exon -g Parent -o $counti_out"_v6.1.exon.pB.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v6.1.exon.pB.log" + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/DB/Marchantia_polymorpha/MpTak_v6.1r1_iso1.gff \ + -t exon -g Parent -o $counti_out"_v6.1.exon.p.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v6.1.exon.p.log" + + + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -B -C -T $cpu -a /vol/agrensing/fhaas/DB/Marchantia_polymorpha/MpTak_v6.1r1_iso1.gff \ + -t exon -g Parent -o $counti_out"_all_v6.1.exon.pB.txt" $mappi_out".uniq.merged.bam" 2> $counti_out"_all_v6.1.exon.pB.log" + /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -p -C -T $cpu -a /vol/agrensing/fhaas/DB/Marchantia_polymorpha/MpTak_v6.1r1_iso1.gff \ + -t exon -g Parent -o $counti_out"_all_v6.1.exon.p.txt" $mappi_out".uniq.merged.bam" 2> $counti_out"_all_v6.1.exon.p.log" + + + + # statistics + printf "\n### FeatureCount statisitcs v6.1 ###\n" + # 32458 +# echo "Fragment counts on gene models exon PE 32458:" +# grep "Successfully assigned fragments" $counti_out"_v3.3_32458_ercc.exon.pB.log" +# echo "Fragment counts on gene models exon PE+SE 32458:" +# grep "Successfully assigned fragments" $counti_out"_v3.3_32458_ercc.exon.p.log" + # results + echo "Fragment counts on gene models exon PE:" + grep "Successfully assigned fragments" $counti_out"_v6.1.exon.pB.log" + echo "Fragment counts on gene models exon PE+SE:" + grep "Successfully assigned fragments" $counti_out"_v6.1.exon.p.log" + # results no rmdup + echo "Fragment counts on gene models exon PE (no rmdup):" + grep "Successfully assigned fragments" $counti_out"_all_v6.1.exon.pB.log" + echo "Fragment counts on gene models exon PE+SE (no rmdup):" + grep "Successfully assigned fragments" $counti_out"_all_v6.1.exon.p.log" + + fi + if [ ! 
-e $rv ]; then + echo "This part is not working yet" +# /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_ercc_gt.gff \ +# -t exon -g Parent -o $counti_out"_v3.3_32458_ercc.exon.SE.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_32458_ercc.exon.SE.log" +# /vol/agrensing/tools/Subread/subread-1.6.2-Linux-x86_64/bin/featureCounts -T $cpu -a /vol/agrensing/fhaas/GeneAtlas/Pp_v3.3_rm.double_clean.N_ercc_gt.gff \ +# -t exon -g Parent -o $counti_out"_v3.3_31315_ercc.exon.SE.txt" $mappi_out".uniq.merged.rmdup.bam" 2> $counti_out"_v3.3_31315_ercc.exon.SE.log" +# +# # statistics +# printf "\n### FeatureCount statisitcs v3.3 ###" +# # 32458 +# echo "Fragment counts on gene models exon SE 32458:" +# grep "Successfully assigned " $counti_out"_v3.3_32458_ercc.exon.SE.log" +# # 31315 +# echo "Fragment counts on gene models exon SE 31315:" +# grep "Successfully assigned " $counti_out"_v3.3_31315_ercc.exon.SE.log" + + fi +} + +depthi() +{ + /vol/agrensing/tools/samtools/samtools-1.12/samtools depth -aa $mappi_out".uniq.merged.rmdup.bam" > $mappi_out".uniq.merged.rmdup.depth" +} + + +snpi() +{ + gatkrun='/vol/agrensing/tools/gatk/gatk-4.2.0.0/gatk --java-options -Xmx50G' + threading_option='--native-pair-hmm-threads 8' + # V3 + # start the SNP detection + $gatkrun HaplotypeCaller $threading_option -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -ploidy 1 -O $snpi_out".HaplotypeCaller.vcf" 2> $snpi_out".HaplotypeCaller.err" + + $gatkrun SelectVariants -R $reffna --variant $snpi_out".HaplotypeCaller.vcf" -select-type INDEL -O $snpi_out".SelectVariants.vcf" 2> $snpi_out".SelectVariants.err" + + # HQ SNPs + python2.7 /vol/agrensing/fhaas/GeneAtlas/SNP/GetHighQualVcfs.py -i $snpi_out".HaplotypeCaller.vcf" -o $output_dir"/05_SNP_calling/"$species"/"$prefix"/" --ploidy 1 --percentile 90 --GQ 90 2> $snpi_out".GetHighQualVcfs.HQ.err" + +# mv $snpi_out".GetHighQualVcfs.HQ.vcf" $snpi_out".GetHighQualVcfs.V3.HQ.vcf" + + $gatkrun IndexFeatureFile -I $snpi_out".GetHighQualVcfs.HQ.vcf" 2> $snpi_out".GetHighQualVcfs.HQ.idx.err" + + $gatkrun BaseRecalibrator -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.table" 2> $snpi_out".BaseRecalibrator.err" + + $gatkrun ApplyBQSR -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" --bqsr-recal-file $snpi_out".BaseRecalibrator.table" -O $snpi_out".BaseRecalibrator.ApplyBQSR.bam" \ + 2> $snpi_out".BaseRecalibrator.ApplyBQSR.err" + + $gatkrun BaseRecalibrator -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.recal.table" \ + 2> $snpi_out".BaseRecalibrator.recal.err" + + $gatkrun AnalyzeCovariates -before $snpi_out".BaseRecalibrator.table" -after $snpi_out".BaseRecalibrator.recal.table" -verbosity DEBUG -csv $snpi_out".AnalyzeCovariates.calibration-report.csv" \ + -plots $snpi_out".AnalyzeCovariates.calibration-report.QC.pdf" 2> $snpi_out".AnalyzeCovariates.calibration-report.err" + + $gatkrun PrintReads -R $reffna -I $mappi_out".uniq.merged.rmdup.bam" -O $snpi_out".PrintReads.bam" 2> $snpi_out".PrintReads.err" + + $gatkrun PrintReads -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -O $snpi_out".PrintReads.ApplyBQSR.bam" 2> $snpi_out".PrintReads.ApplyBQSR.err" + + # HaplotypeCaller and VariantFiltration w/o ApplyBQSR BAM file + $gatkrun HaplotypeCaller $threading_option -R $reffna -I $snpi_out".PrintReads.bam" -ploidy 1 -O 
$snpi_out".PrintReads.HaplotypeCaller.vcf" 2> $snpi_out".PrintReads.HaplotypeCaller.err" + + $gatkrun HaplotypeCaller $threading_option -R $reffna -I $snpi_out".PrintReads.ApplyBQSR.bam" -ploidy 1 -O $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.vcf" \ + 2> $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.err" + + # filter flags ## NOTE: it is important to look for float and integer numbers (e.g. FS must be 60.0 not 60) ## + # --filter-expression "FS > 60" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" + # --filter-name "LowMQ" --filter-expression "QUAL > 30.0 && QUAL < 60.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" + # + $gatkrun VariantFiltration -R $reffna --variant $snpi_out".PrintReads.HaplotypeCaller.vcf" \ + --filter-expression "FS > 60.0" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" \ + --filter-name "LowMQ" --filter-expression "QUAL > 30.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" \ + -O $snpi_out".VariantFiltration.vcf" 2> $snpi_out".VariantFiltration.err" + + $gatkrun VariantFiltration -R $reffna --variant $snpi_out".PrintReads.ApplyBQSR.HaplotypeCaller.vcf" \ + --filter-expression "FS > 60.0" --filter-name "StrandBias" --filter-expression "QD < 2.0" --filter-name "LowQD" --filter-expression "MQ < 40.0" \ + --filter-name "LowMQ" --filter-expression "QUAL > 30.0" --filter-name "LowQual" --filter-expression "DP < 5" --filter-name "LowCoverage" \ + -O $snpi_out".ApplyBQSR.VariantFiltration.vcf" 2> $snpi_out".ApplyBQSR.VariantFiltration.err" +} + +corri() +{ + for IN in $mappi_out*.gz + do + gunzip $IN + file=`echo $IN | awk '{print substr($1, 1, length($1)-3)}'` + /vol/agrensing/tools/samtools/samtools-1.12/samtools view -@ $cpu -bhS $file | /vol/agrensing/tools/samtools/samtools-1.12/samtools sort -@ $cpu -m 2G - -o $file".sort.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $file".sort.bam" + gzip $file + + done + /vol/agrensing/tools/samtools/samtools-1.12/samtools merge -@ $cpu $mappi_out".uniq.merged.bam" $mappi_out*"uniq"*".sort.bam" + /vol/agrensing/tools/samtools/samtools-1.12/samtools index $mappi_out".uniq.merged.bam" + + + + gatkrun='/ceph/agrensing/tools/gatk/gatk-4.2.0.0/gatk --java-options -Xmx50G' + threading_option='--native-pair-hmm-threads 8' + $gatkrun BaseRecalibrator -R $reffna -I $snpi_out".BaseRecalibrator.ApplyBQSR.bam" -known-sites $snpi_out".GetHighQualVcfs.HQ.vcf" -O $snpi_out".BaseRecalibrator.recal.table" \ + 2> $snpi_out".BaseRecalibrator.recal.err" + + $gatkrun AnalyzeCovariates -before $snpi_out".BaseRecalibrator.table" -after $snpi_out".BaseRecalibrator.recal.table" -verbosity DEBUG -csv $snpi_out".AnalyzeCovariates.calibration-report.csv" \ + -plots $snpi_out".AnalyzeCovariates.calibration-report.QC.pdf" 2> $snpi_out".AnalyzeCovariates.calibration-report.err" + + +} + + +# script input variables + +cpu=$1 +output_dir=$2 +prefix=$3 +species=$4 +infile_path=$5 + +# set variables +java_ram="-Xmx50G -Xms25G" +id=$prefix + +fw=$infile_path"/fw."$prefix +rv=$infile_path"/rv."$prefix + +db_path_V3="/ceph/agrensing/fhaas/DB/Marchantia_polymorpha/Marchantia_v6.1_maingenome_mito_ribo_chloro/Marchantia_v6.1_maingenome_mito_ribo_chloro" +db_name_V3="Marchantia_v6.1_maingenome_mito_ribo_chloro" + +reffna="/ceph/agrensing/fhaas/DB/Marchantia_polymorpha/Marchantia_v6.1.fna" + + + +# outdir variables +# output=$output_dir"/"$prefix 
+trimmo_out=$output_dir"/02_trimmomatic/"$species"/"$prefix"/"$prefix +polyA_out=$output_dir"/03_poly-A/"$species"/"$prefix"/"$prefix +mappi_out=$output_dir"/04_genome_mapping/"$species"/"$prefix"/"$prefix +snpi_out=$output_dir"/05_SNP_calling/"$species"/"$prefix"/"$prefix +counti_out=$output_dir"/06_counts/"$species"/"$prefix"/"$prefix + +# create result folder +mkdir -p $output_dir"/02_trimmomatic/"$species"/"$prefix +mkdir -p $output_dir"/03_poly-A/"$species"/"$prefix +mkdir -p $output_dir"/04_genome_mapping/"$species"/"$prefix +mkdir -p $output_dir"/05_SNP_calling/"$species"/"$prefix +mkdir -p $output_dir"/06_counts/"$species"/"$prefix + +mkdir -p $output_dir"/02_trimmomatic/"$species"/"$prefix"/fastqc" +mkdir -p $output_dir"/03_poly-A/"$species"/"$prefix"/fastqc" + + + + + +# print running machine and start date +hostname +date + +# print all variables +printf $cpu"\n"$output_dir"\n"$prefix"\n"$species"\n"$fw"\n" + +if [ -e $rv ]; then + printf $rv"\n" +fi + +printf "\'$java_ram\'""\n"$id"\n"$db_path_V3"\n"$db_name_V3"\n" + +echo "" + +echo "Phase 0 assamble data" +startle + +date +echo "Phase 1 run trimming" +trimmo + +date +echo "Phase 2 run polyA" +polyA +duple +date +echo "Phase 3 run mapping" +mappi +#-# mappi_gDNA +organelli +sample +counti +depthi +date +echo "Phase 4 run SNP calling" +snpi +#-# corri +date + -- GitLab