From 3527fe09c4b091ff0bf56fdbbe33adbd746ba163 Mon Sep 17 00:00:00 2001
From: Viktoria Petrova <vipet103@hhu.de>
Date: Mon, 20 Jan 2025 12:31:10 +0100
Subject: [PATCH] add species selection assay and protocols

---
 assays/SpeciesSelection/README.md                |   0
 assays/SpeciesSelection/dataset/.gitkeep         |   0
 assays/SpeciesSelection/isa.assay.xlsx           | Bin 0 -> 6896 bytes
 assays/SpeciesSelection/protocols/.gitkeep       |   0
 .../protocols/Cross-speciesPredictionModels.md   |   9 +++++++++
 ...iesModelsAndLeave-one-outCross-validation.txt |   5 +++++
 6 files changed, 14 insertions(+)
 create mode 100644 assays/SpeciesSelection/README.md
 create mode 100644 assays/SpeciesSelection/dataset/.gitkeep
 create mode 100644 assays/SpeciesSelection/isa.assay.xlsx
 create mode 100644 assays/SpeciesSelection/protocols/.gitkeep
 create mode 100644 assays/SpeciesSelection/protocols/Cross-speciesPredictionModels.md
 create mode 100644 assays/SpeciesSelection/protocols/Intra-speciesModelsAndLeave-one-outCross-validation.txt

diff --git a/assays/SpeciesSelection/README.md b/assays/SpeciesSelection/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/assays/SpeciesSelection/dataset/.gitkeep b/assays/SpeciesSelection/dataset/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/assays/SpeciesSelection/isa.assay.xlsx b/assays/SpeciesSelection/isa.assay.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..c6921bbade9524e5387a78d5bd2c98077c7d4c9c
GIT binary patch
literal 6896
zcmai3by(D0(_VU!mK9dIrMpy0Kw|0c5|(D^21P=;k&s40knR=<r8^W*B&3mEkn&sQ
zL4EakzxVvH*Iv7GpE=h(^P8D-|CHrWP_F?103g6hk5@N#rVUdE2>{qZ1po*Uzv@ZY
z!(5;+7bA5~2dJ|FyN8`^NrD2bofALkNcjE|8wA@xUE_XxPIVtq;Q?UGL!6atW-s~G
zjQe%QB4(;0V?ve`fo<=M&YMcLk+PFunY*dsD*PbK=}{&-;)dh2SNYL2*+9+5jzrEC
z64&DwU_x`Axq+!{pL3!{L}g{6kvPiBLb|2u?7aA2`B`&=oiMQyjymQ8SkogdO-c0d
zR3ZwrU-xz{LHnYQ?je(0@7XteSLuZGk<>tZsZn3=FgT6^c+=gDq^PC<2`f-Y){<of
zDY(@=Lw@*L{LR2)bexw<liiV6eH<%^@1gO8Oa>Zj*PpUz;nbNeq4?*JaPc#2Oco>A
zVVs{S%2(=13E(;%Z1)g+^<BL1@(mo!$B=Qfh=B>i2dks?CBzO1P$(l(F^;!<=i?2G
zWItb6`QfMy+9l*weG=Lwz&ZYAaFO=9_U@r=AoY6RbK&dnPu43Q++_(`N7Zu^EnXm1
z{wETEACW+i|8t=uz8N_|ZJjx;Y%Zg<mnl2FhWJko834fgX~X{AvLx=cVh1O-*tXC8
zrPF6H8DMr^6IJd9240aiIv8n9<nzveFWaFpV%U*Z7*UR!D;9kl145DOk*aVqY}S_~
zAhf>0;K8=O3oA!FW-RnzS`2Q7T<p|k^K*GxNMTP)w9XBgZgj8M<1E)&VTELtC{V9b
zpOrY%(D+)2TU*pYQR%kJNsr%hU}5c`*mNWPi*v3~K3kZc{aWmnNe-2T?pZIRnLK&u
zW6$yJv?p52+04qrD+ZcVO=rRd88XA!S+!}F1Zstf3w^q;T*+n(Fjs~m<izW~HIO|E
z@~3sz!8ASN9=#<n>;t%B<Zsqc5R7=({-o@01I$Z<e`7X~u0r!WvkeGlNq#qTw|BC6
zU~g~p{YkhY+PZ@iUvm5WTE&sBn=XVY)|u(CzTOc^V0Ep$t*oVh-T9FQsSFt~8zZx`
z_+83dnT>%X!4M_+ag4GoW(fYOLV3$J$cK04)B?YAT2B}?`Wge>yK(mo0W#hdawlK9
zSOp;_hyLCB5N=#)h5I3^`wFC^FK+jZ_q-S|bZ69pex)0mMOnvEs;Ig)I;ctYjA`Lf
zv&0M>-6-I8#OhZ_AYI0jeh!lKJ7#Fo24<8oQ;S-1Z=64NIiF+-;l9+v#tYZ{<RX4L
z$RxVc9QyT`sDCkz(};SjhkEbk67R|VfwzS>J$Dz^DgQgsxR(Tdr^eY53U&E2P3yu`
z8UqnTwj&4!|7H6F!u6fcr->@|shs$Jb4M&ObEJX-bkxX2ZYnjhsl0aS@J%pbF|t%`
zeC_FBlc3yEY1RZ@tRSff?rCe<e(FkTB(6AV;fa46D|PH@Ly_4lmhPe8)0a0wBcQ%r
zx)b64%|HoQFW%Sq(dc}Fb}8Vh!aWk3U{^BIk0DVhDI}87JT;@_Rp504TXZ(<&lKy^
z$qYR-Uy?U_Ldz$&O<pu6>f3_m+;i$~g%Gh-CAV5F84p%Ry~3E{I<KcJtzSo%w1u*4
zg$KmFSlk$DGPJ6-+pWqp89Ywe@)tN(bSjA`+QN2xMCf}upp(`m!@Lh!ASTli9?fCz
zW-H!ijH&PcFmS})V}GYeasXa(UYa3-*<i!^MFm1rQ&+OtMeK;rrg($SQy8vN1M7!2
zmKyT6JLs4B=dOLeL29m(Vsi+Z??w5t*A{gOv;ljvr;LE&m*0BZ@1m1&gDHFIqVb><
z&S%u??a!%&)_0B5+{xRU;2th%0(kQS!XFO>IdJClDC|WB^#XT@C^*Ph*d#pUqU*pb
zCE^||(RE}iP>Se_=p@hPiJ+C`FJ}$5X9h8-H@p|LRwNrFl0{p^I8}D>bMc#<EOS<=
zyJ>d#{6iaqsbbp%Fc~0}{`$y949PDZ0!J}@GL6LnK<;ZkdB*KXz_@EZFOB&g_xs1R
z3UQj?;Zz2nz=sRX3C!>gx3a%Y6MDDU8Q<M~x|Y7FFdjDj*n#$-FuKv^CAxc%MzzLd
z(HAnlceOzxQZMe6BnV^dhMr2iJI7~Y<GSa5!q}ZvJt42_1AhC6CCySGpwnEU%ClB`
z6uZG=yfQ9Y9!v&$D|$=6LU$VDP)@VfG5vi|68up=RV&%6UeT*8zbvrws~jRmh<3<>
zXosN7%yqUjae|tuxj0$DES!I2ZM>3FASZs;_QSAz_q9e0Q|xkT^y`G~^yL&nliW?b
zYm}Rys?#ktQd4zcEG^Hgsn#{2g#lTXJtW*bKFpR-%>+M!+Sujf;Ur&2UujG)7)fdI
z<G>80HHMZj0%1(m08G1<?JqYaM*Nq-?{Vc<$%@BjgO%LP-rfF$QwuO;sWmKtr7Vvs
z)7aYkh)>x-ylCgVJ&xIPxq5lwR+>}qZTfvks<L-9B$q5I(V|*VaNfk9Hw1iBF~d`g
zmj||)SXOQvlGf8tPs0@fg?kTlYcNJgUT*FRnQq`!cTVKGJT6pq6fL$OVa-LZT=ZH>
z#j`VW#SllH%n~~`ek!{(l1H2l%8&}iJ*9;j-WZzTFI!)e>x*gQJS4=cc-DAItRZtm
zY0E;oXPKL%KhK&R%vq$9V=HmIBD}l1BzmedR$o5tx$F=F*_&6fXy|257F`o(4IS4O
zczTQI;^ctvDj@&0e7jge?Vvxa{AJwY2KjD9Ai8EJ)#Vt2XakpKKR$mHn=KtB`x$=F
zH)j7YrDl42nOQKCKDRM`EB`lJ!?d(xM5Yj48Rk$G5O{iw9R=N`gB`_f#ssWsjHbC+
ziAR{#HR&57EPFvat)5zIHDcljNX>n?z5Divg3_q-<&=_yHJK?pFkILCYnUC0;NJTf
z$;CuZ`(X|~c?lU14gr(^mcql@zcE#s!*=r%J}DJ>DBCzU3R8hzBE1h-YT;Vy0x%;I
zj29coPN^pmr6^`2Qxax3Rg53o%}WFFA8i>4IerJGV;;!cqX;UK2{UUeIBQd~=$FfV
zH=mn!7Lu~VC=f+pm%=#|la9Sp$cqye7>L0(rYM^4x>ZyI%Bd9Nzcr;K>fF%K=AbJ3
z`S{p@X;rXP&4%Mpi~Bsa-mpb$V<PcH6|m>4ZKd<P%5XRBHa|x1Gm;M<yy|o{cW$3g
z;P}iaT!4IBgWUOd&Rgp$O>h(VKHxOZSh+P8J#-H$v-`B+*0eEx!{^#H#!v=Q%b605
zmM-h07=q_Bx*lV$Cx^Xct>QWjtG!FKIv^YBI}e)i{Kgw(;<{t1m?)?z9Vl{^VhBxx
zBT#_NOkv4>CQT2HwwQfxJyM*82#3{j_R3mFw7vsFj=}9KX3ToaY{S;eR2)Xlg5GCm
zo4eZ<N*QUw7p@{l`Lq0CZ#)~OM1A*%@Qg?veS<G<6wIyf(nLQx?9(c{INn!S&lBHh
zvM9?C_5ON3_2~Re9mU?+B{>*RCM3uaBNObd1H>Aa_rv?zB#VtK3Bm*cR8cm2#E?Fw
zP#1N$mRq-<l5q#&deDlf)>EJgB}$?+lLuaNNaC6{(Qitd>CylCxpPEY^mg=qeG`y;
zV;Me4HXB}mzb9ERid8P|oatXT0+!Y$>EYpjtxXH}!|y6>>Rgd8kL9Jne;yDrNr&RZ
zcql$Yhx)4D1@-krn4Q};DY=yEdegD?{q|Zh^@8haT@eDxtST>{5#rCJzfj#=m7(ab
zF9Yk#>>PmA%nzPu1JnB{GMQZLC7z5l`6dmD-{_%FS&g|HQXy2?$6)B!UYN)e>MsP(
zPZkTIG0l`^!_5B@lZ*FIOeMQ%n~-(RPH!#%I~OlzNw(ir51GeRbZ`}DlD@}}_jGf5
zS(21I-J>`Vj{63bo<;rUO-t$>p;T_+H9aJ=r*_3v18li(tVg5E{kWM5ZFr;T#WCNb
z2$`e#xLMeQT0F_Y-=!IFXRz)*_>?e~KPtSgl%NjrsI)qygG+`C=DCWBLA2?8v!kbO
zMt^jTe%wFe-Tlcu@p0!v*bFv)*NNXvDn?G1H6qCeEi#qqsukKS0;&RAy*QJ)ErfBa
zC0#?h?-w6Ft6X`0Lx|XD9h8@Fh+F_XqGYvcWQ>AyrO_p63_vyZ9~E1N!S|qS&N_Yi
zliZobi8@a?G%whcis)H=KJ(BdP;}qyx=X|6gSvTcLWw@6$XxBy!Hbi1J5%v><ZZ|d
zy2TzAO+WxZ!Q`3Koeu4Ojp!gQC^F<$m9o38SJyl{srmEIdkc9CK&l3k;87bc+@NRu
zQZ|PZGfjCX)dEDG$grY=GdidELb@W7>xb)KWfHc+AB>EbTPDnPw)c9X;od@_x9>2x
zWr{kFPAx2dme9k|>^+~s&Yj=58D{ROu+lD664M=&3ZWr>f<#8K(LubqVXn5l9+oK8
zvdiUTXSa+_QYgHL3fG@N;}dyS+fQ&dhC1MI?L;!J5T9I{Z7j)I;+2wliW7B{4C+l*
z--HrTy6esXarh5}x6w&T>YuW>p6=66Rf~%D>1-&zc}@U_zV0r<8ses(E}#S|-eE-_
z#dy8bDVWRkD!fe@){+KxkO|b+b(Zm9uh-Jlm($uWd7)ohT+5g<1{hFE@Zlc%WaTY;
zt1c>G?WBk`9TV^EdAOkK9D8*lFBWc$mkH6SN3c(nhQ?6u#B!r~$HRT5-FXc0C|wDq
zlA;1l7dL(5doih{n#OgIYyr_7UhQaI7A>v(mPkE2HxJa<xgcx4d{r}!w~E<kaH6AF
z9R?X`;`g7`Nw{vY?PqV*l9w%b(KpS*A&L_fayWGcY&8aTx?dWgO*4f;$j`f}SexF@
z`W{<k4lXCb*N)$G`PMVmlpV8HYwKd@6*_o18kK^KW=T<5_sccc1YQm(mW&tIp3)c0
zSS36&EKysZZ1$>~JDioO`qs@GrB*&x>ZMpR#W>NMq2*-lCjM%Kbs`;-w>3T!nKAZ|
zw~nvaXSa#dzIS8#)m4wXY+h)0{S4g@<1z}Of#Li!bhvohLVrvfc?kxH85#6s`-p{g
zP7MWzHPp}ADonjFm6<QzZNpEt_ypzVSggY7_FeSr*IlRc;k|FmqBHkTcpBDn#W?zT
z^<RQ8v_z!jZ!|7hCoPnv&Ze4<YZpP7s~<3ll%vm?QO3QYK=q#HF(<%Rk6&o1IgE?R
zf5#)6%3{ESM*EaMPIvVgxs`Q?<@<VB)tKQUQyL~M4=PsZt|-3jfU}-=P$FNAPaj%c
za7IupYvR$}Je&{L9H!BDbhFq)6g-ND<;ucj48b2(dkv8iR*M$UytiEM5d{(vVwT&}
zq7m$Nn9S>_T0u$GS2cCs`DWb>?z6~y^>P;Hwqtj1oH&+UOpt1S52_1ugJ2dhvntu0
zY=L(4drt4d49~cL_e7;fOxxLR_}J7F_9$oDlYDZbRxA}40_q@>!ctUjy9|Tvaru%}
z9Qe@XGmoM-)3bxAA_f@_aK=AatF>#S!50~!p~g>xFrRG0OKTb#V8goaR#};Df3UhA
zBtc;jfn<!u&>7!UHdln@rbRu`t3Q3p-Q}?RVLh4}OwxhG<3jc=jlX;@^1gv<k7)L(
zIPpPz*sN^+akw|>1Mw`ozM#+RcuyYdxW!LB9B+K<q=N>ge`(3Gxy+lqz;w|M`PjRY
zaM*2BO<t9M{5oA7zlo%{3NLa?Z=57!A=CY3boikGg_rXqz9aJ~);*#T>#h4$GZS9v
z9B0F?&RyZwbGznmgVAH4v{Zc&;p79N3c6uKcLL5Iertm_9Nh<>X6CdV?b(R#m~*9j
z?;qdU+hYVy?b^B$e;!FWT8(@asN`?VP07ZVB=6A0zGXkCAxO7+0VP=SxNdgRS~Yi-
zp#MIEnc15vJJ~z@dF|`@+yM<l49^URi#R?akAIq(I5>PCoYUj(+0Srd2W_9;c=9DO
z4aWFHOp_~@)JwYpAUm_z*5kseo`@RD7k7TrgS&8NL$OIX4sR%vsc1%GC5(db{U??R
z{e&DbdvUw-*6Yzt_F9?u?_sy16ANiBgwVkbF=}L~A3O+GUqV%SYMHVKfpD|AiG>Tn
zJx2q_kyg03pq3;yJM76*xWauKmCdmp#s_?pT8`I11)Um0-$->acy>m{zJQ%GG>sm(
zOafLi$MGg}#j@uYHeP3`If<y>whUk9(-B!$?eQ$EMJFqDE{C-dFn^s?J_2?O@*5Lc
zP3#v*cTnFjd$dww5WYZ+k1P0em0D+qRk2o$%bGYe{vqyKkgg;HeZey0%EyM<%mXK0
zGe@fgLYc{=kh^`jYtmJZJHJsG4u(pjbD377iarKLZQ~D7p0Ye7Zv={LD=V%e32i#=
zD6TghavXr&=W)S9a;>&Sc37AjXY6CN@;I)UR`H}~8$@Rm8&xak#)6>*WA8UU59FL_
zCoa;bc|%$a>Mu~P0()6`);j!QhKPtQBB~F;?-4V#cY^*1nQ9+GRl;xHzY)7#7_Z)(
z!}}C)C`nHjCN>+<Vy8ejQ(2e$)jLgkOzNQ1&ced28P8&0LI+jUIDOn&YC%>KSy>;e
zke_F-RyWX{&Y?TOI7pL|mQRp-sEiw?TFTa}ZP%8AO6)=VSruk(7H+8uA)gcPneAjr
zxHUDMh%H=AY7L&Nz!|0sgIQvBqe@7Tb_`=j<<^0G%6Yx@s?q(`7_(4=qTa+OXH)}g
zU5K%Cm*sUJ#W?y26g~HR9h~qrHTVkiOU92V$D%*)+uXHg)Tm;{r5i$3YCwB3y=-q?
z;F8Dfm1yIUpe^<>|8!Uy_U#adtmy#Y!`kM^f^%}$b%}9)s3M?oR8w#$O4*s|N%=_a
zmluS4S;n|Y7qd2)3)v!vCrQ00AKZBSfM<%MumI_0-;?8N-58<JxDC#*4m#Z~$7i$o
zyXO{T`!6ocwBgZbGrN|<LjosBU;ntnn6pVd)JFuv1rZFq%Qo=4_!JqZpo9?ko@}3m
zF&>q$1t3jK%K~HE?veSi?5VnkCvtSHX@3)jNyp+7vpa5fMjPGLDN&jXp_Ll_s8N}`
zqMD`FIs((`shcWzUjj;K<1=pVfWT#(42+6Je1&XVM1h0u>OH=Y?0#-l?`kn;%Hdtv
zjpsp~uvsXGMqI+X(@eoFWu>p*-)&Exaq$tj)vl%;qwG#Ndpb1%4xOfwH@Rhw&LL%5
zkjHAfyeBERVnV9mVIagrqv8E+N)wy<4K+vMSkF@oyPB{JpVZ}u6QZvLb2LL?3$sow
z)-|t>_Q%lzcd*@FJjmq~TdNqsOoccpEQ;=GQMt{xjy*|ly5Ab3bH0C_LxuO9xWy>T
z%V(=+`F$^&8FN3yVJ^yzODhWp2hXaI`|5F(rkV>9!qjsyMf`v#yM%O6WPDZ>m{?4F
za}lY_)&MA>>|>Q;GfvvI{B92WH>B}{Cs@jINXQ_-e^<p3(fDosi$?xygTE{0cQN;J
zeJ_e(hzMT^kpJoQPjUAO1pt)(#Qabo|37EnHPFlT-B}wVm3}CqSDyYp-FHp&a(xHP
zUjqL9fPZSFS3tzO1cY0JpG(;9N&7#~`&~-BT;JU@A!HZCKK~&sULF0iV*M#CUZDX1
zjlZzJgvMWtUGDOy%=2p(Jd7VF_)GNpwZs3SFbnu&hbxuhKac;XPVvj}+uvcoREuAJ
z|9w{cwLu`FW+1q8{y91Rg8r`~(_jCsS^t7vdi=Ai|MDpRJM5nx|0nv-4)Y6rfq32Y
pb3(Z4H-Ezatckzi;WvJ!%9T8?EQj{vS;I$6A8!yD2_pV({(n-zR^$Kx

literal 0
HcmV?d00001

diff --git a/assays/SpeciesSelection/protocols/.gitkeep b/assays/SpeciesSelection/protocols/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/assays/SpeciesSelection/protocols/Cross-speciesPredictionModels.md b/assays/SpeciesSelection/protocols/Cross-speciesPredictionModels.md
new file mode 100644
index 0000000..0a21008
--- /dev/null
+++ b/assays/SpeciesSelection/protocols/Cross-speciesPredictionModels.md
@@ -0,0 +1,9 @@
+## Cross-species prediction models
+
+Ensuring a diverse range of species in the training set, while simultaneously reserving enough data for validation and testing to effectively evaluate the models’ generalization ability, proved difficult. At the start of development, the amount of high-quality, publicly available ATAC-seq data was low. Around 60% of the plant ATAC-seq data on SRA available up until July 2023 needed to be discarded after the final quality control. This left the ATAC-seq data of the 14 plant species used in this study. In later development stages 3 more ATAC-seq datasets, from *Actinidia chinensis*, *Panicum miliaceum* and *Sorghum bicolor*, and 2 more ChIP-seq datasets corresponding to acquired ATAC-seq datasets, from *A. chinensis* and *M. polymorpha*, became available. The low availability of high-quality data, especially in early development stages, turned out to be a major hindrance in providing the network with an appropriate amount of data to train on. Data of two species, *A. thaliana* and *O. sativa*, was set aside as a hold-out test set. In doing so, both a dicot and a monocot species with available ATAC- and ChIP-seq datasets could be used for final evaluation. The same applied to the two validation species, the dicot *Medicago truncatula* and the monocot *S. polyrhiza* (Table 3).
+
+The resulting training, validation, and test split for the ATAC-seq models, ChIP-seq models and Combined models was around 90% training set, 5% validation set and 5% test set (Fig. 3a).
+
+The model training pairs were visualized using the Uniform Manifold Approximation and Projection (UMAP) learning technique for dimension reduction (McInnes et al. 2018). Random training pairs, 5% of each species in the training set, were used to calculate the UMAPs. Gap subsequences and flagged sequences were not included. The chosen parameters were 10 neighbors, 0.1 minimum distance and the Euclidean distance metric. The additional species datasets, added in later development stages, were included. None of the available settings and metrics for UMAP computation showed distinct clusters based on the number of peaks within the input (Fig. 3b).
+
+The first seven models were trained only on the species for which high-quality experimental ATAC-seq data was available up until July 2023. The same applied to the BiHybrid_05 model using ChIP-seq data. The Combined model used both datasets. The Combined_02 model used additional data of four species. Gap subsequences were masked for all models; unplaced scaffolds and non-nuclear sequences were masked starting with model BiHybrid_04.
\ No newline at end of file
diff --git a/assays/SpeciesSelection/protocols/Intra-speciesModelsAndLeave-one-outCross-validation.txt b/assays/SpeciesSelection/protocols/Intra-speciesModelsAndLeave-one-outCross-validation.txt
new file mode 100644
index 0000000..615b0fc
--- /dev/null
+++ b/assays/SpeciesSelection/protocols/Intra-speciesModelsAndLeave-one-outCross-validation.txt
@@ -0,0 +1,5 @@
+## Intra-species models and leave-one-out cross-validation
+
+Cross-species validation, rather than an in-species split of the validation and training data, was deemed closer to the real-world use case of predicting ATAC- and ChIP-seq data for an entire species. However, two models were trained using an intra-species training and validation split. These models, IS_10 and IS_20, used 10% and 20% of each species dataset as the validation set, respectively. The input files were split using Predmoter's intra_species_train_val_split.py script in “side_scripts”. This method ensured that each sequence ID from the original fasta file was fully assigned to either the training or the validation set. Since the focus of this study is on cross-species prediction, all 25 plant species were used in leave-one-out cross-validation (LOOCV) to evaluate the best model setup on different species. All these setups were trained on ATAC- and ChIP-seq datasets simultaneously (Table 4). When performing LOOCV, the model performance was evaluated on all datasets available in the left-out species.
+
+All models excluded gap subsequences, subsequences of 21 384 bp only containing Ns, and flagged subsequences. For more details on exact model parameters see Supplementary Section S1.3 and Supplementary Table S4.
\ No newline at end of file
-- 
GitLab