diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..300d7fa0bdb18676993d18c9505173e20a6bd944 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,8 @@ +assays/isoform_sequencing/dataset/reads/375-4.merged.bam filter=lfs diff=lfs merge=lfs -text +assays/isoform_sequencing/dataset/reads/375-5.merged.bam filter=lfs diff=lfs merge=lfs -text +assays/isoform_sequencing/dataset/reads/375-6.merged.bam filter=lfs diff=lfs merge=lfs -text +studies/additional_data/resources/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa filter=lfs diff=lfs merge=lfs -text +assays/isoform_sequencing/dataset/reads/119_1_CCS.bam filter=lfs diff=lfs merge=lfs -text +assays/isoform_sequencing/dataset/reads/375-1.merged.bam filter=lfs diff=lfs merge=lfs -text +assays/isoform_sequencing/dataset/reads/375-2.merged.bam filter=lfs diff=lfs merge=lfs -text +assays/isoform_sequencing/dataset/reads/375-3.merged.bam filter=lfs diff=lfs merge=lfs -text diff --git a/assays/BSA_RNAseq/isa.assay.xlsx b/assays/BSA_RNAseq/isa.assay.xlsx index 399f4b8a40fb6354edb648cda747f4dc1d558145..72130cca8afe4887989ea2d9f789b515d63a7805 100644 Binary files a/assays/BSA_RNAseq/isa.assay.xlsx and b/assays/BSA_RNAseq/isa.assay.xlsx differ diff --git a/assays/BSA_WGS/isa.assay.xlsx b/assays/BSA_WGS/isa.assay.xlsx index 2e357f7219a77f02119427594574cd6d4d116af0..b8f9e15747d3a0d79169f1da3a75cdc94e088f7d 100644 Binary files a/assays/BSA_WGS/isa.assay.xlsx and b/assays/BSA_WGS/isa.assay.xlsx differ diff --git a/assays/betalain_LCMS_quantification/isa.assay.xlsx b/assays/betalain_LCMS_quantification/isa.assay.xlsx index d361dd4b2ffae902a2be61a221938294149b6bfa..3dfc4ee2c6bf0974b8500e80be0f48442074323a 100644 Binary files a/assays/betalain_LCMS_quantification/isa.assay.xlsx and b/assays/betalain_LCMS_quantification/isa.assay.xlsx differ diff --git a/assays/betalain_photometric_quantification/isa.assay.xlsx b/assays/betalain_photometric_quantification/isa.assay.xlsx index 68931c6c470e924b1bd598f466bf6f20196dc4dc..b5fbed0656e6357187d48c5f5fc97b85b02b1fea 100644 Binary files a/assays/betalain_photometric_quantification/isa.assay.xlsx and b/assays/betalain_photometric_quantification/isa.assay.xlsx differ diff --git a/assays/isoform_sequencing/dataset/reads/119_1_CCS.bam b/assays/isoform_sequencing/dataset/reads/119_1_CCS.bam new file mode 100755 index 0000000000000000000000000000000000000000..3b93c6bd5b6ac5201600c087b9d65fd71bd5eb48 --- /dev/null +++ b/assays/isoform_sequencing/dataset/reads/119_1_CCS.bam @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b53789374be790e77f6ac1f8a43cd871b87ca567361091ef89a2439b1ba53fb2 +size 1293612729 diff --git a/assays/isoform_sequencing/dataset/reads/375-1.merged.bam b/assays/isoform_sequencing/dataset/reads/375-1.merged.bam new file mode 100644 index 0000000000000000000000000000000000000000..d96dfe1b0e1939f8874e9b46331f0cfea1bcde40 --- /dev/null +++ b/assays/isoform_sequencing/dataset/reads/375-1.merged.bam @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82dffeb3ea29774537fad008dcb5aed4f5aa93baab9e33b870213f4267a14bb7 +size 615150400 diff --git a/assays/isoform_sequencing/dataset/reads/375-2.merged.bam b/assays/isoform_sequencing/dataset/reads/375-2.merged.bam new file mode 100644 index 0000000000000000000000000000000000000000..0a989f356fe3ddb08a8c170daba8cbee142c9506 --- /dev/null +++ b/assays/isoform_sequencing/dataset/reads/375-2.merged.bam @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:638b8a627d813363468b7ba99f91cedb25dfce0cbc796e102aedc5f87c011d97 +size 778034718 diff --git a/assays/isoform_sequencing/dataset/reads/375-3.merged.bam b/assays/isoform_sequencing/dataset/reads/375-3.merged.bam new file mode 100644 index 0000000000000000000000000000000000000000..d7d31240f11e944f8bcebe5b994259c95a5e17c3 --- /dev/null +++ b/assays/isoform_sequencing/dataset/reads/375-3.merged.bam @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62ea280d73147d216548b43a8a04dcf0070e18484e6895abeaa4f53ef61b13e8 +size 558073691 diff --git a/assays/isoform_sequencing/dataset/reads/375-4.merged.bam b/assays/isoform_sequencing/dataset/reads/375-4.merged.bam new file mode 100644 index 0000000000000000000000000000000000000000..645e7ea876ee60960f7fe001732274129b770418 --- /dev/null +++ b/assays/isoform_sequencing/dataset/reads/375-4.merged.bam @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a14cb9747f7e67e5b2823b4389cfee294f30f43afd389262e76f755b09fb713a +size 344068849 diff --git a/assays/isoform_sequencing/dataset/reads/375-5.merged.bam b/assays/isoform_sequencing/dataset/reads/375-5.merged.bam new file mode 100644 index 0000000000000000000000000000000000000000..6b86c2bf1ad96a1125bff5154dbdbecefcc884cb --- /dev/null +++ b/assays/isoform_sequencing/dataset/reads/375-5.merged.bam @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1e2fd5a7ca5bb8fdf53dd65b02d41dcb8dc5b6bbcd6e1de3a42b196d0913f35 +size 485186923 diff --git a/assays/isoform_sequencing/dataset/reads/375-6.merged.bam b/assays/isoform_sequencing/dataset/reads/375-6.merged.bam new file mode 100644 index 0000000000000000000000000000000000000000..1046ee535d36fa5fd95d9665ea83cbe388a71ca3 --- /dev/null +++ b/assays/isoform_sequencing/dataset/reads/375-6.merged.bam @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd25e104b61822019898b069ca57a23735bb193fef63ec3094803ca10b3fa68c +size 253500036 diff --git a/assays/isoform_sequencing/isa.assay.xlsx b/assays/isoform_sequencing/isa.assay.xlsx index ae346d235c02e0ea6c0a10e07149fcc41e5f20f7..6b7f3ef71f38b78db0fd0838b361fb8eb7e3b36b 100644 Binary files a/assays/isoform_sequencing/isa.assay.xlsx and b/assays/isoform_sequencing/isa.assay.xlsx differ diff --git a/isa.investigation.xlsx b/isa.investigation.xlsx index 586f555874be0956c7b05f5da13c824073eaa385..8dd45fa7fbd03a346ca95ac8c9f8cb7c0a5c0b35 100644 Binary files a/isa.investigation.xlsx and b/isa.investigation.xlsx differ diff --git a/studies/BSA_parent_betalain_quant/isa.study.xlsx b/studies/BSA_parent_betalain_quant/isa.study.xlsx index 486411ac1c8c88a7585c68191536289931207512..ff0a662db33b8f0884c36657625524382087e5f4 100644 Binary files a/studies/BSA_parent_betalain_quant/isa.study.xlsx and b/studies/BSA_parent_betalain_quant/isa.study.xlsx differ diff --git a/studies/Bulk_segregant_analysis/isa.study.xlsx b/studies/Bulk_segregant_analysis/isa.study.xlsx index b7b89a26b6cd86616539825c87df590f547b6e4e..fc71a2dec298cd1363acf5427b9d443042145e1e 100644 Binary files a/studies/Bulk_segregant_analysis/isa.study.xlsx and b/studies/Bulk_segregant_analysis/isa.study.xlsx differ diff --git a/studies/additional_data/isa.study.xlsx b/studies/additional_data/isa.study.xlsx index 566ca09ccb44bcd0d23eb8c1fe9b6e8b6b8de26d..1257ad2d6cab9e38f7644077deaa31902f89123c 100644 Binary files a/studies/additional_data/isa.study.xlsx and b/studies/additional_data/isa.study.xlsx differ diff --git a/studies/additional_data/resources/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa b/studies/additional_data/resources/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa new file mode 100644 index 0000000000000000000000000000000000000000..492065652a687bb40a32e926c786f34ffbb8243d --- /dev/null +++ b/studies/additional_data/resources/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93583f53b0f4cc263e31e60df8ad2c08d3fc63c9d28754cb63729d84465081af +size 410638442 diff --git a/studies/additional_data/resources/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa.fai b/studies/additional_data/resources/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa.fai new file mode 100644 index 0000000000000000000000000000000000000000..97c2b7c3de9bd408c0d14599b35ce185f8abaa16 --- /dev/null +++ b/studies/additional_data/resources/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa.fai @@ -0,0 +1,908 @@ +Scaffold_10 22670516 13 60 61 +Scaffold_11 22280117 23048384 60 61 +Scaffold_12 22052327 45699850 60 61 +Scaffold_13 20679869 68119729 60 61 +Scaffold_14 20190685 89144276 60 61 +Scaffold_15 17522127 109671486 60 61 +Scaffold_16 16951160 127485662 60 61 +Scaffold_1 38124660 144719354 60 61 +Scaffold_2 35657244 183479437 60 61 +Scaffold_3 30204323 219730981 60 61 +Scaffold_4 28349311 250438722 60 61 +Scaffold_5 25672467 279260534 60 61 +Scaffold_6 24628041 305360888 60 61 +Scaffold_7 24364990 330399409 60 61 +Scaffold_8 23766980 355170495 60 61 +Scaffold_9 22691259 379333604 60 61 +Contig177_quiver_ 1092290 402403070 60 61 +Contig536_quiver_ 146947 403513584 60 61 +Contig747_quiver_ 124984 403663000 60 61 +Contig366_quiver_ 47732 403790087 60 61 +Contig260_quiver_ 42336 403838634 60 61 +Contig722_quiver_ 33993 403881695 60 61 +Contig197_quiver_ 33645 403916274 60 61 +Contig115_quiver_ 33068 403950499 60 61 +Contig208_quiver_ 32090 403984138 60 61 +Contig519_quiver_ 31873 404016782 60 61 +Contig566_quiver_ 31835 404049206 60 61 +Contig723_quiver_ 30881 404081591 60 61 +Contig727_quiver_ 29350 404113006 60 61 +Contig312_quiver_ 29079 404142865 60 61 +Contig70_quiver_ 28210 404172447 60 61 +Contig279_quiver_ 28065 404201147 60 61 +Contig169_quiver_ 27805 404229699 60 61 +Contig466_quiver_ 26991 404257987 60 61 +Contig605_quiver_ 26859 404285447 60 61 +Contig709_quiver_ 26773 404312773 60 61 +Contig522_quiver_ 26662 404340012 60 61 +Contig227_quiver_ 26461 404367138 60 61 +Contig628_quiver_ 26403 404394060 60 61 +Contig86_quiver_ 26099 404420922 60 61 +Contig748_quiver_ 25848 404447475 60 61 +Contig89_quiver_ 25648 404473772 60 61 +Contig289_quiver_ 25117 404499867 60 61 +Contig689_quiver_ 25079 404525422 60 61 +Contig859_quiver_ 25067 404550938 60 61 +Contig603_quiver_ 24882 404576442 60 61 +Contig493_quiver_ 24433 404601758 60 61 +Contig900_quiver_ 24095 404626618 60 61 +Contig358_quiver_ 23685 404651134 60 61 +Contig590_quiver_ 23389 404675233 60 61 +Contig897_quiver_ 23285 404699031 60 61 +Contig607_quiver_ 22971 404722724 60 61 +Contig297_quiver_ 22702 404746097 60 61 +Contig486_quiver_ 22614 404769197 60 61 +Contig153_quiver_ 22391 404792207 60 61 +Contig43_quiver_ 22275 404814990 60 61 +Contig836_quiver_ 22221 404837656 60 61 +Contig98_quiver_ 21908 404860266 60 61 +Contig738_quiver_ 21900 404882559 60 61 +Contig44_quiver_ 21840 404904842 60 61 +Contig839_quiver_ 21773 404927065 60 61 +Contig171_quiver_ 21664 404949220 60 61 +Contig683_quiver_ 21654 404971265 60 61 +Contig829_quiver_ 21408 404993299 60 61 +Contig414_quiver_ 21258 405015083 60 61 +Contig420_quiver_ 21195 405036715 60 61 +Contig62_quiver_ 20867 405058282 60 61 +Contig716_quiver_ 20807 405079516 60 61 +Contig380_quiver_ 20550 405100689 60 61 +Contig660_quiver_ 20476 405121601 60 61 +Contig636_quiver_ 20341 405142438 60 61 +Contig392_quiver_ 20248 405163138 60 61 +Contig511_quiver_ 20120 405183743 60 61 +Contig796_quiver_ 20095 405204218 60 61 +Contig734_quiver_ 20078 405224667 60 61 +Contig731_quiver_ 20055 405245099 60 61 +Contig717_quiver_ 19854 405265508 60 61 +Contig698_quiver_ 19744 405285712 60 61 +Contig881_quiver_ 19605 405305805 60 61 +Contig706_quiver_ 19576 405325756 60 61 +Contig598_quiver_ 19424 405345678 60 61 +Contig644_quiver_ 19382 405365445 60 61 +Contig810_quiver_ 19292 405385170 60 61 +Contig489_quiver_ 19185 405404803 60 61 +Contig47_quiver_ 18870 405424326 60 61 +Contig168_quiver_ 18782 405443530 60 61 +Contig174_quiver_ 18767 405462645 60 61 +Contig535_quiver_ 18605 405481744 60 61 +Contig377_quiver_ 18599 405500679 60 61 +Contig445_quiver_ 18556 405519607 60 61 +Contig194_quiver_ 18476 405538492 60 61 +Contig183_quiver_ 18405 405557295 60 61 +Contig48_quiver_ 18327 405576025 60 61 +Contig190_quiver_ 18041 405594677 60 61 +Contig161_quiver_ 18026 405613038 60 61 +Contig432_quiver_ 17693 405631384 60 61 +Contig140_quiver_ 17678 405649391 60 61 +Contig68_quiver_ 17594 405667382 60 61 +Contig534_quiver_ 17355 405685289 60 61 +Contig643_quiver_ 17128 405702953 60 61 +Contig858_quiver_ 17049 405720386 60 61 +Contig789_quiver_ 17014 405737739 60 61 +Contig347_quiver_ 16909 405755056 60 61 +Contig826_quiver_ 16890 405772266 60 61 +Contig654_quiver_ 16827 405789457 60 61 +Contig348_quiver_ 16708 405806584 60 61 +Contig848_quiver_ 16704 405823590 60 61 +Contig550_quiver_ 16652 405840592 60 61 +Contig685_quiver_ 16619 405857541 60 61 +Contig436_quiver_ 16606 405874456 60 61 +Contig863_quiver_ 16518 405891358 60 61 +Contig718_quiver_ 16450 405908171 60 61 +Contig560_quiver_ 16426 405924915 60 61 +Contig373_quiver_ 16252 405941634 60 61 +Contig199_quiver_ 16166 405958176 60 61 +Contig546_quiver_ 16118 405974631 60 61 +Contig774_quiver_ 16111 405991037 60 61 +Contig60_quiver_ 16102 406007435 60 61 +Contig143_quiver_ 16086 406023825 60 61 +Contig250_quiver_ 16084 406040199 60 61 +Contig764_quiver_ 16044 406056571 60 61 +Contig622_quiver_ 16022 406072902 60 61 +Contig268_quiver_ 15987 406089211 60 61 +Contig699_quiver_ 15933 406105484 60 61 +Contig235_quiver_ 15851 406121702 60 61 +Contig695_quiver_ 15803 406137837 60 61 +Contig806_quiver_ 15713 406153923 60 61 +Contig447_quiver_ 15703 406169917 60 61 +Contig697_quiver_ 15630 406185901 60 61 +Contig17_quiver_ 15627 406201810 60 61 +Contig562_quiver_ 15413 406217717 60 61 +Contig111_quiver_ 15403 406233406 60 61 +Contig189_quiver_ 15398 406249085 60 61 +Contig264_quiver_ 15366 406264759 60 61 +Contig554_quiver_ 15352 406280401 60 61 +Contig132_quiver_ 15342 406296028 60 61 +Contig118_quiver_ 15318 406311645 60 61 +Contig338_quiver_ 15310 406327238 60 61 +Contig135_quiver_ 15292 406342823 60 61 +Contig862_quiver_ 15265 406358389 60 61 +Contig246_quiver_ 15248 406373928 60 61 +Contig71_quiver_ 15152 406389449 60 61 +Contig750_quiver_ 15085 406404873 60 61 +Contig370_quiver_ 14967 406420229 60 61 +Contig363_quiver_ 14899 406435465 60 61 +Contig506_quiver_ 14868 406450632 60 61 +Contig50_quiver_ 14797 406465766 60 61 +Contig538_quiver_ 14772 406480829 60 61 +Contig884_quiver_ 14749 406495867 60 61 +Contig304_quiver_ 14597 406510881 60 61 +Contig668_quiver_ 14590 406525741 60 61 +Contig580_quiver_ 14555 406540594 60 61 +Contig773_quiver_ 14446 406555411 60 61 +Contig282_quiver_ 14371 406570117 60 61 +Contig228_quiver_ 14266 406584747 60 61 +Contig674_quiver_ 14249 406599270 60 61 +Contig505_quiver_ 14210 406613776 60 61 +Contig651_quiver_ 14182 406628242 60 61 +Contig544_quiver_ 14164 406642680 60 61 +Contig93_quiver_ 14121 406657099 60 61 +Contig27_quiver_ 14081 406671474 60 61 +Contig596_quiver_ 13967 406685809 60 61 +Contig393_quiver_ 13912 406700028 60 61 +Contig405_quiver_ 13879 406714191 60 61 +Contig254_quiver_ 13811 406728321 60 61 +Contig207_quiver_ 13794 406742382 60 61 +Contig658_quiver_ 13769 406756425 60 61 +Contig355_quiver_ 13753 406770443 60 61 +Contig203_quiver_ 13722 406784445 60 61 +Contig233_quiver_ 13719 406798415 60 61 +Contig623_quiver_ 13698 406812382 60 61 +Contig277_quiver_ 13696 406826328 60 61 +Contig75_quiver_ 13694 406840271 60 61 +Contig179_quiver_ 13674 406854213 60 61 +Contig638_quiver_ 13657 406868134 60 61 +Contig577_quiver_ 13634 406882038 60 61 +Contig465_quiver_ 13633 406895919 60 61 +Contig127_quiver_ 13618 406909799 60 61 +Contig299_quiver_ 13570 406923663 60 61 +Contig345_quiver_ 13504 406937479 60 61 +Contig154_quiver_ 13463 406951228 60 61 +Contig469_quiver_ 13462 406964935 60 61 +Contig604_quiver_ 13409 406978641 60 61 +Contig144_quiver_ 13380 406992293 60 61 +Contig123_quiver_ 13329 407005915 60 61 +Contig126_quiver_ 13299 407019486 60 61 +Contig164_quiver_ 13290 407033026 60 61 +Contig125_quiver_ 13211 407046557 60 61 +Contig872_quiver_ 13177 407060008 60 61 +Contig889_quiver_ 13146 407073424 60 61 +Contig105_quiver_ 13130 407086809 60 61 +Contig901_quiver_ 13046 407100177 60 61 +Contig286_quiver_ 12947 407113460 60 61 +Contig715_quiver_ 12902 407126642 60 61 +Contig186_quiver_ 12877 407139779 60 61 +Contig14_quiver_ 12873 407152889 60 61 +Contig96_quiver_ 12859 407165995 60 61 +Contig524_quiver_ 12816 407179088 60 61 +Contig418_quiver_ 12657 407192137 60 61 +Contig431_quiver_ 12636 407205024 60 61 +Contig479_quiver_ 12627 407217890 60 61 +Contig855_quiver_ 12574 407230747 60 61 +Contig390_quiver_ 12382 407243550 60 61 +Contig619_quiver_ 12319 407256158 60 61 +Contig215_quiver_ 12215 407268702 60 61 +Contig473_quiver_ 12211 407281140 60 61 +Contig453_quiver_ 12178 407293574 60 61 +Contig57_quiver_ 12174 407305973 60 61 +Contig512_quiver_ 12135 407318369 60 61 +Contig729_quiver_ 12102 407330726 60 61 +Contig586_quiver_ 12083 407343049 60 61 +Contig430_quiver_ 11974 407355353 60 61 +Contig647_quiver_ 11966 407367546 60 61 +Contig641_quiver_ 11958 407379731 60 61 +Contig421_quiver_ 11855 407391908 60 61 +Contig531_quiver_ 11852 407403980 60 61 +Contig621_quiver_ 11846 407416049 60 61 +Contig324_quiver_ 11840 407428112 60 61 +Contig608_quiver_ 11840 407440169 60 61 +Contig903_quiver_ 11839 407452226 60 61 +Contig733_quiver_ 11826 407464282 60 61 +Contig831_quiver_ 11816 407476325 60 61 +Contig438_quiver_ 11802 407488357 60 61 +Contig46_quiver_ 11679 407500374 60 61 +Contig865_quiver_ 11640 407512267 60 61 +Contig666_quiver_ 11591 407524120 60 61 +Contig273_quiver_ 11549 407535924 60 61 +Contig620_quiver_ 11511 407547685 60 61 +Contig516_quiver_ 11439 407559407 60 61 +Contig128_quiver_ 11427 407571056 60 61 +Contig165_quiver_ 11388 407582693 60 61 +Contig193_quiver_ 11359 407594290 60 61 +Contig156_quiver_ 11336 407605858 60 61 +Contig595_quiver_ 11316 407617402 60 61 +Contig813_quiver_ 11270 407628926 60 61 +Contig851_quiver_ 11269 407640403 60 61 +Contig724_quiver_ 11264 407651879 60 61 +Contig755_quiver_ 11252 407663350 60 61 +Contig216_quiver_ 11232 407674809 60 61 +Contig838_quiver_ 11232 407686248 60 61 +Contig321_quiver_ 11214 407697687 60 61 +Contig853_quiver_ 11126 407709107 60 61 +Contig602_quiver_ 11110 407720438 60 61 +Contig515_quiver_ 11085 407731753 60 61 +Contig802_quiver_ 11026 407743042 60 61 +Contig818_quiver_ 10988 407754271 60 61 +Contig458_quiver_ 10976 407765462 60 61 +Contig488_quiver_ 10811 407776640 60 61 +Contig530_quiver_ 10762 407787651 60 61 +Contig794_quiver_ 10737 407798612 60 61 +Contig201_quiver_ 10670 407809547 60 61 +Contig383_quiver_ 10652 407820414 60 61 +Contig499_quiver_ 10629 407831263 60 61 +Contig226_quiver_ 10488 407842089 60 61 +Contig116_quiver_ 10417 407852771 60 61 +Contig803_quiver_ 10387 407863381 60 61 +Contig592_quiver_ 10385 407873961 60 61 +Contig461_quiver_ 10378 407884539 60 61 +Contig91_quiver_ 10354 407895108 60 61 +Contig185_quiver_ 10330 407905654 60 61 +Contig138_quiver_ 10318 407916176 60 61 +Contig833_quiver_ 10279 407926685 60 61 +Contig873_quiver_ 10246 407937155 60 61 +Contig823_quiver_ 10244 407947591 60 61 +Contig399_quiver_ 10195 407958025 60 61 +Contig763_quiver_ 10189 407968409 60 61 +Contig79_quiver_ 10185 407978786 60 61 +Contig327_quiver_ 10183 407989160 60 61 +Contig133_quiver_ 10157 407999532 60 61 +Contig99_quiver_ 10121 408009877 60 61 +Contig533_quiver_ 10121 408020186 60 61 +Contig714_quiver_ 10087 408030495 60 61 +Contig601_quiver_ 10062 408040770 60 61 +Contig882_quiver_ 10042 408051019 60 61 +Contig308_quiver_ 9995 408061248 60 61 +Contig868_quiver_ 9995 408071429 60 61 +Contig579_quiver_ 9949 408081610 60 61 +Contig434_quiver_ 9925 408091744 60 61 +Contig340_quiver_ 9889 408101854 60 61 +Contig180_quiver_ 9873 408111927 60 61 +Contig649_quiver_ 9834 408121984 60 61 +Contig319_quiver_ 9800 408132001 60 61 +Contig244_quiver_ 9786 408141984 60 61 +Contig39_quiver_ 9723 408151952 60 61 +Contig166_quiver_ 9711 408161857 60 61 +Contig266_quiver_ 9690 408171749 60 61 +Contig874_quiver_ 9669 408181620 60 61 +Contig694_quiver_ 9570 408191470 60 61 +Contig703_quiver_ 9565 408201219 60 61 +Contig287_quiver_ 9537 408210963 60 61 +Contig589_quiver_ 9487 408220678 60 61 +Contig25_quiver_ 9456 408230342 60 61 +Contig777_quiver_ 9449 408239975 60 61 +Contig51_quiver_ 9311 408249600 60 61 +Contig753_quiver_ 9286 408259086 60 61 +Contig449_quiver_ 9282 408268546 60 61 +Contig633_quiver_ 9256 408278002 60 61 +Contig58_quiver_ 9227 408287431 60 61 +Contig661_quiver_ 9221 408296831 60 61 +Contig109_quiver_ 9032 408306225 60 61 +Contig609_quiver_ 9000 408315427 60 61 +Contig798_quiver_ 8995 408324596 60 61 +Contig251_quiver_ 8954 408333760 60 61 +Contig490_quiver_ 8949 408342883 60 61 +Contig781_quiver_ 8893 408352001 60 61 +Contig137_quiver_ 8867 408361062 60 61 +Contig270_quiver_ 8856 408370096 60 61 +Contig378_quiver_ 8830 408379119 60 61 +Contig242_quiver_ 8804 408388116 60 61 +Contig342_quiver_ 8773 408397086 60 61 +Contig597_quiver_ 8716 408406025 60 61 +Contig394_quiver_ 8712 408414906 60 61 +Contig525_quiver_ 8697 408423783 60 61 +Contig545_quiver_ 8692 408432644 60 61 +Contig225_quiver_ 8689 408441500 60 61 +Contig102_quiver_ 8675 408450353 60 61 +Contig103_quiver_ 8663 408459192 60 61 +Contig229_quiver_ 8647 408468019 60 61 +Contig497_quiver_ 8642 408476830 60 61 +Contig556_quiver_ 8639 408485636 60 61 +Contig31_quiver_ 8556 408494437 60 61 +Contig551_quiver_ 8520 408503155 60 61 +Contig106_quiver_ 8492 408511836 60 61 +Contig510_quiver_ 8453 408520489 60 61 +Contig637_quiver_ 8430 408529102 60 61 +Contig55_quiver_ 8381 408537691 60 61 +Contig627_quiver_ 8363 408546231 60 61 +Contig663_quiver_ 8327 408554753 60 61 +Contig907_quiver_ 8288 408563238 60 61 +Contig902_quiver_ 8281 408571684 60 61 +Contig735_quiver_ 8127 408580123 60 61 +Contig424_quiver_ 8110 408588405 60 61 +Contig56_quiver_ 8082 408596669 60 61 +Contig688_quiver_ 8076 408604905 60 61 +Contig330_quiver_ 8017 408613135 60 61 +Contig883_quiver_ 7952 408621305 60 61 +Contig761_quiver_ 7936 408629409 60 61 +Contig344_quiver_ 7934 408637497 60 61 +Contig239_quiver_ 7904 408645583 60 61 +Contig73_quiver_ 7868 408653637 60 61 +Contig104_quiver_ 7855 408661656 60 61 +Contig455_quiver_ 7850 408669661 60 61 +Contig290_quiver_ 7764 408677661 60 61 +Contig364_quiver_ 7732 408685574 60 61 +Contig248_quiver_ 7681 408693454 60 61 +Contig69_quiver_ 7669 408701282 60 61 +Contig669_quiver_ 7637 408709098 60 61 +Contig584_quiver_ 7594 408716882 60 61 +Contig134_quiver_ 7591 408724622 60 61 +Contig204_quiver_ 7585 408732359 60 61 +Contig336_quiver_ 7525 408740090 60 61 +Contig879_quiver_ 7519 408747760 60 61 +Contig114_quiver_ 7516 408755424 60 61 +Contig632_quiver_ 7458 408763085 60 61 +Contig113_quiver_ 7415 408770687 60 61 +Contig196_quiver_ 7415 408778245 60 61 +Contig291_quiver_ 7395 408785803 60 61 +Contig332_quiver_ 7394 408793341 60 61 +Contig843_quiver_ 7357 408800878 60 61 +Contig904_quiver_ 7305 408808377 60 61 +Contig398_quiver_ 7302 408815823 60 61 +Contig38_quiver_ 7248 408823265 60 61 +Contig261_quiver_ 7227 408830653 60 61 +Contig571_quiver_ 7197 408838020 60 61 +Contig160_quiver_ 7195 408845356 60 61 +Contig24_quiver_ 7191 408852689 60 61 +Contig122_quiver_ 7175 408860019 60 61 +Contig148_quiver_ 7167 408867333 60 61 +Contig252_quiver_ 7136 408874639 60 61 +Contig785_quiver_ 7123 408881913 60 61 +Contig611_quiver_ 7073 408889174 60 61 +Contig30_quiver_ 7060 408896383 60 61 +Contig812_quiver_ 7053 408903580 60 61 +Contig807_quiver_ 7034 408910770 60 61 +Contig110_quiver_ 7016 408917941 60 61 +Contig679_quiver_ 7003 408925093 60 61 +Contig439_quiver_ 6972 408932232 60 61 +Contig610_quiver_ 6952 408939340 60 61 +Contig613_quiver_ 6886 408946427 60 61 +Contig147_quiver_ 6852 408953447 60 61 +Contig306_quiver_ 6833 408960433 60 61 +Contig840_quiver_ 6816 408967399 60 61 +Contig365_quiver_ 6801 408974348 60 61 +Contig667_quiver_ 6756 408981282 60 61 +Contig422_quiver_ 6688 408988170 60 61 +Contig322_quiver_ 6639 408994989 60 61 +Contig454_quiver_ 6583 409001758 60 61 +Contig192_quiver_ 6567 409008470 60 61 +Contig296_quiver_ 6489 409015166 60 61 +Contig357_quiver_ 6465 409021783 60 61 +Contig456_quiver_ 6462 409028375 60 61 +Contig687_quiver_ 6454 409034964 60 61 +Contig804_quiver_ 6445 409041545 60 61 +Contig626_quiver_ 6431 409048117 60 61 +Contig567_quiver_ 6430 409054675 60 61 +Contig310_quiver_ 6413 409061232 60 61 +Contig36_quiver_ 6404 409067770 60 61 +Contig292_quiver_ 6394 409074300 60 61 +Contig200_quiver_ 6384 409080820 60 61 +Contig100_quiver_ 6344 409087330 60 61 +Contig741_quiver_ 6342 409093799 60 61 +Contig837_quiver_ 6337 409100266 60 61 +Contig648_quiver_ 6329 409106728 60 61 +Contig521_quiver_ 6301 409113182 60 61 +Contig575_quiver_ 6294 409119608 60 61 +Contig83_quiver_ 6277 409126025 60 61 +Contig587_quiver_ 6245 409132426 60 61 +Contig825_quiver_ 6243 409138795 60 61 +Contig583_quiver_ 6234 409145162 60 61 +Contig708_quiver_ 6220 409151519 60 61 +Contig832_quiver_ 6215 409157862 60 61 +Contig484_quiver_ 6198 409164200 60 61 +Contig759_quiver_ 6146 409170521 60 61 +Contig395_quiver_ 6145 409176789 60 61 +Contig537_quiver_ 6145 409183056 60 61 +Contig730_quiver_ 6131 409189323 60 61 +Contig811_quiver_ 6130 409195576 60 61 +Contig404_quiver_ 6100 409201828 60 61 +Contig387_quiver_ 6096 409208049 60 61 +Contig117_quiver_ 6086 409214266 60 61 +Contig684_quiver_ 6070 409220473 60 61 +Contig876_quiver_ 6031 409226664 60 61 +Contig258_quiver_ 5973 409232815 60 61 +Contig675_quiver_ 5916 409238907 60 61 +Contig284_quiver_ 5907 409244941 60 61 +Contig37_quiver_ 5842 409250965 60 61 +Contig614_quiver_ 5812 409256924 60 61 +Contig151_quiver_ 5770 409262852 60 61 +Contig681_quiver_ 5738 409268738 60 61 +Contig130_quiver_ 5715 409274591 60 61 +Contig407_quiver_ 5710 409280421 60 61 +Contig463_quiver_ 5675 409286246 60 61 +Contig563_quiver_ 5662 409292035 60 61 +Contig49_quiver_ 5657 409297810 60 61 +Contig412_quiver_ 5653 409303581 60 61 +Contig313_quiver_ 5621 409309348 60 61 +Contig496_quiver_ 5609 409315082 60 61 +Contig303_quiver_ 5606 409320804 60 61 +Contig665_quiver_ 5555 409326523 60 61 +Contig720_quiver_ 5553 409332190 60 61 +Contig846_quiver_ 5541 409337855 60 61 +Contig21_quiver_ 5516 409343507 60 61 +Contig23_quiver_ 5492 409349133 60 61 +Contig80_quiver_ 5492 409354735 60 61 +Contig175_quiver_ 5474 409360338 60 61 +Contig502_quiver_ 5429 409365923 60 61 +Contig819_quiver_ 5425 409371462 60 61 +Contig45_quiver_ 5424 409376996 60 61 +Contig63_quiver_ 5408 409382529 60 61 +Contig181_quiver_ 5372 409388047 60 61 +Contig328_quiver_ 5348 409393528 60 61 +Contig555_quiver_ 5330 409398985 60 61 +Contig769_quiver_ 5329 409404423 60 61 +Contig384_quiver_ 5325 409409860 60 61 +Contig367_quiver_ 5317 409415293 60 61 +Contig346_quiver_ 5314 409420718 60 61 +Contig485_quiver_ 5308 409426140 60 61 +Contig809_quiver_ 5262 409431556 60 61 +Contig564_quiver_ 5260 409436925 60 61 +Contig369_quiver_ 5251 409442292 60 61 +Contig513_quiver_ 5229 409447650 60 61 +Contig379_quiver_ 5226 409452986 60 61 +Contig314_quiver_ 5223 409458319 60 61 +Contig792_quiver_ 5208 409463649 60 61 +Contig905_quiver_ 5197 409468963 60 61 +Contig481_quiver_ 5170 409474266 60 61 +Contig646_quiver_ 5139 409479542 60 61 +Contig76_quiver_ 5134 409484785 60 61 +Contig72_quiver_ 5098 409490023 60 61 +Contig656_quiver_ 5071 409495225 60 61 +Contig676_quiver_ 5063 409500400 60 61 +Contig331_quiver_ 5023 409505567 60 61 +Contig222_quiver_ 5017 409510693 60 61 +Contig139_quiver_ 4978 409515813 60 61 +Contig202_quiver_ 4978 409520893 60 61 +Contig503_quiver_ 4942 409525973 60 61 +Contig220_quiver_ 4935 409531017 60 61 +Contig568_quiver_ 4927 409536054 60 61 +Contig163_quiver_ 4925 409541083 60 61 +Contig397_quiver_ 4916 409546110 60 61 +Contig428_quiver_ 4916 409551127 60 61 +Contig311_quiver_ 4908 409556144 60 61 +Contig766_quiver_ 4898 409561153 60 61 +Contig20_quiver_ 4895 409566151 60 61 +Contig316_quiver_ 4891 409571147 60 61 +Contig468_quiver_ 4812 409576139 60 61 +Contig176_quiver_ 4807 409581051 60 61 +Contig782_quiver_ 4794 409585958 60 61 +Contig498_quiver_ 4785 409590851 60 61 +Contig618_quiver_ 4773 409595735 60 61 +Contig527_quiver_ 4755 409600607 60 61 +Contig3_quiver_ 4742 409605459 60 61 +Contig281_quiver_ 4737 409610300 60 61 +Contig617_quiver_ 4734 409615135 60 61 +Contig423_quiver_ 4718 409619967 60 61 +Contig448_quiver_ 4716 409624783 60 61 +Contig307_quiver_ 4712 409629597 60 61 +Contig305_quiver_ 4703 409634407 60 61 +Contig528_quiver_ 4687 409639208 60 61 +Contig864_quiver_ 4685 409643993 60 61 +Contig74_quiver_ 4653 409648775 60 61 +Contig817_quiver_ 4638 409653525 60 61 +Contig262_quiver_ 4631 409658260 60 61 +Contig743_quiver_ 4598 409662988 60 61 +Contig339_quiver_ 4597 409667682 60 61 +Contig816_quiver_ 4547 409672375 60 61 +Contig834_quiver_ 4546 409677017 60 61 +Contig606_quiver_ 4537 409681658 60 61 +Contig491_quiver_ 4500 409686290 60 61 +Contig178_quiver_ 4491 409690884 60 61 +Contig543_quiver_ 4465 409695469 60 61 +Contig188_quiver_ 4461 409700028 60 61 +Contig353_quiver_ 4460 409704583 60 61 +Contig457_quiver_ 4437 409709137 60 61 +Contig713_quiver_ 4424 409713667 60 61 +Contig625_quiver_ 4420 409718184 60 61 +Contig121_quiver_ 4407 409722697 60 61 +Contig403_quiver_ 4392 409727197 60 61 +Contig520_quiver_ 4387 409731682 60 61 +Contig209_quiver_ 4370 409736162 60 61 +Contig767_quiver_ 4367 409740624 60 61 +Contig361_quiver_ 4364 409745083 60 61 +Contig371_quiver_ 4362 409749539 60 61 +Contig557_quiver_ 4353 409753993 60 61 +Contig224_quiver_ 4332 409758438 60 61 +Contig142_quiver_ 4321 409762862 60 61 +Contig732_quiver_ 4309 409767275 60 61 +Contig591_quiver_ 4301 409771675 60 61 +Contig10_quiver_ 4296 409776066 60 61 +Contig218_quiver_ 4292 409780453 60 61 +Contig517_quiver_ 4276 409784836 60 61 +Contig514_quiver_ 4242 409789203 60 61 +Contig34_quiver_ 4240 409793534 60 61 +Contig26_quiver_ 4167 409797863 60 61 +Contig131_quiver_ 4154 409802119 60 61 +Contig150_quiver_ 4136 409806362 60 61 +Contig315_quiver_ 4135 409810586 60 61 +Contig451_quiver_ 4126 409814809 60 61 +Contig362_quiver_ 4115 409819023 60 61 +Contig265_quiver_ 4049 409823226 60 61 +Contig408_quiver_ 4041 409827362 60 61 +Contig354_quiver_ 4031 409831490 60 61 +Contig475_quiver_ 3979 409835608 60 61 +Contig464_quiver_ 3945 409839673 60 61 +Contig482_quiver_ 3917 409843703 60 61 +Contig411_quiver_ 3900 409847705 60 61 +Contig899_quiver_ 3879 409851689 60 61 +Contig742_quiver_ 3866 409855652 60 61 +Contig326_quiver_ 3864 409859602 60 61 +Contig827_quiver_ 3858 409863550 60 61 +Contig409_quiver_ 3856 409867492 60 61 +Contig294_quiver_ 3847 409871432 60 61 +Contig799_quiver_ 3845 409875363 60 61 +Contig744_quiver_ 3821 409879292 60 61 +Contig841_quiver_ 3820 409883196 60 61 +Contig875_quiver_ 3799 409887099 60 61 +Contig642_quiver_ 3766 409890981 60 61 +Contig680_quiver_ 3750 409894829 60 61 +Contig728_quiver_ 3750 409898661 60 61 +Contig671_quiver_ 3726 409902493 60 61 +Contig518_quiver_ 3723 409906301 60 61 +Contig187_quiver_ 3706 409910106 60 61 +Contig472_quiver_ 3676 409913893 60 61 +Contig866_quiver_ 3667 409917650 60 61 +Contig271_quiver_ 3645 409921398 60 61 +Contig19_quiver_ 3601 409925122 60 61 +Contig149_quiver_ 3582 409928803 60 61 +Contig217_quiver_ 3570 409932464 60 61 +Contig368_quiver_ 3565 409936113 60 61 +Contig887_quiver_ 3563 409939757 60 61 +Contig97_quiver_ 3532 409943398 60 61 +Contig191_quiver_ 3529 409947008 60 61 +Contig549_quiver_ 3502 409950615 60 61 +Contig847_quiver_ 3501 409954195 60 61 +Contig94_quiver_ 3500 409957773 60 61 +Contig240_quiver_ 3490 409961351 60 61 +Contig476_quiver_ 3459 409964919 60 61 +Contig711_quiver_ 3452 409968455 60 61 +Contig443_quiver_ 3432 409971984 60 61 +Contig721_quiver_ 3428 409975493 60 61 +Contig221_quiver_ 3424 409978998 60 61 +Contig101_quiver_ 3402 409982499 60 61 +Contig470_quiver_ 3402 409985977 60 61 +Contig754_quiver_ 3389 409989455 60 61 +Contig725_quiver_ 3382 409992920 60 61 +Contig898_quiver_ 3364 409996378 60 61 +Contig778_quiver_ 3363 409999818 60 61 +Contig612_quiver_ 3356 410003257 60 61 +Contig624_quiver_ 3355 410006688 60 61 +Contig285_quiver_ 3351 410010118 60 61 +Contig860_quiver_ 3337 410013544 60 61 +Contig124_quiver_ 3323 410016956 60 61 +Contig775_quiver_ 3309 410020354 60 61 +Contig892_quiver_ 3296 410023738 60 61 +Contig350_quiver_ 3269 410027108 60 61 +Contig585_quiver_ 3242 410030451 60 61 +Contig740_quiver_ 3242 410033767 60 61 +Contig323_quiver_ 3197 410037083 60 61 +Contig61_quiver_ 3171 410040352 60 61 +Contig772_quiver_ 3168 410043595 60 61 +Contig783_quiver_ 3141 410046835 60 61 +Contig693_quiver_ 3114 410050048 60 61 +Contig795_quiver_ 3112 410053233 60 61 +Contig278_quiver_ 3108 410056416 60 61 +Contig814_quiver_ 3096 410059595 60 61 +Contig293_quiver_ 3083 410062762 60 61 +Contig309_quiver_ 3078 410065916 60 61 +Contig223_quiver_ 3058 410069065 60 61 +Contig425_quiver_ 3050 410072193 60 61 +Contig870_quiver_ 3042 410075313 60 61 +Contig501_quiver_ 3041 410078425 60 61 +Contig845_quiver_ 3018 410081536 60 61 +Contig786_quiver_ 3011 410084624 60 61 +Contig588_quiver_ 3010 410087705 60 61 +Contig639_quiver_ 3002 410090785 60 61 +Contig645_quiver_ 2994 410093857 60 61 +Contig157_quiver_ 2973 410096920 60 61 +Contig507_quiver_ 2972 410099962 60 61 +Contig40_quiver_ 2967 410103002 60 61 +Contig539_quiver_ 2959 410106038 60 61 +Contig653_quiver_ 2947 410109066 60 61 +Contig376_quiver_ 2943 410112082 60 61 +Contig523_quiver_ 2932 410115094 60 61 +Contig787_quiver_ 2924 410118094 60 61 +Contig301_quiver_ 2895 410121086 60 61 +Contig302_quiver_ 2888 410124049 60 61 +Contig896_quiver_ 2883 410127005 60 61 +Contig705_quiver_ 2863 410129956 60 61 +Contig283_quiver_ 2849 410132886 60 61 +Contig702_quiver_ 2783 410135802 60 61 +Contig276_quiver_ 2778 410138651 60 61 +Contig770_quiver_ 2773 410141495 60 61 +Contig388_quiver_ 2772 410144334 60 61 +Contig474_quiver_ 2763 410147172 60 61 +Contig631_quiver_ 2760 410150001 60 61 +Contig791_quiver_ 2748 410152826 60 61 +Contig255_quiver_ 2734 410155639 60 61 +Contig92_quiver_ 2721 410158437 60 61 +Contig84_quiver_ 2714 410161222 60 61 +Contig389_quiver_ 2704 410164001 60 61 +Contig650_quiver_ 2699 410166770 60 61 +Contig600_quiver_ 2695 410169533 60 61 +Contig375_quiver_ 2670 410172292 60 61 +Contig253_quiver_ 2668 410175026 60 61 +Contig788_quiver_ 2667 410177758 60 61 +Contig854_quiver_ 2662 410180489 60 61 +Contig878_quiver_ 2648 410183215 60 61 +Contig274_quiver_ 2636 410185927 60 61 +Contig210_quiver_ 2624 410188626 60 61 +Contig446_quiver_ 2598 410191313 60 61 +Contig634_quiver_ 2576 410193974 60 61 +Contig760_quiver_ 2574 410196612 60 61 +Contig333_quiver_ 2564 410199248 60 61 +Contig630_quiver_ 2557 410201874 60 61 +Contig381_quiver_ 2523 410204493 60 61 +Contig337_quiver_ 2517 410207078 60 61 +Contig483_quiver_ 2517 410209656 60 61 +Contig219_quiver_ 2495 410212234 60 61 +Contig359_quiver_ 2490 410214790 60 61 +Contig739_quiver_ 2490 410217341 60 61 +Contig391_quiver_ 2478 410219892 60 61 +Contig867_quiver_ 2477 410222431 60 61 +Contig2_quiver_ 2474 410224967 60 61 +Contig415_quiver_ 2473 410227502 60 61 +Contig417_quiver_ 2459 410230036 60 61 +Contig159_quiver_ 2445 410232555 60 61 +Contig288_quiver_ 2431 410235060 60 61 +Contig652_quiver_ 2401 410237551 60 61 +Contig95_quiver_ 2399 410240011 60 61 +Contig170_quiver_ 2398 410242469 60 61 +Contig28_quiver_ 2361 410244925 60 61 +Contig452_quiver_ 2352 410247345 60 61 +Contig87_quiver_ 2348 410249755 60 61 +Contig793_quiver_ 2344 410252162 60 61 +Contig356_quiver_ 2341 410254565 60 61 +Contig815_quiver_ 2341 410256965 60 61 +Contig784_quiver_ 2329 410259365 60 61 +Contig67_quiver_ 2328 410261751 60 61 +Contig894_quiver_ 2323 410264137 60 61 +Contig885_quiver_ 2318 410266518 60 61 +Contig145_quiver_ 2314 410268894 60 61 +Contig352_quiver_ 2308 410271266 60 61 +Contig259_quiver_ 2298 410273632 60 61 +Contig659_quiver_ 2297 410275988 60 61 +Contig33_quiver_ 2296 410278342 60 61 +Contig267_quiver_ 2293 410280696 60 61 +Contig726_quiver_ 2256 410283047 60 61 +Contig82_quiver_ 2252 410285359 60 61 +Contig692_quiver_ 2241 410287668 60 61 +Contig64_quiver_ 2240 410289965 60 61 +Contig136_quiver_ 2235 410292262 60 61 +Contig406_quiver_ 2192 410294554 60 61 +Contig241_quiver_ 2191 410296802 60 61 +Contig704_quiver_ 2190 410299049 60 61 +Contig429_quiver_ 2177 410301295 60 61 +Contig245_quiver_ 2167 410303528 60 61 +Contig459_quiver_ 2124 410305751 60 61 +Contig701_quiver_ 2115 410307930 60 61 +Contig372_quiver_ 2101 410310100 60 61 +Contig553_quiver_ 2097 410312256 60 61 +Contig561_quiver_ 2097 410314407 60 61 +Contig460_quiver_ 2094 410316558 60 61 +Contig736_quiver_ 2091 410318706 60 61 +Contig745_quiver_ 2089 410320851 60 61 +Contig65_quiver_ 2076 410322993 60 61 +Contig852_quiver_ 2069 410325123 60 61 +Contig90_quiver_ 2064 410327245 60 61 +Contig861_quiver_ 2063 410329363 60 61 +Contig808_quiver_ 2056 410331480 60 61 +Contig234_quiver_ 2055 410333590 60 61 +Contig574_quiver_ 2051 410335699 60 61 +Contig664_quiver_ 2047 410337804 60 61 +Contig295_quiver_ 2046 410339905 60 61 +Contig77_quiver_ 2039 410342004 60 61 +Contig341_quiver_ 2032 410344096 60 61 +Contig547_quiver_ 2027 410346181 60 61 +Contig558_quiver_ 2011 410348261 60 61 +Contig629_quiver_ 1981 410350325 60 61 +Contig880_quiver_ 1975 410352359 60 61 +Contig771_quiver_ 1969 410354386 60 61 +Contig696_quiver_ 1959 410356407 60 61 +Contig495_quiver_ 1957 410358418 60 61 +Contig78_quiver_ 1952 410360426 60 61 +Contig552_quiver_ 1945 410362430 60 61 +Contig146_quiver_ 1941 410364427 60 61 +Contig396_quiver_ 1941 410366420 60 61 +Contig158_quiver_ 1939 410368413 60 61 +Contig280_quiver_ 1935 410370404 60 61 +Contig824_quiver_ 1934 410372391 60 61 +Contig529_quiver_ 1933 410374377 60 61 +Contig257_quiver_ 1930 410376362 60 61 +Contig682_quiver_ 1926 410378344 60 61 +Contig141_quiver_ 1925 410380322 60 61 +Contig243_quiver_ 1921 410382299 60 61 +Contig895_quiver_ 1919 410384272 60 61 +Contig427_quiver_ 1910 410386242 60 61 +Contig542_quiver_ 1906 410388203 60 61 +Contig570_quiver_ 1904 410390160 60 61 +Contig906_quiver_ 1904 410392115 60 61 +Contig877_quiver_ 1891 410394070 60 61 +Contig820_quiver_ 1887 410396012 60 61 +Contig842_quiver_ 1886 410397950 60 61 +Contig382_quiver_ 1885 410399887 60 61 +Contig657_quiver_ 1882 410401823 60 61 +Contig441_quiver_ 1875 410403756 60 61 +Contig886_quiver_ 1860 410405682 60 61 +Contig737_quiver_ 1858 410407592 60 61 +Contig581_quiver_ 1854 410409500 60 61 +Contig751_quiver_ 1834 410411404 60 61 +Contig779_quiver_ 1818 410413288 60 61 +Contig236_quiver_ 1817 410415156 60 61 +Contig678_quiver_ 1814 410417023 60 61 +Contig385_quiver_ 1810 410418887 60 61 +Contig59_quiver_ 1805 410420746 60 61 +Contig559_quiver_ 1796 410422601 60 61 +Contig891_quiver_ 1786 410424446 60 61 +Contig9_quiver_ 1773 410426279 60 61 +Contig88_quiver_ 1764 410428100 60 61 +Contig830_quiver_ 1762 410429913 60 61 +Contig400_quiver_ 1749 410431724 60 61 +Contig578_quiver_ 1744 410433522 60 61 +Contig888_quiver_ 1743 410435315 60 61 +Contig374_quiver_ 1738 410437107 60 61 +Contig504_quiver_ 1738 410438893 60 61 +Contig801_quiver_ 1726 410440679 60 61 +Contig66_quiver_ 1725 410442452 60 61 +Contig640_quiver_ 1715 410444225 60 61 +Contig532_quiver_ 1714 410445988 60 61 +Contig351_quiver_ 1711 410447750 60 61 +Contig410_quiver_ 1704 410449509 60 61 +Contig719_quiver_ 1695 410451261 60 61 +Contig467_quiver_ 1694 410453004 60 61 +Contig572_quiver_ 1689 410454746 60 61 +Contig494_quiver_ 1680 410456483 60 61 +Contig195_quiver_ 1670 410458210 60 61 +Contig237_quiver_ 1665 410459927 60 61 +Contig231_quiver_ 1656 410461639 60 61 +Contig155_quiver_ 1631 410463342 60 61 +Contig569_quiver_ 1627 410465020 60 61 +Contig320_quiver_ 1623 410466694 60 61 +Contig416_quiver_ 1621 410468364 60 61 +Contig442_quiver_ 1621 410470032 60 61 +Contig790_quiver_ 1621 410471700 60 61 +Contig81_quiver_ 1607 410473367 60 61 +Contig444_quiver_ 1607 410475020 60 61 +Contig573_quiver_ 1606 410476673 60 61 +Contig120_quiver_ 1599 410478325 60 61 +Contig849_quiver_ 1596 410479970 60 61 +Contig478_quiver_ 1595 410481612 60 61 +Contig167_quiver_ 1582 410483253 60 61 +Contig582_quiver_ 1579 410484881 60 61 +Contig413_quiver_ 1569 410486506 60 61 +Contig746_quiver_ 1567 410488121 60 61 +Contig52_quiver_ 1566 410489733 60 61 +Contig437_quiver_ 1561 410491345 60 61 +Contig541_quiver_ 1561 410492952 60 61 +Contig462_quiver_ 1557 410494559 60 61 +Contig749_quiver_ 1552 410496161 60 61 +Contig594_quiver_ 1544 410497758 60 61 +Contig401_quiver_ 1534 410499347 60 61 +Contig756_quiver_ 1533 410500926 60 61 +Contig272_quiver_ 1530 410502504 60 61 +Contig7_quiver_ 1529 410504077 60 61 +Contig18_quiver_ 1515 410505650 60 61 +Contig212_quiver_ 1513 410507210 60 61 +Contig780_quiver_ 1513 410508768 60 61 +Contig173_quiver_ 1512 410510326 60 61 +Contig275_quiver_ 1496 410511883 60 61 +Contig426_quiver_ 1491 410513423 60 61 +Contig107_quiver_ 1478 410514958 60 61 +Contig593_quiver_ 1475 410516480 60 61 +Contig440_quiver_ 1467 410517999 60 61 +Contig686_quiver_ 1466 410519510 60 61 +Contig828_quiver_ 1465 410521020 60 61 +Contig797_quiver_ 1464 410522529 60 61 +Contig677_quiver_ 1461 410524037 60 61 +Contig435_quiver_ 1458 410525542 60 61 +Contig821_quiver_ 1443 410527044 60 61 +Contig300_quiver_ 1437 410528531 60 61 +Contig655_quiver_ 1424 410530011 60 61 +Contig119_quiver_ 1417 410531478 60 61 +Contig576_quiver_ 1416 410532938 60 61 +Contig857_quiver_ 1411 410534397 60 61 +Contig419_quiver_ 1406 410535851 60 61 +Contig776_quiver_ 1404 410537300 60 61 +Contig615_quiver_ 1397 410538747 60 61 +Contig172_quiver_ 1393 410540187 60 61 +Contig433_quiver_ 1385 410541623 60 61 +Contig343_quiver_ 1382 410543051 60 61 +Contig492_quiver_ 1374 410544476 60 61 +Contig890_quiver_ 1370 410545892 60 61 +Contig471_quiver_ 1369 410547304 60 61 +Contig182_quiver_ 1366 410548715 60 61 +Contig238_quiver_ 1359 410550123 60 61 +Contig707_quiver_ 1358 410551524 60 61 +Contig835_quiver_ 1348 410552924 60 61 +Contig822_quiver_ 1339 410554314 60 61 +Contig768_quiver_ 1337 410555695 60 61 +Contig856_quiver_ 1330 410557074 60 61 +Contig53_quiver_ 1324 410558445 60 61 +Contig263_quiver_ 1323 410559811 60 61 +Contig844_quiver_ 1322 410561176 60 61 +Contig269_quiver_ 1318 410562540 60 61 +Contig205_quiver_ 1316 410563899 60 61 +Contig765_quiver_ 1311 410565256 60 61 +Contig710_quiver_ 1304 410566608 60 61 +Contig112_quiver_ 1300 410567953 60 61 +Contig249_quiver_ 1296 410569294 60 61 +Contig129_quiver_ 1295 410570631 60 61 +Contig247_quiver_ 1295 410571967 60 61 +Contig480_quiver_ 1294 410573303 60 61 +Contig500_quiver_ 1277 410574638 60 61 +Contig334_quiver_ 1269 410575956 60 61 +Contig508_quiver_ 1268 410577266 60 61 +Contig565_quiver_ 1262 410578575 60 61 +Contig509_quiver_ 1261 410579878 60 61 +Contig691_quiver_ 1259 410581180 60 61 +Contig758_quiver_ 1256 410582479 60 61 +Contig184_quiver_ 1252 410583775 60 61 +Contig805_quiver_ 1243 410585067 60 61 +Contig329_quiver_ 1240 410586350 60 61 +Contig548_quiver_ 1231 410587630 60 61 +Contig450_quiver_ 1229 410588901 60 61 +Contig757_quiver_ 1220 410590170 60 61 +Contig752_quiver_ 1213 410591430 60 61 +Contig41_quiver_ 1210 410592682 60 61 +Contig335_quiver_ 1203 410593932 60 61 +Contig206_quiver_ 1199 410595175 60 61 +Contig230_quiver_ 1196 410596413 60 61 +Contig325_quiver_ 1187 410597648 60 61 +Contig670_quiver_ 1182 410598874 60 61 +Contig232_quiver_ 1176 410600095 60 61 +Contig318_quiver_ 1173 410601310 60 61 +Contig673_quiver_ 1163 410602522 60 61 +Contig800_quiver_ 1161 410603724 60 61 +Contig540_quiver_ 1158 410604924 60 61 +Contig317_quiver_ 1156 410606121 60 61 +Contig850_quiver_ 1153 410607316 60 61 +Contig386_quiver_ 1152 410608508 60 61 +Contig214_quiver_ 1143 410609699 60 61 +Contig402_quiver_ 1136 410610881 60 61 +Contig298_quiver_ 1127 410612055 60 61 +Contig198_quiver_ 1126 410613220 60 61 +Contig85_quiver_ 1123 410614383 60 61 +Contig616_quiver_ 1115 410615544 60 61 +Contig42_quiver_ 1110 410616696 60 61 +Contig869_quiver_ 1109 410617844 60 61 +Contig662_quiver_ 1099 410618991 60 61 +Contig211_quiver_ 1080 410620128 60 61 +Contig108_quiver_ 1077 410621245 60 61 +Contig635_quiver_ 1072 410622359 60 61 +Contig893_quiver_ 1066 410623468 60 61 +Contig349_quiver_ 1064 410624571 60 61 +Contig871_quiver_ 1043 410625672 60 61 +Contig256_quiver_ 1040 410626752 60 61 +Contig700_quiver_ 1040 410627829 60 61 +Contig690_quiver_ 981 410628906 60 61 +Contig162_quiver_ 970 410629923 60 61 +Contig672_quiver_ 964 410630929 60 61 +Contig526_quiver_ 872 410631929 60 61 +Contig712_quiver_ 869 410632835 60 61 +Contig213_quiver_ 818 410633738 60 61 +Contig599_quiver_ 815 410634589 60 61 +Contig762_quiver_ 741 410635437 60 61 +Contig360_quiver_ 727 410636210 60 61 +Contig487_quiver_ 640 410636969 60 61 +Contig477_quiver_ 639 410637639 60 61 +Contig152_quiver_ 131 410638308 60 61 diff --git a/studies/isoform_sequencing/isa.study.xlsx b/studies/isoform_sequencing/isa.study.xlsx index 48af51fa63c14650219e5e214e849d894b1120dd..8eb16406627b088cc52f0610b3be96421ca7c2d1 100644 Binary files a/studies/isoform_sequencing/isa.study.xlsx and b/studies/isoform_sequencing/isa.study.xlsx differ diff --git a/workflows/BSA/readme.txt b/workflows/BSA/readme.txt index 1c81ccdc62112abefd1eb757dfce03d9db65600c..a7750ca09f24c08d3cf55d2b0f935a1bfa3e3a5f 100644 --- a/workflows/BSA/readme.txt +++ b/workflows/BSA/readme.txt @@ -7,39 +7,39 @@ Code for bulk segregant analysis, haplotype analysis and quantification of flowe RNAseq: -- code/BSA/RNAseq/adapter_trimming.sh +- workflows/BSA/RNAseq/adapter_trimming.sh trimming of adapter sequences using Trimmomatic -- code/BSA/RNAseq/index_STAR.sh +- workflows/BSA/RNAseq/index_STAR.sh index the genome with STAR using the genome annotation v2.2 -- code/BSA/RNAseq/run_STAR.sh +- workflows/BSA/RNAseq/run_STAR.sh map reads to the genome using STAR -- code/BSA/RNAseq/QC.sh +- workflows/BSA/RNAseq/QC.sh quality control of adapter trimming and read mapping -- code/BSA/RNAseq/run_kallisto.sh +- workflows/BSA/RNAseq/run_kallisto.sh quantification of gene expression using kallisto WGS: -- code/BSA/WGS/map_reads.sh +- workflows/BSA/WGS/map_reads.sh map WGS reads to the genome -- code/BSA/WGS/combined_filter.sh +- workflows/BSA/WGS/combined_filter.sh call and filter variants Analysis: -- code/BSA/read_count_analysis.Rmd +- workflows/BSA/read_count_analysis.Rmd Analysis of gene expression data in pooled flower tissue -- code/BSA/bsa_and_plotting.R +- workflows/BSA/bsa_and_plotting.R Conduct bulk segregant analysis -- code/BSA/snpEff_analysis.Rmd +- workflows/BSA/snpEff_analysis.Rmd Analyse BSA variants using snpEff -- code/BSA/read_count_analysis_from_bam.Rmd +- workflows/BSA/read_count_analysis_from_bam.Rmd Investigate support for AhCYP76AD2 variants by RNA-seq data diff --git a/workflows/annotation_analysis/R2R3_analysis_reannotation.Rmd b/workflows/annotation_analysis/R2R3_analysis_reannotation.Rmd index a35b46aa656b5298a7c755b2a10131a4ddc0fb11..2d73d201de112f39962676d6f211b59bb92dd7a7 100644 --- a/workflows/annotation_analysis/R2R3_analysis_reannotation.Rmd +++ b/workflows/annotation_analysis/R2R3_analysis_reannotation.Rmd @@ -27,9 +27,9 @@ This script takes the braker2 output and runs an HMMscan using the MYB DNA-bindi ## HMMscan using the MYB DNA-binding domain: ```{bash} -mkdir -p data/annotation_analysis/myb_annotation/hmmscan +mkdir -p runs/annotation_analysis/myb_annotation/hmmscan -hmmscan --domtblout data/annotation_analysis/myb_annotation/hmmscan/out.txt data/annotation_analysis/myb_annotation/myb_profile/Myb_DNA-binding.hmm polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta +hmmscan --domtblout runs/annotation_analysis/myb_annotation/hmmscan/out.txt runs/annotation_analysis/myb_annotation/myb_profile/Myb_DNA-binding.hmm runs/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta ``` @@ -40,7 +40,7 @@ Takes domtblout output of the HMMscan using the MYB DNA-binding domain HMM profi ```{r} # Read hmmscan output file into variable data (domtblout format) -data <- read.table('data/annotation_analysis/myb_annotation/hmmscan/out.txt', row.names = NULL, fill = T) +data <- read.table('runs/annotation_analysis/myb_annotation/hmmscan/out.txt', row.names = NULL, fill = T) ## Add column including the length of alignment data <- data %>% mutate(alignment_length = V17-V16) @@ -144,7 +144,7 @@ Extracting only the R2R3 mybs (with two adjacent MYB domains). filtered_mybs <- filtered_mybs[filtered_mybs$Freq > 1,] # write gene names to file -write.csv(filtered_mybs, file = "data/annotation_analysis/myb_annotation/myb_names.txt", +write.csv(filtered_mybs, file = "runs/annotation_analysis/myb_annotation/myb_names.txt", quote = F, row.names = F) ``` @@ -179,12 +179,12 @@ read.gtf <- function(file){ } # read annotation and filter for MYB genes -annotation <- read.gtf(file = "polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf") +annotation <- read.gtf(file = "runs/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf") annotation <- annotation %>% filter(transcript_id %in% filtered_mybs$query_name) # read in MYB subgroup and function assignment -myb_function <- read_csv(file = "data/manual_sheets/MYB_with_subgroups_and_function.csv") +myb_function <- read_csv(file = "studies/additional_data/resources/MYB_with_subgroups_and_function.csv") myb_function <- myb_function %>% mutate(Function = replace_na(Function, "-")) @@ -232,7 +232,7 @@ myb_stats[62,1] <- "AmMYB3" myb_stats[63,1] <- "AmMYB4" # write output file -write.csv(myb_stats, file = "data/annotation_analysis/myb_annotation/myb_stats.csv", +write.csv(myb_stats, file = "runs/annotation_analysis/myb_annotation/myb_stats.csv", quote = T, row.names = F) ``` @@ -242,7 +242,7 @@ Analyse basic statistics: ```{r} # read in data -myb_stats <- read.csv(file = "data/annotation_analysis/myb_annotation/myb_stats.csv") +myb_stats <- read.csv(file = "runs/annotation_analysis/myb_annotation/myb_stats.csv") # exon count of R2R3 MYBs myb_stats %>% @@ -257,7 +257,7 @@ myb_stats %>% Extract myb protein fasta based on the identified MYB names. ```{bash} -seqkit faidx -l data/annotation_analysis/myb_annotation/myb_names.txt polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta > data/annotation_analysis/myb_annotation/AH_myb.fasta +seqkit faidx -l runs/annotation_analysis/myb_annotation/myb_names.txt runs/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta > runs/annotation_analysis/myb_annotation/AH_myb.fasta ``` ## Alignment and Phylogeny @@ -270,10 +270,10 @@ Supplements of: Stracke, R., Holtgräwe, D., Schneider, J., Pucker, B., Rosleff ```{bash} # align MYBs using clustalOmega -cat data/annotation_analysis/myb_annotation/AH_myb.fasta data/annotation_analysis/myb_annotation/bvulgaris_athaliana_myb.fasta | /home/tom/Documents/tools/clustalo-1.2.4-Ubuntu-x86_64 --threads=6 --in=- --outfile=data/annotation_analysis/myb_annotation/all_myb.aln --force +cat runs/annotation_analysis/myb_annotation/AH_myb.fasta runs/annotation_analysis/myb_annotation/bvulgaris_athaliana_myb.fasta | /home/tom/Documents/tools/clustalo-1.2.4-Ubuntu-x86_64 --threads=6 --in=- --outfile=runs/annotation_analysis/myb_annotation/all_myb.aln --force # create phylogeny from alignment using clustalw2 -/home/tom/Documents/tools/clustalw-2.1-linux-x86_64-libcppstatic/clustalw2 -bootstrap=1000 -infile=data/annotation_analysis/myb_annotation/all_myb.aln -outfile=data/annotation_analysis/myb_annotation/all_myb.phb +/home/tom/Documents/tools/clustalw-2.1-linux-x86_64-libcppstatic/clustalw2 -bootstrap=1000 -infile=runs/annotation_analysis/myb_annotation/all_myb.aln -outfile=runs/annotation_analysis/myb_annotation/all_myb.phb ``` ## Plot phylogeny @@ -282,7 +282,7 @@ I will try to use the same functions as I did for my bachelor thesis. ```{r} # read in the bootstrapped phylogeny -tree <- read.raxml("data/annotation_analysis/myb_annotation/all_myb.phb") +tree <- read.raxml("runs/annotation_analysis/myb_annotation/all_myb.phb") # create tree object t <- ggtree(tree, branch.length = 'none', layout='circular', size=0.2) @@ -401,12 +401,12 @@ t_species + plot_clade2(588,'AtMYB5', "Development, flavonoid biosynthesis") + theme(plot.margin = margin(1,2,1,1, "cm")) -ggsave(filename = "plots/MYB_phylogenetic_tree.png", +ggsave(filename = "runs/MYB_phylogenetic_tree.png", width = 14, height = 14, dpi = 600) -ggsave(filename = "plots/MYB_phylogenetic_tree.pdf", +ggsave(filename = "runs/MYB_phylogenetic_tree.pdf", device = "pdf", width = 12, height = 12, @@ -419,17 +419,17 @@ ggsave(filename = "plots/MYB_phylogenetic_tree.pdf", Create and plot alignment of MYBs of S6 and BvMYB1-like clades. ```{bash} -mkdir plots/AmMYB2_figure/ +mkdir runs/plots/AmMYB2_figure/ # align using clustalomega -/home/tom/Documents/tools/clustalo-1.2.4-Ubuntu-x86_64 --in=data/annotation_analysis/myb_annotation/S6_betalain_myb_alignment/manual_S6.fasta --outfile=data/annotation_analysis/myb_annotation/S6_betalain_myb_alignment/manual_S6.aln +/home/tom/Documents/tools/clustalo-1.2.4-Ubuntu-x86_64 --in=runs/annotation_analysis/myb_annotation/S6_betalain_myb_alignment/manual_S6.fasta --outfile=runs/annotation_analysis/myb_annotation/S6_betalain_myb_alignment/manual_S6.aln ``` Import and plot alignment: ```{r} # read in alignment -S6_align <- readAAMultipleAlignment("data/annotation_analysis/myb_annotation/S6_betalain_myb_alignment/manual_S6.aln", +S6_align <- readAAMultipleAlignment("runs/annotation_analysis/myb_annotation/S6_betalain_myb_alignment/manual_S6.aln", format = "fasta") S6_align@unmasked@ranges@NAMES <- c("AtMYB75 (PAP1)", "AtMYB90 (PAP2)", "AtMYB113", "AtMYB114", "AhMYB2.1", "AhMYB2.2", "BvMYB1", "Bv_ralf", "AhMYB3.1","AhMYB4.1") @@ -447,7 +447,7 @@ ggmsa(S6_align, plot.margin = unit(c(1,1,1,1), "cm")) # save the alignment -ggsave(filename = "plots/AmMYB2_figure/S6_betalain_myb_align.png", +ggsave(filename = "runs/plots/AmMYB2_figure/S6_betalain_myb_align.png", width = 9, height = 6, dpi = 320, @@ -466,7 +466,7 @@ ggmsa(S6_align, plot.margin = unit(c(0,0.5,0,0.5), "cm")) # save the alignment -ggsave(filename = "plots/AmMYB2_figure/S6_betalain_myb_S6.png", +ggsave(filename = "runs/plots/AmMYB2_figure/S6_betalain_myb_S6.png", width = 9, height = 2.5, dpi = 400, @@ -518,7 +518,7 @@ residue_raster <- ggplot(data = df_melt) + residue_raster -ggsave(filename = "plots/AmMYB2_figure/residue_raster.png", +ggsave(filename = "runs/plots/AmMYB2_figure/residue_raster.png", width = 9, height = 1.5, dpi = 400) diff --git a/workflows/annotation_analysis/betalain_and_flavonoid_identification.sh b/workflows/annotation_analysis/betalain_and_flavonoid_identification.sh index d11067692f19afdb04f2d6d738790d484a9aed17..b4b878aab818aff5a777df5e6dbe33b34128eacc 100644 --- a/workflows/annotation_analysis/betalain_and_flavonoid_identification.sh +++ b/workflows/annotation_analysis/betalain_and_flavonoid_identification.sh @@ -5,19 +5,19 @@ # identification of betalains by blast # create blast database of annotated protein sequences # source of betalain pathway protein sequences is described in manuscript -mkdir -p data/annotation_analysis/betalains/DB -mkdir -p data/annotation_analysis/flavonoids +mkdir -p runs/annotation_analysis/betalains/DB +mkdir -p runs/annotation_analysis/flavonoids -makeblastdb -in polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \ - -out data/annotation_analysis/betalains/DB/blast_db \ - -logfile data/annotation_analysis/betalains/blast_db.log \ +makeblastdb -in runs/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \ + -out runs/annotation_analysis/betalains/DB/blast_db \ + -logfile runs/annotation_analysis/betalains/blast_db.log \ -dbtype prot # run protein blast of described betalain pathway genes against blast database -blastp -query data/annotation_analysis/betalains/pathway.fasta \ - -db data/annotation_analysis/betalains/DB/blast_db \ +blastp -query runs/annotation_analysis/betalains/pathway.fasta \ + -db runs/annotation_analysis/betalains/DB/blast_db \ -outfmt 7 \ - -out data/annotation_analysis/betalains/pathway_against_protein.out \ + -out runs/annotation_analysis/betalains/pathway_against_protein.out \ -qcov_hsp_perc 80 @@ -25,36 +25,36 @@ blastp -query data/annotation_analysis/betalains/pathway.fasta \ # run KIPEs: python /home/tom/Documents/tools/KIPEs/KIPEs3.py --baits /home/tom/Documents/tools/KIPEs/flavonoid_baits/ \ --positions /home/tom/Documents/tools/KIPEs/flavonoid_residues/ \ - --out data/annotation_analysis/flavonoids/ \ - --subject polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \ + --out runs/annotation_analysis/flavonoids/ \ + --subject runs/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \ --seqtype pep \ --cpus 6 # create blast db -mkdir data/annotation_analysis/flavonoids/blast_db -makeblastdb -in polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta -out data/annotation_analysis/flavonoids/blast_db/db -dbtype nucl +mkdir runs/annotation_analysis/flavonoids/blast_db +makeblastdb -in runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta -out runs/annotation_analysis/flavonoids/blast_db/db -dbtype nucl # blast searches for unidentified candidate genes using KIPEs bait sequences -tblastn -query data/annotation_analysis/flavonoids/KIPEs/blast_query/ANS.fasta -db data/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > data/annotation_analysis/flavonoids/ANS_blast.out -tblastn -query data/annotation_analysis/flavonoids/KIPEs/blast_query/ANR.fasta -db data/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > data/annotation_analysis/flavonoids/ANR_blast.out -tblastn -query data/annotation_analysis/flavonoids/KIPEs/blast_query/F3-5-H.fasta -db data/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > data/annotation_analysis/flavonoids/F3-5-H_blast.out -tblastn -query data/annotation_analysis/flavonoids/KIPEs/blast_query/FNS1.fasta -db data/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > data/annotation_analysis/flavonoids/FNS1_blast.out -tblastn -query data/annotation_analysis/flavonoids/KIPEs/blast_query/CHI2.fasta -db data/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > data/annotation_analysis/flavonoids/CHI2_blast.out +tblastn -query runs/annotation_analysis/flavonoids/KIPEs/blast_query/ANS.fasta -db runs/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > runs/annotation_analysis/flavonoids/ANS_blast.out +tblastn -query runs/annotation_analysis/flavonoids/KIPEs/blast_query/ANR.fasta -db runs/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > runs/annotation_analysis/flavonoids/ANR_blast.out +tblastn -query runs/annotation_analysis/flavonoids/KIPEs/blast_query/F3-5-H.fasta -db runs/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > runs/annotation_analysis/flavonoids/F3-5-H_blast.out +tblastn -query runs/annotation_analysis/flavonoids/KIPEs/blast_query/FNS1.fasta -db runs/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > runs/annotation_analysis/flavonoids/FNS1_blast.out +tblastn -query runs/annotation_analysis/flavonoids/KIPEs/blast_query/CHI2.fasta -db runs/annotation_analysis/flavonoids/blast_db/db -outfmt 7 > runs/annotation_analysis/flavonoids/CHI2_blast.out # exonerate protein alignments # prepare fasta file of KIPEs bait sequences -sed 's/%_//' data/annotation_analysis/flavonoids/KIPEs/blast_query/ANS.fasta > data/annotation_analysis/flavonoids/ANS_fixed.fasta +sed 's/%_//' runs/annotation_analysis/flavonoids/KIPEs/blast_query/ANS.fasta > runs/annotation_analysis/flavonoids/ANS_fixed.fasta # run exonerate -exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query data/annotation_analysis/flavonoids/ANS_fixed.fasta --target polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > data/annotation_analysis/flavonoids/ANS_exonerate.gff +exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query runs/annotation_analysis/flavonoids/ANS_fixed.fasta --target runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > runs/annotation_analysis/flavonoids/ANS_exonerate.gff -sed 's/%_//' data/annotation_analysis/flavonoids/KIPEs/blast_query/ANR.fasta > data/annotation_analysis/flavonoids/ANR_fixed.fasta -exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query data/annotation_analysis/flavonoids/ANR_fixed.fasta --target polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > data/annotation_analysis/flavonoids/ANR_exonerate.gff +sed 's/%_//' runs/annotation_analysis/flavonoids/KIPEs/blast_query/ANR.fasta > runs/annotation_analysis/flavonoids/ANR_fixed.fasta +exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query runs/annotation_analysis/flavonoids/ANR_fixed.fasta --target runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > runs/annotation_analysis/flavonoids/ANR_exonerate.gff -sed 's/%_//' data/annotation_analysis/flavonoids/KIPEs/blast_query/F3-5-H.fasta > data/annotation_analysis/flavonoids/F3-5-H_fixed.fasta -exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query data/annotation_analysis/flavonoids/F3-5-H_fixed.fasta --target polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > data/annotation_analysis/flavonoids/F3-5-H_exonerate.gff +sed 's/%_//' runs/annotation_analysis/flavonoids/KIPEs/blast_query/F3-5-H.fasta > runs/annotation_analysis/flavonoids/F3-5-H_fixed.fasta +exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query runs/annotation_analysis/flavonoids/F3-5-H_fixed.fasta --target runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > runs/annotation_analysis/flavonoids/F3-5-H_exonerate.gff -sed 's/%_//' data/annotation_analysis/flavonoids/KIPEs/blast_query/FNS1.fasta > data/annotation_analysis/flavonoids/FNS1_fixed.fasta -exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query data/annotation_analysis/flavonoids/FNS1_fixed.fasta --target polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > data/annotation_analysis/flavonoids/FNS1_exonerate.gff +sed 's/%_//' runs/annotation_analysis/flavonoids/KIPEs/blast_query/FNS1.fasta > runs/annotation_analysis/flavonoids/FNS1_fixed.fasta +exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query runs/annotation_analysis/flavonoids/FNS1_fixed.fasta --target runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > runs/annotation_analysis/flavonoids/FNS1_exonerate.gff -sed 's/%_//' data/annotation_analysis/flavonoids/KIPEs/blast_query/CHI2.fasta > data/annotation_analysis/flavonoids/CHI2_fixed.fasta -exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query data/annotation_analysis/flavonoids/CHI2_fixed.fasta --target polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > data/annotation_analysis/flavonoids/CHI2_exonerate.gff +sed 's/%_//' runs/annotation_analysis/flavonoids/KIPEs/blast_query/CHI2.fasta > runs/annotation_analysis/flavonoids/CHI2_fixed.fasta +exonerate --model protein2genome --percent 55 --showvulgar no --showalignment no --showtargetgff yes --query runs/annotation_analysis/flavonoids/CHI2_fixed.fasta --target runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta > runs/annotation_analysis/flavonoids/CHI2_exonerate.gff diff --git a/workflows/annotation_analysis/betalain_phylogenetic_analysis.Rmd b/workflows/annotation_analysis/betalain_phylogenetic_analysis.Rmd index 881844d0c47f5ddc341cfd0a4eb59415c56f1d01..51422229a209bf205a21214ffc92e3b32753d360 100644 --- a/workflows/annotation_analysis/betalain_phylogenetic_analysis.Rmd +++ b/workflows/annotation_analysis/betalain_phylogenetic_analysis.Rmd @@ -18,22 +18,22 @@ knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2/") Perform phylogenetic analysis of betalain pathway genes. Searched for protein fasta sequences of known betalain pathway genes. Aligned the fasta sequences using ClustalOmega with default settings. Created a neighbour joining tree with 1000 bootstrap replicates using ClustalW. Used seaview to correctly root the phylogeny. Perform alignment and tree construction: ```{bash} -mkdir -p data/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD -mkdir -p data/annotation_analysis/betalains/phylogenetic_analysis/DODA +mkdir -p runs/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD +mkdir -p runs/annotation_analysis/betalains/phylogenetic_analysis/DODA # CYP76AD ### perform alignment -/home/tom/Documents/tools/clustalo-1.2.4-Ubuntu-x86_64 --threads=6 --in=data/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD.fasta --outfile=data/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD.aln --force +/home/tom/Documents/tools/clustalo-1.2.4-Ubuntu-x86_64 --threads=6 --in=runs/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD.fasta --outfile=runs/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD.aln --force ### create phylogeny, 1000 bootstrap replicates -/home/tom/Documents/tools/clustalw-2.1-linux-x86_64-libcppstatic/clustalw2 -bootstrap=1000 -infile=data/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD.aln -outfile=data/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD.phb +/home/tom/Documents/tools/clustalw-2.1-linux-x86_64-libcppstatic/clustalw2 -bootstrap=1000 -infile=runs/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD.aln -outfile=runs/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD.phb # DODA ### perform alignment -/home/tom/Documents/tools/clustalo-1.2.4-Ubuntu-x86_64 --threads=6 --in=data/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA.fasta --outfile=data/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA.aln --force +/home/tom/Documents/tools/clustalo-1.2.4-Ubuntu-x86_64 --threads=6 --in=runs/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA.fasta --outfile=runs/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA.aln --force ### create phylogeny, 1000 bootstrap replicates -/home/tom/Documents/tools/clustalw-2.1-linux-x86_64-libcppstatic/clustalw2 -bootstrap=1000 -infile=data/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA.aln -outfile=data/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA.phb +/home/tom/Documents/tools/clustalw-2.1-linux-x86_64-libcppstatic/clustalw2 -bootstrap=1000 -infile=runs/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA.aln -outfile=runs/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA.phb ``` ## Alignment plotting @@ -42,7 +42,7 @@ After rerooting to the outgroup using seaview: ```{r} # read in the CYP76AD1 tree -tree.CYP <- read.tree(file = "data/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD_rooted.phb") +tree.CYP <- read.tree(file = "runs/annotation_analysis/betalains/phylogenetic_analysis/CYP76AD/CYP76AD_rooted.phb") # adjust names tree.CYP$tip.label <- gsub("\\_.*", "", tree.CYP$tip.label) @@ -56,12 +56,12 @@ ggtree(tree.CYP, layout = "rectangular") + xlim_tree(0.4) # save tree -ggsave(filename = "plots/CYP76AD_rooted.png", width = 10) -ggsave(filename = "plots/CYP76AD_rooted.pdf", width = 10) +ggsave(filename = "runs/plots/CYP76AD_rooted.png", width = 10) +ggsave(filename = "runs/plots/CYP76AD_rooted.pdf", width = 10) # read in the DODA tree -tree.DODA <- read.tree(file = "data/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA_rooted.phb") +tree.DODA <- read.tree(file = "runs/annotation_analysis/betalains/phylogenetic_analysis/DODA/DODA_rooted.phb") # adjust names tree.DODA$tip.label <- gsub("\\_.*", "", tree.DODA$tip.label) @@ -80,13 +80,6 @@ ggtree(tree.DODA, layout = "rectangular") + geom_tiplab(align = T) + xlim_tree(0.37) -ggsave(filename = "plots/DODA_rooted.png", width = 10) -ggsave(filename = "plots/DODA_rooted.pdf", width = 10) +ggsave(filename = "runs/plots/DODA_rooted.png", width = 10) +ggsave(filename = "runs/plots/DODA_rooted.pdf", width = 10) ``` - - - - - - - diff --git a/workflows/annotation_analysis/circos_plotting.Rmd b/workflows/annotation_analysis/circos_plotting.Rmd index d52f50b9bedb962396ad23a018d227b030ade41b..139d65437037909574de680d089885517ee5aa86 100644 --- a/workflows/annotation_analysis/circos_plotting.Rmd +++ b/workflows/annotation_analysis/circos_plotting.Rmd @@ -50,10 +50,10 @@ Granges_from_gtf <- function(gtf){ seqnames = unique(chr), # all sequences should be on the same chromosome gene_strand = unique(strand)) # use the gene_structures object to create the genomic ranges object - gene_ranges <- GRanges(seqnames = gene_structures$seqnames, - ranges = IRanges(start=gene_structures$gene_start, + gene_ranges <- GRanges(seqnames = gene_structures$seqnames, + ranges = IRanges(start=gene_structures$gene_start, end=gene_structures$gene_end, - names = gene_structures$transcript_id), + names = gene_structures$transcript_id), strand = gene_structures$gene_strand) return(gene_ranges) } @@ -68,7 +68,7 @@ Load in data and perform necessary transformations. ```{r} # for the circlize package, bed-like dataframes are required # load in the indexed genome and create a bed-like format from the genome -genome.bed <- read.table("polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta.fai") +genome.bed <- read.table("runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta.fai") genome.bed <- genome.bed %>% head(16) %>% summarize(chr = V1, @@ -81,7 +81,7 @@ genome.bed$chr <- as.factor(genome.bed$chr) genome.bed$chr <- reorder(genome.bed$chr, order) # load in genome annotation -gene_annotation <- read.gtf("polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf") +gene_annotation <- read.gtf("runs/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf") gene_bed <- gene_annotation %>% filter(type=="CDS") %>% summarize(chr = chr, @@ -89,14 +89,14 @@ gene_bed <- gene_annotation %>% end = end) # load in repetitive element annotation -rep_annotation <- read.table("data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.out.gff") +rep_annotation <- read.table("runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.out.gff") rep_bed <- rep_annotation %>% summarize(chr = V1, start = V4, end = V5) # prepare MYB transcription factor and color pathway gene annotation in genome -#myb_genes <- read.csv("data/manual_sheets/MYB_with_subgroups.csv") +#myb_genes <- read.csv("runs/data/manual_sheets/MYB_with_subgroups.csv") #myb_annotation <- gene_annotation %>% # filter(transcript_id %in% myb_genes$query_name, # type == "transcript") %>% @@ -111,11 +111,11 @@ rep_bed <- rep_annotation %>% # color scheme for chromosomes: genome.bed$clr <- colorRampPalette(c("#FFFFFF", "#71196E"))(16) -png(filename = "plots/circos.png", width = 4800, height = 4800, res = 1200) # changed this line +png(filename = "runs/plots/circos.png", width = 4800, height = 4800, res = 1200) # changed this line circos.clear() circos.par("start.degree" = 90) -circos.genomicInitialize(data=genome.bed, - tickLabelsStartFromZero = F, +circos.genomicInitialize(data=genome.bed, + tickLabelsStartFromZero = F, axis.labels.cex = 0.3, labels.cex = 0.5) @@ -129,7 +129,7 @@ circos.genomicInitialize(data=genome.bed, #}, track.height = 0.10, bg.border = NA) # gene density -circos.genomicDensity(gene_bed, +circos.genomicDensity(gene_bed, track.height=0.15, window.size = 1000000, col="dodgerblue3") @@ -148,6 +148,3 @@ dev.off() ``` - - - diff --git a/workflows/annotation_analysis/readme.txt b/workflows/annotation_analysis/readme.txt index ab1f357eb170404b986c307f4da1e8098f65d0e4..33ca1cf7d559697701db7ec179f7077835878d6b 100644 --- a/workflows/annotation_analysis/readme.txt +++ b/workflows/annotation_analysis/readme.txt @@ -4,14 +4,14 @@ Identify betalain and flavonoid pathway genes, as well as MYB transcription fact ### Script order: -- code/annotation_analysis/circos_plotting.Rmd +- workflows/annotation_analysis/circos_plotting.Rmd Generate circos plot based on gene and repetitive element annotation -- code/annotation_analysis/betalain_and_flavonoid_identification.sh +- workflows/annotation_analysis/betalain_and_flavonoid_identification.sh Identify candidate betalain genes by BLAST, candidate flavonoid genes by KIPEs -- code/annotation_analysis/betalain_phylogenetic_analysis.Rmd +- workflows/annotation_analysis/betalain_phylogenetic_analysis.Rmd Phylogenetic analysis of betalain pathway genes -- code/annotation_analysis/R2R3_analysis_reannotation.Rmd +- workflows/annotation_analysis/R2R3_analysis_reannotation.Rmd Identify MYB transcription factor genes by HMMscan, analyse putative function by phylogenetic assessment diff --git a/workflows/braker2/braker2_prot.sh b/workflows/braker2/braker2_prot.sh index 1ee42414ffc235c337b7c0b6682ae97d880df99f..d4baf4bb60312127f89598f958125e4cc37d766f 100644 --- a/workflows/braker2/braker2_prot.sh +++ b/workflows/braker2/braker2_prot.sh @@ -34,15 +34,15 @@ conda activate /opt/rrzk/software/conda-envs/braker2 module load samtools/1.13 -mkdir -p data/braker2/polished_prot +mkdir -p runs/braker2/polished_prot # run braker in RNAseq and protein mode: braker.pl --AUGUSTUS_CONFIG_PATH=/home/twinkle1/tools/config/ \ --epmode \ - --prot_seq=data/braker2/input/protein_db.fasta \ - --genome=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \ + --prot_seq=runs/braker2/input/protein_db.fasta \ + --genome=runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \ --softmasking \ --species=polished_prot \ --cores=8 \ - --workingdir=data/braker2/polished_prot + --workingdir=runs/braker2/polished_prot diff --git a/workflows/braker2/braker2_prot_rna.sh b/workflows/braker2/braker2_prot_rna.sh index ca0de631d378fbf3d694addff56804f7ad35bec4..1f185921f832b24a544c866a5d2001d7de99189c 100644 --- a/workflows/braker2/braker2_prot_rna.sh +++ b/workflows/braker2/braker2_prot_rna.sh @@ -16,28 +16,28 @@ conda activate /opt/rrzk/software/conda-envs/braker2 module load samtools # it is recommended to merge the separate bam files beforehand, as specifying many files can cause issues with braker -samtools merge --threads 7 data/braker2/STAR_mappings/clouse_reads_merged.bam \ - data/braker2/STAR_mappings/SRR_0_Aligned.sortedByCoord.out.bam \ - data/braker2/STAR_mappings/SRR_1_Aligned.sortedByCoord.out.bam \ - data/braker2/STAR_mappings/SRR_2_Aligned.sortedByCoord.out.bam \ - data/braker2/STAR_mappings/SRR_3_Aligned.sortedByCoord.out.bam \ - data/braker2/STAR_mappings/SRR_4_Aligned.sortedByCoord.out.bam \ - data/braker2/STAR_mappings/SRR_5_Aligned.sortedByCoord.out.bam \ - data/braker2/STAR_mappings/SRR_6_Aligned.sortedByCoord.out.bam \ - data/braker2/STAR_mappings/SRR_7_Aligned.sortedByCoord.out.bam +samtools merge --threads 7 runs/braker2/STAR_mappings/clouse_reads_merged.bam \ + runs/braker2/STAR_mappings/SRR_0_Aligned.sortedByCoord.out.bam \ + runs/braker2/STAR_mappings/SRR_1_Aligned.sortedByCoord.out.bam \ + runs/braker2/STAR_mappings/SRR_2_Aligned.sortedByCoord.out.bam \ + runs/braker2/STAR_mappings/SRR_3_Aligned.sortedByCoord.out.bam \ + runs/braker2/STAR_mappings/SRR_4_Aligned.sortedByCoord.out.bam \ + runs/braker2/STAR_mappings/SRR_5_Aligned.sortedByCoord.out.bam \ + runs/braker2/STAR_mappings/SRR_6_Aligned.sortedByCoord.out.bam \ + runs/braker2/STAR_mappings/SRR_7_Aligned.sortedByCoord.out.bam # it is unnecessary, however, to filter the bam files beforehand # see (https://github.com/Gaius-Augustus/BRAKER/issues/241) # run braker in RNAseq and protein mode: -mkdir -p data/braker2/polished_prot_rna +mkdir -p runs/braker2/polished_prot_rna braker.pl --AUGUSTUS_CONFIG_PATH=/home/twinkle1/tools/config/ \ --etpmode \ - --prot_seq=data/braker2/input/protein_db.fasta \ - --genome=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \ + --prot_seq=runs/braker2/input/protein_db.fasta \ + --genome=runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \ --softmasking \ --species=polished_prot_rna \ - --bam=data/braker2/STAR_mappings/clouse_reads_merged.bam \ + --bam=runs/braker2/STAR_mappings/clouse_reads_merged.bam \ --cores=8 \ - --workingdir=data/braker2/polished_prot_rna + --workingdir=runs/braker2/polished_prot_rna diff --git a/workflows/braker2/index_STAR.sh b/workflows/braker2/index_STAR.sh index 6eb95f9002aa4084551caed185c57d0fd6b30603..72d3b17e4db980c39aa57635b4305ca70ffe1226 100644 --- a/workflows/braker2/index_STAR.sh +++ b/workflows/braker2/index_STAR.sh @@ -19,12 +19,12 @@ module load star/2.7.8a # as SJDB file, use the newly generated braker2 protein gtf file -mkdir -p data/braker2/STAR_index/ +mkdir -p runs/braker2/STAR_index/ STAR --runThreadN 8 \ --runMode genomeGenerate \ - --genomeDir data/braker2/STAR_index/ \ + --genomeDir runs/braker2/STAR_index/ \ --sjdbOverhang 89 \ --genomeSAindexNbases 13 \ - --genomeFastaFiles polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \ - --sjdbGTFfile data/braker2/prot_run/braker.gtf + --genomeFastaFiles runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \ + --sjdbGTFfile runs/braker2/prot_run/braker.gtf diff --git a/workflows/braker2/readme.txt b/workflows/braker2/readme.txt index 7dee01eb4eba54b1cab58a45205063e4ea31da60..2fee4f38871722543425324a67253d9758c2e7e4 100644 --- a/workflows/braker2/readme.txt +++ b/workflows/braker2/readme.txt @@ -1,17 +1,17 @@ ## Computational annotation Create computational genome annotation using BRAKER2. - + ### Script order: -- code/braker2/braker2_prot.sh +- workflows/braker2/braker2_prot.sh initial run of BRAKER2 using only the protein database as evidence -- code/braker2/index_STAR.sh +- workflows/braker2/index_STAR.sh index the polished reference genome for use with STAR -- code/braker2/run_STAR.sh +- workflows/braker2/run_STAR.sh map all short reads from Clouse et al. to the polished reference genome for use with BRAKER2 -- code/braker2/braker2_prot_rna.sh +- workflows/braker2/braker2_prot_rna.sh run BRAKER2 with the mapped RNA-seq reads as well as the protein database as input diff --git a/workflows/braker2/run_STAR.sh b/workflows/braker2/run_STAR.sh index c552c8f4089e6bc54b2299b31a8f1e7308b0f210..495a219bcf50afaa5f0a1da90873ba5ea4664b86 100644 --- a/workflows/braker2/run_STAR.sh +++ b/workflows/braker2/run_STAR.sh @@ -32,7 +32,7 @@ module load star/2.7.8a # create array of read fastq files (R1 only): -SOURCE_DIR=raw_data/Clouse_short_reads +SOURCE_DIR=studies/additional_data/Clouse_short_reads FILES=("$SOURCE_DIR"/SRR*_1.fastq) # only for testing puposes: @@ -40,11 +40,11 @@ FILES=("$SOURCE_DIR"/SRR*_1.fastq) #echo "${FILES[0]/_1.fastq.gz/_2.fastq.gz}" # run STAR after genome index creation -mkdir -p data/braker2/STAR_mappings/ +mkdir -p runs/braker2/STAR_mappings/ STAR --runThreadN 8 \ --runMode alignReads \ --outSAMtype BAM SortedByCoordinate \ - --genomeDir data/braker2/STAR_index \ - --outFileNamePrefix data/braker2/STAR_mappings/SRR_"${SLURM_ARRAY_TASK_ID}"_ \ + --genomeDir runs/braker2/STAR_index \ + --outFileNamePrefix runs/braker2/STAR_mappings/SRR_"${SLURM_ARRAY_TASK_ID}"_ \ --readFilesIn "${FILES["${SLURM_ARRAY_TASK_ID}"]}" "${FILES["${SLURM_ARRAY_TASK_ID}"]/_1.fastq/_2.fastq}" diff --git a/workflows/functional_annotation/analyse_functional_annotation.Rmd b/workflows/functional_annotation/analyse_functional_annotation.Rmd index 41c76f578a0b7c01a6d7474b50ef547192f0df52..f4fb8c7303a8be8f99dc974ab68159b917bd33e9 100644 --- a/workflows/functional_annotation/analyse_functional_annotation.Rmd +++ b/workflows/functional_annotation/analyse_functional_annotation.Rmd @@ -15,14 +15,14 @@ knitr::opts_knit$set(root.dir = "/home/tom/Documents/projects/Ahyp_v2_2_publicat Prepare input files: ```{bash} -mkdir data/functional_annotation/analysis +mkdir runs/functional_annotation/analysis # prepare interproscan annotations (Pfam, PANTHER and CDD) -awk '{if (($4 == "Pfam") || ($4 == "PANTHER") || ($4 == "CDD")) {print $1}}' data/functional_annotation/interproscan/Ahypochondriacus_2.2_polished_corrected.prot.fasta.tsv | sort | uniq > data/functional_annotation/analysis/interpro_genes.txt +awk '{if (($4 == "Pfam") || ($4 == "PANTHER") || ($4 == "CDD")) {print $1}}' runs/functional_annotation/interproscan/Ahypochondriacus_2.2_polished_corrected.prot.fasta.tsv | sort | uniq > runs/functional_annotation/analysis/interpro_genes.txt # prepare eggnog annotation -tail -n +6 data/functional_annotation/eggnog_mapper/MM_8wexw920.emapper.annotations.tsv | head -n -3 | awk '{print $1}' > data/functional_annotation/analysis/eggnog_genes.txt +tail -n +6 runs/functional_annotation/eggnog_mapper/MM_8wexw920.emapper.annotations.tsv | head -n -3 | awk '{print $1}' > runs/functional_annotation/analysis/eggnog_genes.txt # prepare mercator annotations -grep ">" data/functional_annotation/mercator_v4/Ahyp2.fa | grep -v "not classified" | sed 's/ .*//' | sed 's/>//' > data/functional_annotation/analysis/mercator_genes.txt +grep ">" runs/functional_annotation/mercator_v4/Ahyp2.fa | grep -v "not classified" | sed 's/ .*//' | sed 's/>//' > runs/functional_annotation/analysis/mercator_genes.txt ``` Load in gene names and analyse overlap: @@ -40,11 +40,11 @@ annotation_reader <- function(file, source){ } # load in genes -ips_genes <- annotation_reader(file = "data/functional_annotation/analysis/interpro_genes.txt", +ips_genes <- annotation_reader(file = "runs/functional_annotation/analysis/interpro_genes.txt", source = "Interproscan") -eggnog_genes <- annotation_reader(file = "data/functional_annotation/analysis/eggnog_genes.txt", +eggnog_genes <- annotation_reader(file = "runs/functional_annotation/analysis/eggnog_genes.txt", source = "eggNOG") -mercator_genes <- annotation_reader(file = "data/functional_annotation/analysis/mercator_genes.txt", +mercator_genes <- annotation_reader(file = "runs/functional_annotation/analysis/mercator_genes.txt", source = "Mercator") # create input list @@ -60,7 +60,7 @@ p1 <- ggVennDiagram(gene_list, labs(fill = "Number of genes") p1 -ggsave(filename = "plots/functional_annotation_venn.png", +ggsave(filename = "runs/plots/functional_annotation_venn.png", height = 5, width = 7, bg = "white", @@ -72,7 +72,7 @@ getVennOverlap <- function(lsvenn = list(A = sort(sample(LETTERS, 15)), C = sort(sample(LETTERS, 15)), D = sort(sample(LETTERS, 15))) ) { - + ItemsList <- gplots::venn(lsvenn, show.plot = FALSE) print(lengths(attributes(ItemsList)$intersections)) #return(attributes(ItemsList)$intersections) @@ -80,4 +80,3 @@ getVennOverlap <- function(lsvenn = list(A = sort(sample(LETTERS, 15)), sum(getVennOverlap(lsvenn = gene_list)) ``` - diff --git a/workflows/functional_annotation/readme.txt b/workflows/functional_annotation/readme.txt index 2c9dbeb6d6020a54eab763608a3bcda0c6d39cd2..9a494233bfc4f967c32470a3932b138f1e628494 100644 --- a/workflows/functional_annotation/readme.txt +++ b/workflows/functional_annotation/readme.txt @@ -4,10 +4,10 @@ Functional annotation and analysis ### Script order: -- code/functional_annotation/run_interproscan.sh +- workflows/functional_annotation/run_interproscan.sh Run Interproscan on protein sequences for functional annotation Functional annotation using eggNOG-mapper and Mercator was done using online submission and not run locally. --code/functional_annotation/analyse_functional_annotation.Rmd +-workflows/functional_annotation/analyse_functional_annotation.Rmd Analyse the number of annotated genes etc. for all three functional annotation programs diff --git a/workflows/functional_annotation/run_interproscan.sh b/workflows/functional_annotation/run_interproscan.sh index 1c858364662d21143f32ff28b467a65736d15590..730e669a82c498f8a8ac5fb2a867000d892a7cd9 100644 --- a/workflows/functional_annotation/run_interproscan.sh +++ b/workflows/functional_annotation/run_interproscan.sh @@ -23,9 +23,9 @@ module load openjdk/11.0.2 # interpro directory with executable shell script INTERPRODIR=/scratch/twinkle1/interproscan/interproscan-5.56-89.0/ # output directory for the annotation run -INTERPROOUT=data/functional_annotation/interproscan/ +INTERPROOUT=runs/functional_annotation/interproscan/ # input amino acid fasta file -INTERPROIN=polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta +INTERPROIN=runs/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta # main mkdir "$INTERPROOUT" diff --git a/workflows/gene_expression_quantification/isoseq_expression_support.Rmd b/workflows/gene_expression_quantification/isoseq_expression_support.Rmd index 955383247301777b226847cfdeb7c7c446a93efd..3a85cb003f747fb4852125b5bac468672ed3e386 100644 --- a/workflows/gene_expression_quantification/isoseq_expression_support.Rmd +++ b/workflows/gene_expression_quantification/isoseq_expression_support.Rmd @@ -46,12 +46,12 @@ read.gtf <- function(file){ Bash script to run gffcompare for the isoseq to reference annotation comparison. ```{bash} -mkdir data/gene_expression_quantification/isoseq/ +mkdir runs/gene_expression_quantification/isoseq/ # run gffcompare -/home/tom/Documents/tools/gffcompare/gffcompare -r data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.gtf polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf +/home/tom/Documents/tools/gffcompare/gffcompare -r runs/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.gtf runs/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.gtf # move to output directory -mv gffcmp.* data/gene_expression_quantification/isoseq/ +mv gffcmp.* runs/gene_expression_quantification/isoseq/ ``` @@ -60,7 +60,7 @@ It might be easier to use the output of gffcompare instead of sqanti: ```{r} # read in tracking file -gff_tracking <- read.table(file = "data/gene_expression_quantification/isoseq/gffcmp.tracking") +gff_tracking <- read.table(file = "runs/gene_expression_quantification/isoseq/gffcmp.tracking") gff_tracking$V4 <- as.factor(gff_tracking$V4) colnames(gff_tracking) <- c("locus", "xlocus", "isoseq_transcript", "code", "reference_transcript") @@ -81,5 +81,3 @@ gff_tracking$reference_transcript_short <- matrix(unlist(strsplit(x = gff_tracki # all transcripts there? length(unique(gff_tracking$reference_transcript_short)) #28074 ``` - - diff --git a/workflows/gene_expression_quantification/plot_expression_quantification_kallisto.Rmd b/workflows/gene_expression_quantification/plot_expression_quantification_kallisto.Rmd index a384ececc24257d47eef82f4c64b3a84574c02a6..a24d15ff4a717ae3ad6efe3bcb4fa0e462a6880a 100644 --- a/workflows/gene_expression_quantification/plot_expression_quantification_kallisto.Rmd +++ b/workflows/gene_expression_quantification/plot_expression_quantification_kallisto.Rmd @@ -20,7 +20,7 @@ After gene expression quantification using kallisto (of fastq files against the ##### Setup, loading of data, preprocessing # vector of input directories -indirs <- list.dirs(path = "data/gene_expression_quantification/kallisto_quant") +indirs <- list.dirs(path = "runs/gene_expression_quantification/kallisto_quant") indirs <- indirs[-1] # remove base directory indirs <- indirs[grep("index", indirs, invert = T)] # remove index directory @@ -40,7 +40,7 @@ tpm.df <- data.frame(abundances[[1]]$target_id, abundances[[6]]$tpm, abundances[[7]]$tpm, abundances[[8]]$tpm) -colnames(tpm.df) <- c("GeneID", gsub("data/gene_expression_quantification/kallisto_quant/", "", indirs)) +colnames(tpm.df) <- c("GeneID", gsub("runs/gene_expression_quantification/kallisto_quant/", "", indirs)) colnames(tpm.df) <- c("GeneID", "Cotyledones", "Flower", "Leaf", "Mature seed", "Root", "Stem", "Water-stressed", "Developing seed") # melt dataframe for plotting into long format: @@ -48,16 +48,16 @@ melted.df <- melt(tpm.df, id.vars = "GeneID") colnames(melted.df) <- c("GeneID", "tissue", "tpm") write_csv(tpm.df, - file = "data/gene_expression_quantification/kallisto_quant/all_tissue_expression.csv") + file = "runs/gene_expression_quantification/kallisto_quant/all_tissue_expression.csv") write_csv(melted.df, - file = "data/gene_expression_quantification/kallisto_quant/all_tissue_expression_long.csv") + file = "runs/gene_expression_quantification/kallisto_quant/all_tissue_expression_long.csv") ``` Create dataframe of the betalain genes and the myb genes which can be later used for plotting the gene expression levels for the respective genes. ```{r} # load object with names of all betalain and flavonoid genes -pathway_genes <- read.csv(file = "data/manual_sheets/color_pathway_genes.csv", header=T) +pathway_genes <- read.csv(file = "runs/manual_sheets/color_pathway_genes.csv", header=T) betalain.genes <- pathway_genes %>% filter(Pathway == "Betalain") @@ -66,8 +66,8 @@ flavonoid.genes <- pathway_genes %>% filter(Pathway == "Flavonoid") # read in the myb genes and subset the betalain mybs -myb_genes <- read.csv(file = "data/manual_sheets/MYB_with_subgroups.csv", header=T) -#myb_genes <- read_csv(file = "data/annotation_analysis/myb_annotation/myb_stats.csv") +myb_genes <- read.csv(file = "runs/manual_sheets/MYB_with_subgroups.csv", header=T) +#myb_genes <- read_csv(file = "runs/annotation_analysis/myb_annotation/myb_stats.csv") colnames(myb_genes) <- c("Transcript_id", "Freq", "Gene_id", "Subgroup") ``` @@ -81,7 +81,7 @@ betalain.df <- melted.df %>% mutate(joining = gsub("\\..*", "", GeneID)) # join to obtain pathway gene names -betalain_plotting <- left_join(x = betalain.df, y = betalain.genes, by = c("joining" = "Gene_id")) +betalain_plotting <- left_join(x = betalain.df, y = betalain.genes, by = c("joining" = "Gene_id")) betalain_plotting <- betalain_plotting %>% select(GeneID, tissue, tpm, Gene) %>% mutate(label = paste0(Gene, " (", GeneID, ")")) @@ -122,7 +122,7 @@ ggplot(data = betalain_plotting) + axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) -ggsave(filename = "plots/gene_expression_quantification/betalain_expression.png", width = 10, height = 7, +ggsave(filename = "runs/plots/gene_expression_quantification/betalain_expression.png", width = 10, height = 7, dpi = 400) @@ -150,10 +150,10 @@ label[93] <- "AhMYB2.1 (BvMYB1-like)" label[94] <- "AhMYB2.2 (BvMYB1-like)" # create matrices for the pheatmap function -myb.mat_label <- matrix(data = round(myb.df$tpm, digits = 2), +myb.mat_label <- matrix(data = round(myb.df$tpm, digits = 2), ncol = 8, byrow = F) -myb.mat <- matrix(data = log10(myb.df$tpm+1), +myb.mat <- matrix(data = log10(myb.df$tpm+1), ncol = 8, byrow = F) rownames(myb.mat) <- label @@ -180,7 +180,7 @@ pheatmap(mat = myb.mat, cluster_rows = T, cluster_cols = F, angle_col = 45, - filename = "plots/gene_expression_quantification/myb_expression.png", + filename = "runs/plots/gene_expression_quantification/myb_expression.png", width = 10, height = 15, display_numbers = myb.mat_label, @@ -202,7 +202,7 @@ flavonoid.df <- melted.df %>% mutate(joining = gsub("\\..*", "", GeneID)) # join to obtain pathway gene names -flavonoid_plotting <- left_join(x = flavonoid.df, y = flavonoid.genes, by = c("joining" = "Gene_id")) +flavonoid_plotting <- left_join(x = flavonoid.df, y = flavonoid.genes, by = c("joining" = "Gene_id")) flavonoid_plotting <- flavonoid_plotting %>% select(GeneID, tissue, tpm, Gene) %>% mutate(label = paste0(Gene, " (", GeneID, ")")) @@ -257,15 +257,6 @@ ggplot(data = flavonoid_plotting) + axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) -ggsave(filename = "plots/gene_expression_quantification/flavonoid_pathway_expression.png", width = 10, height = 10) -#ggsave(filename = "plots/gene_expression_quantification/flavonoid_pathway_expression.png", width = 10, height = 12) +ggsave(filename = "runs/plots/gene_expression_quantification/flavonoid_pathway_expression.png", width = 10, height = 10) +#ggsave(filename = "runs/plots/gene_expression_quantification/flavonoid_pathway_expression.png", width = 10, height = 12) ``` - - - - - - - - - diff --git a/workflows/gene_expression_quantification/readme.txt b/workflows/gene_expression_quantification/readme.txt index 6ecc87b7a9c252b71236101605bd45344dd2c58c..fbe97a73684b28b5d4ba01492cd5164da2f59e7d 100644 --- a/workflows/gene_expression_quantification/readme.txt +++ b/workflows/gene_expression_quantification/readme.txt @@ -3,11 +3,11 @@ Quantify gene expression levels using short-read RNA-seq data from different tissues and the long-read sequencing data. ### Script order: -- code/gene_expression_quantification/run_kallisto.sh +- workflows/gene_expression_quantification/run_kallisto.sh Run Kallisto to quantify gene expression using short-read RNA-seq data -- code/gene_expression_quantification/plot_expression_quantification_kallisto.Rmd +- workflows/gene_expression_quantification/plot_expression_quantification_kallisto.Rmd Generate plots from assessed gene expression levels -- code/gene_expression_quantification/isoseq_expression_support.Rmd +- workflows/gene_expression_quantification/isoseq_expression_support.Rmd Assess support of Iso-Seq transcripts for annotated genes diff --git a/workflows/gene_expression_quantification/run_kallisto.sh b/workflows/gene_expression_quantification/run_kallisto.sh index dd5393b365190368a9e04e374ea28bfb1e07440b..f853f9095d819e066827b48cf9855f96154255f4 100644 --- a/workflows/gene_expression_quantification/run_kallisto.sh +++ b/workflows/gene_expression_quantification/run_kallisto.sh @@ -15,7 +15,7 @@ # index the transcriptome -KALINDEX=data/gene_expression_quantification/kallisto_quant/index +KALINDEX=runs/gene_expression_quantification/kallisto_quant/index mkdir -p $KALINDEX /home/tom/Documents/tools/kallisto/kallisto index -i "$KALINDEX"/index polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.cds.fasta @@ -23,9 +23,9 @@ mkdir -p $KALINDEX ##### Perform quantification # create array of read fastq files (R1 only): -SOURCE_DIR=raw_data/Clouse_short_reads/ +SOURCE_DIR=studies/additional_data/Clouse_short_reads/ FILES=("$SOURCE_DIR"SRR*_1.fastq.gz) -OUTDIR=data/gene_expression_quantification/kallisto_quant/ +OUTDIR=runs/gene_expression_quantification/kallisto_quant/ TISSUE_NAMES=("young_seed" "stem" "flower" "leaf" "root" "water_stressed" "cotyledones" "mature_seed") # kallisto after indexing diff --git a/workflows/genome_polishing/readme.txt b/workflows/genome_polishing/readme.txt index e5aef947118edbe27449e45a5dd00f4313ba79fa..8b1a98c092675a41e418bf82a869d4f2b73724ca 100644 --- a/workflows/genome_polishing/readme.txt +++ b/workflows/genome_polishing/readme.txt @@ -4,16 +4,16 @@ Polish the previously published reference genome of A. hypochondriacus. ### Script order: -- code/genome_polishing/remove_ambiguous_bases.sh +- workflows/genome_polishing/remove_ambiguous_bases.sh prepare the reference genome v2.1 for genome polishing by removing all ambiguous bases -- code/genome_polishing/unpackSRA.sh +- workflows/genome_polishing/unpackSRA.sh unpack the WGS short read SRA file from the Lightfoot genome assembly -- code/genome_polishing/run_nextpolish.sh +- workflows/genome_polishing/run_nextpolish.sh repare input files and run NextPolish to polish the reference assembly. -- code/genome_polishing/process_nextpolish_output.sh +- workflows/genome_polishing/process_nextpolish_output.sh return all ambigious bases into the polished reference genome and restore the same chromosome order diff --git a/workflows/isoseq_assembly/combined_isoseq3_pipeline.sh b/workflows/isoseq_assembly/combined_isoseq3_pipeline.sh index e64fc14a3873024264c134d8312959fe7b36e018..370cfb91da27f51bef47ae0af13bcb05188ee36a 100644 --- a/workflows/isoseq_assembly/combined_isoseq3_pipeline.sh +++ b/workflows/isoseq_assembly/combined_isoseq3_pipeline.sh @@ -14,7 +14,7 @@ conda activate isoseq3 module load samtools/1.13 -ISOSEQOUT=raw_data/isoseq_raw_reads/processed/ +ISOSEQOUT=assays/isoform_sequencing/dataset/processed/ mkdir -p $ISOSEQOUT # flnc reads were created using the following command, removing artificial concatemers and filtering out all genes without Poly-A tail diff --git a/workflows/isoseq_assembly/combined_mapping_and_collapse.sh b/workflows/isoseq_assembly/combined_mapping_and_collapse.sh index 0ad606a18e7e727dd68b0c006eb8215d428df667..ac1100a7a24a1a1d330bdc6ed48ea77c8c45a4cc 100644 --- a/workflows/isoseq_assembly/combined_mapping_and_collapse.sh +++ b/workflows/isoseq_assembly/combined_mapping_and_collapse.sh @@ -13,16 +13,16 @@ source $CONDA_PREFIX/etc/profile.d/conda.sh conda activate isoseq # create output directory -MAPPINGOUT=data/isoseq/mapping_and_collapse/ +MAPPINGOUT=runs/isoseq/mapping_and_collapse/ mkdir -p "$MAPPINGOUT" ### MAPPING -REFERENCE=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta +REFERENCE=runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta # Align sequences to reference genome -INPUT=raw_data/isoseq_raw_reads/processed/combined.merged_clustered.hq.fasta +INPUT=assays/isoform_sequencing/dataset/processed/combined.merged_clustered.hq.fasta OUTPUTMM2="$MAPPINGOUT"combined_aln.sam minimap2 -t 8 -ax splice:hq -uf --secondary=no -a $REFERENCE $INPUT -o $OUTPUTMM2 @@ -55,7 +55,7 @@ collapse_isoforms_by_sam.py --input $INPUT -s $INPUTSORTED -o $OUTPUTCOLLAPSE -c # First obtain associated count information INPUTABUNDANCE="$MAPPINGOUT"combined.collapsed -CLUSTERREPORT=raw_data/isoseq_raw_reads/processed/combined.merged_clustered.cluster_report.csv +CLUSTERREPORT=assays/isoform_sequencing/dataset/processed/combined.merged_clustered.cluster_report.csv get_abundance_post_collapse.py $INPUTABUNDANCE $CLUSTERREPORT diff --git a/workflows/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh b/workflows/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh index 64563ac26068b3ef6cbc6fccb7e1c6a480d2f52f..8faabb26d5660bef3fa9d2f8c81d861a795f914d 100644 --- a/workflows/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh +++ b/workflows/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh @@ -13,16 +13,16 @@ source $CONDA_PREFIX/etc/profile.d/conda.sh conda activate isoseq # create output directory -MAPPINGOUT=data/isoseq/mapping_and_collapse_old_genome/ +MAPPINGOUT=runs/isoseq/mapping_and_collapse_old_genome/ mkdir -p "$MAPPINGOUT" ### MAPPING # map reads onto the old reference genome -REFERENCE=/home/tom/Documents/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa +REFERENCE=studies/additional_data/resources/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa # Align sequences to reference genome -INPUT=raw_data/isoseq_raw_reads/processed/combined.merged_clustered.hq.fasta +INPUT=assays/isoform_sequencing/dataset/processed/combined.merged_clustered.hq.fasta OUTPUTMM2="$MAPPINGOUT"combined_aln.sam minimap2 -t 8 -ax splice:hq -uf --secondary=no -a $REFERENCE $INPUT -o $OUTPUTMM2 @@ -56,7 +56,7 @@ collapse_isoforms_by_sam.py --input $INPUT -s $INPUTSORTED -o $OUTPUTCOLLAPSE -c # First obtain associated count information INPUTABUNDANCE="$MAPPINGOUT"combined.collapsed -CLUSTERREPORT=raw_data/isoseq_raw_reads/processed/combined.merged_clustered.cluster_report.csv +CLUSTERREPORT=assays/isoform_sequencing/dataset/processed/combined.merged_clustered.cluster_report.csv get_abundance_post_collapse.py $INPUTABUNDANCE $CLUSTERREPORT diff --git a/workflows/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd b/workflows/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd index 8dcddd830ecbe4d1e6d346c10c244107aa05690d..e35684ba29d63e83ca47fcc119a18ee7cb8f9d8d 100644 --- a/workflows/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd +++ b/workflows/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd @@ -21,17 +21,17 @@ Compare the effect of genome polishing on the coding sequence prediction by CPC2 Run CPC2 CDS prediction on both sequence sets: ```{bash} -mkdir data/isoseq/comparison_genome_polishing +mkdir runs/isoseq/comparison_genome_polishing # unpolished genome -/hom/tom/Documents/tools/CPC2_standalone-1.0.1/bin/CPC2.py -i data/isoseq/sqanti/output_old_genome/combined.collapsed.min_fl_2.filtered.underscore_corrected.fasta --ORF -o data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2 +/hom/tom/Documents/tools/CPC2_standalone-1.0.1/bin/CPC2.py -i runs/isoseq/sqanti/output_old_genome/combined.collapsed.min_fl_2.filtered.underscore_corrected.fasta --ORF -o runs/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2 -/hom/tom/Documents/tools/CPC2_standalone-1.0.1/bin/CPC2.py -i data/isoseq/mapping_and_collapse_old_genome/combined.collapsed.min_fl_2.filtered.rep.fa --ORF -o data/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2 +/hom/tom/Documents/tools/CPC2_standalone-1.0.1/bin/CPC2.py -i runs/isoseq/mapping_and_collapse_old_genome/combined.collapsed.min_fl_2.filtered.rep.fa --ORF -o runs/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2 # polished genome -/hom/tom/Documents/tools/CPC2_standalone-1.0.1/bin/CPC2.py -i data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.fasta --ORF -o data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2 +/hom/tom/Documents/tools/CPC2_standalone-1.0.1/bin/CPC2.py -i runs/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.fasta --ORF -o runs/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2 -/hom/tom/Documents/tools/CPC2_standalone-1.0.1/bin/CPC2.py -i data/isoseq/mapping_and_collapse/combined.collapsed.min_fl_2.filtered.rep.fa --ORF -o data/isoseq/comparison_genome_polishing/polished_isoseq_cpc2 +/hom/tom/Documents/tools/CPC2_standalone-1.0.1/bin/CPC2.py -i runs/isoseq/mapping_and_collapse/combined.collapsed.min_fl_2.filtered.rep.fa --ORF -o runs/isoseq/comparison_genome_polishing/polished_isoseq_cpc2 ``` @@ -39,8 +39,8 @@ Compare the CDS prediction results from the two files: ```{r} # load in cpc2 files for the unpolished genome -unpolished_cpc2 <- read.table("data/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2.txt") -unpolished_sqanti_cpc2 <- read.table("data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2.txt") +unpolished_cpc2 <- read.table("runs/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2.txt") +unpolished_sqanti_cpc2 <- read.table("runs/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2.txt") # prepare for merge unpolished_cpc2 <- unpolished_cpc2 %>% @@ -83,8 +83,8 @@ table(interaction(as.factor(merged_unpolished$label), as.factor(merged_unpolishe ############# comparison with polished genome -polished_cpc2 <- read.table("data/isoseq/comparison_genome_polishing/polished_isoseq_cpc2.txt") -polished_sqanti_cpc2 <- read.table("data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2.txt") +polished_cpc2 <- read.table("runs/isoseq/comparison_genome_polishing/polished_isoseq_cpc2.txt") +polished_sqanti_cpc2 <- read.table("runs/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2.txt") # prepare for merge polished_cpc2 <- polished_cpc2 %>% @@ -140,77 +140,68 @@ create_bed_from_cpc2 <- function(cpc2_output){ # write cpc2 output as bed file ### isoseq unpolished -isoseq_unpolished_bed <- create_bed_from_cpc2("data/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2.txt") -write_tsv(isoseq_unpolished_bed, - file = "data/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2.bed", +isoseq_unpolished_bed <- create_bed_from_cpc2("runs/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2.txt") +write_tsv(isoseq_unpolished_bed, + file = "runs/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2.bed", col_names = F) ### isoseq unpolished -isoseq_unpolished_sqanti_bed <- create_bed_from_cpc2("data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2.txt") -write_tsv(isoseq_unpolished_sqanti_bed, - file = "data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2.bed", +isoseq_unpolished_sqanti_bed <- create_bed_from_cpc2("runs/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2.txt") +write_tsv(isoseq_unpolished_sqanti_bed, + file = "runs/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2.bed", col_names = F) ### isoseq polished -isoseq_polished_bed <- create_bed_from_cpc2("data/isoseq/comparison_genome_polishing/polished_isoseq_cpc2.txt") -write_tsv(isoseq_polished_bed, - file = "data/isoseq/comparison_genome_polishing/polished_isoseq_cpc2.bed", +isoseq_polished_bed <- create_bed_from_cpc2("runs/isoseq/comparison_genome_polishing/polished_isoseq_cpc2.txt") +write_tsv(isoseq_polished_bed, + file = "runs/isoseq/comparison_genome_polishing/polished_isoseq_cpc2.bed", col_names = F) ### isoseq sqanti polished -isoseq_polished_sqanti_bed <- create_bed_from_cpc2("data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2.txt") -write_tsv(isoseq_polished_sqanti_bed, - file = "data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2.bed", +isoseq_polished_sqanti_bed <- create_bed_from_cpc2("runs/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2.txt") +write_tsv(isoseq_polished_sqanti_bed, + file = "runs/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2.bed", col_names = F) ``` Extract predicted protein sequence and run busco: ```{bash} -mkdir -p data/annotation_analysis/busco +mkdir -p runs/annotation_analysis/busco ### ISOSEQ UNPOLISHED # Extract the coding sequence from the fasta file -tools/bedtools getfasta -fi data/isoseq/mapping_and_collapse_old_genome/combined.collapsed.min_fl_2.filtered.rep.fa -fo data/isoseq/comparison_genome_polishing/unpolished_isoseq_cds.fasta -bed data/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2.bed +tools/bedtools getfasta -fi runs/isoseq/mapping_and_collapse_old_genome/combined.collapsed.min_fl_2.filtered.rep.fa -fo runs/isoseq/comparison_genome_polishing/unpolished_isoseq_cds.fasta -bed runs/isoseq/comparison_genome_polishing/unpolished_isoseq_cpc2.bed # translate into protein sequence -seqkit translate data/isoseq/comparison_genome_polishing/unpolished_isoseq_cds.fasta > data/isoseq/comparison_genome_polishing/unpolished_isoseq_cds.faa +seqkit translate runs/isoseq/comparison_genome_polishing/unpolished_isoseq_cds.fasta > runs/isoseq/comparison_genome_polishing/unpolished_isoseq_cds.faa # to prepare for busco, trim the fasta header -sed 's/|.*//' data/isoseq/comparison_genome_polishing/unpolished_isoseq_cds.faa > data/isoseq/comparison_genome_polishing/unpolished_isoseq_cds_fixed.faa +sed 's/|.*//' runs/isoseq/comparison_genome_polishing/unpolished_isoseq_cds.faa > runs/isoseq/comparison_genome_polishing/unpolished_isoseq_cds_fixed.faa # run busco -busco -m protein -i data/isoseq/comparison_genome_polishing/unpolished_isoseq_cds_fixed.faa -o unpolished_isoseq_cds -l embryophyta_odb10 --out_path data/annotation_analysis/busco/ --download_path data/busco/datasets/ -c 7 -f +busco -m protein -i runs/isoseq/comparison_genome_polishing/unpolished_isoseq_cds_fixed.faa -o unpolished_isoseq_cds -l embryophyta_odb10 --out_path runs/annotation_analysis/busco/ --download_path runs/busco/datasets/ -c 7 -f ### ISOSEQ SQANTI UNPOLISHED # Extract the coding sequence from the fasta file -tools/bedtools getfasta -fi data/isoseq/sqanti/output_old_genome/combined.collapsed.min_fl_2.filtered.underscore_corrected.fasta -fo data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cds.fasta -bed data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2.bed +tools/bedtools getfasta -fi runs/isoseq/sqanti/output_old_genome/combined.collapsed.min_fl_2.filtered.underscore_corrected.fasta -fo runs/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cds.fasta -bed runs/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cpc2.bed # translate into protein sequence -seqkit translate data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cds.fasta > data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cds.faa +seqkit translate runs/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cds.fasta > runs/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cds.faa # run busco -busco -m protein -i data/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cds.faa -o unpolished_isoseq_sqanti_cds -l embryophyta_odb10 --out_path data/annotation_analysis/busco/ --download_path data/busco/datasets/ -c 7 -f +busco -m protein -i runs/isoseq/comparison_genome_polishing/unpolished_isoseq_sqanti_cds.faa -o unpolished_isoseq_sqanti_cds -l embryophyta_odb10 --out_path runs/annotation_analysis/busco/ --download_path runs/busco/datasets/ -c 7 -f ### ISOSEQ POLISHED # Extract the coding sequence from the fasta file -tools/bedtools getfasta -fi data/isoseq/mapping_and_collapse/combined.collapsed.min_fl_2.filtered.rep.fa -fo data/isoseq/comparison_genome_polishing/polished_isoseq_cds.fasta -bed data/isoseq/comparison_genome_polishing/polished_isoseq_cpc2.bed +tools/bedtools getfasta -fi runs/isoseq/mapping_and_collapse/combined.collapsed.min_fl_2.filtered.rep.fa -fo runs/isoseq/comparison_genome_polishing/polished_isoseq_cds.fasta -bed runs/isoseq/comparison_genome_polishing/polished_isoseq_cpc2.bed # translate into protein sequence -seqkit translate data/isoseq/comparison_genome_polishing/polished_isoseq_cds.fasta > data/isoseq/comparison_genome_polishing/polished_isoseq_cds.faa +seqkit translate runs/isoseq/comparison_genome_polishing/polished_isoseq_cds.fasta > runs/isoseq/comparison_genome_polishing/polished_isoseq_cds.faa # to prepare for busco, trim the fasta header -sed 's/|.*//' data/isoseq/comparison_genome_polishing/polished_isoseq_cds.faa > data/isoseq/comparison_genome_polishing/polished_isoseq_cds_fixed.faa +sed 's/|.*//' runs/isoseq/comparison_genome_polishing/polished_isoseq_cds.faa > runs/isoseq/comparison_genome_polishing/polished_isoseq_cds_fixed.faa # run busco -busco -m protein -i data/isoseq/comparison_genome_polishing/polished_isoseq_cds_fixed.faa -o polished_isoseq_cds -l embryophyta_odb10 --out_path data/annotation_analysis/busco/ --download_path data/busco/datasets/ -c 7 -f +busco -m protein -i runs/isoseq/comparison_genome_polishing/polished_isoseq_cds_fixed.faa -o polished_isoseq_cds -l embryophyta_odb10 --out_path runs/annotation_analysis/busco/ --download_path runs/busco/datasets/ -c 7 -f ### ISOSEQ SQANTI POLISHED # Extract the coding sequence from the fasta file -tools/bedtools getfasta -fi data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.fasta -fo data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cds.fasta -bed data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2.bed +tools/bedtools getfasta -fi runs/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.fasta -fo runs/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cds.fasta -bed runs/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cpc2.bed # translate into protein sequence -seqkit translate data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cds.fasta > data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cds.faa +seqkit translate runs/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cds.fasta > runs/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cds.faa # run busco -busco -m protein -i data/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cds.faa -o polished_isoseq_sqanti_cds -l embryophyta_odb10 --out_path data/annotation_analysis/busco/ --download_path data/busco/datasets/ -c 7 -f +busco -m protein -i runs/isoseq/comparison_genome_polishing/polished_isoseq_sqanti_cds.faa -o polished_isoseq_sqanti_cds -l embryophyta_odb10 --out_path runs/annotation_analysis/busco/ --download_path runs/busco/datasets/ -c 7 -f ``` - - - - - - - - - diff --git a/workflows/isoseq_assembly/isoseq3_pipeline.sh b/workflows/isoseq_assembly/isoseq3_pipeline.sh index e7cebe34829325bcb5bb65da9cd4cb7d0b66fd53..13ebc3b8fb6161db527bcdfd2c9096a03906e19b 100644 --- a/workflows/isoseq_assembly/isoseq3_pipeline.sh +++ b/workflows/isoseq_assembly/isoseq3_pipeline.sh @@ -20,13 +20,13 @@ echo "$SLURM_ARRAY_TASK_ID" # removal of primers and barcodes already done # step 4, removal of polyA tail and artificial concatemers -PROCESSING=raw_data/isoseq_raw_reads/processed/ +PROCESSING=assays/isoform_sequencing/dataset/isoseq_raw_reads/processed/ mkdir -p "PROCESSING" -SOURCE_DIR=raw_data/isoseq_raw_reads/reads +SOURCE_DIR=assays/isoform_sequencing/dataset/reads FILES=("$SOURCE_DIR"/*bam) -PRIMER_SOURCE=raw_data/isoseq_raw_reads/primers +PRIMER_SOURCE=assays/isoform_sequencing/dataset/primers PRIMER_FILES=("$PRIMER_SOURCE"/*fasta) REFINEIN="$SOURCE_DIR"/"${FILES["${SLURM_ARRAY_TASK_ID}"]}" @@ -46,7 +46,7 @@ samtools merge "$ISOSEQOUT"combined.merged_flnc.bam /home/twinkle1/isoseq_raw_re # clustering of identical reads -IN="data/isoseq3_pipeline/combined.merged_flnc.bam" -OUT="data/isoseq3_pipeline/combined.merged_clustered.bam" +IN="runs/isoseq3_pipeline/combined.merged_flnc.bam" +OUT="runs/isoseq3_pipeline/combined.merged_clustered.bam" isoseq3 cluster "$IN" "$OUT" --verbose --use-qvs diff --git a/workflows/isoseq_assembly/readme.txt b/workflows/isoseq_assembly/readme.txt index 85aefb6c87ae724447cee891f106d4f45d99e4d7..9a10ba652a4092dd60da5d7a56e150a08a6cf729 100644 --- a/workflows/isoseq_assembly/readme.txt +++ b/workflows/isoseq_assembly/readme.txt @@ -4,20 +4,20 @@ Assembly full-length transcript sequencing data. ### Script order: -- code/isoseq_assembly/isoseq3_pipeline.sh +- workflows/isoseq_assembly/isoseq3_pipeline.sh assemble FLNC reads from CCS files -- code/isoseq_assembly/combined_isoseq3_pipeline.sh +- workflows/isoseq_assembly/combined_isoseq3_pipeline.sh combine FLNC reads and cluster identical reads -- code/isoseq_assembly/combined_mapping_and_collapse.sh +- workflows/isoseq_assembly/combined_mapping_and_collapse.sh collapse clustered reads into unique full-length transcripts using the polished reference genome -- code/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh +- workflows/isoseq_assembly/combined_mapping_and_collapse_old_genome.sh collapse clustered reads into unique full-length transcripts using the unpolished reference genome -- code/isoseq_assembly/run_sqanti.sh +- workflows/isoseq_assembly/run_sqanti.sh run SQANTI in order to correct possible sequencing errors in the full-length transcripts using both reference genomes -- code/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd +- workflows/isoseq_assembly/comparison_isoseq_polishing_effectiveness.Rmd compare effect of genome polishing on error correction of full-length transcript sequences diff --git a/workflows/isoseq_assembly/run_sqanti.sh b/workflows/isoseq_assembly/run_sqanti.sh index 8b29756f693351f7bd9864b15c7cd2923f57ed3f..b14191664a830eba59fa9037221267c667015e4c 100644 --- a/workflows/isoseq_assembly/run_sqanti.sh +++ b/workflows/isoseq_assembly/run_sqanti.sh @@ -3,20 +3,20 @@ # run sqanti correction and filtering # preliminary genome annotation is used to compare against, but this only effects the pdf output -mkdir -p data/isoseq/sqanti/output_polished +mkdir -p runs/isoseq/sqanti/output_polished /home/tom/Documents/tools/SQANTI3-4.2/sqanti3_qc.py \ - data/isoseq/mapping_and_collapse/combined.collapsed.min_fl_2.filtered.gff \ - data/braker2/polished_prot_rna/braker.gtf \ - polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \ - -d data/isoseq/sqanti/output_polished/ \ + runs/isoseq/mapping_and_collapse/combined.collapsed.min_fl_2.filtered.gff \ + runs/braker2/polished_prot_rna/braker.gtf \ + runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \ + -d runs/isoseq/sqanti/output_polished/ \ -n 6 mkdir -p data/isoseq/sqanti/output_old_genome /home/tom/Documents/tools/SQANTI3-4.2/sqanti3_qc.py \ - data/isoseq/mapping_and_collapse_old_genome/combined.collapsed.min_fl_2.filtered.gff \ - data/braker2/polished_prot_rna/braker.gtf \ - /home/tom/Documents/reference_genomes/Ahypochondriacus/assembly/Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa \ - -d data/isoseq/sqanti/output_old_genome/ \ + runs/isoseq/mapping_and_collapse_old_genome/combined.collapsed.min_fl_2.filtered.gff \ + runs/braker2/polished_prot_rna/braker.gtf \ + studies/additional_data/resources/reference_genomes/Ahypochondriacus/assembly//Ahypochondriacus_459_v2.0.softmasked.nospace.underscore.fa \ + -d runs/isoseq/sqanti/output_old_genome/ \ -n 6 diff --git a/workflows/merge_annotation/Braker2_subsets_and_merge.Rmd b/workflows/merge_annotation/Braker2_subsets_and_merge.Rmd index c56fa6e9f03ad064504612db1aaede31286aca52..d7005294484ab68178e936ff076696ee7144f524 100644 --- a/workflows/merge_annotation/Braker2_subsets_and_merge.Rmd +++ b/workflows/merge_annotation/Braker2_subsets_and_merge.Rmd @@ -54,10 +54,10 @@ Granges_from_gtf <- function(gtf){ seqnames = unique(chr), # all sequences should be on the same chromosome gene_strand = unique(strand)) # use the gene_structures object to create the genomic ranges object - gene_ranges <- GRanges(seqnames = gene_structures$seqnames, - ranges = IRanges(start=gene_structures$gene_start, + gene_ranges <- GRanges(seqnames = gene_structures$seqnames, + ranges = IRanges(start=gene_structures$gene_start, end=gene_structures$gene_end, - names = gene_structures$transcript_id), + names = gene_structures$transcript_id), strand = gene_structures$gene_strand) return(gene_ranges) } @@ -86,21 +86,21 @@ report_both_codons <- function(gtf_object){ Create differently supported subsets using the available script from augustus/braker2: ```{bash} -mkdir -p data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets +mkdir -p runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets # activate busco environment for the Augustus script conda activate busco # extract braker aa and cds, also generate the bad_genes.lst file -/home/tom/Documents/tools/Augustus/scripts/getAnnoFastaFromJoingenes.py -g polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta -f data/braker2/polished_prot_rna/braker.gtf -s FILTER -o data/braker2/polished_prot_rna/braker_extracted +/home/tom/Documents/tools/Augustus/scripts/getAnnoFastaFromJoingenes.py -g runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta -f runs/braker2/polished_prot_rna/braker.gtf -s FILTER -o runs/braker2/polished_prot_rna/braker_extracted # select differently supported subsets based on the amount of support -tools/BRAKER/scripts/predictionAnalysis/selectSupportedSubsets.py --noSupport data/braker_analysis/external_evidence/polished_prot_rna/no_support.txt --fullSupport data/braker_analysis/external_evidence/polished_prot_rna/full_support.txt --anySupport data/braker_analysis/external_evidence/polished_prot_rna/any_support.txt data/braker2/polished_prot_rna/braker.gtf data/braker2/polished_prot_rna/hintsfile.gff +tools/BRAKER/scripts/predictionAnalysis/selectSupportedSubsets.py --noSupport runs/braker_analysis/external_evidence/polished_prot_rna/no_support.txt --fullSupport runs/braker_analysis/external_evidence/polished_prot_rna/full_support.txt --anySupport runs/braker_analysis/external_evidence/polished_prot_rna/any_support.txt runs/braker2/polished_prot_rna/braker.gtf runs/braker2/polished_prot_rna/hintsfile.gff # remove all lines that contain "#" characters in the beginning (cause issues at later steps) -sed '/^#/d' data/braker_analysis/external_evidence/polished_prot_rna/full_support.txt > data/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf -sed '/^#/d' data/braker_analysis/external_evidence/polished_prot_rna/any_support.txt > data/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf -sed '/^#/d' data/braker_analysis/external_evidence/polished_prot_rna/no_support.txt > data/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf +sed '/^#/d' runs/braker_analysis/external_evidence/polished_prot_rna/full_support.txt > runs/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf +sed '/^#/d' runs/braker_analysis/external_evidence/polished_prot_rna/any_support.txt > runs/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf +sed '/^#/d' runs/braker_analysis/external_evidence/polished_prot_rna/no_support.txt > runs/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf ``` @@ -108,12 +108,12 @@ Generate a dataframe with the transcript ids based on the differently supported ```{r} # load in the support set gtfs -full.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf") -any.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf") -no.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf") +full.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf") +any.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf") +no.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf") # load in the genes with internal stop codons, as detected by the getAnnoFastaFromJoingenes script (see braker2_results folder readme.txt) -badgenes <- read.table("data/braker_analysis/external_evidence/polished_prot_rna/bad_genes.lst") +badgenes <- read.table("runs/braker_analysis/external_evidence/polished_prot_rna/bad_genes.lst") # extract the gene names with both, annotated start and annotated stop codons codons_any <- report_both_codons(any.support) @@ -130,21 +130,21 @@ no.ids <- unique(no.support$transcript_id) #7443 # create dataframe with the subset transcript ids and the support category all.ids <- c(full.ids, partial.ids, no.ids) -support <- c(rep("full", length(full.ids)), - rep("partial", length(partial.ids)), +support <- c(rep("full", length(full.ids)), + rep("partial", length(partial.ids)), rep("no", length(no.ids))) support.df <- data.frame(all.ids, support) # filter out internal stop codons -support.df <- support.df[!support.df$all.ids %in% badgenes$V1,] +support.df <- support.df[!support.df$all.ids %in% badgenes$V1,] # exclude all annotated genes that do not have annotated start and stop codons fixed_support.df <- support.df[support.df$all.ids %in% codons,] ################################## # write created dataframe as rds object: -saveRDS(fixed_support.df, file="data/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS") +saveRDS(fixed_support.df, file="runs/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS") ``` @@ -153,12 +153,12 @@ Filter out predicted genes with internal stop codons as well as predicted genes ```{r} # load in the gtf files to subset -full.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf") -any.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf") -no.support <- read.gtf("data/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf") +full.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/full_support_fixed.gtf") +any.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/any_support_fixed.gtf") +no.support <- read.gtf("runs/braker_analysis/external_evidence/polished_prot_rna/no_support_fixed.gtf") # load in the annotation dataframe: -support.df <- readRDS(file="data/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS") +support.df <- readRDS(file="runs/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS") # create partial support dataframe: partial.support <- any.support[!any.support$transcript_id %in% full.support$transcript_id,] @@ -177,28 +177,28 @@ no.support.filtered <- no.support[no.support$transcript_id %in% support.df$all.i #length(unique(no.support.filtered$transcript_id)) # 3579 remain ### Write the filtered gtf files -write.table(full.support.filtered[,1:9], - "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.gtf", +write.table(full.support.filtered[,1:9], + "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.gtf", col.names = F, row.names = F, quote=F, sep ="\t") -write.table(partial.support.filtered[,1:9], - "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.gtf", +write.table(partial.support.filtered[,1:9], + "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.gtf", col.names = F, row.names = F, quote=F, sep ="\t") -write.table(no.support.filtered[,1:9], - "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.gtf", +write.table(no.support.filtered[,1:9], + "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.gtf", col.names = F, row.names = F, quote=F, sep ="\t") ``` -After filtering the gtf files, use the filtered files in order to subset the codingseq and amino acid fasta files. Load the coding sequence and amino acid fasta files and filter them using the support dataframe. +After filtering the gtf files, use the filtered files in order to subset the codingseq and amino acid fasta files. Load the coding sequence and amino acid fasta files and filter them using the support dataframe. ```{r} library(seqinr) # load in the support dataframe -support.df <- readRDS(file="data/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS") +support.df <- readRDS(file="runs/braker_analysis/external_evidence/polished_prot_rna/external_evidence.RDS") support.df$all.ids <- as.character(support.df$all.ids) # subset fasta files using the support dataframe -braker_dna <- read.fasta("data/braker2/polished_prot_rna/braker_extracted.codingseq", +braker_dna <- read.fasta("runs/braker2/polished_prot_rna/braker_extracted.codingseq", seqtype = "DNA") # filter the different subsets @@ -208,22 +208,22 @@ braker_dna.partial.filtered <- braker_dna[getName(braker_dna) %in% support.df[su braker_dna.no.filtered <- braker_dna[getName(braker_dna) %in% support.df[support.df$support == "no",1]] # 7014 sequences # write subsets -write.fasta(sequences=braker_dna.filtered, - names=names(braker_dna.filtered), - file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.fasta") -write.fasta(sequences=braker_dna.full.filtered, - names=names(braker_dna.full.filtered), - file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.fasta") -write.fasta(sequences=braker_dna.partial.filtered, - names=names(braker_dna.partial.filtered), - file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.fasta") -write.fasta(sequences=braker_dna.no.filtered, - names=names(braker_dna.no.filtered), - file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.fasta") +write.fasta(sequences=braker_dna.filtered, + names=names(braker_dna.filtered), + file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.fasta") +write.fasta(sequences=braker_dna.full.filtered, + names=names(braker_dna.full.filtered), + file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.fasta") +write.fasta(sequences=braker_dna.partial.filtered, + names=names(braker_dna.partial.filtered), + file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.fasta") +write.fasta(sequences=braker_dna.no.filtered, + names=names(braker_dna.no.filtered), + file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.fasta") ### repeat for the AA fasta files: # subset fasta files using the support dataframe -braker_aa <- read.fasta("data/braker2/polished_prot_rna/braker_extracted.aa", +braker_aa <- read.fasta("runs/braker2/polished_prot_rna/braker_extracted.aa", seqtype = "AA") braker_aa.filtered <- braker_aa[getName(braker_aa) %in% support.df[,1]] # 39141 sequences @@ -232,20 +232,16 @@ braker_aa.partial.filtered <- braker_aa[getName(braker_aa) %in% support.df[suppo braker_aa.no.filtered <- braker_aa[getName(braker_aa) %in% support.df[support.df$support == "no",1]] # 7014 sequences # write subsets -write.fasta(sequences=braker_aa.filtered, - names=names(braker_aa.filtered), - file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.faa") -write.fasta(sequences=braker_aa.full.filtered, - names=names(braker_aa.full.filtered), - file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.faa") -write.fasta(sequences=braker_aa.partial.filtered, - names=names(braker_aa.partial.filtered), - file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.faa") -write.fasta(sequences=braker_aa.no.filtered, - names=names(braker_aa.no.filtered), - file.out = "data/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.faa") +write.fasta(sequences=braker_aa.filtered, + names=names(braker_aa.filtered), + file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/braker_all.faa") +write.fasta(sequences=braker_aa.full.filtered, + names=names(braker_aa.full.filtered), + file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/full_support.faa") +write.fasta(sequences=braker_aa.partial.filtered, + names=names(braker_aa.partial.filtered), + file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/partial_support.faa") +write.fasta(sequences=braker_aa.no.filtered, + names=names(braker_aa.no.filtered), + file.out = "runs/braker_analysis/external_evidence/polished_prot_rna/fixed_subsets/no_support.faa") ``` - - - - diff --git a/workflows/merge_annotation/readme.txt b/workflows/merge_annotation/readme.txt index 07756106ab59e572559a654025fa157186cd1b72..6c4599ca2b3835408bfec628bb1320d68e4cf6cf 100644 --- a/workflows/merge_annotation/readme.txt +++ b/workflows/merge_annotation/readme.txt @@ -4,8 +4,8 @@ Combine the computational annotation with full-length transcript sequencing data ### Script order: -- code/merge_annotation/Braker2_subsets_and_merge.Rmd +- workflows/merge_annotation/Braker2_subsets_and_merge.Rmd Prepare BRAKER2 input, use only predicted genes supported by external evidence for the merge -- code/merge_annotation/reannotation_correction.Rmd +- workflows/merge_annotation/reannotation_correction.Rmd Merge BRAKER2 and full-length transcript sequencing data using TSEBRA, deduplicate, rename genes and compare annotation completeness using BUSCO diff --git a/workflows/merge_annotation/reannotation_correction.Rmd b/workflows/merge_annotation/reannotation_correction.Rmd index d056ed1422893d7094c2a76959383ee851965b2c..29687114b52bc87c69d168942bc854252bb11572 100644 --- a/workflows/merge_annotation/reannotation_correction.Rmd +++ b/workflows/merge_annotation/reannotation_correction.Rmd @@ -56,10 +56,10 @@ Granges_from_gtf <- function(gtf){ seqnames = unique(chr), # all sequences should be on the same chromosome gene_strand = unique(strand)) # use the gene_structures object to create the genomic ranges object - gene_ranges <- GRanges(seqnames = gene_structures$seqnames, - ranges = IRanges(start=gene_structures$gene_start, + gene_ranges <- GRanges(seqnames = gene_structures$seqnames, + ranges = IRanges(start=gene_structures$gene_start, end=gene_structures$gene_end, - names = gene_structures$transcript_id), + names = gene_structures$transcript_id), strand = gene_structures$gene_strand) return(gene_ranges) } @@ -101,7 +101,7 @@ lists.of.overlaps <- function(overlaps){ list.out <- list() vec <- c() i <- 1 - + # while there are still selfoverlapping sequences while (length(selfoverlap) > 0){ # check each overlap if it belongs to a selfoverlapping sequence @@ -175,12 +175,12 @@ extract_attributes <- function(gtf_column, att_of_interest){ Combine computational annotation and Isoseq transcripts into a single genome annotation. Predict ORFs in the transcript sequences using CPC2. ```{bash} -mkdir data/isoseq/cpc/ +mkdir runs/isoseq/cpc/ # Predict coding sequence in braker2 transcripts for TSEBRA tools/CPC2_standalone-1.0.1/bin/CPC2.py \ - -i data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.fasta \ - -o data/isoseq/cpc2/cpc2_from_sqanti \ + -i runs/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.fasta \ + -o runs/isoseq/cpc2/cpc2_from_sqanti \ --ORF ``` @@ -188,24 +188,24 @@ Convert CPC2 output into a gtf file of predicted transcript coding sequences. CP ```{r} # prepare a bed file from the results of cpc2 -cpc2 <- read.table("data/isoseq/cpc2/cpc2_from_sqanti.txt") +cpc2 <- read.table("runs/isoseq/cpc2/cpc2_from_sqanti.txt") # filter for coding transcripts with an intact ORF cpc2 <- cpc2 %>% filter(V9 == "coding" & V6 == 1) %>% summarise(ID=V1, start=V7-1, end=V7+(V3*3)-1) # write as bed file for extraction of CDS -write_tsv(cpc2, - file="data/isoseq/cpc2/cpc2_extraction.bed", +write_tsv(cpc2, + file="runs/isoseq/cpc2/cpc2_extraction.bed", quote = "none", col_names = F) # Convert cpc2 output to gtf file: # read in gtf file of isoseq transcript data -isoseq.gtf <- read.gtf("data/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.gtf") +isoseq.gtf <- read.gtf("runs/isoseq/sqanti/output_polished/combined.collapsed.min_fl_2.filtered_corrected.gtf") isoseq.gtf <- isoseq.gtf[isoseq.gtf$type == "exon",] # read in cpc2 output file, the bed file used for extraction does suffice -cpc2.bed <- read.table("data/isoseq_data/cpc2/cpc2_extraction.bed") +cpc2.bed <- read.table("runs/isoseq_runs/cpc2/cpc2_extraction.bed") colnames(cpc2.bed) <- c("ID", "CDS_start", "CDS_end") # bed format is 0 based, convert back to a 1 based format, end position does not have to converted cpc2.bed$CDS_start <- cpc2.bed$CDS_start+1 @@ -242,7 +242,7 @@ for (i in 1:length(ids)){ shifted[length(shifted)+1] <- shifted[length(shifted)]+1 # which values change by more than 1 after one shift? These values represent exon boundaries boundaries <- which(abs(cds_coordinates - shifted) != 1) - + # create matrix from which to construct the start and end positions of the gtf file # create matrices for single exon genes also, as they are converted to vectors in a later step otherwise if (length(boundaries) > 0){ @@ -262,13 +262,13 @@ for (i in 1:length(ids)){ mat[1,2] <- cds_coordinates[1] mat[1,1] <- cds_coordinates[length(cds_coordinates)] } - + # invert matrix for minus strand in order to start with the lowest number # only if there are multiple exons if (stranded == "-" & nrow(mat) != 1){ mat <- apply(apply(mat, 1, rev), 1, rev) } - + # use the previously created gtf structure to store the CDS coordinates # the number of CDS rows is always <= number of exon rows out.subset <- gtf.subset[1:nrow(mat),] @@ -283,33 +283,33 @@ for (i in 1:length(ids)){ # output represents all CDS records # write gtf file -write.table(output[,1:9], - "data/isoseq/cpc2/cpc2_extracted_cds.gtf", +write.table(output[,1:9], + "runs/isoseq/cpc2/cpc2_extracted_cds.gtf", col.names = F, row.names = F, quote=F, sep ="\t") ``` Run TSEBRA to combine computational prediction with Iso-Seq transcripts, using gffread for deduplication: ```{bash} -mkdir -p data/braker_analysis/TSEBRA -mkdir -p data/reannotation_correction/computational -mkdir -p data/reannotation_correction/manual +mkdir -p runs/braker_analysis/TSEBRA +mkdir -p runs/reannotation_correction/computational +mkdir -p runs/reannotation_correction/manual # -g specifies the braker gtf, -c long read config file, -e braker hint file -l isoseq cpc2 extracted gtf file /home/tom/Documents/tools/TSEBRA/bin/tsebra.py \ - -g data/braker_analysis/external_evidence/fixed_subsets/full_partial_support.gtf \ + -g runs/braker_analysis/external_evidence/fixed_subsets/full_partial_support.gtf \ -c /home/tom/Documents/tools/TSEBRA/config/long_reads.cfg \ - -e data/braker2/polished_prot_rna/hintsfile.gff \ - -l data/isoseq/cpc2/cpc2_extracted_cds.gtf \ - -o data/braker_analysis/TSEBRA/tsebra.gtf + -e runs/braker2/polished_prot_rna/hintsfile.gff \ + -l runs/isoseq/cpc2/cpc2_extracted_cds.gtf \ + -o runs/braker_analysis/TSEBRA/tsebra.gtf # To remove any duplicated entries and to cluster the predicted transcripts into loci from the tsebra gtf file I used gffread. # -M option merges identical entries, -K option causes stricter merge, -T outputs as gtf (did also output as gff file) /home/tom/Documents/tools/gffread-0.12.7/gffread -M -K -T \ - -d data/braker_analysis/TSEBRA/duplication_info.txt \ - data/braker_analysis/TSEBRA/tsebra.gtf > \ - data/braker_analysis/TSEBRA/tsebra_dedup.gtf + -d runs/braker_analysis/TSEBRA/duplication_info.txt \ + runs/braker_analysis/TSEBRA/tsebra.gtf > \ + runs/braker_analysis/TSEBRA/tsebra_dedup.gtf ``` @@ -317,7 +317,7 @@ After the finalisation of the genome prediction using TSEBRA and the deduplicati ```{r} # read in the annotation -tsebra.gtf <- read.gtf("data/braker_analysis/TSEBRA/tsebra_dedup.gtf") +tsebra.gtf <- read.gtf("runs/braker_analysis/TSEBRA/tsebra_dedup.gtf") # create a mapping from locus the gene id, the locus is only found in rows of the "transcript" type transcript_locus_mapping <- tsebra.gtf %>% @@ -332,7 +332,7 @@ tsebra.gtf <- left_join(tsebra.gtf, transcript_locus_mapping) # save information as RDS output <- tsebra.gtf %>% select(source, gene_id, transcript_id, locus) -saveRDS(output, "data/reannotation_correction/computational/locus.RDS") +saveRDS(output, "runs/reannotation_correction/computational/locus.RDS") # create a dataframe indicating the order of the scaffolds and contigs @@ -355,9 +355,9 @@ tsebra.gtf.sorted <- tsebra.gtf %>% # use only one record per transcript, sort by locus and add number with the number of transcript at that locus # to later create the transcript names -transcript_id <- tsebra.gtf.sorted %>% - filter(type == "transcript") %>% - group_by(locus) %>% +transcript_id <- tsebra.gtf.sorted %>% + filter(type == "transcript") %>% + group_by(locus) %>% mutate(occ = 1:n()) %>% ungroup() %>% select(transcript_id, occ) @@ -366,15 +366,15 @@ tsebra.gtf.sorted <- left_join(tsebra.gtf.sorted, transcript_id, by=c("transcrip # write temporary RDS file: -saveRDS(tsebra.gtf.sorted, - "data/reannotation_correction/computational/tsebra_temp3.RDS") +saveRDS(tsebra.gtf.sorted, + "runs/reannotation_correction/computational/tsebra_temp3.RDS") ``` Rename the transcripts and save again as a renamed gtf file: ```{r} # read in RDS file -tsebra.gtf.sorted <- readRDS("data/reannotation_correction/computational/tsebra_temp3.RDS") +tsebra.gtf.sorted <- readRDS("runs/reannotation_correction/computational/tsebra_temp3.RDS") # rename the locus and transcript ID records to match # add column indicating the gene order @@ -393,19 +393,19 @@ gene_identifier <- paste("AHp", order_filled, sep="") gene.id.df <- data.frame(ids, order3, order_filled, gene_identifier) tsebra.gtf.sorted <- left_join(tsebra.gtf.sorted, gene.id.df, by = c("locus" = "ids")) -# create the transcript identifier based on gene identifier +# create the transcript identifier based on gene identifier tsebra.gtf.sorted <- tsebra.gtf.sorted %>% mutate(transcript_identifier = paste0(gene_identifier, ".", occ)) # include new attribute column -tsebra.gtf.sorted$new_attributes <- paste0("gene_id \"", - tsebra.gtf.sorted$gene_identifier, - "\"; transcript_id \"", - tsebra.gtf.sorted$transcript_identifier, +tsebra.gtf.sorted$new_attributes <- paste0("gene_id \"", + tsebra.gtf.sorted$gene_identifier, + "\"; transcript_id \"", + tsebra.gtf.sorted$transcript_identifier, "\";") # create mapping file of old and new names: mapping <- tsebra.gtf.sorted %>% select(gene_id, transcript_id, locus, gene_identifier, transcript_identifier) -saveRDS(mapping, "data/reannotation_correction/computational/mapping.RDS") +saveRDS(mapping, "runs/reannotation_correction/computational/mapping.RDS") # create final gtf file in the correct column order: output <- tsebra.gtf.sorted %>% @@ -413,14 +413,14 @@ output <- tsebra.gtf.sorted %>% select(chr, source, type, start, end, score, strand, phase, new_attributes) # write gtf -write.table(output[,1:9], - "data/reannotation_correction/computational/tsebra_renamed.gtf", +write.table(output[,1:9], + "runs/reannotation_correction/computational/tsebra_renamed.gtf", col.names = F, row.names = F, quote=F, sep ="\t") ``` -Based on evidence from sequencing data, the sequence of the AmMYBl1 gene (AHp014591) was manually adjusted. The resulting file was saved as data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf. Gffread was used to extract cds and protein fasta files of the annotated genes and convert the gtf file to the gff3 format, representing the genome annotation v2.2. +Based on evidence from sequencing data, the sequence of the AmMYBl1 gene (AHp014591) was manually adjusted. The resulting file was saved as runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf. Gffread was used to extract cds and protein fasta files of the annotated genes and convert the gtf file to the gff3 format, representing the genome annotation v2.2. -More manual adjustment is required. First, the phase field in the annotation file is incorrect (likely an issue with TSEBRA). Secondly, seemingly a bug in TSEBRA causes the annotated sequence to be changed in the case that the stop codon is split up by an intron. I will save the annotation with the manually corrected AmMYBl1 as a gtf file under a different name and fix the phasing issue. The new manually corrected input file is data/reannotation_correction/manual/manually_MYBl1_corrected.gtf. +More manual adjustment is required. First, the phase field in the annotation file is incorrect (likely an issue with TSEBRA). Secondly, seemingly a bug in TSEBRA causes the annotated sequence to be changed in the case that the stop codon is split up by an intron. I will save the annotation with the manually corrected AmMYBl1 as a gtf file under a different name and fix the phasing issue. The new manually corrected input file is runs/reannotation_correction/manual/manually_MYBl1_corrected.gtf. As a first step, the genes with wrong values in their last exons are manually adjusted. This is the list of identifiers with an incorrect last exon, the genes were corrected based on their annotated cpc2 cds predictions. ERROR: Proteins do not match for transcript AHp001894.1 Strand:+ Exons: 2 checked, corrected @@ -450,7 +450,7 @@ names(annotation.cds.fasta[which((width(annotation.cds.fasta) %% 3) != 0)]) ```{r} -annotation.gtf <- read.gtf("data/reannotation_correction/manual/manually_MYBl1_corrected.gtf") +annotation.gtf <- read.gtf("runs/reannotation_correction/manual/manually_MYBl1_corrected.gtf") # searched the above list in command line using grep -n to identify the lines which need to be corrected annotation.gtf[14134,4] <- 31584484 annotation.gtf[93858,4] <- 340181 @@ -486,8 +486,8 @@ annotation.gtf[178221,5] <- 807 # minus strand, only adjust by 1 # write corrected table -write.table(annotation.gtf[,1:9], - "data/reannotation_correction/manual/manually_genes_corrected.gtf", +write.table(annotation.gtf[,1:9], + "runs/reannotation_correction/manual/manually_genes_corrected.gtf", col.names = F, row.names = F, quote=F, sep ="\t") ``` @@ -495,40 +495,40 @@ After manual correction of genes, fix the phase attribute. ```{bash} # Add a gene line and convert to gff for gffvalidator. Correct the phasing field using gffvalidator. -/home/tom/Documents/tools/gffread-0.12.7/gffread -E data/reannotation_correction/manual/manually_genes_corrected.gtf --keep-genes > data/reannotation_correction/manual/manually_genes_corrected.gene.gff +/home/tom/Documents/tools/gffread-0.12.7/gffread -E runs/reannotation_correction/manual/manually_genes_corrected.gtf --keep-genes > runs/reannotation_correction/manual/manually_genes_corrected.gene.gff # the genes with wrong coordinates should be fixed at this step, so that they can be assigned the correct phasing attribute -gt gff3 -tidy -force -retainids -addids no -o data/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff data/reannotation_correction/manual/manually_genes_corrected.gene.gff +gt gff3 -tidy -force -retainids -addids no -o runs/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff runs/reannotation_correction/manual/manually_genes_corrected.gene.gff # remove empty lines from the gtf file, with only "###" -grep -v "###" data/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff > data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff +grep -v "###" runs/reannotation_correction/manual/manually_genes_corrected.gene.phase.gff > runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff ``` Convert to other formats using gffread. ```{bash} # convert to gtf format -/home/tom/Documents/tools/gffread-0.12.7/gffread -v -T --keep-genes data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff > data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf +/home/tom/Documents/tools/gffread-0.12.7/gffread -v -T --keep-genes runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff > runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf # extract prot (-y) and cds (-x) fasta files -/home/tom/Documents/tools/gffread-0.12.7/gffread -x data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta \ - -y data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.prot.fasta \ - -g polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \ - data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf - +/home/tom/Documents/tools/gffread-0.12.7/gffread -x runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta \ + -y runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.prot.fasta \ + -g runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta \ + runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf + # convert to gff3 format -#/home/tom/Documents/tools/gffread-0.12.7/gffread data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf > \ -# data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff - +#/home/tom/Documents/tools/gffread-0.12.7/gffread runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gtf > \ +# runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.gff + # copy to the annotation directory: -cp data/reannotation_correction/manual/A* polished_genome_annotation/annotation/ +cp runs/reannotation_correction/manual/A* runs/polished_genome_annotation/annotation/ ``` Perform quality control of the annotated genes: ```{r} # load in cds fasta -annotation.cds.fasta <- readBStringSet(filepath = "data/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta") +annotation.cds.fasta <- readBStringSet(filepath = "runs/reannotation_correction/manual/Ahypochondriacus_2.2_polished_corrected.cds.fasta") # are there genes with length != a multiple of 3? which((width(annotation.cds.fasta) %% 3) != 0) @@ -549,30 +549,28 @@ Compare annotation completeness of genome annotation v2.2 with previously publis # genome annotation v2.2 # compare against the orthoDBv10 Embryophyta dataset, using 7 threads busco -m protein \ - -i polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \ + -i runs/polished_genome_annotation/annotation/Ahypochondriacus_2.2_polished_corrected.prot.fasta \ -o genome_annotation_v2.2 \ -l embryophyta_odb10 \ - --out_path data/annotation_analysis/busco/ \ - --download_path data/annotation_analysis/busco/datasets/ \ + --out_path runs/annotation_analysis/busco/ \ + --download_path runs/annotation_analysis/busco/datasets/ \ -c 6 - + # genome annotation v2.1 busco -m protein \ -i /home/tom/Documents/reference_genomes/Ahypochondriacus/annotation/Ahypochondriacus_459_v2.1.protein.fa \ -o genome_annotation_v2.1 \ -l embryophyta_odb10 \ - --out_path data/annotation_analysis/busco/ \ - --download_path data/annotation_analysis/busco/datasets/ \ + --out_path runs/annotation_analysis/busco/ \ + --download_path runs/annotation_analysis/busco/datasets/ \ -c 6 - + # A. cruentus annotation busco -m protein \ -i /home/tom/Documents/reference_genomes/Acruentus/annotation/Amacr_pep_20210312.tfa \ -o Acruentus_annotation \ -l embryophyta_odb10 \ - --out_path data/annotation_analysis/busco/ \ - --download_path data/annotation_analysis/busco/datasets/ \ + --out_path runs/annotation_analysis/busco/ \ + --download_path runs/annotation_analysis/busco/datasets/ \ -c 6 ``` - - diff --git a/workflows/other_inferences/infer_coverage_along_genome.sh b/workflows/other_inferences/infer_coverage_along_genome.sh index f534a07f001a3ebfbb676bd2d448a4e45c08cccb..8db44b51b0be8b847020d4121ab7cb8fa517668b 100644 --- a/workflows/other_inferences/infer_coverage_along_genome.sh +++ b/workflows/other_inferences/infer_coverage_along_genome.sh @@ -16,10 +16,10 @@ module load samtools/1.13 #Set input and parameters -read1=raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq.gz -read2=raw_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq.gz -input=polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta -outdir=data/mapping_bias_inference/genome_coverage/ +read1=studies/additional_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_1.fastq.gz +read2=studies/additional_data/lightfoot_WGS_short_reads/SRR2106212/SRR2106212_2.fastq.gz +input=runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta +outdir=runs/mapping_bias_inference/genome_coverage/ outbam=SRR2106212.sorted.bam outdedup=SRR2106212.sorted.dedup.bam outcov=SRR2106212.coverage diff --git a/workflows/other_inferences/mapping_bias_inference_analysis.Rmd b/workflows/other_inferences/mapping_bias_inference_analysis.Rmd index 5de4884a74dc617ac05bf6f467e3afcb015b986f..97eb942b6a09d110545361c60a301a5394fbcc6c 100644 --- a/workflows/other_inferences/mapping_bias_inference_analysis.Rmd +++ b/workflows/other_inferences/mapping_bias_inference_analysis.Rmd @@ -18,7 +18,7 @@ Analyse the observed bias in read mapping observed in ATAC and WG sequencing. A ```{r} # read in data -coverage <- read.table(file = "data/mapping_bias_inference/genome_coverage/SRR2106212.sorted.dedup.10.depth") +coverage <- read.table(file = "runs/mapping_bias_inference/genome_coverage/SRR2106212.sorted.dedup.10.depth") # calculate coverage in windows with fixed window size windowsize <- 5000 @@ -46,7 +46,7 @@ ggplot(data = window_coverage) + theme(text = element_text(size = 22)) # save plot -ggsave(filename = "plots/mapping_bias_inference/50k_window_read_coverage.png", +ggsave(filename = "runs/plots/mapping_bias_inference/50k_window_read_coverage.png", width = 14, height = 8) ``` @@ -54,11 +54,11 @@ Investigate if genomic features correlate with the observed mapping bias. Calcul ```{bash} # create genome file for bedtools -awk -F'\t' 'BEGIN {OFS = FS} {print $1,$2}' /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta.fai > data/mapping_bias_inference/bedtools_genome.txt +awk -F'\t' 'BEGIN {OFS = FS} {print $1,$2}' /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta.fai > runs/mapping_bias_inference/bedtools_genome.txt # make windows using bedtools, windowsize 5k, include only Scaffolds -bedtools makewindows -g data/mapping_bias_inference/bedtools_genome.txt -w 5000 | grep "Scaffold" > data/mapping_bias_inference/all_genome_windows.bed +bedtools makewindows -g runs/mapping_bias_inference/bedtools_genome.txt -w 5000 | grep "Scaffold" > runs/mapping_bias_inference/all_genome_windows.bed # calculate statistics in windows -bedtools nuc -fi /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta -bed data/mapping_bias_inference/genome_windows.bed > data/mapping_bias_inference/window_stats.txt +bedtools nuc -fi /projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta -bed runs/mapping_bias_inference/genome_windows.bed > runs/mapping_bias_inference/window_stats.txt ``` @@ -66,14 +66,14 @@ Plot Scaffold 10, with read depth and plastid genome content ```{r} # read in paf files from minimap2 -chloroplast.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome.paf") -mito.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome.paf") +chloroplast.paf <- read_paf("runs/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome.paf") +mito.paf <- read_paf("runs/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome.paf") mito.df <- as.data.frame(mito.paf) chloro.df <- as.data.frame(chloroplast.paf) # read in mapping files for cruentus -chloroplast.cruentus.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome_cruentus.paf") -mito.cruentus.paf <- read_paf("data/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome_cruentus.paf") +chloroplast.cruentus.paf <- read_paf("runs/mapping_bias_inference/plastid_to_genome/chloroplast_to_genome_cruentus.paf") +mito.cruentus.paf <- read_paf("runs/mapping_bias_inference/plastid_to_genome/mitochondrium_to_genome_cruentus.paf") mito.cruentus.df <- as.data.frame(mito.cruentus.paf) chloro.cruentus.df <- as.data.frame(chloroplast.cruentus.paf) @@ -110,7 +110,7 @@ ggplot(data = window_coverage) + theme(text = element_text(size = 22)) -ggsave(filename = "plots/mapping_bias_inference/5kb_outlier_windows_zoom.png", +ggsave(filename = "runs/plots/mapping_bias_inference/5kb_outlier_windows_zoom.png", width = 10, height = 6, bg = "white") ``` @@ -123,7 +123,7 @@ mito.df %>% group_by(tname) %>% summarize(total_align_length = sum(alen)) %>% mutate(percent_aligned = (total_align_length/mito.df$qlen[1])*100) %>% - arrange(desc(percent_aligned)) + arrange(desc(percent_aligned)) # how much chloroplast was mapped? chloro.df %>% @@ -172,8 +172,8 @@ There are structural differences in GC content between the different genomes. Wh ```{bash} # GC content of different genomes -seqkit fx2tab --name --gc data/mapping_bias_inference/plastid_to_genome/Ah_chloroplast.fasta -seqkit fx2tab --name --gc data/mapping_bias_inference/plastid_to_genome/Bv_mitochondrium.fasta +seqkit fx2tab --name --gc runs/mapping_bias_inference/plastid_to_genome/Ah_chloroplast.fasta +seqkit fx2tab --name --gc runs/mapping_bias_inference/plastid_to_genome/Bv_mitochondrium.fasta seqkit fx2tab --name --gc polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta | grep "Scaffold" ``` @@ -247,7 +247,7 @@ Compare with functional annotation: ```{r} # load functional annotation -functional_annotation <- readxl::read_xlsx(path = "data/functional_annotation/eggnog_mapper/MM_8wexw920.emapper.annotations.xlsx") +functional_annotation <- readxl::read_xlsx(path = "runs/functional_annotation/eggnog_mapper/MM_8wexw920.emapper.annotations.xlsx") colnames(functional_annotation) <- functional_annotation[2,] functional_annotation <- functional_annotation[-(1:2),] @@ -258,8 +258,3 @@ mito_functions <- functional_annotation %>% chloro_functions <- functional_annotation %>% filter(query %in% chloro_overlap$transcript_id) ``` - - - - - diff --git a/workflows/other_inferences/plastid_to_genome_mapping.sh b/workflows/other_inferences/plastid_to_genome_mapping.sh index 279e7c7d1e95e65d840836c43c53adcf0858d999..b2c694e75b569ead2d428a8f7bc76d07f07ecaf6 100644 --- a/workflows/other_inferences/plastid_to_genome_mapping.sh +++ b/workflows/other_inferences/plastid_to_genome_mapping.sh @@ -13,12 +13,12 @@ source $CONDA_PREFIX/etc/profile.d/conda.sh conda activate isoseq # create output directory -MAPPINGOUT=data/mapping_bias_inference/plastid_to_genome/ +MAPPINGOUT=runs/mapping_bias_inference/plastid_to_genome/ mkdir -p "$MAPPINGOUT" ### MAPPING # map reads onto the reference genome -REFERENCE=/projects/ag-stetter/reference_genomes/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta +REFERENCE=runs/polished_genome_annotation/assembly/Ahypochondriacus_2.2_polished.softmasked.fasta # Align sequences to reference genome # sequences obtained from (Beta vulgaris, mitchondrium): https://www.ncbi.nlm.nih.gov/nuccore/BA000009.3 diff --git a/workflows/other_inferences/readme.txt b/workflows/other_inferences/readme.txt index 9b3f718dbe1354c1915fb386295cd3c12e209776..b49fe8f5741a93d84e891444d9e737e2759ae58e 100644 --- a/workflows/other_inferences/readme.txt +++ b/workflows/other_inferences/readme.txt @@ -4,11 +4,11 @@ Mapping bias and plastid content investigation ### Script order: -- code/other_inferences/infer_coverage_along_genome.sh +- workflows/other_inferences/infer_coverage_along_genome.sh Map WGS reads against the reference genome to investigate mapping bias -- code/other_inferences/plastid_to_genome_mapping.sh +- workflows/other_inferences/plastid_to_genome_mapping.sh Map chloroplast and mitochondria sequences to the reference genome -- code/other_inferences/mapping_bias_inference_analysis.Rmd +- workflows/other_inferences/mapping_bias_inference_analysis.Rmd Plot mapping bias with annotated chloroplast and mitochondrial sequences diff --git a/workflows/readme.txt b/workflows/readme.txt index 665da22f3863d059cfb0b6ac3f70fd0bd3be8701..63f041c071444fc733ff4b8b46b4519ab744ac09 100644 --- a/workflows/readme.txt +++ b/workflows/readme.txt @@ -4,26 +4,26 @@ All code used for polishing, masking and reannotation of the A. hypochondriacus ### Order of usage: -- code/genome_polishing/ +- workflows/genome_polishing/ polish the previously published A. hypochondriacus reference genome -- code/repeat_masking/ +- workflows/repeat_masking/ to prepare computational annotation, softmask repetitive elements in the polished genome -- code/braker2/ +- workflows/braker2/ perform computational annotation of the softmasked reference genome using BRAKER2 -- code/isoseq_assembly/ +- workflows/isoseq_assembly/ assembly long read transcript sequencing data and compare effects of genome polishing on reported completeness -- code/merge_annotation/ +- workflows/merge_annotation/ merge computational annotation with long-read transcript sequencing data and process output into genome annotation v2.2 -- code/annotation_analysis/ +- workflows/annotation_analysis/ identify flavonoid and betalain pathway genes, as well as MYB transcription factors in the genome annotation v2.2 -- code/other_inferences/ +- workflows/other_inferences/ mapping of chloroplast and mitchondrium sequences to the reference genome, analysis of mapping coverage bias -- code/BSA/ +- workflows/BSA/ bulk segregant analysis and analysis of RNA-seq data from pooled flower tissue diff --git a/workflows/repeat_masking/analyse_repetitive_elements.Rmd b/workflows/repeat_masking/analyse_repetitive_elements.Rmd index 149b81c0d133112fa45d751176ceba893326bb0d..40eaa9afe525e587dac3e0b3a2d8e792db1967b6 100644 --- a/workflows/repeat_masking/analyse_repetitive_elements.Rmd +++ b/workflows/repeat_masking/analyse_repetitive_elements.Rmd @@ -16,7 +16,7 @@ Prepare input file for processing using R: ```{bash} # process build summary output -tail -n +41 data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary | head -n -907 > data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv +tail -n +41 runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary | head -n -907 > runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv ``` @@ -24,12 +24,12 @@ Analyse repetitive element content and composition: ```{r} # read in buildSummary -buildSummary <- read_table(file = "data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv", +buildSummary <- read_table(file = "runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary.tsv", col_names = c("repeat", "count", "length", "percent","drop")) buildSummary <- buildSummary[,1:4] # read in repeat classification -new_TE_classification <- read_delim(file = "data/repeatmasking/reclassification/classified.updated.txt", +new_TE_classification <- read_delim(file = "runs/repeatmasking/reclassification/classified.updated.txt", delim = "#", col_names = c("TE_family", "classification")) @@ -49,6 +49,6 @@ summary_table %>% summarize(sum_total_length = sum(total_length), sum_percentage = sum(percentage)) -write.csv(summary_table, file = "data/repeatmasking/reclassification/reclassified.output.tbl", +write.csv(summary_table, file = "runs/repeatmasking/reclassification/reclassified.output.tbl", quote = F, row.names = F) ``` diff --git a/workflows/repeat_masking/readme.txt b/workflows/repeat_masking/readme.txt index 0fa63432df4359bd85b9c9d507b8363b471fd415..609c3be3937d01c2b7449f1346498ec293ac7c58 100644 --- a/workflows/repeat_masking/readme.txt +++ b/workflows/repeat_masking/readme.txt @@ -4,11 +4,11 @@ The polished reference genome is masked for repetitive elements in order to prep ### Script order: -- code/repeat_masking/run_repeatmodeler.sh +- workflows/repeat_masking/run_repeatmodeler.sh run Repeatmodeler to identify repetitive elements in the polished reference genome -- code/repeat_masking/run_repeatmasker.sh +- workflows/repeat_masking/run_repeatmasker.sh run Repeatmasker on the Repeatmodeler output to classify identified elements and mask the polished reference genome -- code/repeat_masking/analyse_repetitive_elements.Rmd +- workflows/repeat_masking/analyse_repetitive_elements.Rmd analyse the repetitive element composition diff --git a/workflows/repeat_masking/run_repeatmasker.sh b/workflows/repeat_masking/run_repeatmasker.sh index d848d5e77107ff63567b624be9cdda4c7ccfdf8e..fa9cdbade24c88f58b6bd8d13a076326608a91a5 100644 --- a/workflows/repeat_masking/run_repeatmasker.sh +++ b/workflows/repeat_masking/run_repeatmasker.sh @@ -17,7 +17,7 @@ module load repeatmasker/4.1.1 # create database directory -RMOUT=data/repeatmasking/repeatmasker +RMOUT=runs/repeatmasking/repeatmasker mkdir -p $RMOUT @@ -26,12 +26,12 @@ mkdir -p $RMOUT # for 20 threads, the pa setting while using rmblast should be 5 # Converted the polished reference genome fasta to all capital letters as preparation to repeatmasking: -awk '/^>/ {print($0)}; /^[^>]/ {print(toupper($0))}' data/NextPolish/processed/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta \ +awk '/^>/ {print($0)}; /^[^>]/ {print(toupper($0))}' runs/NextPolish/processed/Ahypochondriacus/V2_2/Ahypochondriacus_2.2_polished.softmasked.fasta \ > "$RMOUT"/Ahypochondriacus_2.2_polished.capital.fasta ### run RepeatMasker # lib = repeat database created with repeatmodeler -RepeatMasker -lib data/repeatmasking/repeatmodeler/consensi.fa.classified \ +RepeatMasker -lib runs/repeatmasking/repeatmodeler/consensi.fa.classified \ -pa 5 \ -small \ -e rmblast \ @@ -50,4 +50,4 @@ bedtools maskfasta \ -fo "$RMOUT"/output/Ahypochondriacus_2.2_polished.softmasked.fasta # more detailed summary: -buildSummary.pl data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.out > data/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary +buildSummary.pl runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.out > runs/repeatmasking/repeatmasker/Ahypochondriacus_2.2_polished.capital.fasta.buildSummary diff --git a/workflows/repeat_masking/run_repeatmodeler.sh b/workflows/repeat_masking/run_repeatmodeler.sh index c33ef68b266852defd7cd38671bed273e9532dfd..a2904a74a71e9548265f4f58f9fa4aa04f87a864 100644 --- a/workflows/repeat_masking/run_repeatmodeler.sh +++ b/workflows/repeat_masking/run_repeatmodeler.sh @@ -19,23 +19,23 @@ module load repeatmodeler/2.0.1 # create database directory -mkdir -p data/repeatmodeler/database/ +mkdir -p runs/repeatmodeler/database/ ### Main # increased number of tasks, each rmblast job will take 4 threads # for 20 threads, the pa setting while using rmblast should be 5 # Create database -BuildDatabase -name data/repeatmodeler/database/polished \ - data/NextPolish/processed/Ahypochondriacus_2.2_polished.fasta +BuildDatabase -name runs/repeatmodeler/database/polished \ + runs/NextPolish/processed/Ahypochondriacus_2.2_polished.fasta # run Repeatmodeler -RepeatModeler -database data/repeatmodeler/database/polished \ +RepeatModeler -database runs/repeatmodeler/database/polished \ -pa 5 \ -LTRStruct # reclassify identified repeats -mkdir -p data/repeatmasking/reclassification/ +mkdir -p runs/repeatmasking/reclassification/ # repeatmasker version 4.1.5 -RepeatClassifier -consensi data/repeatmasking/reclassification/consensi.fa -stockholm data/repeatmasking/reclassification/families.stk +RepeatClassifier -consensi runs/repeatmasking/reclassification/consensi.fa -stockholm runs/repeatmasking/reclassification/families.stk