# %% [markdown]
# ### Read the number of associated TEs for every species and store the counts in a TSV file

# %%
import glob
import os

import gffpandas.gffpandas as gffpd
import matplotlib.pyplot as plt
import pandas as pd

# %%
# Locations on disk used by the cells below. Each path is spelled out once so
# the writer and the reader of the TSV cannot drift apart.
ASSOCIATION_DIR = "<arc/runs/association_gff3>"
SPECIES_TABLE = "<arc/runs>/Fig6_species"
COUNTS_TSV = "<arc/runs>/Fig5_data.tsv"

# %%
# Get all associations: every non-hidden entry of the association directory is
# used as a location name (it becomes a path component and a TSV value below).
all_locations = [
    entry for entry in os.listdir(ASSOCIATION_DIR) if not entry.startswith('.')
]


# %%
def read_species_types(path):
    """Parse the tab-separated species table into a ``{species: type}`` dict.

    The first tab-separated field of each line is used as the species name and
    the second as its type label; any further fields are ignored. Blank lines
    are skipped, so the result does not depend on whether the file ends with a
    newline.

    Parameters
    ----------
    path : str
        Path of the species table.

    Returns
    -------
    dict
        Maps species name to type label, in file order.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than two tab-separated fields.
    """
    species_to_type = {}
    # `with` closes the handle; the table is small enough to parse line by line.
    with open(path) as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            line = raw_line.rstrip('\n')
            if not line.strip():
                continue
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    f"{path}, line {line_number}: expected 'species<TAB>type', got {line!r}"
                )
            species_to_type[fields[0]] = fields[1]
    return species_to_type


# %%
species_dic = read_species_types(SPECIES_TABLE)

# %%
# Write one row per (species, location) pair. The file is opened once and closed
# by the `with` block, also when reading one of the GFF3 files fails.
with open(COUNTS_TSV, "w") as counts_file:
    counts_file.write("species\ttype\tlocation\tnumber_of_associated_tes\n")
    for species, species_type in species_dic.items():
        for location in all_locations:
            # NOTE: "associaton" (sic) is kept as-is; presumably this is how the
            # files on disk are named -- confirm before correcting the spelling.
            gff = gffpd.read_gff3(
                f'{ASSOCIATION_DIR}/{location}/{species}_te_gene_associaton.gff3'
            ).df
            # The count is the number of distinct strings in the `attributes`
            # column; this assumes each distinct string stands for one TE --
            # TODO confirm against how the association files are produced.
            number_of_associated_tes = len(set(gff.attributes))
            counts_file.write(
                f'{species}\t{species_type}\t{location}\t{number_of_associated_tes}\n'
            )

# %% [markdown]
# ### Continue working with the stored counts

# %%
df = pd.read_csv(COUNTS_TSV, sep='\t')
# %%
df

# %%
# Total count over all species, types and locations.
df['number_of_associated_tes'].sum()

# %%
# Total count for the rows whose type is 'c34'.
df.loc[df['type'] == 'c34', 'number_of_associated_tes'].sum()

# %%
# Stacking order of the segments (bottom to top) and one colour per segment.
# Kept in its own name so that the `all_locations` list built from the
# directory listing earlier in the notebook is not overwritten by this cell.
LOCATION_ORDER = ['start_end', 'inside', 'downstream', 'upstream']
COLORS = ['#2a788e', '#E0CB41', '#22a884', '#7ad151']
# (x position, value of the `type` column, tick label) for every bar.
BARS = [(1, 'c3', 'C3'), (2, 'c34', 'C3-C4')]
BAR_WIDTH = 0.75


# %%
def stacked_percentages(counts, te_type, locations):
    """Compute segment heights and bottom offsets for one stacked bar.

    Parameters
    ----------
    counts : pandas.DataFrame
        Frame with the columns ``type``, ``location`` and
        ``number_of_associated_tes``.
    te_type : str
        Value of the ``type`` column selecting the rows for this bar.
    locations : list of str
        Locations in stacking order (bottom first). A location without rows
        contributes a segment of height 0.

    Returns
    -------
    (pandas.Series, pandas.Series)
        Heights and bottom offsets, both in percent of the summed count of
        *all* rows of ``te_type`` and both indexed by ``locations``.

    Raises
    ------
    ValueError
        If the summed count for ``te_type`` is 0, because the percentages
        would be undefined.
    """
    of_type = counts[counts['type'] == te_type]
    total = of_type['number_of_associated_tes'].sum()
    if total == 0:
        raise ValueError(f"no associated TEs found for type {te_type!r}")
    per_location = (
        of_type.groupby('location')['number_of_associated_tes']
        .sum()
        .reindex(locations, fill_value=0)
    )
    # The offset of a segment is the summed count of the segments below it.
    # Accumulating the integer counts (not the percentages) keeps the values
    # identical to filtering the frame again for every segment.
    below = per_location.cumsum().shift(1, fill_value=0)
    return per_location / total * 100, below / total * 100


# %%
fig, ax = plt.subplots()
legend_handles = []
for bar_index, (x_position, te_type, _) in enumerate(BARS):
    heights, bottoms = stacked_percentages(df, te_type, LOCATION_ORDER)
    for location, color in zip(LOCATION_ORDER, COLORS):
        segment = ax.bar(
            x_position,
            heights[location],
            bottom=bottoms[location],
            color=color,
            width=BAR_WIDTH,
            label=location,
        )
        # Both bars share the colours, so one legend entry per location
        # (taken from the first bar) is enough.
        if bar_index == 0:
            legend_handles.append(segment)

ax.set_xticks([x_position for x_position, _, _ in BARS])
ax.set_xticklabels([tick_label for _, _, tick_label in BARS])
ax.set_ylabel('percentage of TEs in category')

# Reversed so the legend reads top-to-bottom like the stacked segments.
ax.legend(bbox_to_anchor=(1, 1), loc='upper left', handles=legend_handles[::-1])

fig.tight_layout()