{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8c83058e-1fdf-4d1a-9202-4816a300e7d2",
   "metadata": {},
   "source": [
    "### Read the number of associated TEs for all species and store it in a TSV file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "0b3bd0f8-fc7e-4cc5-98c7-ea3ccd8fecb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gffpandas.gffpandas as gffpd\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import glob\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "bc351336-f616-43a3-ac51-b07e85d02c7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get all associations: names of the non-hidden entries in the association directory.\n",
    "# Each name is later used as a sub-directory when building the GFF3 paths.\n",
    "# os.listdir() returns names in arbitrary order, so sort them to keep the\n",
    "# row order of the generated table reproducible across machines and runs.\n",
    "all_locations = sorted(\n",
    "    entry for entry in os.listdir(\"<arc/runs/association_gff3>/.\") if not entry.startswith('.')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "3db55a8e-5a77-4182-a7e6-cf96fc00cdd7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a lookup from the first tab-separated column of the species file to the second.\n",
    "# The context manager closes the file handle once the content has been read.\n",
    "with open('<arc/runs>/Fig6_species') as species_handle:\n",
    "    species_file = species_handle.read().split('\\n')\n",
    "species_dic = {}\n",
    "for line in species_file:\n",
    "    # Skip blank lines rather than blindly dropping the last element, so the final\n",
    "    # record is kept even when the file does not end with a newline.\n",
    "    if not line.strip():\n",
    "        continue\n",
    "    fields = line.split('\\t')\n",
    "    species_dic[fields[0]] = fields[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "06fdddce-1198-4048-91dc-304ea7f00147",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every species/location pair, read the matching GFF3 file and write one row\n",
    "# with the number of distinct values in its attributes column.\n",
    "# The output file is opened once; the with-block closes it, so no explicit close()\n",
    "# and no re-opening in append mode per row is needed.\n",
    "with open(\"<arc/runs>/Fig5_data.tsv\", \"w\") as file:\n",
    "    file.write(\"species\\ttype\\tlocation\\tnumber_of_associated_tes\\n\")\n",
    "    for species, species_type in species_dic.items():\n",
    "        for location in all_locations:\n",
    "            gff = gffpd.read_gff3(f'<arc/runs/association_gff3>/{location}/{species}_te_gene_associaton.gff3').df\n",
    "            file.write(f'{species}\\t{species_type}\\t{location}\\t{len(set(gff.attributes))}\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "00c864fd-3021-4ed6-8389-7242cbc330a0",
   "metadata": {},
   "source": [
    "### Now continue working with the data from the TSV file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "9d7af3ad-ff4f-4f18-bb5c-f0e8ee554c4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the tab-separated table of counts (read_table uses a tab separator by default).\n",
    "df = pd.read_table('<arc/runs>/Fig5_data.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "91956aef-0364-467e-b1eb-0e2543d69520",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "      <th>type</th>\n",
       "      <th>location</th>\n",
       "      <th>number_of_associated_tes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a_alpina</td>\n",
       "      <td>c3</td>\n",
       "      <td>start_end</td>\n",
       "      <td>1665</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>a_alpina</td>\n",
       "      <td>c3</td>\n",
       "      <td>upstream</td>\n",
       "      <td>1801</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>a_alpina</td>\n",
       "      <td>c3</td>\n",
       "      <td>downstream</td>\n",
       "      <td>639</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>a_alpina</td>\n",
       "      <td>c3</td>\n",
       "      <td>inside</td>\n",
       "      <td>234</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>a_thaliana</td>\n",
       "      <td>c3</td>\n",
       "      <td>start_end</td>\n",
       "      <td>654</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59</th>\n",
       "      <td>m_nitens</td>\n",
       "      <td>c34</td>\n",
       "      <td>inside</td>\n",
       "      <td>251</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60</th>\n",
       "      <td>m_suffruticosa</td>\n",
       "      <td>c34</td>\n",
       "      <td>start_end</td>\n",
       "      <td>842</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>m_suffruticosa</td>\n",
       "      <td>c34</td>\n",
       "      <td>upstream</td>\n",
       "      <td>1450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>62</th>\n",
       "      <td>m_suffruticosa</td>\n",
       "      <td>c34</td>\n",
       "      <td>downstream</td>\n",
       "      <td>653</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63</th>\n",
       "      <td>m_suffruticosa</td>\n",
       "      <td>c34</td>\n",
       "      <td>inside</td>\n",
       "      <td>257</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>64 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           species type    location  number_of_associated_tes\n",
       "0         a_alpina   c3   start_end                      1665\n",
       "1         a_alpina   c3    upstream                      1801\n",
       "2         a_alpina   c3  downstream                       639\n",
       "3         a_alpina   c3      inside                       234\n",
       "4       a_thaliana   c3   start_end                       654\n",
       "..             ...  ...         ...                       ...\n",
       "59        m_nitens  c34      inside                       251\n",
       "60  m_suffruticosa  c34   start_end                       842\n",
       "61  m_suffruticosa  c34    upstream                      1450\n",
       "62  m_suffruticosa  c34  downstream                       653\n",
       "63  m_suffruticosa  c34      inside                       257\n",
       "\n",
       "[64 rows x 4 columns]"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the loaded table as the cell output.\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "a7d2c829-e982-43de-bf0b-af2009fdaa2f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "55148"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sum of the count column over all rows. Series.sum() is vectorized;\n",
    "# int() keeps the displayed result a plain Python integer.\n",
    "int(df['number_of_associated_tes'].sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "d3ac1be6-a578-4f92-bb73-3fe75b060e42",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "28379"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sum of the count column restricted to rows whose type is 'c34'.\n",
    "# A single .loc selection avoids chained indexing; int() keeps a plain Python integer.\n",
    "int(df.loc[df['type'] == 'c34', 'number_of_associated_tes'].sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "id": "269edc66-14aa-48ae-b187-481ce6068769",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Stacked bar chart: for each type, the percentage of its associated TEs\n",
    "# that falls into each location category.\n",
    "colors = ['#2a788e', '#E0CB41', '#22a884', '#7ad151']\n",
    "# Fixed stacking order (bottom to top); colors are matched to locations by position.\n",
    "all_locations = ['start_end', 'inside', 'downstream', 'upstream']\n",
    "# One entry per bar: (value in the type column, x position, tick label).\n",
    "bars = [('c3', 1, 'C3'), ('c34', 2, 'C3-C4')]\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "handles = []\n",
    "for bar_index, (species_type, x_pos, _) in enumerate(bars):\n",
    "    type_rows = df[df['type'] == species_type]\n",
    "    # Denominator is constant per bar, so compute it once instead of per segment.\n",
    "    total = int(type_rows['number_of_associated_tes'].sum())\n",
    "    drawn = 0  # summed count of the segments already stacked below the current one\n",
    "    for color, location in zip(colors, all_locations):\n",
    "        sum_at_location = int(\n",
    "            type_rows.loc[type_rows['location'] == location, 'number_of_associated_tes'].sum()\n",
    "        )\n",
    "        h = ax.bar(x_pos, (sum_at_location / total) * 100,\n",
    "                   bottom=(drawn / total) * 100,\n",
    "                   color=color,\n",
    "                   width=0.75,\n",
    "                   label=location)\n",
    "        drawn += sum_at_location\n",
    "        # Both bars use the same colors, so the legend only needs the first bar's segments.\n",
    "        if bar_index == 0:\n",
    "            handles.append(h)\n",
    "\n",
    "ax.set_xticks([x_pos for _, x_pos, _ in bars])\n",
    "ax.set_xticklabels([tick_label for _, _, tick_label in bars])\n",
    "ax.set_ylabel('percentage of TEs in category')\n",
    "\n",
    "# Reverse the handles so the legend lists the segments from top to bottom, like the stack.\n",
    "ax.legend(bbox_to_anchor=(1, 1), loc='upper left', handles=handles[::-1])\n",
    "\n",
    "fig.tight_layout()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}