{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 358,
   "id": "a4fca2a2-48b1-47ec-aee7-f49d6c52d680",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import glob\n",
    "import os\n",
    "import random\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 314,
   "id": "093c3be4-7c5d-4395-a767-981af3976bac",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_p_vals = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 315,
   "id": "05ae694d-0be8-4705-b005-8ab6b48d3cdb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>BINCODE</th>\n",
       "      <th>NAME</th>\n",
       "      <th>IDENTIFIER</th>\n",
       "      <th>DESCRIPTION</th>\n",
       "      <th>TYPE</th>\n",
       "      <th>AGI</th>\n",
       "      <th>SUB_BIN</th>\n",
       "      <th>p</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>414</th>\n",
       "      <td>'1.3.4.1'</td>\n",
       "      <td>'Photosynthesis.photorespiration.glycine decar...</td>\n",
       "      <td>'at4g33010.1'</td>\n",
       "      <td>'mercator4v5.0: glycine dehydrogenase componen...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT4G33010</td>\n",
       "      <td>Photosynthesis.photorespiration</td>\n",
       "      <td>0.001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>417</th>\n",
       "      <td>'1.3.4.2'</td>\n",
       "      <td>'Photosynthesis.photorespiration.glycine decar...</td>\n",
       "      <td>'at1g11860.1'</td>\n",
       "      <td>'mercator4v5.0: aminomethyltransferase compone...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT1G11860</td>\n",
       "      <td>Photosynthesis.photorespiration</td>\n",
       "      <td>0.018</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1762</th>\n",
       "      <td>'5.1.7.1.2'</td>\n",
       "      <td>'Lipid metabolism.fatty acid metabolism.fatty ...</td>\n",
       "      <td>'at2g46650.1'</td>\n",
       "      <td>'mercator4v5.0: cytochrome electron shuttle he...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT2G46650</td>\n",
       "      <td>Lipid metabolism.fatty acid metabolism</td>\n",
       "      <td>0.022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1892</th>\n",
       "      <td>'5.2.2.1'</td>\n",
       "      <td>'Lipid metabolism.glycerolipid metabolism.diac...</td>\n",
       "      <td>'at3g02600.1'</td>\n",
       "      <td>'mercator4v5.0: phosphatidate phosphatase *(LP...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT3G02600</td>\n",
       "      <td>Lipid metabolism.glycerolipid metabolism</td>\n",
       "      <td>0.023</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1945</th>\n",
       "      <td>'5.2.5.5.1'</td>\n",
       "      <td>'Lipid metabolism.glycerolipid metabolism.phos...</td>\n",
       "      <td>'at1g17710.1'</td>\n",
       "      <td>'mercator4v5.0: phosphocholine phosphatase &amp; p...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT1G17710</td>\n",
       "      <td>Lipid metabolism.glycerolipid metabolism</td>\n",
       "      <td>0.007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40190</th>\n",
       "      <td>'50.3.4'</td>\n",
       "      <td>'Enzyme classification.EC_3 hydrolases.EC_3.4 ...</td>\n",
       "      <td>'at3g12203.1'</td>\n",
       "      <td>'mercator4v5.0: EC_3.4 hydrolase acting on pep...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT3G12203</td>\n",
       "      <td>Enzyme classification.EC_3 hydrolases</td>\n",
       "      <td>0.014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40303</th>\n",
       "      <td>'50.3.4'</td>\n",
       "      <td>'Enzyme classification.EC_3 hydrolases.EC_3.4 ...</td>\n",
       "      <td>'at5g42790.1'</td>\n",
       "      <td>'mercator4v5.0: EC_3.4 hydrolase acting on pep...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT5G42790</td>\n",
       "      <td>Enzyme classification.EC_3 hydrolases</td>\n",
       "      <td>0.02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40584</th>\n",
       "      <td>'50.4.2'</td>\n",
       "      <td>'Enzyme classification.EC_4 lyases.EC_4.2 carb...</td>\n",
       "      <td>'at3g53190.1'</td>\n",
       "      <td>'mercator4v5.0: EC_4.2 carbon-oxygen lyase &amp; p...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT3G53190</td>\n",
       "      <td>Enzyme classification.EC_4 lyases</td>\n",
       "      <td>0.023</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40625</th>\n",
       "      <td>'50.4.2'</td>\n",
       "      <td>'Enzyme classification.EC_4 lyases.EC_4.2 carb...</td>\n",
       "      <td>'at5g44630.1'</td>\n",
       "      <td>'mercator4v5.0: EC_4.2 carbon-oxygen lyase &amp; p...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT5G44630</td>\n",
       "      <td>Enzyme classification.EC_4 lyases</td>\n",
       "      <td>0.015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40633</th>\n",
       "      <td>'50.4.2'</td>\n",
       "      <td>'Enzyme classification.EC_4 lyases.EC_4.2 carb...</td>\n",
       "      <td>'at5g63180.1'</td>\n",
       "      <td>'mercator4v5.0: EC_4.2 carbon-oxygen lyase &amp; p...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT5G63180</td>\n",
       "      <td>Enzyme classification.EC_4 lyases</td>\n",
       "      <td>0.005</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>129 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           BINCODE                                               NAME  \\\n",
       "414      '1.3.4.1'  'Photosynthesis.photorespiration.glycine decar...   \n",
       "417      '1.3.4.2'  'Photosynthesis.photorespiration.glycine decar...   \n",
       "1762   '5.1.7.1.2'  'Lipid metabolism.fatty acid metabolism.fatty ...   \n",
       "1892     '5.2.2.1'  'Lipid metabolism.glycerolipid metabolism.diac...   \n",
       "1945   '5.2.5.5.1'  'Lipid metabolism.glycerolipid metabolism.phos...   \n",
       "...            ...                                                ...   \n",
       "40190     '50.3.4'  'Enzyme classification.EC_3 hydrolases.EC_3.4 ...   \n",
       "40303     '50.3.4'  'Enzyme classification.EC_3 hydrolases.EC_3.4 ...   \n",
       "40584     '50.4.2'  'Enzyme classification.EC_4 lyases.EC_4.2 carb...   \n",
       "40625     '50.4.2'  'Enzyme classification.EC_4 lyases.EC_4.2 carb...   \n",
       "40633     '50.4.2'  'Enzyme classification.EC_4 lyases.EC_4.2 carb...   \n",
       "\n",
       "          IDENTIFIER                                        DESCRIPTION TYPE  \\\n",
       "414    'at4g33010.1'  'mercator4v5.0: glycine dehydrogenase componen...    T   \n",
       "417    'at1g11860.1'  'mercator4v5.0: aminomethyltransferase compone...    T   \n",
       "1762   'at2g46650.1'  'mercator4v5.0: cytochrome electron shuttle he...    T   \n",
       "1892   'at3g02600.1'  'mercator4v5.0: phosphatidate phosphatase *(LP...    T   \n",
       "1945   'at1g17710.1'  'mercator4v5.0: phosphocholine phosphatase & p...    T   \n",
       "...              ...                                                ...  ...   \n",
       "40190  'at3g12203.1'  'mercator4v5.0: EC_3.4 hydrolase acting on pep...    T   \n",
       "40303  'at5g42790.1'  'mercator4v5.0: EC_3.4 hydrolase acting on pep...    T   \n",
       "40584  'at3g53190.1'  'mercator4v5.0: EC_4.2 carbon-oxygen lyase & p...    T   \n",
       "40625  'at5g44630.1'  'mercator4v5.0: EC_4.2 carbon-oxygen lyase & p...    T   \n",
       "40633  'at5g63180.1'  'mercator4v5.0: EC_4.2 carbon-oxygen lyase & p...    T   \n",
       "\n",
       "             AGI                                   SUB_BIN      p  \n",
       "414    AT4G33010           Photosynthesis.photorespiration  0.001  \n",
       "417    AT1G11860           Photosynthesis.photorespiration  0.018  \n",
       "1762   AT2G46650    Lipid metabolism.fatty acid metabolism  0.022  \n",
       "1892   AT3G02600  Lipid metabolism.glycerolipid metabolism  0.023  \n",
       "1945   AT1G17710  Lipid metabolism.glycerolipid metabolism  0.007  \n",
       "...          ...                                       ...    ...  \n",
       "40190  AT3G12203     Enzyme classification.EC_3 hydrolases  0.014  \n",
       "40303  AT5G42790     Enzyme classification.EC_3 hydrolases   0.02  \n",
       "40584  AT3G53190         Enzyme classification.EC_4 lyases  0.023  \n",
       "40625  AT5G44630         Enzyme classification.EC_4 lyases  0.015  \n",
       "40633  AT5G63180         Enzyme classification.EC_4 lyases  0.005  \n",
       "\n",
       "[129 rows x 8 columns]"
      ]
     },
     "execution_count": 315,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subset = 'Significant'\n",
    "p_vals = {}\n",
    "for file in glob.glob(f\"<arc/runs/p_vals_per_gene>/{subset}/*.txt\"):\n",
    "    name = file.split('/')[-1].split('.')[0]\n",
    "    \n",
    "    p_vals[name] = open(file).read().split('Pvalue: ')[1].split('\"')[0]\n",
    "    all_p_vals[name] = open(file).read().split('Pvalue: ')[1].split('\"')[0]\n",
    "    \n",
    "df_sig = pd.read_csv('<arc/studies>/mercator/a_thaliana_tair10_protein.results.txt', sep = '\\t')\n",
    "df_sig = df_sig.dropna()\n",
    "df_sig = df_sig.reset_index()\n",
    "df_sig = df_sig.drop(columns='index')\n",
    "\n",
    "for i in range(len(df_sig)):\n",
    "    df_sig.loc[i, 'AGI'] = df_sig.loc[i]['IDENTIFIER'].split('_')[0][1:-3].upper()\n",
    "    df_sig.loc[i, 'SUB_BIN'] = '.'.join(df_sig.loc[i]['NAME'].split('.')[:2])[1:]\n",
    "df_sig['p'] = df_sig['AGI'].map(p_vals)\n",
    "df_sig = df_sig.dropna()\n",
    "df_sig = df_sig[df_sig.IDENTIFIER.str.endswith(\".1'\")] #remove secondary transcripts\n",
    "df_sig"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 316,
   "id": "03bb5ec4-6b0c-4fb9-858e-6d4442ca107c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>BINCODE</th>\n",
       "      <th>NAME</th>\n",
       "      <th>IDENTIFIER</th>\n",
       "      <th>DESCRIPTION</th>\n",
       "      <th>TYPE</th>\n",
       "      <th>AGI</th>\n",
       "      <th>SUB_BIN</th>\n",
       "      <th>p</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>'1.1.1.1.1'</td>\n",
       "      <td>'Photosynthesis.photophosphorylation.photosyst...</td>\n",
       "      <td>'at2g34430.1'</td>\n",
       "      <td>'mercator4v5.0: component *(LHCb1/2/3) of LHC-...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT2G34430</td>\n",
       "      <td>Photosynthesis.photophosphorylation</td>\n",
       "      <td>0.377</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>'1.1.1.1.1'</td>\n",
       "      <td>'Photosynthesis.photophosphorylation.photosyst...</td>\n",
       "      <td>'at5g54270.1'</td>\n",
       "      <td>'mercator4v5.0: component *(LHCb1/2/3) of LHC-...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT5G54270</td>\n",
       "      <td>Photosynthesis.photophosphorylation</td>\n",
       "      <td>0.347</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>'1.1.1.1.3'</td>\n",
       "      <td>'Photosynthesis.photophosphorylation.photosyst...</td>\n",
       "      <td>'at4g10340.1'</td>\n",
       "      <td>'mercator4v5.0: component *(LHCb5) of LHC-II c...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT4G10340</td>\n",
       "      <td>Photosynthesis.photophosphorylation</td>\n",
       "      <td>0.969</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>'1.1.1.2.2.1'</td>\n",
       "      <td>'Photosynthesis.photophosphorylation.photosyst...</td>\n",
       "      <td>'at3g50820.1'</td>\n",
       "      <td>'mercator4v5.0: component *(PsbO/OEC33) of PS-...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT3G50820</td>\n",
       "      <td>Photosynthesis.photophosphorylation</td>\n",
       "      <td>0.361</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>'1.1.1.2.2.1'</td>\n",
       "      <td>'Photosynthesis.photophosphorylation.photosyst...</td>\n",
       "      <td>'at5g66570.1'</td>\n",
       "      <td>'mercator4v5.0: component *(PsbO/OEC33) of PS-...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT5G66570</td>\n",
       "      <td>Photosynthesis.photophosphorylation</td>\n",
       "      <td>0.371</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40940</th>\n",
       "      <td>'50.6.3'</td>\n",
       "      <td>'Enzyme classification.EC_6 ligases.EC_6.3 lig...</td>\n",
       "      <td>'at4g20320.1'</td>\n",
       "      <td>'mercator4v5.0: EC_6.3 ligase forming carbon-n...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT4G20320</td>\n",
       "      <td>Enzyme classification.EC_6 ligases</td>\n",
       "      <td>0.343</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40943</th>\n",
       "      <td>'50.6.3'</td>\n",
       "      <td>'Enzyme classification.EC_6 ligases.EC_6.3 lig...</td>\n",
       "      <td>'at4g36940.1'</td>\n",
       "      <td>'mercator4v5.0: EC_6.3 ligase forming carbon-n...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT4G36940</td>\n",
       "      <td>Enzyme classification.EC_6 ligases</td>\n",
       "      <td>0.198</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40958</th>\n",
       "      <td>'50.6.3'</td>\n",
       "      <td>'Enzyme classification.EC_6 ligases.EC_6.3 lig...</td>\n",
       "      <td>'at5g48840.1'</td>\n",
       "      <td>'mercator4v5.0: EC_6.3 ligase forming carbon-n...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT5G48840</td>\n",
       "      <td>Enzyme classification.EC_6 ligases</td>\n",
       "      <td>0.52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40971</th>\n",
       "      <td>'50.6.5'</td>\n",
       "      <td>'Enzyme classification.EC_6 ligases.EC_6.5 lig...</td>\n",
       "      <td>'at1g08130.1'</td>\n",
       "      <td>'mercator4v5.0: EC_6.5 ligase forming phosphor...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT1G08130</td>\n",
       "      <td>Enzyme classification.EC_6 ligases</td>\n",
       "      <td>0.662</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40977</th>\n",
       "      <td>'50.6.6'</td>\n",
       "      <td>'Enzyme classification.EC_6 ligases.EC_6.6 lig...</td>\n",
       "      <td>'at5g45930.1'</td>\n",
       "      <td>'mercator4v5.0: EC_6.6 ligase forming nitrogen...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT5G45930</td>\n",
       "      <td>Enzyme classification.EC_6 ligases</td>\n",
       "      <td>0.288</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7279 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             BINCODE                                               NAME  \\\n",
       "6        '1.1.1.1.1'  'Photosynthesis.photophosphorylation.photosyst...   \n",
       "8        '1.1.1.1.1'  'Photosynthesis.photophosphorylation.photosyst...   \n",
       "12       '1.1.1.1.3'  'Photosynthesis.photophosphorylation.photosyst...   \n",
       "22     '1.1.1.2.2.1'  'Photosynthesis.photophosphorylation.photosyst...   \n",
       "23     '1.1.1.2.2.1'  'Photosynthesis.photophosphorylation.photosyst...   \n",
       "...              ...                                                ...   \n",
       "40940       '50.6.3'  'Enzyme classification.EC_6 ligases.EC_6.3 lig...   \n",
       "40943       '50.6.3'  'Enzyme classification.EC_6 ligases.EC_6.3 lig...   \n",
       "40958       '50.6.3'  'Enzyme classification.EC_6 ligases.EC_6.3 lig...   \n",
       "40971       '50.6.5'  'Enzyme classification.EC_6 ligases.EC_6.5 lig...   \n",
       "40977       '50.6.6'  'Enzyme classification.EC_6 ligases.EC_6.6 lig...   \n",
       "\n",
       "          IDENTIFIER                                        DESCRIPTION TYPE  \\\n",
       "6      'at2g34430.1'  'mercator4v5.0: component *(LHCb1/2/3) of LHC-...    T   \n",
       "8      'at5g54270.1'  'mercator4v5.0: component *(LHCb1/2/3) of LHC-...    T   \n",
       "12     'at4g10340.1'  'mercator4v5.0: component *(LHCb5) of LHC-II c...    T   \n",
       "22     'at3g50820.1'  'mercator4v5.0: component *(PsbO/OEC33) of PS-...    T   \n",
       "23     'at5g66570.1'  'mercator4v5.0: component *(PsbO/OEC33) of PS-...    T   \n",
       "...              ...                                                ...  ...   \n",
       "40940  'at4g20320.1'  'mercator4v5.0: EC_6.3 ligase forming carbon-n...    T   \n",
       "40943  'at4g36940.1'  'mercator4v5.0: EC_6.3 ligase forming carbon-n...    T   \n",
       "40958  'at5g48840.1'  'mercator4v5.0: EC_6.3 ligase forming carbon-n...    T   \n",
       "40971  'at1g08130.1'  'mercator4v5.0: EC_6.5 ligase forming phosphor...    T   \n",
       "40977  'at5g45930.1'  'mercator4v5.0: EC_6.6 ligase forming nitrogen...    T   \n",
       "\n",
       "             AGI                              SUB_BIN      p  \n",
       "6      AT2G34430  Photosynthesis.photophosphorylation  0.377  \n",
       "8      AT5G54270  Photosynthesis.photophosphorylation  0.347  \n",
       "12     AT4G10340  Photosynthesis.photophosphorylation  0.969  \n",
       "22     AT3G50820  Photosynthesis.photophosphorylation  0.361  \n",
       "23     AT5G66570  Photosynthesis.photophosphorylation  0.371  \n",
       "...          ...                                  ...    ...  \n",
       "40940  AT4G20320   Enzyme classification.EC_6 ligases  0.343  \n",
       "40943  AT4G36940   Enzyme classification.EC_6 ligases  0.198  \n",
       "40958  AT5G48840   Enzyme classification.EC_6 ligases   0.52  \n",
       "40971  AT1G08130   Enzyme classification.EC_6 ligases  0.662  \n",
       "40977  AT5G45930   Enzyme classification.EC_6 ligases  0.288  \n",
       "\n",
       "[7279 rows x 8 columns]"
      ]
     },
     "execution_count": 316,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subset = 'Non_Significant'\n",
    "p_vals = {}\n",
    "for file in glob.glob(f\"<arc/runs/p_vals_per_gene>/{subset}/*.txt\"):\n",
    "    name = file.split('/')[-1].split('.')[0]\n",
    "    \n",
    "    p_vals[name] = open(file).read().split('Pvalue: ')[1].split('\"')[0]\n",
    "    all_p_vals[name] = open(file).read().split('Pvalue: ')[1].split('\"')[0]\n",
    "    \n",
    "df_nonsig = pd.read_csv('<arc/studies>/mercator/a_thaliana_tair10_protein.results.txt', sep = '\\t')\n",
    "df_nonsig = df_nonsig.dropna()\n",
    "df_nonsig = df_nonsig.reset_index()\n",
    "df_nonsig = df_nonsig.drop(columns='index')\n",
    "\n",
    "for i in range(len(df_nonsig)):\n",
    "    df_nonsig.loc[i, 'AGI'] = df_nonsig.loc[i]['IDENTIFIER'].split('_')[0][1:-3].upper()\n",
    "    df_nonsig.loc[i, 'SUB_BIN'] = '.'.join(df_nonsig.loc[i]['NAME'].split('.')[:2])[1:]\n",
    "df_nonsig['p'] = df_nonsig['AGI'].map(p_vals)\n",
    "df_nonsig = df_nonsig[df_nonsig.IDENTIFIER.str.endswith(\".1'\")] #remove secondary transcripts\n",
    "df_nonsig = df_nonsig.dropna()\n",
    "df_nonsig"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebad821f-4f8a-4cd1-8250-7e194732419d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 296,
   "id": "b52304bc-e7d1-45d1-824c-cdf9206b7ec7",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_bins = list(set(df_nonsig.SUB_BIN))\n",
    "i = all_bins[0]\n",
    "num_nonsig = len(df_nonsig[df_nonsig.SUB_BIN == i])\n",
    "num_sig = len(df_sig[df_sig.SUB_BIN == i])\n",
    "summ = num_sig + num_nonsig"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 297,
   "id": "8b98a2c5-ad44-4b59-af41-e914ee64d590",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_bins = pd.DataFrame()\n",
    "df_bins['bin'] = all_bins\n",
    "for i in range(len(all_bins)):\n",
    "    num_nonsig = len(df_nonsig[df_nonsig.SUB_BIN == all_bins[i]])\n",
    "    num_sig = len(df_sig[df_sig.SUB_BIN == all_bins[i]])\n",
    "    df_bins.loc[i, 'sig'] = num_sig\n",
    "    df_bins.loc[i, 'non_sig'] = num_nonsig"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb1d7ce2-8aad-4353-a03f-73153b2e1e13",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 355,
   "id": "c1153c11-a5c0-488e-a97f-c84d3f7e62da",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bin</th>\n",
       "      <th>sig</th>\n",
       "      <th>non_sig</th>\n",
       "      <th>p_values</th>\n",
       "      <th>odds_ratio</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>Photosynthesis.photorespiration</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.002907</td>\n",
       "      <td>38.194226</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>RNA biosynthesis.RNA polymerase III-dependent ...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.051343</td>\n",
       "      <td>28.425781</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>120</th>\n",
       "      <td>Multi-process regulation.SnRK1-kinase regulato...</td>\n",
       "      <td>2.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>0.014932</td>\n",
       "      <td>12.720910</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>Phytohormone action.strigolactone</td>\n",
       "      <td>1.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.100070</td>\n",
       "      <td>11.365625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>External stimuli response.gravity</td>\n",
       "      <td>1.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>0.115753</td>\n",
       "      <td>9.470052</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>77</th>\n",
       "      <td>Cytoskeleton organisation.microtubular network</td>\n",
       "      <td>0.0</td>\n",
       "      <td>46.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>78</th>\n",
       "      <td>External stimuli response.drought</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>80</th>\n",
       "      <td>Cellular respiration.pyruvate oxidation</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82</th>\n",
       "      <td>External stimuli response.damage</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>212</th>\n",
       "      <td>Redox homeostasis.ascorbate-based redox regula...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>14.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>213 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   bin  sig  non_sig  \\\n",
       "109                    Photosynthesis.photorespiration  2.0      3.0   \n",
       "26   RNA biosynthesis.RNA polymerase III-dependent ...  1.0      2.0   \n",
       "120  Multi-process regulation.SnRK1-kinase regulato...  2.0      9.0   \n",
       "98                   Phytohormone action.strigolactone  1.0      5.0   \n",
       "23                   External stimuli response.gravity  1.0      6.0   \n",
       "..                                                 ...  ...      ...   \n",
       "77      Cytoskeleton organisation.microtubular network  0.0     46.0   \n",
       "78                   External stimuli response.drought  0.0      2.0   \n",
       "80             Cellular respiration.pyruvate oxidation  0.0      5.0   \n",
       "82                    External stimuli response.damage  0.0      5.0   \n",
       "212  Redox homeostasis.ascorbate-based redox regula...  0.0     14.0   \n",
       "\n",
       "     p_values  odds_ratio  \n",
       "109  0.002907   38.194226  \n",
       "26   0.051343   28.425781  \n",
       "120  0.014932   12.720910  \n",
       "98   0.100070   11.365625  \n",
       "23   0.115753    9.470052  \n",
       "..        ...         ...  \n",
       "77   1.000000    0.000000  \n",
       "78   1.000000    0.000000  \n",
       "80   1.000000    0.000000  \n",
       "82   1.000000    0.000000  \n",
       "212  1.000000    0.000000  \n",
       "\n",
       "[213 rows x 5 columns]"
      ]
     },
     "execution_count": 355,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import scipy.stats as stats\n",
    "p_values = []\n",
    "odds = []\n",
    "for code in df_bins['bin']:\n",
    "    oddsratio, pvalue = stats.fisher_exact([[sum(df_bins['sig'][df_bins['bin'] == code]),\n",
    "                                             sum(df_bins['sig']) - sum(df_bins['sig'][df_bins['bin'] == code])],\n",
    "                                            [sum(df_bins['non_sig'][df_bins['bin'] == code]), \n",
    "                                             sum(df_bins['non_sig']) - sum(df_bins['non_sig'][df_bins['bin'] == code])]], alternative='two-sided')  \n",
    "    p_values.append(pvalue)\n",
    "    odds.append(oddsratio)\n",
    "df_bins['p_values'] = p_values\n",
    "df_bins['odds_ratio'] = odds\n",
    "df_bins = df_bins.sort_values(by = 'odds_ratio', ascending=False)\n",
    "#df_bins[df_bins['p_values'] <= 0.05]\n",
    "#sum(df_bins.sig)\n",
    "df_bins"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d26b1914-9579-4f5d-a764-e2f07ca5bc59",
   "metadata": {},
   "source": [
    "It seems that now (when using the AGI codes), 5/9 PR genes have a TE. Which are they?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 299,
   "id": "8c517f68-993a-44ae-bae8-a9a1b0b3aa9d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>BINCODE</th>\n",
       "      <th>NAME</th>\n",
       "      <th>IDENTIFIER</th>\n",
       "      <th>DESCRIPTION</th>\n",
       "      <th>TYPE</th>\n",
       "      <th>AGI</th>\n",
       "      <th>SUB_BIN</th>\n",
       "      <th>p</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>414</th>\n",
       "      <td>'1.3.4.1'</td>\n",
       "      <td>'Photosynthesis.photorespiration.glycine decar...</td>\n",
       "      <td>'at4g33010.1'</td>\n",
       "      <td>'mercator4v5.0: glycine dehydrogenase componen...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT4G33010</td>\n",
       "      <td>Photosynthesis.photorespiration</td>\n",
       "      <td>0.001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>417</th>\n",
       "      <td>'1.3.4.2'</td>\n",
       "      <td>'Photosynthesis.photorespiration.glycine decar...</td>\n",
       "      <td>'at1g11860.1'</td>\n",
       "      <td>'mercator4v5.0: aminomethyltransferase compone...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT1G11860</td>\n",
       "      <td>Photosynthesis.photorespiration</td>\n",
       "      <td>0.018</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       BINCODE                                               NAME  \\\n",
       "414  '1.3.4.1'  'Photosynthesis.photorespiration.glycine decar...   \n",
       "417  '1.3.4.2'  'Photosynthesis.photorespiration.glycine decar...   \n",
       "\n",
       "        IDENTIFIER                                        DESCRIPTION TYPE  \\\n",
       "414  'at4g33010.1'  'mercator4v5.0: glycine dehydrogenase componen...    T   \n",
       "417  'at1g11860.1'  'mercator4v5.0: aminomethyltransferase compone...    T   \n",
       "\n",
       "           AGI                          SUB_BIN      p  \n",
       "414  AT4G33010  Photosynthesis.photorespiration  0.001  \n",
       "417  AT1G11860  Photosynthesis.photorespiration  0.018  "
      ]
     },
     "execution_count": 299,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows of df_sig whose BINCODE string starts with '1.3 (stored values carry a literal leading single quote).\n",
    "df_sig.loc[df_sig[\"BINCODE\"].str.startswith(\"'1.3\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 300,
   "id": "9eafe556-42e1-4bd2-93c8-b9b96bcbf781",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>BINCODE</th>\n",
       "      <th>NAME</th>\n",
       "      <th>IDENTIFIER</th>\n",
       "      <th>DESCRIPTION</th>\n",
       "      <th>TYPE</th>\n",
       "      <th>AGI</th>\n",
       "      <th>SUB_BIN</th>\n",
       "      <th>p</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6521</th>\n",
       "      <td>'15.4.5.3'</td>\n",
       "      <td>'RNA biosynthesis.RNA polymerase III-dependent...</td>\n",
       "      <td>'at4g01590.1'</td>\n",
       "      <td>'mercator4v5.0: component *(NRPC17/RPC31) of T...</td>\n",
       "      <td>T</td>\n",
       "      <td>AT4G01590</td>\n",
       "      <td>RNA biosynthesis.RNA polymerase III-dependent ...</td>\n",
       "      <td>0.017</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         BINCODE                                               NAME  \\\n",
       "6521  '15.4.5.3'  'RNA biosynthesis.RNA polymerase III-dependent...   \n",
       "\n",
       "         IDENTIFIER                                        DESCRIPTION TYPE  \\\n",
       "6521  'at4g01590.1'  'mercator4v5.0: component *(NRPC17/RPC31) of T...    T   \n",
       "\n",
       "            AGI                                            SUB_BIN      p  \n",
       "6521  AT4G01590  RNA biosynthesis.RNA polymerase III-dependent ...  0.017  "
      ]
     },
     "execution_count": 300,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows of df_sig whose NAME string starts with 'RNA biosynthesis.RNA po (stored values carry a literal leading single quote).\n",
    "df_sig.loc[df_sig[\"NAME\"].str.startswith(\"'RNA biosynthesis.RNA po\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "867117fa-1950-4007-841b-5085268769b2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab72c01f-2683-4f24-ba29-1466d3a367dd",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50da26a6-7477-48f0-af34-3b9ce2863154",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25d7479a-a60b-4751-bbaa-03111be277ad",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}