{ "cells": [ { "cell_type": "code", "execution_count": 358, "id": "a4fca2a2-48b1-47ec-aee7-f49d6c52d680", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import glob\n", "import os\n", "import random\n", "import json" ] }, { "cell_type": "code", "execution_count": 314, "id": "093c3be4-7c5d-4395-a767-981af3976bac", "metadata": {}, "outputs": [], "source": [ "all_p_vals = {}" ] }, { "cell_type": "code", "execution_count": 315, "id": "05ae694d-0be8-4705-b005-8ab6b48d3cdb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>BINCODE</th>\n", " <th>NAME</th>\n", " <th>IDENTIFIER</th>\n", " <th>DESCRIPTION</th>\n", " <th>TYPE</th>\n", " <th>AGI</th>\n", " <th>SUB_BIN</th>\n", " <th>p</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>414</th>\n", " <td>'1.3.4.1'</td>\n", " <td>'Photosynthesis.photorespiration.glycine decar...</td>\n", " <td>'at4g33010.1'</td>\n", " <td>'mercator4v5.0: glycine dehydrogenase componen...</td>\n", " <td>T</td>\n", " <td>AT4G33010</td>\n", " <td>Photosynthesis.photorespiration</td>\n", " <td>0.001</td>\n", " </tr>\n", " <tr>\n", " <th>417</th>\n", " <td>'1.3.4.2'</td>\n", " <td>'Photosynthesis.photorespiration.glycine decar...</td>\n", " <td>'at1g11860.1'</td>\n", " <td>'mercator4v5.0: aminomethyltransferase compone...</td>\n", " <td>T</td>\n", " <td>AT1G11860</td>\n", " <td>Photosynthesis.photorespiration</td>\n", " <td>0.018</td>\n", " </tr>\n", " <tr>\n", " <th>1762</th>\n", " <td>'5.1.7.1.2'</td>\n", " <td>'Lipid metabolism.fatty acid metabolism.fatty ...</td>\n", " <td>'at2g46650.1'</td>\n", " <td>'mercator4v5.0: cytochrome electron shuttle he...</td>\n", " <td>T</td>\n", " <td>AT2G46650</td>\n", " <td>Lipid metabolism.fatty acid metabolism</td>\n", " <td>0.022</td>\n", " </tr>\n", " <tr>\n", " <th>1892</th>\n", " <td>'5.2.2.1'</td>\n", " <td>'Lipid metabolism.glycerolipid metabolism.diac...</td>\n", " <td>'at3g02600.1'</td>\n", " <td>'mercator4v5.0: phosphatidate phosphatase *(LP...</td>\n", " <td>T</td>\n", " <td>AT3G02600</td>\n", " <td>Lipid metabolism.glycerolipid metabolism</td>\n", " <td>0.023</td>\n", " </tr>\n", " <tr>\n", " <th>1945</th>\n", " <td>'5.2.5.5.1'</td>\n", " <td>'Lipid metabolism.glycerolipid metabolism.phos...</td>\n", " <td>'at1g17710.1'</td>\n", " <td>'mercator4v5.0: phosphocholine phosphatase & p...</td>\n", " <td>T</td>\n", " <td>AT1G17710</td>\n", " <td>Lipid metabolism.glycerolipid metabolism</td>\n", " <td>0.007</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>40190</th>\n", " <td>'50.3.4'</td>\n", " <td>'Enzyme classification.EC_3 hydrolases.EC_3.4 ...</td>\n", " <td>'at3g12203.1'</td>\n", " <td>'mercator4v5.0: EC_3.4 hydrolase acting on pep...</td>\n", " <td>T</td>\n", " <td>AT3G12203</td>\n", " <td>Enzyme classification.EC_3 hydrolases</td>\n", " <td>0.014</td>\n", " </tr>\n", " <tr>\n", " <th>40303</th>\n", " <td>'50.3.4'</td>\n", " <td>'Enzyme classification.EC_3 hydrolases.EC_3.4 ...</td>\n", " <td>'at5g42790.1'</td>\n", " <td>'mercator4v5.0: EC_3.4 hydrolase acting on pep...</td>\n", " <td>T</td>\n", " <td>AT5G42790</td>\n", " <td>Enzyme classification.EC_3 hydrolases</td>\n", " <td>0.02</td>\n", " </tr>\n", " <tr>\n", " <th>40584</th>\n", " <td>'50.4.2'</td>\n", " <td>'Enzyme classification.EC_4 lyases.EC_4.2 carb...</td>\n", " <td>'at3g53190.1'</td>\n", " <td>'mercator4v5.0: EC_4.2 carbon-oxygen lyase & p...</td>\n", " <td>T</td>\n", " <td>AT3G53190</td>\n", " <td>Enzyme classification.EC_4 lyases</td>\n", " <td>0.023</td>\n", " </tr>\n", " <tr>\n", " <th>40625</th>\n", " <td>'50.4.2'</td>\n", " <td>'Enzyme classification.EC_4 lyases.EC_4.2 carb...</td>\n", " <td>'at5g44630.1'</td>\n", " <td>'mercator4v5.0: EC_4.2 carbon-oxygen lyase & p...</td>\n", " <td>T</td>\n", " <td>AT5G44630</td>\n", " <td>Enzyme classification.EC_4 lyases</td>\n", " <td>0.015</td>\n", " </tr>\n", " <tr>\n", " <th>40633</th>\n", " <td>'50.4.2'</td>\n", " <td>'Enzyme classification.EC_4 lyases.EC_4.2 carb...</td>\n", " <td>'at5g63180.1'</td>\n", " <td>'mercator4v5.0: EC_4.2 carbon-oxygen lyase & p...</td>\n", " <td>T</td>\n", " <td>AT5G63180</td>\n", " <td>Enzyme classification.EC_4 lyases</td>\n", " <td>0.005</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>129 rows × 8 columns</p>\n", "</div>" ], "text/plain": [ " BINCODE NAME \\\n", "414 '1.3.4.1' 'Photosynthesis.photorespiration.glycine decar... \n", "417 '1.3.4.2' 'Photosynthesis.photorespiration.glycine decar... \n", "1762 '5.1.7.1.2' 'Lipid metabolism.fatty acid metabolism.fatty ... \n", "1892 '5.2.2.1' 'Lipid metabolism.glycerolipid metabolism.diac... \n", "1945 '5.2.5.5.1' 'Lipid metabolism.glycerolipid metabolism.phos... \n", "... ... ... \n", "40190 '50.3.4' 'Enzyme classification.EC_3 hydrolases.EC_3.4 ... \n", "40303 '50.3.4' 'Enzyme classification.EC_3 hydrolases.EC_3.4 ... \n", "40584 '50.4.2' 'Enzyme classification.EC_4 lyases.EC_4.2 carb... \n", "40625 '50.4.2' 'Enzyme classification.EC_4 lyases.EC_4.2 carb... \n", "40633 '50.4.2' 'Enzyme classification.EC_4 lyases.EC_4.2 carb... \n", "\n", " IDENTIFIER DESCRIPTION TYPE \\\n", "414 'at4g33010.1' 'mercator4v5.0: glycine dehydrogenase componen... T \n", "417 'at1g11860.1' 'mercator4v5.0: aminomethyltransferase compone... T \n", "1762 'at2g46650.1' 'mercator4v5.0: cytochrome electron shuttle he... T \n", "1892 'at3g02600.1' 'mercator4v5.0: phosphatidate phosphatase *(LP... T \n", "1945 'at1g17710.1' 'mercator4v5.0: phosphocholine phosphatase & p... T \n", "... ... ... ... \n", "40190 'at3g12203.1' 'mercator4v5.0: EC_3.4 hydrolase acting on pep... T \n", "40303 'at5g42790.1' 'mercator4v5.0: EC_3.4 hydrolase acting on pep... T \n", "40584 'at3g53190.1' 'mercator4v5.0: EC_4.2 carbon-oxygen lyase & p... T \n", "40625 'at5g44630.1' 'mercator4v5.0: EC_4.2 carbon-oxygen lyase & p... T \n", "40633 'at5g63180.1' 'mercator4v5.0: EC_4.2 carbon-oxygen lyase & p... T \n", "\n", " AGI SUB_BIN p \n", "414 AT4G33010 Photosynthesis.photorespiration 0.001 \n", "417 AT1G11860 Photosynthesis.photorespiration 0.018 \n", "1762 AT2G46650 Lipid metabolism.fatty acid metabolism 0.022 \n", "1892 AT3G02600 Lipid metabolism.glycerolipid metabolism 0.023 \n", "1945 AT1G17710 Lipid metabolism.glycerolipid metabolism 0.007 \n", "... ... ... ... \n", "40190 AT3G12203 Enzyme classification.EC_3 hydrolases 0.014 \n", "40303 AT5G42790 Enzyme classification.EC_3 hydrolases 0.02 \n", "40584 AT3G53190 Enzyme classification.EC_4 lyases 0.023 \n", "40625 AT5G44630 Enzyme classification.EC_4 lyases 0.015 \n", "40633 AT5G63180 Enzyme classification.EC_4 lyases 0.005 \n", "\n", "[129 rows x 8 columns]" ] }, "execution_count": 315, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset = 'Significant'\n", "p_vals = {}\n", "for file in glob.glob(f\"<arc/runs/p_vals_per_gene>/{subset}/*.txt\"):\n", " name = file.split('/')[-1].split('.')[0]\n", " \n", " p_vals[name] = open(file).read().split('Pvalue: ')[1].split('\"')[0]\n", " all_p_vals[name] = open(file).read().split('Pvalue: ')[1].split('\"')[0]\n", " \n", "df_sig = pd.read_csv('<arc/studies>/mercator/a_thaliana_tair10_protein.results.txt', sep = '\\t')\n", "df_sig = df_sig.dropna()\n", "df_sig = df_sig.reset_index()\n", "df_sig = df_sig.drop(columns='index')\n", "\n", "for i in range(len(df_sig)):\n", " df_sig.loc[i, 'AGI'] = df_sig.loc[i]['IDENTIFIER'].split('_')[0][1:-3].upper()\n", " df_sig.loc[i, 'SUB_BIN'] = '.'.join(df_sig.loc[i]['NAME'].split('.')[:2])[1:]\n", "df_sig['p'] = df_sig['AGI'].map(p_vals)\n", "df_sig = df_sig.dropna()\n", "df_sig = df_sig[df_sig.IDENTIFIER.str.endswith(\".1'\")] #remove secondary transcripts\n", "df_sig" ] }, { "cell_type": "code", "execution_count": 316, "id": "03bb5ec4-6b0c-4fb9-858e-6d4442ca107c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>BINCODE</th>\n", " <th>NAME</th>\n", " <th>IDENTIFIER</th>\n", " <th>DESCRIPTION</th>\n", " <th>TYPE</th>\n", " <th>AGI</th>\n", " <th>SUB_BIN</th>\n", " <th>p</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>6</th>\n", " <td>'1.1.1.1.1'</td>\n", " <td>'Photosynthesis.photophosphorylation.photosyst...</td>\n", " <td>'at2g34430.1'</td>\n", " <td>'mercator4v5.0: component *(LHCb1/2/3) of LHC-...</td>\n", " <td>T</td>\n", " <td>AT2G34430</td>\n", " <td>Photosynthesis.photophosphorylation</td>\n", " <td>0.377</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>'1.1.1.1.1'</td>\n", " <td>'Photosynthesis.photophosphorylation.photosyst...</td>\n", " <td>'at5g54270.1'</td>\n", " <td>'mercator4v5.0: component *(LHCb1/2/3) of LHC-...</td>\n", " <td>T</td>\n", " <td>AT5G54270</td>\n", " <td>Photosynthesis.photophosphorylation</td>\n", " <td>0.347</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>'1.1.1.1.3'</td>\n", " <td>'Photosynthesis.photophosphorylation.photosyst...</td>\n", " <td>'at4g10340.1'</td>\n", " <td>'mercator4v5.0: component *(LHCb5) of LHC-II c...</td>\n", " <td>T</td>\n", " <td>AT4G10340</td>\n", " <td>Photosynthesis.photophosphorylation</td>\n", " <td>0.969</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>'1.1.1.2.2.1'</td>\n", " <td>'Photosynthesis.photophosphorylation.photosyst...</td>\n", " <td>'at3g50820.1'</td>\n", " <td>'mercator4v5.0: component *(PsbO/OEC33) of PS-...</td>\n", " <td>T</td>\n", " <td>AT3G50820</td>\n", " <td>Photosynthesis.photophosphorylation</td>\n", " <td>0.361</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>'1.1.1.2.2.1'</td>\n", " <td>'Photosynthesis.photophosphorylation.photosyst...</td>\n", " <td>'at5g66570.1'</td>\n", " <td>'mercator4v5.0: component *(PsbO/OEC33) of PS-...</td>\n", " <td>T</td>\n", " <td>AT5G66570</td>\n", " <td>Photosynthesis.photophosphorylation</td>\n", " <td>0.371</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>40940</th>\n", " <td>'50.6.3'</td>\n", " <td>'Enzyme classification.EC_6 ligases.EC_6.3 lig...</td>\n", " <td>'at4g20320.1'</td>\n", " <td>'mercator4v5.0: EC_6.3 ligase forming carbon-n...</td>\n", " <td>T</td>\n", " <td>AT4G20320</td>\n", " <td>Enzyme classification.EC_6 ligases</td>\n", " <td>0.343</td>\n", " </tr>\n", " <tr>\n", " <th>40943</th>\n", " <td>'50.6.3'</td>\n", " <td>'Enzyme classification.EC_6 ligases.EC_6.3 lig...</td>\n", " <td>'at4g36940.1'</td>\n", " <td>'mercator4v5.0: EC_6.3 ligase forming carbon-n...</td>\n", " <td>T</td>\n", " <td>AT4G36940</td>\n", " <td>Enzyme classification.EC_6 ligases</td>\n", " <td>0.198</td>\n", " </tr>\n", " <tr>\n", " <th>40958</th>\n", " <td>'50.6.3'</td>\n", " <td>'Enzyme classification.EC_6 ligases.EC_6.3 lig...</td>\n", " <td>'at5g48840.1'</td>\n", " <td>'mercator4v5.0: EC_6.3 ligase forming carbon-n...</td>\n", " <td>T</td>\n", " <td>AT5G48840</td>\n", " <td>Enzyme classification.EC_6 ligases</td>\n", " <td>0.52</td>\n", " </tr>\n", " <tr>\n", " <th>40971</th>\n", " <td>'50.6.5'</td>\n", " <td>'Enzyme classification.EC_6 ligases.EC_6.5 lig...</td>\n", " <td>'at1g08130.1'</td>\n", " <td>'mercator4v5.0: EC_6.5 ligase forming phosphor...</td>\n", " <td>T</td>\n", " <td>AT1G08130</td>\n", " <td>Enzyme classification.EC_6 ligases</td>\n", " <td>0.662</td>\n", " </tr>\n", " <tr>\n", " <th>40977</th>\n", " <td>'50.6.6'</td>\n", " <td>'Enzyme classification.EC_6 ligases.EC_6.6 lig...</td>\n", " <td>'at5g45930.1'</td>\n", " <td>'mercator4v5.0: EC_6.6 ligase forming nitrogen...</td>\n", " <td>T</td>\n", " <td>AT5G45930</td>\n", " <td>Enzyme classification.EC_6 ligases</td>\n", " <td>0.288</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>7279 rows × 8 columns</p>\n", "</div>" ], "text/plain": [ " BINCODE NAME \\\n", "6 '1.1.1.1.1' 'Photosynthesis.photophosphorylation.photosyst... \n", "8 '1.1.1.1.1' 'Photosynthesis.photophosphorylation.photosyst... \n", "12 '1.1.1.1.3' 'Photosynthesis.photophosphorylation.photosyst... \n", "22 '1.1.1.2.2.1' 'Photosynthesis.photophosphorylation.photosyst... \n", "23 '1.1.1.2.2.1' 'Photosynthesis.photophosphorylation.photosyst... \n", "... ... ... \n", "40940 '50.6.3' 'Enzyme classification.EC_6 ligases.EC_6.3 lig... \n", "40943 '50.6.3' 'Enzyme classification.EC_6 ligases.EC_6.3 lig... \n", "40958 '50.6.3' 'Enzyme classification.EC_6 ligases.EC_6.3 lig... \n", "40971 '50.6.5' 'Enzyme classification.EC_6 ligases.EC_6.5 lig... \n", "40977 '50.6.6' 'Enzyme classification.EC_6 ligases.EC_6.6 lig... \n", "\n", " IDENTIFIER DESCRIPTION TYPE \\\n", "6 'at2g34430.1' 'mercator4v5.0: component *(LHCb1/2/3) of LHC-... T \n", "8 'at5g54270.1' 'mercator4v5.0: component *(LHCb1/2/3) of LHC-... T \n", "12 'at4g10340.1' 'mercator4v5.0: component *(LHCb5) of LHC-II c... T \n", "22 'at3g50820.1' 'mercator4v5.0: component *(PsbO/OEC33) of PS-... T \n", "23 'at5g66570.1' 'mercator4v5.0: component *(PsbO/OEC33) of PS-... T \n", "... ... ... ... \n", "40940 'at4g20320.1' 'mercator4v5.0: EC_6.3 ligase forming carbon-n... T \n", "40943 'at4g36940.1' 'mercator4v5.0: EC_6.3 ligase forming carbon-n... T \n", "40958 'at5g48840.1' 'mercator4v5.0: EC_6.3 ligase forming carbon-n... T \n", "40971 'at1g08130.1' 'mercator4v5.0: EC_6.5 ligase forming phosphor... T \n", "40977 'at5g45930.1' 'mercator4v5.0: EC_6.6 ligase forming nitrogen... T \n", "\n", " AGI SUB_BIN p \n", "6 AT2G34430 Photosynthesis.photophosphorylation 0.377 \n", "8 AT5G54270 Photosynthesis.photophosphorylation 0.347 \n", "12 AT4G10340 Photosynthesis.photophosphorylation 0.969 \n", "22 AT3G50820 Photosynthesis.photophosphorylation 0.361 \n", "23 AT5G66570 Photosynthesis.photophosphorylation 0.371 \n", "... ... ... ... \n", "40940 AT4G20320 Enzyme classification.EC_6 ligases 0.343 \n", "40943 AT4G36940 Enzyme classification.EC_6 ligases 0.198 \n", "40958 AT5G48840 Enzyme classification.EC_6 ligases 0.52 \n", "40971 AT1G08130 Enzyme classification.EC_6 ligases 0.662 \n", "40977 AT5G45930 Enzyme classification.EC_6 ligases 0.288 \n", "\n", "[7279 rows x 8 columns]" ] }, "execution_count": 316, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subset = 'Non_Significant'\n", "p_vals = {}\n", "for file in glob.glob(f\"<arc/runs/p_vals_per_gene>/{subset}/*.txt\"):\n", " name = file.split('/')[-1].split('.')[0]\n", " \n", " p_vals[name] = open(file).read().split('Pvalue: ')[1].split('\"')[0]\n", " all_p_vals[name] = open(file).read().split('Pvalue: ')[1].split('\"')[0]\n", " \n", "df_nonsig = pd.read_csv('<arc/studies>/mercator/a_thaliana_tair10_protein.results.txt', sep = '\\t')\n", "df_nonsig = df_nonsig.dropna()\n", "df_nonsig = df_nonsig.reset_index()\n", "df_nonsig = df_nonsig.drop(columns='index')\n", "\n", "for i in range(len(df_nonsig)):\n", " df_nonsig.loc[i, 'AGI'] = df_nonsig.loc[i]['IDENTIFIER'].split('_')[0][1:-3].upper()\n", " df_nonsig.loc[i, 'SUB_BIN'] = '.'.join(df_nonsig.loc[i]['NAME'].split('.')[:2])[1:]\n", "df_nonsig['p'] = df_nonsig['AGI'].map(p_vals)\n", "df_nonsig = df_nonsig[df_nonsig.IDENTIFIER.str.endswith(\".1'\")] #remove secondary transcripts\n", "df_nonsig = df_nonsig.dropna()\n", "df_nonsig" ] }, { "cell_type": "code", "execution_count": null, "id": "ebad821f-4f8a-4cd1-8250-7e194732419d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 296, "id": "b52304bc-e7d1-45d1-824c-cdf9206b7ec7", "metadata": {}, "outputs": [], "source": [ "all_bins = list(set(df_nonsig.SUB_BIN))\n", "i = all_bins[0]\n", "num_nonsig = len(df_nonsig[df_nonsig.SUB_BIN == i])\n", "num_sig = len(df_sig[df_sig.SUB_BIN == i])\n", "summ = num_sig + num_nonsig" ] }, { "cell_type": "code", "execution_count": 297, "id": "8b98a2c5-ad44-4b59-af41-e914ee64d590", "metadata": {}, "outputs": [], "source": [ "df_bins = pd.DataFrame()\n", "df_bins['bin'] = all_bins\n", "for i in range(len(all_bins)):\n", " num_nonsig = len(df_nonsig[df_nonsig.SUB_BIN == all_bins[i]])\n", " num_sig = len(df_sig[df_sig.SUB_BIN == all_bins[i]])\n", " df_bins.loc[i, 'sig'] = num_sig\n", " df_bins.loc[i, 'non_sig'] = num_nonsig" ] }, { "cell_type": "code", "execution_count": null, "id": "cb1d7ce2-8aad-4353-a03f-73153b2e1e13", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 355, "id": "c1153c11-a5c0-488e-a97f-c84d3f7e62da", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>bin</th>\n", " <th>sig</th>\n", " <th>non_sig</th>\n", " <th>p_values</th>\n", " <th>odds_ratio</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>109</th>\n", " <td>Photosynthesis.photorespiration</td>\n", " <td>2.0</td>\n", " <td>3.0</td>\n", " <td>0.002907</td>\n", " <td>38.194226</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>RNA biosynthesis.RNA polymerase III-dependent ...</td>\n", " <td>1.0</td>\n", " <td>2.0</td>\n", " <td>0.051343</td>\n", " <td>28.425781</td>\n", " </tr>\n", " <tr>\n", " <th>120</th>\n", " <td>Multi-process regulation.SnRK1-kinase regulato...</td>\n", " <td>2.0</td>\n", " <td>9.0</td>\n", " <td>0.014932</td>\n", " <td>12.720910</td>\n", " </tr>\n", " <tr>\n", " <th>98</th>\n", " <td>Phytohormone action.strigolactone</td>\n", " <td>1.0</td>\n", " <td>5.0</td>\n", " <td>0.100070</td>\n", " <td>11.365625</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>External stimuli response.gravity</td>\n", " <td>1.0</td>\n", " <td>6.0</td>\n", " <td>0.115753</td>\n", " <td>9.470052</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>77</th>\n", " <td>Cytoskeleton organisation.microtubular network</td>\n", " <td>0.0</td>\n", " <td>46.0</td>\n", " <td>1.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>78</th>\n", " <td>External stimuli response.drought</td>\n", " <td>0.0</td>\n", " <td>2.0</td>\n", " <td>1.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>80</th>\n", " <td>Cellular respiration.pyruvate oxidation</td>\n", " <td>0.0</td>\n", " <td>5.0</td>\n", " <td>1.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>82</th>\n", " <td>External stimuli response.damage</td>\n", " <td>0.0</td>\n", " <td>5.0</td>\n", " <td>1.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>212</th>\n", " <td>Redox homeostasis.ascorbate-based redox regula...</td>\n", " <td>0.0</td>\n", " <td>14.0</td>\n", " <td>1.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>213 rows × 5 columns</p>\n", "</div>" ], "text/plain": [ " bin sig non_sig \\\n", "109 Photosynthesis.photorespiration 2.0 3.0 \n", "26 RNA biosynthesis.RNA polymerase III-dependent ... 1.0 2.0 \n", "120 Multi-process regulation.SnRK1-kinase regulato... 2.0 9.0 \n", "98 Phytohormone action.strigolactone 1.0 5.0 \n", "23 External stimuli response.gravity 1.0 6.0 \n", ".. ... ... ... \n", "77 Cytoskeleton organisation.microtubular network 0.0 46.0 \n", "78 External stimuli response.drought 0.0 2.0 \n", "80 Cellular respiration.pyruvate oxidation 0.0 5.0 \n", "82 External stimuli response.damage 0.0 5.0 \n", "212 Redox homeostasis.ascorbate-based redox regula... 0.0 14.0 \n", "\n", " p_values odds_ratio \n", "109 0.002907 38.194226 \n", "26 0.051343 28.425781 \n", "120 0.014932 12.720910 \n", "98 0.100070 11.365625 \n", "23 0.115753 9.470052 \n", ".. ... ... \n", "77 1.000000 0.000000 \n", "78 1.000000 0.000000 \n", "80 1.000000 0.000000 \n", "82 1.000000 0.000000 \n", "212 1.000000 0.000000 \n", "\n", "[213 rows x 5 columns]" ] }, "execution_count": 355, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import scipy.stats as stats\n", "p_values = []\n", "odds = []\n", "for code in df_bins['bin']:\n", " oddsratio, pvalue = stats.fisher_exact([[sum(df_bins['sig'][df_bins['bin'] == code]),\n", " sum(df_bins['sig']) - sum(df_bins['sig'][df_bins['bin'] == code])],\n", " [sum(df_bins['non_sig'][df_bins['bin'] == code]), \n", " sum(df_bins['non_sig']) - sum(df_bins['non_sig'][df_bins['bin'] == code])]], alternative='two-sided') \n", " p_values.append(pvalue)\n", " odds.append(oddsratio)\n", "df_bins['p_values'] = p_values\n", "df_bins['odds_ratio'] = odds\n", "df_bins = df_bins.sort_values(by = 'odds_ratio', ascending=False)\n", "#df_bins[df_bins['p_values'] <= 0.05]\n", "#sum(df_bins.sig)\n", "df_bins" ] }, { "cell_type": "markdown", "id": "d26b1914-9579-4f5d-a764-e2f07ca5bc59", "metadata": {}, "source": [ "It seems that now (when using the AGI codes), 5/9 PR genes have a TE. Which are they?" ] }, { "cell_type": "code", "execution_count": 299, "id": "8c517f68-993a-44ae-bae8-a9a1b0b3aa9d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>BINCODE</th>\n", " <th>NAME</th>\n", " <th>IDENTIFIER</th>\n", " <th>DESCRIPTION</th>\n", " <th>TYPE</th>\n", " <th>AGI</th>\n", " <th>SUB_BIN</th>\n", " <th>p</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>414</th>\n", " <td>'1.3.4.1'</td>\n", " <td>'Photosynthesis.photorespiration.glycine decar...</td>\n", " <td>'at4g33010.1'</td>\n", " <td>'mercator4v5.0: glycine dehydrogenase componen...</td>\n", " <td>T</td>\n", " <td>AT4G33010</td>\n", " <td>Photosynthesis.photorespiration</td>\n", " <td>0.001</td>\n", " </tr>\n", " <tr>\n", " <th>417</th>\n", " <td>'1.3.4.2'</td>\n", " <td>'Photosynthesis.photorespiration.glycine decar...</td>\n", " <td>'at1g11860.1'</td>\n", " <td>'mercator4v5.0: aminomethyltransferase compone...</td>\n", " <td>T</td>\n", " <td>AT1G11860</td>\n", " <td>Photosynthesis.photorespiration</td>\n", " <td>0.018</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " BINCODE NAME \\\n", "414 '1.3.4.1' 'Photosynthesis.photorespiration.glycine decar... \n", "417 '1.3.4.2' 'Photosynthesis.photorespiration.glycine decar... \n", "\n", " IDENTIFIER DESCRIPTION TYPE \\\n", "414 'at4g33010.1' 'mercator4v5.0: glycine dehydrogenase componen... T \n", "417 'at1g11860.1' 'mercator4v5.0: aminomethyltransferase compone... T \n", "\n", " AGI SUB_BIN p \n", "414 AT4G33010 Photosynthesis.photorespiration 0.001 \n", "417 AT1G11860 Photosynthesis.photorespiration 0.018 " ] }, "execution_count": 299, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_sig[df_sig.BINCODE.str.startswith(\"'1.3\")]" ] }, { "cell_type": "code", "execution_count": 300, "id": "9eafe556-42e1-4bd2-93c8-b9b96bcbf781", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>BINCODE</th>\n", " <th>NAME</th>\n", " <th>IDENTIFIER</th>\n", " <th>DESCRIPTION</th>\n", " <th>TYPE</th>\n", " <th>AGI</th>\n", " <th>SUB_BIN</th>\n", " <th>p</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>6521</th>\n", " <td>'15.4.5.3'</td>\n", " <td>'RNA biosynthesis.RNA polymerase III-dependent...</td>\n", " <td>'at4g01590.1'</td>\n", " <td>'mercator4v5.0: component *(NRPC17/RPC31) of T...</td>\n", " <td>T</td>\n", " <td>AT4G01590</td>\n", " <td>RNA biosynthesis.RNA polymerase III-dependent ...</td>\n", " <td>0.017</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " BINCODE NAME \\\n", "6521 '15.4.5.3' 'RNA biosynthesis.RNA polymerase III-dependent... \n", "\n", " IDENTIFIER DESCRIPTION TYPE \\\n", "6521 'at4g01590.1' 'mercator4v5.0: component *(NRPC17/RPC31) of T... T \n", "\n", " AGI SUB_BIN p \n", "6521 AT4G01590 RNA biosynthesis.RNA polymerase III-dependent ... 0.017 " ] }, "execution_count": 300, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_sig[df_sig.NAME.str.startswith(\"'RNA biosynthesis.RNA po\")]" ] }, { "cell_type": "code", "execution_count": null, "id": "867117fa-1950-4007-841b-5085268769b2", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "ab72c01f-2683-4f24-ba29-1466d3a367dd", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "50da26a6-7477-48f0-af34-3b9ce2863154", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "25d7479a-a60b-4751-bbaa-03111be277ad", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }