From 0bd71582d32c3de39c93df470f7d095f6d52f619 Mon Sep 17 00:00:00 2001
From: Joseph Atemia <j.atemia@fz-juelich.de>
Date: Thu, 26 Sep 2024 16:57:11 +0200
Subject: [PATCH] add: 'raw' data preprocessing scripts

---
 .../scripts/extract_indiv_spp_pheno_data.sh   |   47 +
 ...ion_selection_3455_accesions_matched.ipynb | 2014 +++++++++++++++++
 .../hapmap_convertion_scripts/README.md       |    4 +
 .../hapmap2numeric.py                         |   54 +
 .../hapmap2numeric.sh                         |   13 +
 .../impute_missing_with_het_hapmap.py         |   51 +
 .../impute_missing_with_het_hapmap.sh         |   13 +
 .../tassel_convert_imputed_h5.sh              |   14 +
 .../tassel_convert_imputed_plink.sh           |   16 +
 .../remove_spp_tags_in_hapmap_files.sh        |   15 +
 .../scripts/workflow_order.md                 |   12 +
 11 files changed, 2253 insertions(+)
 create mode 100755 workflows/preprocessing_data/scripts/extract_indiv_spp_pheno_data.sh
 create mode 100644 workflows/preprocessing_data/scripts/geno_pheno_accession_selection_3455_accesions_matched.ipynb
 create mode 100644 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/README.md
 create mode 100755 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.py
 create mode 100644 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.sh
 create mode 100755 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/impute_missing_with_het_hapmap.py
 create mode 100644 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/impute_missing_with_het_hapmap.sh
 create mode 100644 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_h5.sh
 create mode 100644 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_plink.sh
 create mode 100755 workflows/preprocessing_data/scripts/remove_spp_tags_in_hapmap_files.sh
 create mode 100644 workflows/preprocessing_data/scripts/workflow_order.md

diff --git a/workflows/preprocessing_data/scripts/extract_indiv_spp_pheno_data.sh b/workflows/preprocessing_data/scripts/extract_indiv_spp_pheno_data.sh
new file mode 100755
index 0000000000..f880fbeb23
--- /dev/null
+++ b/workflows/preprocessing_data/scripts/extract_indiv_spp_pheno_data.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+
+# Extract and create individual species phenotype files.
+#
+# For every group below, <out_dir>/<name>.csv is (re)created with the header
+# line of the combined phenotype table followed by all rows containing the
+# group's pattern. Paths are relative to the current working directory.
+
+# No 'set -e': grep exits 1 when a group has no rows, which must not abort.
+set -u
+
+pheno_csv=../data/v2_phenotype_and_env_data.csv
+out_dir=../data/phenotype/data_with_headers
+
+# -p: create missing parents and tolerate an already existing directory.
+mkdir -p "$out_dir" || exit 1
+
+# extract_group NAME PATTERN
+#   Writes the first line of $pheno_csv to $out_dir/NAME.csv, then appends
+#   every line of $pheno_csv that contains the literal string PATTERN.
+#   An existing NAME.csv is overwritten.
+extract_group() {
+  local out_file="${out_dir}/$1.csv"
+  head -n 1 "$pheno_csv" > "$out_file"
+  grep -F -- "$2" "$pheno_csv" >> "$out_file"
+}
+
+extract_group mexicana 'mexicana'
+extract_group parviglumis 'parviglumis'
+extract_group huehuetenangensis 'huehuetenangensis'
+extract_group nicaraguensis 'nicaraguensis'
+extract_group luxurians 'luxurians'
+extract_group diploperennis 'diploperennis'
+# 'Zea perennis' rather than 'perennis', presumably to skip diploperennis rows.
+extract_group perennis 'Zea perennis'
+
+# split mexicana spp further according to estado
+extract_group mexicana_chalco 'Zea mays mexicana - Chalco'
+extract_group mexicana_durango 'Zea mays mexicana - Durango'
+extract_group mexicana_mesa_central 'Zea mays mexicana - Mesa Central'
+extract_group mexicana_nobogame 'Zea mays mexicana - Nobogame'
+
+# NOTE(review): the two files below reuse the Nobogame / 'Zea perennis'
+# patterns although their names suggest combined groups - confirm intent.
+extract_group mexicana_mesa_nobogame 'Zea mays mexicana - Nobogame'
+
+extract_group perennis_diplo_per 'Zea perennis'
\ No newline at end of file
diff --git a/workflows/preprocessing_data/scripts/geno_pheno_accession_selection_3455_accesions_matched.ipynb b/workflows/preprocessing_data/scripts/geno_pheno_accession_selection_3455_accesions_matched.ipynb
new file mode 100644
index 0000000000..b829cb018c
--- /dev/null
+++ b/workflows/preprocessing_data/scripts/geno_pheno_accession_selection_3455_accesions_matched.ipynb
@@ -0,0 +1,2014 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "427b5b28",
+   "metadata": {},
+   "source": [
+    "### Code used to filter out genotypes not in the phenotype accessions and vice versa for GWAS analysis using GAPIT "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9c50057a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e3db6b42-48a5-4387-b2df-4a2615997d9e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of genotype ids: 3604\n",
+      "Number of phenotype ids: 4153\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create variables with phenotype ids and genotype ids\n",
+    "\n",
+    "# Genotype ids: only the first (header) line of the hapmap file is read.\n",
+    "# Fields 0-10 are skipped; the standard HapMap layout has 11 leading metadata\n",
+    "# columns before the sample columns (assumed to hold here - TODO confirm).\n",
+    "with open(\"../data/T3606_33929_hapmap.hmp.renamed_accessions.txt\", \"r\") as hapmap_file:\n",
+    "    header_line = hapmap_file.readline()\n",
+    "# An empty file yields an empty id list instead of leaving geno_ids undefined.\n",
+    "geno_ids = header_line.split()[11:]\n",
+    "print(\"Number of genotype ids:\", len(geno_ids))\n",
+    "\n",
+    "# Phenotype ids: the \"taxa\" column of the phenotype table\n",
+    "phen = pd.read_csv(\"../data/v2_phenotype_and_env_data.csv\")\n",
+    "phen_ids = list(phen[\"taxa\"])\n",
+    "# remove any doubled single quotes ('') embedded in an id\n",
+    "phen_ids = [i.replace(\"''\",'') for i in phen_ids]\n",
+    "\n",
+    "print(\"Number of phenotype ids:\", len(phen_ids))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "7cff35cc-a921-4d9a-9f5c-1b1a98b63897",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "List of accessesions not in the phenotype data:\n",
+      "\n",
+      "NDUR_10_1758\n",
+      "BZAC_110_11\n",
+      "BZAC_110_14\n",
+      "BTEL_112_15\n",
+      "BTEL_113_15\n",
+      "BTEL_113_6\n",
+      "BTEL_113_8\n",
+      "BAPX_114_8\n",
+      "BOLI_116_12\n",
+      "BTEJ_129_1199\n",
+      "ZDNA_12_9\n",
+      "BPCH_141_585\n",
+      "TOLU_59_768\n",
+      "TOLU_60_1233\n",
+      "BPCH_143_22\n",
+      "TOLU_57_1390\n",
+      "BCAR_148_9\n",
+      "MDOB_14_13\n",
+      "BNOC_152_8\n",
+      "BNOC_152_9\n",
+      "BHUE_156_19\n",
+      "BHUE_156_8\n",
+      "BHUE_156_9\n",
+      "BTIQ_158_10\n",
+      "BJUA_163_2\n",
+      "BTZI_166_14\n",
+      "BTAR_173_388\n",
+      "BMAN_177_5\n",
+      "BVPU_179_602\n",
+      "ZLJA_191_10\n",
+      "ZLOX_193_10\n",
+      "ZLOX_193_15\n",
+      "ZLOX_193_16\n",
+      "ZLOX_193_17\n",
+      "ZLOX_193_19\n",
+      "ZLOX_193_21\n",
+      "ZLOX_193_23\n",
+      "ZLOX_193_24\n",
+      "ZLOX_193_4\n",
+      "ZLOX_193_5\n",
+      "ZLOX_193_7\n",
+      "PENJ_19_1101\n",
+      "PENJ_19_1409\n",
+      "ZPMI_200_1768\n",
+      "ZPMI_200_309\n",
+      "MCUI_23_10\n",
+      "MORO_25_10\n",
+      "HUEH_261_10\n",
+      "HUEH_261_11\n",
+      "HUEH_261_12\n",
+      "HUEH_261_13\n",
+      "HUEH_261_14\n",
+      "HUEH_261_1\n",
+      "HUEH_261_2\n",
+      "HUEH_261_3\n",
+      "HUEH_261_4\n",
+      "HUEH_261_5\n",
+      "HUEH_261_6\n",
+      "HUEH_261_7\n",
+      "HUEH_261_8\n",
+      "HUEH_261_9\n",
+      "TLAX_265_1\n",
+      "CHPU_268_8\n",
+      "BNOC_269_12\n",
+      "BNOC_269_1\n",
+      "BTAL_271_12\n",
+      "BTAL_271_160\n",
+      "BTAL_271_261\n",
+      "BTAL_271_362\n",
+      "BTAL_271_564\n",
+      "BTAL_271_665\n",
+      "BTAL_271_766\n",
+      "BTAL_271_867\n",
+      "BTEJ_276_1353\n",
+      "BTEJ_276_1454\n",
+      "BTEJ_276_1555\n",
+      "ZLAB_281_13\n",
+      "ZLAB_282_1098\n",
+      "ZLAB_282_1199\n",
+      "ZLAB_282_1200\n",
+      "ZLAB_282_1301\n",
+      "ZLJA_283_11\n",
+      "ZLJA_283_6\n",
+      "ZLJA_284_9\n",
+      "ZDJA_285_2\n",
+      "ZLOX_286_10\n",
+      "ZLOX_286_2\n",
+      "ZLOX_286_4\n",
+      "ZLOX_286_5\n",
+      "ZLOX_286_6\n",
+      "ZLOX_286_7\n",
+      "BTEJ_287_1021\n",
+      "BTEJ_287_1222\n",
+      "BTEJ_287_1424\n",
+      "BTEJ_287_314\n",
+      "BTEJ_287_415\n",
+      "BTEJ_287_617\n",
+      "BTEJ_287_920\n",
+      "AMEC_289_12\n",
+      "CHTX_291_1497\n",
+      "CHTX_291_190\n",
+      "CHTX_291_4\n",
+      "CHTX_291_693\n",
+      "CHAP_293_297\n",
+      "CHTX_298_15\n",
+      "MVIJ_29_1116\n",
+      "MVIJ_29_1242\n",
+      "MVIJ_29_1446\n",
+      "TARI_302_12\n",
+      "BHUE_308_1\n",
+      "MVIJ_30_15\n",
+      "BTAC_310_7\n",
+      "BEJU_311_204\n",
+      "BVPU_314_11\n",
+      "BVPU_315_4\n",
+      "BHUI_317_203\n",
+      "BOLI_318_5\n",
+      "SJER_334_8\n",
+      "INDA_33_9\n",
+      "INDA_34_9\n",
+      "TARI_37_21\n",
+      "TARI_37_22\n",
+      "MPUR_38_1432\n",
+      "MPUR_40_1259\n",
+      "MPUR_40_8\n",
+      "SJER_41_3\n",
+      "MZAM_42_568\n",
+      "CHGO_45_1368\n",
+      "CHGO_45_632\n",
+      "CHMI_48_4\n",
+      "CHMI_48_5\n",
+      "CHDF_52_6\n",
+      "CHAP_53_1166\n",
+      "CHAP_53_950\n",
+      "CHAP_55_6\n",
+      "CHAP_55_8\n",
+      "CHPU_80_383\n",
+      "BGUA_83_582\n",
+      "BGUA_85_160\n",
+      "BGUA_85_484\n",
+      "BGUA_85_592\n",
+      "BGUA_85_713\n",
+      "BGUA_87_638\n",
+      "BEJU_91_108\n",
+      "BEJU_91_216\n",
+      "BEJU_93_8\n",
+      "BEJU_95_3\n",
+      "BEJU_95_448\n",
+      "NDUR_9_1403\n",
+      "Total genotype accessions 3604\n",
+      "genotype accessions in phenotype dataset 3455\n"
+     ]
+    }
+   ],
+   "source": [
+    "# List genotyped accessions that have no phenotype record.\n",
+    "# Defines unmatched_genotype (ids missing from phen_ids) and c (ids found).\n",
+    "\n",
+    "print('List of accessesions not in the phenotype data:\\n')\n",
+    "\n",
+    "unmatched_genotype = []\n",
+    "c = 0\n",
+    "for acce in geno_ids:\n",
+    "    if acce in phen_ids:\n",
+    "        c += 1\n",
+    "        continue\n",
+    "    print(acce)\n",
+    "    unmatched_genotype.append(acce)\n",
+    "print('Total genotype accessions', len(geno_ids))\n",
+    "print('genotype accessions in phenotype dataset', c)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "7c554e82-01df-4d26-bc47-a20b97f8e472",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "List of accessesions not in the phenotype data:\n",
+      "\n",
+      "CHGO_45_4\n",
+      "CHGO_45_6\n",
+      "CHGO_45_11\n",
+      "CHGO_45_13\n",
+      "MCUI_46_7\n",
+      "MCUI_46_10\n",
+      "MCUI_46_11\n",
+      "CHMI_47_1\n",
+      "CHMI_47_5\n",
+      "CHMI_47_8\n",
+      "CHMI_47_15\n",
+      "CHMI_48_8\n",
+      "CHMI_48_9\n",
+      "CHMI_48_10\n",
+      "CHMI_48_13\n",
+      "CHGO_49_1\n",
+      "CHDF_50_8\n",
+      "CHDF_50_10\n",
+      "CHDF_50_13\n",
+      "CHDF_50_15\n",
+      "CHDF_51_12\n",
+      "CHDF_52_12\n",
+      "CHAP_53_9\n",
+      "CHAP_53_11\n",
+      "CHAP_53_12\n",
+      "CHAP_54_1\n",
+      "CHAP_54_10\n",
+      "CHAP_54_13\n",
+      "CHAP_54_14\n",
+      "CHAP_54_15\n",
+      "CHAP_55_2\n",
+      "CHAP_55_3\n",
+      "CHAP_55_4\n",
+      "CHAP_55_10\n",
+      "CHAP_56_8\n",
+      "TOLU_57_5\n",
+      "TOLU_59_7\n",
+      "TOLU_59_8\n",
+      "TOLU_59_9\n",
+      "TOLU_59_10\n",
+      "TOLU_59_14\n",
+      "TOLU_60_12\n",
+      "CHTX_62_1\n",
+      "CHTX_63_6\n",
+      "CHTX_63_15\n",
+      "CHTX_64_8\n",
+      "CHTX_64_11\n",
+      "CHTX_64_12\n",
+      "CHTX_64_13\n",
+      "AMEC_66_9\n",
+      "AMEC_67_9\n",
+      "AMEC_67_10\n",
+      "AMEC_67_13\n",
+      "AMEC_69_11\n",
+      "CHAL_73_1\n",
+      "CHAL_74_6\n",
+      "CHAL_74_19\n",
+      "CHPU_76_2\n",
+      "CHPU_76_9\n",
+      "CHPU_76_12\n",
+      "CHPU_77_2\n",
+      "CHPU_77_5\n",
+      "CHPU_77_8\n",
+      "CHPU_77_9\n",
+      "CHPU_79_9\n",
+      "CHPU_80_3\n",
+      "TLAX_81_1\n",
+      "TLAX_81_5\n",
+      "TLAX_81_8\n",
+      "TLAX_81_11\n",
+      "TLAX_81_12\n",
+      "TLAX_81_13\n",
+      "SNRA_82_15\n",
+      "CHPU_268_7\n",
+      "CHPU_268_12\n",
+      "CHTX_291_6\n",
+      "CHTX_291_10\n",
+      "CHTX_291_11\n",
+      "CHTX_291_12\n",
+      "CHTX_291_14\n",
+      "CHAL_292_7\n",
+      "CHAL_292_15\n",
+      "CHAP_293_2\n",
+      "CHAP_293_4\n",
+      "CHAP_293_6\n",
+      "CHAP_293_7\n",
+      "CHAP_293_10\n",
+      "CHAP_293_14\n",
+      "CHAP_294_6\n",
+      "CHAP_295_7\n",
+      "TOLU_296_1\n",
+      "TOLU_296_4\n",
+      "TOLU_296_5\n",
+      "TOLU_296_8\n",
+      "TOLU_296_9\n",
+      "TOLU_296_12\n",
+      "TOLU_296_13\n",
+      "CHAP_297_9\n",
+      "CHAP_297_10\n",
+      "CHTX_298_2\n",
+      "CHTX_298_3\n",
+      "CHTX_298_8\n",
+      "CHTX_298_9\n",
+      "CHTX_298_10\n",
+      "CHTX_298_14\n",
+      "DURA_6_3\n",
+      "NDUR_9_14\n",
+      "NDUR_10_17\n",
+      "NDUR_10_19\n",
+      "NDUR_10_27\n",
+      "MDOB_14_15\n",
+      "CHUR_18_6\n",
+      "CHUR_18_12\n",
+      "CHUR_18_14\n",
+      "CHUR_18_18\n",
+      "CHUR_18_22\n",
+      "PENJ_19_4\n",
+      "PENJ_19_5\n",
+      "PENJ_19_14\n",
+      "PENJ_20_5\n",
+      "PENJ_20_14\n",
+      "MCUI_22_11\n",
+      "MCUI_22_12\n",
+      "MCUI_23_6\n",
+      "MORO_24_1\n",
+      "MORO_25_5\n",
+      "MORO_26_1\n",
+      "MORO_26_3\n",
+      "MORO_26_7\n",
+      "MORO_26_8\n",
+      "YURI_27_1\n",
+      "YURI_27_3\n",
+      "YURI_27_4\n",
+      "YURI_27_5\n",
+      "YURI_27_6\n",
+      "YURI_27_14\n",
+      "YURI_28_11\n",
+      "MVIJ_29_2\n",
+      "MVIJ_29_5\n",
+      "MVIJ_29_7\n",
+      "MVIJ_29_8\n",
+      "MVIJ_29_10\n",
+      "MVIJ_29_11\n",
+      "MVIJ_29_12\n",
+      "MVIJ_29_13\n",
+      "MVIJ_29_14\n",
+      "MVIJ_30_1\n",
+      "MVIJ_30_6\n",
+      "MVIJ_30_7\n",
+      "MVIJ_30_12\n",
+      "MORE_31_1\n",
+      "MORE_31_15\n",
+      "INDA_32_12\n",
+      "INDA_32_15\n",
+      "INDA_33_11\n",
+      "INDA_34_1\n",
+      "INDA_34_10\n",
+      "INDA_34_11\n",
+      "INDA_34_14\n",
+      "MORE_35_2\n",
+      "MORE_35_7\n",
+      "MORE_35_13\n",
+      "MORE_35_15\n",
+      "TARI_36_1\n",
+      "TARI_36_9\n",
+      "TARI_36_10\n",
+      "TARI_36_11\n",
+      "TARI_37_2\n",
+      "TARI_37_12\n",
+      "TARI_37_13\n",
+      "TARI_37_14\n",
+      "TARI_37_15\n",
+      "MPUR_38_2\n",
+      "MPUR_38_3\n",
+      "MPUR_38_11\n",
+      "MPUR_38_13\n",
+      "MPUR_38_14\n",
+      "MPUR_39_2\n",
+      "MPUR_39_4\n",
+      "MPUR_39_5\n",
+      "MPUR_39_6\n",
+      "MPUR_39_8\n",
+      "MPUR_39_9\n",
+      "MPUR_39_12\n",
+      "MPUR_39_13\n",
+      "MPUR_40_2\n",
+      "MPUR_40_7\n",
+      "MPUR_40_10\n",
+      "MPUR_40_12\n",
+      "MPUR_40_13\n",
+      "MPUR_40_14\n",
+      "SJER_41_1\n",
+      "SJER_41_6\n",
+      "SJER_41_8\n",
+      "SJER_41_15\n",
+      "MZAM_42_1\n",
+      "MZAM_42_2\n",
+      "MZAM_42_3\n",
+      "MZAM_42_5\n",
+      "MORO_300_9\n",
+      "MORO_300_14\n",
+      "INDA_301_7\n",
+      "INDA_301_12\n",
+      "INDA_301_15\n",
+      "TARI_302_2\n",
+      "TARI_302_3\n",
+      "TARI_302_7\n",
+      "TARI_302_8\n",
+      "YURI_303_6\n",
+      "YURI_304_14\n",
+      "YURI_304_15\n",
+      "HUAN_329_2\n",
+      "HUAN_329_5\n",
+      "HUAN_330_4\n",
+      "HUAN_330_5\n",
+      "HUAN_330_9\n",
+      "HUAN_330_11\n",
+      "CHUR_331_1\n",
+      "CHUR_331_13\n",
+      "CHMI_332_2\n",
+      "CHMI_332_5\n",
+      "CHMI_332_6\n",
+      "CHMI_332_12\n",
+      "CHMI_332_13\n",
+      "MZAM_333_2\n",
+      "MZAM_333_3\n",
+      "MZAM_333_7\n",
+      "MZAM_333_9\n",
+      "MZAM_333_10\n",
+      "MZAM_333_13\n",
+      "SJER_334_2\n",
+      "SJER_334_5\n",
+      "SJER_334_6\n",
+      "SJER_334_12\n",
+      "SJER_334_14\n",
+      "SJER_334_15\n",
+      "SJER_336_3\n",
+      "SJER_336_4\n",
+      "NOBO_1_1\n",
+      "NOBO_1_10\n",
+      "NOBO_1_16\n",
+      "NOBO_1_20\n",
+      "NOBO_1_28\n",
+      "NOBO_3_8\n",
+      "NOBO_3_9\n",
+      "BGUA_83_5\n",
+      "BGUA_84_6\n",
+      "BGUA_84_8\n",
+      "BGUA_84_9\n",
+      "BGUA_84_12\n",
+      "BGUA_84_14\n",
+      "BGUA_85_1\n",
+      "BGUA_85_4\n",
+      "BGUA_85_5\n",
+      "BGUA_85_7\n",
+      "BGUA_86_5\n",
+      "BGUA_86_8\n",
+      "BGUA_86_12\n",
+      "BGUA_87_3\n",
+      "BGUA_87_6\n",
+      "BGUA_87_9\n",
+      "BGUA_87_17\n",
+      "BQUE_88_3\n",
+      "BGUA_90_13\n",
+      "BEJU_91_1\n",
+      "BEJU_91_2\n",
+      "BEJU_91_11\n",
+      "BEJU_91_12\n",
+      "BEJU_92_2\n",
+      "BEJU_93_14\n",
+      "BEJU_94_1\n",
+      "BEJU_95_4\n",
+      "BMOR_97_8\n",
+      "BMOR_97_11\n",
+      "BMOR_97_14\n",
+      "BMOR_97_15\n",
+      "BIXC_99_15\n",
+      "BIXC_100_6\n",
+      "BIXC_100_14\n",
+      "BZAC_102_3\n",
+      "BZAC_102_10\n",
+      "BZAC_106_1\n",
+      "BZAC_106_2\n",
+      "BZAC_106_4\n",
+      "BZAC_106_9\n",
+      "BZAC_108_14\n",
+      "BTEL_109_9\n",
+      "BZAC_110_9\n",
+      "BZAC_111_4\n",
+      "BZAC_111_11\n",
+      "BZAC_111_14\n",
+      "BTEL_113_7\n",
+      "BTEL_113_10\n",
+      "BTEL_113_11\n",
+      "BTEL_113_12\n",
+      "BAPX_114_13\n",
+      "BAPX_114_14\n",
+      "BOLI_116_10\n",
+      "BHUI_117_2\n",
+      "BHUI_117_4\n",
+      "BHUI_118_1\n",
+      "BHUI_118_7\n",
+      "BHUI_118_8\n",
+      "BMAZ_119_1\n",
+      "BMAZ_119_2\n",
+      "BMAZ_119_4\n",
+      "BMAZ_119_5\n",
+      "BMAZ_119_6\n",
+      "BMAZ_119_8\n",
+      "BMAZ_119_14\n",
+      "BMAZ_120_2\n",
+      "BMAZ_120_4\n",
+      "BMAZ_120_6\n",
+      "BMAZ_120_10\n",
+      "BMAZ_121_1\n",
+      "BMAZ_121_2\n",
+      "BMAZ_121_4\n",
+      "BMAZ_121_6\n",
+      "BMAZ_121_9\n",
+      "BMAZ_121_11\n",
+      "BMAZ_121_13\n",
+      "BMAZ_121_15\n",
+      "BCOL_122_1\n",
+      "BCOL_122_6\n",
+      "BCOL_122_8\n",
+      "BCOL_122_9\n",
+      "BCOL_124_10\n",
+      "BCOL_124_11\n",
+      "BCOL_124_13\n",
+      "BCOL_124_19\n",
+      "BCOL_124_20\n",
+      "BCOL_124_23\n",
+      "BCOL_124_25\n",
+      "BCOL_125_5\n",
+      "BMAZ_126_8\n",
+      "BMAZ_127_5\n",
+      "BMAZ_127_8\n",
+      "BMAZ_127_11\n",
+      "BMAZ_127_12\n",
+      "BMAZ_127_13\n",
+      "BMAZ_127_14\n",
+      "BMAZ_127_15\n",
+      "BSAU_128_4\n",
+      "BSAU_128_5\n",
+      "BSAU_128_9\n",
+      "BTEJ_129_11\n",
+      "BTEJ_130_2\n",
+      "BOTZ_133_12\n",
+      "BZUL_135_2\n",
+      "BZUL_135_6\n",
+      "BZUL_135_9\n",
+      "BZUL_135_10\n",
+      "BVBR_137_3\n",
+      "BVBR_138_1\n",
+      "BVBR_138_2\n",
+      "BVBR_138_3\n",
+      "BVBR_138_4\n",
+      "BVBR_138_5\n",
+      "BVBR_138_7\n",
+      "BVBR_138_8\n",
+      "BVBR_138_9\n",
+      "BVBR_138_10\n",
+      "BTAL_139_6\n",
+      "BTAL_139_8\n",
+      "BTAL_139_9\n",
+      "BTAL_139_15\n",
+      "BTAL_140_1\n",
+      "BTAL_140_3\n",
+      "BTAL_140_7\n",
+      "BTAL_140_12\n",
+      "BTAL_140_13\n",
+      "BTAL_140_15\n",
+      "BPCH_141_5\n",
+      "BPCH_142_4\n",
+      "BPCH_142_6\n",
+      "BPCH_142_7\n",
+      "BPCH_142_8\n",
+      "BPCH_142_10\n",
+      "BPCH_142_14\n",
+      "BPCH_143_3\n",
+      "BPCH_143_4\n",
+      "BPCH_143_5\n",
+      "BPCH_143_6\n",
+      "BPCH_143_7\n",
+      "BPCH_143_8\n",
+      "BPCH_143_9\n",
+      "BPCH_143_10\n",
+      "BPCH_143_11\n",
+      "BPCH_143_12\n",
+      "BPCH_143_13\n",
+      "BPCH_144_9\n",
+      "BPCH_145_13\n",
+      "BCAR_146_2\n",
+      "BCAR_146_3\n",
+      "BCAR_146_4\n",
+      "BCAR_146_5\n",
+      "BCAR_146_7\n",
+      "BCAR_146_8\n",
+      "BCAR_147_3\n",
+      "BCAR_147_6\n",
+      "BCAR_147_7\n",
+      "BCAR_147_9\n",
+      "BCAR_147_10\n",
+      "BCAR_147_11\n",
+      "BCAR_147_12\n",
+      "BCAR_147_13\n",
+      "BCAR_147_14\n",
+      "BCAR_147_15\n",
+      "BNOC_149_8\n",
+      "BNOC_149_9\n",
+      "BNOC_149_10\n",
+      "BNOC_149_12\n",
+      "BNOC_149_13\n",
+      "BNOC_149_15\n",
+      "BRED_150_4\n",
+      "BRED_150_5\n",
+      "BRED_150_6\n",
+      "BRED_150_7\n",
+      "BNOC_151_1\n",
+      "BNOC_151_3\n",
+      "BHUE_153_4\n",
+      "BHUE_153_5\n",
+      "BHUE_153_7\n",
+      "BHUE_153_8\n",
+      "BHUE_153_9\n",
+      "BTIQ_154_2\n",
+      "BTIQ_154_10\n",
+      "BTIQ_154_11\n",
+      "BHUE_156_5\n",
+      "BHUE_156_11\n",
+      "BHUE_156_13\n",
+      "BHUE_156_14\n",
+      "BHUE_156_22\n",
+      "BTIQ_158_7\n",
+      "BTUZ_159_12\n",
+      "BTUZ_160_4\n",
+      "BTUZ_160_9\n",
+      "BTUZ_160_10\n",
+      "BJUA_162_1\n",
+      "BJUA_162_2\n",
+      "BJUA_162_6\n",
+      "BJUA_162_9\n",
+      "BJUA_162_12\n",
+      "BJUA_162_13\n",
+      "BJUA_162_14\n",
+      "BJUA_163_4\n",
+      "BTZI_165_5\n",
+      "BTZI_165_13\n",
+      "BTAC_168_3\n",
+      "BTAC_168_10\n",
+      "BTAC_168_13\n",
+      "BTAC_169_12\n",
+      "BTAR_170_5\n",
+      "BTAR_173_1\n",
+      "BTAR_173_3\n",
+      "BOAX_175_29\n",
+      "BOAX_175_30\n",
+      "BMAN_176_10\n",
+      "BMAN_178_2\n",
+      "BMAN_178_6\n",
+      "BVPU_179_2\n",
+      "BVPU_179_5\n",
+      "BVPU_179_6\n",
+      "BVPU_179_10\n",
+      "BMAN_180_1\n",
+      "BMAN_180_2\n",
+      "BMAN_180_10\n",
+      "BMAN_180_13\n",
+      "BMAN_180_15\n",
+      "BMAN_180_18\n",
+      "BMAN_180_20\n",
+      "BVPU_181_9\n",
+      "BVPU_181_11\n",
+      "BVPU_181_15\n",
+      "BVPU_182_1\n",
+      "BVPU_182_2\n",
+      "BVPU_182_5\n",
+      "BVPU_182_6\n",
+      "BVPU_182_8\n",
+      "BVPU_182_9\n",
+      "BVPU_182_11\n",
+      "BVPU_182_12\n",
+      "BVPU_182_13\n",
+      "BVPU_182_14\n",
+      "BVPU_183_2\n",
+      "BVPU_183_9\n",
+      "BVPU_183_12\n",
+      "BVPU_184_15\n",
+      "BTCT_185_8\n",
+      "BTCT_185_9\n",
+      "BTCT_185_14\n",
+      "BTCT_186_3\n",
+      "BTCT_186_6\n",
+      "BTCT_186_8\n",
+      "BCAR_270_3\n",
+      "BCAR_270_11\n",
+      "BTAL_271_1\n",
+      "BTAL_271_2\n",
+      "BTAL_271_3\n",
+      "BTAL_271_5\n",
+      "BTAL_271_7\n",
+      "BTAL_271_8\n",
+      "BZUL_272_1\n",
+      "BZUL_272_4\n",
+      "BZUL_272_15\n",
+      "BTAL_273_6\n",
+      "BOTZ_274_1\n",
+      "BOTZ_274_15\n",
+      "BTEJ_275_5\n",
+      "BTEJ_275_6\n",
+      "BTEJ_275_9\n",
+      "BTEJ_275_10\n",
+      "BTEJ_275_11\n",
+      "BTEJ_275_13\n",
+      "BTEJ_275_14\n",
+      "BTEJ_276_13\n",
+      "BTEJ_276_14\n",
+      "BTEJ_276_15\n",
+      "BTZI_277_4\n",
+      "BTZI_277_5\n",
+      "BTZI_277_13\n",
+      "BTZI_277_14\n",
+      "BTZI_278_1\n",
+      "BTZI_279_1\n",
+      "BTZI_279_2\n",
+      "BTZI_279_6\n",
+      "BTZI_279_9\n",
+      "BTZI_279_13\n",
+      "BTZI_279_14\n",
+      "BNOC_280_1\n",
+      "BNOC_280_3\n",
+      "BNOC_280_7\n",
+      "BNOC_280_10\n",
+      "BTEJ_287_3\n",
+      "BTEJ_287_4\n",
+      "BTEJ_287_6\n",
+      "BTEJ_287_9\n",
+      "BTEJ_287_10\n",
+      "BTEJ_287_11\n",
+      "BTEJ_287_12\n",
+      "BTEJ_287_14\n",
+      "BTAR_306_3\n",
+      "BTAR_306_5\n",
+      "BTAR_306_9\n",
+      "BTAR_306_10\n",
+      "BTAR_306_12\n",
+      "BTUZ_307_2\n",
+      "BTUZ_307_3\n",
+      "BTUZ_307_7\n",
+      "BTAC_309_3\n",
+      "BTAC_309_7\n",
+      "BTAC_309_9\n",
+      "BTAC_309_11\n",
+      "BTAC_310_1\n",
+      "BEJU_311_2\n",
+      "BEJU_311_10\n",
+      "BEJU_312_10\n",
+      "BEJU_312_13\n",
+      "BVPU_313_6\n",
+      "BVPU_313_7\n",
+      "BVPU_313_10\n",
+      "BVPU_313_15\n",
+      "BVPU_314_1\n",
+      "BVPU_314_4\n",
+      "BVPU_315_11\n",
+      "BVPU_316_5\n",
+      "BVPU_316_11\n",
+      "BVPU_316_13\n",
+      "BHUI_317_2\n",
+      "BHUI_317_5\n",
+      "BHUI_317_11\n",
+      "BOLI_318_1\n",
+      "BCOL_319_2\n",
+      "BCOL_319_5\n",
+      "BCOL_319_8\n",
+      "BCOL_319_9\n",
+      "BCOL_319_12\n",
+      "BPCH_320_13\n",
+      "BPCH_320_15\n",
+      "BHUE_321_5\n",
+      "BHUE_321_6\n",
+      "BHUE_321_9\n",
+      "BNOC_322_2\n",
+      "BNOC_322_3\n",
+      "BNOC_322_5\n",
+      "BNOC_322_8\n",
+      "BNOC_322_9\n",
+      "BNOC_322_11\n",
+      "BTAR_324_1\n",
+      "BTAR_324_2\n",
+      "BTAR_324_3\n",
+      "BTAR_324_12\n",
+      "BHUE_325_6\n",
+      "BHUE_325_7\n",
+      "BHUE_325_9\n",
+      "BHUE_325_13\n",
+      "BHUE_325_14\n",
+      "BHUE_325_15\n",
+      "BCAR_326_1\n",
+      "BOTZ_327_10\n",
+      "BOTZ_327_13\n",
+      "BOTZ_327_14\n",
+      "BTEJ_328_6\n",
+      "BTEJ_328_9\n",
+      "BTEJ_328_11\n",
+      "BTEJ_328_12\n",
+      "BTEJ_328_13\n",
+      "BGUA_335_2\n",
+      "ZLOX_193_25\n",
+      "ZLOX_193_26\n",
+      "ZLOX_286_1\n",
+      "ZLAB_189_4\n",
+      "ZLAB_189_5\n",
+      "ZLAB_189_6\n",
+      "ZLAB_189_14\n",
+      "ZLJU_190_4\n",
+      "ZLJU_190_8\n",
+      "ZLAB_281_4\n",
+      "ZLAB_281_7\n",
+      "ZLAB_281_8\n",
+      "ZLAB_281_9\n",
+      "ZLAB_282_1\n",
+      "ZLAB_282_3\n",
+      "ZLAB_282_4\n",
+      "ZLAB_282_5\n",
+      "ZLAB_282_7\n",
+      "ZLAB_282_8\n",
+      "ZLAB_282_9\n",
+      "ZLAB_282_10\n",
+      "ZLAB_282_11\n",
+      "ZLAB_282_12\n",
+      "ZLAB_282_13\n",
+      "ZLAB_282_14\n",
+      "ZLAB_282_15\n",
+      "ZLJA_283_5\n",
+      "ZLJA_283_13\n",
+      "ZPJA_198_1\n",
+      "ZPJA_198_6\n",
+      "ZPJA_198_15\n",
+      "ZPJA_199_3\n",
+      "ZPJA_199_26\n",
+      "ZPMI_200_3\n",
+      "ZPMI_200_8\n",
+      "ZPMI_200_17\n",
+      "ZPMI_200_22\n",
+      "ZPMI_288_7\n",
+      "ZDNA_11_10\n",
+      "ZDNA_11_13\n",
+      "ZDNA_11_14\n",
+      "ZDNA_12_1\n",
+      "ZDNA_12_13\n",
+      "ZDNA_13_13\n",
+      "ZDNA_13_24\n",
+      "ZDNA_13_30\n",
+      "ZDJA_195_14\n",
+      "ZDJA_196_1\n",
+      "ZDJA_196_2\n",
+      "ZDJA_196_3\n",
+      "ZDJA_196_5\n",
+      "ZDJA_196_13\n",
+      "ZDJA_196_14\n",
+      "ZDJA_197_2\n",
+      "ZDJA_197_3\n",
+      "ZDJA_197_4\n",
+      "ZDJA_197_6\n",
+      "ZDJA_197_7\n",
+      "ZDJA_197_13\n",
+      "ZDJA_197_14\n",
+      "ZDJA_197_17\n",
+      "ZDJA_197_18\n",
+      "ZDJA_197_19\n",
+      "ZDJA_197_20\n",
+      "ZDJA_197_22\n",
+      "ZDJA_197_23\n",
+      "ZDJA_197_24\n",
+      "ZDJA_197_25\n",
+      "ZDJA_197_27\n",
+      "ZDNA_262_13\n",
+      "ZDNA_263_4\n",
+      "ZDNA_264_7\n",
+      "ZDNA_264_15\n",
+      "ZDJA_285_4\n",
+      "HUEH_187_7\n",
+      "HUEH_187_11\n",
+      "HUEH_187_12\n",
+      "HUEH_187_14\n",
+      "HUEH_187_15\n",
+      "HUEH_188_10\n",
+      "NICA_192_5\n",
+      "NICA_192_8\n",
+      "NICA_192_9\n",
+      "NICA_192_10\n",
+      "NICA_192_11\n",
+      "NICA_192_12\n",
+      "NICA_192_13\n",
+      "NICA_192_15\n",
+      "NICA_194_1\n",
+      "NICA_194_2\n",
+      "Total phenotype accessions:  4153\n",
+      "genotype accessions in phenotype dataset:  3455\n",
+      "Total phenotypes not in genotype file:  698\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Display phenotyped accessions lacking genotype data.\n",
+    "c = 0  # phenotype ids that also occur in geno_ids\n",
+    "unmatched_count = 0\n",
+    "unmatched_pheno = []\n",
+    "\n",
+    "print('List of accessions not in the genotype data:\\n')\n",
+    "\n",
+    "for acce in phen_ids:\n",
+    "    if acce not in geno_ids:\n",
+    "        print(acce)\n",
+    "        unmatched_pheno.append(acce)\n",
+    "        unmatched_count+=1\n",
+    "    else: \n",
+    "        c+=1\n",
+    "print('Total phenotype accessions: ', len(phen_ids))\n",
+    "print('phenotype accessions in genotype dataset: ', c)\n",
+    "print('Total phenotypes not in genotype file: ', unmatched_count)        "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "c167f996-f8e7-4cff-b764-9b1cc926758d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of genotyped accessions without phenotype data:   149\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('Number of genotyped accessions without phenotype data:  ', len(unmatched_genotype))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "72b5a0d4-2f68-4dcd-be73-08bf2d16e799",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of phenotyped accessions without genotype data:   698\n"
+     ]
+    }
+   ],
+   "source": [
+    "print ('Number of phenotyped accessions without genotype data:  ' ,len(unmatched_pheno))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a2c74614",
+   "metadata": {},
+   "source": [
+    "- Filter all taxa files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "8105da62-7e4b-4c60-a9db-432b8dbf84d0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>classification</th>\n",
+       "      <th>individual_idlocal</th>\n",
+       "      <th>accession_idlocal</th>\n",
+       "      <th>state</th>\n",
+       "      <th>taxa</th>\n",
+       "      <th>altitude</th>\n",
+       "      <th>latitude</th>\n",
+       "      <th>longitude</th>\n",
+       "      <th>plant_height</th>\n",
+       "      <th>leaf_width</th>\n",
+       "      <th>...</th>\n",
+       "      <th>bio_10</th>\n",
+       "      <th>bio_11</th>\n",
+       "      <th>bio_12</th>\n",
+       "      <th>bio_13</th>\n",
+       "      <th>bio_14</th>\n",
+       "      <th>bio_15</th>\n",
+       "      <th>bio_16</th>\n",
+       "      <th>bio_17</th>\n",
+       "      <th>bio_18</th>\n",
+       "      <th>bio_19</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Zea mays mexicana - Chalco</td>\n",
+       "      <td>45_1</td>\n",
+       "      <td>CIM27471</td>\n",
+       "      <td>Michoacán</td>\n",
+       "      <td>CHGO_45_1</td>\n",
+       "      <td>2086</td>\n",
+       "      <td>19.68</td>\n",
+       "      <td>-100.60</td>\n",
+       "      <td>193.0</td>\n",
+       "      <td>4.5</td>\n",
+       "      <td>...</td>\n",
+       "      <td>17.8</td>\n",
+       "      <td>12.8</td>\n",
+       "      <td>969</td>\n",
+       "      <td>213</td>\n",
+       "      <td>9</td>\n",
+       "      <td>93</td>\n",
+       "      <td>577</td>\n",
+       "      <td>39</td>\n",
+       "      <td>422</td>\n",
+       "      <td>57</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Zea mays mexicana - Chalco</td>\n",
+       "      <td>45_2</td>\n",
+       "      <td>CIM27471</td>\n",
+       "      <td>Michoacán</td>\n",
+       "      <td>CHGO_45_2</td>\n",
+       "      <td>2086</td>\n",
+       "      <td>19.68</td>\n",
+       "      <td>-100.60</td>\n",
+       "      <td>238.0</td>\n",
+       "      <td>6.2</td>\n",
+       "      <td>...</td>\n",
+       "      <td>17.8</td>\n",
+       "      <td>12.8</td>\n",
+       "      <td>969</td>\n",
+       "      <td>213</td>\n",
+       "      <td>9</td>\n",
+       "      <td>93</td>\n",
+       "      <td>577</td>\n",
+       "      <td>39</td>\n",
+       "      <td>422</td>\n",
+       "      <td>57</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Zea mays mexicana - Chalco</td>\n",
+       "      <td>45_3</td>\n",
+       "      <td>CIM27471</td>\n",
+       "      <td>Michoacán</td>\n",
+       "      <td>CHGO_45_3</td>\n",
+       "      <td>2086</td>\n",
+       "      <td>19.68</td>\n",
+       "      <td>-100.60</td>\n",
+       "      <td>251.0</td>\n",
+       "      <td>5.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>17.8</td>\n",
+       "      <td>12.8</td>\n",
+       "      <td>969</td>\n",
+       "      <td>213</td>\n",
+       "      <td>9</td>\n",
+       "      <td>93</td>\n",
+       "      <td>577</td>\n",
+       "      <td>39</td>\n",
+       "      <td>422</td>\n",
+       "      <td>57</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Zea mays mexicana - Chalco</td>\n",
+       "      <td>45_5</td>\n",
+       "      <td>CIM27471</td>\n",
+       "      <td>Michoacán</td>\n",
+       "      <td>CHGO_45_5</td>\n",
+       "      <td>2086</td>\n",
+       "      <td>19.68</td>\n",
+       "      <td>-100.60</td>\n",
+       "      <td>295.0</td>\n",
+       "      <td>6.2</td>\n",
+       "      <td>...</td>\n",
+       "      <td>17.8</td>\n",
+       "      <td>12.8</td>\n",
+       "      <td>969</td>\n",
+       "      <td>213</td>\n",
+       "      <td>9</td>\n",
+       "      <td>93</td>\n",
+       "      <td>577</td>\n",
+       "      <td>39</td>\n",
+       "      <td>422</td>\n",
+       "      <td>57</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Zea mays mexicana - Chalco</td>\n",
+       "      <td>45_7</td>\n",
+       "      <td>CIM27471</td>\n",
+       "      <td>Michoacán</td>\n",
+       "      <td>CHGO_45_7</td>\n",
+       "      <td>2086</td>\n",
+       "      <td>19.68</td>\n",
+       "      <td>-100.60</td>\n",
+       "      <td>251.0</td>\n",
+       "      <td>4.6</td>\n",
+       "      <td>...</td>\n",
+       "      <td>17.8</td>\n",
+       "      <td>12.8</td>\n",
+       "      <td>969</td>\n",
+       "      <td>213</td>\n",
+       "      <td>9</td>\n",
+       "      <td>93</td>\n",
+       "      <td>577</td>\n",
+       "      <td>39</td>\n",
+       "      <td>422</td>\n",
+       "      <td>57</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4148</th>\n",
+       "      <td>Zea nicaraguensis</td>\n",
+       "      <td>194_11</td>\n",
+       "      <td>CIM27487</td>\n",
+       "      <td>Chinandega</td>\n",
+       "      <td>NICA_194_11</td>\n",
+       "      <td>9</td>\n",
+       "      <td>12.89</td>\n",
+       "      <td>-86.98</td>\n",
+       "      <td>343.0</td>\n",
+       "      <td>5.5</td>\n",
+       "      <td>...</td>\n",
+       "      <td>29.4</td>\n",
+       "      <td>26.9</td>\n",
+       "      <td>1718</td>\n",
+       "      <td>365</td>\n",
+       "      <td>1</td>\n",
+       "      <td>94</td>\n",
+       "      <td>950</td>\n",
+       "      <td>8</td>\n",
+       "      <td>262</td>\n",
+       "      <td>433</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4149</th>\n",
+       "      <td>Zea nicaraguensis</td>\n",
+       "      <td>194_12</td>\n",
+       "      <td>CIM27487</td>\n",
+       "      <td>Chinandega</td>\n",
+       "      <td>NICA_194_12</td>\n",
+       "      <td>9</td>\n",
+       "      <td>12.89</td>\n",
+       "      <td>-86.98</td>\n",
+       "      <td>334.0</td>\n",
+       "      <td>6.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>29.4</td>\n",
+       "      <td>26.9</td>\n",
+       "      <td>1718</td>\n",
+       "      <td>365</td>\n",
+       "      <td>1</td>\n",
+       "      <td>94</td>\n",
+       "      <td>950</td>\n",
+       "      <td>8</td>\n",
+       "      <td>262</td>\n",
+       "      <td>433</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4150</th>\n",
+       "      <td>Zea nicaraguensis</td>\n",
+       "      <td>194_13</td>\n",
+       "      <td>CIM27487</td>\n",
+       "      <td>Chinandega</td>\n",
+       "      <td>NICA_194_13</td>\n",
+       "      <td>9</td>\n",
+       "      <td>12.89</td>\n",
+       "      <td>-86.98</td>\n",
+       "      <td>397.0</td>\n",
+       "      <td>5.5</td>\n",
+       "      <td>...</td>\n",
+       "      <td>29.4</td>\n",
+       "      <td>26.9</td>\n",
+       "      <td>1718</td>\n",
+       "      <td>365</td>\n",
+       "      <td>1</td>\n",
+       "      <td>94</td>\n",
+       "      <td>950</td>\n",
+       "      <td>8</td>\n",
+       "      <td>262</td>\n",
+       "      <td>433</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4151</th>\n",
+       "      <td>Zea nicaraguensis</td>\n",
+       "      <td>194_14</td>\n",
+       "      <td>CIM27487</td>\n",
+       "      <td>Chinandega</td>\n",
+       "      <td>NICA_194_14</td>\n",
+       "      <td>9</td>\n",
+       "      <td>12.89</td>\n",
+       "      <td>-86.98</td>\n",
+       "      <td>254.0</td>\n",
+       "      <td>5.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>29.4</td>\n",
+       "      <td>26.9</td>\n",
+       "      <td>1718</td>\n",
+       "      <td>365</td>\n",
+       "      <td>1</td>\n",
+       "      <td>94</td>\n",
+       "      <td>950</td>\n",
+       "      <td>8</td>\n",
+       "      <td>262</td>\n",
+       "      <td>433</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4152</th>\n",
+       "      <td>Zea nicaraguensis</td>\n",
+       "      <td>194_15</td>\n",
+       "      <td>CIM27487</td>\n",
+       "      <td>Chinandega</td>\n",
+       "      <td>NICA_194_15</td>\n",
+       "      <td>9</td>\n",
+       "      <td>12.89</td>\n",
+       "      <td>-86.98</td>\n",
+       "      <td>175.0</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>29.4</td>\n",
+       "      <td>26.9</td>\n",
+       "      <td>1718</td>\n",
+       "      <td>365</td>\n",
+       "      <td>1</td>\n",
+       "      <td>94</td>\n",
+       "      <td>950</td>\n",
+       "      <td>8</td>\n",
+       "      <td>262</td>\n",
+       "      <td>433</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3455 rows × 263 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                  classification individual_idlocal accession_idlocal  \\\n",
+       "0     Zea mays mexicana - Chalco               45_1          CIM27471   \n",
+       "1     Zea mays mexicana - Chalco               45_2          CIM27471   \n",
+       "2     Zea mays mexicana - Chalco               45_3          CIM27471   \n",
+       "4     Zea mays mexicana - Chalco               45_5          CIM27471   \n",
+       "6     Zea mays mexicana - Chalco               45_7          CIM27471   \n",
+       "...                          ...                ...               ...   \n",
+       "4148          Zea nicaraguensis              194_11          CIM27487   \n",
+       "4149          Zea nicaraguensis              194_12          CIM27487   \n",
+       "4150          Zea nicaraguensis              194_13          CIM27487   \n",
+       "4151          Zea nicaraguensis              194_14          CIM27487   \n",
+       "4152          Zea nicaraguensis              194_15          CIM27487   \n",
+       "\n",
+       "           state         taxa  altitude  latitude  longitude  plant_height  \\\n",
+       "0      Michoacán    CHGO_45_1      2086     19.68    -100.60         193.0   \n",
+       "1      Michoacán    CHGO_45_2      2086     19.68    -100.60         238.0   \n",
+       "2      Michoacán    CHGO_45_3      2086     19.68    -100.60         251.0   \n",
+       "4      Michoacán    CHGO_45_5      2086     19.68    -100.60         295.0   \n",
+       "6      Michoacán    CHGO_45_7      2086     19.68    -100.60         251.0   \n",
+       "...          ...          ...       ...       ...        ...           ...   \n",
+       "4148  Chinandega  NICA_194_11         9     12.89     -86.98         343.0   \n",
+       "4149  Chinandega  NICA_194_12         9     12.89     -86.98         334.0   \n",
+       "4150  Chinandega  NICA_194_13         9     12.89     -86.98         397.0   \n",
+       "4151  Chinandega  NICA_194_14         9     12.89     -86.98         254.0   \n",
+       "4152  Chinandega  NICA_194_15         9     12.89     -86.98         175.0   \n",
+       "\n",
+       "      leaf_width  ...  bio_10  bio_11  bio_12  bio_13  bio_14  bio_15  bio_16  \\\n",
+       "0            4.5  ...    17.8    12.8     969     213       9      93     577   \n",
+       "1            6.2  ...    17.8    12.8     969     213       9      93     577   \n",
+       "2            5.0  ...    17.8    12.8     969     213       9      93     577   \n",
+       "4            6.2  ...    17.8    12.8     969     213       9      93     577   \n",
+       "6            4.6  ...    17.8    12.8     969     213       9      93     577   \n",
+       "...          ...  ...     ...     ...     ...     ...     ...     ...     ...   \n",
+       "4148         5.5  ...    29.4    26.9    1718     365       1      94     950   \n",
+       "4149         6.0  ...    29.4    26.9    1718     365       1      94     950   \n",
+       "4150         5.5  ...    29.4    26.9    1718     365       1      94     950   \n",
+       "4151         5.0  ...    29.4    26.9    1718     365       1      94     950   \n",
+       "4152         4.0  ...    29.4    26.9    1718     365       1      94     950   \n",
+       "\n",
+       "      bio_17  bio_18  bio_19  \n",
+       "0         39     422      57  \n",
+       "1         39     422      57  \n",
+       "2         39     422      57  \n",
+       "4         39     422      57  \n",
+       "6         39     422      57  \n",
+       "...      ...     ...     ...  \n",
+       "4148       8     262     433  \n",
+       "4149       8     262     433  \n",
+       "4150       8     262     433  \n",
+       "4151       8     262     433  \n",
+       "4152       8     262     433  \n",
+       "\n",
+       "[3455 rows x 263 columns]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Remove unmatched accessions from the phenotype file\n",
+    "filtered_pheno = phen.loc[~phen['taxa'].isin(unmatched_pheno)].drop(\"Unnamed: 0\", axis=1)\n",
+    "filtered_pheno"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "2a05b2cb-d8b0-4548-a782-49cc69372217",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>rs#</th>\n",
+       "      <th>alleles</th>\n",
+       "      <th>chrom</th>\n",
+       "      <th>pos</th>\n",
+       "      <th>strand</th>\n",
+       "      <th>assembly#</th>\n",
+       "      <th>center</th>\n",
+       "      <th>protLSID</th>\n",
+       "      <th>assayLSID</th>\n",
+       "      <th>panelLSID</th>\n",
+       "      <th>...</th>\n",
+       "      <th>NDUR_9_15</th>\n",
+       "      <th>NDUR_9_1</th>\n",
+       "      <th>NDUR_9_2</th>\n",
+       "      <th>NDUR_9_3</th>\n",
+       "      <th>NDUR_9_4</th>\n",
+       "      <th>NDUR_9_5</th>\n",
+       "      <th>NDUR_9_6</th>\n",
+       "      <th>NDUR_9_7</th>\n",
+       "      <th>NDUR_9_8</th>\n",
+       "      <th>NDUR_9_9</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>S1_992727</td>\n",
+       "      <td>T/C</td>\n",
+       "      <td>1</td>\n",
+       "      <td>992727</td>\n",
+       "      <td>+</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>...</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>S1_1005413</td>\n",
+       "      <td>C/G</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1005413</td>\n",
+       "      <td>+</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>...</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>S1_1763292</td>\n",
+       "      <td>T/C</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1763292</td>\n",
+       "      <td>+</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>...</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>TT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>S1_1763397</td>\n",
+       "      <td>A/G</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1763397</td>\n",
+       "      <td>+</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>...</td>\n",
+       "      <td>AA</td>\n",
+       "      <td>AA</td>\n",
+       "      <td>AA</td>\n",
+       "      <td>AA</td>\n",
+       "      <td>AA</td>\n",
+       "      <td>AA</td>\n",
+       "      <td>AA</td>\n",
+       "      <td>AA</td>\n",
+       "      <td>AA</td>\n",
+       "      <td>AA</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>S1_1780412</td>\n",
+       "      <td>G/T</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1780412</td>\n",
+       "      <td>+</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>...</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>NN</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>NN</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33924</th>\n",
+       "      <td>S10_149309490</td>\n",
+       "      <td>C/G</td>\n",
+       "      <td>10</td>\n",
+       "      <td>149309490</td>\n",
+       "      <td>+</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>...</td>\n",
+       "      <td>CG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33925</th>\n",
+       "      <td>S10_149390708</td>\n",
+       "      <td>G/T</td>\n",
+       "      <td>10</td>\n",
+       "      <td>149390708</td>\n",
+       "      <td>+</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>...</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33926</th>\n",
+       "      <td>S10_149557920</td>\n",
+       "      <td>G/T</td>\n",
+       "      <td>10</td>\n",
+       "      <td>149557920</td>\n",
+       "      <td>+</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>...</td>\n",
+       "      <td>TT</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>NN</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33927</th>\n",
+       "      <td>S10_149596545</td>\n",
+       "      <td>C/G</td>\n",
+       "      <td>10</td>\n",
+       "      <td>149596545</td>\n",
+       "      <td>+</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>...</td>\n",
+       "      <td>CG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "      <td>GG</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33928</th>\n",
+       "      <td>S10_149597102</td>\n",
+       "      <td>C/G</td>\n",
+       "      <td>10</td>\n",
+       "      <td>149597102</td>\n",
+       "      <td>+</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>...</td>\n",
+       "      <td>CC</td>\n",
+       "      <td>CC</td>\n",
+       "      <td>CC</td>\n",
+       "      <td>CC</td>\n",
+       "      <td>CC</td>\n",
+       "      <td>CC</td>\n",
+       "      <td>CC</td>\n",
+       "      <td>CC</td>\n",
+       "      <td>CC</td>\n",
+       "      <td>CC</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>33929 rows × 3466 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                 rs# alleles  chrom        pos strand  assembly#  center  \\\n",
+       "0          S1_992727     T/C      1     992727      +        NaN     NaN   \n",
+       "1         S1_1005413     C/G      1    1005413      +        NaN     NaN   \n",
+       "2         S1_1763292     T/C      1    1763292      +        NaN     NaN   \n",
+       "3         S1_1763397     A/G      1    1763397      +        NaN     NaN   \n",
+       "4         S1_1780412     G/T      1    1780412      +        NaN     NaN   \n",
+       "...              ...     ...    ...        ...    ...        ...     ...   \n",
+       "33924  S10_149309490     C/G     10  149309490      +        NaN     NaN   \n",
+       "33925  S10_149390708     G/T     10  149390708      +        NaN     NaN   \n",
+       "33926  S10_149557920     G/T     10  149557920      +        NaN     NaN   \n",
+       "33927  S10_149596545     C/G     10  149596545      +        NaN     NaN   \n",
+       "33928  S10_149597102     C/G     10  149597102      +        NaN     NaN   \n",
+       "\n",
+       "       protLSID  assayLSID  panelLSID  ...  NDUR_9_15 NDUR_9_1 NDUR_9_2  \\\n",
+       "0           NaN        NaN        NaN  ...         TT       TT       TT   \n",
+       "1           NaN        NaN        NaN  ...         GG       GG       GG   \n",
+       "2           NaN        NaN        NaN  ...         TT       TT       TT   \n",
+       "3           NaN        NaN        NaN  ...         AA       AA       AA   \n",
+       "4           NaN        NaN        NaN  ...         GG       NN       GG   \n",
+       "...         ...        ...        ...  ...        ...      ...      ...   \n",
+       "33924       NaN        NaN        NaN  ...         CG       GG       GG   \n",
+       "33925       NaN        NaN        NaN  ...         GG       GG       GG   \n",
+       "33926       NaN        NaN        NaN  ...         TT       GG       GG   \n",
+       "33927       NaN        NaN        NaN  ...         CG       GG       GG   \n",
+       "33928       NaN        NaN        NaN  ...         CC       CC       CC   \n",
+       "\n",
+       "      NDUR_9_3 NDUR_9_4 NDUR_9_5 NDUR_9_6 NDUR_9_7 NDUR_9_8 NDUR_9_9  \n",
+       "0           TT       TT       TT       TT       TT       TT       TT  \n",
+       "1           GG       GG       GG       GG       GG       GG       GG  \n",
+       "2           TT       TT       TT       TT       TT       TT       TT  \n",
+       "3           AA       AA       AA       AA       AA       AA       AA  \n",
+       "4           GG       GG       NN       GG       GG       GG       GG  \n",
+       "...        ...      ...      ...      ...      ...      ...      ...  \n",
+       "33924       GG       GG       GG       GG       GG       GG       GG  \n",
+       "33925       GG       GG       GG       GG       GG       GG       GG  \n",
+       "33926       GG       GG       GG       GG       NN       GG       GG  \n",
+       "33927       GG       GG       GG       GG       GG       GG       GG  \n",
+       "33928       CC       CC       CC       CC       CC       CC       CC  \n",
+       "\n",
+       "[33929 rows x 3466 columns]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Remove unmatched accessions from the genotype file\n",
+    "geno = pd.read_table(\"../data/T3606_33929_hapmap.hmp.renamed_accessions.txt\")\n",
+    "filtered_geno = geno.drop(unmatched_genotype, axis=1)\n",
+    "filtered_geno"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "ad980d61-c29b-4e68-924d-2e5100552955",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create new all taxa genotype hapmap file with 3455 accessions\n",
+    "filtered_geno.to_csv(\"../data/gwas_data_3455_accessions/genotype/all_taxa_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n",
+    "\n",
+    "# create new all taxa phenotype csv with 3455 accessions\n",
+    "filtered_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/all_taxa_3455_accessions.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "72f143e6-252a-4b99-97f7-b492ed55fb92",
+   "metadata": {},
+   "source": [
+    "- Per-taxon filter"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "186bc800",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 119,
+   "id": "6b747ebb-db50-4619-a9db-1d3680b701ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
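+    "# get ids for each taxon in individual lists from phenotype files (NOTE(review): the row masks below are built from `phen`, not `filtered_pheno`; `.loc` aligns them on the index, which works only while `filtered_pheno` keeps a subset of `phen`'s index - confirm before resetting the index)\n",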
+    "\n",
+    "hapmap_fields = filtered_geno.columns.to_list()[0:11] # hapmap format compulsory fields\n",
+    "\n",
+    "# parviglumis\n",
+    "parv_pheno = filtered_pheno.loc[phen['classification'] == 'Zea mays parviglumis - Balsas']\n",
+    "parv_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/parv_pheno_3455_accessions.csv\", index=False)\n",
+    "\n",
+    "parv_pheno_ids = hapmap_fields + parv_pheno['taxa'].to_list()\n",
+    "filtered_geno[parv_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/parv_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n",
+    "\n",
+    "\n",
+    "# mexicana\n",
+    "mex_chalco_pheno = filtered_pheno.loc[phen['classification'] == 'Zea mays mexicana - Chalco']\n",
+    "mex_chalco_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/mex_chalco_pheno_3455_accessions.csv\", index=False)\n",
+    "\n",
+    "mex_chalco_pheno_ids = hapmap_fields + mex_chalco_pheno['taxa'].to_list()\n",
+    "filtered_geno[mex_chalco_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/mex_chalco_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n",
+    "\n",
+    "\n",
+    "mex_durango_pheno = filtered_pheno.loc[phen['classification'] == 'Zea mays mexicana - Durango']\n",
+    "mex_durango_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/mex_durango_pheno_3455_accessions.csv\", index=False)\n",
+    "\n",
+    "mex_durango_pheno_ids = hapmap_fields + mex_durango_pheno['taxa'].to_list()\n",
+    "filtered_geno[mex_durango_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/mex_durango_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n",
+    "\n",
+    "\n",
+    "mex_mesa_central_pheno = filtered_pheno.loc[phen['classification'] == 'Zea mays mexicana - Mesa Central']\n",
+    "mex_mesa_central_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/mex_mesa_central_pheno_3455_accessions.csv\", index=False)\n",
+    "\n",
+    "mex_mesa_central_pheno_ids = hapmap_fields + mex_mesa_central_pheno['taxa'].to_list()\n",
+    "filtered_geno[mex_mesa_central_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/mex_mesa_central_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n",
+    "\n",
+    "\n",
+    "mex_nobogame_pheno = filtered_pheno.loc[phen['classification'] == 'Zea mays mexicana - Nobogame']\n",
+    "mex_nobogame_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/mex_nobogame_pheno_3455_accessions.csv\", index=False)\n",
+    "\n",
+    "mex_nobogame_pheno_ids = hapmap_fields + mex_nobogame_pheno['taxa'].to_list()\n",
+    "filtered_geno[mex_nobogame_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/mex_nobogame_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n",
+    "\n",
+    "\n",
+    "mex_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea mays mexicana')]\n",
+    "#mex_pheno = filtered_pheno[filtered_pheno['classification'].str.match('Zea mays mexicana')]\n",
+    "mex_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/mex_pheno_3455_accessions.csv\", index=False)\n",
+    "\n",
+    "mex_pheno_ids = hapmap_fields + mex_pheno['taxa'].to_list()\n",
+    "filtered_geno[mex_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/mex_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n",
+    "\n",
+    "\n",
+    "# luxurians\n",
+    "lux_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea luxurians')]\n",
+    "lux_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/lux_pheno_3455_accessions.csv\", index=False)\n",
+    "\n",
+    "lux_pheno_ids = hapmap_fields + lux_pheno['taxa'].to_list()\n",
+    "filtered_geno[lux_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/lux_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n",
+    "\n",
+    "\n",
+    "# Perennis\n",
+    "per_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea perennis') ]\n",
+    "per_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/per_pheno_3455_accessions.csv\", index=False)\n",
+    "\n",
+    "per_pheno_ids = hapmap_fields + per_pheno['taxa'].to_list()\n",
+    "filtered_geno[per_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/per_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n",
+    "\n",
+    "\n",
+    "# diploperennis\n",
+    "diplo_per_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea diploperennis') ]\n",
+    "diplo_per_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/diplo_per_pheno_3455_accessions.csv\", index=False)\n",
+    "\n",
+    "diplo_per_pheno_ids = hapmap_fields + diplo_per_pheno['taxa'].to_list()\n",
+    "filtered_geno[diplo_per_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/diplo_per_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n",
+    "\n",
+    "\n",
+    "# huehuetanangensis\n",
+    "hue_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea mays huehuetenangensis') ]\n",
+    "hue_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/hue_pheno_3455_accessions.csv\", index=False)\n",
+    "\n",
+    "hue_pheno_ids = hapmap_fields + hue_pheno['taxa'].to_list()\n",
+    "filtered_geno[hue_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/hue_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n",
+    "\n",
+    "\n",
+    "# Nicaraguensis\n",
+    "nica_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea nicaraguensis') ]\n",
+    "nica_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/nica_pheno_3455_accessions.csv\", index=False)\n",
+    "\n",
+    "nica_pheno_ids = hapmap_fields + nica_pheno['taxa'].to_list()\n",
+    "filtered_geno[nica_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/nica_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 155,
+   "id": "5b53034c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Perennis and diploperennis subset: write the combined phenotype table and the matching genotype columns\n",
+    "per_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea perennis') ]\n",
+    "diplo_per_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea diploperennis') ]\n",
+    "# NOTE(review): the masks are built from phen but index filtered_pheno, which relies on both frames sharing row labels - confirm\n",
+    "# combine the two frames (diploperennis rows first)\n",
+    "frames_dper_per = [diplo_per_pheno,per_pheno]\n",
+    "diplo_plus_perennis = pd.concat(frames_dper_per)\n",
+    "\n",
+    "diplo_plus_perennis.to_csv(\"../data/gwas_data_3455_accessions/phenotype/diplo_plus_perennis_pheno_3455_accessions.csv\", index=False)\n",
+    "#diplo_plus_perennis\n",
+    "\n",
+    "# genotype columns to keep: the hapmap_fields columns followed by one column per selected taxa id\n",
+    "diplo_plus_per_pheno_ids = hapmap_fields + diplo_plus_perennis['taxa'].to_list()\n",
+    "#diplo_plus_per_pheno_ids\n",
+    "filtered_geno[diplo_plus_per_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/diplo_plus_perennis_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 158,
+   "id": "8fe20e4f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# luxurians and nicaraguensis subset: write the combined phenotype table and the matching genotype columns\n",
+    "lux_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea luxurians') ]\n",
+    "nica_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea nicaraguensis') ]\n",
+    "# NOTE(review): the masks are built from phen but index filtered_pheno, which relies on both frames sharing row labels - confirm\n",
+    "# combine the two frames (luxurians rows first)\n",
+    "frames_lux_nica = [lux_pheno,nica_pheno]\n",
+    "lux_plus_nica = pd.concat(frames_lux_nica)\n",
+    "\n",
+    "lux_plus_nica.to_csv(\"../data/gwas_data_3455_accessions/phenotype/lux_plus_nica_pheno_3455_accessions.csv\", index=False)\n",
+    "#lux_plus_nica\n",
+    "\n",
+    "# genotype columns to keep: the hapmap_fields columns followed by one column per selected taxa id\n",
+    "lux_nica_pheno_ids = hapmap_fields + lux_plus_nica['taxa'].to_list()\n",
+    "#lux_nica_pheno_ids\n",
+    "filtered_geno[lux_nica_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/lux_plus_nica_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 159,
+   "id": "6f5e7eba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# parviglumis (Balsas) and huehuetenangensis subset: write the combined phenotype table and the matching genotype columns\n",
+    "parv_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea mays parviglumis - Balsas') ]\n",
+    "hue_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea mays huehuetenangensis') ]\n",
+    "# NOTE(review): the masks are built from phen but index filtered_pheno, which relies on both frames sharing row labels - confirm\n",
+    "# combine the two frames (parviglumis rows first)\n",
+    "frames_parv_hue = [parv_pheno,hue_pheno]\n",
+    "parv_plus_hue = pd.concat(frames_parv_hue)\n",
+    "\n",
+    "parv_plus_hue.to_csv(\"../data/gwas_data_3455_accessions/phenotype/parv_plus_hue_pheno_3455_accessions.csv\", index=False)\n",
+    "\n",
+    "# genotype columns to keep: the hapmap_fields columns followed by one column per selected taxa id\n",
+    "parv_hue_pheno_ids = hapmap_fields + parv_plus_hue['taxa'].to_list()\n",
+    "#parv_hue_pheno_ids\n",
+    "filtered_geno[parv_hue_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/parv_plus_hue_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5be98b4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.5 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5 (default, Jun  4 2021, 12:28:51) \n[GCC 7.5.0]"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "c0e123d1fea197aa518e3197211374e8a785689805234020a499f6375b71c087"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/README.md b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/README.md
new file mode 100644
index 0000000000..4633751d36
--- /dev/null
+++ b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/README.md
@@ -0,0 +1,4 @@
+# The directory contains scripts used to:
+
+- convert genotype hapmap files to other file formats (h5, hapmap, vcf and plink)
+- impute missing SNP calls with the heterozygous genotype at the respective positions
diff --git a/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.py b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.py
new file mode 100755
index 0000000000..79c86edd9d
--- /dev/null
+++ b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.py
@@ -0,0 +1,54 @@
+#!/usr/bin/python3
+"""Recode hapmap genotype calls numerically: ref/ref -> 1, heterozygous -> 0, alt/alt -> -1, NN -> NA."""
+
+import sys, getopt
+
+USAGE = 'hapmap2numeric.py -i <inputfile> -o <outputfile>'
+META_COLS = 5  # fields that precede the first genotype call in a row split on whitespace
+
+def recode(fields):
+    """Return one split data row with its genotype calls recoded; fields[1] is 'ref/alt'."""
+    # Exact-match lookup on the calls only, so ids/positions in the metadata are never rewritten.
+    ref, *alts = fields[1].split('/')
+    codes = {}
+    if alts:  # a monomorphic marker lists one allele, so it has no alt or heterozygous calls
+        alt = alts[0]
+        codes.update({ref + alt: '0', alt + ref: '0', alt + alt: '-1'})
+    codes.update({'NN': 'NA', ref + ref: '1'})  # added last so ref/ref and NN win if keys collide
+    calls = [codes.get(call, call) for call in fields[META_COLS:]]
+    # Six 'NA' fillers follow the metadata, presumably for the empty hapmap columns split() dropped.
+    return fields[:META_COLS] + ['NA'] * 6 + calls
+
+def main(argv):
+    """Parse -i/-o (--ifile/--ofile) and write a numerically recoded copy of the input hapmap file."""
+    inputfile = ''
+    outputfile = ''
+    try:
+        opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
+    except getopt.GetoptError:
+        print(USAGE)
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            print(USAGE)
+            sys.exit()
+        elif opt in ("-i", "--ifile"):
+            inputfile = arg
+        elif opt in ("-o", "--ofile"):
+            outputfile = arg
+    if not inputfile or not outputfile:  # open('') would otherwise die with a bare traceback
+        print(USAGE)
+        sys.exit(2)
+    print ('Input file is "', inputfile)
+    print ('Output file is "', outputfile)
+
+    with open(inputfile, 'r') as hap, open(outputfile, 'w') as num_hap:
+        for l in hap:
+            line = l.split()
+            if not line:  # blank line: nothing to convert
+                continue
+            out = line if line[0] == 'rs#' else recode(line)  # the header row is copied unchanged
+            num_hap.write('\t'.join(out) + '\n')
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.sh b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.sh
new file mode 100644
index 0000000000..53501821ae
--- /dev/null
+++ b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Recode every genotype hapmap subset numerically with hapmap2numeric.py; a failed run is reported, the rest still run, and the exit status is then 1.
+# NOTE(review): the lux and lux_plus_nica subsets written by the notebook are not listed below - confirm that this is intended.
+cd -- "$(dirname -- "$0")" || exit 1  # ./hapmap2numeric.py and the ../ paths only resolve from this script's directory
+data=../../../data/gwas_data_3455_accessions
+mkdir -p -- "$data/genotype_numeric" || exit 1
+status=0
+for pair in parv_geno:parv parv_plus_hue_geno:parv_plus_hue mex_geno:mex mex_mesa_central_geno:mex_mesa mex_chalco_geno:mex_chalco mex_durango_geno:mex_durango mex_nobogame_geno:mex_nobogame diplo_per_geno:diplo_per diplo_plus_perennis_geno:diplo_plus_perennis per_geno:per hue_geno:hue nica_geno:nica all_taxa:all; do  # "<input stem>:<output stem>"
+  src=${pair%%:*} dst=${pair##*:}
+  ./hapmap2numeric.py -i "$data/genotype/${src}_3455_accessions.hapmap.txt" -o "$data/genotype_numeric/${dst}_numeric.hapmap.txt" \
+    || { printf 'hapmap2numeric.py failed for %s\n' "$src" >&2; status=1; }
+done
+exit "$status"
diff --git a/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/impute_missing_with_het_hapmap.py b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/impute_missing_with_het_hapmap.py
new file mode 100755
index 0000000000..4d7d0f86e4
--- /dev/null
+++ b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/impute_missing_with_het_hapmap.py
@@ -0,0 +1,51 @@
+#!/usr/bin/python3
+"""Impute missing hapmap genotype calls ('NN') with the marker's heterozygous ref+alt genotype."""
+import sys, getopt
+
+USAGE = 'impute_missing_with_het_hapmap.py -i <inputfile> -o <outputfile>'
+META_COLS = 5  # fields that precede the first genotype call in a row split on whitespace
+
+def impute(fields):
+    """Return one split data row with exact 'NN' calls imputed; fields[1] is 'ref/alt'."""
+    alleles = fields[1].split('/')
+    calls = fields[META_COLS:]  # only calls are edited, so ids in the metadata are never rewritten
+    if len(alleles) > 1:
+        het = alleles[0] + alleles[1]
+        calls = [het if call == 'NN' else call for call in calls]
+    else:  # one listed allele cannot form a heterozygote: keep the calls missing and say so
+        print('Warning: no alt allele for', fields[0], '- NN calls kept', file=sys.stderr)
+    return fields[:META_COLS] + ['NA'] * 6 + calls  # six 'NA' fillers (presumably the empty hapmap metadata columns)
+
+def main(argv):
+    """Parse -i/-o (--ifile/--ofile) and write an imputed copy of the input hapmap file."""
+    inputfile = ''
+    outputfile = ''
+    try:
+        opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
+    except getopt.GetoptError:
+        print(USAGE)
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            print(USAGE)
+            sys.exit()
+        elif opt in ("-i", "--ifile"):
+            inputfile = arg
+        elif opt in ("-o", "--ofile"):
+            outputfile = arg
+    if not inputfile or not outputfile:  # open('') would otherwise die with a bare traceback
+        print(USAGE)
+        sys.exit(2)
+    print ('Input file is "', inputfile)
+    print ('Output file is "', outputfile)
+
+    with open(inputfile, 'r') as hap, open(outputfile, 'w') as num_hap:
+        for l in hap:
+            line = l.split()
+            if not line:  # blank line: nothing to convert
+                continue
+            out = line if line[0] == 'rs#' else impute(line)  # the header row is copied unchanged
+            num_hap.write('\t'.join(out) + '\n')
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/impute_missing_with_het_hapmap.sh b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/impute_missing_with_het_hapmap.sh
new file mode 100644
index 0000000000..360ea94dda
--- /dev/null
+++ b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/impute_missing_with_het_hapmap.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Impute missing calls in every genotype hapmap subset with impute_missing_with_het_hapmap.py; a failed run is reported, the rest still run, and the exit status is then 1.
+# NOTE(review): the lux and lux_plus_nica subsets written by the notebook are not listed below - confirm that this is intended.
+cd -- "$(dirname -- "$0")" || exit 1  # ./impute_missing_with_het_hapmap.py and the ../ paths only resolve from this script's directory
+data=../../../data/gwas_data_3455_accessions
+mkdir -p -- "$data/genotype_imputed/hapmap" || exit 1
+status=0
+for pair in parv_geno:parv parv_plus_hue_geno:parv_plus_hue mex_geno:mex mex_mesa_central_geno:mex_mesa mex_chalco_geno:mex_chalco mex_durango_geno:mex_durango mex_nobogame_geno:mex_nobogame diplo_per_geno:diplo_per diplo_plus_perennis_geno:diplo_plus_perennis per_geno:per hue_geno:hue nica_geno:nica all_taxa:all; do  # "<input stem>:<output stem>"
+  src=${pair%%:*} dst=${pair##*:}
+  ./impute_missing_with_het_hapmap.py -i "$data/genotype/${src}_3455_accessions.hapmap.txt" -o "$data/genotype_imputed/hapmap/${dst}_imputed_missing_hetero.hapmap.txt" \
+    || { printf 'impute_missing_with_het_hapmap.py failed for %s\n' "$src" >&2; status=1; }
+done
+exit "$status"
diff --git a/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_h5.sh b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_h5.sh
new file mode 100644
index 0000000000..9edb6e69f2
--- /dev/null
+++ b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_h5.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+# Export every imputed hapmap subset to HDF5 with the TASSEL 5 pipeline; a run that exits non-zero is reported, the rest still run, and the exit status is then 1.
+# NOTE(review): no lux or lux_plus_nica subset is listed below although the notebook writes both - confirm that this is intended.
+cd -- "$(dirname -- "$0")" || exit 1  # the ../ paths only resolve from this script's directory
+tassel=${TASSEL_PIPELINE:-/home/joseph/TASSEL5/run_pipeline.pl}  # set TASSEL_PIPELINE to use another TASSEL install
+imputed=../../../data/gwas_data_3455_accessions/genotype_imputed
+mkdir -p -- "$imputed/HDF5" || exit 1
+status=0
+for stem in parv parv_plus_hue mex mex_mesa mex_chalco mex_durango mex_nobogame diplo_per diplo_plus_perennis per hue nica all; do
+  "$tassel" -Xmx5g -fork1 -h "$imputed/hapmap/${stem}_imputed_missing_hetero.hapmap.txt" \
+    -export "$imputed/HDF5/${stem}_imputed_missing_hetero.HDF5" -exportType HDF5 -runfork1 \
+    || { printf 'TASSEL HDF5 export failed for %s\n' "$stem" >&2; status=1; }
+done
+exit "$status"
diff --git a/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_plink.sh b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_plink.sh
new file mode 100644
index 0000000000..94bf5b4931
--- /dev/null
+++ b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_plink.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+# Export every imputed hapmap subset to PLINK format with the TASSEL 5 pipeline.
+# A run that exits non-zero is reported, the rest still run, and the exit status is then 1.
+# NOTE(review): no lux or lux_plus_nica subset is listed below although the notebook writes both - confirm that this is intended.
+cd -- "$(dirname -- "$0")" || exit 1  # the ../ paths only resolve from this script's directory
+tassel=${TASSEL_PIPELINE:-/home/joseph/TASSEL5/run_pipeline.pl}  # set TASSEL_PIPELINE to use another TASSEL install
+imputed=../../../data/gwas_data_3455_accessions/genotype_imputed
+mkdir -p -- "$imputed/plink" || exit 1  # -p: rerunning the script must not fail on the existing directory
+status=0
+for stem in parv parv_plus_hue mex mex_mesa mex_chalco mex_durango mex_nobogame diplo_per diplo_plus_perennis per hue nica all; do
+  "$tassel" -Xmx5g -fork1 -h "$imputed/hapmap/${stem}_imputed_missing_hetero.hapmap.txt" \
+    -export "$imputed/plink/${stem}_imputed_missing_hetero.plink" -exportType Plink -runfork1 \
+    || { printf 'TASSEL Plink export failed for %s\n' "$stem" >&2; status=1; }
+done
+exit "$status"
+
diff --git a/workflows/preprocessing_data/scripts/remove_spp_tags_in_hapmap_files.sh b/workflows/preprocessing_data/scripts/remove_spp_tags_in_hapmap_files.sh
new file mode 100755
index 0000000000..4ecdf8973d
--- /dev/null
+++ b/workflows/preprocessing_data/scripts/remove_spp_tags_in_hapmap_files.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Strip the ssp tag prefixes (e.g. "parviglumis_") from the ids in the overall teosinte hapmap file.
+# diploperennis stays ahead of perennis so that the longer tag is removed whole.
+tags=(parviglumis mexicana luxurians huehuetenangensis diploperennis perennis nicaraguensis)
+exprs=(); for tag in "${tags[@]}"; do exprs+=(-e "s/${tag}_//g"); done
+sed "${exprs[@]}" < ../data/T3606_33929_hapmap.hmp.txt > ../data/T3606_33929_hapmap.hmp.renamed_accessions.txt
+
+# Remove ssp tags on the individual spp hapmap files
+# sed -e 's/parviglumis_//g' < ../data/hapmap/parviglumis.hmp.txt > ../data/hapmap/parviglumis.renamed.hmp.txt
+# sed -e 's/mexicana_//g' < ../data/hapmap/mexicana.hmp.txt > ../data/hapmap/mexicana.renamed.hmp.txt
+# sed -e 's/luxurians_//g' < ../data/hapmap/luxurians.hmp.txt > ../data/hapmap/luxurians.renamed.hmp.txt
+# sed -e 's/huehuetenangensis_//g' < ../data/hapmap/huehuetenangensis.hmp.txt > ../data/hapmap/huehuetenangensis.renamed.hmp.txt
+# sed -e 's/diploperennis_//g' < ../data/hapmap/diploperennis.hmp.txt > ../data/hapmap/diploperennis.renamed.hmp.txt
+# sed -e 's/perennis_//g' < ../data/hapmap/perennis.hmp.txt > ../data/hapmap/perennis.renamed.hmp.txt
+# sed -e 's/nicaraguensis_//g' < ../data/hapmap/nicaraguensis.hmp.txt > ../data/hapmap/nicaraguensis.renamed.hmp.txt
diff --git a/workflows/preprocessing_data/scripts/workflow_order.md b/workflows/preprocessing_data/scripts/workflow_order.md
new file mode 100644
index 0000000000..77acdce9c3
--- /dev/null
+++ b/workflows/preprocessing_data/scripts/workflow_order.md
@@ -0,0 +1,12 @@
+# Workflow description
+
+This document describes the order in which the scripts were executed
+
+1. `remove_spp_tags_in_hapmap_files.sh`
+   - Removes ssp tags from the accession ids in the overall teosinte hapmap file
+2. `geno_pheno_accession_selection_3455_accesions_matched.ipynb`
+   - Code used to filter out genotypes not in the phenotype accessions and vice versa for GWAS analysis using GAPIT
+   - Individual taxa subsets are also generated within the notebook.
+3. `extract_indiv_spp_pheno_data.sh`
+   - Extracts and creates individual species phenotype files
+4. The subdirectory `hapmap_convertion_scripts` contains scripts used to convert the hapmap genotype file type to other formats (plink, numeric, H5 formats)
-- 
GitLab