From 0bd71582d32c3de39c93df470f7d095f6d52f619 Mon Sep 17 00:00:00 2001 From: Joseph Atemia <j.atemia@fz-juelich.de> Date: Thu, 26 Sep 2024 16:57:11 +0200 Subject: [PATCH] add: 'raw' data preprocessing scripts --- .../scripts/extract_indiv_spp_pheno_data.sh | 47 + ...ion_selection_3455_accesions_matched.ipynb | 2014 +++++++++++++++++ .../hapmap_convertion_scripts/README.md | 4 + .../hapmap2numeric.py | 54 + .../hapmap2numeric.sh | 13 + .../impute_missing_with_het_hapmap.py | 51 + .../impute_missing_with_het_hapmap.sh | 13 + .../tassel_convert_imputed_h5.sh | 14 + .../tassel_convert_imputed_plink.sh | 16 + .../remove_spp_tags_in_hapmap_files.sh | 15 + .../scripts/workflow_order.md | 12 + 11 files changed, 2253 insertions(+) create mode 100755 workflows/preprocessing_data/scripts/extract_indiv_spp_pheno_data.sh create mode 100644 workflows/preprocessing_data/scripts/geno_pheno_accession_selection_3455_accesions_matched.ipynb create mode 100644 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/README.md create mode 100755 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.py create mode 100644 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.sh create mode 100755 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/impute_missing_with_het_hapmap.py create mode 100644 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/impute_missing_with_het_hapmap.sh create mode 100644 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_h5.sh create mode 100644 workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_plink.sh create mode 100755 workflows/preprocessing_data/scripts/remove_spp_tags_in_hapmap_files.sh create mode 100644 workflows/preprocessing_data/scripts/workflow_order.md diff --git a/workflows/preprocessing_data/scripts/extract_indiv_spp_pheno_data.sh 
b/workflows/preprocessing_data/scripts/extract_indiv_spp_pheno_data.sh new file mode 100755 index 0000000000..f880fbeb23 --- /dev/null +++ b/workflows/preprocessing_data/scripts/extract_indiv_spp_pheno_data.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +# Extaract and create individual species phenotype files + +# pheno_result_dir=../data/phenotype +mkdir ../data/phenotype/data_with_headers + +head -n 1 ../data/v2_phenotype_and_env_data.csv > ../data/phenotype/data_with_headers/mexicana.csv +grep 'mexicana' ../data/v2_phenotype_and_env_data.csv >> ../data/phenotype/data_with_headers/mexicana.csv + + +head -n 1 ../data/v2_phenotype_and_env_data.csv > ../data/phenotype/data_with_headers/parviglumis.csv +grep 'parviglumis' ../data/v2_phenotype_and_env_data.csv >> ../data/phenotype/data_with_headers/parviglumis.csv + +head -n 1 ../data/v2_phenotype_and_env_data.csv > ../data/phenotype/data_with_headers/huehuetenangensis.csv +grep 'huehuetenangensis' ../data/v2_phenotype_and_env_data.csv >> ../data/phenotype/data_with_headers/huehuetenangensis.csv + +head -n 1 ../data/v2_phenotype_and_env_data.csv > ../data/phenotype/data_with_headers/nicaraguensis.csv +grep 'nicaraguensis' ../data/v2_phenotype_and_env_data.csv >> ../data/phenotype/data_with_headers/nicaraguensis.csv + +head -n 1 ../data/v2_phenotype_and_env_data.csv > ../data/phenotype/data_with_headers/luxurians.csv +grep 'luxurians' ../data/v2_phenotype_and_env_data.csv >> ../data/phenotype/data_with_headers/luxurians.csv + +head -n 1 ../data/v2_phenotype_and_env_data.csv > ../data/phenotype/data_with_headers/diploperennis.csv +grep 'diploperennis' ../data/v2_phenotype_and_env_data.csv >> ../data/phenotype/data_with_headers/diploperennis.csv + +head -n 1 ../data/v2_phenotype_and_env_data.csv > ../data/phenotype/data_with_headers/perennis.csv +grep 'Zea perennis' ../data/v2_phenotype_and_env_data.csv >> ../data/phenotype/data_with_headers/perennnis.csv + +# split mexicana spp further according to estado +head -n 1 
../data/v2_phenotype_and_env_data.csv > ../data/phenotype/data_with_headers/mexicana_chalco.csv +grep 'Zea mays mexicana - Chalco' ../data/v2_phenotype_and_env_data.csv >> ../data/phenotype/data_with_headers/mexicana_chalco.csv + +head -n 1 ../data/v2_phenotype_and_env_data.csv > ../data/phenotype/data_with_headers/mexicana_durango.csv +grep 'Zea mays mexicana - Durango' ../data/v2_phenotype_and_env_data.csv >> ../data/phenotype/data_with_headers/mexicana_durango.csv + +head -n 1 ../data/v2_phenotype_and_env_data.csv > ../data/phenotype/data_with_headers/mexicana_mesa_central.csv +grep 'Zea mays mexicana - Mesa Central' ../data/v2_phenotype_and_env_data.csv >> ../data/phenotype/data_with_headers/mexicana_mesa_central.csv + +head -n 1 ../data/v2_phenotype_and_env_data.csv > ../data/phenotype/data_with_headers/mexicana_nobogame.csv +grep 'Zea mays mexicana - Nobogame' ../data/v2_phenotype_and_env_data.csv >> ../data/phenotype/data_with_headers/mexicana_nobogame.csv + +head -n 1 ../data/v2_phenotype_and_env_data.csv > ../data/phenotype/data_with_headers/mexicana_mesa_nobogame.csv +grep 'Zea mays mexicana - Nobogame' ../data/v2_phenotype_and_env_data.csv >> ../data/phenotype/data_with_headers/mexicana_nobogame.csv + +head -n 1 ../data/v2_phenotype_and_env_data.csv > ../data/phenotype/data_with_headers/perennis_diplo_per.csv +grep 'Zea perennis' ../data/v2_phenotype_and_env_data.csv >> ../data/phenotype/data_with_headers/perennis_diplo_per.csv \ No newline at end of file diff --git a/workflows/preprocessing_data/scripts/geno_pheno_accession_selection_3455_accesions_matched.ipynb b/workflows/preprocessing_data/scripts/geno_pheno_accession_selection_3455_accesions_matched.ipynb new file mode 100644 index 0000000000..b829cb018c --- /dev/null +++ b/workflows/preprocessing_data/scripts/geno_pheno_accession_selection_3455_accesions_matched.ipynb @@ -0,0 +1,2014 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "427b5b28", + "metadata": {}, + "source": [ + "### Code 
used to filter out genotypes not in the phenotype accessions and vice versa for GWAS analysis using GAPIT " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9c50057a", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e3db6b42-48a5-4387-b2df-4a2615997d9e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of genotype ids: 3604\n", + "Number of phenotype ids: 4153\n" + ] + } + ], + "source": [ + "# Create variables with phenotype ids and genotype ids\n", + "\n", + "# Genotype ids \n", + "with open(\"../data/T3606_33929_hapmap.hmp.renamed_accessions.txt\", \"r\") as hapmap_file:\n", + " c = 0\n", + " for line in hapmap_file:\n", + " if c == 0:\n", + " geno_ids = line.split()[11:]\n", + " break\n", + " c += 1\n", + "print(\"Number of genotype ids:\", len(geno_ids))\n", + "\n", + "# Phenotype ids\n", + "phen = pd.read_csv(\"../data/v2_phenotype_and_env_data.csv\")\n", + "phen_ids = list(phen[\"taxa\"])\n", + "phen_ids = [i.replace(\"''\",'') for i in phen_ids]\n", + "\n", + "print(\"Number of phenotype ids:\", len(phen_ids))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7cff35cc-a921-4d9a-9f5c-1b1a98b63897", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "List of accessesions not in the phenotype data:\n", + "\n", + "NDUR_10_1758\n", + "BZAC_110_11\n", + "BZAC_110_14\n", + "BTEL_112_15\n", + "BTEL_113_15\n", + "BTEL_113_6\n", + "BTEL_113_8\n", + "BAPX_114_8\n", + "BOLI_116_12\n", + "BTEJ_129_1199\n", + "ZDNA_12_9\n", + "BPCH_141_585\n", + "TOLU_59_768\n", + "TOLU_60_1233\n", + "BPCH_143_22\n", + "TOLU_57_1390\n", + "BCAR_148_9\n", + "MDOB_14_13\n", + "BNOC_152_8\n", + "BNOC_152_9\n", + "BHUE_156_19\n", + "BHUE_156_8\n", + "BHUE_156_9\n", + "BTIQ_158_10\n", + "BJUA_163_2\n", + "BTZI_166_14\n", + "BTAR_173_388\n", + "BMAN_177_5\n", 
+ "BVPU_179_602\n", + "ZLJA_191_10\n", + "ZLOX_193_10\n", + "ZLOX_193_15\n", + "ZLOX_193_16\n", + "ZLOX_193_17\n", + "ZLOX_193_19\n", + "ZLOX_193_21\n", + "ZLOX_193_23\n", + "ZLOX_193_24\n", + "ZLOX_193_4\n", + "ZLOX_193_5\n", + "ZLOX_193_7\n", + "PENJ_19_1101\n", + "PENJ_19_1409\n", + "ZPMI_200_1768\n", + "ZPMI_200_309\n", + "MCUI_23_10\n", + "MORO_25_10\n", + "HUEH_261_10\n", + "HUEH_261_11\n", + "HUEH_261_12\n", + "HUEH_261_13\n", + "HUEH_261_14\n", + "HUEH_261_1\n", + "HUEH_261_2\n", + "HUEH_261_3\n", + "HUEH_261_4\n", + "HUEH_261_5\n", + "HUEH_261_6\n", + "HUEH_261_7\n", + "HUEH_261_8\n", + "HUEH_261_9\n", + "TLAX_265_1\n", + "CHPU_268_8\n", + "BNOC_269_12\n", + "BNOC_269_1\n", + "BTAL_271_12\n", + "BTAL_271_160\n", + "BTAL_271_261\n", + "BTAL_271_362\n", + "BTAL_271_564\n", + "BTAL_271_665\n", + "BTAL_271_766\n", + "BTAL_271_867\n", + "BTEJ_276_1353\n", + "BTEJ_276_1454\n", + "BTEJ_276_1555\n", + "ZLAB_281_13\n", + "ZLAB_282_1098\n", + "ZLAB_282_1199\n", + "ZLAB_282_1200\n", + "ZLAB_282_1301\n", + "ZLJA_283_11\n", + "ZLJA_283_6\n", + "ZLJA_284_9\n", + "ZDJA_285_2\n", + "ZLOX_286_10\n", + "ZLOX_286_2\n", + "ZLOX_286_4\n", + "ZLOX_286_5\n", + "ZLOX_286_6\n", + "ZLOX_286_7\n", + "BTEJ_287_1021\n", + "BTEJ_287_1222\n", + "BTEJ_287_1424\n", + "BTEJ_287_314\n", + "BTEJ_287_415\n", + "BTEJ_287_617\n", + "BTEJ_287_920\n", + "AMEC_289_12\n", + "CHTX_291_1497\n", + "CHTX_291_190\n", + "CHTX_291_4\n", + "CHTX_291_693\n", + "CHAP_293_297\n", + "CHTX_298_15\n", + "MVIJ_29_1116\n", + "MVIJ_29_1242\n", + "MVIJ_29_1446\n", + "TARI_302_12\n", + "BHUE_308_1\n", + "MVIJ_30_15\n", + "BTAC_310_7\n", + "BEJU_311_204\n", + "BVPU_314_11\n", + "BVPU_315_4\n", + "BHUI_317_203\n", + "BOLI_318_5\n", + "SJER_334_8\n", + "INDA_33_9\n", + "INDA_34_9\n", + "TARI_37_21\n", + "TARI_37_22\n", + "MPUR_38_1432\n", + "MPUR_40_1259\n", + "MPUR_40_8\n", + "SJER_41_3\n", + "MZAM_42_568\n", + "CHGO_45_1368\n", + "CHGO_45_632\n", + "CHMI_48_4\n", + "CHMI_48_5\n", + "CHDF_52_6\n", + "CHAP_53_1166\n", + 
"CHAP_53_950\n", + "CHAP_55_6\n", + "CHAP_55_8\n", + "CHPU_80_383\n", + "BGUA_83_582\n", + "BGUA_85_160\n", + "BGUA_85_484\n", + "BGUA_85_592\n", + "BGUA_85_713\n", + "BGUA_87_638\n", + "BEJU_91_108\n", + "BEJU_91_216\n", + "BEJU_93_8\n", + "BEJU_95_3\n", + "BEJU_95_448\n", + "NDUR_9_1403\n", + "Total genotype accessions 3604\n", + "genotype accessions in phenotype dataset 3455\n" + ] + } + ], + "source": [ + "# display genotypic accessions lacking phenotype data\n", + "\n", + "c = 0\n", + "unmatched_genotype = []\n", + "\n", + "print('List of accessesions not in the phenotype data:\\n')\n", + "\n", + "for acce in geno_ids:\n", + " if acce not in phen_ids:\n", + " print(acce)\n", + " unmatched_genotype.append(acce)\n", + " else: \n", + " c+=1\n", + "print('Total genotype accessions', len(geno_ids))\n", + "print('genotype accessions in phenotype dataset', c)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7c554e82-01df-4d26-bc47-a20b97f8e472", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "List of accessesions not in the phenotype data:\n", + "\n", + "CHGO_45_4\n", + "CHGO_45_6\n", + "CHGO_45_11\n", + "CHGO_45_13\n", + "MCUI_46_7\n", + "MCUI_46_10\n", + "MCUI_46_11\n", + "CHMI_47_1\n", + "CHMI_47_5\n", + "CHMI_47_8\n", + "CHMI_47_15\n", + "CHMI_48_8\n", + "CHMI_48_9\n", + "CHMI_48_10\n", + "CHMI_48_13\n", + "CHGO_49_1\n", + "CHDF_50_8\n", + "CHDF_50_10\n", + "CHDF_50_13\n", + "CHDF_50_15\n", + "CHDF_51_12\n", + "CHDF_52_12\n", + "CHAP_53_9\n", + "CHAP_53_11\n", + "CHAP_53_12\n", + "CHAP_54_1\n", + "CHAP_54_10\n", + "CHAP_54_13\n", + "CHAP_54_14\n", + "CHAP_54_15\n", + "CHAP_55_2\n", + "CHAP_55_3\n", + "CHAP_55_4\n", + "CHAP_55_10\n", + "CHAP_56_8\n", + "TOLU_57_5\n", + "TOLU_59_7\n", + "TOLU_59_8\n", + "TOLU_59_9\n", + "TOLU_59_10\n", + "TOLU_59_14\n", + "TOLU_60_12\n", + "CHTX_62_1\n", + "CHTX_63_6\n", + "CHTX_63_15\n", + "CHTX_64_8\n", + "CHTX_64_11\n", + "CHTX_64_12\n", + "CHTX_64_13\n", + 
"AMEC_66_9\n", + "AMEC_67_9\n", + "AMEC_67_10\n", + "AMEC_67_13\n", + "AMEC_69_11\n", + "CHAL_73_1\n", + "CHAL_74_6\n", + "CHAL_74_19\n", + "CHPU_76_2\n", + "CHPU_76_9\n", + "CHPU_76_12\n", + "CHPU_77_2\n", + "CHPU_77_5\n", + "CHPU_77_8\n", + "CHPU_77_9\n", + "CHPU_79_9\n", + "CHPU_80_3\n", + "TLAX_81_1\n", + "TLAX_81_5\n", + "TLAX_81_8\n", + "TLAX_81_11\n", + "TLAX_81_12\n", + "TLAX_81_13\n", + "SNRA_82_15\n", + "CHPU_268_7\n", + "CHPU_268_12\n", + "CHTX_291_6\n", + "CHTX_291_10\n", + "CHTX_291_11\n", + "CHTX_291_12\n", + "CHTX_291_14\n", + "CHAL_292_7\n", + "CHAL_292_15\n", + "CHAP_293_2\n", + "CHAP_293_4\n", + "CHAP_293_6\n", + "CHAP_293_7\n", + "CHAP_293_10\n", + "CHAP_293_14\n", + "CHAP_294_6\n", + "CHAP_295_7\n", + "TOLU_296_1\n", + "TOLU_296_4\n", + "TOLU_296_5\n", + "TOLU_296_8\n", + "TOLU_296_9\n", + "TOLU_296_12\n", + "TOLU_296_13\n", + "CHAP_297_9\n", + "CHAP_297_10\n", + "CHTX_298_2\n", + "CHTX_298_3\n", + "CHTX_298_8\n", + "CHTX_298_9\n", + "CHTX_298_10\n", + "CHTX_298_14\n", + "DURA_6_3\n", + "NDUR_9_14\n", + "NDUR_10_17\n", + "NDUR_10_19\n", + "NDUR_10_27\n", + "MDOB_14_15\n", + "CHUR_18_6\n", + "CHUR_18_12\n", + "CHUR_18_14\n", + "CHUR_18_18\n", + "CHUR_18_22\n", + "PENJ_19_4\n", + "PENJ_19_5\n", + "PENJ_19_14\n", + "PENJ_20_5\n", + "PENJ_20_14\n", + "MCUI_22_11\n", + "MCUI_22_12\n", + "MCUI_23_6\n", + "MORO_24_1\n", + "MORO_25_5\n", + "MORO_26_1\n", + "MORO_26_3\n", + "MORO_26_7\n", + "MORO_26_8\n", + "YURI_27_1\n", + "YURI_27_3\n", + "YURI_27_4\n", + "YURI_27_5\n", + "YURI_27_6\n", + "YURI_27_14\n", + "YURI_28_11\n", + "MVIJ_29_2\n", + "MVIJ_29_5\n", + "MVIJ_29_7\n", + "MVIJ_29_8\n", + "MVIJ_29_10\n", + "MVIJ_29_11\n", + "MVIJ_29_12\n", + "MVIJ_29_13\n", + "MVIJ_29_14\n", + "MVIJ_30_1\n", + "MVIJ_30_6\n", + "MVIJ_30_7\n", + "MVIJ_30_12\n", + "MORE_31_1\n", + "MORE_31_15\n", + "INDA_32_12\n", + "INDA_32_15\n", + "INDA_33_11\n", + "INDA_34_1\n", + "INDA_34_10\n", + "INDA_34_11\n", + "INDA_34_14\n", + "MORE_35_2\n", + "MORE_35_7\n", + "MORE_35_13\n", 
+ "MORE_35_15\n", + "TARI_36_1\n", + "TARI_36_9\n", + "TARI_36_10\n", + "TARI_36_11\n", + "TARI_37_2\n", + "TARI_37_12\n", + "TARI_37_13\n", + "TARI_37_14\n", + "TARI_37_15\n", + "MPUR_38_2\n", + "MPUR_38_3\n", + "MPUR_38_11\n", + "MPUR_38_13\n", + "MPUR_38_14\n", + "MPUR_39_2\n", + "MPUR_39_4\n", + "MPUR_39_5\n", + "MPUR_39_6\n", + "MPUR_39_8\n", + "MPUR_39_9\n", + "MPUR_39_12\n", + "MPUR_39_13\n", + "MPUR_40_2\n", + "MPUR_40_7\n", + "MPUR_40_10\n", + "MPUR_40_12\n", + "MPUR_40_13\n", + "MPUR_40_14\n", + "SJER_41_1\n", + "SJER_41_6\n", + "SJER_41_8\n", + "SJER_41_15\n", + "MZAM_42_1\n", + "MZAM_42_2\n", + "MZAM_42_3\n", + "MZAM_42_5\n", + "MORO_300_9\n", + "MORO_300_14\n", + "INDA_301_7\n", + "INDA_301_12\n", + "INDA_301_15\n", + "TARI_302_2\n", + "TARI_302_3\n", + "TARI_302_7\n", + "TARI_302_8\n", + "YURI_303_6\n", + "YURI_304_14\n", + "YURI_304_15\n", + "HUAN_329_2\n", + "HUAN_329_5\n", + "HUAN_330_4\n", + "HUAN_330_5\n", + "HUAN_330_9\n", + "HUAN_330_11\n", + "CHUR_331_1\n", + "CHUR_331_13\n", + "CHMI_332_2\n", + "CHMI_332_5\n", + "CHMI_332_6\n", + "CHMI_332_12\n", + "CHMI_332_13\n", + "MZAM_333_2\n", + "MZAM_333_3\n", + "MZAM_333_7\n", + "MZAM_333_9\n", + "MZAM_333_10\n", + "MZAM_333_13\n", + "SJER_334_2\n", + "SJER_334_5\n", + "SJER_334_6\n", + "SJER_334_12\n", + "SJER_334_14\n", + "SJER_334_15\n", + "SJER_336_3\n", + "SJER_336_4\n", + "NOBO_1_1\n", + "NOBO_1_10\n", + "NOBO_1_16\n", + "NOBO_1_20\n", + "NOBO_1_28\n", + "NOBO_3_8\n", + "NOBO_3_9\n", + "BGUA_83_5\n", + "BGUA_84_6\n", + "BGUA_84_8\n", + "BGUA_84_9\n", + "BGUA_84_12\n", + "BGUA_84_14\n", + "BGUA_85_1\n", + "BGUA_85_4\n", + "BGUA_85_5\n", + "BGUA_85_7\n", + "BGUA_86_5\n", + "BGUA_86_8\n", + "BGUA_86_12\n", + "BGUA_87_3\n", + "BGUA_87_6\n", + "BGUA_87_9\n", + "BGUA_87_17\n", + "BQUE_88_3\n", + "BGUA_90_13\n", + "BEJU_91_1\n", + "BEJU_91_2\n", + "BEJU_91_11\n", + "BEJU_91_12\n", + "BEJU_92_2\n", + "BEJU_93_14\n", + "BEJU_94_1\n", + "BEJU_95_4\n", + "BMOR_97_8\n", + "BMOR_97_11\n", + "BMOR_97_14\n", + 
"BMOR_97_15\n", + "BIXC_99_15\n", + "BIXC_100_6\n", + "BIXC_100_14\n", + "BZAC_102_3\n", + "BZAC_102_10\n", + "BZAC_106_1\n", + "BZAC_106_2\n", + "BZAC_106_4\n", + "BZAC_106_9\n", + "BZAC_108_14\n", + "BTEL_109_9\n", + "BZAC_110_9\n", + "BZAC_111_4\n", + "BZAC_111_11\n", + "BZAC_111_14\n", + "BTEL_113_7\n", + "BTEL_113_10\n", + "BTEL_113_11\n", + "BTEL_113_12\n", + "BAPX_114_13\n", + "BAPX_114_14\n", + "BOLI_116_10\n", + "BHUI_117_2\n", + "BHUI_117_4\n", + "BHUI_118_1\n", + "BHUI_118_7\n", + "BHUI_118_8\n", + "BMAZ_119_1\n", + "BMAZ_119_2\n", + "BMAZ_119_4\n", + "BMAZ_119_5\n", + "BMAZ_119_6\n", + "BMAZ_119_8\n", + "BMAZ_119_14\n", + "BMAZ_120_2\n", + "BMAZ_120_4\n", + "BMAZ_120_6\n", + "BMAZ_120_10\n", + "BMAZ_121_1\n", + "BMAZ_121_2\n", + "BMAZ_121_4\n", + "BMAZ_121_6\n", + "BMAZ_121_9\n", + "BMAZ_121_11\n", + "BMAZ_121_13\n", + "BMAZ_121_15\n", + "BCOL_122_1\n", + "BCOL_122_6\n", + "BCOL_122_8\n", + "BCOL_122_9\n", + "BCOL_124_10\n", + "BCOL_124_11\n", + "BCOL_124_13\n", + "BCOL_124_19\n", + "BCOL_124_20\n", + "BCOL_124_23\n", + "BCOL_124_25\n", + "BCOL_125_5\n", + "BMAZ_126_8\n", + "BMAZ_127_5\n", + "BMAZ_127_8\n", + "BMAZ_127_11\n", + "BMAZ_127_12\n", + "BMAZ_127_13\n", + "BMAZ_127_14\n", + "BMAZ_127_15\n", + "BSAU_128_4\n", + "BSAU_128_5\n", + "BSAU_128_9\n", + "BTEJ_129_11\n", + "BTEJ_130_2\n", + "BOTZ_133_12\n", + "BZUL_135_2\n", + "BZUL_135_6\n", + "BZUL_135_9\n", + "BZUL_135_10\n", + "BVBR_137_3\n", + "BVBR_138_1\n", + "BVBR_138_2\n", + "BVBR_138_3\n", + "BVBR_138_4\n", + "BVBR_138_5\n", + "BVBR_138_7\n", + "BVBR_138_8\n", + "BVBR_138_9\n", + "BVBR_138_10\n", + "BTAL_139_6\n", + "BTAL_139_8\n", + "BTAL_139_9\n", + "BTAL_139_15\n", + "BTAL_140_1\n", + "BTAL_140_3\n", + "BTAL_140_7\n", + "BTAL_140_12\n", + "BTAL_140_13\n", + "BTAL_140_15\n", + "BPCH_141_5\n", + "BPCH_142_4\n", + "BPCH_142_6\n", + "BPCH_142_7\n", + "BPCH_142_8\n", + "BPCH_142_10\n", + "BPCH_142_14\n", + "BPCH_143_3\n", + "BPCH_143_4\n", + "BPCH_143_5\n", + "BPCH_143_6\n", + "BPCH_143_7\n", + 
"BPCH_143_8\n", + "BPCH_143_9\n", + "BPCH_143_10\n", + "BPCH_143_11\n", + "BPCH_143_12\n", + "BPCH_143_13\n", + "BPCH_144_9\n", + "BPCH_145_13\n", + "BCAR_146_2\n", + "BCAR_146_3\n", + "BCAR_146_4\n", + "BCAR_146_5\n", + "BCAR_146_7\n", + "BCAR_146_8\n", + "BCAR_147_3\n", + "BCAR_147_6\n", + "BCAR_147_7\n", + "BCAR_147_9\n", + "BCAR_147_10\n", + "BCAR_147_11\n", + "BCAR_147_12\n", + "BCAR_147_13\n", + "BCAR_147_14\n", + "BCAR_147_15\n", + "BNOC_149_8\n", + "BNOC_149_9\n", + "BNOC_149_10\n", + "BNOC_149_12\n", + "BNOC_149_13\n", + "BNOC_149_15\n", + "BRED_150_4\n", + "BRED_150_5\n", + "BRED_150_6\n", + "BRED_150_7\n", + "BNOC_151_1\n", + "BNOC_151_3\n", + "BHUE_153_4\n", + "BHUE_153_5\n", + "BHUE_153_7\n", + "BHUE_153_8\n", + "BHUE_153_9\n", + "BTIQ_154_2\n", + "BTIQ_154_10\n", + "BTIQ_154_11\n", + "BHUE_156_5\n", + "BHUE_156_11\n", + "BHUE_156_13\n", + "BHUE_156_14\n", + "BHUE_156_22\n", + "BTIQ_158_7\n", + "BTUZ_159_12\n", + "BTUZ_160_4\n", + "BTUZ_160_9\n", + "BTUZ_160_10\n", + "BJUA_162_1\n", + "BJUA_162_2\n", + "BJUA_162_6\n", + "BJUA_162_9\n", + "BJUA_162_12\n", + "BJUA_162_13\n", + "BJUA_162_14\n", + "BJUA_163_4\n", + "BTZI_165_5\n", + "BTZI_165_13\n", + "BTAC_168_3\n", + "BTAC_168_10\n", + "BTAC_168_13\n", + "BTAC_169_12\n", + "BTAR_170_5\n", + "BTAR_173_1\n", + "BTAR_173_3\n", + "BOAX_175_29\n", + "BOAX_175_30\n", + "BMAN_176_10\n", + "BMAN_178_2\n", + "BMAN_178_6\n", + "BVPU_179_2\n", + "BVPU_179_5\n", + "BVPU_179_6\n", + "BVPU_179_10\n", + "BMAN_180_1\n", + "BMAN_180_2\n", + "BMAN_180_10\n", + "BMAN_180_13\n", + "BMAN_180_15\n", + "BMAN_180_18\n", + "BMAN_180_20\n", + "BVPU_181_9\n", + "BVPU_181_11\n", + "BVPU_181_15\n", + "BVPU_182_1\n", + "BVPU_182_2\n", + "BVPU_182_5\n", + "BVPU_182_6\n", + "BVPU_182_8\n", + "BVPU_182_9\n", + "BVPU_182_11\n", + "BVPU_182_12\n", + "BVPU_182_13\n", + "BVPU_182_14\n", + "BVPU_183_2\n", + "BVPU_183_9\n", + "BVPU_183_12\n", + "BVPU_184_15\n", + "BTCT_185_8\n", + "BTCT_185_9\n", + "BTCT_185_14\n", + "BTCT_186_3\n", + 
"BTCT_186_6\n", + "BTCT_186_8\n", + "BCAR_270_3\n", + "BCAR_270_11\n", + "BTAL_271_1\n", + "BTAL_271_2\n", + "BTAL_271_3\n", + "BTAL_271_5\n", + "BTAL_271_7\n", + "BTAL_271_8\n", + "BZUL_272_1\n", + "BZUL_272_4\n", + "BZUL_272_15\n", + "BTAL_273_6\n", + "BOTZ_274_1\n", + "BOTZ_274_15\n", + "BTEJ_275_5\n", + "BTEJ_275_6\n", + "BTEJ_275_9\n", + "BTEJ_275_10\n", + "BTEJ_275_11\n", + "BTEJ_275_13\n", + "BTEJ_275_14\n", + "BTEJ_276_13\n", + "BTEJ_276_14\n", + "BTEJ_276_15\n", + "BTZI_277_4\n", + "BTZI_277_5\n", + "BTZI_277_13\n", + "BTZI_277_14\n", + "BTZI_278_1\n", + "BTZI_279_1\n", + "BTZI_279_2\n", + "BTZI_279_6\n", + "BTZI_279_9\n", + "BTZI_279_13\n", + "BTZI_279_14\n", + "BNOC_280_1\n", + "BNOC_280_3\n", + "BNOC_280_7\n", + "BNOC_280_10\n", + "BTEJ_287_3\n", + "BTEJ_287_4\n", + "BTEJ_287_6\n", + "BTEJ_287_9\n", + "BTEJ_287_10\n", + "BTEJ_287_11\n", + "BTEJ_287_12\n", + "BTEJ_287_14\n", + "BTAR_306_3\n", + "BTAR_306_5\n", + "BTAR_306_9\n", + "BTAR_306_10\n", + "BTAR_306_12\n", + "BTUZ_307_2\n", + "BTUZ_307_3\n", + "BTUZ_307_7\n", + "BTAC_309_3\n", + "BTAC_309_7\n", + "BTAC_309_9\n", + "BTAC_309_11\n", + "BTAC_310_1\n", + "BEJU_311_2\n", + "BEJU_311_10\n", + "BEJU_312_10\n", + "BEJU_312_13\n", + "BVPU_313_6\n", + "BVPU_313_7\n", + "BVPU_313_10\n", + "BVPU_313_15\n", + "BVPU_314_1\n", + "BVPU_314_4\n", + "BVPU_315_11\n", + "BVPU_316_5\n", + "BVPU_316_11\n", + "BVPU_316_13\n", + "BHUI_317_2\n", + "BHUI_317_5\n", + "BHUI_317_11\n", + "BOLI_318_1\n", + "BCOL_319_2\n", + "BCOL_319_5\n", + "BCOL_319_8\n", + "BCOL_319_9\n", + "BCOL_319_12\n", + "BPCH_320_13\n", + "BPCH_320_15\n", + "BHUE_321_5\n", + "BHUE_321_6\n", + "BHUE_321_9\n", + "BNOC_322_2\n", + "BNOC_322_3\n", + "BNOC_322_5\n", + "BNOC_322_8\n", + "BNOC_322_9\n", + "BNOC_322_11\n", + "BTAR_324_1\n", + "BTAR_324_2\n", + "BTAR_324_3\n", + "BTAR_324_12\n", + "BHUE_325_6\n", + "BHUE_325_7\n", + "BHUE_325_9\n", + "BHUE_325_13\n", + "BHUE_325_14\n", + "BHUE_325_15\n", + "BCAR_326_1\n", + "BOTZ_327_10\n", + 
"BOTZ_327_13\n", + "BOTZ_327_14\n", + "BTEJ_328_6\n", + "BTEJ_328_9\n", + "BTEJ_328_11\n", + "BTEJ_328_12\n", + "BTEJ_328_13\n", + "BGUA_335_2\n", + "ZLOX_193_25\n", + "ZLOX_193_26\n", + "ZLOX_286_1\n", + "ZLAB_189_4\n", + "ZLAB_189_5\n", + "ZLAB_189_6\n", + "ZLAB_189_14\n", + "ZLJU_190_4\n", + "ZLJU_190_8\n", + "ZLAB_281_4\n", + "ZLAB_281_7\n", + "ZLAB_281_8\n", + "ZLAB_281_9\n", + "ZLAB_282_1\n", + "ZLAB_282_3\n", + "ZLAB_282_4\n", + "ZLAB_282_5\n", + "ZLAB_282_7\n", + "ZLAB_282_8\n", + "ZLAB_282_9\n", + "ZLAB_282_10\n", + "ZLAB_282_11\n", + "ZLAB_282_12\n", + "ZLAB_282_13\n", + "ZLAB_282_14\n", + "ZLAB_282_15\n", + "ZLJA_283_5\n", + "ZLJA_283_13\n", + "ZPJA_198_1\n", + "ZPJA_198_6\n", + "ZPJA_198_15\n", + "ZPJA_199_3\n", + "ZPJA_199_26\n", + "ZPMI_200_3\n", + "ZPMI_200_8\n", + "ZPMI_200_17\n", + "ZPMI_200_22\n", + "ZPMI_288_7\n", + "ZDNA_11_10\n", + "ZDNA_11_13\n", + "ZDNA_11_14\n", + "ZDNA_12_1\n", + "ZDNA_12_13\n", + "ZDNA_13_13\n", + "ZDNA_13_24\n", + "ZDNA_13_30\n", + "ZDJA_195_14\n", + "ZDJA_196_1\n", + "ZDJA_196_2\n", + "ZDJA_196_3\n", + "ZDJA_196_5\n", + "ZDJA_196_13\n", + "ZDJA_196_14\n", + "ZDJA_197_2\n", + "ZDJA_197_3\n", + "ZDJA_197_4\n", + "ZDJA_197_6\n", + "ZDJA_197_7\n", + "ZDJA_197_13\n", + "ZDJA_197_14\n", + "ZDJA_197_17\n", + "ZDJA_197_18\n", + "ZDJA_197_19\n", + "ZDJA_197_20\n", + "ZDJA_197_22\n", + "ZDJA_197_23\n", + "ZDJA_197_24\n", + "ZDJA_197_25\n", + "ZDJA_197_27\n", + "ZDNA_262_13\n", + "ZDNA_263_4\n", + "ZDNA_264_7\n", + "ZDNA_264_15\n", + "ZDJA_285_4\n", + "HUEH_187_7\n", + "HUEH_187_11\n", + "HUEH_187_12\n", + "HUEH_187_14\n", + "HUEH_187_15\n", + "HUEH_188_10\n", + "NICA_192_5\n", + "NICA_192_8\n", + "NICA_192_9\n", + "NICA_192_10\n", + "NICA_192_11\n", + "NICA_192_12\n", + "NICA_192_13\n", + "NICA_192_15\n", + "NICA_194_1\n", + "NICA_194_2\n", + "Total phenotype accessions: 4153\n", + "genotype accessions in phenotype dataset: 3455\n", + "Total phenotypes not in genotype file: 698\n" + ] + } + ], + "source": [ + "# display phenotypic 
accessions lacking genotype data\n", + "c = 0\n", + "unmatched_count = 0\n", + "unmatched_pheno = []\n", + "\n", + "print('List of accessesions not in the phenotype data:\\n')\n", + "\n", + "for acce in phen_ids:\n", + " if acce not in geno_ids:\n", + " print(acce)\n", + " unmatched_pheno.append(acce)\n", + " unmatched_count+=1\n", + " else: \n", + " c+=1\n", + "print('Total phenotype accessions: ', len(phen_ids))\n", + "print('genotype accessions in phenotype dataset: ', c)\n", + "print('Total phenotypes not in genotype file: ', unmatched_count) " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c167f996-f8e7-4cff-b764-9b1cc926758d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of genotyped accessions without phenotype data: 149\n" + ] + } + ], + "source": [ + "print('Number of genotyped accessions without phenotype data: ', len(unmatched_genotype))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "72b5a0d4-2f68-4dcd-be73-08bf2d16e799", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of phenotyped accessions without genotype data: 698\n" + ] + } + ], + "source": [ + "print ('Number of phenotyped accessions without genotype data: ' ,len(unmatched_pheno))" + ] + }, + { + "cell_type": "markdown", + "id": "a2c74614", + "metadata": {}, + "source": [ + "- Filter all taxa files" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "8105da62-7e4b-4c60-a9db-432b8dbf84d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr 
style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>classification</th>\n", + " <th>individual_idlocal</th>\n", + " <th>accession_idlocal</th>\n", + " <th>state</th>\n", + " <th>taxa</th>\n", + " <th>altitude</th>\n", + " <th>latitude</th>\n", + " <th>longitude</th>\n", + " <th>plant_height</th>\n", + " <th>leaf_width</th>\n", + " <th>...</th>\n", + " <th>bio_10</th>\n", + " <th>bio_11</th>\n", + " <th>bio_12</th>\n", + " <th>bio_13</th>\n", + " <th>bio_14</th>\n", + " <th>bio_15</th>\n", + " <th>bio_16</th>\n", + " <th>bio_17</th>\n", + " <th>bio_18</th>\n", + " <th>bio_19</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Zea mays mexicana - Chalco</td>\n", + " <td>45_1</td>\n", + " <td>CIM27471</td>\n", + " <td>Michoacán</td>\n", + " <td>CHGO_45_1</td>\n", + " <td>2086</td>\n", + " <td>19.68</td>\n", + " <td>-100.60</td>\n", + " <td>193.0</td>\n", + " <td>4.5</td>\n", + " <td>...</td>\n", + " <td>17.8</td>\n", + " <td>12.8</td>\n", + " <td>969</td>\n", + " <td>213</td>\n", + " <td>9</td>\n", + " <td>93</td>\n", + " <td>577</td>\n", + " <td>39</td>\n", + " <td>422</td>\n", + " <td>57</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Zea mays mexicana - Chalco</td>\n", + " <td>45_2</td>\n", + " <td>CIM27471</td>\n", + " <td>Michoacán</td>\n", + " <td>CHGO_45_2</td>\n", + " <td>2086</td>\n", + " <td>19.68</td>\n", + " <td>-100.60</td>\n", + " <td>238.0</td>\n", + " <td>6.2</td>\n", + " <td>...</td>\n", + " <td>17.8</td>\n", + " <td>12.8</td>\n", + " <td>969</td>\n", + " <td>213</td>\n", + " <td>9</td>\n", + " <td>93</td>\n", + " <td>577</td>\n", + " <td>39</td>\n", + " <td>422</td>\n", + " <td>57</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Zea mays mexicana - Chalco</td>\n", + " <td>45_3</td>\n", + " <td>CIM27471</td>\n", + " <td>Michoacán</td>\n", + " <td>CHGO_45_3</td>\n", + " <td>2086</td>\n", + " <td>19.68</td>\n", + " <td>-100.60</td>\n", + " <td>251.0</td>\n", + " 
<td>5.0</td>\n", + " <td>...</td>\n", + " <td>17.8</td>\n", + " <td>12.8</td>\n", + " <td>969</td>\n", + " <td>213</td>\n", + " <td>9</td>\n", + " <td>93</td>\n", + " <td>577</td>\n", + " <td>39</td>\n", + " <td>422</td>\n", + " <td>57</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Zea mays mexicana - Chalco</td>\n", + " <td>45_5</td>\n", + " <td>CIM27471</td>\n", + " <td>Michoacán</td>\n", + " <td>CHGO_45_5</td>\n", + " <td>2086</td>\n", + " <td>19.68</td>\n", + " <td>-100.60</td>\n", + " <td>295.0</td>\n", + " <td>6.2</td>\n", + " <td>...</td>\n", + " <td>17.8</td>\n", + " <td>12.8</td>\n", + " <td>969</td>\n", + " <td>213</td>\n", + " <td>9</td>\n", + " <td>93</td>\n", + " <td>577</td>\n", + " <td>39</td>\n", + " <td>422</td>\n", + " <td>57</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>Zea mays mexicana - Chalco</td>\n", + " <td>45_7</td>\n", + " <td>CIM27471</td>\n", + " <td>Michoacán</td>\n", + " <td>CHGO_45_7</td>\n", + " <td>2086</td>\n", + " <td>19.68</td>\n", + " <td>-100.60</td>\n", + " <td>251.0</td>\n", + " <td>4.6</td>\n", + " <td>...</td>\n", + " <td>17.8</td>\n", + " <td>12.8</td>\n", + " <td>969</td>\n", + " <td>213</td>\n", + " <td>9</td>\n", + " <td>93</td>\n", + " <td>577</td>\n", + " <td>39</td>\n", + " <td>422</td>\n", + " <td>57</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4148</th>\n", + " <td>Zea nicaraguensis</td>\n", + " <td>194_11</td>\n", + " <td>CIM27487</td>\n", + " <td>Chinandega</td>\n", + " <td>NICA_194_11</td>\n", + " 
<td>9</td>\n", + " <td>12.89</td>\n", + " <td>-86.98</td>\n", + " <td>343.0</td>\n", + " <td>5.5</td>\n", + " <td>...</td>\n", + " <td>29.4</td>\n", + " <td>26.9</td>\n", + " <td>1718</td>\n", + " <td>365</td>\n", + " <td>1</td>\n", + " <td>94</td>\n", + " <td>950</td>\n", + " <td>8</td>\n", + " <td>262</td>\n", + " <td>433</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4149</th>\n", + " <td>Zea nicaraguensis</td>\n", + " <td>194_12</td>\n", + " <td>CIM27487</td>\n", + " <td>Chinandega</td>\n", + " <td>NICA_194_12</td>\n", + " <td>9</td>\n", + " <td>12.89</td>\n", + " <td>-86.98</td>\n", + " <td>334.0</td>\n", + " <td>6.0</td>\n", + " <td>...</td>\n", + " <td>29.4</td>\n", + " <td>26.9</td>\n", + " <td>1718</td>\n", + " <td>365</td>\n", + " <td>1</td>\n", + " <td>94</td>\n", + " <td>950</td>\n", + " <td>8</td>\n", + " <td>262</td>\n", + " <td>433</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4150</th>\n", + " <td>Zea nicaraguensis</td>\n", + " <td>194_13</td>\n", + " <td>CIM27487</td>\n", + " <td>Chinandega</td>\n", + " <td>NICA_194_13</td>\n", + " <td>9</td>\n", + " <td>12.89</td>\n", + " <td>-86.98</td>\n", + " <td>397.0</td>\n", + " <td>5.5</td>\n", + " <td>...</td>\n", + " <td>29.4</td>\n", + " <td>26.9</td>\n", + " <td>1718</td>\n", + " <td>365</td>\n", + " <td>1</td>\n", + " <td>94</td>\n", + " <td>950</td>\n", + " <td>8</td>\n", + " <td>262</td>\n", + " <td>433</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4151</th>\n", + " <td>Zea nicaraguensis</td>\n", + " <td>194_14</td>\n", + " <td>CIM27487</td>\n", + " <td>Chinandega</td>\n", + " <td>NICA_194_14</td>\n", + " <td>9</td>\n", + " <td>12.89</td>\n", + " <td>-86.98</td>\n", + " <td>254.0</td>\n", + " <td>5.0</td>\n", + " <td>...</td>\n", + " <td>29.4</td>\n", + " <td>26.9</td>\n", + " <td>1718</td>\n", + " <td>365</td>\n", + " <td>1</td>\n", + " <td>94</td>\n", + " <td>950</td>\n", + " <td>8</td>\n", + " <td>262</td>\n", + " <td>433</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4152</th>\n", + " <td>Zea 
nicaraguensis</td>\n", + " <td>194_15</td>\n", + " <td>CIM27487</td>\n", + " <td>Chinandega</td>\n", + " <td>NICA_194_15</td>\n", + " <td>9</td>\n", + " <td>12.89</td>\n", + " <td>-86.98</td>\n", + " <td>175.0</td>\n", + " <td>4.0</td>\n", + " <td>...</td>\n", + " <td>29.4</td>\n", + " <td>26.9</td>\n", + " <td>1718</td>\n", + " <td>365</td>\n", + " <td>1</td>\n", + " <td>94</td>\n", + " <td>950</td>\n", + " <td>8</td>\n", + " <td>262</td>\n", + " <td>433</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3455 rows × 263 columns</p>\n", + "</div>" + ], + "text/plain": [ + " classification individual_idlocal accession_idlocal \\\n", + "0 Zea mays mexicana - Chalco 45_1 CIM27471 \n", + "1 Zea mays mexicana - Chalco 45_2 CIM27471 \n", + "2 Zea mays mexicana - Chalco 45_3 CIM27471 \n", + "4 Zea mays mexicana - Chalco 45_5 CIM27471 \n", + "6 Zea mays mexicana - Chalco 45_7 CIM27471 \n", + "... ... ... ... \n", + "4148 Zea nicaraguensis 194_11 CIM27487 \n", + "4149 Zea nicaraguensis 194_12 CIM27487 \n", + "4150 Zea nicaraguensis 194_13 CIM27487 \n", + "4151 Zea nicaraguensis 194_14 CIM27487 \n", + "4152 Zea nicaraguensis 194_15 CIM27487 \n", + "\n", + " state taxa altitude latitude longitude plant_height \\\n", + "0 Michoacán CHGO_45_1 2086 19.68 -100.60 193.0 \n", + "1 Michoacán CHGO_45_2 2086 19.68 -100.60 238.0 \n", + "2 Michoacán CHGO_45_3 2086 19.68 -100.60 251.0 \n", + "4 Michoacán CHGO_45_5 2086 19.68 -100.60 295.0 \n", + "6 Michoacán CHGO_45_7 2086 19.68 -100.60 251.0 \n", + "... ... ... ... ... ... ... \n", + "4148 Chinandega NICA_194_11 9 12.89 -86.98 343.0 \n", + "4149 Chinandega NICA_194_12 9 12.89 -86.98 334.0 \n", + "4150 Chinandega NICA_194_13 9 12.89 -86.98 397.0 \n", + "4151 Chinandega NICA_194_14 9 12.89 -86.98 254.0 \n", + "4152 Chinandega NICA_194_15 9 12.89 -86.98 175.0 \n", + "\n", + " leaf_width ... bio_10 bio_11 bio_12 bio_13 bio_14 bio_15 bio_16 \\\n", + "0 4.5 ... 17.8 12.8 969 213 9 93 577 \n", + "1 6.2 ... 
17.8 12.8 969 213 9 93 577 \n", + "2 5.0 ... 17.8 12.8 969 213 9 93 577 \n", + "4 6.2 ... 17.8 12.8 969 213 9 93 577 \n", + "6 4.6 ... 17.8 12.8 969 213 9 93 577 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "4148 5.5 ... 29.4 26.9 1718 365 1 94 950 \n", + "4149 6.0 ... 29.4 26.9 1718 365 1 94 950 \n", + "4150 5.5 ... 29.4 26.9 1718 365 1 94 950 \n", + "4151 5.0 ... 29.4 26.9 1718 365 1 94 950 \n", + "4152 4.0 ... 29.4 26.9 1718 365 1 94 950 \n", + "\n", + " bio_17 bio_18 bio_19 \n", + "0 39 422 57 \n", + "1 39 422 57 \n", + "2 39 422 57 \n", + "4 39 422 57 \n", + "6 39 422 57 \n", + "... ... ... ... \n", + "4148 8 262 433 \n", + "4149 8 262 433 \n", + "4150 8 262 433 \n", + "4151 8 262 433 \n", + "4152 8 262 433 \n", + "\n", + "[3455 rows x 263 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Remove unmatched accessions from the phenotype file\n", + "filtered_pheno = phen.loc[~phen['taxa'].isin(unmatched_pheno)].drop(\"Unnamed: 0\", axis=1)\n", + "filtered_pheno" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2a05b2cb-d8b0-4548-a782-49cc69372217", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>rs#</th>\n", + " <th>alleles</th>\n", + " <th>chrom</th>\n", + " <th>pos</th>\n", + " <th>strand</th>\n", + " <th>assembly#</th>\n", + " <th>center</th>\n", + " <th>protLSID</th>\n", + " <th>assayLSID</th>\n", + " <th>panelLSID</th>\n", + " <th>...</th>\n", + " <th>NDUR_9_15</th>\n", + " <th>NDUR_9_1</th>\n", + " <th>NDUR_9_2</th>\n", 
+ " <th>NDUR_9_3</th>\n", + " <th>NDUR_9_4</th>\n", + " <th>NDUR_9_5</th>\n", + " <th>NDUR_9_6</th>\n", + " <th>NDUR_9_7</th>\n", + " <th>NDUR_9_8</th>\n", + " <th>NDUR_9_9</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>S1_992727</td>\n", + " <td>T/C</td>\n", + " <td>1</td>\n", + " <td>992727</td>\n", + " <td>+</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>...</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>S1_1005413</td>\n", + " <td>C/G</td>\n", + " <td>1</td>\n", + " <td>1005413</td>\n", + " <td>+</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>...</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>S1_1763292</td>\n", + " <td>T/C</td>\n", + " <td>1</td>\n", + " <td>1763292</td>\n", + " <td>+</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>...</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " <td>TT</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>S1_1763397</td>\n", + " <td>A/G</td>\n", + " <td>1</td>\n", + " <td>1763397</td>\n", + " <td>+</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>...</td>\n", + " <td>AA</td>\n", + " <td>AA</td>\n", + " 
<td>AA</td>\n", + " <td>AA</td>\n", + " <td>AA</td>\n", + " <td>AA</td>\n", + " <td>AA</td>\n", + " <td>AA</td>\n", + " <td>AA</td>\n", + " <td>AA</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>S1_1780412</td>\n", + " <td>G/T</td>\n", + " <td>1</td>\n", + " <td>1780412</td>\n", + " <td>+</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>...</td>\n", + " <td>GG</td>\n", + " <td>NN</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>NN</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33924</th>\n", + " <td>S10_149309490</td>\n", + " <td>C/G</td>\n", + " <td>10</td>\n", + " <td>149309490</td>\n", + " <td>+</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>...</td>\n", + " <td>CG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33925</th>\n", + " <td>S10_149390708</td>\n", + " <td>G/T</td>\n", + " <td>10</td>\n", + " <td>149390708</td>\n", + " <td>+</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>...</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + 
" <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33926</th>\n", + " <td>S10_149557920</td>\n", + " <td>G/T</td>\n", + " <td>10</td>\n", + " <td>149557920</td>\n", + " <td>+</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>...</td>\n", + " <td>TT</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>NN</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33927</th>\n", + " <td>S10_149596545</td>\n", + " <td>C/G</td>\n", + " <td>10</td>\n", + " <td>149596545</td>\n", + " <td>+</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>...</td>\n", + " <td>CG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " <td>GG</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33928</th>\n", + " <td>S10_149597102</td>\n", + " <td>C/G</td>\n", + " <td>10</td>\n", + " <td>149597102</td>\n", + " <td>+</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>...</td>\n", + " <td>CC</td>\n", + " <td>CC</td>\n", + " <td>CC</td>\n", + " <td>CC</td>\n", + " <td>CC</td>\n", + " <td>CC</td>\n", + " <td>CC</td>\n", + " <td>CC</td>\n", + " <td>CC</td>\n", + " <td>CC</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>33929 rows × 3466 columns</p>\n", + "</div>" + ], + "text/plain": [ + " rs# alleles chrom pos strand assembly# center \\\n", + "0 S1_992727 T/C 1 992727 + NaN NaN \n", + "1 S1_1005413 C/G 1 1005413 + NaN NaN \n", + "2 S1_1763292 T/C 1 1763292 + NaN NaN \n", + "3 S1_1763397 A/G 1 1763397 + NaN NaN \n", + "4 S1_1780412 G/T 1 1780412 + NaN NaN 
\n", + "... ... ... ... ... ... ... ... \n", + "33924 S10_149309490 C/G 10 149309490 + NaN NaN \n", + "33925 S10_149390708 G/T 10 149390708 + NaN NaN \n", + "33926 S10_149557920 G/T 10 149557920 + NaN NaN \n", + "33927 S10_149596545 C/G 10 149596545 + NaN NaN \n", + "33928 S10_149597102 C/G 10 149597102 + NaN NaN \n", + "\n", + " protLSID assayLSID panelLSID ... NDUR_9_15 NDUR_9_1 NDUR_9_2 \\\n", + "0 NaN NaN NaN ... TT TT TT \n", + "1 NaN NaN NaN ... GG GG GG \n", + "2 NaN NaN NaN ... TT TT TT \n", + "3 NaN NaN NaN ... AA AA AA \n", + "4 NaN NaN NaN ... GG NN GG \n", + "... ... ... ... ... ... ... ... \n", + "33924 NaN NaN NaN ... CG GG GG \n", + "33925 NaN NaN NaN ... GG GG GG \n", + "33926 NaN NaN NaN ... TT GG GG \n", + "33927 NaN NaN NaN ... CG GG GG \n", + "33928 NaN NaN NaN ... CC CC CC \n", + "\n", + " NDUR_9_3 NDUR_9_4 NDUR_9_5 NDUR_9_6 NDUR_9_7 NDUR_9_8 NDUR_9_9 \n", + "0 TT TT TT TT TT TT TT \n", + "1 GG GG GG GG GG GG GG \n", + "2 TT TT TT TT TT TT TT \n", + "3 AA AA AA AA AA AA AA \n", + "4 GG GG NN GG GG GG GG \n", + "... ... ... ... ... ... ... ... 
\n", + "33924 GG GG GG GG GG GG GG \n", + "33925 GG GG GG GG GG GG GG \n", + "33926 GG GG GG GG NN GG GG \n", + "33927 GG GG GG GG GG GG GG \n", + "33928 CC CC CC CC CC CC CC \n", + "\n", + "[33929 rows x 3466 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Remove unmatched accessions from the genotype file#\n", + "geno = pd.read_table(\"../data/T3606_33929_hapmap.hmp.renamed_accessions.txt\")\n", + "filtered_geno = geno.drop(unmatched_genotype, axis=1)\n", + "filtered_geno" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "ad980d61-c29b-4e68-924d-2e5100552955", + "metadata": {}, + "outputs": [], + "source": [ + "# create new all taxa genotype hapmap file with 3455 accessions\n", + "filtered_geno.to_csv(\"../data/gwas_data_3455_accessions/genotype/all_taxa_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n", + "\n", + "# create new all taxa phenotype csv with 3455 accessions\n", + "filtered_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/all_taxa_3455_accessions.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "72f143e6-252a-4b99-97f7-b492ed55fb92", + "metadata": {}, + "source": [ + "- Per taxa filter" + ] + }, + { + "cell_type": "markdown", + "id": "186bc800", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "6b747ebb-db50-4619-a9db-1d3680b701ed", + "metadata": {}, + "outputs": [], + "source": [ + "# get ids for each taxa in individual lists from phenotype files\n", + "\n", + "hapmap_fields = filtered_geno.columns.to_list()[0:11] # hapmap format compulsory fields\n", + "\n", + "# parviglumis\n", + "parv_pheno = filtered_pheno.loc[phen['classification'] == 'Zea mays parviglumis - Balsas']\n", + "parv_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/parv_pheno_3455_accessions.csv\", index=False)\n", + "\n", + "parv_pheno_ids = hapmap_fields + parv_pheno['taxa'].to_list()\n", 
+ "filtered_geno[parv_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/parv_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n", + "\n", + "\n", + "# mexicana\n", + "mex_chalco_pheno = filtered_pheno.loc[phen['classification'] == 'Zea mays mexicana - Chalco']\n", + "mex_chalco_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/mex_chalco_pheno_3455_accessions.csv\", index=False)\n", + "\n", + "mex_chalco_pheno_ids = hapmap_fields + mex_chalco_pheno['taxa'].to_list()\n", + "filtered_geno[mex_chalco_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/mex_chalco_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n", + "\n", + "\n", + "mex_durango_pheno = filtered_pheno.loc[phen['classification'] == 'Zea mays mexicana - Durango']\n", + "mex_durango_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/mex_durango_pheno_3455_accessions.csv\", index=False)\n", + "\n", + "mex_durango_pheno_ids = hapmap_fields + mex_durango_pheno['taxa'].to_list()\n", + "filtered_geno[mex_durango_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/mex_durango_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n", + "\n", + "\n", + "mex_mesa_central_pheno = filtered_pheno.loc[phen['classification'] == 'Zea mays mexicana - Mesa Central']\n", + "mex_mesa_central_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/mex_mesa_central_pheno_3455_accessions.csv\", index=False)\n", + "\n", + "mex_mesa_central_pheno_ids = hapmap_fields + mex_mesa_central_pheno['taxa'].to_list()\n", + "filtered_geno[mex_mesa_central_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/mex_mesa_central_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n", + "\n", + "\n", + "mex_nobogame_pheno = filtered_pheno.loc[phen['classification'] == 'Zea mays mexicana - Nobogame']\n", + "mex_nobogame_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/mex_nobogame_pheno_3455_accessions.csv\", index=False)\n", + 
"\n", + "mex_nobogame_pheno_ids = hapmap_fields + mex_nobogame_pheno['taxa'].to_list()\n", + "filtered_geno[mex_nobogame_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/mex_nobogame_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n", + "\n", + "\n", + "mex_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea mays mexicana')]\n", + "#mex_pheno = filtered_pheno[filtered_pheno['classsification'].str.match('Zea mays mexicana')]\n", + "mex_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/mex_pheno_3455_accessions.csv\", index=False)\n", + "\n", + "mex_pheno_ids = hapmap_fields + mex_pheno['taxa'].to_list()\n", + "filtered_geno[mex_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/mex_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n", + "\n", + "\n", + "# luxurians\n", + "lux_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea luxurians')]\n", + "lux_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/lux_pheno_3455_accessions.csv\", index=False)\n", + "\n", + "lux_pheno_ids = hapmap_fields + lux_pheno['taxa'].to_list()\n", + "filtered_geno[lux_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/lux_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n", + "\n", + "\n", + "# Perennis\n", + "per_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea perennis') ]\n", + "per_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/per_pheno_3455_accessions.csv\", index=False)\n", + "\n", + "per_pheno_ids = hapmap_fields + per_pheno['taxa'].to_list()\n", + "filtered_geno[per_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/per_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n", + "\n", + "\n", + "# diploperennis\n", + "diplo_per_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea diploperennis') ]\n", + 
"diplo_per_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/diplo_per_pheno_3455_accessions.csv\", index=False)\n", + "\n", + "diplo_per_pheno_ids = hapmap_fields + diplo_per_pheno['taxa'].to_list()\n", + "filtered_geno[diplo_per_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/diplo_per_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n", + "\n", + "\n", + "# huehuetanangensis\n", + "hue_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea mays huehuetenangensis') ]\n", + "hue_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/hue_pheno_3455_accessions.csv\", index=False)\n", + "\n", + "hue_pheno_ids = hapmap_fields + hue_pheno['taxa'].to_list()\n", + "filtered_geno[hue_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/hue_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n", + "\n", + "\n", + "# Nicaraguensis\n", + "nica_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea nicaraguensis') ]\n", + "nica_pheno.to_csv(\"../data/gwas_data_3455_accessions/phenotype/nica_pheno_3455_accessions.csv\", index=False)\n", + "\n", + "nica_pheno_ids = hapmap_fields + nica_pheno['taxa'].to_list()\n", + "filtered_geno[nica_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/nica_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "id": "5b53034c", + "metadata": {}, + "outputs": [], + "source": [ + "# Perennis and diploperennis subset\n", + "per_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea perennis') ]\n", + "diplo_per_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea diploperennis') ]\n", + "\n", + "# combine the two frames\n", + "frames_dper_per = [diplo_per_pheno,per_pheno]\n", + "diplo_plus_perennis = pd.concat(frames_dper_per)\n", + "\n", + "diplo_plus_perennis.to_csv(\"../data/gwas_data_3455_accessions/phenotype/diplo_plus_perennis_pheno_3455_accessions.csv\", 
index=False)\n", + "#diplo_plus_perennis\n", + "\n", + "\n", + "diplo_plus_per_pheno_ids = hapmap_fields + diplo_plus_perennis['taxa'].to_list()\n", + "#diplo_plus_per_pheno_ids\n", + "filtered_geno[diplo_plus_per_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/diplo_plus_perennis_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "id": "8fe20e4f", + "metadata": {}, + "outputs": [], + "source": [ + "# luxurians and nicaraguensis subset\n", + "lux_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea luxurians') ]\n", + "nica_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea nicaraguensis') ]\n", + "\n", + "# combine the two frames\n", + "frames_lux_nica = [lux_pheno,nica_pheno]\n", + "lux_plus_nica = pd.concat(frames_lux_nica)\n", + "\n", + "lux_plus_nica.to_csv(\"../data/gwas_data_3455_accessions/phenotype/lux_plus_nica_pheno_3455_accessions.csv\", index=False)\n", + "#lux_plus_nica\n", + "\n", + "\n", + "lux_nica_pheno_ids = hapmap_fields + lux_plus_nica['taxa'].to_list()\n", + "#lux_nica_pheno_ids\n", + "filtered_geno[lux_nica_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/lux_plus_nica_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "id": "6f5e7eba", + "metadata": {}, + "outputs": [], + "source": [ + "# parviglumis and huehuetenanguensis subset\n", + "parv_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea mays parviglumis - Balsas') ]\n", + "hue_pheno = filtered_pheno.loc[phen['classification'].str.match('Zea mays huehuetenangensis') ]\n", + "\n", + "# # combine the two frames\n", + "frames_parv_hue = [parv_pheno,hue_pheno]\n", + "parv_plus_hue = pd.concat(frames_parv_hue)\n", + "\n", + "parv_plus_hue.to_csv(\"../data/gwas_data_3455_accessions/phenotype/parv_plus_hue_pheno_3455_accessions.csv\", index=False)\n", + "\n", + "\n", + 
"parv_hue_pheno_ids = hapmap_fields + parv_plus_hue['taxa'].to_list()\n", + "#parv_hue_pheno_ids\n", + "filtered_geno[parv_hue_pheno_ids].to_csv(\"../data/gwas_data_3455_accessions/genotype/parv_plus_hue_geno_3455_accessions.hapmap.txt\", sep=\"\\t\", index=False)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5be98b4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.5 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5 (default, Jun 4 2021, 12:28:51) \n[GCC 7.5.0]" + }, + "vscode": { + "interpreter": { + "hash": "c0e123d1fea197aa518e3197211374e8a785689805234020a499f6375b71c087" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/README.md b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/README.md new file mode 100644 index 0000000000..4633751d36 --- /dev/null +++ b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/README.md @@ -0,0 +1,4 @@ +# The directory contains scripts used to: + +- convert genotype hapmap file to other file formats (h5, hapmap, vcf and plink) +- impute missing snps with heterozygous snps at the respective positions diff --git a/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.py b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.py new file mode 100755 index 0000000000..79c86edd9d --- /dev/null +++ b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.py @@ -0,0 +1,54 @@ +#!/usr/bin/python3 + +import sys, getopt + +def main(argv): + inputfile = '' + outputfile = '' + try: + opts, args = 
getopt.getopt(argv,"hi:o:",["ifile=","ofile="]) + except getopt.GetoptError: + print ('hapmap2numeirc.py -i <inputfile> -o <outputfile>') + sys.exit(2) + for opt, arg in opts: + if opt == '-h': + print ('hapmap2numeirc.py -i <inputfile> -o <outputfile>') + sys.exit() + elif opt in ("-i", "--ifile"): + inputfile = arg + elif opt in ("-o", "--ofile"): + outputfile = arg + print ('Input file is "', inputfile) + print ('Output file is "', outputfile) + + + with open(inputfile, 'r') as hap: + with open(outputfile, 'w') as num_hap: + for l in hap: + line = l.split() + if line[0] == 'rs#': + header = line + num_hap.writelines(('\t').join(header)) + num_hap.writelines('\n') + continue + else: + ref_alle = line[1].split('/')[0] + alt_alle = line[1].split('/')[1] + + tr_line = [snp.replace(ref_alle+ref_alle, '1') for snp in line] + tr_line = [snp.replace('NN', 'NA') for snp in tr_line] + tr_line = [snp.replace(alt_alle+alt_alle, '-1') for snp in tr_line] + tr_line = [snp.replace(alt_alle+ref_alle, '0') for snp in tr_line] + tr_line = [snp.replace(ref_alle+alt_alle, '0') for snp in tr_line] + tr_line[5:5] = ['NA', 'NA', 'NA', 'NA', 'NA', 'NA' ] + + num_hap.writelines('\t'.join(tr_line)) + num_hap.writelines('\n') + + +if __name__ == "__main__": + main(sys.argv[1:]) + + + + diff --git a/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.sh b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.sh new file mode 100644 index 0000000000..53501821ae --- /dev/null +++ b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/hapmap2numeric.sh @@ -0,0 +1,13 @@ +./hapmap2numeric.py -i ../../../data/gwas_data_3455_accessions/genotype/parv_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_numeric/parv_numeric.hapmap.txt +./hapmap2numeric.py -i ../../../data/gwas_data_3455_accessions/genotype/parv_plus_hue_geno_3455_accessions.hapmap.txt -o 
../../../data/gwas_data_3455_accessions/genotype_numeric/parv_plus_hue_numeric.hapmap.txt +./hapmap2numeric.py -i ../../../data/gwas_data_3455_accessions/genotype/mex_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_numeric/mex_numeric.hapmap.txt +./hapmap2numeric.py -i ../../../data/gwas_data_3455_accessions/genotype/mex_mesa_central_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_numeric/mex_mesa_numeric.hapmap.txt +./hapmap2numeric.py -i ../../../data/gwas_data_3455_accessions/genotype/mex_chalco_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_numeric/mex_chalco_numeric.hapmap.txt +./hapmap2numeric.py -i ../../../data/gwas_data_3455_accessions/genotype/mex_durango_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_numeric/mex_durango_numeric.hapmap.txt +./hapmap2numeric.py -i ../../../data/gwas_data_3455_accessions/genotype/mex_nobogame_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_numeric/mex_nobogame_numeric.hapmap.txt +./hapmap2numeric.py -i ../../../data/gwas_data_3455_accessions/genotype/diplo_per_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_numeric/diplo_per_numeric.hapmap.txt +./hapmap2numeric.py -i ../../../data/gwas_data_3455_accessions/genotype/diplo_plus_perennis_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_numeric/diplo_plus_perennis_numeric.hapmap.txt +./hapmap2numeric.py -i ../../../data/gwas_data_3455_accessions/genotype/per_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_numeric/per_numeric.hapmap.txt +./hapmap2numeric.py -i ../../../data/gwas_data_3455_accessions/genotype/hue_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_numeric/hue_numeric.hapmap.txt +./hapmap2numeric.py -i 
# A HapMap row starts with eleven marker-description columns; the first five
# (rs#, alleles, chrom, pos, strand) are filled in, the remaining six are
# optional and may be empty.
_HAPMAP_META_COLS = 11
_OPTIONAL_META_START = 5
_OPTIONAL_META_COUNT = _HAPMAP_META_COLS - _OPTIONAL_META_START
_USAGE = 'impute_missing_with_het_hapmap.py -i <inputfile> -o <outputfile>'


def _impute_row(fields, header_width):
    """Return one HapMap data row with missing calls replaced by a heterozygote.

    Args:
        fields: The whitespace-split columns of a data row.
        header_width: Number of columns in the header row, or ``None`` when
            no header has been seen yet.

    Returns:
        A new list: the marker-description columns (padded with ``'NA'``
        where whitespace splitting dropped empty cells) followed by the
        genotype calls, with every ``NN`` call replaced by ``ref + alt``.
    """
    # str.split() collapses runs of tabs, so empty optional description
    # cells vanish and every following column shifts left.  Re-insert only
    # as many placeholders as the row is actually short of the header;
    # without a header fall back to the historical fixed padding of six.
    if header_width is None:
        missing = _OPTIONAL_META_COUNT
    else:
        missing = min(max(header_width - len(fields), 0), _OPTIONAL_META_COUNT)
    row = list(fields)
    row[_OPTIONAL_META_START:_OPTIONAL_META_START] = ['NA'] * missing

    alleles = row[1].split('/')
    meta = row[:_HAPMAP_META_COLS]
    calls = row[_HAPMAP_META_COLS:]
    if len(alleles) < 2 or not alleles[0] or not alleles[1]:
        # No ref/alt pair to build a heterozygote from: leave missing
        # calls as they are rather than inventing a genotype.
        return meta + calls

    het = alleles[0] + alleles[1]
    # Only the sample columns are touched, so a marker name that happens to
    # contain "NN" is not rewritten.
    return meta + [het if call == 'NN' else call for call in calls]


def main(argv):
    """Impute missing HapMap genotype calls with the heterozygous genotype.

    For every data row the missing call ``NN`` is replaced by the
    concatenation of the two alleles listed in the row's ``alleles`` column
    (``ref/alt`` -> ``refalt``).  The header row (first field ``rs#``) is
    copied through, blank lines are skipped, and every output row is
    tab-separated.

    Args:
        argv: Command-line arguments without the program name.  Recognised
            options are ``-i/--ifile <path>``, ``-o/--ofile <path>`` and
            ``-h``.

    Raises:
        SystemExit: With status 2 on an invalid option or when the input or
            output path is missing; with status 0 after printing the usage
            text for ``-h``.
        OSError: If the input file cannot be read or the output file cannot
            be written.
    """
    inputfile = ''
    outputfile = ''
    try:
        opts, _unused = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(_USAGE)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(_USAGE)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    # Fail with the usage text instead of an obscure open('') error.
    if not inputfile or not outputfile:
        print(_USAGE)
        sys.exit(2)

    print('Input file is "{}"'.format(inputfile))
    print('Output file is "{}"'.format(outputfile))

    header_width = None
    with open(inputfile, 'r') as hap, open(outputfile, 'w') as imputed:
        for raw in hap:
            fields = raw.split()
            if not fields:
                continue  # blank line: nothing to impute
            if fields[0] == 'rs#':
                header_width = len(fields)
                imputed.write('\t'.join(fields) + '\n')
                continue
            imputed.write('\t'.join(_impute_row(fields, header_width)) + '\n')
a/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/impute_missing_with_het_hapmap.sh b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/impute_missing_with_het_hapmap.sh new file mode 100644 index 0000000000..360ea94dda --- /dev/null +++ b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/impute_missing_with_het_hapmap.sh @@ -0,0 +1,13 @@ +./impute_missing_with_het_hapmap.py -i ../../../data/gwas_data_3455_accessions/genotype/parv_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/parv_imputed_missing_hetero.hapmap.txt +./impute_missing_with_het_hapmap.py -i ../../../data/gwas_data_3455_accessions/genotype/parv_plus_hue_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/parv_plus_hue_imputed_missing_hetero.hapmap.txt +./impute_missing_with_het_hapmap.py -i ../../../data/gwas_data_3455_accessions/genotype/mex_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_imputed_missing_hetero.hapmap.txt +./impute_missing_with_het_hapmap.py -i ../../../data/gwas_data_3455_accessions/genotype/mex_mesa_central_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_mesa_imputed_missing_hetero.hapmap.txt +./impute_missing_with_het_hapmap.py -i ../../../data/gwas_data_3455_accessions/genotype/mex_chalco_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_chalco_imputed_missing_hetero.hapmap.txt +./impute_missing_with_het_hapmap.py -i ../../../data/gwas_data_3455_accessions/genotype/mex_durango_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_durango_imputed_missing_hetero.hapmap.txt +./impute_missing_with_het_hapmap.py -i ../../../data/gwas_data_3455_accessions/genotype/mex_nobogame_geno_3455_accessions.hapmap.txt -o 
../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_nobogame_imputed_missing_hetero.hapmap.txt +./impute_missing_with_het_hapmap.py -i ../../../data/gwas_data_3455_accessions/genotype/diplo_per_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/diplo_per_imputed_missing_hetero.hapmap.txt +./impute_missing_with_het_hapmap.py -i ../../../data/gwas_data_3455_accessions/genotype/diplo_plus_perennis_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/diplo_plus_perennis_imputed_missing_hetero.hapmap.txt +./impute_missing_with_het_hapmap.py -i ../../../data/gwas_data_3455_accessions/genotype/per_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/per_imputed_missing_hetero.hapmap.txt +./impute_missing_with_het_hapmap.py -i ../../../data/gwas_data_3455_accessions/genotype/hue_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/hue_imputed_missing_hetero.hapmap.txt +./impute_missing_with_het_hapmap.py -i ../../../data/gwas_data_3455_accessions/genotype/nica_geno_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/nica_imputed_missing_hetero.hapmap.txt +./impute_missing_with_het_hapmap.py -i ../../../data/gwas_data_3455_accessions/genotype/all_taxa_3455_accessions.hapmap.txt -o ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/all_imputed_missing_hetero.hapmap.txt diff --git a/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_h5.sh b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_h5.sh new file mode 100644 index 0000000000..9edb6e69f2 --- /dev/null +++ b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_h5.sh @@ -0,0 +1,14 @@ +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h 
../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/parv_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/HDF5/parv_imputed_missing_hetero.HDF5 -exportType HDF5 -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/parv_plus_hue_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/HDF5/parv_plus_hue_imputed_missing_hetero.HDF5 -exportType HDF5 -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/HDF5/mex_imputed_missing_hetero.HDF5 -exportType HDF5 -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_mesa_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/HDF5/mex_mesa_imputed_missing_hetero.HDF5 -exportType HDF5 -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_chalco_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/HDF5/mex_chalco_imputed_missing_hetero.HDF5 -exportType HDF5 -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_durango_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/HDF5/mex_durango_imputed_missing_hetero.HDF5 -exportType HDF5 -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_nobogame_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/HDF5/mex_nobogame_imputed_missing_hetero.HDF5 
-exportType HDF5 -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/diplo_per_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/HDF5/diplo_per_imputed_missing_hetero.HDF5 -exportType HDF5 -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/diplo_plus_perennis_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/HDF5/diplo_plus_perennis_imputed_missing_hetero.HDF5 -exportType HDF5 -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/per_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/HDF5/per_imputed_missing_hetero.HDF5 -exportType HDF5 -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/hue_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/HDF5/hue_imputed_missing_hetero.HDF5 -exportType HDF5 -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/nica_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/HDF5/nica_imputed_missing_hetero.HDF5 -exportType HDF5 -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/all_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/HDF5/all_imputed_missing_hetero.HDF5 -exportType HDF5 -runfork1 + diff --git a/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_plink.sh b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_plink.sh new 
file mode 100644 index 0000000000..94bf5b4931 --- /dev/null +++ b/workflows/preprocessing_data/scripts/hapmap_convertion_scripts/tassel_convert_imputed_plink.sh @@ -0,0 +1,16 @@ +mkdir ../../../data/gwas_data_3455_accessions/genotype_imputed/plink + +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/parv_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/plink/parv_imputed_missing_hetero.plink -exportType Plink -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/parv_plus_hue_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/plink/parv_plus_hue_imputed_missing_hetero.plink -exportType Plink -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/plink/mex_imputed_missing_hetero.plink -exportType Plink -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_mesa_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/plink/mex_mesa_imputed_missing_hetero.plink -exportType Plink -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_chalco_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/plink/mex_chalco_imputed_missing_hetero.plink -exportType Plink -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_durango_imputed_missing_hetero.hapmap.txt -export 
../../../data/gwas_data_3455_accessions/genotype_imputed/plink/mex_durango_imputed_missing_hetero.plink -exportType Plink -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/mex_nobogame_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/plink/mex_nobogame_imputed_missing_hetero.plink -exportType Plink -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/diplo_per_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/plink/diplo_per_imputed_missing_hetero.plink -exportType Plink -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/diplo_plus_perennis_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/plink/diplo_plus_perennis_imputed_missing_hetero.plink -exportType Plink -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/per_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/plink/per_imputed_missing_hetero.plink -exportType Plink -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/hue_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/plink/hue_imputed_missing_hetero.plink -exportType Plink -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h ../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/nica_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/plink/nica_imputed_missing_hetero.plink -exportType Plink -runfork1 +/home/joseph/TASSEL5/run_pipeline.pl -Xmx5g -fork1 -h 
../../../data/gwas_data_3455_accessions/genotype_imputed/hapmap/all_imputed_missing_hetero.hapmap.txt -export ../../../data/gwas_data_3455_accessions/genotype_imputed/plink/all_imputed_missing_hetero.plink -exportType Plink -runfork1 + diff --git a/workflows/preprocessing_data/scripts/remove_spp_tags_in_hapmap_files.sh b/workflows/preprocessing_data/scripts/remove_spp_tags_in_hapmap_files.sh new file mode 100755 index 0000000000..4ecdf8973d --- /dev/null +++ b/workflows/preprocessing_data/scripts/remove_spp_tags_in_hapmap_files.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +# Remove ssp tags from the accession IDs in the overall teosinte hapmap file + +sed -e 's/parviglumis_//g' -e 's/mexicana_//g' -e 's/luxurians_//g' -e 's/huehuetenangensis_//g' -e 's/diploperennis_//g' -e 's/perennis_//g' -e 's/nicaraguensis_//g' < ../data/T3606_33929_hapmap.hmp.txt > ../data/T3606_33929_hapmap.hmp.renamed_accessions.txt + + +# Remove ssp tags from the individual spp hapmap files +# sed -e 's/parviglumis_//g' < ../data/hapmap/parviglumis.hmp.txt > ../data/hapmap/parviglumis.renamed.hmp.txt +# sed -e 's/mexicana_//g' < ../data/hapmap/mexicana.hmp.txt > ../data/hapmap/mexicana.renamed.hmp.txt +# sed -e 's/luxurians_//g' < ../data/hapmap/luxurians.hmp.txt > ../data/hapmap/luxurians.renamed.hmp.txt +# sed -e 's/huehuetenangensis_//g' < ../data/hapmap/huehuetenangensis.hmp.txt > ../data/hapmap/huehuetenangensis.renamed.hmp.txt +# sed -e 's/diploperennis_//g' < ../data/hapmap/diploperennis.hmp.txt > ../data/hapmap/diploperennis.renamed.hmp.txt +# sed -e 's/perennis_//g' < ../data/hapmap/perennis.hmp.txt > ../data/hapmap/perennis.renamed.hmp.txt +# sed -e 's/nicaraguensis_//g' < ../data/hapmap/nicaraguensis.hmp.txt > ../data/hapmap/nicaraguensis.renamed.hmp.txt diff --git a/workflows/preprocessing_data/scripts/workflow_order.md b/workflows/preprocessing_data/scripts/workflow_order.md new file mode 100644 index 0000000000..77acdce9c3 --- /dev/null +++ b/workflows/preprocessing_data/scripts/workflow_order.md @@
-0,0 +1,12 @@ +# Workflow description + +This document describes the order in which the scripts were executed. + +1. `remove_spp_tags_in_hapmap_files.sh` + - Removes ssp tags from the accession IDs in the overall teosinte hapmap file +2. `geno_pheno_accession_selection_3455_accesions_matched.ipynb` + - Code used to filter out genotypes not in the phenotype accessions and vice versa for GWAS analysis using GAPIT + - Individual taxa subsets are also generated within the notebook. +3. `extract_indiv_spp_pheno_data.sh` + - Extracts and creates individual species phenotype files +4. The subdirectory `hapmap_convertion_scripts` contains scripts used to convert the hapmap genotype files to other formats (PLINK, numeric, and HDF5) -- GitLab