Skip to content
Snippets Groups Projects
Commit 9ad257c5 authored by Micha Wijesingha Ahchige's avatar Micha Wijesingha Ahchige
Browse files

updated GC-MS normalization script

parent ff0c2fc1
No related branches found
No related tags found
No related merge requests found
rm(list = ls()) source("C:/Users/Micha/Nextcloud/Data_Plant/cmqtl_val1_arc/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R", echo=TRUE)
library(openxlsx) isa_tidy <- isa_study_tidy %>%
library(tidyverse) full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>%
library(car) full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) #%>%
library(pheatmap) View(isa_tidy)
library(broom) isa_tidy <- isa_study_tidy %>%
library(ggpubr) full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>%
library(viridisLite) full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>%
library(modelr) full_join(isa_ms_tidy, by = c("sample_name" = "source_name"))
#library(dlookr) View(isa_tidy)
#library(imputeLCMD) isa_tidy <- isa_study_tidy %>%
library(ggrepel) full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>%
here::i_am("workflows/GC_MS_normalization/210927_primary_normalization_with_split.R") full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>%
library(here) full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name"))
out <- here("runs/GC-MS normalization") View(isa_ms_tidy)
if (file.exists(out)) { isa_tidy <- isa_study_tidy %>%
cat("The folder already exists") full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>%
} else { full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>%
dir.create(out) full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name", "sample_name" = "sample_name"))
} source("C:/Users/Micha/Nextcloud/Data_Plant/cmqtl_val1_arc/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R", echo=TRUE)
setwd(here())# Not recommended but convenient in Rstudio to start from root View(isa_gc)
sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx")) View(isa_ext_tidy)
View(sam_dat1) View(isa_gc_tidy)
isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1) isa_gc_tidy <- isa_gc %>%
isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2) rename(source_name =`Source Name`,
isa_ms <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 3) class = `Characteristic [sample type]`,
take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31", batch_GC = `Parameter [Batch]`,
"glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57", run_date_GC = `Parameter [run date]`,
"pyroglutamic_acid_156_258_rt8.30", "sucrose_437_361_rt13.77", "sucrose2_204_361_rt13.79", "citric_acid_273_375_rt9.72", daily_num = `Parameter [daily number]`,
"arginine_157_256_rt9.92")
# Identifiers listed here are meant to be dropped from the analysis; the
# reason for each exclusion is not recorded in this part of the script.
# NOTE(review): these look like GC machine/sample ids -- confirm which column
# they are matched against further down.
exclude_samples <- c("21106rA_31", "21107rA_54", "21109rA_59", "21109rA_86", "21109rA_78")
# Component names to be dropped. The trailing note says "glu" was integrated
# on the wrong peak (presumably the glutamic_acid entry -- confirm).
exclude_mets <- c("psicose_103_217_rt9.38", "glutamic_acid_246_363_rt8.31", "lactic_acid_117_219_rt3.07")#glu wrong peak
# Read the three peak-area exports (per their file names: fruits, fruits
# split, and leaves). Empty cells and the literal "N/A" are read as NA.
area1 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_seq_file_20210914143103_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
area2 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
area3 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
# Add primary metabolite MAF (metabolite annotation workbook, first sheet).
metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx"))
# Stack the three area tables, keep the four columns used downstream
# (renaming machine_num to machine_num_GC), and coerce area and rt to numeric.
area <- bind_rows(area1, area2, area3) %>%
  select(component, area, machine_num_GC = machine_num, rt) %>%
  mutate(across(c(area, rt), as.numeric))
# Mean retention time per component, ignoring missing rt values.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
rt_mean <- area %>%
  group_by(component) %>%
  summarise(RT_mean = mean(rt, na.rm = TRUE))
View(metdat_GC_class)
# Build the component -> compound-name lookup from the MAF workbook.
#   * component     : Xcal_name_xreport column of the MAF
#   * Compound_Name : PubChem_Name_mapped column of the MAF
#   * RT_mean       : mean retention time per component (from rt_mean)
#   * peak_no       : rank of RT_mean within each compound name
# When several components share a compound name, the first one (lowest
# RT_mean after sorting) keeps the plain name and the others get the suffix
# "_peak_<peak_no>" so that names are unique.
metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx")) %>%
  select(component = Xcal_name_xreport, Compound_Name = PubChem_Name_mapped) %>%
  # Explicit key: `component` is the only column the two tables share.
  left_join(rt_mean, by = "component") %>%
  filter(!is.na(component)) %>%
  arrange(Compound_Name, RT_mean) %>%
  group_by(Compound_Name) %>%
  mutate(peak_no = rank(RT_mean)) %>%
  # Drop the grouping before renaming: dplyr does not allow a grouped
  # mutate() to overwrite its own grouping column, and the result should not
  # stay grouped for later steps. Because the rows are sorted by
  # Compound_Name, duplicated() flags the same rows ungrouped as it would
  # within each group.
  ungroup() %>%
  mutate(
    Compound_Name = if_else(
      duplicated(Compound_Name),
      str_c(Compound_Name, "peak", peak_no, sep = "_"),
      Compound_Name
    )
  )
View(metdat_GC_class)
# Sample-metadata column names to keep. The vector is passed to
# select(any_of(sam_vars)), so names missing from a given table are skipped
# rather than raising an error.
sam_vars <- c("plantline", "alias", "LIMS_ID",
"treatment", "tissue", "batch_GC", "run_date_GC",
"extraction_num", "sample_num", "machine_num_GC",
"class", "run_num_GC", "sample_weight", "exp", "genotype")
# First attempt: attach the extraction sheet to the study sheet.
# NOTE(review): no `by` is given, so the join uses every column name the two
# sheets have in common -- confirm that this is the intended key.
sam_dat1_tidy <- sam_dat1 %>%
left_join(isa_ext)#
colnames(sam_dat1)
# Tidy the study sheet: parse the fresh weight as a number, give the ISA
# columns short names, and keep only the columns listed in sam_vars.
# The original last step, `select(%in% sam_vars)`, does not parse (`%in%` is
# a binary operator with no left-hand side); any_of() is the tidyselect
# helper for "keep whichever of these names exist".
sam_dat1_tidy <- sam_dat1 %>%
  # Weight is taken as the first "dd.dd" substring; anything else becomes NA.
  mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}"))) %>%
  rename(source_name = `Source Name`,
         plantline = `Characteristic [plantline]`,
         alias = `Characteristic [alias]`,
         LIMS_ID = `Characteristic [LIMS aliquot]`,
         treatment = `Factor [Irrigation factor]`,
         tissue = `Characteristic [multi-tissue plant structure]`,
         genotype = `Characteristic [genotype]`,
         sample_num = `Characteristic [sample_name_non_unique]`,
         extraction_num = `Characteristic [extract number]`) %>%
  select(any_of(sam_vars))
# Tidy the study sheet: short column names first, then the numeric fresh
# weight, then restriction to the columns named in sam_vars.
sam_dat1_tidy <- sam_dat1 %>%
  rename(
    source_name = `Source Name`,
    plantline = `Characteristic [plantline]`,
    alias = `Characteristic [alias]`,
    LIMS_ID = `Characteristic [LIMS aliquot]`,
    treatment = `Factor [Irrigation factor]`,
    tissue = `Characteristic [multi-tissue plant structure]`,
    genotype = `Characteristic [genotype]`,
    sample_num = `Characteristic [sample_name_non_unique]`,
    extraction_num = `Characteristic [extract number]`
  ) %>%
  mutate(
    sample_weight = as.double(
      str_extract(`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}")
    )
  ) %>%
  select(any_of(sam_vars))
View(sam_dat1_tidy)
sam_dat1_tidy <- sam_dat1 %>%
mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}"))) #%>%
View(sam_dat1_tidy)
# Interactive checks of the fresh-weight parsing. str_extract() returns NA
# for entries with no match, so each call shows which entries a pattern
# fails on.
sam_dat1_tidy$sample_weight
# Exactly two digits, a dot, two digits.
str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}")
# Two digits only.
str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}")
# Two digits followed by one dot.
str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.")
# Two digits followed by zero or more dots.
str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.*")
str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.")
str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.*")
# Two digits, optional dot(s), up to two more digits.
str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.*\\d{0,2}")
# Relaxed weight pattern: up to three digits, optional dot(s), up to two
# more digits. The first assignment is a partial run (the rest of the pipe
# is commented out) and is overwritten by the full pipeline below.
# NOTE(review): every part of this pattern is optional, so it can match the
# empty string. For an entry that does not start with a digit or a dot the
# match is "" and as.double("") is NA -- confirm that all entries start with
# the number.
sam_dat1_tidy <- sam_dat1 %>%
mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{0,3}\\.*\\d{0,2}"))) #%>%
# Full pipeline: parse the weight, shorten the ISA column names, and keep
# only the columns listed in sam_vars.
sam_dat1_tidy <- sam_dat1 %>%
mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{0,3}\\.*\\d{0,2}"))) %>%
rename(source_name = `Source Name`,
plantline = `Characteristic [plantline]`,
alias = `Characteristic [alias]`,
LIMS_ID = `Characteristic [LIMS aliquot]`,
treatment = `Factor [Irrigation factor]`,
tissue = `Characteristic [multi-tissue plant structure]`,
genotype = `Characteristic [genotype]`,
sample_num = `Characteristic [sample_name_non_unique]`,
extraction_num = `Characteristic [extract number]`) %>%
select(any_of(sam_vars))
View(sam_dat1_tidy)
sam_dat1_tidy <- sam_dat1 %>%
mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{0,3}\\.*\\d{0,2}"))) %>%
rename(source_name = `Source Name`,
plantline = `Characteristic [plantline]`,
alias = `Characteristic [alias]`,
LIMS_ID = `Characteristic [LIMS aliquot]`,
treatment = `Factor [Irrigation factor]`,
tissue = `Characteristic [multi-tissue plant structure]`,
genotype = `Characteristic [genotype]`,
sample_num = `Characteristic [sample_name_non_unique]`,
extraction_num = `Characteristic [extract number]`,
sample_name = `Sample Name`) %>% sample_name = `Sample Name`) %>%
select(any_of(sam_vars), sample_name) select(source_name, any_of(sam_vars), sample_name)
isa_tidy <- isa_study_tidy %>%
full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>%
full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>%
full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name", "sample_name" = "sample_name"))
No preview for this file type
...@@ -30,13 +30,6 @@ setwd(here())# Not recommended but convenient in Rstudio to start from root ...@@ -30,13 +30,6 @@ setwd(here())# Not recommended but convenient in Rstudio to start from root
# Data loading ------------------------------------------------------------ # Data loading ------------------------------------------------------------
sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx")) sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx"))
#sam_dat1 <- read_csv("210812_cmQTL_val1_samplelist.csv", col_types = "ffidficcccfccccc")
#GC_run1 <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 5)
#sam_dat2 <- read_csv("210812_cmQTL_val2_samplelist.csv", col_types = "ccfdifiiiff")
#GC_run2 <- readxl::read_xlsx("210324_WIJESI-130121-15_cmQTL_validation2.xlsx", sheet = 6)
isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1) isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1)
isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2) isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2)
isa_ms <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 3) isa_ms <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 3)
...@@ -45,8 +38,6 @@ isa_ms <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx ...@@ -45,8 +38,6 @@ isa_ms <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx
#GC_machine_nums <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 6) #GC_machine_nums <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 6)
#setwd(current)
take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31", take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31",
"glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57", "glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57",
"pyroglutamic_acid_156_258_rt8.30", "sucrose_437_361_rt13.77", "sucrose2_204_361_rt13.79", "citric_acid_273_375_rt9.72", "pyroglutamic_acid_156_258_rt8.30", "sucrose_437_361_rt13.77", "sucrose2_204_361_rt13.79", "citric_acid_273_375_rt9.72",
...@@ -59,7 +50,6 @@ area1 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_c ...@@ -59,7 +50,6 @@ area1 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_c
area2 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) area2 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
area3 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) area3 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
#Add primary metabolite MAF
metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx")) metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx"))
area <- area1 %>% area <- area1 %>%
...@@ -90,7 +80,7 @@ sam_vars <- c("plantline", "alias", "LIMS_ID", ...@@ -90,7 +80,7 @@ sam_vars <- c("plantline", "alias", "LIMS_ID",
"extraction_num", "sample_num", "machine_num_GC", "extraction_num", "sample_num", "machine_num_GC",
"class", "run_num_GC", "sample_weight", "exp", "genotype") "class", "run_num_GC", "sample_weight", "exp", "genotype")
sam_dat1_tidy <- sam_dat1 %>% isa_study_tidy <- sam_dat1 %>%
mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{0,3}\\.*\\d{0,2}"))) %>% mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{0,3}\\.*\\d{0,2}"))) %>%
rename(source_name = `Source Name`, rename(source_name = `Source Name`,
plantline = `Characteristic [plantline]`, plantline = `Characteristic [plantline]`,
...@@ -102,14 +92,35 @@ sam_dat1_tidy <- sam_dat1 %>% ...@@ -102,14 +92,35 @@ sam_dat1_tidy <- sam_dat1 %>%
sample_num = `Characteristic [sample_name_non_unique]`, sample_num = `Characteristic [sample_name_non_unique]`,
extraction_num = `Characteristic [extract number]`, extraction_num = `Characteristic [extract number]`,
sample_name = `Sample Name`) %>% sample_name = `Sample Name`) %>%
select(any_of(sam_vars), sample_name) select(source_name, any_of(sam_vars), sample_name)
isa_ext_tidy <- isa_ext %>% isa_ext_tidy <- isa_ext %>%
rename() rename(source_name =`Source Name`,
exp = `Characteristic [experiment name]`,
isa_gc_tidy sample_name = `Sample Name`) %>%
isa_ms_tidy select(source_name, any_of(sam_vars), sample_name)
isa_gc_tidy <- isa_gc %>%
rename(source_name =`Source Name`,
class = `Characteristic [sample type]`,
batch_GC = `Parameter [Batch]`,
run_date_GC = `Parameter [run date]`,
daily_num = `Parameter [daily number]`,
sample_name = `Sample Name`) %>%
select(source_name, any_of(sam_vars), sample_name)
isa_ms_tidy <- isa_ms %>%
rename(source_name =`Source Name`,
sample_name = `Sample Name`) %>%
mutate(machine_num_GC = sample_name) %>%
select(source_name, any_of(sam_vars), sample_name)
isa_tidy <- isa_study_tidy %>%
full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>%
full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>%
full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name", "sample_name" = "sample_name"))
#arrange run_num gc and create daily_num or supply daily_num
stop()
sam_dat1_tidy <- GC_run1 %>% sam_dat1_tidy <- GC_run1 %>%
left_join(GC_machine_nums) %>% left_join(GC_machine_nums) %>%
select(extraction_num = `Sample name`, everything())%>% select(extraction_num = `Sample name`, everything())%>%
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment