diff --git a/.Rhistory b/.Rhistory index 6015b4e3682cd11ee93cefac51ebe0f6a9b4f6bf..7cd0880c812f8f2a24f84c08bcdda042dcdc08be 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,128 +1,35 @@ -rm(list = ls()) -library(openxlsx) -library(tidyverse) -library(car) -library(pheatmap) -library(broom) -library(ggpubr) -library(viridisLite) -library(modelr) -#library(dlookr) -#library(imputeLCMD) -library(ggrepel) -here::i_am("workflows/GC_MS_normalization/210927_primary_normalization_with_split.R") -library(here) -out <- here("runs/GC-MS normalization") -if (file.exists(out)) { -cat("The folder already exists") -} else { -dir.create(out) -} -setwd(here())# Not recommended but convenient in Rstudio to start from root -sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx")) -View(sam_dat1) -isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1) -isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2) -isa_ms <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 3) -take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31", -"glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57", -"pyroglutamic_acid_156_258_rt8.30", "sucrose_437_361_rt13.77", "sucrose2_204_361_rt13.79", "citric_acid_273_375_rt9.72", -"arginine_157_256_rt9.92") -exclude_samples <- c("21106rA_31", "21107rA_54", "21109rA_59", "21109rA_86", "21109rA_78") -exclude_mets <- c("psicose_103_217_rt9.38", "glutamic_acid_246_363_rt8.31", "lactic_acid_117_219_rt3.07")#glu wrong peak -area1 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_seq_file_20210914143103_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) -area2 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) -area3 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) -#Add primary metabolite MAF -metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx")) -area <- area1 %>% -bind_rows(area2, area3) %>% -select(component, area, machine_num_GC = machine_num,rt) %>% -mutate(area = as.numeric(area), -rt = as.numeric(rt)) -rt_mean <- area %>% -group_by(component) %>% -summarise(RT_mean = mean(rt, na.rm = T)) -View(metdat_GC_class) -metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx")) %>% -select(component = Xcal_name_xreport, Compound_Name = PubChem_Name_mapped)%>% -left_join(rt_mean) %>% -filter(!is.na(component)) %>% -arrange(Compound_Name, RT_mean) %>% -group_by(Compound_Name) %>% -mutate(peak_no = rank(RT_mean), -Compound_Name = if_else(duplicated(Compound_Name), -str_c(Compound_Name, "peak", peak_no, sep = "_"), -Compound_Name)) -View(metdat_GC_class) -sam_vars <- c("plantline", "alias", "LIMS_ID", -"treatment", "tissue", "batch_GC", "run_date_GC", -"extraction_num", "sample_num", "machine_num_GC", -"class", "run_num_GC", "sample_weight", "exp", "genotype") -sam_dat1_tidy <- sam_dat1 %>% -left_join(isa_ext)# -colnames(sam_dat1) -sam_dat1_tidy <- sam_dat1 %>% -mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}"))) %>% -rename(source_name = `Source Name`, -plantline = `Characteristic [plantline]`, -alias = `Characteristic [alias]`, -LIMS_ID = `Characteristic [LIMS aliquot]`, -treatment = `Factor [Irrigation factor]`, -tissue = `Characteristic [multi-tissue plant structure]`, -genotype = `Characteristic [genotype]`, -sample_num = `Characteristic [sample_name_non_unique]`, -extraction_num = `Characteristic [extract number]`) %>% -select(%in% sam_vars) -sam_dat1_tidy <- sam_dat1 %>% -mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}"))) %>% -rename(source_name = `Source Name`, -plantline = `Characteristic [plantline]`, -alias = `Characteristic [alias]`, -LIMS_ID = `Characteristic [LIMS aliquot]`, -treatment = `Factor [Irrigation factor]`, -tissue = `Characteristic [multi-tissue plant structure]`, -genotype = `Characteristic [genotype]`, -sample_num = `Characteristic [sample_name_non_unique]`, -extraction_num = `Characteristic [extract number]`) %>% -select(any_of(sam_vars)) -View(sam_dat1_tidy) -sam_dat1_tidy <- sam_dat1 %>% -mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}"))) #%>% -View(sam_dat1_tidy) -sam_dat1_tidy$sample_weight -str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}") -str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}") -str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.") -str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.*") -str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.") -str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.*") -str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.*\\d{0,2}") -sam_dat1_tidy <- sam_dat1 %>% -mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{0,3}\\.*\\d{0,2}"))) #%>% -sam_dat1_tidy <- sam_dat1 %>% -mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{0,3}\\.*\\d{0,2}"))) %>% -rename(source_name = `Source Name`, -plantline = `Characteristic [plantline]`, -alias = `Characteristic [alias]`, -LIMS_ID = `Characteristic [LIMS aliquot]`, -treatment = `Factor [Irrigation factor]`, -tissue = `Characteristic [multi-tissue plant structure]`, -genotype = `Characteristic [genotype]`, -sample_num = `Characteristic [sample_name_non_unique]`, -extraction_num = `Characteristic [extract number]`) %>% -select(any_of(sam_vars)) -View(sam_dat1_tidy) -sam_dat1_tidy <- sam_dat1 %>% -mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{0,3}\\.*\\d{0,2}"))) %>% -rename(source_name = `Source Name`, -plantline = `Characteristic [plantline]`, -alias = `Characteristic [alias]`, -LIMS_ID = `Characteristic [LIMS aliquot]`, -treatment = `Factor [Irrigation factor]`, -tissue = `Characteristic [multi-tissue plant structure]`, -genotype = `Characteristic [genotype]`, -sample_num = `Characteristic [sample_name_non_unique]`, -extraction_num = `Characteristic [extract number]`, +source("C:/Users/Micha/Nextcloud/Data_Plant/cmqtl_val1_arc/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R", echo=TRUE) +isa_tidy <- isa_study_tidy %>% +full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>% +full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) #%>% +View(isa_tidy) +isa_tidy <- isa_study_tidy %>% +full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>% +full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>% +full_join(isa_ms_tidy, by = c("sample_name" = "source_name")) +View(isa_tidy) +isa_tidy <- isa_study_tidy %>% +full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>% +full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>% +full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name")) +View(isa_ms_tidy) +isa_tidy <- isa_study_tidy %>% +full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>% +full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>% +full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name", "sample_name" = "sample_name")) +source("C:/Users/Micha/Nextcloud/Data_Plant/cmqtl_val1_arc/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R", echo=TRUE) +View(isa_gc) +View(isa_ext_tidy) +View(isa_gc_tidy) +isa_gc_tidy <- isa_gc %>% +rename(source_name =`Source Name`, +class = `Characteristic [sample type]`, +batch_GC = `Parameter [Batch]`, +run_date_GC = `Parameter [run date]`, +daily_num = `Parameter [daily number]`, sample_name = `Sample Name`) %>% -select(any_of(sam_vars), sample_name) +select(source_name, any_of(sam_vars), sample_name) +isa_tidy <- isa_study_tidy %>% +full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>% +full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>% +full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name", "sample_name" = "sample_name")) diff --git a/assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx b/assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx index 46d812e8dea29a26a5a9ad136956fc45d8381765..8fe6eab9a464a94936fe6692ec026c82f214a6e1 100644 Binary files a/assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx and b/assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx differ diff --git a/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R index 9b1830c521ca077c4493f4d83741048e1af2dc68..883dcb470c558dc8b320b3d889e73bb232fb36e4 100644 --- a/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R +++ b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R @@ -30,13 +30,6 @@ setwd(here())# Not recommended but convenient in Rstudio to start from root # Data loading ------------------------------------------------------------ sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx")) - -#sam_dat1 <- read_csv("210812_cmQTL_val1_samplelist.csv", col_types = "ffidficcccfccccc") -#GC_run1 <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 5) - -#sam_dat2 <- read_csv("210812_cmQTL_val2_samplelist.csv", col_types = "ccfdifiiiff") -#GC_run2 <- readxl::read_xlsx("210324_WIJESI-130121-15_cmQTL_validation2.xlsx", sheet = 6) - isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1) isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2) isa_ms <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 3) @@ -45,8 +38,6 @@ isa_ms <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx #GC_machine_nums <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 6) -#setwd(current) - take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31", "glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57", "pyroglutamic_acid_156_258_rt8.30", "sucrose_437_361_rt13.77", "sucrose2_204_361_rt13.79", "citric_acid_273_375_rt9.72", @@ -59,7 +50,6 @@ area1 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_c area2 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) area3 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) -#Add primary metabolite MAF metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx")) area <- area1 %>% @@ -90,7 +80,7 @@ sam_vars <- c("plantline", "alias", "LIMS_ID", "extraction_num", "sample_num", "machine_num_GC", "class", "run_num_GC", "sample_weight", "exp", "genotype") -sam_dat1_tidy <- sam_dat1 %>% +isa_study_tidy <- sam_dat1 %>% mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{0,3}\\.*\\d{0,2}"))) %>% rename(source_name = `Source Name`, plantline = `Characteristic [plantline]`, @@ -102,14 +92,35 @@ sam_dat1_tidy <- sam_dat1 %>% sample_num = `Characteristic [sample_name_non_unique]`, extraction_num = `Characteristic [extract number]`, sample_name = `Sample Name`) %>% - select(any_of(sam_vars), sample_name) + select(source_name, any_of(sam_vars), sample_name) isa_ext_tidy <- isa_ext %>% - rename() - -isa_gc_tidy -isa_ms_tidy + rename(source_name =`Source Name`, + exp = `Characteristic [experiment name]`, + sample_name = `Sample Name`) %>% + select(source_name, any_of(sam_vars), sample_name) + +isa_gc_tidy <- isa_gc %>% + rename(source_name =`Source Name`, + class = `Characteristic [sample type]`, + batch_GC = `Parameter [Batch]`, + run_date_GC = `Parameter [run date]`, + daily_num = `Parameter [daily number]`, + sample_name = `Sample Name`) %>% + select(source_name, any_of(sam_vars), sample_name) +isa_ms_tidy <- isa_ms %>% + rename(source_name =`Source Name`, + sample_name = `Sample Name`) %>% + mutate(machine_num_GC = sample_name) %>% + select(source_name, any_of(sam_vars), sample_name) + +isa_tidy <- isa_study_tidy %>% + full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>% + full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>% + full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name", "sample_name" = "sample_name")) +#arrange run_num gc and create daily_num or supply daily_num + stop() sam_dat1_tidy <- GC_run1 %>% left_join(GC_machine_nums) %>% select(extraction_num = `Sample name`, everything())%>%