diff --git a/.Rhistory b/.Rhistory
index 6015b4e3682cd11ee93cefac51ebe0f6a9b4f6bf..7cd0880c812f8f2a24f84c08bcdda042dcdc08be 100644
--- a/.Rhistory
+++ b/.Rhistory
@@ -1,128 +1,35 @@
-rm(list = ls())
-library(openxlsx)
-library(tidyverse)
-library(car)
-library(pheatmap)
-library(broom)
-library(ggpubr)
-library(viridisLite)
-library(modelr)
-#library(dlookr)
-#library(imputeLCMD)
-library(ggrepel)
-here::i_am("workflows/GC_MS_normalization/210927_primary_normalization_with_split.R")
-library(here)
-out <- here("runs/GC-MS normalization")
-if (file.exists(out)) {
-cat("The folder already exists")
-} else {
-dir.create(out)
-}
-setwd(here())# Not recommended but convenient in Rstudio to start from root
-sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx"))
-View(sam_dat1)
-isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1)
-isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2)
-isa_ms <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 3)
-take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31",
-"glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57",
-"pyroglutamic_acid_156_258_rt8.30", "sucrose_437_361_rt13.77", "sucrose2_204_361_rt13.79", "citric_acid_273_375_rt9.72",
-"arginine_157_256_rt9.92")
-exclude_samples <- c("21106rA_31", "21107rA_54", "21109rA_59", "21109rA_86", "21109rA_78")
-exclude_mets <- c("psicose_103_217_rt9.38", "glutamic_acid_246_363_rt8.31", "lactic_acid_117_219_rt3.07")#glu wrong peak
-area1 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_seq_file_20210914143103_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
-area2 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
-area3 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
-#Add primary metabolite MAF
-metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx"))
-area <- area1 %>%
-bind_rows(area2, area3) %>%
-select(component, area, machine_num_GC = machine_num,rt) %>%
-mutate(area = as.numeric(area),
-rt = as.numeric(rt))
-rt_mean <- area %>%
-group_by(component) %>%
-summarise(RT_mean = mean(rt, na.rm = T))
-View(metdat_GC_class)
-metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx")) %>%
-select(component = Xcal_name_xreport, Compound_Name = PubChem_Name_mapped)%>%
-left_join(rt_mean) %>%
-filter(!is.na(component)) %>%
-arrange(Compound_Name, RT_mean) %>%
-group_by(Compound_Name)  %>%
-mutate(peak_no = rank(RT_mean),
-Compound_Name = if_else(duplicated(Compound_Name),
-str_c(Compound_Name, "peak", peak_no, sep = "_"),
-Compound_Name))
-View(metdat_GC_class)
-sam_vars <- c("plantline", "alias", "LIMS_ID",
-"treatment", "tissue", "batch_GC", "run_date_GC",
-"extraction_num", "sample_num", "machine_num_GC",
-"class", "run_num_GC", "sample_weight", "exp", "genotype")
-sam_dat1_tidy <- sam_dat1 %>%
-left_join(isa_ext)#
-colnames(sam_dat1)
-sam_dat1_tidy <- sam_dat1 %>%
-mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}"))) %>%
-rename(source_name = `Source Name`,
-plantline = `Characteristic [plantline]`,
-alias = `Characteristic [alias]`,
-LIMS_ID = `Characteristic [LIMS aliquot]`,
-treatment = `Factor [Irrigation factor]`,
-tissue = `Characteristic [multi-tissue plant structure]`,
-genotype = `Characteristic [genotype]`,
-sample_num = `Characteristic [sample_name_non_unique]`,
-extraction_num = `Characteristic [extract number]`) %>%
-select(%in% sam_vars)
-sam_dat1_tidy <- sam_dat1 %>%
-mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}"))) %>%
-rename(source_name = `Source Name`,
-plantline = `Characteristic [plantline]`,
-alias = `Characteristic [alias]`,
-LIMS_ID = `Characteristic [LIMS aliquot]`,
-treatment = `Factor [Irrigation factor]`,
-tissue = `Characteristic [multi-tissue plant structure]`,
-genotype = `Characteristic [genotype]`,
-sample_num = `Characteristic [sample_name_non_unique]`,
-extraction_num = `Characteristic [extract number]`) %>%
-select(any_of(sam_vars))
-View(sam_dat1_tidy)
-sam_dat1_tidy <- sam_dat1 %>%
-mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}"))) #%>%
-View(sam_dat1_tidy)
-sam_dat1_tidy$sample_weight
-str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}")
-str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}")
-str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.")
-str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.*")
-str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.")
-str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.*")
-str_extract(sam_dat1$`Factor [sample fresh weight]`, "\\d{2}\\.*\\d{0,2}")
-sam_dat1_tidy <- sam_dat1 %>%
-mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{0,3}\\.*\\d{0,2}"))) #%>%
-sam_dat1_tidy <- sam_dat1 %>%
-mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{0,3}\\.*\\d{0,2}"))) %>%
-rename(source_name = `Source Name`,
-plantline = `Characteristic [plantline]`,
-alias = `Characteristic [alias]`,
-LIMS_ID = `Characteristic [LIMS aliquot]`,
-treatment = `Factor [Irrigation factor]`,
-tissue = `Characteristic [multi-tissue plant structure]`,
-genotype = `Characteristic [genotype]`,
-sample_num = `Characteristic [sample_name_non_unique]`,
-extraction_num = `Characteristic [extract number]`) %>%
-select(any_of(sam_vars))
-View(sam_dat1_tidy)
-sam_dat1_tidy <- sam_dat1 %>%
-mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{0,3}\\.*\\d{0,2}"))) %>%
-rename(source_name = `Source Name`,
-plantline = `Characteristic [plantline]`,
-alias = `Characteristic [alias]`,
-LIMS_ID = `Characteristic [LIMS aliquot]`,
-treatment = `Factor [Irrigation factor]`,
-tissue = `Characteristic [multi-tissue plant structure]`,
-genotype = `Characteristic [genotype]`,
-sample_num = `Characteristic [sample_name_non_unique]`,
-extraction_num = `Characteristic [extract number]`,
+source("C:/Users/Micha/Nextcloud/Data_Plant/cmqtl_val1_arc/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R", echo=TRUE)
+isa_tidy <- isa_study_tidy %>%
+full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>%
+full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) #%>%
+View(isa_tidy)
+isa_tidy <- isa_study_tidy %>%
+full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>%
+full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>%
+full_join(isa_ms_tidy, by = c("sample_name" = "source_name"))
+View(isa_tidy)
+isa_tidy <- isa_study_tidy %>%
+full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>%
+full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>%
+full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name"))
+View(isa_ms_tidy)
+isa_tidy <- isa_study_tidy %>%
+full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>%
+full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>%
+full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name", "sample_name" = "sample_name"))
+source("C:/Users/Micha/Nextcloud/Data_Plant/cmqtl_val1_arc/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R", echo=TRUE)
+View(isa_gc)
+View(isa_ext_tidy)
+View(isa_gc_tidy)
+isa_gc_tidy <- isa_gc %>%
+rename(source_name =`Source Name`,
+class = `Characteristic [sample type]`,
+batch_GC = `Parameter [Batch]`,
+run_date_GC = `Parameter [run date]`,
+daily_num = `Parameter [daily number]`,
 sample_name = `Sample Name`) %>%
-select(any_of(sam_vars), sample_name)
+select(source_name, any_of(sam_vars), sample_name)
+isa_tidy <- isa_study_tidy %>%
+full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>%
+full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>%
+full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name", "sample_name" = "sample_name"))
diff --git a/assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx b/assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx
index 46d812e8dea29a26a5a9ad136956fc45d8381765..8fe6eab9a464a94936fe6692ec026c82f214a6e1 100644
Binary files a/assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx and b/assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx differ
diff --git a/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R
index 9b1830c521ca077c4493f4d83741048e1af2dc68..883dcb470c558dc8b320b3d889e73bb232fb36e4 100644
--- a/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R
+++ b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R
@@ -30,13 +30,6 @@ setwd(here())# Not recommended but convenient in Rstudio to start from root
 # Data loading ------------------------------------------------------------
 
 sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx"))
-
-#sam_dat1 <- read_csv("210812_cmQTL_val1_samplelist.csv", col_types = "ffidficcccfccccc")
-#GC_run1 <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 5)
-
-#sam_dat2 <- read_csv("210812_cmQTL_val2_samplelist.csv", col_types = "ccfdifiiiff")
-#GC_run2 <- readxl::read_xlsx("210324_WIJESI-130121-15_cmQTL_validation2.xlsx", sheet = 6)
-
 isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1)
 isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2)
 isa_ms <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 3)
@@ -45,8 +38,6 @@ isa_ms <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx
 
 #GC_machine_nums <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 6)
 
-#setwd(current)
-
 take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31",
                 "glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57",
                 "pyroglutamic_acid_156_258_rt8.30", "sucrose_437_361_rt13.77", "sucrose2_204_361_rt13.79", "citric_acid_273_375_rt9.72",
@@ -59,7 +50,6 @@ area1 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_c
 area2 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
 area3 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
 
-#Add primary metabolite MAF
 metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx"))
   
 area <- area1 %>% 
@@ -90,7 +80,7 @@ sam_vars <- c("plantline", "alias", "LIMS_ID",
               "extraction_num", "sample_num", "machine_num_GC",
               "class", "run_num_GC", "sample_weight", "exp", "genotype")
 
-sam_dat1_tidy <- sam_dat1 %>% 
+isa_study_tidy <- sam_dat1 %>% 
   mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{0,3}\\.*\\d{0,2}"))) %>% 
   rename(source_name = `Source Name`,
          plantline = `Characteristic [plantline]`,
@@ -102,14 +92,35 @@ sam_dat1_tidy <- sam_dat1 %>%
          sample_num = `Characteristic [sample_name_non_unique]`,
          extraction_num = `Characteristic [extract number]`,
          sample_name = `Sample Name`) %>% 
-  select(any_of(sam_vars), sample_name)
+  select(source_name, any_of(sam_vars), sample_name)
 
 isa_ext_tidy <- isa_ext %>% 
-  rename()
-
-isa_gc_tidy
-isa_ms_tidy
+  rename(source_name =`Source Name`,  # give the ISA column headers short names
+         exp = `Characteristic [experiment name]`,  # "exp" is one of the names listed in sam_vars
+         sample_name = `Sample Name`) %>% 
+  select(source_name, any_of(sam_vars), sample_name)  # any_of() keeps only those sam_vars columns that exist in this sheet
+
+isa_gc_tidy <- isa_gc %>%  # tidy the table read from sheet 2 of isa.assay.xlsx
+  rename(source_name =`Source Name`,
+         class = `Characteristic [sample type]`,
+         batch_GC = `Parameter [Batch]`,
+         run_date_GC = `Parameter [run date]`,
+         daily_num = `Parameter [daily number]`,  # NOTE(review): daily_num is not among the sam_vars entries visible here, so select() below would drop it - confirm
+         sample_name = `Sample Name`) %>% 
+  select(source_name, any_of(sam_vars), sample_name)  # keep join keys plus whichever sam_vars columns are present
 
+isa_ms_tidy <- isa_ms %>%  # tidy the table read from sheet 3 of isa.assay.xlsx
+  rename(source_name =`Source Name`,
+         sample_name = `Sample Name`) %>% 
+  mutate(machine_num_GC = sample_name) %>%  # copy sample_name into machine_num_GC (a sam_vars name, so select() keeps it)
+  select(source_name, any_of(sam_vars), sample_name)
+
+isa_tidy <- isa_study_tidy %>%  # chain the tidy study -> extraction -> GC -> MS tables with full joins
+  full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = TRUE, suffix = c("_study", "_ext")) %>%  # keep = TRUE retains both tables' key columns; clashing names get the suffixes
+  full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>%  # match the extraction's sample name to the GC table's source name
+  full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name", "sample_name" = "sample_name"))
+# TODO: arrange by GC run number and create daily_num, or supply daily_num directly
+stop("Intentional halt: the steps after building isa_tidy are not run yet", call. = FALSE)  # top-level statement, not part of the pipeline above
 sam_dat1_tidy <- GC_run1 %>% 
   left_join(GC_machine_nums) %>% 
   select(extraction_num = `Sample name`, everything())%>%