diff --git a/.Rhistory b/.Rhistory
new file mode 100644
index 0000000000000000000000000000000000000000..ca96489870cdb82792ee9fd2a44711ee29488fae
--- /dev/null
+++ b/.Rhistory
@@ -0,0 +1,129 @@
+rm(list = ls())
+library(openxlsx)
+library(tidyverse)
+library(car)
+library(pheatmap)
+library(broom)
+library(ggpubr)
+library(viridisLite)
+library(modelr)
+#library(dlookr)
+#library(imputeLCMD)
+library(ggrepel)
+here::i_am("workflows/GC_MS_normalization/210927_primary_normalization_with_split.R")
+library(here)
+out <- here("runs/GC-MS normalization")
+if (file.exists(out)) {
+cat("The folder already exists")
+} else {
+dir.create(out)
+}
+setwd(here())# Not recommended but convenient in Rstudio to start from root
+sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx"))
+isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1)
+isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2)
+take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31",
+"glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57",
+"pyroglutamic_acid_156_258_rt8.30", "sucrose_437_361_rt13.77", "sucrose2_204_361_rt13.79", "citric_acid_273_375_rt9.72",
+"arginine_157_256_rt9.92")
+exclude_samples <- c("21106rA_31", "21107rA_54", "21109rA_59", "21109rA_86", "21109rA_78")
+exclude_mets <- c("psicose_103_217_rt9.38", "glutamic_acid_246_363_rt8.31", "lactic_acid_117_219_rt3.07")#glu wrong peak
+area1 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_seq_file_20210914143103_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
+area2 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
+area3 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
+#Add primary metabolite MAF
+metdat_GC_class <- readxl::read_xlsx("H:/3. cmQTL mapping/Ath_Dark_Light_GC_Xcal/current_source_files/210118_primary_metabolites_classification.xlsx") %>%
+select(component = Xcal_name_xreport, Compound_Name = HMDB_clear_name, Compound_Class = ChEBI_Ontology_dense)%>%
+mutate(RT_mean = str_extract(component, "\\d+\\.\\d+$")) %>%
+filter(!is.na(component)) %>%
+group_by(Compound_Name)  %>%
+mutate(peak_no = rank(RT_mean),
+Compound_Name = if_else(duplicated(Compound_Name),
+str_c(Compound_Name, "peak", peak_no, sep = "_"),
+Compound_Name))
+area <- area1 %>%
+bind_rows(area2, area3) %>%
+select(component, area, machine_num_GC = machine_num,rt) %>%
+mutate(area = as.numeric(area),
+rt = as.numeric(rt))
+rt_mean <- area %>%
+group_by(component) %>%
+summarise(RT_mean = mean(rt, na.rm = T))
+metdat_GC_class <- readxl::read_xlsx("H:/3. cmQTL mapping/Shared_source_files/210118_primary_metabolites_classification.xlsx") %>%
+select(component = Xcal_name_xreport, Compound_Name = HMDB_clear_name, Compound_Class = ChEBI_Ontology_dense)%>%
+left_join(rt_mean) %>%
+filter(!is.na(component)) %>%
+arrange(Compound_Name, RT_mean) %>%
+group_by(Compound_Name)  %>%
+mutate(peak_no = rank(RT_mean),
+Compound_Name = if_else(duplicated(Compound_Name),
+str_c(Compound_Name, "peak", peak_no, sep = "_"),
+Compound_Name))
+sam_vars <- c("plantline", "alias", "LIMS_ID",
+"treatment", "tissue", "batch_GC", "run_date_GC",
+"extraction_num", "sample_num", "machine_num_GC",
+"class", "run_num_GC", "sample_weight", "exp", "genotype")
+sam_dat1_tidy <- sam_dat1 %>%
+sam_dat1_tidy <- GC_run1 %>%
+left_join(GC_machine_nums) %>%
+select(extraction_num = `Sample name`, everything())%>%
+mutate(class = as_factor(if_else(str_detect(extraction_num, "run_qc"), "run_qc",
+if_else(str_detect(extraction_num, "blank"), "blank", "sample"))),
+extraction_num = as.numeric(if_else(str_detect(extraction_num, "run_qc"), "0",
+if_else(str_detect(extraction_num, "blank"), "-1",extraction_num))),
+exp = as_factor(1)) %>%
+left_join(sam_dat1) %>%
+left_join(genotypes) %>%
+select(treatment = irrigation, everything()) %>%
+select(all_of(sam_vars)) %>%
+arrange(run_num_GC)
+sam_dat2_tidy <- GC_run2 %>%
+left_join(GC_machine_nums) %>%
+select(extraction_num = `Sample name`, everything()) %>%
+mutate(class = as_factor(if_else(str_detect(extraction_num, "run_qc"), "run_qc",
+if_else(str_detect(extraction_num, "blank"), "blank", "sample"))),
+extraction_num = as.numeric(if_else(str_detect(extraction_num, "run_qc"), "0",
+if_else(str_detect(extraction_num, "blank"), "-1",extraction_num))),
+exp = as_factor(2)) %>%
+left_join(sam_dat2) %>%
+left_join(genotypes) %>%
+select(treatment = irrigation, everything()) %>%
+select(all_of(sam_vars)) %>%
+arrange(run_num_GC)
+sam_dat <- sam_dat1_tidy %>%
+bind_rows(sam_dat2_tidy) %>%
+arrange(run_num_GC) %>%
+group_by(batch_GC) %>%
+mutate(daily_num = row_number()) %>%
+fill(tissue, .direction = "updown") %>%
+ungroup() %>%
+left_join(genotypes) %>%
+mutate(treatment = as_factor(treatment))
+area_long <- area %>%
+left_join(sam_dat) %>%
+filter(!is.na(exp))
+area <- area_long %>%
+left_join(metdat_GC_class) %>%
+filter(!str_detect(component, "^\\!|FAME|component|empty"), !is.na(component),
+!component %in% exclude_mets)
+library(tidyverse)
+library(igraph)
+library(ggnetwork)
+#install.packages("igraph")
+#install.packages("ggnetwork")
+diamonds <- diamonds
+a <- ggplot(economics, aes(date, unemploy))
+a + geom_path()
+economics %>%
+arrange(unemploy) %>%
+ggplot(aes(date, unemploy)) +
+geom_path()
+met_path <- readxl::read_xlsx("KEGG_sly_pathways.xlsx")
+node_cols <- met_path %>%
+pivot_longer(cols = c(educt, product),
+names_to = "node",
+values_to = "met") %>%
+distinct(met) %>%
+select(met, everything()) %>%
+mutate(Compound_Name = met)
+met_path <- readxl::read_xlsx("KEGG_sly_pathways.xlsx")
diff --git a/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R
index fc4c856062e174becac027134122742dd9154af3..641268852b0beaf65953ebe36dfd6c593c7cc97d 100644
--- a/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R
+++ b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R
@@ -13,47 +13,44 @@ library(broom)
 library(ggpubr)
 library(viridisLite)
 library(modelr)
-library(dlookr)
-library(imputeLCMD)
+#library(dlookr)
+#library(imputeLCMD)
 library(ggrepel)
-here::i_am("workflows/GC_MS_normalization/210927_primary_normalization_with_split.R")
-library(here)
-
-# Directory setting -------------------------------------------------------
 
 
-current <- getwd()
-source <- str_c(current,"/..")
+# Directory setting -------------------------------------------------------
 
-cur_date <- str_c(str_replace_all(Sys.Date(),"^.{2}|-",""))
+here::i_am("workflows/GC_MS_normalization/210927_primary_normalization_with_split.R")
+library(here)
 
-out <- str_c(cur_date, "normalization", sep = "_")
+out <- here("runs/GC-MS normalization")
 
 if (file.exists(out)) {
   cat("The folder already exists")
 } else {
   dir.create(out)
 }
-
-out_dir <- str_c(current, out, sep = "/")
+setwd(here())# Not recommended but convenient in Rstudio to start from root
+#out_dir <- str_c(current, out, sep = "/")
 
 # Data loading ------------------------------------------------------------
-setwd(source)
 
 sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx"))
 
-sam_dat1 <- read_csv("210812_cmQTL_val1_samplelist.csv", col_types = "ffidficcccfccccc")
-GC_run1 <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 5)
+#sam_dat1 <- read_csv("210812_cmQTL_val1_samplelist.csv", col_types = "ffidficcccfccccc")
+#GC_run1 <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 5)
 
-sam_dat2 <- read_csv("210812_cmQTL_val2_samplelist.csv", col_types = "ccfdifiiiff")
-GC_run2 <- readxl::read_xlsx("210324_WIJESI-130121-15_cmQTL_validation2.xlsx", sheet = 6)
+#sam_dat2 <- read_csv("210812_cmQTL_val2_samplelist.csv", col_types = "ccfdifiiiff")
+#GC_run2 <- readxl::read_xlsx("210324_WIJESI-130121-15_cmQTL_validation2.xlsx", sheet = 6)
 
-genotypes <- readxl::read_xlsx("Genotype_names.xlsx") %>%
-  mutate(plantline = as_factor(plantline))
+isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1)
+isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2)
+#genotypes <- readxl::read_xlsx("Genotype_names.xlsx") %>%
+#  mutate(plantline = as_factor(plantline))
 
-GC_machine_nums <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 6)
+#GC_machine_nums <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 6)
 
-setwd(current)
+#setwd(current)
 
 take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31",
                 "glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57",
@@ -63,13 +60,11 @@ take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160
 exclude_samples <- c("21106rA_31", "21107rA_54", "21109rA_59", "21109rA_86", "21109rA_78")
 exclude_mets <- c("psicose_103_217_rt9.38", "glutamic_acid_246_363_rt8.31", "lactic_acid_117_219_rt3.07")#glu wrong peak
 
-area1 <- readxl::read_xls("210914_cmQTL_val_1_2_fruits_seq_file_20210914143103_comp_file_area_rt1.bkt.xls", na = c("", "N/A"))
-area2 <- readxl::read_xls("210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls", na = c("", "N/A"))
-area3 <- readxl::read_xls("210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls", na = c("", "N/A"))
-
-
-
+area1 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_seq_file_20210914143103_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
+area2 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
+area3 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
 
+#Add primary metabolite MAF
 metdat_GC_class <- readxl::read_xlsx("H:/3. cmQTL mapping/Ath_Dark_Light_GC_Xcal/current_source_files/210118_primary_metabolites_classification.xlsx") %>% 
   select(component = Xcal_name_xreport, Compound_Name = HMDB_clear_name, Compound_Class = ChEBI_Ontology_dense)%>% 
   mutate(RT_mean = str_extract(component, "\\d+\\.\\d+$")) %>% 
@@ -111,6 +106,16 @@ sam_vars <- c("plantline", "alias", "LIMS_ID",
               "extraction_num", "sample_num", "machine_num_GC",
               "class", "run_num_GC", "sample_weight", "exp", "genotype")
 
+sam_dat1_tidy <- sam_dat1 %>% 
+  mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}"))) %>% 
+  rename(source_name = `Source Name`) %>% 
+  rename_with(.cols = everything(), .fn = ~str_remove(string = .x, pattern = "Parameter \\[")) %>% 
+  rename_with(.cols = everything(), .fn = ~str_remove(string = .x, pattern = "Characteristic \\[")) %>% 
+  rename_with(.cols = everything(), .fn = ~str_remove(string = .x, pattern = "Factor \\[")) %>% 
+  rename_with(.cols = everything(), .fn = ~str_remove(string = .x, pattern = "\\]")) %>% 
+  rename_with(.cols = everything(), .fn = ~str_replace_all(string = .x, pattern = "[:blank:]", replacement = "_"))
+  
+
 sam_dat1_tidy <- GC_run1 %>% 
   left_join(GC_machine_nums) %>% 
   select(extraction_num = `Sample name`, everything())%>%