diff --git a/.Rhistory b/.Rhistory new file mode 100644 index 0000000000000000000000000000000000000000..ca96489870cdb82792ee9fd2a44711ee29488fae --- /dev/null +++ b/.Rhistory @@ -0,0 +1,129 @@ +rm(list = ls()) +library(openxlsx) +library(tidyverse) +library(car) +library(pheatmap) +library(broom) +library(ggpubr) +library(viridisLite) +library(modelr) +#library(dlookr) +#library(imputeLCMD) +library(ggrepel) +here::i_am("workflows/GC_MS_normalization/210927_primary_normalization_with_split.R") +library(here) +out <- here("runs/GC-MS normalization") +if (file.exists(out)) { +cat("The folder already exists") +} else { +dir.create(out) +} +setwd(here())# Not recommended but convenient in Rstudio to start from root +sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx")) +isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1) +isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2) +take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31", +"glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57", +"pyroglutamic_acid_156_258_rt8.30", "sucrose_437_361_rt13.77", "sucrose2_204_361_rt13.79", "citric_acid_273_375_rt9.72", +"arginine_157_256_rt9.92") +exclude_samples <- c("21106rA_31", "21107rA_54", "21109rA_59", "21109rA_86", "21109rA_78") +exclude_mets <- c("psicose_103_217_rt9.38", "glutamic_acid_246_363_rt8.31", "lactic_acid_117_219_rt3.07")#glu wrong peak +area1 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_seq_file_20210914143103_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) +area2 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) +area3 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) +#Add primary metabolite MAF +metdat_GC_class <- readxl::read_xlsx("H:/3. cmQTL mapping/Ath_Dark_Light_GC_Xcal/current_source_files/210118_primary_metabolites_classification.xlsx") %>% +select(component = Xcal_name_xreport, Compound_Name = HMDB_clear_name, Compound_Class = ChEBI_Ontology_dense)%>% +mutate(RT_mean = str_extract(component, "\\d+\\.\\d+$")) %>% +filter(!is.na(component)) %>% +group_by(Compound_Name) %>% +mutate(peak_no = rank(RT_mean), +Compound_Name = if_else(duplicated(Compound_Name), +str_c(Compound_Name, "peak", peak_no, sep = "_"), +Compound_Name)) +area <- area1 %>% +bind_rows(area2, area3) %>% +select(component, area, machine_num_GC = machine_num,rt) %>% +mutate(area = as.numeric(area), +rt = as.numeric(rt)) +rt_mean <- area %>% +group_by(component) %>% +summarise(RT_mean = mean(rt, na.rm = T)) +metdat_GC_class <- readxl::read_xlsx("H:/3. cmQTL mapping/Shared_source_files/210118_primary_metabolites_classification.xlsx") %>% +select(component = Xcal_name_xreport, Compound_Name = HMDB_clear_name, Compound_Class = ChEBI_Ontology_dense)%>% +left_join(rt_mean) %>% +filter(!is.na(component)) %>% +arrange(Compound_Name, RT_mean) %>% +group_by(Compound_Name) %>% +mutate(peak_no = rank(RT_mean), +Compound_Name = if_else(duplicated(Compound_Name), +str_c(Compound_Name, "peak", peak_no, sep = "_"), +Compound_Name)) +sam_vars <- c("plantline", "alias", "LIMS_ID", +"treatment", "tissue", "batch_GC", "run_date_GC", +"extraction_num", "sample_num", "machine_num_GC", +"class", "run_num_GC", "sample_weight", "exp", "genotype") +sam_dat1_tidy <- sam_dat1 %>% +sam_dat1_tidy <- GC_run1 %>% +left_join(GC_machine_nums) %>% +select(extraction_num = `Sample name`, everything())%>% +mutate(class = as_factor(if_else(str_detect(extraction_num, "run_qc"), "run_qc", +if_else(str_detect(extraction_num, "blank"), "blank", "sample"))), +extraction_num = as.numeric(if_else(str_detect(extraction_num, "run_qc"), "0", +if_else(str_detect(extraction_num, "blank"), "-1",extraction_num))), +exp = as_factor(1)) %>% +left_join(sam_dat1) %>% +left_join(genotypes) %>% +select(treatment = irrigation, everything()) %>% +select(all_of(sam_vars)) %>% +arrange(run_num_GC) +sam_dat2_tidy <- GC_run2 %>% +left_join(GC_machine_nums) %>% +select(extraction_num = `Sample name`, everything()) %>% +mutate(class = as_factor(if_else(str_detect(extraction_num, "run_qc"), "run_qc", +if_else(str_detect(extraction_num, "blank"), "blank", "sample"))), +extraction_num = as.numeric(if_else(str_detect(extraction_num, "run_qc"), "0", +if_else(str_detect(extraction_num, "blank"), "-1",extraction_num))), +exp = as_factor(2)) %>% +left_join(sam_dat2) %>% +left_join(genotypes) %>% +select(treatment = irrigation, everything()) %>% +select(all_of(sam_vars)) %>% +arrange(run_num_GC) +sam_dat <- sam_dat1_tidy %>% +bind_rows(sam_dat2_tidy) %>% +arrange(run_num_GC) %>% +group_by(batch_GC) %>% +mutate(daily_num = row_number()) %>% +fill(tissue, .direction = "updown") %>% +ungroup() %>% +left_join(genotypes) %>% +mutate(treatment = as_factor(treatment)) +area_long <- area %>% +left_join(sam_dat) %>% +filter(!is.na(exp)) +area <- area_long %>% +left_join(metdat_GC_class) %>% +filter(!str_detect(component, "^\\!|FAME|component|empty"), !is.na(component), +!component %in% exclude_mets) +library(tidyverse) +library(igraph) +library(ggnetwork) +#install.packages("igraph") +#install.packages("ggnetwork") +diamonds <- diamonds +a <- ggplot(economics, aes(date, unemploy)) +a + geom_path() +economics %>% +arrange(unemploy) %>% +ggplot(aes(date, unemploy)) + +geom_path() +met_path <- readxl::read_xlsx("KEGG_sly_pathways.xlsx") +node_cols <- met_path %>% +pivot_longer(cols = c(educt, product), +names_to = "node", +values_to = "met") %>% +distinct(met) %>% +select(met, everything()) %>% +mutate(Compound_Name = met) +met_path <- readxl::read_xlsx("KEGG_sly_pathways.xlsx") diff --git a/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R index fc4c856062e174becac027134122742dd9154af3..641268852b0beaf65953ebe36dfd6c593c7cc97d 100644 --- a/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R +++ b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R @@ -13,47 +13,44 @@ library(broom) library(ggpubr) library(viridisLite) library(modelr) -library(dlookr) -library(imputeLCMD) +#library(dlookr) +#library(imputeLCMD) library(ggrepel) -here::i_am("workflows/GC_MS_normalization/210927_primary_normalization_with_split.R") -library(here) - -# Directory setting ------------------------------------------------------- -current <- getwd() -source <- str_c(current,"/..") +# Directory setting ------------------------------------------------------- -cur_date <- str_c(str_replace_all(Sys.Date(),"^.{2}|-","")) +here::i_am("workflows/GC_MS_normalization/210927_primary_normalization_with_split.R") +library(here) -out <- str_c(cur_date, "normalization", sep = "_") +out <- here("runs/GC-MS normalization") if (file.exists(out)) { cat("The folder already exists") } else { dir.create(out) } - -out_dir <- str_c(current, out, sep = "/") +setwd(here())# Not recommended but convenient in Rstudio to start from root +#out_dir <- str_c(current, out, sep = "/") # Data loading ------------------------------------------------------------ -setwd(source) sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx")) -sam_dat1 <- read_csv("210812_cmQTL_val1_samplelist.csv", col_types = "ffidficcccfccccc") -GC_run1 <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 5) +#sam_dat1 <- read_csv("210812_cmQTL_val1_samplelist.csv", col_types = "ffidficcccfccccc") +#GC_run1 <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 5) -sam_dat2 <- read_csv("210812_cmQTL_val2_samplelist.csv", col_types = "ccfdifiiiff") -GC_run2 <- readxl::read_xlsx("210324_WIJESI-130121-15_cmQTL_validation2.xlsx", sheet = 6) +#sam_dat2 <- read_csv("210812_cmQTL_val2_samplelist.csv", col_types = "ccfdifiiiff") +#GC_run2 <- readxl::read_xlsx("210324_WIJESI-130121-15_cmQTL_validation2.xlsx", sheet = 6) -genotypes <- readxl::read_xlsx("Genotype_names.xlsx") %>% - mutate(plantline = as_factor(plantline)) +isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1) +isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2) +#genotypes <- readxl::read_xlsx("Genotype_names.xlsx") %>% +# mutate(plantline = as_factor(plantline)) -GC_machine_nums <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 6) +#GC_machine_nums <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 6) -setwd(current) +#setwd(current) take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31", "glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57", @@ -63,13 +60,11 @@ take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160 exclude_samples <- c("21106rA_31", "21107rA_54", "21109rA_59", "21109rA_86", "21109rA_78") exclude_mets <- c("psicose_103_217_rt9.38", "glutamic_acid_246_363_rt8.31", "lactic_acid_117_219_rt3.07")#glu wrong peak -area1 <- readxl::read_xls("210914_cmQTL_val_1_2_fruits_seq_file_20210914143103_comp_file_area_rt1.bkt.xls", na = c("", "N/A")) -area2 <- readxl::read_xls("210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls", na = c("", "N/A")) -area3 <- readxl::read_xls("210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls", na = c("", "N/A")) - - - +area1 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_seq_file_20210914143103_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) +area2 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) +area3 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) +#Add primary metabolite MAF metdat_GC_class <- readxl::read_xlsx("H:/3. cmQTL mapping/Ath_Dark_Light_GC_Xcal/current_source_files/210118_primary_metabolites_classification.xlsx") %>% select(component = Xcal_name_xreport, Compound_Name = HMDB_clear_name, Compound_Class = ChEBI_Ontology_dense)%>% mutate(RT_mean = str_extract(component, "\\d+\\.\\d+$")) %>% @@ -111,6 +106,16 @@ sam_vars <- c("plantline", "alias", "LIMS_ID", "extraction_num", "sample_num", "machine_num_GC", "class", "run_num_GC", "sample_weight", "exp", "genotype") +sam_dat1_tidy <- sam_dat1 %>% + mutate(sample_weight = as.double(str_extract(`Factor [sample fresh weight]`, "\\d{2}\\.\\d{2}"))) %>% + rename(source_name = `Source Name`) %>% + rename_with(.cols = everything(), .fn = ~str_remove(string = .x, pattern = "Parameter \\[")) %>% + rename_with(.cols = everything(), .fn = ~str_remove(string = .x, pattern = "Characteristic \\[")) %>% + rename_with(.cols = everything(), .fn = ~str_remove(string = .x, pattern = "Factor \\[")) %>% + rename_with(.cols = everything(), .fn = ~str_remove(string = .x, pattern = "\\]")) %>% + rename_with(.cols = everything(), .fn = ~str_replace_all(string = .x, pattern = "[:blank:]", replacement = "_")) + + sam_dat1_tidy <- GC_run1 %>% left_join(GC_machine_nums) %>% select(extraction_num = `Sample name`, everything())%>%