diff --git a/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/prop/0AF9DF01 b/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/prop/0AF9DF01 new file mode 100644 index 0000000000000000000000000000000000000000..21b544991eb33a2d9039aeda0c4976b754140725 --- /dev/null +++ b/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/prop/0AF9DF01 @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "41,34", + "scrollLine": "24" +} \ No newline at end of file diff --git a/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/prop/INDEX b/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/prop/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..87046b9700b78bd21f80a92c76154313238eb89e --- /dev/null +++ b/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/prop/INDEX @@ -0,0 +1 @@ +H%3A%2F1.%20cmQTL%20validation%2FcmQTL_val1_arc%2Fruns%2FGC_MS_normalization%2F210927_primary_normalization_with_split.R="0AF9DF01" diff --git a/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/session-6e7b8359/78975B07 b/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/session-6e7b8359/78975B07 new file mode 100644 index 0000000000000000000000000000000000000000..2098d4826030caeaf648863b57914f1e494462df --- /dev/null +++ b/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/session-6e7b8359/78975B07 @@ -0,0 +1,26 @@ +{ + "id": "78975B07", + "path": "H:/1. cmQTL validation/cmQTL_val1_arc/runs/GC_MS_normalization/210927_primary_normalization_with_split.R", + "project_path": "210927_primary_normalization_with_split.R", + "type": "r_source", + "hash": "2490589321", + "contents": "", + "dirty": false, + "created": 1674232784622.0, + "source_on_save": false, + "relative_order": 1, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "41,34", + "scrollLine": "24" + }, + "folds": "", + "lastKnownWriteTime": 1674233264, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1674233264176, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/session-6e7b8359/78975B07-contents b/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/session-6e7b8359/78975B07-contents new file mode 100644 index 0000000000000000000000000000000000000000..1fa012bb31cc8cf61c020d075e5ebe0dc15072d2 --- /dev/null +++ b/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/session-6e7b8359/78975B07-contents @@ -0,0 +1,1123 @@ +rm(list = ls()) +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +getwd() +current <- getwd() +source <- ".." + + +library(openxlsx) +library(tidyverse) +library(car) +library(pheatmap) +library(broom) +library(ggpubr) +library(viridisLite) +library(modelr) +library(dlookr) +library(imputeLCMD) +library(ggrepel) +library(here) + +# Directory setting ------------------------------------------------------- + + +current <- getwd() +source <- str_c(current,"/..") + +cur_date <- str_c(str_replace_all(Sys.Date(),"^.{2}|-","")) + +out <- str_c(cur_date, "normalization", sep = "_") + +if (file.exists(out)) { + cat("The folder already exists") +} else { + dir.create(out) +} + +out_dir <- str_c(current, out, sep = "/") + +# Data loading ------------------------------------------------------------ +setwd(source) + +sam_dat1 <- readxl::read_xlsx(here) + +sam_dat1 <- read_csv("210812_cmQTL_val1_samplelist.csv", col_types = "ffidficcccfccccc") +GC_run1 <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 5) + +sam_dat2 <- read_csv("210812_cmQTL_val2_samplelist.csv", col_types = "ccfdifiiiff") +GC_run2 <- readxl::read_xlsx("210324_WIJESI-130121-15_cmQTL_validation2.xlsx", sheet = 6) + +genotypes <- readxl::read_xlsx("Genotype_names.xlsx") %>% + mutate(plantline = as_factor(plantline)) + +GC_machine_nums <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 6) + +setwd(current) + +take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31", + "glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57", + "pyroglutamic_acid_156_258_rt8.30", "sucrose_437_361_rt13.77", "sucrose2_204_361_rt13.79", "citric_acid_273_375_rt9.72", + "arginine_157_256_rt9.92") + +exclude_samples <- c("21106rA_31", "21107rA_54", "21109rA_59", "21109rA_86", "21109rA_78") +exclude_mets <- c("psicose_103_217_rt9.38", "glutamic_acid_246_363_rt8.31", "lactic_acid_117_219_rt3.07")#glu wrong peak + +area1 <- readxl::read_xls("210914_cmQTL_val_1_2_fruits_seq_file_20210914143103_comp_file_area_rt1.bkt.xls", na = c("", "N/A")) +area2 <- readxl::read_xls("210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls", na = c("", "N/A")) +area3 <- readxl::read_xls("210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls", na = c("", "N/A")) + + + + +metdat_GC_class <- readxl::read_xlsx("H:/3. cmQTL mapping/Ath_Dark_Light_GC_Xcal/current_source_files/210118_primary_metabolites_classification.xlsx") %>% + select(component = Xcal_name_xreport, Compound_Name = HMDB_clear_name, Compound_Class = ChEBI_Ontology_dense)%>% + mutate(RT_mean = str_extract(component, "\\d+\\.\\d+$")) %>% + filter(!is.na(component)) %>% + group_by(Compound_Name) %>% + mutate(peak_no = rank(RT_mean), + Compound_Name = if_else(duplicated(Compound_Name), + str_c(Compound_Name, "peak", peak_no, sep = "_"), + Compound_Name)) + +area <- area1 %>% + bind_rows(area2, area3) %>% + select(component, area, machine_num_GC = machine_num,rt) %>% + mutate(area = as.numeric(area), + rt = as.numeric(rt)) + +rt_mean <- area %>% + group_by(component) %>% + summarise(RT_mean = mean(rt, na.rm = T)) + +metdat_GC_class <- readxl::read_xlsx("H:/3. cmQTL mapping/Shared_source_files/210118_primary_metabolites_classification.xlsx") %>% + select(component = Xcal_name_xreport, Compound_Name = HMDB_clear_name, Compound_Class = ChEBI_Ontology_dense)%>% + left_join(rt_mean) %>% + filter(!is.na(component)) %>% + arrange(Compound_Name, RT_mean) %>% + group_by(Compound_Name) %>% + mutate(peak_no = rank(RT_mean), + Compound_Name = if_else(duplicated(Compound_Name), + str_c(Compound_Name, "peak", peak_no, sep = "_"), + Compound_Name)) + + +# Data combination -------------------------------------------------------- + + + +sam_vars <- c("plantline", "alias", "LIMS_ID", + "treatment", "tissue", "batch_GC", "run_date_GC", + "extraction_num", "sample_num", "machine_num_GC", + "class", "run_num_GC", "sample_weight", "exp", "genotype") + +sam_dat1_tidy <- GC_run1 %>% + left_join(GC_machine_nums) %>% + select(extraction_num = `Sample name`, everything())%>% + mutate(class = as_factor(if_else(str_detect(extraction_num, "run_qc"), "run_qc", + if_else(str_detect(extraction_num, "blank"), "blank", "sample"))), + extraction_num = as.numeric(if_else(str_detect(extraction_num, "run_qc"), "0", + if_else(str_detect(extraction_num, "blank"), "-1",extraction_num))), + exp = as_factor(1)) %>% + left_join(sam_dat1) %>% + left_join(genotypes) %>% + select(treatment = irrigation, everything()) %>% + select(all_of(sam_vars)) %>% + arrange(run_num_GC) + +sam_dat2_tidy <- GC_run2 %>% + left_join(GC_machine_nums) %>% + select(extraction_num = `Sample name`, everything()) %>% + mutate(class = as_factor(if_else(str_detect(extraction_num, "run_qc"), "run_qc", + if_else(str_detect(extraction_num, "blank"), "blank", "sample"))), + extraction_num = as.numeric(if_else(str_detect(extraction_num, "run_qc"), "0", + if_else(str_detect(extraction_num, "blank"), "-1",extraction_num))), + exp = as_factor(2)) %>% + left_join(sam_dat2) %>% + left_join(genotypes) %>% + select(treatment = irrigation, everything()) %>% + select(all_of(sam_vars)) %>% + arrange(run_num_GC) + +sam_dat <- sam_dat1_tidy %>% + bind_rows(sam_dat2_tidy) %>% + arrange(run_num_GC) %>% + group_by(batch_GC) %>% + mutate(daily_num = row_number()) %>% + fill(tissue, .direction = "updown") %>% + ungroup() %>% + left_join(genotypes) %>% + mutate(treatment = as_factor(treatment)) + +area_long <- area %>% + left_join(sam_dat) %>% + filter(!is.na(exp)) + +area <- area_long %>% + left_join(metdat_GC_class) %>% + filter(!str_detect(component, "^\\!|FAME|component|empty"), !is.na(component), + !component %in% exclude_mets) + +met_dat = area %>% + distinct(RT_mean, Compound_Name, Compound_Class, component) %>% + mutate(met = str_c("m", row_number(), sep = "_")) + +area <- area %>% + left_join(met_dat) %>% + mutate(area = as.numeric(area)) + +setwd(out_dir) + +# PCA to justify filtering ------------------------------------------------ + +features_pca <- area %>% + group_by(met) %>% + mutate(imp = if_else(is.na(area), 0.5*min(area, na.rm = T),area), + batch_pca = str_extract(batch_GC, "\\d+")) %>% + ungroup() + +pca_base <- features_pca %>% + ungroup() %>% + filter(exp == 1) %>% + arrange(run_num_GC) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp) %>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + left_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_GC= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = batch_GC)) + + stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) + + geom_text_repel(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21106rA_31"| machine_num_GC == "21107rA_54", machine_num_GC, NULL)), box.padding = 1) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_treatment_loess_outlier_exp1_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +pca_base <- features_pca %>% + ungroup() %>% + filter(exp == 2) %>% + arrange(batch_GC, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp) %>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + left_join(pca_base %>% + arrange(batch_GC, daily_num) %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_GC= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + geom_label_repel(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC %in% exclude_samples, machine_num_GC, NULL)),force_pull = -0.1) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_treatment_loess_outlier_exp2_imp_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +# Data filtering ---------------------------------------------------------- + + +set_back_NA <- function(x){ + for (i in seq_along(x)) { + if (x[[i]] == 0) { + x[[i]] <- NA + } else { + x[[i]] <- x[[i]] + } + } + x +} + +area_long <- area %>% + filter(!machine_num_GC %in% exclude_samples) + + +# Imputation -------------------------------------------------------------- + +features_na <- area_long #%>% + #group_by(met) %>% + #mutate(area = set_back_NA(area)) %>% + #ungroup() + +missingness <- features_na %>% + group_by(exp, met, tissue, treatment) %>% + summarise(na = sum(is.na(area)), + n = n()) %>% + mutate(percent_na = na/n*100) %>% + ungroup() + +features <- features_na %>% + left_join(missingness) %>% + group_by(met) %>% + #mutate(area = ifelse(percent_na >= 0 & is.na(area), rnorm(n = 1, mean = 0.5*min(area, na.rm = T)), area)) %>% + ungroup() + + +features_filtered <- area_long %>% + group_by(met) %>% + mutate(imp = if_else(is.na(area), 0.5*min(area, na.rm = T),area), + batch_pca = str_extract(batch_GC, "\\d+")) %>% + ungroup() %>% + left_join(missingness) #%>% + #filter(percent_na <= 60) + + +# Internal standard check ------------------------------------------------- + +features_filtered %>% + filter(str_detect(Compound_Name, "ribitol")) %>% + mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)), + is_miss = as_factor(if_else(is.na(area), T, F))) %>% + filter(class!="blank") %>% + ggplot(aes(x=run_num_GC, y = log2(imp))) + + geom_point(aes(color = class, shape = is_miss)) + + geom_vline(aes(xintercept = xint)) + + facet_grid(rows = vars(Compound_Name)) + +# Internal Standard Normalization ----------------------------------------- + +isvec <- features_filtered %>% + filter(component == "ribitol_217_rt8.20") %>% + select(machine_num_GC, is = imp) + +features <- features_filtered %>% + full_join(isvec) %>% + mutate(isnorm = imp/is) %>% + filter(str_detect(Compound_Name, "ribitol", negate = T)) %>% + mutate(sample_weight = if_else(sample_weight == 0|is.na(sample_weight), 50, sample_weight)) + + +# Fit linear model on QCs ------------------------------------------------- + +by_batch_GC <- features %>% + filter(class=="run_qc") %>% + group_by(batch_GC, met) %>% # need to change treatment_batch_GC_corr + nest() + +feature_model <- function(df) { + lm(isnorm ~ daily_num, data = df) +} + + +by_batch_GC <- by_batch_GC %>% + mutate(model = map(data, feature_model), + predicts = map2(data, model, add_predictions), + coefficients = map(model, tidy), + aug = map(model, augment), + glance = map (model, glance)) + +coefs_aug <- unnest(by_batch_GC, aug) %>% + select(batch_GC, met, .resid, daily_num) + +coefs_glance <- unnest(by_batch_GC,glance) %>% + select(batch_GC, met, adj.r.squared) + +coefs_term <- unnest(by_batch_GC, coefficients) %>% + pivot_wider(id_cols = c(batch_GC, met), + names_from = "term", + values_from = estimate) %>% + rename(x = daily_num, + intercept = `(Intercept)`) + + +coefs_pvalue <- by_batch_GC %>% + select(batch_GC, met, coefficients) %>% + ungroup() %>% + unnest(coefficients) %>% + pivot_wider(id_cols = c(batch_GC, met), + names_from = "term", + values_from = p.value) %>% + rename(intercept_pval = `(Intercept)`, + daily_num_pval = daily_num) %>% + full_join(features) %>% + full_join(coefs_term) %>% + #left_join(coefs_aug) %>% + full_join(coefs_glance) + +median <- features %>% + filter(class=="run_qc") %>% + group_by(batch_GC, met) %>% + summarise(median = median (isnorm)) + +features_lin <- unnest(by_batch_GC, predicts) %>% + full_join(coefs_pvalue) %>% + full_join(median) %>% + mutate(pred = intercept + x*daily_num, + predp = ifelse(daily_num_pval <=0.05 & adj.r.squared >= 0.75, pred, median), + predp = ifelse(is.na(predp), median, predp)) %>% + select(- c("data", "model", "coefficients")) + +adjust_lin <- features_lin %>% + ungroup() %>% + group_by(met) %>% + summarise(lin_offset = if_else(any(predp <0), 1.001*abs(min(predp)),0)) + +features_lin <- features_lin %>% + ungroup() %>% + full_join(adjust_lin) %>% + full_join(features) %>% + mutate(linnorm = (isnorm+lin_offset)/(predp+lin_offset), + linnorm_fw = linnorm/sample_weight, + linnorm_fw_log = log2(linnorm_fw)) + + + + +#Fit loess-model for QC-RLSC batch_GCwise#### +rejoin <- features +features <- rejoin + +QC_loess <- features %>% + ungroup() %>% + filter(class == "run_qc") %>% + #filter(met!="m_44") %>% + group_by(met, batch_GC) %>% + nest() +#revert span back to 1.5 later +loess_model <- function (df) { + loess(isnorm ~ daily_num, span = 1.5, data = df,control = loess.control(surface = "direct")) +} + +start <- Sys.time() +QC_loess <- QC_loess %>% + mutate(model = map (data, loess_model), + daily_num = list(seq(1,101,1)), + predict = map2(model,daily_num,stats::predict)) +end <- Sys.time() +end-start + +features_loess <- QC_loess %>% + unnest(c(predict, daily_num, batch_GC)) %>% + select(-model, -data) %>% + full_join(rejoin) %>% + filter(!is.na(machine_num_GC)) + +adjust <- features_loess %>% + ungroup() %>% + group_by(met) %>% + summarise(offset = if_else (any(predict <0), 1.001*abs(min(predict, na.rm = T)),0)) + +features_loess <- features_loess %>% + left_join(adjust) %>% + mutate(loess_norm = (isnorm+offset)/(predict+offset), + loess_norm_fw = loess_norm/sample_weight, + loess_norm_fw_log = log2(loess_norm_fw))%>% + ungroup() + +features_all <- features_lin %>% + left_join(features_loess) %>% + mutate(batch_pca = str_extract(batch_GC, "\\d+")) + + +# Relative log abundance plots -------------------------------------------- + + + + +features_all %>% + filter(!is.na(genotype)) %>% + group_by(met, treatment) %>% + mutate(sub = loess_norm_fw, + rla = log2(sub) - median(log2(sub))) %>% + ggplot(aes(x = met, y = rla)) + + geom_boxplot() + + facet_grid(rows = vars(treatment), scales = "free")+ + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + +features_all %>% + #filter(!is.na(taxa)) %>% + filter(treatment == 1, class == "sample") %>% + mutate(run_num_GC = as_factor(run_num_GC), + batch_GC = as_factor(batch_GC)) %>% + group_by(met, treatment) %>% + mutate(rla_imp = log2(imp) - median(log2(imp)), + rla_isnorm = log2(isnorm) - median(log2(isnorm)), + rla_loess_norm = log2(loess_norm_fw) - median(log2(loess_norm_fw))) %>% + pivot_longer(starts_with("rla"), names_to = "normalization", values_to = "rla") %>% + ggplot(aes(x = met, y = rla)) + + geom_boxplot() + + facet_grid(rows = vars(normalization), cols = vars(treatment), scales = "free")+ + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + ylim(c(-2,2)) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_2_GC_met_rla_plot.jpg", + sep = "_")) + +features_all %>% + #filter(!is.na(taxa)) %>% + filter(class == "sample") %>% + mutate(run_num_GC = as_factor(run_num_GC), + batch_GC = as_factor(batch_GC)) %>% + group_by(met, treatment) %>% + mutate(rla_imp = log2(imp) - median(log2(imp)), + rla_isnorm = log2(isnorm) - median(log2(isnorm)), + rla_loess_norm = log2(loess_norm_fw) - median(log2(loess_norm_fw))) %>% + pivot_longer(starts_with("rla"), names_to = "normalization", values_to = "rla") %>% + ggplot(aes(x = run_num_GC, y = rla, color = batch_GC)) + + geom_boxplot() + + facet_grid(rows = vars(normalization), scales = "free")+ + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + ylim(c(-1,1)) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_2_samples_rla_plot.jpg", + sep = "_")) + + +# RSD estimation ---------------------------------------------------------- + + + + +RSD_bio_reps <- features_all %>% + filter(class == "sample") %>% + #filter(!is.na(taxa)) %>% + group_by(met) %>% + #filter((any(qc_imputed) == T)==F) %>% + group_by(met, treatment, genotype, tissue, exp) %>% + summarise(RSD_imp = sd(imp)/mean(imp), + RSD_isnorm = sd(isnorm) / mean(isnorm), + RSD_linnorm = abs(sd (linnorm_fw)/ mean(linnorm_fw)), + RSD_loess_norm = abs(sd(loess_norm_fw) / mean(loess_norm_fw))) + + +RSD_bio_reps_mean <- RSD_bio_reps %>% + #mutate(RSD_diff = RSD_imp-RSD_loess) %>% + ungroup() %>% + #group_by(treatment, tissue, exp) %>% + summarise(#mean_RSD_loess = mean(RSD_loess), + mean_imp = mean(na.omit(RSD_imp)), + mean_loess_norm = mean(na.omit(RSD_loess_norm)), + mean_RSD_isnorm = mean(na.omit(RSD_isnorm )), + mean_linnorm = mean(na.omit(RSD_linnorm))) + +RSD_qcs <- features_all %>% + filter(class == "run_qc") %>% + group_by(met) %>% + #filter((any(qc_imputed) == T)==F) %>% + group_by(met, exp) %>% + summarise(RSD_imp = sd(imp)/mean(imp), + RSD_isnorm = sd(isnorm) / mean(isnorm), + RSD_linnorm = abs(sd (linnorm_fw)/ mean(linnorm_fw)), + RSD_loess_norm = abs(sd(loess_norm_fw) / mean(loess_norm_fw))) + +RSD_qcs_mean <- RSD_qcs %>% + #mutate(RSD_diff = RSD_imp-RSD_loess) %>% + ungroup() %>% + group_by(exp) %>% + summarise(#mean_RSD_loess = mean(RSD_loess), + mean_imp = mean(na.omit(RSD_imp)), + mean_loess_norm = mean(na.omit(RSD_loess_norm)), + mean_RSD_isnorm = mean(na.omit(RSD_isnorm )), + mean_linnorm = mean(na.omit(RSD_linnorm))) + + + +#PCA testing chunk#### +##https://www.intechopen.com/books/metabolomics-fundamentals-and-applications/processing-and-visualization-of-metabolomics-data-using-r +#exp1 +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 1) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = linnorm_fw_log) %>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + left_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_GC= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = tissue)) + + stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_tissue_lin_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = treatment)) + + stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_treatment_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +#PCA after loess normalization + +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 1) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = loess_norm_fw_log) %>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = tissue)) + + stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_tissue_loess_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = treatment)) + + stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_treatment_loess_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + + + +#PCA after loess normalization exp2 + +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 2) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(isnorm)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = batch_GC)) + + stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC %in% exclude_samples, machine_num_GC, NULL))) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +# Per tissue loess pca ---------------------------------------------------- + + +tissues <- c("leaves", "fruits") +for(tiss in seq_along(tissues)){ + + pca_base <- features_all %>% + ungroup() %>% + filter(exp == 1, tissue == tissues[[tiss]]) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment,batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = loess_norm_fw_log) %>% + filter(!is.na(m_1)) + + pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + + sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + + pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + + exp_var <- as_tibble(pca[["importance"]]) + + pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = treatment)) + + stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + + ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1", tissues[[tiss]], "treatment_loess_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) +} + + +# PCA before normalization + +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 1) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_pca, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp)%>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_pca), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = tissue)) + + stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, NULL)), + nudge_y = -2, + nudge_x = 4) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_tissue_imp_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = batch_GC)) + + stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, NULL)), + nudge_y = -2, + nudge_x = 4) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_batch_imp_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, NULL)), + nudge_y = -2, + nudge_x = 4) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_class_imp_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +#cmQTLval2 +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 2, batch_GC == "3_split") %>% + arrange(batch_GC, daily_num) %>% + mutate(log_imp = log2(isnorm)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp)%>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21109rA_59", machine_num_GC, NULL)), + nudge_y = -2, + nudge_x = 4) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val2_class_isnorm_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +# Plot all metabolites ---------------------------------------------------- + +plotmets <- features_all %>% distinct(met) %>% as_vector +plotmet_labs <- plotmets %>% as_tibble() %>% + left_join(met_dat, by= c("value" = "met")) %>% + #mutate(peak_num = base::rank(HMDB_clear_name, ties.method = "first"), + # dup = sum(peak_num), + # HMDB_clear_name_unique = if_else(dup>1, str_c(HMDB_clear_name, peak_num), HMDB_clear_name)) %>% + #ungroup() %>% + select(Compound_Name) %>% as_vector() + +plot_out <- vector("list", length = length(plotmets)) + +for (meta in seq_along(plotmets)) { + + plot_out [[meta]] <- features_all %>% + mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)), + is_miss = as_factor(if_else(is.na(area), T, F))) %>% + filter(met == plotmets[[meta]]) %>% + filter(class!="blank") %>% + ggplot(aes(x=run_num_GC, y=isnorm)) + + geom_point(aes(color = class, shape = is_miss)) + + geom_point(aes(y=predict), color="black", size=0.1) + + geom_vline(aes(xintercept = xint))+ + # facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") + + ggtitle(label = plotmet_labs[[meta]]) + +} + +pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_loess_fit.pdf")) + + +for (meta in seq_along(plotmets)) { + print(plot_out[[meta]]) +} + +dev.off() + +plot_out <- vector("list", length = length(plotmets)) + +for (meta in seq_along(plotmets)) { + + plot_out [[meta]] <- features_all %>% + mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC))) %>% + filter(met == plotmets[[meta]]) %>% + filter(class!="blank") %>% + ggplot(aes(x=run_num_GC, y=loess_norm_fw_log)) + + geom_point(aes(color=class)) + + #geom_point(aes(y=predp), color="black", size=0.1) + + geom_vline(aes(xintercept = xint))+ + # facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") + + ggtitle(label = plotmet_labs[[meta]]) + +} + +pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_loess_norm_fw_log.pdf")) + + +for (meta in seq_along(plotmets)) { + print(plot_out[[meta]]) +} + +dev.off() + +for (meta in seq_along(plotmets)) { + + plot_out [[meta]] <- features_all %>% + mutate(is_miss = as_factor(if_else(is.na(area), T, F))) %>% + filter(met == plotmets[[meta]]) %>% + filter(class!="blank") %>% + ggplot(aes(x=daily_num, y=isnorm)) + + geom_point(aes(color = class, shape = is_miss)) + + geom_point(aes(y=predict), color="black", size=0.1) + + facet_wrap(facets = vars(batch_GC), nrow = 1, scales = "free") + + ggtitle(label = plotmet_labs[[meta]]) + + theme(axis.ticks.y = element_blank(), + axis.text.y = element_blank(), + panel.spacing = unit(-0.1,"lines")) + +} + +pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_loess_norm_no_scale.pdf"), + height = 3.5) + + +for (meta in seq_along(plotmets)) { + print(plot_out[[meta]]) +} + +dev.off() + +for (meta in seq_along(plotmets)) { + + plot_out [[meta]] <- features_all %>% + mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)), + is_miss = as_factor(if_else(is.na(area), T, F))) %>% + filter(met == plotmets[[meta]]) %>% + filter(class!="blank") %>% + ggplot(aes(x=run_num_GC, y=isnorm)) + + geom_point(aes(color = class, shape = is_miss)) + + geom_point(aes(y=predp), color="black", size=0.1) + + geom_vline(aes(xintercept = xint))+ + # facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") + + ggtitle(label = plotmet_labs[[meta]]) + +} + +pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_linnorm.pdf")) + +for (meta in seq_along(plotmets)) { + print(plot_out[[meta]]) +} + +dev.off() + +rescale <- features_all %>% + group_by(met, tissue, exp) %>% + summarise(rescale = median(imp)) + +sam_vars <- c("plantline", "alias", "LIMS_ID", + "treatment", "tissue", "batch_GC", "run_date_GC", + "extraction_num", "sample_num", "machine_num_GC", + "class", "run_num_GC", "sample_weight", "exp", "genotype") + + +features_out <- features_all %>% + left_join(rescale) %>% + filter(class != "blank", + (component %in% take_split & str_detect(batch_GC, "split"))| + (!component %in% take_split & str_detect(batch_GC, "split", negate = T))| + tissue == "leaves") %>% + group_by(met, tissue, exp) %>% + mutate(loess_norm_med = loess_norm_fw/median(loess_norm_fw), + rescaled = loess_norm_med*rescale) %>% + select(all_of(sam_vars), met, Compound_Name, Compound_Class, loess_norm_fw, loess_norm_med, area, rescaled) + +#features_out %>% +# filter(met == "m_02177", tissue == "fruits", exp == 1) %>% +# ggplot(aes(x = run_num_GC, y = loess_norm_fw)) + +# geom_point() + +# Write files ------------------------------------------------------------- + + + +write_csv(features_out, + str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val_1_2_feat_dat_GC.csv", + sep = "_")) + +write_csv(met_dat, + str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val_1_2_met_dat_GC.csv", + sep = "_")) + +write_csv(sam_dat, + str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val_1_2_sam_dat_GC.csv", + sep = "_")) + +write_csv(missingness, + str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val_1_2_miss_GC.csv", + sep = "_")) + +# Log used code ------------------------------------------------------------ + +file_name <- sys.frame(1)$ofile + +file.copy(file_name, + to = str_c(out_dir, str_remove(file_name, current), "_", str_replace_all(Sys.Date(),"^.{2}|-",""), ".R"),overwrite = T) diff --git a/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/session-6e7b8359/lock_file b/workflows/GC_MS_normalization/.Rproj.user/DF293CF6/sources/session-6e7b8359/lock_file new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/workflows/GC_MS_normalization/.Rproj.user/shared/notebooks/patch-chunk-names b/workflows/GC_MS_normalization/.Rproj.user/shared/notebooks/patch-chunk-names new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R new file mode 100644 index 0000000000000000000000000000000000000000..fc4c856062e174becac027134122742dd9154af3 --- /dev/null +++ b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R @@ -0,0 +1,1124 @@ +rm(list = ls()) +#setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +#getwd() +#current <- getwd() +#source <- ".." + + +library(openxlsx) +library(tidyverse) +library(car) +library(pheatmap) +library(broom) +library(ggpubr) +library(viridisLite) +library(modelr) +library(dlookr) +library(imputeLCMD) +library(ggrepel) +here::i_am("workflows/GC_MS_normalization/210927_primary_normalization_with_split.R") +library(here) + +# Directory setting ------------------------------------------------------- + + +current <- getwd() +source <- str_c(current,"/..") + +cur_date <- str_c(str_replace_all(Sys.Date(),"^.{2}|-","")) + +out <- str_c(cur_date, "normalization", sep = "_") + +if (file.exists(out)) { + cat("The folder already exists") +} else { + dir.create(out) +} + +out_dir <- str_c(current, out, sep = "/") + +# Data loading ------------------------------------------------------------ +setwd(source) + +sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx")) + +sam_dat1 <- read_csv("210812_cmQTL_val1_samplelist.csv", col_types = "ffidficcccfccccc") +GC_run1 <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 5) + +sam_dat2 <- read_csv("210812_cmQTL_val2_samplelist.csv", col_types = "ccfdifiiiff") +GC_run2 <- readxl::read_xlsx("210324_WIJESI-130121-15_cmQTL_validation2.xlsx", sheet = 6) + +genotypes <- readxl::read_xlsx("Genotype_names.xlsx") %>% + mutate(plantline = as_factor(plantline)) + +GC_machine_nums <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 6) + +setwd(current) + +take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31", + "glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57", + "pyroglutamic_acid_156_258_rt8.30", "sucrose_437_361_rt13.77", "sucrose2_204_361_rt13.79", "citric_acid_273_375_rt9.72", + "arginine_157_256_rt9.92") + +exclude_samples <- c("21106rA_31", "21107rA_54", "21109rA_59", "21109rA_86", "21109rA_78") +exclude_mets <- c("psicose_103_217_rt9.38", "glutamic_acid_246_363_rt8.31", "lactic_acid_117_219_rt3.07")#glu wrong peak + +area1 <- readxl::read_xls("210914_cmQTL_val_1_2_fruits_seq_file_20210914143103_comp_file_area_rt1.bkt.xls", na = c("", "N/A")) +area2 <- readxl::read_xls("210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls", na = c("", "N/A")) +area3 <- readxl::read_xls("210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls", na = c("", "N/A")) + + + + +metdat_GC_class <- readxl::read_xlsx("H:/3. cmQTL mapping/Ath_Dark_Light_GC_Xcal/current_source_files/210118_primary_metabolites_classification.xlsx") %>% + select(component = Xcal_name_xreport, Compound_Name = HMDB_clear_name, Compound_Class = ChEBI_Ontology_dense)%>% + mutate(RT_mean = str_extract(component, "\\d+\\.\\d+$")) %>% + filter(!is.na(component)) %>% + group_by(Compound_Name) %>% + mutate(peak_no = rank(RT_mean), + Compound_Name = if_else(duplicated(Compound_Name), + str_c(Compound_Name, "peak", peak_no, sep = "_"), + Compound_Name)) + +area <- area1 %>% + bind_rows(area2, area3) %>% + select(component, area, machine_num_GC = machine_num,rt) %>% + mutate(area = as.numeric(area), + rt = as.numeric(rt)) + +rt_mean <- area %>% + group_by(component) %>% + summarise(RT_mean = mean(rt, na.rm = T)) + +metdat_GC_class <- readxl::read_xlsx("H:/3. cmQTL mapping/Shared_source_files/210118_primary_metabolites_classification.xlsx") %>% + select(component = Xcal_name_xreport, Compound_Name = HMDB_clear_name, Compound_Class = ChEBI_Ontology_dense)%>% + left_join(rt_mean) %>% + filter(!is.na(component)) %>% + arrange(Compound_Name, RT_mean) %>% + group_by(Compound_Name) %>% + mutate(peak_no = rank(RT_mean), + Compound_Name = if_else(duplicated(Compound_Name), + str_c(Compound_Name, "peak", peak_no, sep = "_"), + Compound_Name)) + + +# Data combination -------------------------------------------------------- + + + +sam_vars <- c("plantline", "alias", "LIMS_ID", + "treatment", "tissue", "batch_GC", "run_date_GC", + "extraction_num", "sample_num", "machine_num_GC", + "class", "run_num_GC", "sample_weight", "exp", "genotype") + +sam_dat1_tidy <- GC_run1 %>% + left_join(GC_machine_nums) %>% + select(extraction_num = `Sample name`, everything())%>% + mutate(class = as_factor(if_else(str_detect(extraction_num, "run_qc"), "run_qc", + if_else(str_detect(extraction_num, "blank"), "blank", "sample"))), + extraction_num = as.numeric(if_else(str_detect(extraction_num, "run_qc"), "0", + if_else(str_detect(extraction_num, "blank"), "-1",extraction_num))), + exp = as_factor(1)) %>% + left_join(sam_dat1) %>% + left_join(genotypes) %>% + select(treatment = irrigation, everything()) %>% + select(all_of(sam_vars)) %>% + arrange(run_num_GC) + +sam_dat2_tidy <- GC_run2 %>% + left_join(GC_machine_nums) %>% + select(extraction_num = `Sample name`, everything()) %>% + mutate(class = as_factor(if_else(str_detect(extraction_num, "run_qc"), "run_qc", + if_else(str_detect(extraction_num, "blank"), "blank", "sample"))), + extraction_num = as.numeric(if_else(str_detect(extraction_num, "run_qc"), "0", + if_else(str_detect(extraction_num, "blank"), "-1",extraction_num))), + exp = as_factor(2)) %>% + left_join(sam_dat2) %>% + left_join(genotypes) %>% + select(treatment = irrigation, everything()) %>% + select(all_of(sam_vars)) %>% + arrange(run_num_GC) + +sam_dat <- sam_dat1_tidy %>% + bind_rows(sam_dat2_tidy) %>% + arrange(run_num_GC) %>% + group_by(batch_GC) %>% + mutate(daily_num = row_number()) %>% + fill(tissue, .direction = "updown") %>% + ungroup() %>% + left_join(genotypes) %>% + mutate(treatment = as_factor(treatment)) + +area_long <- area %>% + left_join(sam_dat) %>% + filter(!is.na(exp)) + +area <- area_long %>% + left_join(metdat_GC_class) %>% + filter(!str_detect(component, "^\\!|FAME|component|empty"), !is.na(component), + !component %in% exclude_mets) + +met_dat = area %>% + distinct(RT_mean, Compound_Name, Compound_Class, component) %>% + mutate(met = str_c("m", row_number(), sep = "_")) + +area <- area %>% + left_join(met_dat) %>% + mutate(area = as.numeric(area)) + +setwd(out_dir) + +# PCA to justify filtering ------------------------------------------------ + +features_pca <- area %>% + group_by(met) %>% + mutate(imp = if_else(is.na(area), 0.5*min(area, na.rm = T),area), + batch_pca = str_extract(batch_GC, "\\d+")) %>% + ungroup() + +pca_base <- features_pca %>% + ungroup() %>% + filter(exp == 1) %>% + arrange(run_num_GC) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp) %>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + left_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_GC= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = batch_GC)) + + stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) + + geom_text_repel(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21106rA_31"| machine_num_GC == "21107rA_54", machine_num_GC, NULL)), box.padding = 1) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_treatment_loess_outlier_exp1_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +pca_base <- features_pca %>% + ungroup() %>% + filter(exp == 2) %>% + arrange(batch_GC, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp) %>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + left_join(pca_base %>% + arrange(batch_GC, daily_num) %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_GC= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + geom_label_repel(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC %in% exclude_samples, machine_num_GC, NULL)),force_pull = -0.1) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_treatment_loess_outlier_exp2_imp_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +# Data filtering ---------------------------------------------------------- + + +set_back_NA <- function(x){ + for (i in seq_along(x)) { + if (x[[i]] == 0) { + x[[i]] <- NA + } else { + x[[i]] <- x[[i]] + } + } + x +} + +area_long <- area %>% + filter(!machine_num_GC %in% exclude_samples) + + +# Imputation -------------------------------------------------------------- + +features_na <- area_long #%>% + #group_by(met) %>% + #mutate(area = set_back_NA(area)) %>% + #ungroup() + +missingness <- features_na %>% + group_by(exp, met, tissue, treatment) %>% + summarise(na = sum(is.na(area)), + n = n()) %>% + mutate(percent_na = na/n*100) %>% + ungroup() + +features <- features_na %>% + left_join(missingness) %>% + group_by(met) %>% + #mutate(area = ifelse(percent_na >= 0 & is.na(area), rnorm(n = 1, mean = 0.5*min(area, na.rm = T)), area)) %>% + ungroup() + + +features_filtered <- area_long %>% + group_by(met) %>% + mutate(imp = if_else(is.na(area), 0.5*min(area, na.rm = T),area), + batch_pca = str_extract(batch_GC, "\\d+")) %>% + ungroup() %>% + left_join(missingness) #%>% + #filter(percent_na <= 60) + + +# Internal standard check ------------------------------------------------- + +features_filtered %>% + filter(str_detect(Compound_Name, "ribitol")) %>% + mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)), + is_miss = as_factor(if_else(is.na(area), T, F))) %>% + filter(class!="blank") %>% + ggplot(aes(x=run_num_GC, y = log2(imp))) + + geom_point(aes(color = class, shape = is_miss)) + + geom_vline(aes(xintercept = xint)) + + facet_grid(rows = vars(Compound_Name)) + +# Internal Standard Normalization ----------------------------------------- + +isvec <- features_filtered %>% + filter(component == "ribitol_217_rt8.20") %>% + select(machine_num_GC, is = imp) + +features <- features_filtered %>% + full_join(isvec) %>% + mutate(isnorm = imp/is) %>% + filter(str_detect(Compound_Name, "ribitol", negate = T)) %>% + mutate(sample_weight = if_else(sample_weight == 0|is.na(sample_weight), 50, sample_weight)) + + +# Fit linear model on QCs ------------------------------------------------- + +by_batch_GC <- features %>% + filter(class=="run_qc") %>% + group_by(batch_GC, met) %>% # need to change treatment_batch_GC_corr + nest() + +feature_model <- function(df) { + lm(isnorm ~ daily_num, data = df) +} + + +by_batch_GC <- by_batch_GC %>% + mutate(model = map(data, feature_model), + predicts = map2(data, model, add_predictions), + coefficients = map(model, tidy), + aug = map(model, augment), + glance = map (model, glance)) + +coefs_aug <- unnest(by_batch_GC, aug) %>% + select(batch_GC, met, .resid, daily_num) + +coefs_glance <- unnest(by_batch_GC,glance) %>% + select(batch_GC, met, adj.r.squared) + +coefs_term <- unnest(by_batch_GC, coefficients) %>% + pivot_wider(id_cols = c(batch_GC, met), + names_from = "term", + values_from = estimate) %>% + rename(x = daily_num, + intercept = `(Intercept)`) + + +coefs_pvalue <- by_batch_GC %>% + select(batch_GC, met, coefficients) %>% + ungroup() %>% + unnest(coefficients) %>% + pivot_wider(id_cols = c(batch_GC, met), + names_from = "term", + values_from = p.value) %>% + rename(intercept_pval = `(Intercept)`, + daily_num_pval = daily_num) %>% + full_join(features) %>% + full_join(coefs_term) %>% + #left_join(coefs_aug) %>% + full_join(coefs_glance) + +median <- features %>% + filter(class=="run_qc") %>% + group_by(batch_GC, met) %>% + summarise(median = median (isnorm)) + +features_lin <- unnest(by_batch_GC, predicts) %>% + full_join(coefs_pvalue) %>% + full_join(median) %>% + mutate(pred = intercept + x*daily_num, + predp = ifelse(daily_num_pval <=0.05 & adj.r.squared >= 0.75, pred, median), + predp = ifelse(is.na(predp), median, predp)) %>% + select(- c("data", "model", "coefficients")) + +adjust_lin <- features_lin %>% + ungroup() %>% + group_by(met) %>% + summarise(lin_offset = if_else(any(predp <0), 1.001*abs(min(predp)),0)) + +features_lin <- features_lin %>% + ungroup() %>% + full_join(adjust_lin) %>% + full_join(features) %>% + mutate(linnorm = (isnorm+lin_offset)/(predp+lin_offset), + linnorm_fw = linnorm/sample_weight, + linnorm_fw_log = log2(linnorm_fw)) + + + + +#Fit loess-model for QC-RLSC batch_GCwise#### +rejoin <- features +features <- rejoin + +QC_loess <- features %>% + ungroup() %>% + filter(class == "run_qc") %>% + #filter(met!="m_44") %>% + group_by(met, batch_GC) %>% + nest() +#revert span back to 1.5 later +loess_model <- function (df) { + loess(isnorm ~ daily_num, span = 1.5, data = df,control = loess.control(surface = "direct")) +} + +start <- Sys.time() +QC_loess <- QC_loess %>% + mutate(model = map (data, loess_model), + daily_num = list(seq(1,101,1)), + predict = map2(model,daily_num,stats::predict)) +end <- Sys.time() +end-start + +features_loess <- QC_loess %>% + unnest(c(predict, daily_num, batch_GC)) %>% + select(-model, -data) %>% + full_join(rejoin) %>% + filter(!is.na(machine_num_GC)) + +adjust <- features_loess %>% + ungroup() %>% + group_by(met) %>% + summarise(offset = if_else (any(predict <0), 1.001*abs(min(predict, na.rm = T)),0)) + +features_loess <- features_loess %>% + left_join(adjust) %>% + mutate(loess_norm = (isnorm+offset)/(predict+offset), + loess_norm_fw = loess_norm/sample_weight, + loess_norm_fw_log = log2(loess_norm_fw))%>% + ungroup() + +features_all <- features_lin %>% + left_join(features_loess) %>% + mutate(batch_pca = str_extract(batch_GC, "\\d+")) + + +# Relative log abundance plots -------------------------------------------- + + + + +features_all %>% + filter(!is.na(genotype)) %>% + group_by(met, treatment) %>% + mutate(sub = loess_norm_fw, + rla = log2(sub) - median(log2(sub))) %>% + ggplot(aes(x = met, y = rla)) + + geom_boxplot() + + facet_grid(rows = vars(treatment), scales = "free")+ + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + +features_all %>% + #filter(!is.na(taxa)) %>% + filter(treatment == 1, class == "sample") %>% + mutate(run_num_GC = as_factor(run_num_GC), + batch_GC = as_factor(batch_GC)) %>% + group_by(met, treatment) %>% + mutate(rla_imp = log2(imp) - median(log2(imp)), + rla_isnorm = log2(isnorm) - median(log2(isnorm)), + rla_loess_norm = log2(loess_norm_fw) - median(log2(loess_norm_fw))) %>% + pivot_longer(starts_with("rla"), names_to = "normalization", values_to = "rla") %>% + ggplot(aes(x = met, y = rla)) + + geom_boxplot() + + facet_grid(rows = vars(normalization), cols = vars(treatment), scales = "free")+ + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + ylim(c(-2,2)) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_2_GC_met_rla_plot.jpg", + sep = "_")) + +features_all %>% + #filter(!is.na(taxa)) %>% + filter(class == "sample") %>% + mutate(run_num_GC = as_factor(run_num_GC), + batch_GC = as_factor(batch_GC)) %>% + group_by(met, treatment) %>% + mutate(rla_imp = log2(imp) - median(log2(imp)), + rla_isnorm = log2(isnorm) - median(log2(isnorm)), + rla_loess_norm = log2(loess_norm_fw) - median(log2(loess_norm_fw))) %>% + pivot_longer(starts_with("rla"), names_to = "normalization", values_to = "rla") %>% + ggplot(aes(x = run_num_GC, y = rla, color = batch_GC)) + + geom_boxplot() + + facet_grid(rows = vars(normalization), scales = "free")+ + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + ylim(c(-1,1)) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_2_samples_rla_plot.jpg", + sep = "_")) + + +# RSD estimation ---------------------------------------------------------- + + + + +RSD_bio_reps <- features_all %>% + filter(class == "sample") %>% + #filter(!is.na(taxa)) %>% + group_by(met) %>% + #filter((any(qc_imputed) == T)==F) %>% + group_by(met, treatment, genotype, tissue, exp) %>% + summarise(RSD_imp = sd(imp)/mean(imp), + RSD_isnorm = sd(isnorm) / mean(isnorm), + RSD_linnorm = abs(sd (linnorm_fw)/ mean(linnorm_fw)), + RSD_loess_norm = abs(sd(loess_norm_fw) / mean(loess_norm_fw))) + + +RSD_bio_reps_mean <- RSD_bio_reps %>% + #mutate(RSD_diff = RSD_imp-RSD_loess) %>% + ungroup() %>% + #group_by(treatment, tissue, exp) %>% + summarise(#mean_RSD_loess = mean(RSD_loess), + mean_imp = mean(na.omit(RSD_imp)), + mean_loess_norm = mean(na.omit(RSD_loess_norm)), + mean_RSD_isnorm = mean(na.omit(RSD_isnorm )), + mean_linnorm = mean(na.omit(RSD_linnorm))) + +RSD_qcs <- features_all %>% + filter(class == "run_qc") %>% + group_by(met) %>% + #filter((any(qc_imputed) == T)==F) %>% + group_by(met, exp) %>% + summarise(RSD_imp = sd(imp)/mean(imp), + RSD_isnorm = sd(isnorm) / mean(isnorm), + RSD_linnorm = abs(sd (linnorm_fw)/ mean(linnorm_fw)), + RSD_loess_norm = abs(sd(loess_norm_fw) / mean(loess_norm_fw))) + +RSD_qcs_mean <- RSD_qcs %>% + #mutate(RSD_diff = RSD_imp-RSD_loess) %>% + ungroup() %>% + group_by(exp) %>% + summarise(#mean_RSD_loess = mean(RSD_loess), + mean_imp = mean(na.omit(RSD_imp)), + mean_loess_norm = mean(na.omit(RSD_loess_norm)), + mean_RSD_isnorm = mean(na.omit(RSD_isnorm )), + mean_linnorm = mean(na.omit(RSD_linnorm))) + + + +#PCA testing chunk#### +##https://www.intechopen.com/books/metabolomics-fundamentals-and-applications/processing-and-visualization-of-metabolomics-data-using-r +#exp1 +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 1) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = linnorm_fw_log) %>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + left_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_GC= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = tissue)) + + stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_tissue_lin_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = treatment)) + + stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_treatment_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +#PCA after loess normalization + +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 1) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = loess_norm_fw_log) %>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = tissue)) + + stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_tissue_loess_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = treatment)) + + stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_treatment_loess_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + + + +#PCA after loess normalization exp2 + +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 2) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(isnorm)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = batch_GC)) + + stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC %in% exclude_samples, machine_num_GC, NULL))) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +# Per tissue loess pca ---------------------------------------------------- + + +tissues <- c("leaves", "fruits") +for(tiss in seq_along(tissues)){ + + pca_base <- features_all %>% + ungroup() %>% + filter(exp == 1, tissue == tissues[[tiss]]) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment,batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = loess_norm_fw_log) %>% + filter(!is.na(m_1)) + + pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + + sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + + pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + + exp_var <- as_tibble(pca[["importance"]]) + + pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = treatment)) + + stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + + ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1", tissues[[tiss]], "treatment_loess_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) +} + + +# PCA before normalization + +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 1) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_pca, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp)%>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_pca), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = tissue)) + + stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, NULL)), + nudge_y = -2, + nudge_x = 4) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_tissue_imp_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = batch_GC)) + + stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, NULL)), + nudge_y = -2, + nudge_x = 4) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_batch_imp_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, NULL)), + nudge_y = -2, + nudge_x = 4) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val1_class_imp_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +#cmQTLval2 +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 2, batch_GC == "3_split") %>% + arrange(batch_GC, daily_num) %>% + mutate(log_imp = log2(isnorm)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp)%>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21109rA_59", machine_num_GC, NULL)), + nudge_y = -2, + nudge_x = 4) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val2_class_isnorm_PCA.jpg", + sep = "_"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +# Plot all metabolites ---------------------------------------------------- + +plotmets <- features_all %>% distinct(met) %>% as_vector +plotmet_labs <- plotmets %>% as_tibble() %>% + left_join(met_dat, by= c("value" = "met")) %>% + #mutate(peak_num = base::rank(HMDB_clear_name, ties.method = "first"), + # dup = sum(peak_num), + # HMDB_clear_name_unique = if_else(dup>1, str_c(HMDB_clear_name, peak_num), HMDB_clear_name)) %>% + #ungroup() %>% + select(Compound_Name) %>% as_vector() + +plot_out <- vector("list", length = length(plotmets)) + +for (meta in seq_along(plotmets)) { + + plot_out [[meta]] <- features_all %>% + mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)), + is_miss = as_factor(if_else(is.na(area), T, F))) %>% + filter(met == plotmets[[meta]]) %>% + filter(class!="blank") %>% + ggplot(aes(x=run_num_GC, y=isnorm)) + + geom_point(aes(color = class, shape = is_miss)) + + geom_point(aes(y=predict), color="black", size=0.1) + + geom_vline(aes(xintercept = xint))+ + # facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") + + ggtitle(label = plotmet_labs[[meta]]) + +} + +pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_loess_fit.pdf")) + + +for (meta in seq_along(plotmets)) { + print(plot_out[[meta]]) +} + +dev.off() + +plot_out <- vector("list", length = length(plotmets)) + +for (meta in seq_along(plotmets)) { + + plot_out [[meta]] <- features_all %>% + mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC))) %>% + filter(met == plotmets[[meta]]) %>% + filter(class!="blank") %>% + ggplot(aes(x=run_num_GC, y=loess_norm_fw_log)) + + geom_point(aes(color=class)) + + #geom_point(aes(y=predp), color="black", size=0.1) + + geom_vline(aes(xintercept = xint))+ + # facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") + + ggtitle(label = plotmet_labs[[meta]]) + +} + +pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_loess_norm_fw_log.pdf")) + + +for (meta in seq_along(plotmets)) { + print(plot_out[[meta]]) +} + +dev.off() + +for (meta in seq_along(plotmets)) { + + plot_out [[meta]] <- features_all %>% + mutate(is_miss = as_factor(if_else(is.na(area), T, F))) %>% + filter(met == plotmets[[meta]]) %>% + filter(class!="blank") %>% + ggplot(aes(x=daily_num, y=isnorm)) + + geom_point(aes(color = class, shape = is_miss)) + + geom_point(aes(y=predict), color="black", size=0.1) + + facet_wrap(facets = vars(batch_GC), nrow = 1, scales = "free") + + ggtitle(label = plotmet_labs[[meta]]) + + theme(axis.ticks.y = element_blank(), + axis.text.y = element_blank(), + panel.spacing = unit(-0.1,"lines")) + +} + +pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_loess_norm_no_scale.pdf"), + height = 3.5) + + +for (meta in seq_along(plotmets)) { + print(plot_out[[meta]]) +} + +dev.off() + +for (meta in seq_along(plotmets)) { + + plot_out [[meta]] <- features_all %>% + mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)), + is_miss = as_factor(if_else(is.na(area), T, F))) %>% + filter(met == plotmets[[meta]]) %>% + filter(class!="blank") %>% + ggplot(aes(x=run_num_GC, y=isnorm)) + + geom_point(aes(color = class, shape = is_miss)) + + geom_point(aes(y=predp), color="black", size=0.1) + + geom_vline(aes(xintercept = xint))+ + # facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") + + ggtitle(label = plotmet_labs[[meta]]) + +} + +pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_linnorm.pdf")) + +for (meta in seq_along(plotmets)) { + print(plot_out[[meta]]) +} + +dev.off() + +rescale <- features_all %>% + group_by(met, tissue, exp) %>% + summarise(rescale = median(imp)) + +sam_vars <- c("plantline", "alias", "LIMS_ID", + "treatment", "tissue", "batch_GC", "run_date_GC", + "extraction_num", "sample_num", "machine_num_GC", + "class", "run_num_GC", "sample_weight", "exp", "genotype") + + +features_out <- features_all %>% + left_join(rescale) %>% + filter(class != "blank", + (component %in% take_split & str_detect(batch_GC, "split"))| + (!component %in% take_split & str_detect(batch_GC, "split", negate = T))| + tissue == "leaves") %>% + group_by(met, tissue, exp) %>% + mutate(loess_norm_med = loess_norm_fw/median(loess_norm_fw), + rescaled = loess_norm_med*rescale) %>% + select(all_of(sam_vars), met, Compound_Name, Compound_Class, loess_norm_fw, loess_norm_med, area, rescaled) + +#features_out %>% +# filter(met == "m_02177", tissue == "fruits", exp == 1) %>% +# ggplot(aes(x = run_num_GC, y = loess_norm_fw)) + +# geom_point() + +# Write files ------------------------------------------------------------- + + + +write_csv(features_out, + str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val_1_2_feat_dat_GC.csv", + sep = "_")) + +write_csv(met_dat, + str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val_1_2_met_dat_GC.csv", + sep = "_")) + +write_csv(sam_dat, + str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val_1_2_sam_dat_GC.csv", + sep = "_")) + +write_csv(missingness, + str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), + "cmQTL_val_1_2_miss_GC.csv", + sep = "_")) + +# Log used code ------------------------------------------------------------ + +file_name <- sys.frame(1)$ofile + +file.copy(file_name, + to = str_c(out_dir, str_remove(file_name, current), "_", str_replace_all(Sys.Date(),"^.{2}|-",""), ".R"),overwrite = T)