diff --git a/.Rhistory b/.Rhistory index 2d2303bb43ca6d0e2251f2ea8cf3c122d01e3a7f..eafeceb68ebcb3a4f99f5521ef509431f3f24baf 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,512 +1,222 @@ -filter(exp == 1) %>% -arrange(batch_pca, daily_num) %>% -mutate(log_imp = log2(imp)) %>% -pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), -names_from = met, -values_from = loess_norm_fw_log) %>% -filter(!is.na(m_1)) -pca <- summary(prcomp( -pca_base%>% -select(starts_with("m_")))) -sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() -pca_plot <- as_tibble(pca$x) %>% -mutate(join_num = 1:nrow(pca$x)) %>% -full_join(pca_base %>% -mutate(join_num=1:nrow(pca$x))) %>% -#left_join(sam_dat) %>% -mutate(batch_pca= as_factor(batch_GC), -class= as_factor(class)) %>% -select(all_of(sam_vars), everything()) -exp_var <- as_tibble(pca[["importance"]]) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = tissue)) + -stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), -"cmQTL_val1_tissue_loess_PCA.jpg", -sep = "_"), -width = 183, -height = 100, -units = "mm", -dpi = 300) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = treatment)) + -stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), -"cmQTL_val1_treatment_loess_PCA.jpg", -sep = "_"), -width = 183, -height = 100, -units = "mm", -dpi = 300) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = class)) + -stat_ellipse(aes(x=PC1, y=PC2, color = class)) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -pca_base <- features_all %>% -ungroup() %>% -filter(exp == 2) %>% -arrange(batch_pca, daily_num) %>% -mutate(log_imp = log2(isnorm)) %>% -pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), -names_from = met, -values_from = log_imp) -pca <- summary(prcomp( -pca_base%>% -select(starts_with("m_")))) -sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() -pca_plot <- as_tibble(pca$x) %>% -mutate(join_num = 1:nrow(pca$x)) %>% -full_join(pca_base %>% -mutate(join_num=1:nrow(pca$x))) %>% -#left_join(sam_dat) %>% -mutate(batch_pca= as_factor(batch_GC), -class= as_factor(class)) %>% -select(all_of(sam_vars), everything()) -exp_var <- as_tibble(pca[["importance"]]) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = class)) + -stat_ellipse(aes(x=PC1, y=PC2, color = class)) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = batch_GC)) + -stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) + -geom_text(aes(x = PC1, y = PC2, -label = if_else(machine_num_GC %in% exclude_samples, machine_num_GC, NULL))) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = batch_GC)) + -stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) + -geom_text(aes(x = PC1, y = PC2, -label = if_else(machine_num_GC %in% exclude_samples, machine_num_GC, ""))) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), -"cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg", -sep = "_"), -width = 183, -height = 100, -units = "mm", -dpi = 300) -tissues <- c("leaves", "fruits") -for(tiss in seq_along(tissues)){ -pca_base <- features_all %>% -ungroup() %>% -filter(exp == 1, tissue == tissues[[tiss]]) %>% -arrange(batch_pca, daily_num) %>% -mutate(log_imp = log2(imp)) %>% -pivot_wider(id_cols = c(tissue, treatment,batch_GC, daily_num, class, machine_num_GC), -names_from = met, -values_from = loess_norm_fw_log) %>% -filter(!is.na(m_1)) -pca <- summary(prcomp( -pca_base%>% -select(starts_with("m_")))) -sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() -pca_plot <- as_tibble(pca$x) %>% -mutate(join_num = 1:nrow(pca$x)) %>% -full_join(pca_base %>% -mutate(join_num=1:nrow(pca$x))) %>% -left_join(sam_dat) %>% -mutate(batch_pca= as_factor(batch_GC), -class= as_factor(class)) %>% -select(all_of(sam_vars), everything()) -exp_var <- as_tibble(pca[["importance"]]) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = treatment)) + -stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), -"cmQTL_val1", tissues[[tiss]], "treatment_loess_PCA.jpg", -sep = "_"), -width = 183, -height = 100, -units = "mm", -dpi = 300) -} -View(features_all) -pca_base <- features_all %>% -ungroup() %>% -filter(exp == 1, tissue == tissues[[tiss]]) %>% -arrange(batch_pca, daily_num) %>% -mutate(log_imp = log2(imp)) %>% -pivot_wider(id_cols = c(tissue, treatment,batch_GC, daily_num, class, machine_num_GC), -names_from = met, -values_from = loess_norm_fw_log) %>% -filter(!is.na(m_1)) -pca_base <- features_all %>% -ungroup() %>% -filter(exp == 1, tissue == tissues[[tiss]]) %>% -arrange(batch_pca, daily_num) %>% -mutate(log_imp = log2(imp)) %>% -pivot_wider(id_cols = c(tissue, treatment,batch_GC, daily_num, class, machine_num_GC), -names_from = met, -values_from = loess_norm_fw_log) #%>% -tissues <- c("leaf", "fruit") -for(tiss in seq_along(tissues)){ -pca_base <- features_all %>% -ungroup() %>% -filter(exp == 1, tissue == tissues[[tiss]]) %>% -arrange(batch_pca, daily_num) %>% -mutate(log_imp = log2(imp)) %>% -pivot_wider(id_cols = c(tissue, treatment,batch_GC, daily_num, class, machine_num_GC), -names_from = met, -values_from = loess_norm_fw_log) %>% -filter(!is.na(m_1)) -pca <- summary(prcomp( -pca_base%>% -select(starts_with("m_")))) -sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() -pca_plot <- as_tibble(pca$x) %>% -mutate(join_num = 1:nrow(pca$x)) %>% -full_join(pca_base %>% -mutate(join_num=1:nrow(pca$x))) %>% -left_join(sam_dat) %>% -mutate(batch_pca= as_factor(batch_GC), -class= as_factor(class)) %>% -select(all_of(sam_vars), everything()) -exp_var <- as_tibble(pca[["importance"]]) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = treatment)) + -stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), -"cmQTL_val1", tissues[[tiss]], "treatment_loess_PCA.jpg", -sep = "_"), -width = 183, -height = 100, -units = "mm", -dpi = 300) -} -pca_base <- features_all %>% -ungroup() %>% -filter(exp == 1) %>% -arrange(batch_pca, daily_num) %>% -mutate(log_imp = log2(imp)) %>% -pivot_wider(id_cols = c(tissue, treatment, batch_pca, daily_num, class, machine_num_GC), -names_from = met, -values_from = log_imp)%>% -filter(!is.na(m_1)) -pca <- summary(prcomp( -pca_base%>% -select(starts_with("m_")))) -sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() -pca_plot <- as_tibble(pca$x) %>% -mutate(join_num = 1:nrow(pca$x)) %>% -full_join(pca_base %>% -mutate(join_num=1:nrow(pca$x))) %>% -left_join(sam_dat) %>% -mutate(batch_pca= as_factor(batch_pca), -class= as_factor(class)) %>% -select(all_of(sam_vars), everything()) -exp_var <- as_tibble(pca[["importance"]]) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = tissue)) + -stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) + -geom_text(aes(x = PC1, y = PC2, -label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, NULL)), -nudge_y = -2, -nudge_x = 4) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = tissue)) + -stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) + -geom_text(aes(x = PC1, y = PC2, -label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, "")), -nudge_y = -2, -nudge_x = 4) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), -"cmQTL_val1_tissue_imp_PCA.jpg", -sep = "_"), -width = 183, -height = 100, -units = "mm", -dpi = 300) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = batch_GC)) + -stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) + -geom_text(aes(x = PC1, y = PC2, -label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, NULL)), -nudge_y = -2, -nudge_x = 4) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = batch_GC)) + -stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) + -geom_text(aes(x = PC1, y = PC2, -label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, "")), -nudge_y = -2, -nudge_x = 4) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), -"cmQTL_val1_batch_imp_PCA.jpg", -sep = "_"), -width = 183, -height = 100, -units = "mm", -dpi = 300) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = class)) + -stat_ellipse(aes(x=PC1, y=PC2, color = class)) + -geom_text(aes(x = PC1, y = PC2, -label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, NULL)), -nudge_y = -2, -nudge_x = 4) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = class)) + -stat_ellipse(aes(x=PC1, y=PC2, color = class)) + -geom_text(aes(x = PC1, y = PC2, -label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, "")), -nudge_y = -2, -nudge_x = 4) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), -"cmQTL_val1_class_imp_PCA.jpg", -sep = "_"), -width = 183, -height = 100, -units = "mm", -dpi = 300) -#cmQTLval2 -pca_base <- features_all %>% -ungroup() %>% -filter(exp == 2, batch_GC == "3_split") %>% -arrange(batch_GC, daily_num) %>% -mutate(log_imp = log2(isnorm)) %>% -pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), -names_from = met, -values_from = log_imp)%>% -filter(!is.na(m_1)) -pca <- summary(prcomp( -pca_base%>% -select(starts_with("m_")))) -sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() -pca_plot <- as_tibble(pca$x) %>% -mutate(join_num = 1:nrow(pca$x)) %>% -full_join(pca_base %>% -mutate(join_num=1:nrow(pca$x))) %>% -left_join(sam_dat) %>% -mutate(batch_pca= as_factor(batch_GC), -class= as_factor(class)) %>% -select(all_of(sam_vars), everything()) -exp_var <- as_tibble(pca[["importance"]]) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = class)) + -stat_ellipse(aes(x=PC1, y=PC2, color = class)) + -geom_text(aes(x = PC1, y = PC2, -label = if_else(machine_num_GC == "21109rA_59", machine_num_GC, NULL)), -nudge_y = -2, -nudge_x = 4) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -pca_plot %>% -# filter(treatment == "HL") %>% -ggplot()+ -geom_jitter(aes(x=PC1, y=PC2, color = class)) + -stat_ellipse(aes(x=PC1, y=PC2, color = class)) + -geom_text(aes(x = PC1, y = PC2, -label = if_else(machine_num_GC == "21109rA_59", machine_num_GC, "")), -nudge_y = -2, -nudge_x = 4) + -ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + -xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) -ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), -"cmQTL_val2_class_isnorm_PCA.jpg", -sep = "_"), -width = 183, -height = 100, -units = "mm", -dpi = 300) -plotmets <- features_all %>% distinct(met) %>% as_vector -plotmet_labs <- plotmets %>% as_tibble() %>% -left_join(met_dat, by= c("value" = "met")) %>% -#mutate(peak_num = base::rank(HMDB_clear_name, ties.method = "first"), -# dup = sum(peak_num), -# HMDB_clear_name_unique = if_else(dup>1, str_c(HMDB_clear_name, peak_num), HMDB_clear_name)) %>% -#ungroup() %>% -select(Compound_Name) %>% as_vector() -plot_out <- vector("list", length = length(plotmets)) -for (meta in seq_along(plotmets)) { -plot_out [[meta]] <- features_all %>% -mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)), -is_miss = as_factor(if_else(is.na(area), T, F))) %>% -filter(met == plotmets[[meta]]) %>% -filter(class!="blank") %>% -ggplot(aes(x=run_num_GC, y=isnorm)) + -geom_point(aes(color = class, shape = is_miss)) + -geom_point(aes(y=predict), color="black", size=0.1) + -geom_vline(aes(xintercept = xint))+ -# facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") + -ggtitle(label = plotmet_labs[[meta]]) -} -pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_loess_fit.pdf")) -for (meta in seq_along(plotmets)) { -print(plot_out[[meta]]) -} -dev.off() -plot_out <- vector("list", length = length(plotmets)) -for (meta in seq_along(plotmets)) { -plot_out [[meta]] <- features_all %>% -mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC))) %>% -filter(met == plotmets[[meta]]) %>% -filter(class!="blank") %>% -ggplot(aes(x=run_num_GC, y=loess_norm_fw_log)) + -geom_point(aes(color=class)) + -#geom_point(aes(y=predp), color="black", size=0.1) + -geom_vline(aes(xintercept = xint))+ -# facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") + -ggtitle(label = plotmet_labs[[meta]]) -} -pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_loess_norm_fw_log.pdf")) -for (meta in seq_along(plotmets)) { -print(plot_out[[meta]]) -} -dev.off() -for (meta in seq_along(plotmets)) { -plot_out [[meta]] <- features_all %>% -mutate(is_miss = as_factor(if_else(is.na(area), T, F))) %>% -filter(met == plotmets[[meta]]) %>% -filter(class!="blank") %>% -ggplot(aes(x=daily_num, y=isnorm)) + -geom_point(aes(color = class, shape = is_miss)) + -geom_point(aes(y=predict), color="black", size=0.1) + -facet_wrap(facets = vars(batch_GC), nrow = 1, scales = "free") + -ggtitle(label = plotmet_labs[[meta]]) + -theme(axis.ticks.y = element_blank(), -axis.text.y = element_blank(), -panel.spacing = unit(-0.1,"lines")) -} -pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_loess_norm_no_scale.pdf"), -height = 3.5) -for (meta in seq_along(plotmets)) { -print(plot_out[[meta]]) -} -dev.off() -for (meta in seq_along(plotmets)) { -plot_out [[meta]] <- features_all %>% -mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)), -is_miss = as_factor(if_else(is.na(area), T, F))) %>% -filter(met == plotmets[[meta]]) %>% -filter(class!="blank") %>% -ggplot(aes(x=run_num_GC, y=isnorm)) + -geom_point(aes(color = class, shape = is_miss)) + -geom_point(aes(y=predp), color="black", size=0.1) + -geom_vline(aes(xintercept = xint))+ -# facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") + -ggtitle(label = plotmet_labs[[meta]]) -} -pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_linnorm.pdf")) -for (meta in seq_along(plotmets)) { -print(plot_out[[meta]]) -} -dev.off() -rescale <- features_all %>% -group_by(met, tissue, exp) %>% -summarise(rescale = median(imp)) -sam_vars <- c("plantline", "alias", "LIMS_ID", -"treatment", "tissue", "batch_GC", "run_date_GC", -"extraction_num", "sample_num", "machine_num_GC", -"class", "run_num_GC", "sample_weight", "exp", "genotype") -features_out <- features_all %>% -left_join(rescale) %>% -filter(class != "blank", -(component %in% take_split & str_detect(batch_GC, "split"))| -(!component %in% take_split & str_detect(batch_GC, "split", negate = T))| -tissue == "leaves") %>% -group_by(met, tissue, exp) %>% -mutate(loess_norm_med = loess_norm_fw/median(loess_norm_fw), -rescaled = loess_norm_med*rescale) %>% -select(all_of(sam_vars), met, Compound_Name, loess_norm_fw, loess_norm_med, area, rescaled) -features_all %>% colnames() -sam_vars %in% colnames(features_all) -sam_vars <- c(#"plantline", -"alias", "LIMS_ID", -"treatment", "tissue", "batch_GC", "run_date_GC", -"extraction_num", "sample_num", "machine_num_GC", -"class", "run_num_GC", "sample_weight", "exp", "genotype") -sam_vars -sam_vars <- c(#"plantline", -"alias", "LIMS_ID", -"treatment", "tissue", "batch_GC", "run_date_GC", -"extraction_num", "sample_num", "machine_num_GC", -"class", "run_num_GC", "sample_weight", "exp", "genotype") -sam_vars %in% colnames(features_all) -sam_vars <- c(#"plantline","extraction_num", "sample_num", #in case I add them back -"alias", "LIMS_ID", -"treatment", "tissue", "batch_GC", "run_date_GC", -"machine_num_GC", -"class", "run_num_GC", "sample_weight", "exp", "genotype") -features_out <- features_all %>% -left_join(rescale) %>% -filter(class != "blank", -(component %in% take_split & str_detect(batch_GC, "split"))| -(!component %in% take_split & str_detect(batch_GC, "split", negate = T))| -tissue == "leaves") %>% -group_by(met, tissue, exp) %>% -mutate(loess_norm_med = loess_norm_fw/median(loess_norm_fw), -rescaled = loess_norm_med*rescale) %>% -select(all_of(sam_vars), met, Compound_Name, loess_norm_fw, loess_norm_med, area, rescaled) -View(features_out) -write_csv(features_out, -str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), -"cmQTL_val_1_2_feat_dat_GC.csv", -sep = "_")) -write_csv(met_dat, -str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), -"cmQTL_val_1_2_met_dat_GC.csv", -sep = "_")) -write_csv(sam_dat, -str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), -"cmQTL_val_1_2_sam_dat_GC.csv", -sep = "_")) -write_csv(missingness, -str_c(str_replace_all(Sys.Date(),"^.{2}|-",""), -"cmQTL_val_1_2_miss_GC.csv", -sep = "_")) -file_name <- sys.frame(1)$ofile +library(here) +load(here("./runs/GC-MS normalization/Image.RData")) +library(openxlsx) +library(tidyverse) +library(car) +library(pheatmap) +library(broom) +library(ggpubr) +library(viridisLite) +library(modelr) +#library(dlookr) +#library(imputeLCMD) +library(ggrepel) +sam_dat_dic <- colnames(sam_dat) %>% +as_tibble() +orig <- colnames(sam_dat1) %>% +as_tibble() +break_lines <- function(x){ +group = vector(mode = "list", length = length(x)) +#id <- vector(mode = "character", length = length(x)) +#isa_unit <- +#term_source <- vector(mode = "character", length = length(x)) +#term_acc <- vector(mode = "character", length = length(x)) +for(i in seq_along(x)){ +if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){ +group[[i]] <- tibble(id = x[[i]]) +if(str_detect(x[[i+1]], "Unit")) { +group[[i]] <- group[[i]] %>% mutate(isa_unit = x[[i+1]]) +if(str_detect(x[[i+2]], "Term Source REF")) { +group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+2]]) +if(str_detect(x[[i+3]], "Term Accession Number")){ +group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+3]]) +} +} +} else if(str_detect(x[[i+1]], "Term Source REF")) { +group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+1]]) +if(str_detect(x[[i+2]], "Term Accession Number")){ +group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+2]]) +} +} +} else { +#id[[i]] <- x[[i]] +} +} +group_tib <- group %>% +discard(.p = is.null) %>% +reduce(.f = bind_rows) +#test_tib <- tibble(id = id, +# unit = isa_unit, +# term_source = term_source, +# term_acc = term_acc) %>% +# filter() +assign("output",group, pos = 1) +} +break_lines(orig$value) +break_lines <- function(x){ +group = vector(mode = "list", length = length(x)) +#id <- vector(mode = "character", length = length(x)) +#isa_unit <- +#term_source <- vector(mode = "character", length = length(x)) +#term_acc <- vector(mode = "character", length = length(x)) +for(i in seq_along(x)){ +if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){ +group[[i]] <- tibble(id = x[[i]]) +if(str_detect(x[[i+1]], "Unit")) { +group[[i]] <- group[[i]] %>% mutate(isa_unit = x[[i+1]]) +if(str_detect(x[[i+2]], "Term Source REF")) { +group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+2]]) +if(str_detect(x[[i+3]], "Term Accession Number")){ +group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+3]]) +} +} +} else if(str_detect(x[[i+1]], "Term Source REF")) { +group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+1]]) +if(str_detect(x[[i+2]], "Term Accession Number")){ +group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+2]]) +} +} +} else { +#id[[i]] <- x[[i]] +} +} +group_tib <- group %>% +discard(.p = is.null) %>% +reduce(.f = bind_rows) +#test_tib <- tibble(id = id, +# unit = isa_unit, +# term_source = term_source, +# term_acc = term_acc) %>% +# filter() +assign("output",group_tib, pos = 1) +} +break_lines(orig$value) +View(output) +str_detect(c("test", "different", "capture", "groups"), "test") +str_detect(c("test", "different", "capture", "groups"), "test|different") +break_lines <- function(x){ +group = vector(mode = "list", length = length(x)) +for(i in seq_along(x)){ +if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){ +group[[i]] <- tibble(id = x[[i]]) +if(str_detect(x[[i+1]], "Unit")) { +group[[i]] <- group[[i]] %>% mutate(isa_unit = x[[i+1]]) +if(str_detect(x[[i+2]], "Term Source REF")) { +group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+2]]) +if(str_detect(x[[i+3]], "Term Accession Number")){ +group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+3]]) +} +} +} else if(str_detect(x[[i+1]], "Term Source REF")) { +group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+1]]) +if(str_detect(x[[i+2]], "Term Accession Number")){ +group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+2]]) +} +} +} else if(str_detect(x[[i]], "Source REF|Accession Number|Unit",negate = T)){ +group[[i]] <- tibble(id = x[[i]]) +} +} +group_tib <- group %>% +discard(.p = is.null) %>% +reduce(.f = bind_rows) +assign("output",group_tib, pos = 1) +} +break_lines(orig$value) +View(output) +here() +View(sam_dat_dic) +View(sam_dat) +View(isa_ext) +View(met_dat) +break_lines <- function(x){ +x <- colnames(x) %>% +as_tibble() +x <- x$value +group = vector(mode = "list", length = length(x)) +for(i in seq_along(x)){ +if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){ +group[[i]] <- tibble(id = x[[i]]) +if(str_detect(x[[i+1]], "Unit")) { +group[[i]] <- group[[i]] %>% mutate(isa_unit = x[[i+1]]) +if(str_detect(x[[i+2]], "Term Source REF")) { +group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+2]]) +if(str_detect(x[[i+3]], "Term Accession Number")){ +group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+3]]) +} +} +} else if(str_detect(x[[i+1]], "Term Source REF")) { +group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+1]]) +if(str_detect(x[[i+2]], "Term Accession Number")){ +group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+2]]) +} +} +} else if(str_detect(x[[i]], "Source REF|Accession Number|Unit",negate = T)){ +group[[i]] <- tibble(id = x[[i]]) +} +} +group_tib <- group %>% +discard(.p = is.null) %>% +reduce(.f = bind_rows) +assign("output",group_tib, pos = 1) +} +break_lines(sam_dat1) +break_lines <- function(x, output_name){ +x <- colnames(x) %>% +as_tibble() +x <- x$value +group = vector(mode = "list", length = length(x)) +for(i in seq_along(x)){ +if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){ +group[[i]] <- tibble(id = x[[i]]) +if(str_detect(x[[i+1]], "Unit")) { +group[[i]] <- group[[i]] %>% mutate(isa_unit = x[[i+1]]) +if(str_detect(x[[i+2]], "Term Source REF")) { +group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+2]]) +if(str_detect(x[[i+3]], "Term Accession Number")){ +group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+3]]) +} +} +} else if(str_detect(x[[i+1]], "Term Source REF")) { +group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+1]]) +if(str_detect(x[[i+2]], "Term Accession Number")){ +group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+2]]) +} +} +} else if(str_detect(x[[i]], "Source REF|Accession Number|Unit",negate = T)){ +group[[i]] <- tibble(id = x[[i]]) +} +} +group_tib <- group %>% +discard(.p = is.null) %>% +reduce(.f = bind_rows) +assign(output_name,group_tib, pos = 1) +} +break_lines(sam_dat1, "output_samdat") +break_lines(isa_ext, "output_ext") +break_lines(isa_gc, "output_gc") +View(output_gc) +output <- bind_rows(output_samdat, +output_ext, +output_gc, +output_ms) +beak_lines(isa_ms, "output_ms") +break_lines(isa_ms, "output_ms") +output <- bind_rows(output_samdat, +output_ext, +output_gc, +output_ms) +here() +getwd() +setwd(here())# Not recommended but convenient in Rstudio to start from root +write.xlsx(output, +here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset_temp.xlsx")) +here(out, +"cmQTL_val_1_2_met_dat_GC.csv" +) +here(out, +"cmQTL_val_1_2_sam_dat_GC.csv" +) +View(isa_ms) +View(isa_ms_tidy) +1176-1145 diff --git a/assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset.xlsx b/assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..565d6b5087a772cf0e5d39ca9bc7e607e4804dba --- /dev/null +++ b/assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset.xlsx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68d73ca9855cf91cfd9ea8c200f65bef8982e2c6be605924ea29c9865aacc1b3 +size 11468 diff --git a/assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset_temp.xlsx b/assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset_temp.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..270d5f18db8cc20042b209303be26c4e92266ecd --- /dev/null +++ b/assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset_temp.xlsx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d63d90d0e4df45541ea52c0349c41dd82b4c320e98e4382d1728583401de1f5c +size 9962 diff --git a/runs/GC-MS normalization/Image.RData b/runs/GC-MS normalization/Image.RData new file mode 100644 index 0000000000000000000000000000000000000000..8be71b208442c3116797ce1ca7ca0cab18d3b75e Binary files /dev/null and b/runs/GC-MS normalization/Image.RData differ diff --git a/runs/GC-MS normalization/_linnorm.pdf b/runs/GC-MS normalization/_linnorm.pdf index a1b56f697faca85f53cffe087e10188020b487eb..0b6ddced2d2b929731c5d317f19c49e63609e7b6 100644 Binary files a/runs/GC-MS normalization/_linnorm.pdf and b/runs/GC-MS normalization/_linnorm.pdf differ diff --git a/runs/GC-MS normalization/_loess_fit.pdf b/runs/GC-MS normalization/_loess_fit.pdf index bd907e7c808daf905f2eb0c1fefbf4b6dd67cec7..7bfdd6326ceb485a60331fad6624b9d758d3557f 100644 Binary files a/runs/GC-MS normalization/_loess_fit.pdf and b/runs/GC-MS normalization/_loess_fit.pdf differ diff --git a/runs/GC-MS normalization/_loess_norm_fw_log.pdf b/runs/GC-MS normalization/_loess_norm_fw_log.pdf index a492791f0567d2695de7bfc0aebffd11767487b9..22ae1df4324951ec29e20aa70d41f420c9c7a5cc 100644 Binary files a/runs/GC-MS normalization/_loess_norm_fw_log.pdf and b/runs/GC-MS normalization/_loess_norm_fw_log.pdf differ diff --git a/runs/GC-MS normalization/_loess_norm_no_scale.pdf b/runs/GC-MS normalization/_loess_norm_no_scale.pdf index d0eab2cc0a08350ee287e82656d9992acc32f877..2090c30d3b5277dedeb25afad9bc4c4a476210e5 100644 Binary files a/runs/GC-MS normalization/_loess_norm_no_scale.pdf and b/runs/GC-MS normalization/_loess_norm_no_scale.pdf differ diff --git a/runs/GC-MS normalization/cmQTL_val1/fruit/treatment_loess_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1/fruit/treatment_loess_PCA.jpg index 789d9d8d26208396f7a2535f46c55dde33d6a2eb..a687b9e48a031a8c5b287b7f75fdd1bc5832e813 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1/fruit/treatment_loess_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1/fruit/treatment_loess_PCA.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val1/leaf/treatment_loess_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1/leaf/treatment_loess_PCA.jpg index e659cf390d18d9ca65f85fe4bec0043faf6410ce..67c5f2771b2781afc26556b0bc74b45dadf9b0b4 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1/leaf/treatment_loess_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1/leaf/treatment_loess_PCA.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val1_2_GC_met_rla_plot.jpg b/runs/GC-MS normalization/cmQTL_val1_2_GC_met_rla_plot.jpg index 06b3542a0890dcdddecd65920ad49fb776d37e6b..29bca42b2d92c463ef4374566f89e9ffc9b550be 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1_2_GC_met_rla_plot.jpg and b/runs/GC-MS normalization/cmQTL_val1_2_GC_met_rla_plot.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val1_2_samples_rla_plot.jpg b/runs/GC-MS normalization/cmQTL_val1_2_samples_rla_plot.jpg index 3183af762fd7b550d01f976017f1e26b10c88778..53f32e810a5d957a0e90e82fcc492472660707d2 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1_2_samples_rla_plot.jpg and b/runs/GC-MS normalization/cmQTL_val1_2_samples_rla_plot.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val1_batch_imp_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_batch_imp_PCA.jpg index bfb90fe7db1ec011aff8c1c5fa846dd028ff8dde..35477ad863ad9ddc53de4df7f3c721f174e06f6a 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1_batch_imp_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_batch_imp_PCA.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val1_class_imp_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_class_imp_PCA.jpg index 8bf923236138f1abd6da0478170f2059501ace7e..b72fd0531a873787004673c8a1978ce9b1f57a45 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1_class_imp_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_class_imp_PCA.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val1_tissue_imp_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_tissue_imp_PCA.jpg index 585c8bc5615e1edcf65763edcb06b9d84dad3bfd..729c414ef28a035d73db4e8374c24094767bb049 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1_tissue_imp_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_tissue_imp_PCA.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val1_tissue_lin_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_tissue_lin_PCA.jpg index faf8d896aedd0bd6a754d65e01b0af3aa7ea8149..2f259827bad98892c43145cdd5c86136d461e83b 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1_tissue_lin_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_tissue_lin_PCA.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val1_tissue_loess_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_tissue_loess_PCA.jpg index 2cbbe7ed76721a8296bd43591e3f75a9d15e2a44..10cfd249bc7c2551f7393dff512b14ea7c1e9ffa 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1_tissue_loess_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_tissue_loess_PCA.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val1_treatment_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_treatment_PCA.jpg index fc6cbc86aa15983c3f60d9e081fe9609d01219be..5449fe34da91b73c18e263d45ff333f86f1be4b8 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1_treatment_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_treatment_PCA.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_PCA.jpg index 585c72dbe95ad7565935212cf792b17ea91b0903..cf488b11c19ebea36f3f4a6686cc56024bc72f4c 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_PCA.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp1_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp1_PCA.jpg index 2009c9ad376ef6f66ef4cf225379862907daf3e7..8a0493bbcb5991cab0e77a86983d541aca4065d3 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp1_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp1_PCA.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_imp_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_imp_PCA.jpg index 5114cb5449c9d180cd337340f4b4f7f3f28992b2..9338c3c8b38a238d168c7a476c8a2aa78100c8c3 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_imp_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_imp_PCA.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg index f34ef7a03390e9da1dad9f6864016268a9a5f259..f53f7af88ccf9e389f2d775e08d1e365199e0bbe 100644 Binary files a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg differ diff --git a/runs/GC-MS normalization/cmQTL_val2_class_isnorm_PCA.jpg b/runs/GC-MS normalization/cmQTL_val2_class_isnorm_PCA.jpg index 62fcac95ef7682be33061aecdd854a6f74be4459..d82b4de00e63ff9514e8518cdf89af741eac11c2 100644 Binary files a/runs/GC-MS normalization/cmQTL_val2_class_isnorm_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val2_class_isnorm_PCA.jpg differ diff --git a/runs/GC-MS normalization/executing.R b/runs/GC-MS normalization/executing.R new file mode 100644 index 0000000000000000000000000000000000000000..bfe2ec36bf1597b8fe35ed02257ac3f43b94a1d6 --- /dev/null +++ b/runs/GC-MS normalization/executing.R @@ -0,0 +1,1118 @@ +rm(list = ls()) + +library(openxlsx) +library(tidyverse) +library(car) +library(pheatmap) +library(broom) +library(ggpubr) +library(viridisLite) +library(modelr) +#library(dlookr) +#library(imputeLCMD) +library(ggrepel) + + +# Directory setting ------------------------------------------------------- + +here::i_am("workflows/GC_MS_normalization/210927_primary_normalization_with_split.R") +library(here) + +out <- here("runs/GC-MS normalization") + +if (file.exists(out)) { + cat("The folder already exists") +} else { + dir.create(out) +} +setwd(here())# Not recommended but convenient in Rstudio to start from root + +# Data loading ------------------------------------------------------------ + +load(here("Temp.RData")) + +sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx")) +isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1) +isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2) +isa_ms <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 3) +#genotypes <- readxl::read_xlsx("Genotype_names.xlsx") %>% +# mutate(plantline = as_factor(plantline)) + +#GC_machine_nums <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 6) + +take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31", + "glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57", + "pyroglutamic_acid_156_258_rt8.30", "sucrose_437_361_rt13.77", "sucrose2_204_361_rt13.79", "citric_acid_273_375_rt9.72", + "arginine_157_256_rt9.92") + +exclude_samples <- c("21106rA_31", "21107rA_54", "21109rA_59", "21109rA_86", "21109rA_78") +exclude_mets <- c("psicose_103_217_rt9.38", "glutamic_acid_246_363_rt8.31", "lactic_acid_117_219_rt3.07")#glu wrong peak + +area1 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_seq_file_20210914143103_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) +area2 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) +area3 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls"), na = c("", "N/A")) + +metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx")) + +area <- area1 %>% + bind_rows(area2, area3) %>% + select(component, area, machine_num_GC = machine_num,rt) %>% + mutate(area = as.numeric(area), + rt = as.numeric(rt)) + +rt_mean <- area %>% + group_by(component) %>% + summarise(RT_mean = mean(rt, na.rm = T)) + +metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx")) %>% + select(component = Xcal_name_xreport, Compound_Name = PubChem_Name_mapped)%>% + left_join(rt_mean) %>% + filter(!is.na(component)) %>% + arrange(Compound_Name, RT_mean) %>% + group_by(Compound_Name) %>% + mutate(peak_no = rank(RT_mean), + Compound_Name = if_else(duplicated(Compound_Name), + str_c(Compound_Name, "peak", peak_no, sep = "_"), + Compound_Name)) + +# Data combination -------------------------------------------------------- + +sam_vars <- c("plantline", "alias", "LIMS_ID", + "treatment", "tissue", "batch_GC", "run_date_GC", + "extraction_num", "sample_num", "machine_num_GC", + "class", "run_num_GC", "sample_weight", "exp", "genotype") + +isa_study_tidy <- sam_dat1 %>% + mutate(sample_weight = as.double(str_extract(`Parameter [Sample Weight]`, "\\d{0,3}\\.*\\d{0,2}"))) %>% + rename(source_name = `Source Name`, + #plantline = `Characteristic [plantline]`, + alias = `Characteristic [alias]`, + LIMS_ID = `Characteristic [LIMS aliquot]`, + treatment = `Factor [Irrigation factor]`, + tissue = `Parameter [organism part]`, + genotype = `Characteristic [genotype]`, + #sample_num = `Characteristic [sample_name_non_unique]`, + #extraction_num = `Characteristic [extract number]`, + sample_name = `Sample Name`) %>% + select(source_name, any_of(sam_vars), sample_name) + +isa_ext_tidy <- isa_ext %>% + rename(source_name =`Source Name`, + exp = `Characteristic [experiment name]`, + sample_name = `Sample Name`) %>% + select(source_name, any_of(sam_vars), sample_name) + +isa_gc_tidy <- isa_gc %>% + rename(source_name =`Source Name`, + class = `Characteristic [sample type]`, + batch_GC = `Parameter [Batch]`, + run_date_GC = `Parameter [run date]`, + #daily_num = `Parameter [daily number]`, + run_num_GC = `Parameter [Number]`, + sample_name = `Sample Name`) %>% + select(source_name, any_of(sam_vars), sample_name) + +isa_ms_tidy <- isa_ms %>% + rename(source_name =`Source Name`, + sample_name = `Sample Name`) %>% + mutate(machine_num_GC = sample_name) %>% + select(source_name, any_of(sam_vars), sample_name) + +isa_tidy <- isa_study_tidy %>% + full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>% + full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>% + full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name", "sample_name" = "sample_name")) + +batch_exp_tissue <- isa_tidy %>% + distinct(batch_GC, exp, tissue) %>% + filter(!is.na(exp)) %>% + rename(exp_fill = exp, + tissue_fill = tissue) + +sam_dat <- isa_tidy %>% + arrange(run_num_GC) %>% + #fill(c(tissue, exp), .direction = "updown") %>% + arrange(run_num_GC) %>% + group_by(batch_GC) %>% + mutate(daily_num = row_number()) %>% + ungroup() %>% + left_join(batch_exp_tissue) %>% + mutate(tissue = tissue_fill, + exp = exp_fill) %>% + select(-c(tissue_fill, exp_fill)) + +area_long <- area %>% + filter(!is.na(component)) %>% + left_join(sam_dat) %>% + filter(!is.na(exp)) + +area <- area_long %>% + left_join(metdat_GC_class) %>% + filter(!str_detect(component, "^\\!|FAME|component|empty"), !is.na(component), + !component %in% exclude_mets) + +met_dat = area %>% + distinct(RT_mean, Compound_Name, component) %>% + mutate(met = str_c("m", row_number(), sep = "_")) + +area <- area %>% + left_join(met_dat) %>% + mutate(area = as.numeric(area)) + +# PCA to justify filtering ------------------------------------------------ + +features_pca <- area %>% + group_by(met) %>% + mutate(imp = if_else(is.na(area), 0.5*min(area, na.rm = T),area), + batch_pca = str_extract(batch_GC, "\\d+")) %>% + ungroup() + +pca_base <- features_pca %>% + ungroup() %>% + filter(exp == 1) %>% + arrange(run_num_GC) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC, run_num_GC), + names_from = met, + values_from = log_imp) %>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + left_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_GC= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = batch_GC)) + + stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) + + geom_text_repel(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21106rA_31"| machine_num_GC == "21107rA_54", machine_num_GC, "")), box.padding = 1) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(here(out, + "cmQTL_val1_treatment_loess_outlier_exp1_PCA.jpg"), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +pca_base <- features_pca %>% + ungroup() %>% + filter(exp == 2) %>% + arrange(batch_GC, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp) %>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + left_join(pca_base %>% + arrange(batch_GC, daily_num) %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_GC= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + geom_label_repel(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC %in% exclude_samples, machine_num_GC, "")),force_pull = -0.1, + max.overlaps = Inf) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(here(out, + "cmQTL_val1_treatment_loess_outlier_exp2_imp_PCA.jpg" + ), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +# Data filtering ---------------------------------------------------------- + + +set_back_NA <- function(x){ + for (i in seq_along(x)) { + if (x[[i]] == 0) { + x[[i]] <- NA + } else { + x[[i]] <- x[[i]] + } + } + x +} + +area_long <- area %>% + filter(!machine_num_GC %in% exclude_samples) + + +# Imputation -------------------------------------------------------------- + +features_na <- area_long #%>% + #group_by(met) %>% + #mutate(area = set_back_NA(area)) %>% + #ungroup() + +missingness <- features_na %>% + group_by(exp, met, tissue, treatment) %>% + summarise(na = sum(is.na(area)), + n = n()) %>% + mutate(percent_na = na/n*100) %>% + ungroup() + +features <- features_na %>% + left_join(missingness) %>% + group_by(met) %>% + #mutate(area = ifelse(percent_na >= 0 & is.na(area), rnorm(n = 1, mean = 0.5*min(area, na.rm = T)), area)) %>% + ungroup() + + +features_filtered <- area_long %>% + group_by(met) %>% + mutate(imp = if_else(is.na(area), 0.5*min(area, na.rm = T),area), + batch_pca = str_extract(batch_GC, "\\d+")) %>% + ungroup() %>% + left_join(missingness) #%>% + #filter(percent_na <= 60) + + +# Internal standard check ------------------------------------------------- + +features_filtered %>% + filter(str_detect(component, "ribitol")) %>% + mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)), + is_miss = as_factor(if_else(is.na(area), T, F))) %>% + filter(class!="blank sample") %>% + ggplot(aes(x=run_num_GC, y = log2(imp))) + + geom_point(aes(color = class, shape = is_miss)) + + geom_vline(aes(xintercept = xint)) + + facet_grid(rows = vars(component)) + +# Internal Standard Normalization ----------------------------------------- + +isvec <- features_filtered %>% + filter(component == "ribitol_217_rt8.20") %>% + select(machine_num_GC, is = imp) + +features <- features_filtered %>% + full_join(isvec) %>% + mutate(isnorm = imp/is) %>% + filter(str_detect(Compound_Name, "ribitol", negate = T)) %>% + mutate(sample_weight = if_else(sample_weight == 0|is.na(sample_weight), 50, sample_weight)) + + +# Fit linear model on QCs ------------------------------------------------- + +by_batch_GC <- features %>% + filter(class=="Pooled Sample") %>% + group_by(batch_GC, met) %>% # need to change treatment_batch_GC_corr + nest() + +feature_model <- function(df) { + lm(isnorm ~ daily_num, data = df) +} + + +by_batch_GC <- by_batch_GC %>% + mutate(model = map(data, feature_model), + predicts = map2(data, model, add_predictions), + coefficients = map(model, tidy), + aug = map(model, augment), + glance = map (model, glance)) + +coefs_aug <- unnest(by_batch_GC, aug) %>% + select(batch_GC, met, .resid, daily_num) + +coefs_glance <- unnest(by_batch_GC,glance) %>% + select(batch_GC, met, adj.r.squared) + +coefs_term <- unnest(by_batch_GC, coefficients) %>% + pivot_wider(id_cols = c(batch_GC, met), + names_from = "term", + values_from = estimate) %>% + rename(x = daily_num, + intercept = `(Intercept)`) + + +coefs_pvalue <- by_batch_GC %>% + select(batch_GC, met, coefficients) %>% + ungroup() %>% + unnest(coefficients) %>% + pivot_wider(id_cols = c(batch_GC, met), + names_from = "term", + values_from = p.value) %>% + rename(intercept_pval = `(Intercept)`, + daily_num_pval = daily_num) %>% + full_join(features) %>% + full_join(coefs_term) %>% + #left_join(coefs_aug) %>% + full_join(coefs_glance) + +median <- features %>% + filter(class=="Pooled Sample") %>% + group_by(batch_GC, met) %>% + summarise(median = median (isnorm)) + +features_lin <- unnest(by_batch_GC, predicts) %>% + full_join(coefs_pvalue) %>% + full_join(median) %>% + mutate(pred = intercept + x*daily_num, + predp = ifelse(daily_num_pval <=0.05 & adj.r.squared >= 0.75, pred, median), + predp = ifelse(is.na(predp), median, predp)) %>% + select(- c("data", "model", "coefficients")) + +adjust_lin <- features_lin %>% + ungroup() %>% + group_by(met) %>% + summarise(lin_offset = if_else(any(predp <0), 1.001*abs(min(predp)),0)) + +features_lin <- features_lin %>% + ungroup() %>% + full_join(adjust_lin) %>% + full_join(features) %>% + mutate(linnorm = (isnorm+lin_offset)/(predp+lin_offset), + linnorm_fw = linnorm/sample_weight, + linnorm_fw_log = log2(linnorm_fw)) + + + + +#Fit loess-model for QC-RLSC batch_GCwise#### +rejoin <- features +features <- rejoin + +QC_loess <- features %>% + ungroup() %>% + filter(class == "Pooled Sample") %>% + #filter(met!="m_44") %>% + group_by(met, batch_GC) %>% + nest() +#revert span back to 1.5 later +loess_model <- function (df) { + loess(isnorm ~ daily_num, span = 1.5, data = df,control = loess.control(surface = "direct")) +} + +start <- Sys.time() +QC_loess <- QC_loess %>% + mutate(model = map (data, loess_model), + daily_num = list(seq(1,101,1)), + predict = map2(model,daily_num,stats::predict)) +end <- Sys.time() +end-start + +features_loess <- QC_loess %>% + unnest(c(predict, daily_num, batch_GC)) %>% + select(-model, -data) %>% + full_join(rejoin) %>% + filter(!is.na(machine_num_GC)) + +adjust <- features_loess %>% + ungroup() %>% + group_by(met) %>% + summarise(offset = if_else (any(predict <0), 1.001*abs(min(predict, na.rm = T)),0)) + +features_loess <- features_loess %>% + left_join(adjust) %>% + mutate(loess_norm = (isnorm+offset)/(predict+offset), + loess_norm_fw = loess_norm/sample_weight, + loess_norm_fw_log = log2(loess_norm_fw))%>% + ungroup() + +features_all <- features_lin %>% + left_join(features_loess) %>% + mutate(batch_pca = str_extract(batch_GC, "\\d+")) + + +# Relative log abundance plots -------------------------------------------- + + + + +features_all %>% + filter(!is.na(genotype)) %>% + group_by(met, treatment) %>% + mutate(sub = loess_norm_fw, + rla = log2(sub) - median(log2(sub))) %>% + ggplot(aes(x = met, y = rla)) + + geom_boxplot() + + facet_grid(rows = vars(treatment), scales = "free")+ + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + +features_all %>% + #filter(!is.na(taxa)) %>% + filter(treatment == 1, class == "Sample") %>% + mutate(run_num_GC = as_factor(run_num_GC), + batch_GC = as_factor(batch_GC)) %>% + group_by(met, treatment) %>% + mutate(rla_imp = log2(imp) - median(log2(imp)), + rla_isnorm = log2(isnorm) - median(log2(isnorm)), + rla_loess_norm = log2(loess_norm_fw) - median(log2(loess_norm_fw))) %>% + pivot_longer(starts_with("rla"), names_to = "normalization", values_to = "rla") %>% + ggplot(aes(x = met, y = rla)) + + geom_boxplot() + + facet_grid(rows = vars(normalization), cols = vars(treatment), scales = "free")+ + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + ylim(c(-2,2)) + +ggsave(here(out, + "cmQTL_val1_2_GC_met_rla_plot.jpg" + )) + +features_all %>% + #filter(!is.na(taxa)) %>% + filter(class == "Sample") %>% + mutate(run_num_GC = as_factor(run_num_GC), + batch_GC = as_factor(batch_GC)) %>% + group_by(met, treatment) %>% + mutate(rla_imp = log2(imp) - median(log2(imp)), + rla_isnorm = log2(isnorm) - median(log2(isnorm)), + rla_loess_norm = log2(loess_norm_fw) - median(log2(loess_norm_fw))) %>% + pivot_longer(starts_with("rla"), names_to = "normalization", values_to = "rla") %>% + ggplot(aes(x = run_num_GC, y = rla, color = batch_GC)) + + geom_boxplot() + + facet_grid(rows = vars(normalization), scales = "free")+ + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + ylim(c(-1,1)) + +ggsave(here(out, + "cmQTL_val1_2_samples_rla_plot.jpg" + )) + + +# RSD estimation ---------------------------------------------------------- + +RSD_bio_reps <- features_all %>% + filter(class == "sample") %>% + #filter(!is.na(taxa)) %>% + group_by(met) %>% + #filter((any(qc_imputed) == T)==F) %>% + group_by(met, treatment, genotype, tissue, exp) %>% + summarise(RSD_imp = sd(imp)/mean(imp), + RSD_isnorm = sd(isnorm) / mean(isnorm), + RSD_linnorm = abs(sd (linnorm_fw)/ mean(linnorm_fw)), + RSD_loess_norm = abs(sd(loess_norm_fw) / mean(loess_norm_fw))) + + +RSD_bio_reps_mean <- RSD_bio_reps %>% + #mutate(RSD_diff = RSD_imp-RSD_loess) %>% + ungroup() %>% + #group_by(treatment, tissue, exp) %>% + summarise(#mean_RSD_loess = mean(RSD_loess), + mean_imp = mean(na.omit(RSD_imp)), + mean_loess_norm = mean(na.omit(RSD_loess_norm)), + mean_RSD_isnorm = mean(na.omit(RSD_isnorm )), + mean_linnorm = mean(na.omit(RSD_linnorm))) + +RSD_qcs <- features_all %>% + filter(class == "Pooled Sample") %>% + group_by(met) %>% + #filter((any(qc_imputed) == T)==F) %>% + group_by(met, exp) %>% + summarise(RSD_imp = sd(imp)/mean(imp), + RSD_isnorm = sd(isnorm) / mean(isnorm), + RSD_linnorm = abs(sd (linnorm_fw)/ mean(linnorm_fw)), + RSD_loess_norm = abs(sd(loess_norm_fw) / mean(loess_norm_fw))) + +RSD_qcs_mean <- RSD_qcs %>% + #mutate(RSD_diff = RSD_imp-RSD_loess) %>% + ungroup() %>% + group_by(exp) %>% + summarise(#mean_RSD_loess = mean(RSD_loess), + mean_imp = mean(na.omit(RSD_imp)), + mean_loess_norm = mean(na.omit(RSD_loess_norm)), + mean_RSD_isnorm = mean(na.omit(RSD_isnorm )), + mean_linnorm = mean(na.omit(RSD_linnorm))) + + + +#PCA testing chunk#### +##https://www.intechopen.com/books/metabolomics-fundamentals-and-applications/processing-and-visualization-of-metabolomics-data-using-r +#exp1 +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 1) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = linnorm_fw_log) %>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + left_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_GC= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = tissue)) + + stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(here(out, + "cmQTL_val1_tissue_lin_PCA.jpg" + ), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = treatment)) + + stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(here(out, + "cmQTL_val1_treatment_PCA.jpg" + ), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +#PCA after loess normalization + +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 1) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = loess_norm_fw_log) %>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = tissue)) + + stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(here(out, + "cmQTL_val1_tissue_loess_PCA.jpg" + ), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = treatment)) + + stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(here(out, + "cmQTL_val1_treatment_loess_PCA.jpg" + ), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + + + +#PCA after loess normalization exp2 + +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 2) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(isnorm)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + #left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = batch_GC)) + + stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC %in% exclude_samples, machine_num_GC, ""))) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(here(out, + "cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg" + ), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +# Per tissue loess pca ---------------------------------------------------- + + +tissues <- c("leaf", "fruit") +for(tiss in seq_along(tissues)){ + + pca_base <- features_all %>% + ungroup() %>% + filter(exp == 1, tissue == tissues[[tiss]]) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment,batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = loess_norm_fw_log) %>% + filter(!is.na(m_1)) + + pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + + sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + + pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + + exp_var <- as_tibble(pca[["importance"]]) + + pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = treatment)) + + stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + + ggsave(here(out, + "cmQTL_val1", tissues[[tiss]], "treatment_loess_PCA.jpg" + ), + width = 183, + height = 100, + units = "mm", + dpi = 300) +} + + +# PCA before normalization + +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 1) %>% + arrange(batch_pca, daily_num) %>% + mutate(log_imp = log2(imp)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_pca, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp)%>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_pca), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = tissue)) + + stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, "")), + nudge_y = -2, + nudge_x = 4) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(here(out, + "cmQTL_val1_tissue_imp_PCA.jpg" + ), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = batch_GC)) + + stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, "")), + nudge_y = -2, + nudge_x = 4) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(here(out, + "cmQTL_val1_batch_imp_PCA.jpg" + ), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, "")), + nudge_y = -2, + nudge_x = 4) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(here(out, + "cmQTL_val1_class_imp_PCA.jpg" + ), + width = 183, + height = 100, + units = "mm", + dpi = 300) + +#cmQTLval2 +pca_base <- features_all %>% + ungroup() %>% + filter(exp == 2, batch_GC == "3_split") %>% + arrange(batch_GC, daily_num) %>% + mutate(log_imp = log2(isnorm)) %>% + pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC), + names_from = met, + values_from = log_imp)%>% + filter(!is.na(m_1)) + +pca <- summary(prcomp( + pca_base%>% + select(starts_with("m_")))) + +sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames() + +pca_plot <- as_tibble(pca$x) %>% + mutate(join_num = 1:nrow(pca$x)) %>% + full_join(pca_base %>% + mutate(join_num=1:nrow(pca$x))) %>% + left_join(sam_dat) %>% + mutate(batch_pca= as_factor(batch_GC), + class= as_factor(class)) %>% + select(all_of(sam_vars), everything()) + +exp_var <- as_tibble(pca[["importance"]]) + + +pca_plot %>% + # filter(treatment == "HL") %>% + ggplot()+ + geom_jitter(aes(x=PC1, y=PC2, color = class)) + + stat_ellipse(aes(x=PC1, y=PC2, color = class)) + + geom_text(aes(x = PC1, y = PC2, + label = if_else(machine_num_GC == "21109rA_59", machine_num_GC, "")), + nudge_y = -2, + nudge_x = 4) + + ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) + + xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)")) + +ggsave(here(out, + "cmQTL_val2_class_isnorm_PCA.jpg" + ), + width = 183, + height = 100, + units = "mm", + dpi = 300) + + +# Plot all metabolites ---------------------------------------------------- + +plotmets <- features_all %>% distinct(met) %>% as_vector +plotmet_labs <- plotmets %>% as_tibble() %>% + left_join(met_dat, by= c("value" = "met")) %>% + #mutate(peak_num = base::rank(HMDB_clear_name, ties.method = "first"), + # dup = sum(peak_num), + # HMDB_clear_name_unique = if_else(dup>1, str_c(HMDB_clear_name, peak_num), HMDB_clear_name)) %>% + #ungroup() %>% + select(Compound_Name) %>% as_vector() + +plot_out <- vector("list", length = length(plotmets)) + +for (meta in seq_along(plotmets)) { + + plot_out [[meta]] <- features_all %>% + mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)), + is_miss = as_factor(if_else(is.na(area), T, F))) %>% + filter(met == plotmets[[meta]]) %>% + filter(class!="blank") %>% + ggplot(aes(x=run_num_GC, y=isnorm)) + + geom_point(aes(color = class, shape = is_miss)) + + geom_point(aes(y=predict), color="black", size=0.1) + + geom_vline(aes(xintercept = xint))+ + # facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") + + ggtitle(label = plotmet_labs[[meta]]) + +} + +pdf(file = here(out,"_loess_fit.pdf")) + + +for (meta in seq_along(plotmets)) { + print(plot_out[[meta]]) +} + +dev.off() + +plot_out <- vector("list", length = length(plotmets)) + +for (meta in seq_along(plotmets)) { + + plot_out [[meta]] <- features_all %>% + mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC))) %>% + filter(met == plotmets[[meta]]) %>% + filter(class!="blank") %>% + ggplot(aes(x=run_num_GC, y=loess_norm_fw_log)) + + geom_point(aes(color=class)) + + #geom_point(aes(y=predp), color="black", size=0.1) + + geom_vline(aes(xintercept = xint))+ + # facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") + + ggtitle(label = plotmet_labs[[meta]]) + +} + +pdf(file = here(out,"_loess_norm_fw_log.pdf")) + + +for (meta in seq_along(plotmets)) { + print(plot_out[[meta]]) +} + +dev.off() + +for (meta in seq_along(plotmets)) { + + plot_out [[meta]] <- features_all %>% + mutate(is_miss = as_factor(if_else(is.na(area), T, F))) %>% + filter(met == plotmets[[meta]]) %>% + filter(class!="blank") %>% + ggplot(aes(x=daily_num, y=isnorm)) + + geom_point(aes(color = class, shape = is_miss)) + + geom_point(aes(y=predict), color="black", size=0.1) + + facet_wrap(facets = vars(batch_GC), nrow = 1, scales = "free") + + ggtitle(label = plotmet_labs[[meta]]) + + theme(axis.ticks.y = element_blank(), + axis.text.y = element_blank(), + panel.spacing = unit(-0.1,"lines")) + +} + +pdf(file = here(out,"_loess_norm_no_scale.pdf"), + height = 3.5) + + +for (meta in seq_along(plotmets)) { + print(plot_out[[meta]]) +} + +dev.off() + +for (meta in seq_along(plotmets)) { + + plot_out [[meta]] <- features_all %>% + mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)), + is_miss = as_factor(if_else(is.na(area), T, F))) %>% + filter(met == plotmets[[meta]]) %>% + filter(class!="blank") %>% + ggplot(aes(x=run_num_GC, y=isnorm)) + + geom_point(aes(color = class, shape = is_miss)) + + geom_point(aes(y=predp), color="black", size=0.1) + + geom_vline(aes(xintercept = xint))+ + # facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") + + ggtitle(label = plotmet_labs[[meta]]) + +} + +pdf(file = here(out,"_linnorm.pdf")) + +for (meta in seq_along(plotmets)) { + print(plot_out[[meta]]) +} + +dev.off() + +rescale <- features_all %>% + group_by(met, tissue, exp) %>% + summarise(rescale = median(imp)) + +sam_vars <- c(#"plantline","extraction_num", "sample_num", #in case I add them back + "alias", "LIMS_ID", + "treatment", "tissue", "batch_GC", "run_date_GC", + "machine_num_GC", + "class", "run_num_GC", "sample_weight", "exp", "genotype") + + +features_out <- features_all %>% + left_join(rescale) %>% + filter(class != "blank", + (component %in% take_split & str_detect(batch_GC, "split"))| + (!component %in% take_split & str_detect(batch_GC, "split", negate = T))| + tissue == "leaves") %>% + group_by(met, tissue, exp) %>% + mutate(loess_norm_med = loess_norm_fw/median(loess_norm_fw), + rescaled = loess_norm_med*rescale) %>% + select(all_of(sam_vars), met, Compound_Name, loess_norm_fw, loess_norm_med, area, rescaled) + +#features_out %>% +# filter(met == "m_02177", tissue == "fruits", exp == 1) %>% +# ggplot(aes(x = run_num_GC, y = loess_norm_fw)) + +# geom_point() + +# Write files ------------------------------------------------------------- + + + +write_csv(features_out, + here(out, + "cmQTL_val_1_2_feat_dat_GC.csv" + )) + +write_csv(met_dat, + here(out, + "cmQTL_val_1_2_met_dat_GC.csv" + )) + +write_csv(sam_dat, + here(out, + "cmQTL_val_1_2_sam_dat_GC.csv" + )) + +write_csv(missingness, + here(out, + "cmQTL_val_1_2_miss_GC.csv" + )) + +# Log used code ------------------------------------------------------------ + +file_name <- sys.frame(1)$ofile + +file.copy(file_name, + to = here(out, "executing.R"),overwrite = T) + + +# Temporary .RData file to avoid having to rerun code --------------------- + + diff --git a/workflows/GC_MS_analysis/210903_primary_analysis_cmQTL_val_1_levene_RxNV.R b/workflows/GC_MS_analysis/210903_primary_analysis_cmQTL_val_1_levene_RxNV.R index 291e54c653db34aefca7a7b2e4fafd9e7c2eba79..e1d3217ecb1f5fd1f79c12eff68941089f27f9fe 100644 --- a/workflows/GC_MS_analysis/210903_primary_analysis_cmQTL_val_1_levene_RxNV.R +++ b/workflows/GC_MS_analysis/210903_primary_analysis_cmQTL_val_1_levene_RxNV.R @@ -13,13 +13,14 @@ library(ggpubr) library(viridisLite) library(ggtext) library(glue) +library(here) current <- getwd() source <- str_c(current,"/..") cur_date <- str_c(str_replace_all(Sys.Date(),"^.{2}|-","")) -out <- str_c(cur_date, "analysis", sep = "_") +out <- str_c("GC_MS_analysis", sep = "_") if (file.exists(out)) { cat("The folder already exists") diff --git a/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R index fb2c78b6c017b8983876102ce48a6cb21cdeff94..033dfcf79a7ede6c299cd44b58bea53f7afc2648 100644 --- a/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R +++ b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R @@ -29,6 +29,8 @@ setwd(here())# Not recommended but convenient in Rstudio to start from root # Data loading ------------------------------------------------------------ +load(here("./runs/GC-MS normalization/Image.RData")) + sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx")) isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1) isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2) @@ -1103,9 +1105,95 @@ write_csv(missingness, "cmQTL_val_1_2_miss_GC.csv" )) +# Data dictionary --------------------------------------------------------- + +sam_dat_dic <- colnames(sam_dat) %>% + as_tibble() + +orig <- colnames(sam_dat1) %>% + as_tibble() + +break_lines_old <- function(x){ + id <- vector(mode = "character", length = length(x)) + term_source <- vector(mode = "character", length = length(x)) + term_acc <- vector(mode = "character", length = length(x)) + for(i in seq_along(x)){ + if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){ + id[[i]] <- x[[i]] + #if(str_detect(x[[i+1]], "unit")) { +# isa_unit[[i]] <- x[[i+1]] + term_source[[i]] <- x[[i+2]] + term_acc[[i]] <- x[[i+3]] + # } else { + term_source[[i]] <- x[[i+1]] + term_acc[[i]] <- x[[i+2]] +# } + } else { + id[[i]] <- x[[i]] + } + } + test_tib <- tibble(id = id, + # unit = isa_unit, + term_source = term_source, + term_acc = term_acc) %>% + filter() + assign("output",test_tib, pos = 1) +} + +break_lines(orig$value) #works without unit + +break_lines <- function(x, output_name){ + x <- colnames(x) %>% + as_tibble() + x <- x$value + group = vector(mode = "list", length = length(x)) + for(i in seq_along(x)){ + if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){ + group[[i]] <- tibble(id = x[[i]]) + if(str_detect(x[[i+1]], "Unit")) { + group[[i]] <- group[[i]] %>% mutate(isa_unit = x[[i+1]]) + if(str_detect(x[[i+2]], "Term Source REF")) { + group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+2]]) + if(str_detect(x[[i+3]], "Term Accession Number")){ + group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+3]]) + } + } + } else if(str_detect(x[[i+1]], "Term Source REF")) { + group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+1]]) + if(str_detect(x[[i+2]], "Term Accession Number")){ + group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+2]]) + } + } + } else if(str_detect(x[[i]], "Source REF|Accession Number|Unit",negate = T)){ + group[[i]] <- tibble(id = x[[i]]) + } + } + group_tib <- group %>% + discard(.p = is.null) %>% + reduce(.f = bind_rows) + assign(output_name,group_tib, pos = 1) +} +break_lines(sam_dat1, "output_samdat") +break_lines(isa_ext, "output_ext") +break_lines(isa_gc, "output_gc") +break_lines(isa_ms, "output_ms") + +output <- bind_rows(output_samdat, + output_ext, + output_gc, + output_ms) + +write.xlsx(output, + here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset_temp.xlsx")) + # Log used code ------------------------------------------------------------ file_name <- sys.frame(1)$ofile file.copy(file_name, - to = str_c(out_dir, str_remove(file_name, current), "_", str_replace_all(Sys.Date(),"^.{2}|-",""), ".R"),overwrite = T) + to = here(out, "executing.R"),overwrite = T) + + +# Temporary .RData file to avoid having to rerun code --------------------- + +save.image(here(out, "Image.RData"))