diff --git a/.Rhistory b/.Rhistory
index 2d2303bb43ca6d0e2251f2ea8cf3c122d01e3a7f..eafeceb68ebcb3a4f99f5521ef509431f3f24baf 100644
--- a/.Rhistory
+++ b/.Rhistory
@@ -1,512 +1,222 @@
-filter(exp == 1) %>%
-arrange(batch_pca, daily_num) %>%
-mutate(log_imp = log2(imp)) %>%
-pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC),
-names_from = met,
-values_from = loess_norm_fw_log) %>%
-filter(!is.na(m_1))
-pca <- summary(prcomp(
-pca_base%>%
-select(starts_with("m_"))))
-sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
-pca_plot <- as_tibble(pca$x) %>%
-mutate(join_num = 1:nrow(pca$x)) %>%
-full_join(pca_base %>%
-mutate(join_num=1:nrow(pca$x))) %>%
-#left_join(sam_dat) %>%
-mutate(batch_pca= as_factor(batch_GC),
-class= as_factor(class)) %>%
-select(all_of(sam_vars), everything())
-exp_var <- as_tibble(pca[["importance"]])
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = tissue)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),
-"cmQTL_val1_tissue_loess_PCA.jpg",
-sep = "_"),
-width = 183,
-height = 100,
-units = "mm",
-dpi = 300)
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = treatment)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),
-"cmQTL_val1_treatment_loess_PCA.jpg",
-sep = "_"),
-width = 183,
-height = 100,
-units = "mm",
-dpi = 300)
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = class)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = class)) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-pca_base <- features_all %>%
-ungroup() %>%
-filter(exp == 2) %>%
-arrange(batch_pca, daily_num) %>%
-mutate(log_imp = log2(isnorm)) %>%
-pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC),
-names_from = met,
-values_from = log_imp)
-pca <- summary(prcomp(
-pca_base%>%
-select(starts_with("m_"))))
-sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
-pca_plot <- as_tibble(pca$x) %>%
-mutate(join_num = 1:nrow(pca$x)) %>%
-full_join(pca_base %>%
-mutate(join_num=1:nrow(pca$x))) %>%
-#left_join(sam_dat) %>%
-mutate(batch_pca= as_factor(batch_GC),
-class= as_factor(class)) %>%
-select(all_of(sam_vars), everything())
-exp_var <- as_tibble(pca[["importance"]])
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = class)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = class)) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = batch_GC)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) +
-geom_text(aes(x = PC1, y = PC2,
-label = if_else(machine_num_GC %in% exclude_samples, machine_num_GC, NULL))) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = batch_GC)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) +
-geom_text(aes(x = PC1, y = PC2,
-label = if_else(machine_num_GC %in% exclude_samples, machine_num_GC, ""))) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),
-"cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg",
-sep = "_"),
-width = 183,
-height = 100,
-units = "mm",
-dpi = 300)
-tissues <- c("leaves", "fruits")
-for(tiss in seq_along(tissues)){
-pca_base <- features_all %>%
-ungroup() %>%
-filter(exp == 1, tissue == tissues[[tiss]]) %>%
-arrange(batch_pca, daily_num) %>%
-mutate(log_imp = log2(imp)) %>%
-pivot_wider(id_cols = c(tissue, treatment,batch_GC, daily_num, class, machine_num_GC),
-names_from = met,
-values_from = loess_norm_fw_log) %>%
-filter(!is.na(m_1))
-pca <- summary(prcomp(
-pca_base%>%
-select(starts_with("m_"))))
-sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
-pca_plot <- as_tibble(pca$x) %>%
-mutate(join_num = 1:nrow(pca$x)) %>%
-full_join(pca_base %>%
-mutate(join_num=1:nrow(pca$x))) %>%
-left_join(sam_dat) %>%
-mutate(batch_pca= as_factor(batch_GC),
-class= as_factor(class)) %>%
-select(all_of(sam_vars), everything())
-exp_var <- as_tibble(pca[["importance"]])
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = treatment)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),
-"cmQTL_val1", tissues[[tiss]], "treatment_loess_PCA.jpg",
-sep = "_"),
-width = 183,
-height = 100,
-units = "mm",
-dpi = 300)
-}
-View(features_all)
-pca_base <- features_all %>%
-ungroup() %>%
-filter(exp == 1, tissue == tissues[[tiss]]) %>%
-arrange(batch_pca, daily_num) %>%
-mutate(log_imp = log2(imp)) %>%
-pivot_wider(id_cols = c(tissue, treatment,batch_GC, daily_num, class, machine_num_GC),
-names_from = met,
-values_from = loess_norm_fw_log) %>%
-filter(!is.na(m_1))
-pca_base <- features_all %>%
-ungroup() %>%
-filter(exp == 1, tissue == tissues[[tiss]]) %>%
-arrange(batch_pca, daily_num) %>%
-mutate(log_imp = log2(imp)) %>%
-pivot_wider(id_cols = c(tissue, treatment,batch_GC, daily_num, class, machine_num_GC),
-names_from = met,
-values_from = loess_norm_fw_log) #%>%
-tissues <- c("leaf", "fruit")
-for(tiss in seq_along(tissues)){
-pca_base <- features_all %>%
-ungroup() %>%
-filter(exp == 1, tissue == tissues[[tiss]]) %>%
-arrange(batch_pca, daily_num) %>%
-mutate(log_imp = log2(imp)) %>%
-pivot_wider(id_cols = c(tissue, treatment,batch_GC, daily_num, class, machine_num_GC),
-names_from = met,
-values_from = loess_norm_fw_log) %>%
-filter(!is.na(m_1))
-pca <- summary(prcomp(
-pca_base%>%
-select(starts_with("m_"))))
-sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
-pca_plot <- as_tibble(pca$x) %>%
-mutate(join_num = 1:nrow(pca$x)) %>%
-full_join(pca_base %>%
-mutate(join_num=1:nrow(pca$x))) %>%
-left_join(sam_dat) %>%
-mutate(batch_pca= as_factor(batch_GC),
-class= as_factor(class)) %>%
-select(all_of(sam_vars), everything())
-exp_var <- as_tibble(pca[["importance"]])
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = treatment)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),
-"cmQTL_val1", tissues[[tiss]], "treatment_loess_PCA.jpg",
-sep = "_"),
-width = 183,
-height = 100,
-units = "mm",
-dpi = 300)
-}
-pca_base <- features_all %>%
-ungroup() %>%
-filter(exp == 1) %>%
-arrange(batch_pca, daily_num) %>%
-mutate(log_imp = log2(imp)) %>%
-pivot_wider(id_cols = c(tissue, treatment, batch_pca, daily_num, class, machine_num_GC),
-names_from = met,
-values_from = log_imp)%>%
-filter(!is.na(m_1))
-pca <- summary(prcomp(
-pca_base%>%
-select(starts_with("m_"))))
-sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
-pca_plot <- as_tibble(pca$x) %>%
-mutate(join_num = 1:nrow(pca$x)) %>%
-full_join(pca_base %>%
-mutate(join_num=1:nrow(pca$x))) %>%
-left_join(sam_dat) %>%
-mutate(batch_pca= as_factor(batch_pca),
-class= as_factor(class)) %>%
-select(all_of(sam_vars), everything())
-exp_var <- as_tibble(pca[["importance"]])
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = tissue)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) +
-geom_text(aes(x = PC1, y = PC2,
-label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, NULL)),
-nudge_y = -2,
-nudge_x = 4) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = tissue)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) +
-geom_text(aes(x = PC1, y = PC2,
-label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, "")),
-nudge_y = -2,
-nudge_x = 4) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),
-"cmQTL_val1_tissue_imp_PCA.jpg",
-sep = "_"),
-width = 183,
-height = 100,
-units = "mm",
-dpi = 300)
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = batch_GC)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) +
-geom_text(aes(x = PC1, y = PC2,
-label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, NULL)),
-nudge_y = -2,
-nudge_x = 4) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = batch_GC)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) +
-geom_text(aes(x = PC1, y = PC2,
-label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, "")),
-nudge_y = -2,
-nudge_x = 4) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),
-"cmQTL_val1_batch_imp_PCA.jpg",
-sep = "_"),
-width = 183,
-height = 100,
-units = "mm",
-dpi = 300)
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = class)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = class)) +
-geom_text(aes(x = PC1, y = PC2,
-label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, NULL)),
-nudge_y = -2,
-nudge_x = 4) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = class)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = class)) +
-geom_text(aes(x = PC1, y = PC2,
-label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, "")),
-nudge_y = -2,
-nudge_x = 4) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),
-"cmQTL_val1_class_imp_PCA.jpg",
-sep = "_"),
-width = 183,
-height = 100,
-units = "mm",
-dpi = 300)
-#cmQTLval2
-pca_base <- features_all %>%
-ungroup() %>%
-filter(exp == 2, batch_GC == "3_split") %>%
-arrange(batch_GC, daily_num) %>%
-mutate(log_imp = log2(isnorm)) %>%
-pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC),
-names_from = met,
-values_from = log_imp)%>%
-filter(!is.na(m_1))
-pca <- summary(prcomp(
-pca_base%>%
-select(starts_with("m_"))))
-sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
-pca_plot <- as_tibble(pca$x) %>%
-mutate(join_num = 1:nrow(pca$x)) %>%
-full_join(pca_base %>%
-mutate(join_num=1:nrow(pca$x))) %>%
-left_join(sam_dat) %>%
-mutate(batch_pca= as_factor(batch_GC),
-class= as_factor(class)) %>%
-select(all_of(sam_vars), everything())
-exp_var <- as_tibble(pca[["importance"]])
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = class)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = class)) +
-geom_text(aes(x = PC1, y = PC2,
-label = if_else(machine_num_GC == "21109rA_59", machine_num_GC, NULL)),
-nudge_y = -2,
-nudge_x = 4) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-pca_plot %>%
-#  filter(treatment == "HL") %>%
-ggplot()+
-geom_jitter(aes(x=PC1, y=PC2,  color = class)) +
-stat_ellipse(aes(x=PC1, y=PC2, color = class)) +
-geom_text(aes(x = PC1, y = PC2,
-label = if_else(machine_num_GC == "21109rA_59", machine_num_GC, "")),
-nudge_y = -2,
-nudge_x = 4) +
-ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
-xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
-ggsave(str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),
-"cmQTL_val2_class_isnorm_PCA.jpg",
-sep = "_"),
-width = 183,
-height = 100,
-units = "mm",
-dpi = 300)
-plotmets <- features_all %>% distinct(met) %>% as_vector
-plotmet_labs <- plotmets %>% as_tibble() %>%
-left_join(met_dat, by= c("value" = "met")) %>%
-#mutate(peak_num = base::rank(HMDB_clear_name, ties.method = "first"),
-#       dup = sum(peak_num),
-#       HMDB_clear_name_unique = if_else(dup>1, str_c(HMDB_clear_name, peak_num), HMDB_clear_name)) %>%
-#ungroup() %>%
-select(Compound_Name) %>% as_vector()
-plot_out <- vector("list", length = length(plotmets))
-for (meta in seq_along(plotmets)) {
-plot_out [[meta]] <- features_all %>%
-mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)),
-is_miss = as_factor(if_else(is.na(area), T, F))) %>%
-filter(met == plotmets[[meta]]) %>%
-filter(class!="blank") %>%
-ggplot(aes(x=run_num_GC, y=isnorm)) +
-geom_point(aes(color = class, shape = is_miss)) +
-geom_point(aes(y=predict), color="black", size=0.1) +
-geom_vline(aes(xintercept = xint))+
-# facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") +
-ggtitle(label = plotmet_labs[[meta]])
-}
-pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_loess_fit.pdf"))
-for (meta in seq_along(plotmets)) {
-print(plot_out[[meta]])
-}
-dev.off()
-plot_out <- vector("list", length = length(plotmets))
-for (meta in seq_along(plotmets)) {
-plot_out [[meta]] <- features_all %>%
-mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC))) %>%
-filter(met == plotmets[[meta]]) %>%
-filter(class!="blank") %>%
-ggplot(aes(x=run_num_GC, y=loess_norm_fw_log)) +
-geom_point(aes(color=class)) +
-#geom_point(aes(y=predp), color="black", size=0.1) +
-geom_vline(aes(xintercept = xint))+
-# facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") +
-ggtitle(label = plotmet_labs[[meta]])
-}
-pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_loess_norm_fw_log.pdf"))
-for (meta in seq_along(plotmets)) {
-print(plot_out[[meta]])
-}
-dev.off()
-for (meta in seq_along(plotmets)) {
-plot_out [[meta]] <- features_all %>%
-mutate(is_miss = as_factor(if_else(is.na(area), T, F))) %>%
-filter(met == plotmets[[meta]]) %>%
-filter(class!="blank") %>%
-ggplot(aes(x=daily_num, y=isnorm)) +
-geom_point(aes(color = class, shape = is_miss)) +
-geom_point(aes(y=predict), color="black", size=0.1) +
-facet_wrap(facets = vars(batch_GC), nrow = 1, scales = "free") +
-ggtitle(label = plotmet_labs[[meta]]) +
-theme(axis.ticks.y = element_blank(),
-axis.text.y = element_blank(),
-panel.spacing = unit(-0.1,"lines"))
-}
-pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_loess_norm_no_scale.pdf"),
-height = 3.5)
-for (meta in seq_along(plotmets)) {
-print(plot_out[[meta]])
-}
-dev.off()
-for (meta in seq_along(plotmets)) {
-plot_out [[meta]] <- features_all %>%
-mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)),
-is_miss = as_factor(if_else(is.na(area), T, F))) %>%
-filter(met == plotmets[[meta]]) %>%
-filter(class!="blank") %>%
-ggplot(aes(x=run_num_GC, y=isnorm)) +
-geom_point(aes(color = class, shape = is_miss)) +
-geom_point(aes(y=predp), color="black", size=0.1) +
-geom_vline(aes(xintercept = xint))+
-# facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") +
-ggtitle(label = plotmet_labs[[meta]])
-}
-pdf(file = str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),"_linnorm.pdf"))
-for (meta in seq_along(plotmets)) {
-print(plot_out[[meta]])
-}
-dev.off()
-rescale <- features_all %>%
-group_by(met, tissue, exp) %>%
-summarise(rescale = median(imp))
-sam_vars <- c("plantline", "alias", "LIMS_ID",
-"treatment", "tissue", "batch_GC", "run_date_GC",
-"extraction_num", "sample_num", "machine_num_GC",
-"class", "run_num_GC", "sample_weight", "exp", "genotype")
-features_out <- features_all %>%
-left_join(rescale) %>%
-filter(class != "blank",
-(component %in% take_split & str_detect(batch_GC, "split"))|
-(!component %in% take_split & str_detect(batch_GC, "split", negate = T))|
-tissue == "leaves") %>%
-group_by(met, tissue, exp) %>%
-mutate(loess_norm_med = loess_norm_fw/median(loess_norm_fw),
-rescaled = loess_norm_med*rescale) %>%
-select(all_of(sam_vars), met, Compound_Name,  loess_norm_fw, loess_norm_med, area, rescaled)
-features_all %>% colnames()
-sam_vars %in% colnames(features_all)
-sam_vars <- c(#"plantline",
-"alias", "LIMS_ID",
-"treatment", "tissue", "batch_GC", "run_date_GC",
-"extraction_num", "sample_num", "machine_num_GC",
-"class", "run_num_GC", "sample_weight", "exp", "genotype")
-sam_vars
-sam_vars <- c(#"plantline",
-"alias", "LIMS_ID",
-"treatment", "tissue", "batch_GC", "run_date_GC",
-"extraction_num", "sample_num", "machine_num_GC",
-"class", "run_num_GC", "sample_weight", "exp", "genotype")
-sam_vars %in% colnames(features_all)
-sam_vars <- c(#"plantline","extraction_num", "sample_num", #in case I add them back
-"alias", "LIMS_ID",
-"treatment", "tissue", "batch_GC", "run_date_GC",
-"machine_num_GC",
-"class", "run_num_GC", "sample_weight", "exp", "genotype")
-features_out <- features_all %>%
-left_join(rescale) %>%
-filter(class != "blank",
-(component %in% take_split & str_detect(batch_GC, "split"))|
-(!component %in% take_split & str_detect(batch_GC, "split", negate = T))|
-tissue == "leaves") %>%
-group_by(met, tissue, exp) %>%
-mutate(loess_norm_med = loess_norm_fw/median(loess_norm_fw),
-rescaled = loess_norm_med*rescale) %>%
-select(all_of(sam_vars), met, Compound_Name,  loess_norm_fw, loess_norm_med, area, rescaled)
-View(features_out)
-write_csv(features_out,
-str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),
-"cmQTL_val_1_2_feat_dat_GC.csv",
-sep = "_"))
-write_csv(met_dat,
-str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),
-"cmQTL_val_1_2_met_dat_GC.csv",
-sep = "_"))
-write_csv(sam_dat,
-str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),
-"cmQTL_val_1_2_sam_dat_GC.csv",
-sep = "_"))
-write_csv(missingness,
-str_c(str_replace_all(Sys.Date(),"^.{2}|-",""),
-"cmQTL_val_1_2_miss_GC.csv",
-sep = "_"))
-file_name <- sys.frame(1)$ofile
+library(here)
+load(here("./runs/GC-MS normalization/Image.RData"))
+library(openxlsx)
+library(tidyverse)
+library(car)
+library(pheatmap)
+library(broom)
+library(ggpubr)
+library(viridisLite)
+library(modelr)
+#library(dlookr)
+#library(imputeLCMD)
+library(ggrepel)
+sam_dat_dic <- colnames(sam_dat) %>%
+as_tibble()
+orig <- colnames(sam_dat1) %>%
+as_tibble()
+break_lines <- function(x){
+group = vector(mode = "list", length = length(x))
+#id <- vector(mode = "character", length = length(x))
+#isa_unit <-
+#term_source <- vector(mode = "character", length = length(x))
+#term_acc <- vector(mode = "character", length = length(x))
+for(i in seq_along(x)){
+if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){
+group[[i]] <- tibble(id = x[[i]])
+if(str_detect(x[[i+1]], "Unit")) {
+group[[i]] <- group[[i]] %>% mutate(isa_unit = x[[i+1]])
+if(str_detect(x[[i+2]], "Term Source REF")) {
+group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+2]])
+if(str_detect(x[[i+3]], "Term Accession Number")){
+group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+3]])
+}
+}
+} else if(str_detect(x[[i+1]], "Term Source REF")) {
+group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+1]])
+if(str_detect(x[[i+2]], "Term Accession Number")){
+group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+2]])
+}
+}
+} else {
+#id[[i]] <- x[[i]]
+}
+}
+group_tib <- group %>%
+discard(.p = is.null) %>%
+reduce(.f = bind_rows)
+#test_tib <- tibble(id = id,
+#                   unit = isa_unit,
+#                   term_source = term_source,
+#                   term_acc = term_acc) %>%
+#  filter()
+assign("output",group, pos = 1)
+}
+break_lines(orig$value)
+break_lines <- function(x){
+group = vector(mode = "list", length = length(x))
+#id <- vector(mode = "character", length = length(x))
+#isa_unit <-
+#term_source <- vector(mode = "character", length = length(x))
+#term_acc <- vector(mode = "character", length = length(x))
+for(i in seq_along(x)){
+if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){
+group[[i]] <- tibble(id = x[[i]])
+if(str_detect(x[[i+1]], "Unit")) {
+group[[i]] <- group[[i]] %>% mutate(isa_unit = x[[i+1]])
+if(str_detect(x[[i+2]], "Term Source REF")) {
+group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+2]])
+if(str_detect(x[[i+3]], "Term Accession Number")){
+group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+3]])
+}
+}
+} else if(str_detect(x[[i+1]], "Term Source REF")) {
+group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+1]])
+if(str_detect(x[[i+2]], "Term Accession Number")){
+group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+2]])
+}
+}
+} else {
+#id[[i]] <- x[[i]]
+}
+}
+group_tib <- group %>%
+discard(.p = is.null) %>%
+reduce(.f = bind_rows)
+#test_tib <- tibble(id = id,
+#                   unit = isa_unit,
+#                   term_source = term_source,
+#                   term_acc = term_acc) %>%
+#  filter()
+assign("output",group_tib, pos = 1)
+}
+break_lines(orig$value)
+View(output)
+str_detect(c("test", "different", "capture", "groups"), "test")
+str_detect(c("test", "different", "capture", "groups"), "test|different")
+break_lines <- function(x){
+group = vector(mode = "list", length = length(x))
+for(i in seq_along(x)){
+if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){
+group[[i]] <- tibble(id = x[[i]])
+if(str_detect(x[[i+1]], "Unit")) {
+group[[i]] <- group[[i]] %>% mutate(isa_unit = x[[i+1]])
+if(str_detect(x[[i+2]], "Term Source REF")) {
+group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+2]])
+if(str_detect(x[[i+3]], "Term Accession Number")){
+group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+3]])
+}
+}
+} else if(str_detect(x[[i+1]], "Term Source REF")) {
+group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+1]])
+if(str_detect(x[[i+2]], "Term Accession Number")){
+group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+2]])
+}
+}
+} else if(str_detect(x[[i]], "Source REF|Accession Number|Unit",negate = T)){
+group[[i]] <- tibble(id = x[[i]])
+}
+}
+group_tib <- group %>%
+discard(.p = is.null) %>%
+reduce(.f = bind_rows)
+assign("output",group_tib, pos = 1)
+}
+break_lines(orig$value)
+View(output)
+here()
+View(sam_dat_dic)
+View(sam_dat)
+View(isa_ext)
+View(met_dat)
+break_lines <- function(x){
+x <- colnames(x) %>%
+as_tibble()
+x <- x$value
+group = vector(mode = "list", length = length(x))
+for(i in seq_along(x)){
+if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){
+group[[i]] <- tibble(id = x[[i]])
+if(str_detect(x[[i+1]], "Unit")) {
+group[[i]] <- group[[i]] %>% mutate(isa_unit = x[[i+1]])
+if(str_detect(x[[i+2]], "Term Source REF")) {
+group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+2]])
+if(str_detect(x[[i+3]], "Term Accession Number")){
+group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+3]])
+}
+}
+} else if(str_detect(x[[i+1]], "Term Source REF")) {
+group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+1]])
+if(str_detect(x[[i+2]], "Term Accession Number")){
+group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+2]])
+}
+}
+} else if(str_detect(x[[i]], "Source REF|Accession Number|Unit",negate = T)){
+group[[i]] <- tibble(id = x[[i]])
+}
+}
+group_tib <- group %>%
+discard(.p = is.null) %>%
+reduce(.f = bind_rows)
+assign("output",group_tib, pos = 1)
+}
+break_lines(sam_dat1)
+break_lines <- function(x, output_name){
+x <- colnames(x) %>%
+as_tibble()
+x <- x$value
+group = vector(mode = "list", length = length(x))
+for(i in seq_along(x)){
+if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){
+group[[i]] <- tibble(id = x[[i]])
+if(str_detect(x[[i+1]], "Unit")) {
+group[[i]] <- group[[i]] %>% mutate(isa_unit = x[[i+1]])
+if(str_detect(x[[i+2]], "Term Source REF")) {
+group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+2]])
+if(str_detect(x[[i+3]], "Term Accession Number")){
+group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+3]])
+}
+}
+} else if(str_detect(x[[i+1]], "Term Source REF")) {
+group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+1]])
+if(str_detect(x[[i+2]], "Term Accession Number")){
+group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+2]])
+}
+}
+} else if(str_detect(x[[i]], "Source REF|Accession Number|Unit",negate = T)){
+group[[i]] <- tibble(id = x[[i]])
+}
+}
+group_tib <- group %>%
+discard(.p = is.null) %>%
+reduce(.f = bind_rows)
+assign(output_name,group_tib, pos = 1)
+}
+break_lines(sam_dat1, "output_samdat")
+break_lines(isa_ext, "output_ext")
+break_lines(isa_gc, "output_gc")
+View(output_gc)
+output <- bind_rows(output_samdat,
+output_ext,
+output_gc,
+output_ms)
+beak_lines(isa_ms, "output_ms")
+break_lines(isa_ms, "output_ms")
+output <- bind_rows(output_samdat,
+output_ext,
+output_gc,
+output_ms)
+here()
+getwd()
+setwd(here())# Not recommended but convenient in Rstudio to start from root
+write.xlsx(output,
+here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset_temp.xlsx"))
+here(out,
+"cmQTL_val_1_2_met_dat_GC.csv"
+)
+here(out,
+"cmQTL_val_1_2_sam_dat_GC.csv"
+)
+View(isa_ms)
+View(isa_ms_tidy)
+1176-1145
diff --git a/assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset.xlsx b/assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..565d6b5087a772cf0e5d39ca9bc7e607e4804dba
--- /dev/null
+++ b/assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset.xlsx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68d73ca9855cf91cfd9ea8c200f65bef8982e2c6be605924ea29c9865aacc1b3
+size 11468
diff --git a/assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset_temp.xlsx b/assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset_temp.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..270d5f18db8cc20042b209303be26c4e92266ecd
--- /dev/null
+++ b/assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset_temp.xlsx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d63d90d0e4df45541ea52c0349c41dd82b4c320e98e4382d1728583401de1f5c
+size 9962
diff --git a/runs/GC-MS normalization/Image.RData b/runs/GC-MS normalization/Image.RData
new file mode 100644
index 0000000000000000000000000000000000000000..8be71b208442c3116797ce1ca7ca0cab18d3b75e
Binary files /dev/null and b/runs/GC-MS normalization/Image.RData differ
diff --git a/runs/GC-MS normalization/_linnorm.pdf b/runs/GC-MS normalization/_linnorm.pdf
index a1b56f697faca85f53cffe087e10188020b487eb..0b6ddced2d2b929731c5d317f19c49e63609e7b6 100644
Binary files a/runs/GC-MS normalization/_linnorm.pdf and b/runs/GC-MS normalization/_linnorm.pdf differ
diff --git a/runs/GC-MS normalization/_loess_fit.pdf b/runs/GC-MS normalization/_loess_fit.pdf
index bd907e7c808daf905f2eb0c1fefbf4b6dd67cec7..7bfdd6326ceb485a60331fad6624b9d758d3557f 100644
Binary files a/runs/GC-MS normalization/_loess_fit.pdf and b/runs/GC-MS normalization/_loess_fit.pdf differ
diff --git a/runs/GC-MS normalization/_loess_norm_fw_log.pdf b/runs/GC-MS normalization/_loess_norm_fw_log.pdf
index a492791f0567d2695de7bfc0aebffd11767487b9..22ae1df4324951ec29e20aa70d41f420c9c7a5cc 100644
Binary files a/runs/GC-MS normalization/_loess_norm_fw_log.pdf and b/runs/GC-MS normalization/_loess_norm_fw_log.pdf differ
diff --git a/runs/GC-MS normalization/_loess_norm_no_scale.pdf b/runs/GC-MS normalization/_loess_norm_no_scale.pdf
index d0eab2cc0a08350ee287e82656d9992acc32f877..2090c30d3b5277dedeb25afad9bc4c4a476210e5 100644
Binary files a/runs/GC-MS normalization/_loess_norm_no_scale.pdf and b/runs/GC-MS normalization/_loess_norm_no_scale.pdf differ
diff --git a/runs/GC-MS normalization/cmQTL_val1/fruit/treatment_loess_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1/fruit/treatment_loess_PCA.jpg
index 789d9d8d26208396f7a2535f46c55dde33d6a2eb..a687b9e48a031a8c5b287b7f75fdd1bc5832e813 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1/fruit/treatment_loess_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1/fruit/treatment_loess_PCA.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val1/leaf/treatment_loess_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1/leaf/treatment_loess_PCA.jpg
index e659cf390d18d9ca65f85fe4bec0043faf6410ce..67c5f2771b2781afc26556b0bc74b45dadf9b0b4 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1/leaf/treatment_loess_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1/leaf/treatment_loess_PCA.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val1_2_GC_met_rla_plot.jpg b/runs/GC-MS normalization/cmQTL_val1_2_GC_met_rla_plot.jpg
index 06b3542a0890dcdddecd65920ad49fb776d37e6b..29bca42b2d92c463ef4374566f89e9ffc9b550be 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1_2_GC_met_rla_plot.jpg and b/runs/GC-MS normalization/cmQTL_val1_2_GC_met_rla_plot.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val1_2_samples_rla_plot.jpg b/runs/GC-MS normalization/cmQTL_val1_2_samples_rla_plot.jpg
index 3183af762fd7b550d01f976017f1e26b10c88778..53f32e810a5d957a0e90e82fcc492472660707d2 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1_2_samples_rla_plot.jpg and b/runs/GC-MS normalization/cmQTL_val1_2_samples_rla_plot.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val1_batch_imp_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_batch_imp_PCA.jpg
index bfb90fe7db1ec011aff8c1c5fa846dd028ff8dde..35477ad863ad9ddc53de4df7f3c721f174e06f6a 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1_batch_imp_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_batch_imp_PCA.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val1_class_imp_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_class_imp_PCA.jpg
index 8bf923236138f1abd6da0478170f2059501ace7e..b72fd0531a873787004673c8a1978ce9b1f57a45 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1_class_imp_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_class_imp_PCA.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val1_tissue_imp_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_tissue_imp_PCA.jpg
index 585c8bc5615e1edcf65763edcb06b9d84dad3bfd..729c414ef28a035d73db4e8374c24094767bb049 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1_tissue_imp_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_tissue_imp_PCA.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val1_tissue_lin_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_tissue_lin_PCA.jpg
index faf8d896aedd0bd6a754d65e01b0af3aa7ea8149..2f259827bad98892c43145cdd5c86136d461e83b 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1_tissue_lin_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_tissue_lin_PCA.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val1_tissue_loess_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_tissue_loess_PCA.jpg
index 2cbbe7ed76721a8296bd43591e3f75a9d15e2a44..10cfd249bc7c2551f7393dff512b14ea7c1e9ffa 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1_tissue_loess_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_tissue_loess_PCA.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val1_treatment_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_treatment_PCA.jpg
index fc6cbc86aa15983c3f60d9e081fe9609d01219be..5449fe34da91b73c18e263d45ff333f86f1be4b8 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1_treatment_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_treatment_PCA.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_PCA.jpg
index 585c72dbe95ad7565935212cf792b17ea91b0903..cf488b11c19ebea36f3f4a6686cc56024bc72f4c 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_PCA.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp1_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp1_PCA.jpg
index 2009c9ad376ef6f66ef4cf225379862907daf3e7..8a0493bbcb5991cab0e77a86983d541aca4065d3 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp1_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp1_PCA.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_imp_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_imp_PCA.jpg
index 5114cb5449c9d180cd337340f4b4f7f3f28992b2..9338c3c8b38a238d168c7a476c8a2aa78100c8c3 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_imp_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_imp_PCA.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg
index f34ef7a03390e9da1dad9f6864016268a9a5f259..f53f7af88ccf9e389f2d775e08d1e365199e0bbe 100644
Binary files a/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg differ
diff --git a/runs/GC-MS normalization/cmQTL_val2_class_isnorm_PCA.jpg b/runs/GC-MS normalization/cmQTL_val2_class_isnorm_PCA.jpg
index 62fcac95ef7682be33061aecdd854a6f74be4459..d82b4de00e63ff9514e8518cdf89af741eac11c2 100644
Binary files a/runs/GC-MS normalization/cmQTL_val2_class_isnorm_PCA.jpg and b/runs/GC-MS normalization/cmQTL_val2_class_isnorm_PCA.jpg differ
diff --git a/runs/GC-MS normalization/executing.R b/runs/GC-MS normalization/executing.R
new file mode 100644
index 0000000000000000000000000000000000000000..bfe2ec36bf1597b8fe35ed02257ac3f43b94a1d6
--- /dev/null
+++ b/runs/GC-MS normalization/executing.R	
@@ -0,0 +1,1118 @@
+rm(list = ls())
+
+library(openxlsx)
+library(tidyverse)
+library(car)
+library(pheatmap)
+library(broom)
+library(ggpubr)
+library(viridisLite)
+library(modelr)
+#library(dlookr)
+#library(imputeLCMD)
+library(ggrepel)
+
+
+# Directory setting -------------------------------------------------------
+
+here::i_am("workflows/GC_MS_normalization/210927_primary_normalization_with_split.R")
+library(here)
+
+out <- here("runs/GC-MS normalization")
+
+if (file.exists(out)) {
+  cat("The folder already exists")
+} else {
+  dir.create(out)
+}
+setwd(here())# Not recommended but convenient in Rstudio to start from root
+
+# Data loading ------------------------------------------------------------
+
+load(here("Temp.RData"))
+
+sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx"))
+isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1)
+isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2)
+isa_ms <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 3)
+#genotypes <- readxl::read_xlsx("Genotype_names.xlsx") %>%
+#  mutate(plantline = as_factor(plantline))
+
+#GC_machine_nums <- readxl::read_xlsx("200923_samplelist_WIJESI-030820-13_cmQTL_validation.xlsx", sheet = 6)
+
+take_split <- c("fructose_307_217_rt9.48", "glucose_160_319_rt9.68","glucose_160_rt9.81", "glutamic_acid_246_363_rt8.31",
+                "glutamine_156_245_rt9.80", "malic_acid_233_245_rt7.22", "shikimic_acid_204_462_rt9.57", "shikimic_acid_204_462_rt9.57",
+                "pyroglutamic_acid_156_258_rt8.30", "sucrose_437_361_rt13.77", "sucrose2_204_361_rt13.79", "citric_acid_273_375_rt9.72",
+                "arginine_157_256_rt9.92")
+
+exclude_samples <- c("21106rA_31", "21107rA_54", "21109rA_59", "21109rA_86", "21109rA_78")
+exclude_mets <- c("psicose_103_217_rt9.38", "glutamic_acid_246_363_rt8.31", "lactic_acid_117_219_rt3.07")#glu wrong peak
+
+area1 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_seq_file_20210914143103_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
+area2 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_fruits_split_seq_file_20210914164507_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
+area3 <- readxl::read_xls(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/210914_cmQTL_val_1_2_leaves_seq_file_20210914125126_comp_file_area_rt1.bkt.xls"), na = c("", "N/A"))
+
+metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx"))
+  
+area <- area1 %>% 
+  bind_rows(area2, area3) %>% 
+  select(component, area, machine_num_GC = machine_num,rt) %>% 
+  mutate(area = as.numeric(area),
+         rt = as.numeric(rt))
+
+rt_mean <- area %>%
+  group_by(component) %>% 
+  summarise(RT_mean = mean(rt, na.rm = T))
+
+metdat_GC_class <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/MAF_GC_MS.xlsx")) %>% 
+  select(component = Xcal_name_xreport, Compound_Name = PubChem_Name_mapped)%>% 
+  left_join(rt_mean) %>% 
+  filter(!is.na(component)) %>% 
+  arrange(Compound_Name, RT_mean) %>% 
+  group_by(Compound_Name)  %>% 
+  mutate(peak_no = rank(RT_mean),
+         Compound_Name = if_else(duplicated(Compound_Name),
+                                 str_c(Compound_Name, "peak", peak_no, sep = "_"),
+                                 Compound_Name))
+
+# Data combination --------------------------------------------------------
+
+sam_vars <- c("plantline", "alias", "LIMS_ID",
+              "treatment", "tissue", "batch_GC", "run_date_GC",
+              "extraction_num", "sample_num", "machine_num_GC",
+              "class", "run_num_GC", "sample_weight", "exp", "genotype")
+
+isa_study_tidy <- sam_dat1 %>% 
+  mutate(sample_weight = as.double(str_extract(`Parameter [Sample Weight]`, "\\d{0,3}\\.*\\d{0,2}"))) %>% 
+  rename(source_name = `Source Name`,
+         #plantline = `Characteristic [plantline]`,
+         alias = `Characteristic [alias]`,
+         LIMS_ID = `Characteristic [LIMS aliquot]`,
+         treatment = `Factor [Irrigation factor]`,
+         tissue = `Parameter [organism part]`,
+         genotype = `Characteristic [genotype]`,
+         #sample_num = `Characteristic [sample_name_non_unique]`,
+         #extraction_num = `Characteristic [extract number]`,
+         sample_name = `Sample Name`) %>% 
+  select(source_name, any_of(sam_vars), sample_name)
+
+isa_ext_tidy <- isa_ext %>% 
+  rename(source_name =`Source Name`,
+         exp = `Characteristic [experiment name]`,
+         sample_name = `Sample Name`) %>% 
+  select(source_name, any_of(sam_vars), sample_name)
+
+isa_gc_tidy <- isa_gc %>% 
+  rename(source_name =`Source Name`,
+         class = `Characteristic [sample type]`,
+         batch_GC = `Parameter [Batch]`,
+         run_date_GC = `Parameter [run date]`,
+         #daily_num = `Parameter [daily number]`,
+         run_num_GC = `Parameter [Number]`,
+         sample_name = `Sample Name`) %>% 
+  select(source_name, any_of(sam_vars), sample_name)
+
+isa_ms_tidy <- isa_ms %>% 
+  rename(source_name =`Source Name`,
+         sample_name = `Sample Name`) %>% 
+  mutate(machine_num_GC = sample_name) %>% 
+  select(source_name, any_of(sam_vars), sample_name)
+
+isa_tidy <- isa_study_tidy %>% 
+  full_join(isa_ext_tidy, by = c("sample_name" = "source_name"), keep = T, suffix = c("_study", "_ext")) %>% 
+  full_join(isa_gc_tidy, by = c("sample_name_ext" = "source_name")) %>% 
+  full_join(isa_ms_tidy, by = c("sample_name_ext" = "source_name", "sample_name" = "sample_name"))
+
+batch_exp_tissue <- isa_tidy %>% 
+  distinct(batch_GC, exp, tissue) %>% 
+  filter(!is.na(exp)) %>% 
+  rename(exp_fill = exp,
+         tissue_fill = tissue)
+
+sam_dat <- isa_tidy %>% 
+  arrange(run_num_GC) %>% 
+  #fill(c(tissue, exp), .direction = "updown") %>% 
+  arrange(run_num_GC) %>% 
+  group_by(batch_GC) %>% 
+  mutate(daily_num = row_number()) %>% 
+  ungroup() %>% 
+  left_join(batch_exp_tissue) %>% 
+  mutate(tissue = tissue_fill,
+         exp = exp_fill) %>% 
+  select(-c(tissue_fill, exp_fill))
+
+area_long <- area %>% 
+  filter(!is.na(component)) %>% 
+  left_join(sam_dat) %>% 
+  filter(!is.na(exp))
+
+area <- area_long %>% 
+  left_join(metdat_GC_class) %>% 
+  filter(!str_detect(component, "^\\!|FAME|component|empty"), !is.na(component),
+         !component %in% exclude_mets)
+
+met_dat = area %>% 
+  distinct(RT_mean, Compound_Name,  component) %>% 
+  mutate(met = str_c("m", row_number(), sep = "_"))
+
+area <- area %>% 
+  left_join(met_dat) %>% 
+  mutate(area = as.numeric(area))
+
+# PCA to justify filtering ------------------------------------------------
+
+features_pca <- area %>% 
+  group_by(met) %>% 
+  mutate(imp = if_else(is.na(area), 0.5*min(area, na.rm = T),area),
+         batch_pca = str_extract(batch_GC, "\\d+")) %>% 
+  ungroup() 
+
+pca_base <- features_pca %>%
+  ungroup() %>% 
+  filter(exp == 1) %>% 
+  arrange(run_num_GC) %>%
+  mutate(log_imp = log2(imp)) %>% 
+  pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC, run_num_GC),
+              names_from = met,
+              values_from = log_imp) %>% 
+  filter(!is.na(m_1))
+
+pca <- summary(prcomp(
+  pca_base%>%
+    select(starts_with("m_"))))
+
+sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
+
+pca_plot <- as_tibble(pca$x) %>% 
+  mutate(join_num = 1:nrow(pca$x)) %>% 
+  left_join(pca_base %>% 
+              mutate(join_num=1:nrow(pca$x))) %>%
+  #left_join(sam_dat) %>% 
+  mutate(batch_GC= as_factor(batch_GC),
+         class= as_factor(class)) %>% 
+  select(all_of(sam_vars), everything())
+
+exp_var <- as_tibble(pca[["importance"]])
+
+
+pca_plot %>% 
+  #  filter(treatment == "HL") %>% 
+  ggplot()+
+  geom_jitter(aes(x=PC1, y=PC2,  color = batch_GC)) +
+  stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) +
+  geom_text_repel(aes(x = PC1, y = PC2,
+                label = if_else(machine_num_GC == "21106rA_31"| machine_num_GC == "21107rA_54", machine_num_GC, "")), box.padding = 1) +
+  ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+  xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+
+ggsave(here(out,
+             "cmQTL_val1_treatment_loess_outlier_exp1_PCA.jpg"),
+       width = 183,
+       height = 100,
+       units = "mm",
+       dpi = 300)
+
+
+pca_base <- features_pca %>%
+  ungroup() %>% 
+  filter(exp == 2) %>% 
+  arrange(batch_GC, daily_num) %>%
+  mutate(log_imp = log2(imp)) %>% 
+  pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC),
+              names_from = met,
+              values_from = log_imp) %>% 
+  filter(!is.na(m_1))
+
+pca <- summary(prcomp(
+  pca_base%>%
+    select(starts_with("m_"))))
+
+sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
+
+pca_plot <- as_tibble(pca$x) %>% 
+  mutate(join_num = 1:nrow(pca$x)) %>% 
+  left_join(pca_base %>% 
+              arrange(batch_GC, daily_num) %>% 
+              mutate(join_num=1:nrow(pca$x))) %>%
+  #left_join(sam_dat) %>% 
+  mutate(batch_GC= as_factor(batch_GC),
+         class= as_factor(class)) %>% 
+  select(all_of(sam_vars), everything())
+
+exp_var <- as_tibble(pca[["importance"]])
+
+pca_plot %>% 
+  #  filter(treatment == "HL") %>% 
+  ggplot()+
+  geom_jitter(aes(x=PC1, y=PC2,  color = class)) +
+  stat_ellipse(aes(x=PC1, y=PC2, color = class)) +
+  geom_label_repel(aes(x = PC1, y = PC2,
+                label = if_else(machine_num_GC %in% exclude_samples, machine_num_GC, "")),force_pull = -0.1,
+                max.overlaps = Inf) +
+  ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+  xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+
+ggsave(here(out,
+             "cmQTL_val1_treatment_loess_outlier_exp2_imp_PCA.jpg"
+             ),
+       width = 183,
+       height = 100,
+       units = "mm",
+       dpi = 300)
+
+
+# Data filtering ----------------------------------------------------------
+
+
+set_back_NA <- function(x){
+  for (i in seq_along(x)) {
+    if (x[[i]] == 0) {
+      x[[i]] <- NA
+    } else {
+      x[[i]] <- x[[i]]
+    }
+  }
+  x
+}
+
+area_long <- area %>% 
+  filter(!machine_num_GC %in% exclude_samples)
+
+
+# Imputation --------------------------------------------------------------
+
+features_na <- area_long #%>% 
+  #group_by(met) %>% 
+  #mutate(area = set_back_NA(area)) %>% 
+  #ungroup()
+
+missingness <- features_na %>% 
+  group_by(exp, met, tissue, treatment) %>% 
+  summarise(na = sum(is.na(area)),
+            n = n()) %>% 
+  mutate(percent_na = na/n*100) %>% 
+  ungroup()
+
+features <- features_na %>% 
+  left_join(missingness) %>% 
+  group_by(met) %>% 
+  #mutate(area = ifelse(percent_na >= 0 & is.na(area), rnorm(n = 1, mean = 0.5*min(area, na.rm = T)), area)) %>% 
+  ungroup()
+
+
+features_filtered <- area_long %>% 
+  group_by(met) %>% 
+  mutate(imp = if_else(is.na(area), 0.5*min(area, na.rm = T),area),
+         batch_pca = str_extract(batch_GC, "\\d+")) %>% 
+  ungroup() %>% 
+  left_join(missingness) #%>%
+  #filter(percent_na <= 60)
+
+
+# Internal standard check -------------------------------------------------
+
+features_filtered %>% 
+  filter(str_detect(component, "ribitol")) %>% 
+  mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)),
+         is_miss = as_factor(if_else(is.na(area), T, F))) %>% 
+  filter(class!="blank sample") %>% 
+  ggplot(aes(x=run_num_GC, y = log2(imp))) +
+  geom_point(aes(color = class, shape = is_miss)) +
+  geom_vline(aes(xintercept = xint)) +
+  facet_grid(rows = vars(component))
+
+# Internal Standard Normalization -----------------------------------------
+
+isvec <- features_filtered %>% 
+  filter(component == "ribitol_217_rt8.20") %>% 
+  select(machine_num_GC, is = imp)
+
+features <- features_filtered %>% 
+  full_join(isvec) %>% 
+  mutate(isnorm = imp/is) %>% 
+  filter(str_detect(Compound_Name, "ribitol", negate = T)) %>% 
+  mutate(sample_weight = if_else(sample_weight == 0|is.na(sample_weight), 50, sample_weight))
+
+
+# Fit linear model on QCs -------------------------------------------------
+
+by_batch_GC <- features %>% 
+  filter(class=="Pooled Sample") %>% 
+  group_by(batch_GC, met) %>% # need to change treatment_batch_GC_corr
+  nest()
+
+feature_model <- function(df) {
+  lm(isnorm ~ daily_num, data = df)
+}
+
+
+by_batch_GC <- by_batch_GC %>% 
+  mutate(model = map(data, feature_model),
+         predicts = map2(data, model, add_predictions),
+         coefficients = map(model, tidy),
+         aug = map(model, augment),
+         glance = map (model, glance))
+
+coefs_aug <- unnest(by_batch_GC, aug) %>% 
+  select(batch_GC, met, .resid, daily_num)
+
+coefs_glance <- unnest(by_batch_GC,glance) %>% 
+  select(batch_GC, met, adj.r.squared)
+
+coefs_term <- unnest(by_batch_GC, coefficients) %>% 
+  pivot_wider(id_cols = c(batch_GC, met),
+              names_from = "term",
+              values_from = estimate) %>% 
+  rename(x = daily_num,
+         intercept = `(Intercept)`)
+
+
+coefs_pvalue <- by_batch_GC %>% 
+  select(batch_GC, met, coefficients) %>%
+  ungroup() %>% 
+  unnest(coefficients) %>% 
+  pivot_wider(id_cols = c(batch_GC, met),
+              names_from = "term",
+              values_from = p.value) %>% 
+  rename(intercept_pval = `(Intercept)`,
+         daily_num_pval = daily_num) %>% 
+  full_join(features) %>% 
+  full_join(coefs_term) %>% 
+  #left_join(coefs_aug) %>% 
+  full_join(coefs_glance) 
+
+median <- features %>% 
+  filter(class=="Pooled Sample") %>% 
+  group_by(batch_GC, met) %>% 
+  summarise(median = median (isnorm))
+
+features_lin <- unnest(by_batch_GC, predicts) %>% 
+  full_join(coefs_pvalue) %>% 
+  full_join(median) %>% 
+  mutate(pred = intercept + x*daily_num,
+         predp = ifelse(daily_num_pval <=0.05 & adj.r.squared >= 0.75, pred, median),
+         predp = ifelse(is.na(predp), median, predp)) %>%
+  select(- c("data", "model", "coefficients")) 
+
+adjust_lin <- features_lin %>% 
+  ungroup() %>% 
+  group_by(met) %>% 
+  summarise(lin_offset = if_else(any(predp <0), 1.001*abs(min(predp)),0))
+
+features_lin <- features_lin %>%
+  ungroup() %>% 
+  full_join(adjust_lin) %>%
+  full_join(features) %>% 
+  mutate(linnorm = (isnorm+lin_offset)/(predp+lin_offset),
+         linnorm_fw = linnorm/sample_weight,
+         linnorm_fw_log = log2(linnorm_fw))
+
+
+
+
+#Fit loess-model for QC-RLSC batch_GCwise####
+rejoin <- features
+features <- rejoin
+
+QC_loess <- features %>% 
+  ungroup() %>% 
+  filter(class == "Pooled Sample") %>%
+  #filter(met!="m_44") %>% 
+  group_by(met,  batch_GC) %>% 
+  nest()
+#revert span back to 1.5 later
+loess_model <- function (df) {
+  loess(isnorm ~ daily_num, span = 1.5, data = df,control = loess.control(surface = "direct"))
+}
+
+start <- Sys.time()
+QC_loess <- QC_loess %>% 
+  mutate(model = map (data, loess_model),
+         daily_num = list(seq(1,101,1)),
+         predict = map2(model,daily_num,stats::predict))
+end <- Sys.time()
+end-start
+
+features_loess <- QC_loess %>%
+  unnest(c(predict, daily_num, batch_GC)) %>% 
+  select(-model, -data) %>% 
+  full_join(rejoin) %>% 
+  filter(!is.na(machine_num_GC))
+
+adjust <- features_loess %>% 
+  ungroup() %>% 
+  group_by(met) %>% 
+  summarise(offset = if_else (any(predict <0), 1.001*abs(min(predict, na.rm = T)),0))
+
+features_loess <- features_loess %>% 
+  left_join(adjust) %>% 
+  mutate(loess_norm = (isnorm+offset)/(predict+offset),
+         loess_norm_fw = loess_norm/sample_weight,
+         loess_norm_fw_log = log2(loess_norm_fw))%>% 
+  ungroup()
+
+features_all <- features_lin %>% 
+  left_join(features_loess) %>% 
+  mutate(batch_pca = str_extract(batch_GC, "\\d+"))
+
+
+# Relative log abundance plots --------------------------------------------
+
+
+
+
+features_all %>% 
+  filter(!is.na(genotype)) %>% 
+  group_by(met,  treatment) %>% 
+  mutate(sub = loess_norm_fw,
+         rla = log2(sub) - median(log2(sub))) %>% 
+  ggplot(aes(x = met, y = rla)) +
+  geom_boxplot() +
+  facet_grid(rows = vars(treatment), scales = "free")+
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+features_all %>% 
+  #filter(!is.na(taxa)) %>% 
+  filter(treatment == 1, class == "Sample") %>% 
+  mutate(run_num_GC = as_factor(run_num_GC),
+         batch_GC = as_factor(batch_GC)) %>% 
+  group_by(met,  treatment) %>% 
+  mutate(rla_imp = log2(imp) - median(log2(imp)),
+         rla_isnorm = log2(isnorm) - median(log2(isnorm)),
+         rla_loess_norm = log2(loess_norm_fw) - median(log2(loess_norm_fw))) %>%
+  pivot_longer(starts_with("rla"), names_to = "normalization", values_to = "rla") %>% 
+  ggplot(aes(x = met, y = rla)) +
+  geom_boxplot() +
+  facet_grid(rows = vars(normalization), cols = vars(treatment), scales = "free")+
+  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+  ylim(c(-2,2))
+
+ggsave(here(out,
+             "cmQTL_val1_2_GC_met_rla_plot.jpg"
+             ))
+
+features_all %>% 
+  #filter(!is.na(taxa)) %>% 
+  filter(class == "Sample") %>% 
+  mutate(run_num_GC = as_factor(run_num_GC),
+         batch_GC = as_factor(batch_GC)) %>% 
+  group_by(met,  treatment) %>% 
+  mutate(rla_imp = log2(imp) - median(log2(imp)),
+         rla_isnorm = log2(isnorm) - median(log2(isnorm)),
+         rla_loess_norm = log2(loess_norm_fw) - median(log2(loess_norm_fw))) %>%
+  pivot_longer(starts_with("rla"), names_to = "normalization", values_to = "rla") %>% 
+  ggplot(aes(x = run_num_GC, y = rla, color = batch_GC)) +
+  geom_boxplot() +
+  facet_grid(rows = vars(normalization), scales = "free")+
+  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+  ylim(c(-1,1))
+
+ggsave(here(out,
+             "cmQTL_val1_2_samples_rla_plot.jpg"
+             ))
+
+
+# RSD estimation ----------------------------------------------------------
+
+RSD_bio_reps <- features_all %>% 
+  filter(class == "sample") %>% 
+  #filter(!is.na(taxa)) %>% 
+  group_by(met) %>% 
+  #filter((any(qc_imputed) == T)==F) %>% 
+  group_by(met, treatment, genotype, tissue, exp) %>% 
+  summarise(RSD_imp = sd(imp)/mean(imp),
+            RSD_isnorm = sd(isnorm) / mean(isnorm),
+            RSD_linnorm = abs(sd (linnorm_fw)/ mean(linnorm_fw)),
+            RSD_loess_norm = abs(sd(loess_norm_fw) / mean(loess_norm_fw)))
+            
+            
+RSD_bio_reps_mean <- RSD_bio_reps %>% 
+  #mutate(RSD_diff = RSD_imp-RSD_loess) %>% 
+  ungroup() %>% 
+  #group_by(treatment, tissue, exp) %>% 
+  summarise(#mean_RSD_loess = mean(RSD_loess),
+    mean_imp = mean(na.omit(RSD_imp)),
+    mean_loess_norm = mean(na.omit(RSD_loess_norm)),
+    mean_RSD_isnorm = mean(na.omit(RSD_isnorm )),
+    mean_linnorm = mean(na.omit(RSD_linnorm)))
+
+RSD_qcs <- features_all %>% 
+  filter(class == "Pooled Sample") %>% 
+  group_by(met) %>% 
+  #filter((any(qc_imputed) == T)==F) %>% 
+  group_by(met, exp) %>% 
+  summarise(RSD_imp = sd(imp)/mean(imp),
+            RSD_isnorm = sd(isnorm) / mean(isnorm),
+            RSD_linnorm = abs(sd (linnorm_fw)/ mean(linnorm_fw)),
+            RSD_loess_norm = abs(sd(loess_norm_fw) / mean(loess_norm_fw)))
+
+RSD_qcs_mean <- RSD_qcs %>% 
+  #mutate(RSD_diff = RSD_imp-RSD_loess) %>% 
+  ungroup() %>% 
+  group_by(exp) %>% 
+  summarise(#mean_RSD_loess = mean(RSD_loess),
+    mean_imp = mean(na.omit(RSD_imp)),
+    mean_loess_norm = mean(na.omit(RSD_loess_norm)),
+    mean_RSD_isnorm = mean(na.omit(RSD_isnorm )),
+    mean_linnorm = mean(na.omit(RSD_linnorm)))
+
+
+
+#PCA testing chunk####
+##https://www.intechopen.com/books/metabolomics-fundamentals-and-applications/processing-and-visualization-of-metabolomics-data-using-r
+#exp1
+pca_base <- features_all %>%
+  ungroup() %>% 
+  filter(exp == 1) %>% 
+  arrange(batch_pca, daily_num) %>%
+  mutate(log_imp = log2(imp)) %>% 
+  pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC),
+              names_from = met,
+              values_from = linnorm_fw_log) %>% 
+  filter(!is.na(m_1))
+
+pca <- summary(prcomp(
+  pca_base%>%
+    select(starts_with("m_"))))
+
+sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
+
+pca_plot <- as_tibble(pca$x) %>% 
+  mutate(join_num = 1:nrow(pca$x)) %>% 
+  left_join(pca_base %>% 
+              mutate(join_num=1:nrow(pca$x))) %>%
+  #left_join(sam_dat) %>% 
+  mutate(batch_GC= as_factor(batch_GC),
+         class= as_factor(class)) %>% 
+  select(all_of(sam_vars), everything())
+
+exp_var <- as_tibble(pca[["importance"]])
+
+pca_plot %>% 
+  #  filter(treatment == "HL") %>% 
+  ggplot()+
+  geom_jitter(aes(x=PC1, y=PC2,  color = tissue)) +
+  stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) +
+  ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+  xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+
+ggsave(here(out,
+             "cmQTL_val1_tissue_lin_PCA.jpg"
+             ),
+       width = 183,
+       height = 100,
+       units = "mm",
+       dpi = 300)
+
+pca_plot %>% 
+  #  filter(treatment == "HL") %>% 
+  ggplot()+
+  geom_jitter(aes(x=PC1, y=PC2,  color = treatment)) +
+  stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) +
+  ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+  xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+
+ggsave(here(out,
+             "cmQTL_val1_treatment_PCA.jpg"
+             ),
+       width = 183,
+       height = 100,
+       units = "mm",
+       dpi = 300)
+
+
+#PCA after loess normalization
+
+pca_base <- features_all %>%
+  ungroup() %>% 
+  filter(exp == 1) %>% 
+  arrange(batch_pca, daily_num) %>%
+  mutate(log_imp = log2(imp)) %>% 
+  pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC),
+              names_from = met,
+              values_from = loess_norm_fw_log) %>% 
+  filter(!is.na(m_1))
+
+pca <- summary(prcomp(
+  pca_base%>%
+    select(starts_with("m_"))))
+
+sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
+
+pca_plot <- as_tibble(pca$x) %>% 
+  mutate(join_num = 1:nrow(pca$x)) %>% 
+  full_join(pca_base %>% 
+              mutate(join_num=1:nrow(pca$x))) %>%
+  #left_join(sam_dat) %>% 
+  mutate(batch_pca= as_factor(batch_GC),
+         class= as_factor(class)) %>% 
+  select(all_of(sam_vars), everything())
+
+exp_var <- as_tibble(pca[["importance"]])
+
+pca_plot %>% 
+  #  filter(treatment == "HL") %>% 
+  ggplot()+
+  geom_jitter(aes(x=PC1, y=PC2,  color = tissue)) +
+  stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) +
+  ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+  xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+
+ggsave(here(out,
+             "cmQTL_val1_tissue_loess_PCA.jpg"
+             ),
+       width = 183,
+       height = 100,
+       units = "mm",
+       dpi = 300)
+
+pca_plot %>% 
+  #  filter(treatment == "HL") %>% 
+  ggplot()+
+  geom_jitter(aes(x=PC1, y=PC2,  color = treatment)) +
+  stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) +
+  ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+  xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+
+ggsave(here(out,
+             "cmQTL_val1_treatment_loess_PCA.jpg"
+             ),
+       width = 183,
+       height = 100,
+       units = "mm",
+       dpi = 300)
+
+pca_plot %>% 
+  #  filter(treatment == "HL") %>% 
+  ggplot()+
+  geom_jitter(aes(x=PC1, y=PC2,  color = class)) +
+  stat_ellipse(aes(x=PC1, y=PC2, color = class)) +
+  ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+  xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+
+
+
+#PCA after loess normalization exp2
+
+pca_base <- features_all %>%
+  ungroup() %>% 
+  filter(exp == 2) %>% 
+  arrange(batch_pca, daily_num) %>%
+  mutate(log_imp = log2(isnorm)) %>% 
+  pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC),
+              names_from = met,
+              values_from = log_imp)
+
+pca <- summary(prcomp(
+  pca_base%>%
+    select(starts_with("m_"))))
+
+sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
+
+pca_plot <- as_tibble(pca$x) %>% 
+  mutate(join_num = 1:nrow(pca$x)) %>% 
+  full_join(pca_base %>% 
+              mutate(join_num=1:nrow(pca$x))) %>%
+  #left_join(sam_dat) %>% 
+  mutate(batch_pca= as_factor(batch_GC),
+         class= as_factor(class)) %>% 
+  select(all_of(sam_vars), everything())
+
+exp_var <- as_tibble(pca[["importance"]])
+
+pca_plot %>% 
+  #  filter(treatment == "HL") %>% 
+  ggplot()+
+  geom_jitter(aes(x=PC1, y=PC2,  color = class)) +
+  stat_ellipse(aes(x=PC1, y=PC2, color = class)) +
+  ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+  xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+
+pca_plot %>% 
+  #  filter(treatment == "HL") %>% 
+  ggplot()+
+  geom_jitter(aes(x=PC1, y=PC2,  color = batch_GC)) +
+  stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) +
+  geom_text(aes(x = PC1, y = PC2,
+                label = if_else(machine_num_GC %in% exclude_samples, machine_num_GC, ""))) +
+  ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+  xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+
+ggsave(here(out,
+             "cmQTL_val1_treatment_loess_outlier_exp2_loess_norm_PCA.jpg"
+             ),
+       width = 183,
+       height = 100,
+       units = "mm",
+       dpi = 300)
+
+
+# Per tissue loess pca ----------------------------------------------------
+
+
+tissues <- c("leaf", "fruit")
+for(tiss in seq_along(tissues)){
+  
+  pca_base <- features_all %>%
+    ungroup() %>% 
+    filter(exp == 1, tissue == tissues[[tiss]]) %>% 
+    arrange(batch_pca, daily_num) %>%
+    mutate(log_imp = log2(imp)) %>% 
+    pivot_wider(id_cols = c(tissue, treatment,batch_GC, daily_num, class, machine_num_GC),
+                names_from = met,
+                values_from = loess_norm_fw_log) %>% 
+    filter(!is.na(m_1))
+  
+  pca <- summary(prcomp(
+    pca_base%>%
+      select(starts_with("m_"))))
+  
+  sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
+  
+  pca_plot <- as_tibble(pca$x) %>% 
+    mutate(join_num = 1:nrow(pca$x)) %>% 
+    full_join(pca_base %>% 
+                mutate(join_num=1:nrow(pca$x))) %>%
+    left_join(sam_dat) %>% 
+    mutate(batch_pca= as_factor(batch_GC),
+           class= as_factor(class)) %>% 
+    select(all_of(sam_vars), everything())
+  
+  exp_var <- as_tibble(pca[["importance"]])
+  
+  pca_plot %>% 
+    #  filter(treatment == "HL") %>% 
+    ggplot()+
+    geom_jitter(aes(x=PC1, y=PC2,  color = treatment)) +
+    stat_ellipse(aes(x=PC1, y=PC2, color = treatment)) +
+    ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+    xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+  
+  ggsave(here(out,
+               "cmQTL_val1", tissues[[tiss]], "treatment_loess_PCA.jpg"
+               ),
+         width = 183,
+         height = 100,
+         units = "mm",
+         dpi = 300)
+}
+
+
+# PCA before normalization
+
+pca_base <- features_all %>%
+  ungroup() %>% 
+  filter(exp == 1) %>% 
+  arrange(batch_pca, daily_num) %>%
+  mutate(log_imp = log2(imp)) %>% 
+  pivot_wider(id_cols = c(tissue, treatment, batch_pca, daily_num, class, machine_num_GC),
+              names_from = met,
+              values_from = log_imp)%>% 
+  filter(!is.na(m_1))
+
+pca <- summary(prcomp(
+  pca_base%>%
+    select(starts_with("m_"))))
+
+sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
+
+pca_plot <- as_tibble(pca$x) %>% 
+  mutate(join_num = 1:nrow(pca$x)) %>% 
+  full_join(pca_base %>% 
+              mutate(join_num=1:nrow(pca$x))) %>%
+  left_join(sam_dat) %>% 
+  mutate(batch_pca= as_factor(batch_pca),
+         class= as_factor(class)) %>% 
+  select(all_of(sam_vars), everything())
+
+exp_var <- as_tibble(pca[["importance"]])
+
+
+pca_plot %>% 
+  #  filter(treatment == "HL") %>% 
+  ggplot()+
+  geom_jitter(aes(x=PC1, y=PC2,  color = tissue)) +
+  stat_ellipse(aes(x=PC1, y=PC2, color = tissue)) +
+  geom_text(aes(x = PC1, y = PC2,
+                label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, "")),
+            nudge_y = -2,
+            nudge_x = 4) +
+  ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+  xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+
+ggsave(here(out,
+             "cmQTL_val1_tissue_imp_PCA.jpg"
+             ),
+       width = 183,
+       height = 100,
+       units = "mm",
+       dpi = 300)
+
+pca_plot %>% 
+  #  filter(treatment == "HL") %>% 
+  ggplot()+
+  geom_jitter(aes(x=PC1, y=PC2,  color = batch_GC)) +
+  stat_ellipse(aes(x=PC1, y=PC2, color = batch_GC)) +
+  geom_text(aes(x = PC1, y = PC2,
+                label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, "")),
+            nudge_y = -2,
+            nudge_x = 4) +
+  ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+  xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+
+ggsave(here(out,
+             "cmQTL_val1_batch_imp_PCA.jpg"
+             ),
+       width = 183,
+       height = 100,
+       units = "mm",
+       dpi = 300)
+
+
+pca_plot %>% 
+  #  filter(treatment == "HL") %>% 
+  ggplot()+
+  geom_jitter(aes(x=PC1, y=PC2,  color = class)) +
+  stat_ellipse(aes(x=PC1, y=PC2, color = class)) +
+  geom_text(aes(x = PC1, y = PC2,
+                label = if_else(machine_num_GC == "21106rA_31", machine_num_GC, "")),
+            nudge_y = -2,
+            nudge_x = 4) +
+  ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+  xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+
+ggsave(here(out,
+             "cmQTL_val1_class_imp_PCA.jpg"
+             ),
+       width = 183,
+       height = 100,
+       units = "mm",
+       dpi = 300)
+
+#cmQTLval2
+pca_base <- features_all %>%
+  ungroup() %>% 
+  filter(exp == 2, batch_GC == "3_split") %>% 
+  arrange(batch_GC, daily_num) %>%
+  mutate(log_imp = log2(isnorm)) %>% 
+  pivot_wider(id_cols = c(tissue, treatment, batch_GC, daily_num, class, machine_num_GC),
+              names_from = met,
+              values_from = log_imp)%>% 
+  filter(!is.na(m_1))
+
+pca <- summary(prcomp(
+  pca_base%>%
+    select(starts_with("m_"))))
+
+sam_vars <- pca_base %>% select(-starts_with("m_")) %>% colnames()
+
+pca_plot <- as_tibble(pca$x) %>% 
+  mutate(join_num = 1:nrow(pca$x)) %>% 
+  full_join(pca_base %>% 
+              mutate(join_num=1:nrow(pca$x))) %>%
+  left_join(sam_dat) %>% 
+  mutate(batch_pca= as_factor(batch_GC),
+         class= as_factor(class)) %>% 
+  select(all_of(sam_vars), everything())
+
+exp_var <- as_tibble(pca[["importance"]])
+
+
+pca_plot %>% 
+  #  filter(treatment == "HL") %>% 
+  ggplot()+
+  geom_jitter(aes(x=PC1, y=PC2,  color = class)) +
+  stat_ellipse(aes(x=PC1, y=PC2, color = class)) +
+  geom_text(aes(x = PC1, y = PC2,
+                label = if_else(machine_num_GC == "21109rA_59", machine_num_GC, "")),
+            nudge_y = -2,
+            nudge_x = 4) +
+  ylab(str_c("PC2 ", "(", exp_var$PC2[[2]]*100, "%)")) +
+  xlab(str_c("PC1 ", "(", exp_var$PC1[[2]]*100, "%)"))
+
+ggsave(here(out,
+             "cmQTL_val2_class_isnorm_PCA.jpg"
+             ),
+       width = 183,
+       height = 100,
+       units = "mm",
+       dpi = 300)
+
+
+# Plot all metabolites ----------------------------------------------------
+
+plotmets <- features_all %>% distinct(met) %>% as_vector
+plotmet_labs <- plotmets %>% as_tibble() %>% 
+  left_join(met_dat, by= c("value" = "met")) %>% 
+  #mutate(peak_num = base::rank(HMDB_clear_name, ties.method = "first"),
+  #       dup = sum(peak_num),
+  #       HMDB_clear_name_unique = if_else(dup>1, str_c(HMDB_clear_name, peak_num), HMDB_clear_name)) %>% 
+  #ungroup() %>% 
+  select(Compound_Name) %>% as_vector()
+
+plot_out <- vector("list", length = length(plotmets))
+
+for (meta in seq_along(plotmets)) {
+  
+  plot_out [[meta]] <- features_all %>% 
+    mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)),
+           is_miss = as_factor(if_else(is.na(area), T, F))) %>% 
+    filter(met == plotmets[[meta]]) %>%
+    filter(class!="blank") %>% 
+    ggplot(aes(x=run_num_GC, y=isnorm)) +
+    geom_point(aes(color = class, shape = is_miss)) +
+    geom_point(aes(y=predict), color="black", size=0.1) +
+    geom_vline(aes(xintercept = xint))+
+   # facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") +
+    ggtitle(label = plotmet_labs[[meta]])
+
+}
+
+pdf(file = here(out,"_loess_fit.pdf"))
+
+
+for (meta in seq_along(plotmets)) {
+  print(plot_out[[meta]])
+}
+
+dev.off()
+
+plot_out <- vector("list", length = length(plotmets))
+
+for (meta in seq_along(plotmets)) {
+  
+  plot_out [[meta]] <- features_all %>% 
+    mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC))) %>% 
+    filter(met == plotmets[[meta]]) %>%
+    filter(class!="blank") %>% 
+    ggplot(aes(x=run_num_GC, y=loess_norm_fw_log)) +
+    geom_point(aes(color=class)) +
+    #geom_point(aes(y=predp), color="black", size=0.1) +
+    geom_vline(aes(xintercept = xint))+
+    # facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") +
+    ggtitle(label = plotmet_labs[[meta]])
+  
+}
+
+pdf(file = here(out,"_loess_norm_fw_log.pdf"))
+
+
+for (meta in seq_along(plotmets)) {
+  print(plot_out[[meta]])
+}
+
+dev.off()
+
+for (meta in seq_along(plotmets)) {
+  
+  plot_out [[meta]] <- features_all %>% 
+    mutate(is_miss = as_factor(if_else(is.na(area), T, F))) %>% 
+    filter(met == plotmets[[meta]]) %>%
+    filter(class!="blank") %>% 
+    ggplot(aes(x=daily_num, y=isnorm)) +
+    geom_point(aes(color = class, shape = is_miss)) +
+    geom_point(aes(y=predict), color="black", size=0.1) +
+    facet_wrap(facets = vars(batch_GC), nrow = 1, scales = "free") +
+    ggtitle(label = plotmet_labs[[meta]]) +
+    theme(axis.ticks.y = element_blank(),
+          axis.text.y = element_blank(),
+          panel.spacing = unit(-0.1,"lines"))
+  
+}
+
+pdf(file = here(out,"_loess_norm_no_scale.pdf"),
+    height = 3.5)
+
+
+for (meta in seq_along(plotmets)) {
+  print(plot_out[[meta]])
+}
+
+dev.off()
+
+for (meta in seq_along(plotmets)) {
+  
+  plot_out [[meta]] <- features_all %>% 
+    mutate(xint = if_else(daily_num == 4, run_num_GC-3.5, max(run_num_GC)),
+           is_miss = as_factor(if_else(is.na(area), T, F))) %>% 
+    filter(met == plotmets[[meta]]) %>%
+    filter(class!="blank") %>% 
+    ggplot(aes(x=run_num_GC, y=isnorm)) +
+    geom_point(aes(color = class, shape = is_miss)) +
+    geom_point(aes(y=predp), color="black", size=0.1) +
+    geom_vline(aes(xintercept = xint))+
+    # facet_grid(rows = vars(treatment), cols = vars(rep), scales = "free") +
+    ggtitle(label = plotmet_labs[[meta]])
+  
+}
+
+pdf(file = here(out,"_linnorm.pdf"))
+
+for (meta in seq_along(plotmets)) {
+  print(plot_out[[meta]])
+}
+
+dev.off()
+
+rescale <- features_all %>% 
+  group_by(met, tissue, exp) %>% 
+  summarise(rescale = median(imp))
+
+sam_vars <- c(#"plantline","extraction_num", "sample_num", #in case I add them back
+              "alias", "LIMS_ID",
+              "treatment", "tissue", "batch_GC", "run_date_GC",
+               "machine_num_GC",
+              "class", "run_num_GC", "sample_weight", "exp", "genotype")
+
+
+features_out <- features_all %>% 
+  left_join(rescale) %>% 
+  filter(class != "blank",
+         (component %in% take_split & str_detect(batch_GC, "split"))|
+           (!component %in% take_split & str_detect(batch_GC, "split", negate = T))|
+         tissue == "leaves") %>% 
+  group_by(met, tissue, exp) %>% 
+  mutate(loess_norm_med = loess_norm_fw/median(loess_norm_fw),
+         rescaled = loess_norm_med*rescale) %>% 
+  select(all_of(sam_vars), met, Compound_Name,  loess_norm_fw, loess_norm_med, area, rescaled)
+
+#features_out %>% 
+#  filter(met == "m_02177", tissue == "fruits", exp == 1) %>% 
+#  ggplot(aes(x = run_num_GC, y = loess_norm_fw)) +
+#  geom_point()
+
+# Write files -------------------------------------------------------------
+
+
+
+write_csv(features_out, 
+          here(out,
+                "cmQTL_val_1_2_feat_dat_GC.csv"
+                ))
+
+write_csv(met_dat, 
+          here(out,
+                "cmQTL_val_1_2_met_dat_GC.csv"
+                ))
+
+write_csv(sam_dat, 
+          here(out,
+                "cmQTL_val_1_2_sam_dat_GC.csv"
+                ))
+
+write_csv(missingness, 
+          here(out,
+                "cmQTL_val_1_2_miss_GC.csv"
+                ))
+
+# Log used code ------------------------------------------------------------
+
+file_name <- sys.frame(1)$ofile
+
+file.copy(file_name,
+          to = here(out, "executing.R"),overwrite = T)
+
+
+# Temporary .RData file to avoid having to rerun code ---------------------
+
+
diff --git a/workflows/GC_MS_analysis/210903_primary_analysis_cmQTL_val_1_levene_RxNV.R b/workflows/GC_MS_analysis/210903_primary_analysis_cmQTL_val_1_levene_RxNV.R
index 291e54c653db34aefca7a7b2e4fafd9e7c2eba79..e1d3217ecb1f5fd1f79c12eff68941089f27f9fe 100644
--- a/workflows/GC_MS_analysis/210903_primary_analysis_cmQTL_val_1_levene_RxNV.R
+++ b/workflows/GC_MS_analysis/210903_primary_analysis_cmQTL_val_1_levene_RxNV.R
@@ -13,13 +13,14 @@ library(ggpubr)
 library(viridisLite)
 library(ggtext)
 library(glue)
+library(here)
 
 current <- getwd()
 source <- str_c(current,"/..")
 
 cur_date <- str_c(str_replace_all(Sys.Date(),"^.{2}|-",""))
 
-out <- str_c(cur_date, "analysis", sep = "_")
+out <- str_c("GC_MS_analysis", sep = "_")
 
 if (file.exists(out)) {
   cat("The folder already exists")
diff --git a/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R
index fb2c78b6c017b8983876102ce48a6cb21cdeff94..033dfcf79a7ede6c299cd44b58bea53f7afc2648 100644
--- a/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R
+++ b/workflows/GC_MS_normalization/210927_primary_normalization_with_split.R
@@ -29,6 +29,8 @@ setwd(here())# Not recommended but convenient in Rstudio to start from root
 
 # Data loading ------------------------------------------------------------
 
+load(here("./runs/GC-MS normalization/Image.RData"))
+
 sam_dat1 <- readxl::read_xlsx(here("studies/cmQTL_val1_GH_2020/isa.study.xlsx"))
 isa_ext <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 1)
 isa_gc <- readxl::read_xlsx(here("assays/cmQTL_val1_GH_2020_GC_MS/isa.assay.xlsx"), sheet = 2)
@@ -1103,9 +1105,95 @@ write_csv(missingness,
                 "cmQTL_val_1_2_miss_GC.csv"
                 ))
 
+# Data dictionary ---------------------------------------------------------
+
+sam_dat_dic <- colnames(sam_dat) %>%
+  as_tibble()
+
+orig <- colnames(sam_dat1) %>% 
+  as_tibble()
+
+break_lines_old <- function(x){
+  id <- vector(mode = "character", length = length(x))
+  term_source <- vector(mode = "character", length = length(x))
+  term_acc <- vector(mode = "character", length = length(x))
+  for(i in seq_along(x)){
+    if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){
+      id[[i]] <- x[[i]]
+      #if(str_detect(x[[i+1]], "unit")) {
+#        isa_unit[[i]] <- x[[i+1]]
+        term_source[[i]] <- x[[i+2]]
+        term_acc[[i]] <- x[[i+3]]
+  #      } else {
+      term_source[[i]] <- x[[i+1]]
+      term_acc[[i]] <- x[[i+2]]
+#        }
+    } else {
+      id[[i]] <- x[[i]]
+    }
+  }
+  test_tib <- tibble(id = id,
+ #                    unit = isa_unit,
+                     term_source = term_source,
+                     term_acc = term_acc) %>% 
+    filter()
+  assign("output",test_tib, pos = 1)
+}
+
+break_lines(orig$value) #works without unit
+
+break_lines <- function(x, output_name){
+  x <- colnames(x) %>% 
+    as_tibble()
+  x <- x$value
+  group = vector(mode = "list", length = length(x))
+  for(i in seq_along(x)){
+    if(str_detect(x[[i]], "Characteristic|Parameter|Factor|Component")){
+      group[[i]] <- tibble(id = x[[i]])
+      if(str_detect(x[[i+1]], "Unit")) {
+        group[[i]] <- group[[i]] %>% mutate(isa_unit = x[[i+1]])
+        if(str_detect(x[[i+2]], "Term Source REF")) {
+          group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+2]])
+          if(str_detect(x[[i+3]], "Term Accession Number")){
+            group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+3]])
+          }
+          }
+      } else if(str_detect(x[[i+1]], "Term Source REF")) {
+        group[[i]] <- group[[i]] %>% mutate(term_source = x[[i+1]])
+        if(str_detect(x[[i+2]], "Term Accession Number")){
+          group[[i]] <- group[[i]] %>% mutate(term_acc = x[[i+2]])
+    }
+      }
+    } else if(str_detect(x[[i]], "Source REF|Accession Number|Unit",negate = T)){
+      group[[i]] <- tibble(id = x[[i]])
+    }
+  }
+  group_tib <- group %>% 
+    discard(.p = is.null) %>% 
+    reduce(.f = bind_rows) 
+  assign(output_name,group_tib, pos = 1)
+}
+break_lines(sam_dat1, "output_samdat")
+break_lines(isa_ext, "output_ext")
+break_lines(isa_gc, "output_gc")
+break_lines(isa_ms, "output_ms")
+
+output <- bind_rows(output_samdat,
+                    output_ext,
+                    output_gc,
+                    output_ms)
+
+write.xlsx(output,
+           here("assays/cmQTL_val1_GH_2020_GC_MS/dataset/isa.dataset_temp.xlsx"))
+
 # Log used code ------------------------------------------------------------
 
 file_name <- sys.frame(1)$ofile
 
 file.copy(file_name,
-          to = str_c(out_dir, str_remove(file_name, current), "_", str_replace_all(Sys.Date(),"^.{2}|-",""), ".R"),overwrite = T)
+          to = here(out, "executing.R"),overwrite = T)
+
+
+# Temporary .RData file to avoid having to rerun code ---------------------
+
+save.image(here(out, "Image.RData"))