diff --git a/workflows/2022-04-13_FirstIterations/2022-04-13_skim_results.Rmd b/workflows/2022-04-13_FirstIterations/2022-04-13_skim_results.Rmd deleted file mode 100644 index 85bfcc434c9c65dbb1261456cb7e42ff30b89bfe..0000000000000000000000000000000000000000 --- a/workflows/2022-04-13_FirstIterations/2022-04-13_skim_results.Rmd +++ /dev/null @@ -1,342 +0,0 @@ ---- -title: "Untitled" -author: "Dominik" -date: "4/13/2022" -output: - pdf_document: default - html_document: default ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) - -library(knitr) -library(tidyverse) -library(openxlsx) - -``` - - -```{r} -getwd() - -dir("../assays/2022-04-13_proteome_discoverer/dataset/") - -``` - -## Count peptides in fasta - -```{bash} -grep ">" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta | wc -l - -head ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta - -grep "OS=Ricinus communis" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta | wc -l -``` - -## skim excel files - -### quantification.xlsx - -```{r} -quantification <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx") -dim(quantification) -colnames(quantification) -length(unique(quantification$Accession)) - -``` - -### quantification_with_peptides.xlsx - -```{r} -quantification_peptides <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification_with_peptides.xlsx") -dim(quantification_peptides) -colnames(quantification_peptides) -length(unique(quantification_peptides$Accession)) - -## This is a whole new level of untidy. A table nested in a table. I hate it. Thank you, Thermo Fisher Scientific. - -### 1. keep only high confidence rows - -quant_proteins <- filter(quantification_peptides, `Protein FDR Confidence: Combined` == "High") - -### 2. remove empty columns -quant_proteins <- quant_proteins[, colSums(is.na(quant_proteins)) != nrow(quant_proteins)] - -### 3. dummy check, that content is the same... -dim(quant_proteins) == dim(quantification) -sum(as.numeric(unlist(quantification[,6])) == as.numeric(unlist(quant_proteins[,6]))) - - -## pull out peptide data - -peptides <- filter(quantification_peptides, is.na(`Protein FDR Confidence: Combined`)) -nrow(quantification_peptides) - nrow(peptides) - -### 2. remove empty columns -peptides <- peptides[, colSums(is.na(peptides)) != nrow(peptides)] - -### 3. strip between-data headlines -colnames(peptides) = as.character(peptides[1, ]) -peptides <- filter(peptides, Checked != "Checked") - -peptides <- type_convert(peptides) - - -``` - - -## create an isa.run.xlsx like data dictionary - -```{r} - -sum_tmp <- data.frame(summary(quantification)) %>% - pivot_wider(names_from = Var1, values_from = c(Var1, Freq), - values_fn = function(x){paste(na.exclude(x), collapse = "|")}) - -quantification_summary <- cbind.data.frame(sum_tmp[,c(1,3)],t(summarise_all(quantification, class))) -colnames(quantification_summary) <- c("Identifier", "ObjectSummary", "ObjectType") -quantification_summary$TargetFile <- "quantification.xlsx" - - -sum_tmp <- data.frame(summary(peptides)) %>% - pivot_wider(names_from = Var1, values_from = c(Var1, Freq), - values_fn = function(x){paste(na.exclude(x), collapse = "|")}) - -peptides_summary <- cbind.data.frame(sum_tmp[,c(1,3)],t(summarise_all(peptides, class))) - -colnames(peptides_summary) <- c("Identifier", "ObjectSummary", "ObjectType") -peptides_summary$TargetFile <- "quantification_with_peptides.xlsx" - - - -isa_run <- rbind.data.frame(quantification_summary, peptides_summary)[, c(4, 1, 3, 2)] -row.names(isa_run) <- NULL -isa_run$Comment <- "" -isa_run$Definition <- "" - - -write.xlsx(isa_run, file = "../assays/2022-04-13_proteome_discoverer/isa.run.xlsx", overwrite = T, asTable = T) - - -``` - - - - - - -```{r} - - -# extract abundances per accession only - -abundances_grouped <- quantification[, c("Accession", grep("Abundances (Grouped)", colnames(quantification), fixed = T, value = T))] - -# pivot and split column - -abundances_grouped2 <- - abundances_grouped %>% - pivot_longer(!Accession, names_to = c("organelle", "compartment"), values_to = "abundance", names_sep = ', ') - -# remove - -abundances_grouped2$organelle <- gsub("Abundances (Grouped): ", "", abundances_grouped2$organelle, fixed = T) - -# transform columns to factor - -abundances_grouped2$organelle <- as.factor(abundances_grouped2$organelle) -abundances_grouped2$compartment <- as.factor(abundances_grouped2$compartment) - -# pick random accessions -selected_accs <- sample(unique(abundances_grouped2$Accession), 4) - -# filter plot subset -plotsub <- filter(abundances_grouped2, Accession %in% selected_accs) - - -ggplot(plotsub, aes(x = organelle, y = abundance, fill = compartment)) + - geom_col(position = position_dodge(width = 0.7), width = 0.7) + - facet_wrap(~Accession, scales = 'free') + - scale_fill_brewer(palette = "Dark2") - -``` - - -### Calculate and draw a PCA to get an overview of the dataset - -```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T} - -pca_data <- as.data.frame(pivot_wider(abundances_grouped2, - names_from = Accession, - values_from = abundance, - id_cols = c("organelle", "compartment"))) - -pca_data <- unite(pca_data, organelle, compartment, col = 'merger', sep = '_') -rownames(pca_data) <- pca_data$merger -pca_data <- pca_data[, -1] - - -####### double-check -pca_data[is.na(pca_data)] <- 0 -####### double-check - -pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})] - -pca <- prcomp(pca_data, scale = T) -pcaPlotData <- as.data.frame(pca$x) -pcaPlotData$merger <- rownames(pcaPlotData) -pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_', - into = c("organelle", "compartment")) - -ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) + - geom_point(size = 3, stroke = 1.5) + - coord_equal() + - # theme_dominik + - scale_color_brewer(palette = 'Dark2') - -``` - - - - - -```{r} - - -# extract abundances per accession only - -abundances <- quantification[, c("Accession", grep("Abundances (Normalized)", colnames(quantification), fixed = T, value = T))] - -# pivot and split column - -abundances2 <- - abundances %>% - pivot_longer(!Accession, names_to = c("sample", "organelle", "compartment"), values_to = "abundance", names_sep = ', ') - -# remove - -abundances2$sample <- gsub("Abundances (Normalized): ", "", abundances2$sample, fixed = T) -abundances2$sample <- gsub(": Sample", "", abundances2$sample, fixed = T) - -# transform columns to factor - -abundances2$sample <- as.factor(abundances2$sample) -abundances2$organelle <- as.factor(abundances2$organelle) -abundances2$compartment <- as.factor(abundances2$compartment) - -# pick random accessions -selected_accs <- sample(unique(abundances2$Accession), 4) - -# filter plot subset -plotsub <- filter(abundances2, Accession %in% selected_accs) - - -ggplot(plotsub, aes(x = organelle, y = abundance, fill = compartment)) + - geom_point(position = position_dodge(width = 0.7), width = 0.7) + - facet_wrap(~Accession, scales = 'free') + - scale_fill_brewer(palette = "Dark2") - -``` - - - -### Calculate and draw a PCA to get an overview of the dataset - -```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T} - -pca_data <- as.data.frame(pivot_wider(abundances2, - names_from = Accession, - values_from = abundance, - id_cols = c("sample", "organelle", "compartment"))) - -pca_data <- unite(pca_data, sample, organelle, compartment, col = 'merger', sep = '_') -rownames(pca_data) <- pca_data$merger -pca_data <- pca_data[, -1] - - -####### double-check -pca_data[is.na(pca_data)] <- 0 -####### double-check - -pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})] - -pca <- prcomp(pca_data, scale = T) -pcaPlotData <- as.data.frame(pca$x) -pcaPlotData$merger <- rownames(pcaPlotData) -pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_', - into = c("sample", "organelle", "compartment")) - -pca_plot_individuals <- ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) + - geom_point(size = 2, stroke = 1.5) + - coord_equal() + - # theme_dominik + - scale_color_brewer(palette = 'Dark2') - -png(file = "pca_plot_individuals.png", res = 300, width = 2000, height = 2000) -pca_plot_individuals -dev.off() - -png(file = "pca_plot_individuals_labelled.png", res = 300, width = 2000, height = 2000) -pca_plot_individuals + geom_text(aes(label = sample), nudge_x = 5) -dev.off() - - -``` - -```{r, out.width = "100%", eval= T, echo=F} -include_graphics('pca_plot_individuals.png') -include_graphics('pca_plot_individuals_labelled.png') -``` - - - - -### exlude ER - -```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T} - -pca_data <- as.data.frame(pivot_wider(filter(abundances2, organelle != "endoplasmatic reticulum"), - names_from = Accession, - values_from = abundance, - id_cols = c("sample", "organelle", "compartment"))) - - -pca_data <- unite(pca_data, sample, organelle, compartment, col = 'merger', sep = '_') -rownames(pca_data) <- pca_data$merger -pca_data <- pca_data[, -1] - - -####### double-check -pca_data[is.na(pca_data)] <- 0 -####### double-check - -pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})] - -pca <- prcomp(pca_data, scale = T) -pcaPlotData <- as.data.frame(pca$x) -pcaPlotData$merger <- rownames(pcaPlotData) -pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_', - into = c("sample", "organelle", "compartment")) - -pca_plot_excl_ER <- ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) + - geom_point(size = 2, stroke = 1.5) + - coord_equal() + - # theme_dominik + - scale_color_brewer(palette = 'Dark2') - -png(file = "pca_plot_excl_ER.png", res = 300, width = 2000, height = 2000) -pca_plot_excl_ER -dev.off() - -``` - -```{r, out.width = "100%", eval= T, echo=F} -include_graphics('pca_plot_excl_ER.png') -``` - - - - - - diff --git a/workflows/2022-04-13_FirstIterations/2022-04-13_skim_results.pdf b/workflows/2022-04-13_FirstIterations/2022-04-13_skim_results.pdf deleted file mode 100644 index ebe0d3c54b693aa377fd8dc94c190a445d7c684b..0000000000000000000000000000000000000000 Binary files a/workflows/2022-04-13_FirstIterations/2022-04-13_skim_results.pdf and /dev/null differ diff --git a/workflows/2022-04-13_FirstIterations/pca_plot_excl_ER.png b/workflows/2022-04-13_FirstIterations/pca_plot_excl_ER.png deleted file mode 100644 index 3b39591c40da84188d3e3a68cd756cbd124ef2a7..0000000000000000000000000000000000000000 Binary files a/workflows/2022-04-13_FirstIterations/pca_plot_excl_ER.png and /dev/null differ diff --git a/workflows/2022-04-13_FirstIterations/pca_plot_individuals.png b/workflows/2022-04-13_FirstIterations/pca_plot_individuals.png deleted file mode 100644 index e480d20f6b909626aae0378b8d6b20a9fcb3863d..0000000000000000000000000000000000000000 Binary files a/workflows/2022-04-13_FirstIterations/pca_plot_individuals.png and /dev/null differ diff --git a/workflows/2022-04-13_FirstIterations/pca_plot_individuals_labelled.png b/workflows/2022-04-13_FirstIterations/pca_plot_individuals_labelled.png deleted file mode 100644 index ca3c290ccb6ce39270b98affd6181d5d41814100..0000000000000000000000000000000000000000 Binary files a/workflows/2022-04-13_FirstIterations/pca_plot_individuals_labelled.png and /dev/null differ