rm intermediate data

131c8a0c · Dominik Brilhaus · 7c3f0fc2 · 7c3f0fc2 · 7c3f0fc2 · 7c3f0fc2
Commit 131c8a0c authored 1 year ago by Dominik Brilhaus
--- a/workflows/2022-04-13_FirstIterations/2022-04-13_skim_results.Rmd
+++ b/workflows/2022-04-13_FirstIterations/2022-04-13_skim_results.Rmd
---
-title: "Untitled"
-author: "Dominik"
-date: "4/13/2022"
-output:
-  pdf_document: default
-  html_document: default
---
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = TRUE)
-library(knitr)
-library(tidyverse)
-library(openxlsx)
-```
-```{r}
-getwd()
-dir("../assays/2022-04-13_proteome_discoverer/dataset/")
-```
-## Count peptides in fasta
-```{bash}
-grep ">" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta | wc -l
-head ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta 
-grep "OS=Ricinus communis" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta | wc -l
-```
-## skim excel files
-### quantification.xlsx
-```{r}
-quantification <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx")
-dim(quantification)
-colnames(quantification)
-length(unique(quantification$Accession))
-```
-### quantification_with_peptides.xlsx
-```{r}
-quantification_peptides <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification_with_peptides.xlsx")
-dim(quantification_peptides)
-colnames(quantification_peptides)
-length(unique(quantification_peptides$Accession))
-## This is a whole new level of untidy. A table nested in a table. I hate it. Thank you, Thermo Fisher Scientific. 
-### 1. keep only high confidence rows
-quant_proteins <- filter(quantification_peptides, `Protein FDR Confidence: Combined` == "High")
-### 2. remove empty columns
-quant_proteins <- quant_proteins[, colSums(is.na(quant_proteins)) != nrow(quant_proteins)]
-### 3. dummy check, that content is the same... 
-dim(quant_proteins) == dim(quantification)
-sum(as.numeric(unlist(quantification[,6])) == as.numeric(unlist(quant_proteins[,6])))
-## pull out peptide data 
-peptides <- filter(quantification_peptides, is.na(`Protein FDR Confidence: Combined`))
-nrow(quantification_peptides) - nrow(peptides)
-### 2. remove empty columns
-peptides <- peptides[, colSums(is.na(peptides)) != nrow(peptides)]
-### 3. strip between-data headlines
-colnames(peptides) = as.character(peptides[1, ])
-peptides <- filter(peptides, Checked != "Checked")
-peptides <- type_convert(peptides)
-```
-## create an isa.run.xlsx like data dictionary 
-```{r}
-sum_tmp <- data.frame(summary(quantification)) %>% 
-  pivot_wider(names_from = Var1, values_from = c(Var1, Freq), 
-              values_fn = function(x){paste(na.exclude(x), collapse = "|")})
-quantification_summary <-  cbind.data.frame(sum_tmp[,c(1,3)],t(summarise_all(quantification, class)))
-colnames(quantification_summary) <- c("Identifier", "ObjectSummary", "ObjectType")
-quantification_summary$TargetFile <- "quantification.xlsx"
-sum_tmp <- data.frame(summary(peptides)) %>% 
-  pivot_wider(names_from = Var1, values_from = c(Var1, Freq), 
-              values_fn = function(x){paste(na.exclude(x), collapse = "|")})
-peptides_summary <-  cbind.data.frame(sum_tmp[,c(1,3)],t(summarise_all(peptides, class)))
-colnames(peptides_summary) <- c("Identifier", "ObjectSummary", "ObjectType")
-peptides_summary$TargetFile <- "quantification_with_peptides.xlsx"
-isa_run <- rbind.data.frame(quantification_summary, peptides_summary)[, c(4, 1, 3, 2)]
-row.names(isa_run) <- NULL
-isa_run$Comment <- ""
-isa_run$Definition <- ""
-write.xlsx(isa_run, file =  "../assays/2022-04-13_proteome_discoverer/isa.run.xlsx", overwrite = T, asTable = T)
-```
-```{r}
-# extract abundances per accession only
-abundances_grouped <- quantification[, c("Accession", grep("Abundances (Grouped)", colnames(quantification), fixed = T, value = T))]
-# pivot and split column
-abundances_grouped2 <-  
-  abundances_grouped %>%
-  pivot_longer(!Accession, names_to = c("organelle", "compartment"), values_to = "abundance", names_sep = ', ')
-# remove 
-abundances_grouped2$organelle <- gsub("Abundances (Grouped): ", "", abundances_grouped2$organelle, fixed = T)
-# transform columns to factor 
-abundances_grouped2$organelle <- as.factor(abundances_grouped2$organelle)
-abundances_grouped2$compartment <- as.factor(abundances_grouped2$compartment)
-# pick random accessions
-selected_accs <- sample(unique(abundances_grouped2$Accession), 4)
-# filter plot subset
-plotsub <- filter(abundances_grouped2, Accession %in% selected_accs)
-ggplot(plotsub, aes(x = organelle, y = abundance, fill = compartment)) +
-  geom_col(position = position_dodge(width = 0.7), width = 0.7) + 
-  facet_wrap(~Accession, scales = 'free') + 
-  scale_fill_brewer(palette = "Dark2")
-```
-### Calculate and draw a PCA to get an overview of the dataset
-```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T}
-pca_data <- as.data.frame(pivot_wider(abundances_grouped2,
-                                      names_from = Accession,
-                                      values_from =  abundance,
-                                      id_cols = c("organelle", "compartment")))
-pca_data <- unite(pca_data,  organelle, compartment, col = 'merger', sep = '_')
-rownames(pca_data) <- pca_data$merger
-pca_data <- pca_data[, -1]
-####### double-check
-pca_data[is.na(pca_data)] <- 0
-####### double-check
-pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})]
-pca <- prcomp(pca_data, scale = T)
-pcaPlotData <- as.data.frame(pca$x)
-pcaPlotData$merger <- rownames(pcaPlotData)
-pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_',
-                        into = c("organelle", "compartment"))
-ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) +
-  geom_point(size = 3,  stroke = 1.5) +
-  coord_equal() +
-  # theme_dominik +
-  scale_color_brewer(palette = 'Dark2')
-```
-```{r}
-# extract abundances per accession only
-abundances <- quantification[, c("Accession", grep("Abundances (Normalized)", colnames(quantification), fixed = T, value = T))]
-# pivot and split column
-abundances2 <-  
-  abundances %>%
-  pivot_longer(!Accession, names_to = c("sample", "organelle", "compartment"), values_to = "abundance", names_sep = ', ')
-# remove 
-abundances2$sample <- gsub("Abundances (Normalized): ", "", abundances2$sample, fixed = T)
-abundances2$sample <- gsub(": Sample", "", abundances2$sample, fixed = T)
-# transform columns to factor 
-abundances2$sample <- as.factor(abundances2$sample)
-abundances2$organelle <- as.factor(abundances2$organelle)
-abundances2$compartment <- as.factor(abundances2$compartment)
-# pick random accessions
-selected_accs <- sample(unique(abundances2$Accession), 4)
-# filter plot subset
-plotsub <- filter(abundances2, Accession %in% selected_accs)
-ggplot(plotsub, aes(x = organelle, y = abundance, fill = compartment)) +
-  geom_point(position = position_dodge(width = 0.7), width = 0.7) +
-  facet_wrap(~Accession, scales = 'free') + 
-  scale_fill_brewer(palette = "Dark2")
-```
-### Calculate and draw a PCA to get an overview of the dataset
-```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T}
-pca_data <- as.data.frame(pivot_wider(abundances2,
-                                      names_from = Accession,
-                                      values_from =  abundance,
-                                      id_cols = c("sample", "organelle", "compartment")))
-pca_data <- unite(pca_data,  sample, organelle, compartment, col = 'merger', sep = '_')
-rownames(pca_data) <- pca_data$merger
-pca_data <- pca_data[, -1]
-####### double-check
-pca_data[is.na(pca_data)] <- 0
-####### double-check
-pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})]
-pca <- prcomp(pca_data, scale = T)
-pcaPlotData <- as.data.frame(pca$x)
-pcaPlotData$merger <- rownames(pcaPlotData)
-pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_',
-                        into = c("sample", "organelle", "compartment"))
-pca_plot_individuals <- ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) +
-  geom_point(size = 2,  stroke = 1.5) +
-  coord_equal() +
-  # theme_dominik +
-  scale_color_brewer(palette = 'Dark2')
-png(file = "pca_plot_individuals.png", res = 300, width = 2000, height = 2000)
-pca_plot_individuals
-dev.off()
-png(file = "pca_plot_individuals_labelled.png", res = 300, width = 2000, height = 2000)
-pca_plot_individuals + geom_text(aes(label = sample), nudge_x = 5)
-dev.off()
-```
-```{r, out.width = "100%", eval= T, echo=F}
-include_graphics('pca_plot_individuals.png')
-include_graphics('pca_plot_individuals_labelled.png')
-```
-### exlude ER
-```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T}
-pca_data <- as.data.frame(pivot_wider(filter(abundances2, organelle != "endoplasmatic reticulum"),
-                                      names_from = Accession,
-                                      values_from =  abundance,
-                                      id_cols = c("sample", "organelle", "compartment")))
-pca_data <- unite(pca_data,  sample, organelle, compartment, col = 'merger', sep = '_')
-rownames(pca_data) <- pca_data$merger
-pca_data <- pca_data[, -1]
-####### double-check
-pca_data[is.na(pca_data)] <- 0
-####### double-check
-pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})]
-pca <- prcomp(pca_data, scale = T)
-pcaPlotData <- as.data.frame(pca$x)
-pcaPlotData$merger <- rownames(pcaPlotData)
-pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_',
-                        into = c("sample", "organelle", "compartment"))
-pca_plot_excl_ER <- ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) +
-  geom_point(size = 2,  stroke = 1.5) +
-  coord_equal() +
-  # theme_dominik +
-  scale_color_brewer(palette = 'Dark2')
-png(file = "pca_plot_excl_ER.png", res = 300, width = 2000, height = 2000)
-pca_plot_excl_ER
-dev.off()
-```
-```{r, out.width = "100%", eval= T, echo=F}
-include_graphics('pca_plot_excl_ER.png')
-```
--- a/workflows/2022-04-13_FirstIterations/2022-04-13_skim_results.pdf
+++ b/workflows/2022-04-13_FirstIterations/2022-04-13_skim_results.pdf
--- a/workflows/2022-04-13_FirstIterations/pca_plot_excl_ER.png
+++ b/workflows/2022-04-13_FirstIterations/pca_plot_excl_ER.png
--- a/workflows/2022-04-13_FirstIterations/pca_plot_individuals.png
+++ b/workflows/2022-04-13_FirstIterations/pca_plot_individuals.png
--- a/workflows/2022-04-13_FirstIterations/pca_plot_individuals_labelled.png
+++ b/workflows/2022-04-13_FirstIterations/pca_plot_individuals_labelled.png