From 3fd7a4aaf187aa46026fabee00ef7fadcb405564 Mon Sep 17 00:00:00 2001
From: Dominik Brilhaus <dominik.brilhaus@hhu.de>
Date: Wed, 13 Apr 2022 17:35:36 +0200
Subject: [PATCH] first glance at the data

---
 _DominikNotes/2022-04-13_data_notes.md        |   9 ++
 .../dataset/quantification.xlsx               |   4 +-
 workflows/2022-04-13_skim_results.Rmd         | 164 ++++++++++++++++++
 3 files changed, 175 insertions(+), 2 deletions(-)
 create mode 100644 _DominikNotes/2022-04-13_data_notes.md
 create mode 100644 workflows/2022-04-13_skim_results.Rmd

diff --git a/_DominikNotes/2022-04-13_data_notes.md b/_DominikNotes/2022-04-13_data_notes.md
new file mode 100644
index 0000000..58b1706
--- /dev/null
+++ b/_DominikNotes/2022-04-13_data_notes.md
@@ -0,0 +1,9 @@
+
+# questions to Anja
+
+- what's the ricinus reference?
+- analysis protocols?
+  - what tests?
+  - why membrane vs. lumen?
+  - can we also get ER vs. mito etc.?
+  -
\ No newline at end of file
diff --git a/assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx b/assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx
index 47967ba..4ae078e 100644
--- a/assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx
+++ b/assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:949e0dbf38dd4ee0c7157801e810c156abcc7b32f17fe31d14062e1d48becfe5
-size 1718573
+oid sha256:4bf55b97e2b45586c2558111189f1dc4ddac6b153b3715fdc8298944837f86c7
+size 1719958
diff --git a/workflows/2022-04-13_skim_results.Rmd b/workflows/2022-04-13_skim_results.Rmd
new file mode 100644
index 0000000..0628dfe
--- /dev/null
+++ b/workflows/2022-04-13_skim_results.Rmd
@@ -0,0 +1,164 @@
+---
+title: "Untitled"
+author: "Dominik"
+date: "4/13/2022"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+
+# Directory holding the Proteome Discoverer exports read in this document.
+dataset_dir <- "../assays/2022-04-13_proteome_discoverer/dataset"
+```
+
+
+```{r}
+getwd()
+
+dir(dataset_dir)
+
+```
+
+## Count entries in fasta
+
+```{bash}
+# Every FASTA record starts with a ">" header line; count those lines.
+grep -c "^>" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta
+```
+
+## skim excel files
+
+```{r}
+library(tidyverse)
+
+# Drop the columns of `df` that contain nothing but NA.
+drop_empty_columns <- function(df) {
+  df[, colSums(is.na(df)) != nrow(df), drop = FALSE]
+}
+
+quantification <- readxl::read_xlsx(file.path(dataset_dir, "quantification.xlsx"))
+dim(quantification)
+colnames(quantification)
+length(unique(quantification$Accession))
+
+
+quantification_peptides <- readxl::read_xlsx(
+  file.path(dataset_dir, "quantification_with_peptides.xlsx")
+)
+dim(quantification_peptides)
+colnames(quantification_peptides)
+length(unique(quantification_peptides$Accession))
+
+## This is a whole new level of untidy. A table nested in a table. I hate it. Thank you, Thermo Fisher Scientific.
+
+### 1. keep only high confidence rows
+
+quant_proteins <- filter(quantification_peptides, `Protein FDR Confidence: Combined` == "High")
+
+### 2. remove empty columns
+quant_proteins <- drop_empty_columns(quant_proteins)
+
+### 3. dummy check, that content is the same...
+# identical() yields one TRUE/FALSE instead of one comparison per dimension.
+identical(dim(quant_proteins), dim(quantification))
+# all.equal() reports NA or length mismatches, where sum(a == b) would return
+# NA or recycle silently. NOTE(review): column 6 is picked by position; confirm.
+all.equal(as.numeric(quantification[[6]]), as.numeric(quant_proteins[[6]]))
+
+
+## pull out peptide data
+
+### 1. keep the rows without a protein-level FDR confidence
+peptides <- filter(quantification_peptides, is.na(`Protein FDR Confidence: Combined`))
+nrow(quantification_peptides) - nrow(peptides)
+
+### 2. remove empty columns
+peptides <- drop_empty_columns(peptides)
+
+### 3. strip between-data headlines
+# The first remaining row becomes the header; its repeats further down are then
+# filtered out. Rows with NA in `Checked` are dropped by this filter as well.
+colnames(peptides) <- as.character(peptides[1, ])
+peptides <- filter(peptides, Checked != "Checked")
+
+```
+
+
+
+```{r}
+
+
+# extract abundances per accession only
+
+abundance_cols <- grep("Abundances (Grouped)", colnames(quantification), fixed = TRUE, value = TRUE)
+abundances_grouped <- quantification[, c("Accession", abundance_cols)]
+
+# pivot and split column
+
+abundances_grouped2 <-
+  abundances_grouped %>%
+  pivot_longer(!Accession, names_to = c("organelle", "compartment"), values_to = "abundance", names_sep = ", ")
+
+# remove the "Abundances (Grouped): " prefix from the organelle names
+
+abundances_grouped2$organelle <- gsub("Abundances (Grouped): ", "", abundances_grouped2$organelle, fixed = TRUE)
+
+# transform columns to factor
+
+abundances_grouped2$organelle <- as.factor(abundances_grouped2$organelle)
+abundances_grouped2$compartment <- as.factor(abundances_grouped2$compartment)
+
+# pick random accessions; seeded so that every knit plots the same ones
+set.seed(20220413)
+all_accs <- unique(abundances_grouped2$Accession)
+selected_accs <- sample(all_accs, min(4, length(all_accs)))
+
+# filter plot subset
+plotsub <- filter(abundances_grouped2, Accession %in% selected_accs)
+
+
+ggplot(plotsub, aes(x = organelle, y = abundance, fill = compartment)) +
+  geom_col(position = position_dodge(width = 0.7), width = 0.7) +
+  facet_wrap(~Accession, scales = "free") +
+  scale_fill_brewer(palette = "Dark2")
+
+```
+
+
+### Calculate and draw a PCA to get an overview of the dataset
+
+```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=TRUE}
+
+pca_data <- as.data.frame(pivot_wider(abundances_grouped2,
+  names_from = Accession,
+  values_from = abundance,
+  id_cols = c("organelle", "compartment")))
+
+pca_data <- unite(pca_data, organelle, compartment, col = "merger", sep = "_")
+rownames(pca_data) <- pca_data$merger
+pca_data <- pca_data[, colnames(pca_data) != "merger", drop = FALSE]
+
+
+####### double-check
+pca_data[is.na(pca_data)] <- 0
+####### double-check
+
+# prcomp(scale. = TRUE) stops on constant columns, so drop every column without
+# variance; that covers the all-zero columns removed before as well.
+has_variance <- vapply(pca_data, function(x) isTRUE(stats::var(x) > 0), logical(1))
+pca_data <- pca_data[, has_variance, drop = FALSE]
+
+pca <- prcomp(pca_data, scale. = TRUE)
+pcaPlotData <- as.data.frame(pca$x)
+pcaPlotData$merger <- rownames(pcaPlotData)
+pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = "_",
+  into = c("organelle", "compartment"))
+
+ggplot(pcaPlotData, aes(color = organelle, shape = compartment, x = PC1, y = PC2)) +
+  geom_point(size = 3, stroke = 1.5) +
+  coord_equal() +
+  # theme_dominik +
+  scale_color_brewer(palette = "Dark2")
+
+```
-- 
GitLab