From 3fd7a4aaf187aa46026fabee00ef7fadcb405564 Mon Sep 17 00:00:00 2001
From: Dominik Brilhaus <dominik.brilhaus@hhu.de>
Date: Wed, 13 Apr 2022 17:35:36 +0200
Subject: [PATCH] first glance at the data

---
 _DominikNotes/2022-04-13_data_notes.md        |   9 ++
 .../dataset/quantification.xlsx               |   4 +-
 workflows/2022-04-13_skim_results.Rmd         | 152 ++++++++++++++++++
 3 files changed, 163 insertions(+), 2 deletions(-)
 create mode 100644 _DominikNotes/2022-04-13_data_notes.md
 create mode 100644 workflows/2022-04-13_skim_results.Rmd

diff --git a/_DominikNotes/2022-04-13_data_notes.md b/_DominikNotes/2022-04-13_data_notes.md
new file mode 100644
index 0000000..58b1706
--- /dev/null
+++ b/_DominikNotes/2022-04-13_data_notes.md
@@ -0,0 +1,9 @@
+
+# questions to Anja 
+
+- what's the ricinus reference? 
+- analysis protocols? 
+  - what tests?
+  - why membrane vs. lumen?
+    - can we also get ER vs. mito etc.?
+  - 
\ No newline at end of file
diff --git a/assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx b/assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx
index 47967ba..4ae078e 100644
--- a/assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx
+++ b/assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:949e0dbf38dd4ee0c7157801e810c156abcc7b32f17fe31d14062e1d48becfe5
-size 1718573
+oid sha256:4bf55b97e2b45586c2558111189f1dc4ddac6b153b3715fdc8298944837f86c7
+size 1719958
diff --git a/workflows/2022-04-13_skim_results.Rmd b/workflows/2022-04-13_skim_results.Rmd
new file mode 100644
index 0000000..0628dfe
--- /dev/null
+++ b/workflows/2022-04-13_skim_results.Rmd
@@ -0,0 +1,152 @@
+---
+title: "Untitled"
+author: "Dominik"
+date: "4/13/2022"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+
+```{r}
+getwd()
+# List the contents of the assay's dataset directory (path is relative to the working directory printed above).
+list.files("../assays/2022-04-13_proteome_discoverer/dataset/")
+
+```
+
+## Count protein entries in fasta
+
+```{bash}
+grep ">" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta | wc -l  # number of lines containing ">" (FASTA header lines)
+```
+
+## skim excel files
+
+```{r}
+library(tidyverse)
+# Skim the quantification export: dimensions, column names, distinct accessions.
+quantification <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx")
+dim(quantification)
+colnames(quantification)
+length(unique(quantification$Accession))
+
+# Same skim for the export that also contains peptide rows.
+quantification_peptides <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification_with_peptides.xlsx")
+dim(quantification_peptides)
+colnames(quantification_peptides)
+length(unique(quantification_peptides$Accession))
+
+## This is a whole new level of untidy. A table nested in a table. I hate it. Thank you, Thermo Fisher Scientific.
+
+### 1. keep only high confidence rows
+
+quant_proteins <- filter(quantification_peptides, `Protein FDR Confidence: Combined` == "High")
+
+### 2. remove empty columns
+quant_proteins <- quant_proteins[, colSums(is.na(quant_proteins)) != nrow(quant_proteins)]
+
+### 3. dummy check, that content is the same as in the first export
+identical(dim(quant_proteins), dim(quantification))
+# all.equal() copes with NA values and reports a length mismatch instead of silently recycling
+all.equal(as.numeric(quantification[[6]]), as.numeric(quant_proteins[[6]]))
+
+## pull out peptide data
+### 1. keep rows where the protein FDR confidence is NA; the printed difference is the number of remaining rows
+peptides <- filter(quantification_peptides, is.na(`Protein FDR Confidence: Combined`))
+nrow(quantification_peptides) - nrow(peptides)
+
+### 2. remove empty columns
+peptides <- peptides[, colSums(is.na(peptides)) != nrow(peptides)]
+
+### 3. strip between-data headlines: take the names from the first row, then drop every repeated header row
+colnames(peptides) <- as.character(peptides[1, ])
+peptides <- filter(peptides, Checked != "Checked")
+
+```
+
+
+
+```{r}
+
+
+# Keep the accession plus every "Abundances (Grouped)" column.
+grouped_cols <- grep("Abundances (Grouped)", colnames(quantification),
+                     fixed = TRUE, value = TRUE)
+abundances_grouped <- quantification[, c("Accession", grouped_cols)]
+
+# Pivot to long format, splitting each column name at ", " into two keys.
+abundances_grouped2 <-
+  abundances_grouped %>%
+  pivot_longer(!Accession, names_to = c("organelle", "compartment"),
+               values_to = "abundance", names_sep = ", ")
+
+# Strip the "Abundances (Grouped): " prefix left on the organelle key.
+abundances_grouped2$organelle <- gsub("Abundances (Grouped): ", "",
+                                      abundances_grouped2$organelle,
+                                      fixed = TRUE)
+
+# Convert both key columns to factors.
+abundances_grouped2$organelle <- as.factor(abundances_grouped2$organelle)
+abundances_grouped2$compartment <- as.factor(abundances_grouped2$compartment)
+
+# Pick up to four random accessions; sample() errors if asked for more than exist.
+all_accs <- unique(abundances_grouped2$Accession)
+selected_accs <- sample(all_accs, min(4, length(all_accs)))
+
+# Plot abundance per organelle and compartment for the selected accessions.
+plotsub <- filter(abundances_grouped2, Accession %in% selected_accs)
+ggplot(plotsub, aes(x = organelle, y = abundance, fill = compartment)) +
+  geom_col(position = position_dodge(width = 0.7), width = 0.7) +
+  facet_wrap(~Accession, scales = "free") +
+  scale_fill_brewer(palette = "Dark2")
+  
+```
+
+
+### Calculate and draw a PCA to get an overview of the dataset
+
+```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T}
+
+# Wide table for the PCA: one row per organelle/compartment, one column per accession.
+pca_data <- as.data.frame(pivot_wider(abundances_grouped2,
+                                      id_cols = c("organelle", "compartment"),
+                                      names_from = Accession,
+                                      values_from = abundance))
+# Fold both keys into the row names so only abundance columns remain.
+pca_data <- unite(pca_data, organelle, compartment, col = "merger", sep = "_")
+rownames(pca_data) <- pca_data$merger
+pca_data <- pca_data[, -1, drop = FALSE]
+
+####### double-check: missing abundances are replaced by zero here
+pca_data[is.na(pca_data)] <- 0
+
+# prcomp() cannot scale constant columns, so drop them (this covers all-zero columns too).
+is_variable <- vapply(pca_data, function(x) length(unique(x)) > 1, logical(1))
+pca_data <- pca_data[, is_variable, drop = FALSE]
+
+pca <- prcomp(pca_data, scale. = TRUE)
+pcaPlotData <- as.data.frame(pca$x)
+pcaPlotData$merger <- rownames(pcaPlotData)
+pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = "_",
+                        into = c("organelle", "compartment"))
+# aes() with bare column names replaces the deprecated aes_string().
+ggplot(pcaPlotData, aes(x = PC1, y = PC2, color = organelle, shape = compartment)) +
+  geom_point(size = 3, stroke = 1.5) +
+  coord_equal() +
+  scale_color_brewer(palette = "Dark2") # + theme_dominik
+
+```
+
+
+
+
+
+
+
+
+
+
+
-- 
GitLab