second glances

45336f98 · Dominik Brilhaus · 3fd7a4aa · 45336f98 · 45336f98 · 45336f98
Commit 45336f98 authored 3 years ago by Dominik Brilhaus
--- a/_DominikNotes/2022-04-13_data_notes.md
+++ b/_DominikNotes/2022-04-13_data_notes.md
 # questions to Anja 
- what's the ricinus reference? 
+- what's the ricinus reference?
- analysis protocols? 
-  - what tests?
+- analysis protocols?
+  - what statistical tests?
  - why membrane vs. lumen?
    - can we also get ER vs. mito etc.?
-  - 
\ No newline at end of file
+1. Hast du eine spezifische Referenz für Ricinus genommen (welche?) oder vergleicht das komplett gegen UniProt (und daher die gefundenen Contaminants)?
+2. Bezieht sich die "Protein FDR Confidence: Combined"? auf den Match zwischen identifiziertem Protein und der Accession?
+3. Es gibt insgesamt 31 Contaminant=TRUE. Sollten wir die herausfiltern? Und mal aus Neugierde: Das ist überwiegend Keratin - ist das üblich oder unsaubere Arbeit?
+4. Zu den abundances:
+   - Wie werden die Ratios berechnet? fold changes, log2 fold changes? Und wie werden dann NAs behandelt?
+   - Welche Statistik (t test, Anova?) und Methode für adjusted p-value?
+   - Wie werden die normalisiert und gruppiert?
+5. Gibt's dazu vielleicht einen standard Report oder andere Beschreibung von Proteome Discoverer? Ich versuche für unsere Daten immer so eine Art Data Dictionary anzulegen (siehe Anhang) - vielleicht gibt Proteome Discoverer auch standardisierte Definitionen für die einzelnen Spalten raus. 
+6. Komplette ANOVA, die dann auch z.B. ER-Membrane gegen Mito-Lumen vergleicht wäre vermutlich overkill. Aber könntest du mit wenig Aufwand auch jeweils die Membrane Samples und die Lumen Samples gegeneinander testen (Also Mito-Mem vs. ER-Mem vs. Peroxisome-Mem, ...)?
+7. Dürfen NAs bei den abundances durch 0 ersetzt werden?
+8. Gibt es Insgesamt für die Datenanalyse eine Art Protokoll?  
+Auf den ersten Blick (schnelle PCA) sehen die Ergebnisse nicht so verkehrt aus bzw. Trennen zumindest schön nach Organelle und compartment. Lediglich die ER Proben schlagen etwas raus. 
--- a/assays/2022-04-13_proteome_discoverer/isa.run.xlsx
+++ b/assays/2022-04-13_proteome_discoverer/isa.run.xlsx
--- a/workflows/2022-04-13_skim_results.Rmd
+++ b/workflows/2022-04-13_skim_results.Rmd
@@ -2,11 +2,18 @@
 title: "Untitled"
 author: "Dominik"
 date: "4/13/2022"
-output: html_document
+output:
+  pdf_document: default
+  html_document: default
 ---
 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE)
+library(knitr)
+library(tidyverse)
+library(openxlsx)
 ```
@@ -21,19 +28,27 @@ dir("../assays/2022-04-13_proteome_discoverer/dataset/")
 ```{bash}
 grep ">" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta | wc -l
+head ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta 
+grep "OS=Ricinus communis" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta | wc -l
 ```
 ## skim excel files
-```{r}
+### quantification.xlsx
-library(tidyverse)
+```{r}
 quantification <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx")
 dim(quantification)
 colnames(quantification)
 length(unique(quantification$Accession))
+```
+### quantification_with_peptides.xlsx
+```{r}
 quantification_peptides <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification_with_peptides.xlsx")
 dim(quantification_peptides)
 colnames(quantification_peptides)
@@ -65,10 +80,52 @@ peptides <- peptides[, colSums(is.na(peptides)) != nrow(peptides)]
 colnames(peptides) = as.character(peptides[1, ])
 peptides <- filter(peptides, Checked != "Checked")
+peptides <- type_convert(peptides)
+```
+## create an isa.run.xlsx like data dictionary 
+```{r}
+sum_tmp <- data.frame(summary(quantification)) %>% 
+  pivot_wider(names_from = Var1, values_from = c(Var1, Freq), 
+              values_fn = function(x){paste(na.exclude(x), collapse = "|")})
+quantification_summary <-  cbind.data.frame(sum_tmp[,c(1,3)],t(summarise_all(quantification, class)))
+colnames(quantification_summary) <- c("Identifier", "ObjectSummary", "ObjectType")
+quantification_summary$TargetFile <- "quantification.xlsx"
+sum_tmp <- data.frame(summary(peptides)) %>% 
+  pivot_wider(names_from = Var1, values_from = c(Var1, Freq), 
+              values_fn = function(x){paste(na.exclude(x), collapse = "|")})
+peptides_summary <-  cbind.data.frame(sum_tmp[,c(1,3)],t(summarise_all(peptides, class)))
+colnames(peptides_summary) <- c("Identifier", "ObjectSummary", "ObjectType")
+peptides_summary$TargetFile <- "quantification_with_peptides.xlsx"
+isa_run <- rbind.data.frame(quantification_summary, peptides_summary)[, c(4, 1, 3, 2)]
+row.names(isa_run) <- NULL
+isa_run$Comment <- ""
+isa_run$Definition <- ""
+write.xlsx(isa_run, file =  "../assays/2022-04-13_proteome_discoverer/isa.run.xlsx", overwrite = T, asTable = T)
 ```
 ```{r}
@@ -86,7 +143,7 @@ abundances_grouped2 <-
 abundances_grouped2$organelle <- gsub("Abundances (Grouped): ", "", abundances_grouped2$organelle, fixed = T)
-# transform colums to factor 
+# transform columns to factor 
 abundances_grouped2$organelle <- as.factor(abundances_grouped2$organelle)
 abundances_grouped2$compartment <- as.factor(abundances_grouped2$compartment)
@@ -144,6 +201,139 @@ ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = '
+```{r}
+# extract abundances per accession only
+abundances <- quantification[, c("Accession", grep("Abundances (Normalized)", colnames(quantification), fixed = T, value = T))]
+# pivot and split column
+abundances2 <-  
+  abundances %>%
+  pivot_longer(!Accession, names_to = c("sample", "organelle", "compartment"), values_to = "abundance", names_sep = ', ')
+# remove 
+abundances2$sample <- gsub("Abundances (Normalized): ", "", abundances2$sample, fixed = T)
+abundances2$sample <- gsub(": Sample", "", abundances2$sample, fixed = T)
+# transform columns to factor 
+abundances2$sample <- as.factor(abundances2$sample)
+abundances2$organelle <- as.factor(abundances2$organelle)
+abundances2$compartment <- as.factor(abundances2$compartment)
+# pick random accessions
+selected_accs <- sample(unique(abundances2$Accession), 4)
+# filter plot subset
+plotsub <- filter(abundances2, Accession %in% selected_accs)
+ggplot(plotsub, aes(x = organelle, y = abundance, fill = compartment)) +
+  geom_point(position = position_dodge(width = 0.7), width = 0.7) +
+  facet_wrap(~Accession, scales = 'free') + 
+  scale_fill_brewer(palette = "Dark2")
+```
+### Calculate and draw a PCA to get an overview of the dataset
+```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T}
+pca_data <- as.data.frame(pivot_wider(abundances2,
+                                      names_from = Accession,
+                                      values_from =  abundance,
+                                      id_cols = c("sample", "organelle", "compartment")))
+pca_data <- unite(pca_data,  sample, organelle, compartment, col = 'merger', sep = '_')
+rownames(pca_data) <- pca_data$merger
+pca_data <- pca_data[, -1]
+####### double-check
+pca_data[is.na(pca_data)] <- 0
+####### double-check
+pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})]
+pca <- prcomp(pca_data, scale = T)
+pcaPlotData <- as.data.frame(pca$x)
+pcaPlotData$merger <- rownames(pcaPlotData)
+pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_',
+                        into = c("sample", "organelle", "compartment"))
+pca_plot_individuals <- ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) +
+  geom_point(size = 2,  stroke = 1.5) +
+  coord_equal() +
+  # theme_dominik +
+  scale_color_brewer(palette = 'Dark2')
+png(file = "pca_plot_individuals.png", res = 300, width = 2000, height = 2000)
+pca_plot_individuals
+dev.off()
+png(file = "pca_plot_individuals_labelled.png", res = 300, width = 2000, height = 2000)
+pca_plot_individuals + geom_text(aes(label = sample), nudge_x = 5)
+dev.off()
+```
+```{r, out.width = "100%", eval= T, echo=F}
+include_graphics('pca_plot_individuals.png')
+include_graphics('pca_plot_individuals_labelled.png')
+```
+### exlude ER
+```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T}
+pca_data <- as.data.frame(pivot_wider(filter(abundances2, organelle != "endoplasmatic reticulum"),
+                                      names_from = Accession,
+                                      values_from =  abundance,
+                                      id_cols = c("sample", "organelle", "compartment")))
+pca_data <- unite(pca_data,  sample, organelle, compartment, col = 'merger', sep = '_')
+rownames(pca_data) <- pca_data$merger
+pca_data <- pca_data[, -1]
+####### double-check
+pca_data[is.na(pca_data)] <- 0
+####### double-check
+pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})]
+pca <- prcomp(pca_data, scale = T)
+pcaPlotData <- as.data.frame(pca$x)
+pcaPlotData$merger <- rownames(pcaPlotData)
+pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_',
+                        into = c("sample", "organelle", "compartment"))
+pca_plot_excl_ER <- ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) +
+  geom_point(size = 2,  stroke = 1.5) +
+  coord_equal() +
+  # theme_dominik +
+  scale_color_brewer(palette = 'Dark2')
+png(file = "pca_plot_excl_ER.png", res = 300, width = 2000, height = 2000)
+pca_plot_excl_ER
+dev.off()
+```
+```{r, out.width = "100%", eval= T, echo=F}
+include_graphics('pca_plot_excl_ER.png')
+```

--- a/workflows/2022-04-13_skim_results.pdf
+++ b/workflows/2022-04-13_skim_results.pdf
--- a/workflows/pca_plot_excl_ER.png
+++ b/workflows/pca_plot_excl_ER.png
--- a/workflows/pca_plot_individuals.png
+++ b/workflows/pca_plot_individuals.png
--- a/workflows/pca_plot_individuals_labelled.png
+++ b/workflows/pca_plot_individuals_labelled.png