diff --git a/_DominikNotes/2022-04-13_data_notes.md b/_DominikNotes/2022-04-13_data_notes.md index 58b17066b282283ebff2ac034a8fa778dc7b2cda..ae4506c01438e51196a6ba4e3292f8d497212568 100644 --- a/_DominikNotes/2022-04-13_data_notes.md +++ b/_DominikNotes/2022-04-13_data_notes.md @@ -1,9 +1,33 @@ # questions to Anja -- what's the ricinus reference? -- analysis protocols? - - what tests? +- what's the ricinus reference? + +- analysis protocols? + - what statistical tests? - why membrane vs. lumen? - can we also get ER vs. mito etc.? - - \ No newline at end of file + +1. Hast du eine spezifische Referenz für Ricinus genommen (welche?) oder vergleicht das komplett gegen UniProt (und daher die gefundenen Contaminants)? + +2. Bezieht sich die "Protein FDR Confidence: Combined"? auf den Match zwischen identifiziertem Protein und der Accession? + +3. Es gibt insgesamt 31 Contaminant=TRUE. Sollten wir die herausfiltern? Und mal aus Neugierde: Das ist überwiegend Keratin - ist das üblich oder unsaubere Arbeit? + +4. Zu den abundances: + - Wie werden die Ratios berechnet? fold changes, log2 fold changes? Und wie werden dann NAs behandelt? + - Welche Statistik (t test, Anova?) und Methode für adjusted p-value? + - Wie werden die normalisiert und gruppiert? + +5. Gibt's dazu vielleicht einen standard Report oder andere Beschreibung von Proteome Discoverer? Ich versuche für unsere Daten immer so eine Art Data Dictionary anzulegen (siehe Anhang) - vielleicht gibt Proteome Discoverer auch standardisierte Definitionen für die einzelnen Spalten raus. + +6. Komplette ANOVA, die dann auch z.B. ER-Membrane gegen Mito-Lumen vergleicht wäre vermutlich overkill. Aber könntest du mit wenig Aufwand auch jeweils die Membrane Samples und die Lumen Samples gegeneinander testen (Also Mito-Mem vs. ER-Mem vs. Peroxisome-Mem, ...)? + +7. Dürfen NAs bei den abundances durch 0 ersetzt werden? + +8. Gibt es Insgesamt für die Datenanalyse eine Art Protokoll? + +Auf den ersten Blick (schnelle PCA) sehen die Ergebnisse nicht so verkehrt aus bzw. Trennen zumindest schön nach Organelle und compartment. Lediglich die ER Proben schlagen etwas raus. + + + diff --git a/assays/2022-04-13_proteome_discoverer/isa.run.xlsx b/assays/2022-04-13_proteome_discoverer/isa.run.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..17db67e4af60dced61c4e82b473ec24c84410709 Binary files /dev/null and b/assays/2022-04-13_proteome_discoverer/isa.run.xlsx differ diff --git a/workflows/2022-04-13_skim_results.Rmd b/workflows/2022-04-13_skim_results.Rmd index 0628dfe75466a6dc5a55a193cdc87efcfb48c053..85bfcc434c9c65dbb1261456cb7e42ff30b89bfe 100644 --- a/workflows/2022-04-13_skim_results.Rmd +++ b/workflows/2022-04-13_skim_results.Rmd @@ -2,11 +2,18 @@ title: "Untitled" author: "Dominik" date: "4/13/2022" -output: html_document +output: + pdf_document: default + html_document: default --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) + +library(knitr) +library(tidyverse) +library(openxlsx) + ``` @@ -21,19 +28,27 @@ dir("../assays/2022-04-13_proteome_discoverer/dataset/") ```{bash} grep ">" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta | wc -l + +head ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta + +grep "OS=Ricinus communis" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta | wc -l ``` ## skim excel files -```{r} -library(tidyverse) +### quantification.xlsx +```{r} quantification <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx") dim(quantification) colnames(quantification) length(unique(quantification$Accession)) +``` + +### quantification_with_peptides.xlsx +```{r} quantification_peptides <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification_with_peptides.xlsx") dim(quantification_peptides) colnames(quantification_peptides) @@ -65,10 +80,52 @@ peptides <- peptides[, colSums(is.na(peptides)) != nrow(peptides)] colnames(peptides) = as.character(peptides[1, ]) peptides <- filter(peptides, Checked != "Checked") +peptides <- type_convert(peptides) + + +``` + + +## create an isa.run.xlsx like data dictionary + +```{r} + +sum_tmp <- data.frame(summary(quantification)) %>% + pivot_wider(names_from = Var1, values_from = c(Var1, Freq), + values_fn = function(x){paste(na.exclude(x), collapse = "|")}) + +quantification_summary <- cbind.data.frame(sum_tmp[,c(1,3)],t(summarise_all(quantification, class))) +colnames(quantification_summary) <- c("Identifier", "ObjectSummary", "ObjectType") +quantification_summary$TargetFile <- "quantification.xlsx" + + +sum_tmp <- data.frame(summary(peptides)) %>% + pivot_wider(names_from = Var1, values_from = c(Var1, Freq), + values_fn = function(x){paste(na.exclude(x), collapse = "|")}) + +peptides_summary <- cbind.data.frame(sum_tmp[,c(1,3)],t(summarise_all(peptides, class))) + +colnames(peptides_summary) <- c("Identifier", "ObjectSummary", "ObjectType") +peptides_summary$TargetFile <- "quantification_with_peptides.xlsx" + + + +isa_run <- rbind.data.frame(quantification_summary, peptides_summary)[, c(4, 1, 3, 2)] +row.names(isa_run) <- NULL +isa_run$Comment <- "" +isa_run$Definition <- "" + + +write.xlsx(isa_run, file = "../assays/2022-04-13_proteome_discoverer/isa.run.xlsx", overwrite = T, asTable = T) + + ``` + + + ```{r} @@ -86,7 +143,7 @@ abundances_grouped2 <- abundances_grouped2$organelle <- gsub("Abundances (Grouped): ", "", abundances_grouped2$organelle, fixed = T) -# transform colums to factor +# transform columns to factor abundances_grouped2$organelle <- as.factor(abundances_grouped2$organelle) abundances_grouped2$compartment <- as.factor(abundances_grouped2$compartment) @@ -144,6 +201,139 @@ ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = ' +```{r} + + +# extract abundances per accession only + +abundances <- quantification[, c("Accession", grep("Abundances (Normalized)", colnames(quantification), fixed = T, value = T))] + +# pivot and split column + +abundances2 <- + abundances %>% + pivot_longer(!Accession, names_to = c("sample", "organelle", "compartment"), values_to = "abundance", names_sep = ', ') + +# remove + +abundances2$sample <- gsub("Abundances (Normalized): ", "", abundances2$sample, fixed = T) +abundances2$sample <- gsub(": Sample", "", abundances2$sample, fixed = T) + +# transform columns to factor + +abundances2$sample <- as.factor(abundances2$sample) +abundances2$organelle <- as.factor(abundances2$organelle) +abundances2$compartment <- as.factor(abundances2$compartment) + +# pick random accessions +selected_accs <- sample(unique(abundances2$Accession), 4) + +# filter plot subset +plotsub <- filter(abundances2, Accession %in% selected_accs) + + +ggplot(plotsub, aes(x = organelle, y = abundance, fill = compartment)) + + geom_point(position = position_dodge(width = 0.7), width = 0.7) + + facet_wrap(~Accession, scales = 'free') + + scale_fill_brewer(palette = "Dark2") + +``` + + + +### Calculate and draw a PCA to get an overview of the dataset + +```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T} + +pca_data <- as.data.frame(pivot_wider(abundances2, + names_from = Accession, + values_from = abundance, + id_cols = c("sample", "organelle", "compartment"))) + +pca_data <- unite(pca_data, sample, organelle, compartment, col = 'merger', sep = '_') +rownames(pca_data) <- pca_data$merger +pca_data <- pca_data[, -1] + + +####### double-check +pca_data[is.na(pca_data)] <- 0 +####### double-check + +pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})] + +pca <- prcomp(pca_data, scale = T) +pcaPlotData <- as.data.frame(pca$x) +pcaPlotData$merger <- rownames(pcaPlotData) +pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_', + into = c("sample", "organelle", "compartment")) + +pca_plot_individuals <- ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) + + geom_point(size = 2, stroke = 1.5) + + coord_equal() + + # theme_dominik + + scale_color_brewer(palette = 'Dark2') + +png(file = "pca_plot_individuals.png", res = 300, width = 2000, height = 2000) +pca_plot_individuals +dev.off() + +png(file = "pca_plot_individuals_labelled.png", res = 300, width = 2000, height = 2000) +pca_plot_individuals + geom_text(aes(label = sample), nudge_x = 5) +dev.off() + + +``` + +```{r, out.width = "100%", eval= T, echo=F} +include_graphics('pca_plot_individuals.png') +include_graphics('pca_plot_individuals_labelled.png') +``` + + + + +### exlude ER + +```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T} + +pca_data <- as.data.frame(pivot_wider(filter(abundances2, organelle != "endoplasmatic reticulum"), + names_from = Accession, + values_from = abundance, + id_cols = c("sample", "organelle", "compartment"))) + + +pca_data <- unite(pca_data, sample, organelle, compartment, col = 'merger', sep = '_') +rownames(pca_data) <- pca_data$merger +pca_data <- pca_data[, -1] + + +####### double-check +pca_data[is.na(pca_data)] <- 0 +####### double-check + +pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})] + +pca <- prcomp(pca_data, scale = T) +pcaPlotData <- as.data.frame(pca$x) +pcaPlotData$merger <- rownames(pcaPlotData) +pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_', + into = c("sample", "organelle", "compartment")) + +pca_plot_excl_ER <- ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) + + geom_point(size = 2, stroke = 1.5) + + coord_equal() + + # theme_dominik + + scale_color_brewer(palette = 'Dark2') + +png(file = "pca_plot_excl_ER.png", res = 300, width = 2000, height = 2000) +pca_plot_excl_ER +dev.off() + +``` + +```{r, out.width = "100%", eval= T, echo=F} +include_graphics('pca_plot_excl_ER.png') +``` diff --git a/workflows/2022-04-13_skim_results.pdf b/workflows/2022-04-13_skim_results.pdf new file mode 100644 index 0000000000000000000000000000000000000000..ebe0d3c54b693aa377fd8dc94c190a445d7c684b Binary files /dev/null and b/workflows/2022-04-13_skim_results.pdf differ diff --git a/workflows/pca_plot_excl_ER.png b/workflows/pca_plot_excl_ER.png new file mode 100644 index 0000000000000000000000000000000000000000..3b39591c40da84188d3e3a68cd756cbd124ef2a7 Binary files /dev/null and b/workflows/pca_plot_excl_ER.png differ diff --git a/workflows/pca_plot_individuals.png b/workflows/pca_plot_individuals.png new file mode 100644 index 0000000000000000000000000000000000000000..e480d20f6b909626aae0378b8d6b20a9fcb3863d Binary files /dev/null and b/workflows/pca_plot_individuals.png differ diff --git a/workflows/pca_plot_individuals_labelled.png b/workflows/pca_plot_individuals_labelled.png new file mode 100644 index 0000000000000000000000000000000000000000..ca3c290ccb6ce39270b98affd6181d5d41814100 Binary files /dev/null and b/workflows/pca_plot_individuals_labelled.png differ