Skip to content
Snippets Groups Projects
Commit 131c8a0c authored by Dominik Brilhaus's avatar Dominik Brilhaus
Browse files

rm intermediate data

parent 7c3f0fc2
No related branches found
No related tags found
No related merge requests found
---
title: "Untitled"
author: "Dominik"
date: "4/13/2022"
output:
pdf_document: default
html_document: default
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(knitr)
library(tidyverse)
library(openxlsx)
```
```{r}
getwd()
dir("../assays/2022-04-13_proteome_discoverer/dataset/")
```
## Count peptides in fasta
```{bash}
grep ">" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta | wc -l
head ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta
grep "OS=Ricinus communis" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta | wc -l
```
## skim excel files
### quantification.xlsx
```{r}
quantification <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx")
dim(quantification)
colnames(quantification)
length(unique(quantification$Accession))
```
### quantification_with_peptides.xlsx
```{r}
quantification_peptides <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification_with_peptides.xlsx")
dim(quantification_peptides)
colnames(quantification_peptides)
length(unique(quantification_peptides$Accession))
## This is a whole new level of untidy. A table nested in a table. I hate it. Thank you, Thermo Fisher Scientific.
### 1. keep only high confidence rows
quant_proteins <- filter(quantification_peptides, `Protein FDR Confidence: Combined` == "High")
### 2. remove empty columns
quant_proteins <- quant_proteins[, colSums(is.na(quant_proteins)) != nrow(quant_proteins)]
### 3. dummy check, that content is the same...
dim(quant_proteins) == dim(quantification)
sum(as.numeric(unlist(quantification[,6])) == as.numeric(unlist(quant_proteins[,6])))
## pull out peptide data
peptides <- filter(quantification_peptides, is.na(`Protein FDR Confidence: Combined`))
nrow(quantification_peptides) - nrow(peptides)
### 2. remove empty columns
peptides <- peptides[, colSums(is.na(peptides)) != nrow(peptides)]
### 3. strip between-data headlines
colnames(peptides) = as.character(peptides[1, ])
peptides <- filter(peptides, Checked != "Checked")
peptides <- type_convert(peptides)
```
## create an isa.run.xlsx like data dictionary
```{r}
sum_tmp <- data.frame(summary(quantification)) %>%
pivot_wider(names_from = Var1, values_from = c(Var1, Freq),
values_fn = function(x){paste(na.exclude(x), collapse = "|")})
quantification_summary <- cbind.data.frame(sum_tmp[,c(1,3)],t(summarise_all(quantification, class)))
colnames(quantification_summary) <- c("Identifier", "ObjectSummary", "ObjectType")
quantification_summary$TargetFile <- "quantification.xlsx"
sum_tmp <- data.frame(summary(peptides)) %>%
pivot_wider(names_from = Var1, values_from = c(Var1, Freq),
values_fn = function(x){paste(na.exclude(x), collapse = "|")})
peptides_summary <- cbind.data.frame(sum_tmp[,c(1,3)],t(summarise_all(peptides, class)))
colnames(peptides_summary) <- c("Identifier", "ObjectSummary", "ObjectType")
peptides_summary$TargetFile <- "quantification_with_peptides.xlsx"
isa_run <- rbind.data.frame(quantification_summary, peptides_summary)[, c(4, 1, 3, 2)]
row.names(isa_run) <- NULL
isa_run$Comment <- ""
isa_run$Definition <- ""
write.xlsx(isa_run, file = "../assays/2022-04-13_proteome_discoverer/isa.run.xlsx", overwrite = T, asTable = T)
```
```{r}
# extract abundances per accession only
abundances_grouped <- quantification[, c("Accession", grep("Abundances (Grouped)", colnames(quantification), fixed = T, value = T))]
# pivot and split column
abundances_grouped2 <-
abundances_grouped %>%
pivot_longer(!Accession, names_to = c("organelle", "compartment"), values_to = "abundance", names_sep = ', ')
# remove
abundances_grouped2$organelle <- gsub("Abundances (Grouped): ", "", abundances_grouped2$organelle, fixed = T)
# transform columns to factor
abundances_grouped2$organelle <- as.factor(abundances_grouped2$organelle)
abundances_grouped2$compartment <- as.factor(abundances_grouped2$compartment)
# pick random accessions
selected_accs <- sample(unique(abundances_grouped2$Accession), 4)
# filter plot subset
plotsub <- filter(abundances_grouped2, Accession %in% selected_accs)
ggplot(plotsub, aes(x = organelle, y = abundance, fill = compartment)) +
geom_col(position = position_dodge(width = 0.7), width = 0.7) +
facet_wrap(~Accession, scales = 'free') +
scale_fill_brewer(palette = "Dark2")
```
### Calculate and draw a PCA to get an overview of the dataset
```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T}
pca_data <- as.data.frame(pivot_wider(abundances_grouped2,
names_from = Accession,
values_from = abundance,
id_cols = c("organelle", "compartment")))
pca_data <- unite(pca_data, organelle, compartment, col = 'merger', sep = '_')
rownames(pca_data) <- pca_data$merger
pca_data <- pca_data[, -1]
####### double-check
pca_data[is.na(pca_data)] <- 0
####### double-check
pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})]
pca <- prcomp(pca_data, scale = T)
pcaPlotData <- as.data.frame(pca$x)
pcaPlotData$merger <- rownames(pcaPlotData)
pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_',
into = c("organelle", "compartment"))
ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) +
geom_point(size = 3, stroke = 1.5) +
coord_equal() +
# theme_dominik +
scale_color_brewer(palette = 'Dark2')
```
```{r}
# extract abundances per accession only
abundances <- quantification[, c("Accession", grep("Abundances (Normalized)", colnames(quantification), fixed = T, value = T))]
# pivot and split column
abundances2 <-
abundances %>%
pivot_longer(!Accession, names_to = c("sample", "organelle", "compartment"), values_to = "abundance", names_sep = ', ')
# remove
abundances2$sample <- gsub("Abundances (Normalized): ", "", abundances2$sample, fixed = T)
abundances2$sample <- gsub(": Sample", "", abundances2$sample, fixed = T)
# transform columns to factor
abundances2$sample <- as.factor(abundances2$sample)
abundances2$organelle <- as.factor(abundances2$organelle)
abundances2$compartment <- as.factor(abundances2$compartment)
# pick random accessions
selected_accs <- sample(unique(abundances2$Accession), 4)
# filter plot subset
plotsub <- filter(abundances2, Accession %in% selected_accs)
ggplot(plotsub, aes(x = organelle, y = abundance, fill = compartment)) +
geom_point(position = position_dodge(width = 0.7), width = 0.7) +
facet_wrap(~Accession, scales = 'free') +
scale_fill_brewer(palette = "Dark2")
```
### Calculate and draw a PCA to get an overview of the dataset
```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T}
pca_data <- as.data.frame(pivot_wider(abundances2,
names_from = Accession,
values_from = abundance,
id_cols = c("sample", "organelle", "compartment")))
pca_data <- unite(pca_data, sample, organelle, compartment, col = 'merger', sep = '_')
rownames(pca_data) <- pca_data$merger
pca_data <- pca_data[, -1]
####### double-check
pca_data[is.na(pca_data)] <- 0
####### double-check
pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})]
pca <- prcomp(pca_data, scale = T)
pcaPlotData <- as.data.frame(pca$x)
pcaPlotData$merger <- rownames(pcaPlotData)
pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_',
into = c("sample", "organelle", "compartment"))
pca_plot_individuals <- ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) +
geom_point(size = 2, stroke = 1.5) +
coord_equal() +
# theme_dominik +
scale_color_brewer(palette = 'Dark2')
png(file = "pca_plot_individuals.png", res = 300, width = 2000, height = 2000)
pca_plot_individuals
dev.off()
png(file = "pca_plot_individuals_labelled.png", res = 300, width = 2000, height = 2000)
pca_plot_individuals + geom_text(aes(label = sample), nudge_x = 5)
dev.off()
```
```{r, out.width = "100%", eval= T, echo=F}
include_graphics('pca_plot_individuals.png')
include_graphics('pca_plot_individuals_labelled.png')
```
### exlude ER
```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T}
pca_data <- as.data.frame(pivot_wider(filter(abundances2, organelle != "endoplasmatic reticulum"),
names_from = Accession,
values_from = abundance,
id_cols = c("sample", "organelle", "compartment")))
pca_data <- unite(pca_data, sample, organelle, compartment, col = 'merger', sep = '_')
rownames(pca_data) <- pca_data$merger
pca_data <- pca_data[, -1]
####### double-check
pca_data[is.na(pca_data)] <- 0
####### double-check
pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})]
pca <- prcomp(pca_data, scale = T)
pcaPlotData <- as.data.frame(pca$x)
pcaPlotData$merger <- rownames(pcaPlotData)
pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_',
into = c("sample", "organelle", "compartment"))
pca_plot_excl_ER <- ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) +
geom_point(size = 2, stroke = 1.5) +
coord_equal() +
# theme_dominik +
scale_color_brewer(palette = 'Dark2')
png(file = "pca_plot_excl_ER.png", res = 300, width = 2000, height = 2000)
pca_plot_excl_ER
dev.off()
```
```{r, out.width = "100%", eval= T, echo=F}
include_graphics('pca_plot_excl_ER.png')
```
File deleted
workflows/2022-04-13_FirstIterations/pca_plot_excl_ER.png

139 KiB

workflows/2022-04-13_FirstIterations/pca_plot_individuals.png

147 KiB

workflows/2022-04-13_FirstIterations/pca_plot_individuals_labelled.png

171 KiB

0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment