diff --git a/_DominikNotes/2022-04-13_data_notes.md b/_DominikNotes/2022-04-13_data_notes.md
index 58b17066b282283ebff2ac034a8fa778dc7b2cda..ae4506c01438e51196a6ba4e3292f8d497212568 100644
--- a/_DominikNotes/2022-04-13_data_notes.md
+++ b/_DominikNotes/2022-04-13_data_notes.md
@@ -1,9 +1,33 @@
 
 # questions to Anja 
 
-- what's the ricinus reference? 
-- analysis protocols? 
-  - what tests?
+- what's the ricinus reference?
+
+- analysis protocols?
+  - what statistical tests?
   - why membrane vs. lumen?
     - can we also get ER vs. mito etc.?
-  - 
\ No newline at end of file
+
+1. Hast du eine spezifische Referenz fÃ¼r Ricinus genommen (welche?) oder vergleicht das komplett gegen UniProt (und daher die gefundenen Contaminants)?
+
+2. Bezieht sich die "Protein FDR Confidence: Combined"? auf den Match zwischen identifiziertem Protein und der Accession?
+
+3. Es gibt insgesamt 31 Contaminant=TRUE. Sollten wir die herausfiltern? Und mal aus Neugierde: Das ist Ã¼berwiegend Keratin - ist das Ã¼blich oder unsaubere Arbeit?
+
+4. Zu den abundances:
+   - Wie werden die Ratios berechnet? fold changes, log2 fold changes? Und wie werden dann NAs behandelt?
+   - Welche Statistik (t test, Anova?) und Methode fÃ¼r adjusted p-value?
+   - Wie werden die normalisiert und gruppiert?
+
+5. Gibt's dazu vielleicht einen standard Report oder andere Beschreibung von Proteome Discoverer? Ich versuche fÃ¼r unsere Daten immer so eine Art Data Dictionary anzulegen (siehe Anhang) - vielleicht gibt Proteome Discoverer auch standardisierte Definitionen fÃ¼r die einzelnen Spalten raus. 
+
+6. Komplette ANOVA, die dann auch z.B. ER-Membrane gegen Mito-Lumen vergleicht wÃ¤re vermutlich overkill. Aber kÃ¶nntest du mit wenig Aufwand auch jeweils die Membrane Samples und die Lumen Samples gegeneinander testen (Also Mito-Mem vs. ER-Mem vs. Peroxisome-Mem, ...)?
+
+7. DÃ¼rfen NAs bei den abundances durch 0 ersetzt werden?
+
+8. Gibt es Insgesamt fÃ¼r die Datenanalyse eine Art Protokoll?  
+
+Auf den ersten Blick (schnelle PCA) sehen die Ergebnisse nicht so verkehrt aus bzw. Trennen zumindest schÃ¶n nach Organelle und compartment. Lediglich die ER Proben schlagen etwas raus. 
+
+
+
diff --git a/assays/2022-04-13_proteome_discoverer/isa.run.xlsx b/assays/2022-04-13_proteome_discoverer/isa.run.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..17db67e4af60dced61c4e82b473ec24c84410709
Binary files /dev/null and b/assays/2022-04-13_proteome_discoverer/isa.run.xlsx differ
diff --git a/workflows/2022-04-13_skim_results.Rmd b/workflows/2022-04-13_skim_results.Rmd
index 0628dfe75466a6dc5a55a193cdc87efcfb48c053..85bfcc434c9c65dbb1261456cb7e42ff30b89bfe 100644
--- a/workflows/2022-04-13_skim_results.Rmd
+++ b/workflows/2022-04-13_skim_results.Rmd
@@ -2,11 +2,18 @@
 title: "Untitled"
 author: "Dominik"
 date: "4/13/2022"
-output: html_document
+output:
+  pdf_document: default
+  html_document: default
 ---
 
 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE)
+
+library(knitr)
+library(tidyverse)
+library(openxlsx)
+
 ```
 
 
@@ -21,19 +28,27 @@ dir("../assays/2022-04-13_proteome_discoverer/dataset/")
 
 ```{bash}
 grep ">" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta | wc -l
+
+head ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta 
+
+grep "OS=Ricinus communis" ../assays/2022-04-13_proteome_discoverer/dataset/AllProteins.fasta | wc -l
 ```
 
 ## skim excel files
 
-```{r}
-library(tidyverse)
+### quantification.xlsx
 
+```{r}
 quantification <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification.xlsx")
 dim(quantification)
 colnames(quantification)
 length(unique(quantification$Accession))
 
+```
+
+### quantification_with_peptides.xlsx
 
+```{r}
 quantification_peptides <- readxl::read_xlsx("../assays/2022-04-13_proteome_discoverer/dataset/quantification_with_peptides.xlsx")
 dim(quantification_peptides)
 colnames(quantification_peptides)
@@ -65,10 +80,52 @@ peptides <- peptides[, colSums(is.na(peptides)) != nrow(peptides)]
 colnames(peptides) = as.character(peptides[1, ])
 peptides <- filter(peptides, Checked != "Checked")
 
+peptides <- type_convert(peptides)
+
+
+```
+
+
+## create an isa.run.xlsx like data dictionary 
+
+```{r}
+
+sum_tmp <- data.frame(summary(quantification)) %>% 
+  pivot_wider(names_from = Var1, values_from = c(Var1, Freq), 
+              values_fn = function(x){paste(na.exclude(x), collapse = "|")})
+
+quantification_summary <-  cbind.data.frame(sum_tmp[,c(1,3)],t(summarise_all(quantification, class)))
+colnames(quantification_summary) <- c("Identifier", "ObjectSummary", "ObjectType")
+quantification_summary$TargetFile <- "quantification.xlsx"
+
+
+sum_tmp <- data.frame(summary(peptides)) %>% 
+  pivot_wider(names_from = Var1, values_from = c(Var1, Freq), 
+              values_fn = function(x){paste(na.exclude(x), collapse = "|")})
+
+peptides_summary <-  cbind.data.frame(sum_tmp[,c(1,3)],t(summarise_all(peptides, class)))
+
+colnames(peptides_summary) <- c("Identifier", "ObjectSummary", "ObjectType")
+peptides_summary$TargetFile <- "quantification_with_peptides.xlsx"
+
+
+
+isa_run <- rbind.data.frame(quantification_summary, peptides_summary)[, c(4, 1, 3, 2)]
+row.names(isa_run) <- NULL
+isa_run$Comment <- ""
+isa_run$Definition <- ""
+
+
+write.xlsx(isa_run, file =  "../assays/2022-04-13_proteome_discoverer/isa.run.xlsx", overwrite = T, asTable = T)
+
+
 ```
 
 
 
+
+
+
 ```{r}
 
 
@@ -86,7 +143,7 @@ abundances_grouped2 <-
 
 abundances_grouped2$organelle <- gsub("Abundances (Grouped): ", "", abundances_grouped2$organelle, fixed = T)
 
-# transform colums to factor 
+# transform columns to factor 
 
 abundances_grouped2$organelle <- as.factor(abundances_grouped2$organelle)
 abundances_grouped2$compartment <- as.factor(abundances_grouped2$compartment)
@@ -144,6 +201,139 @@ ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = '
 
 
 
+```{r}
+
+
+# extract abundances per accession only
+
+abundances <- quantification[, c("Accession", grep("Abundances (Normalized)", colnames(quantification), fixed = T, value = T))]
+
+# pivot and split column
+
+abundances2 <-  
+  abundances %>%
+  pivot_longer(!Accession, names_to = c("sample", "organelle", "compartment"), values_to = "abundance", names_sep = ', ')
+
+# remove 
+
+abundances2$sample <- gsub("Abundances (Normalized): ", "", abundances2$sample, fixed = T)
+abundances2$sample <- gsub(": Sample", "", abundances2$sample, fixed = T)
+
+# transform columns to factor 
+
+abundances2$sample <- as.factor(abundances2$sample)
+abundances2$organelle <- as.factor(abundances2$organelle)
+abundances2$compartment <- as.factor(abundances2$compartment)
+
+# pick random accessions
+selected_accs <- sample(unique(abundances2$Accession), 4)
+
+# filter plot subset
+plotsub <- filter(abundances2, Accession %in% selected_accs)
+
+
+ggplot(plotsub, aes(x = organelle, y = abundance, fill = compartment)) +
+  geom_point(position = position_dodge(width = 0.7), width = 0.7) +
+  facet_wrap(~Accession, scales = 'free') + 
+  scale_fill_brewer(palette = "Dark2")
+  
+```
+
+
+
+### Calculate and draw a PCA to get an overview of the dataset
+
+```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T}
+
+pca_data <- as.data.frame(pivot_wider(abundances2,
+                                      names_from = Accession,
+                                      values_from =  abundance,
+                                      id_cols = c("sample", "organelle", "compartment")))
+
+pca_data <- unite(pca_data,  sample, organelle, compartment, col = 'merger', sep = '_')
+rownames(pca_data) <- pca_data$merger
+pca_data <- pca_data[, -1]
+
+
+####### double-check
+pca_data[is.na(pca_data)] <- 0
+####### double-check
+
+pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})]
+
+pca <- prcomp(pca_data, scale = T)
+pcaPlotData <- as.data.frame(pca$x)
+pcaPlotData$merger <- rownames(pcaPlotData)
+pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_',
+                        into = c("sample", "organelle", "compartment"))
+
+pca_plot_individuals <- ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) +
+  geom_point(size = 2,  stroke = 1.5) +
+  coord_equal() +
+  # theme_dominik +
+  scale_color_brewer(palette = 'Dark2')
+
+png(file = "pca_plot_individuals.png", res = 300, width = 2000, height = 2000)
+pca_plot_individuals
+dev.off()
+
+png(file = "pca_plot_individuals_labelled.png", res = 300, width = 2000, height = 2000)
+pca_plot_individuals + geom_text(aes(label = sample), nudge_x = 5)
+dev.off()
+
+
+```
+
+```{r, out.width = "100%", eval= T, echo=F}
+include_graphics('pca_plot_individuals.png')
+include_graphics('pca_plot_individuals_labelled.png')
+```
+
+
+
+
+### exlude ER
+
+```{r, fig.width = 3, fig.height = 3, fig.align = "center", eval=T}
+
+pca_data <- as.data.frame(pivot_wider(filter(abundances2, organelle != "endoplasmatic reticulum"),
+                                      names_from = Accession,
+                                      values_from =  abundance,
+                                      id_cols = c("sample", "organelle", "compartment")))
+
+
+pca_data <- unite(pca_data,  sample, organelle, compartment, col = 'merger', sep = '_')
+rownames(pca_data) <- pca_data$merger
+pca_data <- pca_data[, -1]
+
+
+####### double-check
+pca_data[is.na(pca_data)] <- 0
+####### double-check
+
+pca_data <- pca_data[, apply(pca_data, 2, function(x) {sum(x) != 0})]
+
+pca <- prcomp(pca_data, scale = T)
+pcaPlotData <- as.data.frame(pca$x)
+pcaPlotData$merger <- rownames(pcaPlotData)
+pcaPlotData <- separate(data = pcaPlotData, col = merger, sep = '_',
+                        into = c("sample", "organelle", "compartment"))
+
+pca_plot_excl_ER <- ggplot(pcaPlotData, aes_string(color = 'organelle', shape = 'compartment', x = 'PC1', y = 'PC2')) +
+  geom_point(size = 2,  stroke = 1.5) +
+  coord_equal() +
+  # theme_dominik +
+  scale_color_brewer(palette = 'Dark2')
+
+png(file = "pca_plot_excl_ER.png", res = 300, width = 2000, height = 2000)
+pca_plot_excl_ER
+dev.off()
+
+```
+
+```{r, out.width = "100%", eval= T, echo=F}
+include_graphics('pca_plot_excl_ER.png')
+```
 
 
 
diff --git a/workflows/2022-04-13_skim_results.pdf b/workflows/2022-04-13_skim_results.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..ebe0d3c54b693aa377fd8dc94c190a445d7c684b
Binary files /dev/null and b/workflows/2022-04-13_skim_results.pdf differ
diff --git a/workflows/pca_plot_excl_ER.png b/workflows/pca_plot_excl_ER.png
new file mode 100644
index 0000000000000000000000000000000000000000..3b39591c40da84188d3e3a68cd756cbd124ef2a7
Binary files /dev/null and b/workflows/pca_plot_excl_ER.png differ
diff --git a/workflows/pca_plot_individuals.png b/workflows/pca_plot_individuals.png
new file mode 100644
index 0000000000000000000000000000000000000000..e480d20f6b909626aae0378b8d6b20a9fcb3863d
Binary files /dev/null and b/workflows/pca_plot_individuals.png differ
diff --git a/workflows/pca_plot_individuals_labelled.png b/workflows/pca_plot_individuals_labelled.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca3c290ccb6ce39270b98affd6181d5d41814100
Binary files /dev/null and b/workflows/pca_plot_individuals_labelled.png differ