clustering_PCA_HCL.R

## PCA
# we need a few plotting-related libraries we have not used before
library(ggplot2)

load(file = "runs/kallisto_combined/mothertableV2.Rdata")

# we need the normalized data for this analysis, the columns with tpm
for_clust <- dfr[, grep("tpm", colnames(dfr))]
head(for_clust)
# we'll filter to leave only, more abundant, always-above-0 genes
for_clust <- for_clust[apply(for_clust, 1, max) > 100, ]
for_clust <- for_clust[apply(for_clust, 1, min) > 0, ]
# and log2 transform
for_clust <- log2(for_clust)
# the prcomp (PCA) function assumes samples are in rows, and genes columns
# so we will transpose our data
for_clust <- t(for_clust)
# and we will scale it (z-score: mean center, and divide by standard deviation)
for_clust <- scale(for_clust)

# now we can do the PCA
pca <- prcomp(for_clust)
# and look at the outcome
s <- summary(pca)
s
# in row two, we see the 1st PC explains about 55% of the variance
# let's make a plot! As with most things in R, there's many ways to plot,
# we'll be using the package "ggplot2", it has some overhead, but it's
# faster to make a publication-quality plot in.
# first we extract the values we need for the plot from the pca object
scores <- as.data.frame(pca$x)
# now we add a column indicating what is mock and what is treatment.
scores$treatment <- rep(c("mock", "treatment"), each = 3)
# and we look if you data.frame is as expected
scores

# now we produce the figure with ggplot
# look at the beginning of the course or on the cheatsheet to figure out
# what the different lines mean
# the line labs() is difficult to understand, it is a fancy way to directly
# get the percentages in the plot without looking them up and writing them in

pca12 <- ggplot(data = scores, aes(x = PC1, y = PC2)) +
  theme_bw() +
  geom_point(aes(color = treatment), size = 2.5) +
  geom_text(aes(label = treatment), vjust = 1.5, size = 3) +
  labs(
    x = paste0("PCA1 (", s$importance[2, 1] * 100, "%)"),
    y = paste0("PCA2 (", s$importance[2, 2] * 100, "%)")
  ) +
  ggtitle("PCA") +
  theme(legend.position = "none")
# with the next line we make Rstudio show us the figure
pca12

# <<< challenge assignments >>> #
# 1. modify the above plot to compare PC1 with PC3
# 2. comment out elements, to see what each does
# 3. use the cheatsheet, google and your intuition to replace
#    the individual labels with a joint legend

# save as pdf
dir.create(path = "runs/results_figures", recursive = T, showWarnings = F)
pdf("runs/results_figures/PCA.pdf", height = 4, width = 4)
print(pca12) # "print" only important in non-interactive use
dev.off()

# clean up
remove(pca, pca12, scores, s)
## Hierarchical Clustering
# Another nice overview plot is a heatmap of hierarchical clustering.

# we'll be using the same data as for the PCA
head(for_clust)

# before our hierarchical clustering, we need to decide on a distance metric
# type ?dist to find out which methods are available
dist(for_clust, method = "euclidean")
# more commonly we invert a correlation method, like Pearson or Spearman (?cor)
as.dist(1 - cor(t(for_clust), method = "spearman"))
# did you notice that we had to transpose the data for 'cor'?
# no, this isn't intuitive. R, is a conglomeration of a language, written by
# many different people. Some people like different defaults. Use the help
# functions, use google, find examples, keep track of how you did it last time,
# check as you go, so bugs don't propogate, hang in there.

# and we cluster, again type ?hclust to see other methods of clustering
samp_dist <- as.dist(1 - cor(t(for_clust), method = "spearman"))
gene_dist <- as.dist(1 - cor(for_clust, method = "pearson"))
hierarchy_samples <- hclust(samp_dist, method = "complete")
hierarchy_genes <- hclust(gene_dist, method = "complete")
# now our information is ready, but we cannot see it
# first we'll set up a color gradient function
blueyellow <- colorRampPalette(c("blue", "black", "yellow"))
# now let's make the heatmap
heatmap(for_clust,
  Rowv = as.dendrogram(hierarchy_samples),
  Colv = as.dendrogram(hierarchy_genes),
  labCol = NA,
  col = blueyellow(40),
  mar = c(4, 10)
)
# we'll save the image with a nifty one-liner
dev.copy2eps(file = "runs/results_figures/sampleclustering.eps")

# cleanup
remove(
  blueyellow, for_clust, gene_dist, samp_dist, hierarchy_genes,
  hierarchy_samples
)