clustering_kmeans.R

library(RColorBrewer)
library(gridExtra)

# We've looked at a couple of clustering methods already (PCA, Hierarchical)
# Now we'll look at k-means.
# Advantages: fast
# It depends: favors even cluster sizes
# Challenges: user-defined 'k' for number of clusters, non-deterministic

# This only makes sense on a somewhat larger dataset
to_cluster <- read.csv("studies/kmeansexample/resources/to_cluster.csv",
  row.names = 1
)
head(to_cluster)
dim(to_cluster)
# with a larger dataset like this,
# it's often appropriate to average replicates before clustering
groups <- factor(gsub("\\.[1-9]", "", colnames(to_cluster)))
mean_to_cluster <- sapply(
  levels(groups),
  function(x) rowMeans(to_cluster[, which(groups == x)])
)
head(mean_to_cluster) # we'll need new column names now
colnames(mean_to_cluster) <- levels(groups)

# filter, log transform, and convert to z-scores, much like we've done before
sub_to_cluster <- mean_to_cluster[apply(
  mean_to_cluster, 1,
  function(x) min(x) > 0 & max(x) > 50
), ]
sub_to_cluster <- log2(sub_to_cluster)
sub_to_cluster <- t(scale(t(sub_to_cluster)))
# check
head(sub_to_cluster)
dim(sub_to_cluster)
# run kmeans
km <- kmeans(sub_to_cluster, 8)
str(km) # take a look at the kmeans object
km$centers # these are the 'center' of each produced cluster
head(km$cluster) # which cluster each gene is in
km$tot.withinss # summed up sum-of-squares between each gene and cluster center
?kmeans # more information (particularly under Value)

# OK, so far so good, but why 8 clusters? Let's choose more carefully
# we can use tot.withinss as a quality measure, and just try different values
k_to_test <- 2:42
km_k2up <- sapply(
  k_to_test,
  function(x) kmeans(sub_to_cluster, x, nstart = 3)$tot.withinss
)
# we'll already start getting ready for visualization
to_plot <- data.frame(ss = km_k2up, k = k_to_test, run = "real")
# Now we'll want negative controls, which shouldn't cluster well
n_negative_controls <- 7
for (i in 1:n_negative_controls) {
  scrambled_to_cluster <- t(apply(sub_to_cluster, 1, sample))
  km_k2tok24negative <- sapply(
    k_to_test,
    function(x) kmeans(scrambled_to_cluster, x)$tot.withinss
  )
  to_plot <- rbind(to_plot, data.frame(
    ss = km_k2tok24negative,
    k = k_to_test,
    run = paste0("neg", i)
  ))
}
# can you think of a reason that there are more warnings with the scrambled data?
warnings()
# now let's get this ready for visualization
choose_k <- ggplot(data = to_plot, aes(x = k, y = ss, color = run, group = run)) +
  geom_line() +
  geom_point(size = 1, shape = 20) +
  scale_color_manual(values = brewer.pal(length(unique(to_plot$run)), "Set1"))
choose_k
# we can either use the 'elbow rule' (peak of the curve is around 8)
# or we can take the maximum difference between real and scrambled data
neg_minus_real <- sapply(
  paste0("neg", 1:n_negative_controls),
  function(x) to_plot$ss[to_plot$run == x]
)
neg_minus_real <- rowMeans(neg_minus_real) - to_plot$ss[to_plot$run == "real"]
# neg_minus_real <- to_plot[to_plot$run=="neg1", "ss"] -
# to_plot[to_plot$run=="real", "ss"]
neg_minus_real <- data.frame(neg_minus_real, k = k_to_test)

choose_k_by_diff <- ggplot(neg_minus_real, aes(y = neg_minus_real, x = k)) +
  geom_bar(stat = "identity")
choose_k_by_diff
# and if it's still hard to see where greatest differences is
neg_minus_real[order(neg_minus_real$neg_minus_real, decreasing = T)[1:5], ]
# you may notice, if you run this again, that there are slight variations
# we recommend looking at the graphs, as the eye is hard to fool
# keep in mind there is no 'correct' k,
# you might choose fewer for easier visualization
# or more for identifying gene sets for CRE prediction, etc...
# we'll continue with 6, and run the clustering 100 more times, taking the best
km <- kmeans(sub_to_cluster, 6, nstart = 100)
# let's get the data from one cluster
a_cluster <- sub_to_cluster[km$cluster == 1, ]
a_cluster <- melt(a_cluster)
head(a_cluster) # those column names are hard to remember
colnames(a_cluster) <- c("locus", "sample", "z.score")

a_cluster_plot <- ggplot(
  data = a_cluster,
  aes(x = sample, y = z.score, group = locus)
) +
  geom_line(alpha = 0.1, color = "blue3") +
  labs(x = "", y = "z-score") +
  theme(
    text = element_text(size = 20),
    axis.text.x = element_text(angle = 90, vjust = 1)
  )

# and now run that for every cluster and save it
cluster_plots <- list()
for (i in 1:6) {
  a_cluster <- sub_to_cluster[km$cluster == i, ]
  a_cluster <- melt(a_cluster)
  colnames(a_cluster) <- c("locus", "sample", "z.score")

  a_cluster_plot <- ggplot(
    data = a_cluster,
    aes(x = sample, y = z.score, group = locus)
  ) +
    geom_line(alpha = 0.1, color = brewer.pal(6, "Set1")[i]) +
    labs(x = "", y = "z-score") +
    theme_bw() +
    theme(
      text = element_text(size = 20),
      axis.text.x = element_text(angle = 90, vjust = 1)
    ) +
    theme(aspect.ratio = 1) +
    ggtitle(paste("Cluster", i))

  cluster_plots[[i]] <- a_cluster_plot
}

pdf("runs/results_figures/kmeans.pdf", width = 10, height = 16)
do.call(grid.arrange, cluster_plots)
dev.off()