### Analysis of differential gene expression using sleuth

library(sleuth)
library(ggplot2)

# First we need to specify where the kallisto results are stored.
# If you didn't specify this in your kallisto script, move all kallisto results
# folders (one for each sample) by GUI or the command line into a new folder called
# "kallisto_results".

# Store the base directory of the kallisto results in a variable.
base_dir <- "runs/kallisto_results/"

# Next get the list of sample IDs (the entry names found in base_dir).
### Note: this only works if the only folder content is the kallisto output
# (manual alternative below)
sample_id <- dir(base_dir)
sample_id

# Paths to the kallisto results; both dir() calls list the same directory, so
# the entries correspond to sample_id one by one.
# TRUE is spelled out because the shorthand T is an ordinary variable that can
# be reassigned.
kal_dirs <- dir(base_dir, full.names = TRUE) ## Sleuth requires full paths
kal_dirs


# Alternatively by hand:
# sample_id <- c("mock1", "mock2", "mock3", "treatment1", "treatment2", "treatment3")
# kal_dirs <-  c("kallisto_results//mock1",
#   "kallisto_results//mock2",
#   "kallisto_results//mock3",
#   "kallisto_results//treatment1",
#   "kallisto_results//treatment2",
#   "kallisto_results//treatment3")


# The next step is to load (here: build) an auxiliary "sample to condition" / "s2c"
# table that describes the experimental design and the relationship between
# the kallisto directories and the samples:

# Double-check the order: sample_id and kal_dirs must line up entry by entry,
# and the first three samples must be "mock", the last three "treatment".
sample_id
kal_dirs
condition <- rep(c("mock", "treatment"), each = 3)

# data.frame() and `$<-` silently recycle a shorter vector when the lengths
# divide evenly, which would build a wrong design table without any error.
# Fail early if the number of samples does not match the hard-coded design.
stopifnot(
  length(sample_id) == length(condition),
  length(kal_dirs) == length(sample_id)
)

s2c <- data.frame(sample = sample_id, condition)
s2c

# Now, we must add a column with the kallisto directories for each sample.
# This column must be labeled 'path', otherwise sleuth will throw an error.
# The user should check whether or not the order is correct.
# In this case, the kallisto output is correctly matched with the sample identifiers.

s2c$path <- kal_dirs
s2c

# Build the "sleuth object" and test for differential expression.
# This takes four steps, each of which returns the updated object:

#   (1) read the kallisto results listed in s2c, with ~condition as full model
so <- sleuth_prep(sample_to_covariates = s2c, full_model = ~condition)

#   (2) fit the full model (referred to as "full" in step 4)
so <- sleuth_fit(obj = so)

#   (3) fit the intercept-only model and store it under the name "reduced"
so <- sleuth_fit(obj = so, formula = ~1, fit_name = "reduced")

#   (4) likelihood ratio test of the "reduced" model against the "full" model
so <- sleuth_lrt(obj = so, null_model = "reduced", alt_model = "full")

# Extract the table of likelihood ratio test results for analysis
treatment.vs.mock <- sleuth_results(
  obj = so,
  test = "reduced:full",
  test_type = "lrt"
)


# count significant genes (e.g.); the cutoff is strict (q < 0.01) so that it
# agrees with the significance flag assigned to 'dfr' later in this script
table(treatment.vs.mock$qval < 0.01)
# another look at the data.frame
head(treatment.vs.mock)

# <<< challenge exercises >>> #
# 1. compare the logFC that edgeR calculated to the one we calculated ourselves
# 2. where does the difference come from? (it's in the edgeR manual)

# now we transfer the result to our compilation data.frame 'dfr'
# actually, all we really want is the 'false discovery rate' AKA 'q_value'
#
# The q-value column is renamed to its final name *before* merging. If 'dfr'
# already had a column called 'qval', merge() would suffix both columns with
# .x/.y and a rename after the merge would silently not happen.
# all.x = TRUE keeps every row of 'dfr'; rows without a sleuth result get NA.
dfr <- merge(
  dfr,
  setNames(
    treatment.vs.mock[, c("target_id", "qval")],
    c("target_id", "treatment.vs.mock_q_value")
  ),
  by = "target_id",
  all.x = TRUE
)

# and some clean-up
# remove(sample_id, kal_dirs, condition, base_dir, s2c, so, treatment.vs.mock)

## Volcano Plot
# With this, we have the information about differential expression in the table.
# Now we can make a figure to visualize the result.
# One typical method is a volcano plot.

# we have to store the information about significance for coloring our plot

# Rows without a q-value (NA) are set to 1, i.e. treated as not significant.
# Note that this overwrites the NA in 'dfr' itself, not just for the plot.
dfr$treatment.vs.mock_q_value[is.na(dfr$treatment.vs.mock_q_value)] <- 1

# One vectorised comparison yields the TRUE/FALSE flag for every row
# (TRUE when q < 0.01, FALSE otherwise).
dfr$treatment.vs.mock_significant <- dfr$treatment.vs.mock_q_value < 0.01


# and make a volcano plot using ggplot2
# NOTE: this reuses the name 'treatment.vs.mock', so the sleuth results table
# is replaced by the plot object from here on.
treatment.vs.mock <- ggplot(data = dfr, aes(
  x = log2FC,
  y = -log10(treatment.vs.mock_q_value),
  color = treatment.vs.mock_significant
)) +
  geom_point(size = 1, shape = 20) +
  # significant points in red, all others in black
  scale_color_manual(values = c(
    "FALSE" = "black",
    "TRUE" = "red"
  )) +
  xlab("log2 fold change") +
  # the y aesthetic is the q-value (not the raw p-value), so label it as such
  ylab("-log10 q-value") +
  theme(legend.position = "none")

treatment.vs.mock
# go through line by line and see if you understand what is plotted
# you can also make separate plots with the first two lines, the first three
# lines, etc. to see what each line is actually doing and how it might work

# save the plot
# print() is needed here: a bare object name is only drawn through
# autoprinting, which does not happen when the script is run via source();
# the PDF would then be written without the plot.
pdf("runs/results_figures/volcano_plot_sleuth.pdf", height = 8, width = 8)
print(treatment.vs.mock)
dev.off()

# clean up
remove(treatment.vs.mock)


# one last setup item, we will occasionally need not the transcript but the gene IDs
# the gene (locus) ID is simply the first nine characters of an AGI
locus <- substr(dfr$target_id, 1, 9)
# prepend the locus as the first column of the table
dfr <- cbind(locus, dfr)

# we can store the data in biologist readable format by using write.table
# (tab-separated, no row names, no quoting); TRUE/FALSE are spelled out
# because the shorthands T/F are ordinary variables that can be reassigned
write.table(dfr,
  file = "runs/kallisto_combined/mothertableV3.txt",
  row.names = FALSE, sep = "\t", quote = FALSE
)
# and we can store the data as an R object
save(dfr, file = "runs/kallisto_combined/mothertableV3.Rdata")
