# as always, we need a library for that
library(topGO)

# Restore the objects stored in the combined results file
# (the steps below expect a data frame called `dfr` to exist afterwards).
load("runs/kallisto_combined/mothertableV3.Rdata")

# Read in the GO annotation file.
# For any other species you will need an equivalent file!
geneID2GO_Ath <- readMappings(
  file = "studies/AthalianaReferences/resources/Athid2go.map"
)
head(geneID2GO_Ath, n = 6L)
# We first prepare a factor covering all genes that flags whether a gene
# was significantly upregulated ("1") or not ("0").
up_or_not <- local({
  # `[[` extracts plain vectors (`[` would return one-column data frames)
  log2_fc <- dfr[["log2FC"]]
  q_value <- dfr[["treatment.vs.mock_q_value"]]
  # a gene lacking a fold change or q-value cannot be called upregulated,
  # so it is scored "0" rather than leaking NA into the factor
  is_up <- !is.na(log2_fc) & !is.na(q_value) & log2_fc > 0 & q_value < 0.01
  # fixed levels keep both classes defined even if one of them is empty
  factor(as.integer(is_up), levels = c(0L, 1L))
})
# the names must exactly match the gene IDs used in "Athid2go.map"
names(up_or_not) <- dfr$locus # attach gene IDs
# check
head(up_or_not)
table(up_or_not)

# Bundle the prepared information into a "topGOdata" object, the container
# topGO works with. We focus on the ontology "BP" (biological process).
# allGenes receives the factor marking the upregulated genes; the annotation
# and its type are handed over via gene2GO and annot, respectively.
GOdata_sig <- new(
  "topGOdata",
  ontology = "BP",
  description = "treatment.vs.mock_up",
  allGenes = up_or_not,
  gene2GO = geneID2GO_Ath,
  annot = annFUN.gene2GO,
  nodeSize = 10
)

# now we run the statistical test; the classic Fisher's exact test is chosen
resultsGOfisher <- runTest(
  GOdata_sig,
  algorithm = "classic",
  statistic = "fisher"
)
# this extracts a sorted summary table covering every scored GO term
tableGOresults <- GenTable(GOdata_sig,
  classicFisher = resultsGOfisher,
  topNodes = length(score(resultsGOfisher))
)
# The column "classicFisher" holds the p-values as formatted text (rounded,
# and very small values are written like "< 1e-30"), so converting it back to
# numbers loses precision and turns the strongest hits into NA. The q-values
# are therefore computed from the unformatted p-values stored in the test
# result, matched to the table rows by GO ID.
tableGOresults$q_value <- p.adjust(
  unname(score(resultsGOfisher)[tableGOresults$GO.ID]),
  method = "BY"
)
# BY is a different method for calculating FDR that is more tolerant of
# dependency between tests.
# filter to significant; which() guarantees that a missing q-value can never
# produce an all-NA row in the result
tableGOresults <- tableGOresults[which(tableGOresults$q_value < 0.05), ]

# people always ask which genes are behind the GO terms
# topGO provides a function to find them
genesInTerm(GOdata_sig, "GO:0050896")
# now let's run this for all our top terms,
# broken down by whether the genes were significantly up or not
# we pre-cache a character vector of the significant gene IDs;
# which() skips genes whose fold change or q-value is NA, so no NA entry
# ends up among the IDs
sig_genes <- as.character(dfr$locus)[
  which(dfr[["log2FC"]] > 0 & dfr[["treatment.vs.mock_q_value"]] < 0.01)
]
# Mini-function: collect the gene IDs annotated to one GO term and organize
# them by significance.
#
# ontology:  the object handed to genesInTerm() (here: the topGOdata object)
# whichGO:   a single GO ID, e.g. "GO:0050896"
# sig_genes: vector of the gene IDs regarded as significant
#
# Returns a named character vector of length 2: "ns_ids" holds the
# comma-separated IDs that are not in sig_genes, "sig_ids" those that are.
# A group without any gene is returned as an empty string.
genes_in_term_by_sig <- function(ontology, whichGO, sig_genes) {
  all_ids <- genesInTerm(ontology, whichGO)[[1]]
  # split on a factor with fixed levels so that both groups always exist,
  # even when every gene (or no gene) of the term is significant
  is_sig <- factor(all_ids %in% sig_genes, levels = c(FALSE, TRUE))
  by_sig_ids <- split(all_ids, is_sig)
  # vector to comma-separated string
  by_sig_ids <- vapply(by_sig_ids, paste, character(1), collapse = ",")
  names(by_sig_ids) <- c("ns_ids", "sig_ids") # set names
  by_sig_ids
}
# test the function
genes_in_term_by_sig(GOdata_sig, "GO:0050896", sig_genes)
# use the function on every GO term left in the table
# vapply() always yields a 2-row character matrix, even when the table is
# empty; sapply() would return an empty list in that case and t() would fail
tableGOresults <- cbind(
  tableGOresults,
  t(vapply(
    tableGOresults$GO.ID,
    function(go_id) genes_in_term_by_sig(GOdata_sig, go_id, sig_genes),
    FUN.VALUE = c(ns_ids = "", sig_ids = "")
  ))
)
tail(tableGOresults) # tail only because the gene lists were shorter

# Our final steps: export the table as tab-separated text that biologists
# can read ...
write.table(
  x = tableGOresults,
  file = "runs/results_figures/GO_treatment_vs_mock_up_Fisher.txt",
  row.names = FALSE,
  sep = "\t"
)

# ... and export a graphical representation.
# If you want to see more than 20 GO terms, change firstSigNodes.
printGraph(
  GOdata_sig,
  resultsGOfisher,
  firstSigNodes = 20,
  useInfo = "all",
  fn.prefix = "runs/results_figures/GO_treatment_vs_mock_up",
  pdfSW = TRUE
)

# <<< challenge assignments >>> #
# 1. perform GO enrichment on down-regulated genes

# clean up: drop the objects this analysis created from the workspace
rm(
  list = c(
    "geneID2GO_Ath", "GOdata_sig", "resultsGOfisher", "sig_genes",
    "tableGOresults", "up_or_not"
  )
)
