# we'll start with the mock1 results, as an example:
# read the tab-separated kallisto abundance table into a data.frame
first <- read.delim("runs/kallisto_results/mock1/abundance.tsv")
# and look at the first lines of the resulting data.frame
head(first)
# and check the dimensions, we expect one row per target_id
# (these are transcript IDs, not gene IDs) and 5 columns
dim(first)
# we need new names, so that we know est_counts and tpm are from mock1
# NOTE: this renames by position, so it assumes est_counts and tpm are
# columns 4 and 5 of the table
colnames(first)[4:5] <- c("mock1_estcounts", "mock1_tpm")
# check that the last two column names changed
head(first)

# OK, for the second file, mock2, we'll do the same
second <- read.delim("runs/kallisto_results/mock2/abundance.tsv")
# we again need new names, so that we know est_counts and tpm are from mock2
# NOTE: renaming by position only works while the table still has all of its
# original columns, i.e. it must happen BEFORE the subsetting below
colnames(second)[4:5] <- c("mock2_estcounts", "mock2_tpm")
# and look at the first lines of the resulting data.frame
head(second)
# some of the columns (length, eff_length) we only need once
# so we'll keep just the IDs (for merging) and the abundance
# (column 1 is target_id, columns 4 and 5 are the two columns renamed above)
second <- second[, c(1, 4, 5)]
# and look again!
head(second)

# now we'll merge the tables and store the result in a data.frame called 'dfr'
# merge combines its first two arguments by the column with the name
# specified with 'by'. Setting all.x = TRUE makes sure that the original
# data.frame ('first') stays complete: rows without a matching target_id
# in 'second' are kept and filled with NA.
# We write TRUE rather than T, because T is an ordinary variable that can be
# overwritten, while TRUE is a reserved word.
dfr <- merge(first, second, by = "target_id", all.x = TRUE)
head(dfr)

# now onto "mock3", but we don't want to retype this every time
# so we'll change a few things, 1st we'll save the sample_name
# so we can type it once and use it more
sample_name <- "mock3"
# for instance, we can use the sample_name to build the file name
paste0("runs/kallisto_results/", sample_name, "/abundance.tsv")
# and now actually import the file
newfile <- read.delim(paste0(
  "runs/kallisto_results/",
  sample_name, "/abundance.tsv"
))
head(newfile)
# 2nd, the mock2 code above is "hidden-bug prone": imagine (or test) what
# happens if we ran these two lines out of order, or ran them twice
# ---
# colnames(second)[4:5] <- c("mock2_estcounts", "mock2_tpm")
# second <- second[, c(1, 4, 5)]
# ---
# so we'll use a more robust version from now on
# we'll subset the table by name instead of number
newfile <- newfile[, c("target_id", "est_counts", "tpm")]
# we'll check the old names when renaming the columns, so a rename can only
# ever hit the column it was meant for (and does nothing if run twice)
colnames(newfile)[names(newfile) == "est_counts"] <- paste0(sample_name, "_estcounts")
colnames(newfile)[names(newfile) == "tpm"] <- paste0(sample_name, "_tpm")
# merging remains the same (TRUE spelled out; T is a reassignable variable)
dfr <- merge(dfr, newfile, by = "target_id", all.x = TRUE)
head(dfr)

# for the remaining three samples all we have to do is change the sample name;
# every step below is identical to the mock3 import:
# read, keep ID + abundance columns, rename by old name, merge into dfr
sample_name <- "treatment1"
newfile <- read.delim(paste0("runs/kallisto_results/", sample_name, "/abundance.tsv"))
newfile <- newfile[, c("target_id", "est_counts", "tpm")]
colnames(newfile)[names(newfile) == "est_counts"] <- paste0(sample_name, "_estcounts")
colnames(newfile)[names(newfile) == "tpm"] <- paste0(sample_name, "_tpm")
# all.x = TRUE keeps every row already in dfr (TRUE, not the reassignable T)
dfr <- merge(dfr, newfile, by = "target_id", all.x = TRUE)

sample_name <- "treatment2"
newfile <- read.delim(paste0("runs/kallisto_results/", sample_name, "/abundance.tsv"))
newfile <- newfile[, c("target_id", "est_counts", "tpm")]
colnames(newfile)[names(newfile) == "est_counts"] <- paste0(sample_name, "_estcounts")
colnames(newfile)[names(newfile) == "tpm"] <- paste0(sample_name, "_tpm")
dfr <- merge(dfr, newfile, by = "target_id", all.x = TRUE)

sample_name <- "treatment3"
newfile <- read.delim(paste0("runs/kallisto_results/", sample_name, "/abundance.tsv"))
newfile <- newfile[, c("target_id", "est_counts", "tpm")]
colnames(newfile)[names(newfile) == "est_counts"] <- paste0(sample_name, "_estcounts")
colnames(newfile)[names(newfile) == "tpm"] <- paste0(sample_name, "_tpm")
dfr <- merge(dfr, newfile, by = "target_id", all.x = TRUE)
# and we look at the result
head(dfr)

# now some ordering of the result
# we are going to use the function 'grep' for this, which is
# a search function that returns the index(es) where a pattern was found.
# syntax: grep(pattern_to_look_for, item_to_search_in)
grep("target", colnames(dfr))
grep("tpm", colnames(dfr))
# a pattern that is not found returns an empty result (integer(0))
grep("unicorn", colnames(dfr))
# for the real ordering we anchor the patterns: '^' means "starts here" and
# '$' means "ends here". Without anchors a sample called e.g. "hybrid1" would
# also match "id", and its columns would be picked up twice.
new_order <- c(
  # we want the columns with general information
  # (target_id, length, eff_length)
  grep("^target_id$|length$", colnames(dfr)),
  # followed by the columns ending in "_estcounts"
  grep("_estcounts$", colnames(dfr)),
  # followed by the columns ending in "_tpm"
  grep("_tpm$", colnames(dfr))
)
# double check that was the order we wanted
new_order
colnames(dfr)[new_order]
# and now change the table
dfr <- dfr[, new_order]
head(dfr)
dim(dfr)
# if it's easier for you, remember you could have done this more manually
# dfr <- dfr[, c(1, 2, 3, 4, 6, 8, 10, 12, 14, 5, 7, 9, 11, 13, 15)]

# now export the table in biologist readable format
# create the output directory (and any missing parents); showWarnings = FALSE
# keeps this quiet when the directory already exists
dir.create(
  path = "runs/kallisto_combined/",
  recursive = TRUE, showWarnings = FALSE
)
# tab-separated, without row names and without quotes around text
# (TRUE/FALSE are spelled out; T and F are ordinary, reassignable variables)
write.table(dfr,
  file = "runs/kallisto_combined/mothertableV1.txt",
  row.names = FALSE, sep = "\t", quote = FALSE
)

### importing data 2 ###
# that was fun, and you want to do it again, right?
# OK, technically you can skip this part, your data is imported.
# If, however, you want to import dozens or hundreds of samples,
# you will want to use loops. This example should return the
# exact same result as above.

# find and save names of files to import
# (each entry of runs/kallisto_results is used as a sample name)
files <- dir("runs/kallisto_results")
# stop early with a clear message if there is nothing to import; otherwise
# files[1] would be NA and we would try to open ".../NA/abundance.tsv"
if (length(files) == 0) {
  stop("no samples found in runs/kallisto_results", call. = FALSE)
}
# the first file gets special handling,
# since we want to keep the "length" related columns
sample_name <- files[1]
dfr <- read.delim(paste0("runs/kallisto_results/", sample_name, "/abundance.tsv"))
colnames(dfr)[names(dfr) == "est_counts"] <- paste0(sample_name, "_estcounts")
colnames(dfr)[names(dfr) == "tpm"] <- paste0(sample_name, "_tpm")
# then we run the code in the block for all but the first file
# (files[-1] means "everything except element 1")

for (sample_name in files[-1]) {
  newfile <- read.delim(paste0(
    "runs/kallisto_results/",
    sample_name, "/abundance.tsv"
  ))
  # keep only the ID (for merging) and the two abundance columns
  newfile <- newfile[, c("target_id", "est_counts", "tpm")]
  # rename by old name, so each column is tagged with its sample
  colnames(newfile)[names(newfile) == "est_counts"] <- paste0(
    sample_name,
    "_estcounts"
  )
  colnames(newfile)[names(newfile) == "tpm"] <- paste0(sample_name, "_tpm")
  # all.x = TRUE keeps every row already in dfr (TRUE, not the reassignable T)
  dfr <- merge(dfr, newfile, by = "target_id", all.x = TRUE)
} # the right curly bracket closes our loop

# reorder the columns: general information first, then all estimated counts,
# then all tpm values. The patterns are anchored ('^' = start, '$' = end of
# the column name) because the sample names come from the directory listing:
# an unanchored "id" or "tpm" could also match inside a sample name and
# select the same column twice.
new_order <- c(
  grep("^target_id$|length$", colnames(dfr)),
  grep("_estcounts$", colnames(dfr)),
  grep("_tpm$", colnames(dfr))
)

dfr <- dfr[, new_order]

# the loop makes for less code that is easier to modify and maintain,
# but it's more cryptic at first. For your projects, it's your call!
# we will be emphasizing basic and readable code for the workshop

# final setup step: besides the transcript IDs we will sometimes need gene IDs
# the gene (locus) ID is just the first nine characters of the AGI
locus <- substr(dfr[["target_id"]], start = 1, stop = 9)
# put the new 'locus' column in front of all existing columns
dfr <- data.frame(locus, dfr, check.names = FALSE)
