########################
#### To be replaced by CWL routine
########################

ARC_root="~/Hackathon_ARCexample_rnaseq/"
setwd(paste0(ARC_root, 'workflows/'))

########################

########################
# Collect kallisto data
########################


# ### sleuth installation
# 
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install()
# BiocManager::install("devtools")    # only if devtools not yet installed
# BiocManager::install("pachterlab/sleuth")

library(sleuth)
library(tidyverse)
library(jsonlite)
library(openxlsx)

## read experimental metadata from isa.assay wb

isa_assay <- paste0(ARC_root, 'assays/Talinum_RNASeq_minimal/assay.isa.xlsx')

assay_data <- merge(readWorkbook(isa_assay, "1SPL01_plants", startRow = 2), 
                    readWorkbook(isa_assay, "3ASY01_RNASeq", startRow = 2), 
                    by = "Sample.Name"
                    )

## remove empty cols
assay_data <- assay_data[, !apply(assay_data, 2, function(x){sum(is.na(x)) == nrow(assay_data)})]  

# Pointer to kallisto results folder
base_dir <- paste0(ARC_root, '/runs/01_kallisto_results/')

# A list of paths to the kallisto results indexed by the sample IDs is collated with
kal_dirs <- dir(base_dir, full.names = T) ## Sleuth requires full paths

s2c <- assay_data[order(assay_data$Sample.Name), c('Sample.Name', "Characteristics.[Photosynthesis.mode]")]
# For kallisto / sleuth: 's2c' (sample_to_covariates) must contain a column named 'sample'
colnames(s2c) <- c("sample", "Photosynthesis.mode")

s2c$path <- kal_dirs
s2c <- s2c[order(s2c$sample), ]

# Build a sleuth object
so <- sleuth_prep(s2c, ~Photosynthesis.mode)
save(so, file = paste0(ARC_root, 'runs/03_kallisto_sleuthObject.RData'))

# Extract expression tables

## as data.frame
expression_data <- kallisto_table(so)
write.csv(expression_data, paste0(ARC_root, 'runs/03_kallisto_df.csv'), row.names = F)

## as tpm matrix (gene x sample)
tpm_table <- pivot_wider(expression_data, id_cols = target_id, names_from = sample, values_from = tpm)
write.csv(tpm_table, paste0(ARC_root, 'runs/03_kallisto_tpmMatrix.csv'), row.names = F)


# Summarize mapping stats

mapping_stats <- c()
for(i in dir(kal_dirs, pattern = '.json', full.names = T))
{
  id <- unlist(strsplit(i, split = '/'))
  
  z <- data.frame(ID = id[length(id) - 1], read_json(i, simplifyVector = T))
  mapping_stats <- rbind(mapping_stats, z)
}

write.csv(mapping_stats, paste0(ARC_root, 'runs/03_kallisto_mappingStats.csv'), row.names = F)