diff --git a/workflows/kallisto_collect/kallisto_collect.R b/workflows/kallisto_collect/kallisto_collect.R index 1386c9f0d795bd872a01d9063c190662fbaa8b30..63744dbb47f79c1e330b9254fbb17771a322fca5 100644 --- a/workflows/kallisto_collect/kallisto_collect.R +++ b/workflows/kallisto_collect/kallisto_collect.R @@ -13,46 +13,59 @@ library("jsonlite") args <- commandArgs(trailingOnly = T) -in_kallisto_results <- args[1] -in_metadata_file <- args[2] -in_metadata_sample <- args[3] -in_metadata_factor <- args[4] -out_folder <- args[5] +inKallistoResults <- args[1] +inMetadataFile <- args[2] +inMetadataSample <- args[3] +inMetadataFactorList <- args[4] +outFolder <- args[5] + +# inKallistoResults <- "../../runs/kallisto/kallisto_results" +# inMetadataFile <- "../../runs/isaSampleToRawDataSeq/rnaseq-samples.csv" +# inMetadataSample <- "Input [Source Name]" +# inMetadataFactorList <- list("Factor [Photosynthesis mode]") +# inMetadataDataCol <- "Output [Data]" +# outFolder <- "." ################################################ #### If it does not exist, create out dir ################################################ -dir.create(out_folder, recursive = T, showWarnings = F) +dir.create(outFolder, recursive = T, showWarnings = F) ################################################ #### Read ISA sample metadata ################################################ -samples <- read.table(file = in_metadata_file, sep = "\t", header = TRUE) +samples <- read.csv(file = inMetadataFile, check.names = FALSE) ################################################ #### Read Kallisto results ################################################ -base_dir <- in_kallisto_results +base_dir <- inKallistoResults # A list of paths to the kallisto results indexed by the sample IDs is collated with kal_dirs <- dir(base_dir, full.names = T) ## Sleuth requires full paths -s2c <- samples[order(samples[[in_metadata_sample]]), c(in_metadata_sample, in_metadata_factor, "Data.File.Name")] +s2c <- samples[order(samples[[inMetadataSample]]), c(inMetadataSample, unlist(inMetadataFactorList), inMetadataDataCol)] # For kallisto / sleuth: 's2c' (sample_to_covariates) must contain a column named 'sample' -colnames(s2c)[1:2] <- c("sample", "condition") - -s2c$out_name <- gsub(".fastq.+", "", s2c$Data.File.Name) -path_df <- data.frame(path = kal_dirs, out_name = gsub(".+/", "", kal_dirs)) -s2c <- merge(s2c, path_df, by = "out_name") +colnames(s2c)[1] <- c("sample") +path_df <- data.frame(path = kal_dirs, sample = basename(kal_dirs)) +s2c <- merge(s2c, path_df, by = "sample") # Build a sleuth object -so <- sleuth_prep(s2c, full_model = ~condition, num_cores = 1) -save(so, file = paste(out_folder, "kallisto_sleuthObject.RData", sep = "/")) +factors <- sapply(inMetadataFactorList, function(x) x[[1]]) +## Annoying workaround to prevent formula error with special chars in column headers +colnames(s2c) <- make.names(colnames(s2c)) +factors <- make.names(factors) + +design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + "))) + +so <- sleuth_prep(s2c, full_model = design_formula) + +save(so, file = file.path(outFolder, "kallisto_sleuthObject.RData")) ################################################ #### Extract expression tables @@ -62,14 +75,13 @@ save(so, file = paste(out_folder, "kallisto_sleuthObject.RData", sep = "/")) expression_data <- kallisto_table(so) ## write to file -write.csv(expression_data, paste(out_folder, "/kallisto_df.csv", sep = "/"), row.names = F) +write.csv(expression_data, paste(outFolder, "/kallisto_df.csv", sep = "/"), row.names = F) ## as tpm matrix (gene x sample) -# Replacing pivot_wider() with base R operations tpm_table <- reshape(expression_data, idvar = "target_id", timevar = "sample", direction = "wide", v.names = "tpm") # Write to file -write.csv(tpm_table, paste(out_folder, "/kallisto_tpmMatrix.csv", sep = "/"), row.names = F) +write.csv(tpm_table, file.path(outFolder, "kallisto_tpmMatrix.csv"), row.names = F) ################################################ #### Summarize mapping stats @@ -82,4 +94,4 @@ for (i in dir(kal_dirs, pattern = ".json", full.names = T)) { mapping_stats <- rbind(mapping_stats, z) } -write.csv(mapping_stats, paste(out_folder, "/kallisto_mappingStats.csv", sep = "/"), row.names = F) +write.csv(mapping_stats, file.path(outFolder, "kallisto_mappingStats.csv"), row.names = F)