Skip to content
Snippets Groups Projects
Select Git revision
  • b9409528afe303f62de781e54ad2a417fce14426
  • main default protected
  • cqc
3 results

kallisto_collect.R

Blame
  • kallisto_collect.R 3.15 KiB
    #!/usr/bin/env Rscript
    
    ################################################
    #### Load required library
    ################################################
    
    if(!"BiocManager" %in% row.names(installed.packages())){install.packages('BiocManager')}
    
    if(!"sleuth" %in% row.names(installed.packages())){
      BiocManager::install('pachterlab/sleuth')
    }
    if(!"tidyverse" %in% row.names(installed.packages())){install.packages("tidyverse")}
    if(!"openxlsx" %in% row.names(installed.packages())){install.packages("openxlsx")}
    if(!"jsonlite" %in% row.names(installed.packages())){install.packages("jsonlite")}
    
    ################################################
    #### Read arguments from CLI
    ################################################
    
    args <- commandArgs(trailingOnly = T)
    
    in_kallisto_results <- args[1]
    in_metadata_file <- args[2]
    in_metadata_sample <- args[3]
    in_metadata_factor <- args[4]
    out_folder <- args[5]
    
    
    ################################################
    #### If it does not exist, create out dir
    ################################################
    
    dir.create(out_folder, recursive = T, showWarnings = F)
    
    ################################################
    #### Read ISA sample metadata
    ################################################
    
    samples <- read.table(file = in_metadata_file, sep = "\t")
    
    ################################################
    #### Read Kallisto results
    ################################################
    
    base_dir <- in_kallisto_results
    
    # A list of paths to the kallisto results indexed by the sample IDs is collated with
    kal_dirs <- dir(base_dir, full.names = T) ## Sleuth requires full paths
    
    s2c <- samples[order(samples[[in_metadata_sample]]), c(in_metadata_sample, in_metadata_factor, "Data.File.Name")]
    
    # For kallisto / sleuth: 's2c' (sample_to_covariates) must contain a column named 'sample'
    colnames(s2c)[1:2] <- c("sample", "condition")
    
    s2c$out_name <- gsub(".fastq.+", "", s2c$Data.File.Name)
    path_df <- data.frame(path = kal_dirs, out_name = gsub(".+/", "", kal_dirs))
    s2c <- merge(s2c, path_df, by = "out_name")
    
    # Build a sleuth object
    
    so <- sleuth_prep(s2c, full_model = ~condition, num_cores = 1)
    
    save(so, file = paste(out_folder, "kallisto_sleuthObject.RData", sep = "/"))
    
    
    ################################################
    #### Extract expression tables
    ################################################
    
    ## as data.frame
    expression_data <- kallisto_table(so)
    
    ## write to file
    write.csv(expression_data, paste(out_folder, "/kallisto_df.csv", sep = "/"), row.names = F)
    
    ## as tpm matrix (gene x sample)
    tpm_table <- pivot_wider(expression_data, id_cols = target_id, names_from = sample, values_from = tpm)
    
    ## write to file
    write.csv(tpm_table, paste(out_folder, "/kallisto_tpmMatrix.csv", sep = "/"), row.names = F)
    
    ################################################
    #### Summarize mapping stats
    ################################################
    
    mapping_stats <- c()
    for (i in dir(kal_dirs, pattern = ".json", full.names = T))
    {
      id <- unlist(strsplit(i, split = "/"))
    
      z <- data.frame(ID = id[length(id) - 1], read_json(i, simplifyVector = T))
      mapping_stats <- rbind(mapping_stats, z)
    }
    
    write.csv(mapping_stats, paste(out_folder, "/kallisto_mappingStats.csv", sep = "/"), row.names = F)