// annotate.fsx
#r "nuget: Argu,6.1.1"
#r "nuget: Deedle"

open System
open Argu
open Deedle 

// Maintenance reminder that is printed on every run of the script.
"=====\nCheck if dotnet sdk reference can be rechanged to latest: https://stackoverflow.com/a/65934809 \n====="
|> printfn "%s"

/// Translates the keywords "tab" and "tabulator" into a tab character, because
/// \t or "\t" does not work as a command line argument.
/// Any other input is returned unchanged and used as the separator itself.
let getSeparator str = 
    match str with
    | "tab" | "tabulator" -> "\t"
    | literalSeparator -> literalSeparator

/// Snapshots of MapMan and Gene Ontology were generated with FATool and genome releases 5.5 (Chlamy) and Araport11 (Arabidopsis)
module Mapping =
    
    /// Reads an annotation file (e.g. a FA tool snapshot for araport11 or chlamyJGI_v5.5)
    /// that has a header line and uses columnSeparator between columns.
    /// The column named identifierColHeader becomes the row index of the returned frame.
    let readMapping (mappingFilePath:string) (columnSeparator:string) (identifierColHeader:string) :Frame<string,string> = 
        let rawFrame = Frame.ReadCsv(mappingFilePath,hasHeaders=true,separators = columnSeparator)
        Frame.indexRows identifierColHeader rawFrame
 
    /// Looks up the annotations for one identifier cell, which may hold several identifiers.
    ///
    /// frame: annotation table whose row keys are the identifiers.
    /// annotationHeader: names of the annotation columns to extract, e.g. [|"GO";"Synonym"|].
    /// multipleIdentifierSeparator: separator between several identifiers inside `identifier`.
    /// multipleAnnotationSeparator: separator written between the terms of one annotation cell
    ///   (the terms are expected to be separated by ';' in the annotation table).
    /// truncateID: applied to every single identifier before the lookup (e.g. Cre10.g123456.t2.1 -> Cre10.g123456).
    /// identifier: content of the identifier cell.
    ///
    /// Returns one array per entry of annotationHeader. Each of these arrays holds one string per
    /// identifier that was found in the frame; identifiers that are not found are reported on
    /// stdout and skipped. An annotation cell without a value yields an empty string.
    let getAnnotationsFromIdentifier (frame:Frame<string,string>) (annotationHeader:string[]) (multipleIdentifierSeparator:string) (multipleAnnotationSeparator:string) (truncateID:string->string) (identifier:string) =
        
        // single identifiers that should be mapped
        let identifiers = identifier.Split([|multipleIdentifierSeparator|],StringSplitOptions.None)
            
        // rows of the annotation frame for all identifiers that are present in it
        let mappings :Series<string,string> []= 
            identifiers 
            |> Array.choose (fun ident -> 
                let truncId = truncateID ident
                let row = frame.TryGetRow truncId
                if row.HasValue then 
                    Some row.Value
                else
                    printfn "Warning: The following id could not be found within mapping file: %s" truncId
                    None
                )

        // all annotations that should be used
        annotationHeader //["GO","Synonym"]
        |> Array.map (fun annotationType -> 
            mappings //[[GO => GO:006; Synonym => Q0WV96];[GO => GO:001; Synonym => Q01337]]
            |> Array.map (fun mapping -> 
                let cell = mapping.TryGet(annotationType)
                if cell.HasValue then
                    cell.Value.Split(';')
                    |> String.concat multipleAnnotationSeparator
                elif Seq.contains annotationType mapping.Keys then
                    // the column exists but this identifier has no value in it (empty cell):
                    // report "no annotation" instead of aborting the whole run
                    ""
                else
                    // unknown annotation column: keep raising the original lookup error
                    mapping.[annotationType]
                )
            )

    /// Builds the annotation cells for one identifier cell: one string per entry of annotationHeader.
    /// The annotations of the individual identifiers are joined with multipleIdentifierSeparator.
    let getAnnotationRow (frame:Frame<string,string>) (annotationHeader:string[]) multipleIdentifierSeparator multipleAnnotationSeparator truncateID (identifier:string) = 
        let annotationsPerColumn =
            getAnnotationsFromIdentifier frame annotationHeader multipleIdentifierSeparator multipleAnnotationSeparator truncateID identifier
        annotationsPerColumn
        |> Array.map (fun annotationsPerIdentifier -> String.concat multipleIdentifierSeparator annotationsPerIdentifier)


/// User data is read, annotated and written to a new file
module Data = 
    
    /// Reads all lines of the file at inputPath and splits every line at columnSeparator.
    /// Empty fields are kept, so the result is one string array per line of the file.
    let getDataFrame (columnSeparator:string) inputPath = 
        let separators = [|columnSeparator|]
        [| for line in System.IO.File.ReadAllLines(inputPath) ->
            line.Split(separators,System.StringSplitOptions.None) |]

    /// Index of the column that contains the identifier to annotate.
    ///
    /// dataFrame: rows of the input file; the first row is searched as the header row.
    /// columnHeader: header text to look for (exact match).
    ///
    /// Returns the zero-based index of the first header cell that equals columnHeader.
    /// Fails with an exception if the input has no rows at all or if no header cell matches.
    let getColIndex (dataFrame:string[][]) columnHeader= 
        // an empty input has no header row; fail with a clear message instead of an index error
        if Array.isEmpty dataFrame then
            failwithf "ColumnHeader %s not found: the input contains no header row." columnHeader
        match Array.tryFindIndex (fun headerCell -> headerCell = columnHeader) dataFrame.[0] with
        | Some index -> index
        | None -> failwithf "ColumnHeader %s not found." columnHeader
    
    /// Builds the header line of the output: the first row of dataFrame followed by the
    /// names of the annotation columns, all joined with columnSeparator.
    let getHeader (dataFrame:string[][]) (columnSeparator:string) (annotationHeader:string[]) = 
        let originalHeader = dataFrame.[0]
        let extendedHeader = Array.append originalHeader annotationHeader
        System.String.Join(columnSeparator, extendedHeader)
   
    /// Reads the input file and returns its lines with the annotation columns attached at the end of each line.
    ///
    /// annotationFrame: annotation table indexed by identifier.
    /// inputPath: file to annotate; its first line is the header.
    /// columnSeparator: separator between the columns of the input (also used for the output).
    /// columnHeader: header of the column that contains the identifiers.
    /// annotations: names of the annotation columns to append.
    /// multipleIdentifierSeparator / multipleAnnotationSeparator / truncateID: passed on to Mapping.getAnnotationRow.
    ///
    /// Returns the extended header line followed by one extended line per data row.
    /// Rows with an empty identifier get empty annotation cells.
    /// Fails with a descriptive error if a data row has too few columns to contain the identifier column.
    let getAnnotatedLines annotationFrame inputPath (columnSeparator:string) columnHeader (annotations:string[]) (multipleIdentifierSeparator:string) truncateID (multipleAnnotationSeparator:string) =       
        let dataFrame = getDataFrame columnSeparator inputPath           
        let colIndex = getColIndex dataFrame columnHeader     
        let header = getHeader dataFrame columnSeparator annotations
        dataFrame
        |> Array.tail
        |> Array.mapi (fun i row -> 
            // blank or ragged lines would otherwise end in a bare index error;
            // i is zero-based over the data rows, the header is line 1 of the file
            if colIndex >= row.Length then
                failwithf "Line %i of %s has only %i column(s), but the identifier column %s is at index %i." (i + 2) inputPath row.Length columnHeader colIndex
            // quotes around the identifier are not part of the identifier
            let identifier = row.[colIndex].Replace("\"","")
            let annotationCells = 
                if identifier = "" then 
                    Array.create annotations.Length ""
                else 
                    Mapping.getAnnotationRow annotationFrame annotations multipleIdentifierSeparator multipleAnnotationSeparator truncateID identifier
            Seq.append row annotationCells
            |> String.concat columnSeparator
            )
        |> Array.append [|header|]

    /// Annotates the file at inputPath (see getAnnotatedLines) and writes the result to outputPath.
    /// Aborts with an exception before any file is read if the column separator is identical to
    /// the identifier separator or to the annotation separator.
    let annotateAndWriteData annotationFrame inputPath (columnSeparator:string) columnHeader truncateID (annotations:string[]) (multipleIdentifierSeparator:string) (multipleAnnotationSeparator:string) outputPath =
        // identical separators are rejected up front
        if columnSeparator = multipleIdentifierSeparator then
            failwith "WARNING: Column separator is equal to identifier separator"
        if columnSeparator = multipleAnnotationSeparator then
            failwith "WARNING: Column separator is equal to annotation separator"
        let annotatedRows = 
            getAnnotatedLines
                annotationFrame inputPath columnSeparator columnHeader annotations
                multipleIdentifierSeparator truncateID multipleAnnotationSeparator
        System.IO.File.WriteAllLines(outputPath,annotatedRows)