// annotate.fsx (5.63 KiB), authored by David Zimmer
#r "nuget: Argu,6.1.1"
#r "nuget: Deedle"
open System
open Argu
open Deedle
// Maintainer reminder, printed every time the script is evaluated: check whether the
// dotnet sdk reference can be changed back to the latest version (see the linked answer).
printfn "=====\nCheck if dotnet sdk reference can be rechanged to latest: https://stackoverflow.com/a/65934809 \n====="
/// Translates a separator keyword into the separator itself.
/// A literal \t or "\t" does not work as an argument, so the words "tab" and "tabulator"
/// are accepted in its place. Every other input is returned unchanged.
let getSeparator str =
    if str = "tab" || str = "tabulator" then "\t" else str
/// Snapshots of MapMan and Gene Ontology were generated with FATool and genome releases 5.5 (Chlamy) and Araport11 (Arabidopsis)
module Mapping =
/// Reads an annotation file (e.g. an FATool snapshot for araport11 or chlamyJGI_v5.5).
/// The file at `mappingFilePath` is parsed with a header row and `columnSeparator` as delimiter;
/// the rows of the resulting frame are keyed by the values of column `identifierColHeader`.
let readMapping (mappingFilePath:string) (columnSeparator:string) (identifierColHeader:string) :Frame<string,string> =
    let rawFrame = Frame.ReadCsv(mappingFilePath, hasHeaders = true, separators = columnSeparator)
    Frame.indexRows identifierColHeader rawFrame
/// Collects the annotations for one (possibly multi-valued) identifier cell.
/// `identifier` is split at `multipleIdentifierSeparator`; every part is passed through `truncateID`
/// (e.g. Cre10.g123456.t2.1 -> Cre10.g123456) and looked up as a row key of `frame`.
/// Parts without a matching row are reported on stdout and skipped.
/// Returns one inner array per entry of `annotationHeader`, holding one string per identifier that
/// was found, in which every ';' of the annotation is replaced by `multipleAnnotationSeparator`.
let getAnnotationsFromIdentifier (frame:Frame<string,string>) (annotationHeader:string[]) (multipleIdentifierSeparator:string) (multipleAnnotationSeparator:string) (truncateID:string->string) (identifier:string) =
    // row of the annotation frame that belongs to a single identifier, if there is one
    let tryFindRow (rawId:string) : Series<string,string> option =
        let shortId = truncateID rawId
        let row = frame.TryGetRow shortId
        if not row.HasValue then
            printfn "Warning: The following id could not be found within mapping file: %s" shortId
            None
        else
            Some row.Value
    // rows of all identifiers that are present in the annotation frame
    let foundRows =
        identifier.Split([|multipleIdentifierSeparator|], StringSplitOptions.None)
        |> Array.choose tryFindRow
    // annotations of one column (e.g. "GO") for every identifier that was found;
    // splitting at ';' and re-joining amounts to replacing every ';'
    let annotationsOf (annotationType:string) =
        foundRows
        |> Array.map (fun (row:Series<string,string>) -> row.[annotationType].Replace(";", multipleAnnotationSeparator))
    Array.map annotationsOf annotationHeader
/// Builds the annotation cells for one identifier cell: one string per entry of `annotationHeader`,
/// in which the annotations of the individual identifiers are joined with `multipleIdentifierSeparator`.
let getAnnotationRow (frame:Frame<string,string>) (annotationHeader:string[]) multipleIdentifierSeparator multipleAnnotationSeparator truncateID (identifier:string) =
    let annotationsPerColumn =
        getAnnotationsFromIdentifier frame annotationHeader multipleIdentifierSeparator multipleAnnotationSeparator truncateID identifier
    [| for annotationsOfColumn in annotationsPerColumn -> String.concat multipleIdentifierSeparator annotationsOfColumn |]
/// User data is read, annotated and written to a new file
module Data =
/// Reads the text file at `inputPath` and splits every line at `columnSeparator`.
/// Returns one array of cells per line; empty cells are kept.
let getDataFrame (columnSeparator:string) inputPath =
    let delimiters = [|columnSeparator|]
    [| for line in System.IO.File.ReadAllLines(inputPath) -> line.Split(delimiters, System.StringSplitOptions.None) |]
/// Index of the column whose header equals `columnHeader`, i.e. the column that contains the
/// identifier to annotate. The first row of `dataFrame` is taken as the header row.
/// Fails with a descriptive message if `dataFrame` has no rows or no header cell equals `columnHeader`.
let getColIndex (dataFrame:string[][]) columnHeader =
    match Array.tryHead dataFrame with
    | None ->
        // an empty input would otherwise surface as a bare IndexOutOfRangeException
        failwithf "ColumnHeader %s not found: the input contains no header row." columnHeader
    | Some headerRow ->
        match Array.tryFindIndex (fun cell -> cell = columnHeader) headerRow with
        | Some i -> i
        | None -> failwithf "ColumnHeader %s not found." columnHeader
/// Header line of the annotated output: the cells of the first row of `dataFrame` followed by
/// the names in `annotationHeader`, joined with `columnSeparator`.
let getHeader (dataFrame:string[][]) (columnSeparator:string) (annotationHeader:string[]) =
    let headerCells =
        [| yield! dataFrame.[0]
           yield! annotationHeader |]
    String.concat columnSeparator headerCells
/// Reads the file at `inputPath` and returns its lines with the annotation columns appended.
/// The first line is the original header extended by the names in `annotations`. Every further line
/// is the original row followed by one cell per annotation, looked up in `annotationFrame` via the
/// identifier found in column `columnHeader` (double quotes are removed from the identifier first).
/// Rows whose identifier cell is empty or absent get empty annotation cells.
let getAnnotatedLines annotationFrame inputPath (columnSeparator:string) columnHeader (annotations:string[]) (multipleIdentifierSeparator:string) truncateID (multipleAnnotationSeparator:string) =
    let dataFrame = getDataFrame columnSeparator inputPath
    let colIndex = getColIndex dataFrame columnHeader
    let header = getHeader dataFrame columnSeparator annotations
    // cells appended to rows that carry no identifier
    let emptyAnnotations = Array.create annotations.Length ""
    let annotateRow (row:string[]) =
        // rows shorter than the header (e.g. blank lines) have no identifier cell; they are handled
        // like rows with an empty identifier instead of failing with an index error
        let identifier =
            if colIndex < row.Length then row.[colIndex].Replace("\"","") else ""
        let annotationCells =
            if identifier = "" then
                emptyAnnotations
            else
                Mapping.getAnnotationRow annotationFrame annotations multipleIdentifierSeparator multipleAnnotationSeparator truncateID identifier
        Seq.append row annotationCells
        |> String.concat columnSeparator
    dataFrame
    |> Array.tail
    |> Array.map annotateRow
    |> Array.append [|header|]
/// Annotates the file at `inputPath` and writes the resulting lines to `outputPath`.
/// Raises an exception before any file is read if `columnSeparator` is equal to
/// `multipleIdentifierSeparator` or to `multipleAnnotationSeparator`.
let annotateAndWriteData annotationFrame inputPath (columnSeparator:string) columnHeader truncateID (annotations:string[]) (multipleIdentifierSeparator:string) (multipleAnnotationSeparator:string) outputPath =
    // identical separators are rejected up front; the identifier separator is checked first
    if columnSeparator = multipleIdentifierSeparator then
        failwith "WARNING: Column separator is equal to identifier separator"
    elif columnSeparator = multipleAnnotationSeparator then
        failwith "WARNING: Column separator is equal to annotation separator"
    let outputLines =
        getAnnotatedLines annotationFrame inputPath columnSeparator columnHeader annotations multipleIdentifierSeparator truncateID multipleAnnotationSeparator
    System.IO.File.WriteAllLines(outputPath, outputLines)