diff --git a/workflows/EvalTurboID/EvalTurboID.fsx b/workflows/EvalTurboID/EvalTurboID.fsx index f61208f09cb38087816244f4e458e2edad99dd0f..c2b580c9179c84f94712c4b7190ab7ea44138da7 100644 --- a/workflows/EvalTurboID/EvalTurboID.fsx +++ b/workflows/EvalTurboID/EvalTurboID.fsx @@ -26,7 +26,8 @@ open Argu open SamPlotting #load "scripts/Plotting.fsx" open Plotting - +#load "scripts/MetaxaTemp.fsx" +open MetaxaTemp module Arguments = type WorkflowArgs = | [<Unique>] [<Mandatory>] [<AltCommandLine("-i")>] InputPath of path:string @@ -165,7 +166,7 @@ Data.annotateAndWriteData ";" "|" outPathA - +printfn "Finished Annotation" //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////// @@ -175,7 +176,7 @@ Data.annotateAndWriteData //////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// let doAnalysis (allDataForAnalysis:Frame<string,string>) dataForAnalysis (allControlSamples:QNode []) baitGroup (allBaitSamples:QNode []) = //controlsToUse normGlobal normByBait batch conditionGenotype control1 control2 inP proteinID inAssay = - + printfn "Start Analysis: %s vs Control" baitGroup let allSamples = [|allBaitSamples;allControlSamples|] |> Array.concat let allColsOfIBait = @@ -193,19 +194,22 @@ let doAnalysis (allDataForAnalysis:Frame<string,string>) dataForAnalysis (allCon ///////////////////////////////////////////////////// // Normalized Data ///////////////////////////////////////////////////// - let normalizedAndTransposed = + printfn "Normalize Data" + /// + + let normalizedAndTransposed, abundanceChart, corrFChart, corrAbundanceChart = // if normGlobal then - let fst = + let fst,ai,cor,aCorr = quantValues |> Frame.sliceCols allColsOfIBait |> Frame.transpose - |> Stats.Normalization.Global.MedianOfRatios.medianOfRatiosWide - let snd = + |> Global.MedianOfRatios.medianOfRatiosWide + let snd,aiCon,corCon,aCorrCon = quantValues |> Frame.sliceCols allColsOfIControl |> Frame.transpose - |> Stats.Normalization.Global.MedianOfRatios.medianOfRatiosWide - Frame.merge fst snd + |> Global.MedianOfRatios.medianOfRatiosWide + Frame.merge fst snd, Chart.combine[ai;aiCon], Chart.combine[cor;corCon],Chart.combine[aCorr;aCorrCon] // else // quantValuesOfInterest // |> Frame.transpose @@ -213,6 +217,7 @@ let doAnalysis (allDataForAnalysis:Frame<string,string>) dataForAnalysis (allCon ///////////////////////////////////////////////////// // Imputation ///////////////////////////////////////////////////// + printfn "Impute Data" let descByISa :Frame<string,string> = [ "BioRepGroup", @@ -269,12 +274,12 @@ let doAnalysis (allDataForAnalysis:Frame<string,string>) dataForAnalysis (allCon // Then, we perform an imputation by simple gaussian sampling, if no value was recorded we sample out of the lowest 0.1 quantile of the data set. let imputeSamplingCondition = - Stats.Imputation.SimpleSampling.computeColumnWiseUsing Distributions.Continuous.Normal.Sample 3 (Some 0.05) (fitCV) + Imputation.SimpleSampling.computeColumnWiseUsing Distributions.Continuous.Normal.Sample 3 (Some 0.01) (fitCV) (dataSorted |> Frame.filterRows (fun k s -> allColsOfIBait |> Array.contains (snd k))) |> Frame.mapValues log2 let imputeSamplingControl = - Stats.Imputation.SimpleSampling.computeColumnWiseUsing Distributions.Continuous.Normal.Sample 3 (Some 0.05) (fitCV) + Imputation.SimpleSampling.computeColumnWiseUsing Distributions.Continuous.Normal.Sample 3 (Some 0.01) (fitCV) (dataSorted |> Frame.filterRows (fun k s -> allColsOfIControl |> Array.contains (snd k))) |> Frame.mapValues log2 @@ -283,9 +288,39 @@ let doAnalysis (allDataForAnalysis:Frame<string,string>) dataForAnalysis (allCon let imputeSamplingT = imputeSampling.Transpose() + + let preImpChart = + let ck = + quantValues.ColumnKeys + |> Array.ofSeq + quantValues + |> Frame.mapValues log2 + |> Frame.toJaggedArray + |> Matrix.ofJaggedArray + |> Matrix.mapiCols (fun i c -> + Chart.Histogram c + |> Chart.withTraceInfo ck.[i] + ) + |> Chart.combine + |> Chart.withXAxisStyle "log2 Abundance" + let afterImpChart = + let ck = + imputeSamplingT.ColumnKeys + |> Array.ofSeq + imputeSamplingT + |> Frame.toJaggedArray + |> Matrix.ofJaggedArray + |> Matrix.mapiCols (fun i c -> + Chart.Histogram c + |> Chart.withTraceInfo ck.[i] + ) + |> Chart.combine + |> Chart.withXAxisStyle "log2 Abundance" ///////////////////////////////////////////////////// // Norm By Quantile Normalization ///////////////////////////////////////////////////// + printfn "perform Quantile Normalization:%b" Arguments.performNorm + let normFrame = if Arguments.performNorm then let normM = @@ -330,6 +365,7 @@ let doAnalysis (allDataForAnalysis:Frame<string,string>) dataForAnalysis (allCon ///////////////////////////////////////////////////// // Testing ///////////////////////////////////////////////////// + printfn "Perform Testing" let (preData1,preData2) :float[][] * float [][]= normFrame |> Frame.mapRows (fun k s -> @@ -353,6 +389,7 @@ let doAnalysis (allDataForAnalysis:Frame<string,string>) dataForAnalysis (allCon ///////////////////////////////////////////////////// // Summary Statistics ///////////////////////////////////////////////////// + printfn "Compute Summary Statistics" let missingValuesCondition = quantValues |> Frame.sliceCols allColsOfIBait @@ -440,26 +477,43 @@ let doAnalysis (allDataForAnalysis:Frame<string,string>) dataForAnalysis (allCon let outP = System.IO.Path.Combine([|runP;baitGroup|]) System.IO.Directory.CreateDirectory outP |> ignore + + abundanceChart + |> Chart.withTemplate ChartTemplates.lightMirrored + |> Chart.saveHtml (outP + @"\1_Norm_PreNormGlobalAbundanceChart.html") + corrFChart + |> Chart.withTemplate ChartTemplates.lightMirrored + |> Chart.saveHtml (outP + @"\2_Norm_CorrectionfactorChart.html") + corrAbundanceChart + |> Chart.withTemplate ChartTemplates.lightMirrored + |> Chart.saveHtml (outP + @"\3_Norm_AfterNormGlobalAbundanceChart.html") + preImpChart + |> Chart.withTemplate ChartTemplates.lightMirrored + |> Chart.saveHtml (outP + @"\4_Imputation_preImpChart.html") + afterImpChart + |> Chart.withTemplate ChartTemplates.lightMirrored + |> Chart.saveHtml (outP + @"\5_Imputation_afterImpChart.html") preNormChart |> Chart.withTemplate ChartTemplates.lightMirrored - |> Chart.saveHtml (outP + @"\1_PreNormChart.html") + |> Chart.saveHtml (outP + @"\6_QuantilNorm_PreNormChart.html") afterNormChart |> Chart.withTemplate ChartTemplates.lightMirrored - |> Chart.saveHtml (outP + @"\2_AfterNormChart.html") + |> Chart.saveHtml (outP + @"\7_QuantilNorm_AfterNormChart.html") createSAMChart res |> Chart.withTemplate ChartTemplates.lightMirrored - |> Chart.saveHtml (outP + @"\3_SAM.html") + |> Chart.saveHtml (outP + @"\8_Testing_SAM.html") plotHisto finPlot |> Chart.withTemplate ChartTemplates.lightMirrored - |> Chart.saveHtml (outP + @"\4_DifferencesHistogram.html") + |> Chart.saveHtml (outP + @"\9_Testing_DifferencesHistogram.html") plotMA [] finPlot // ["CDJ5";"LON";"PSBQLD";"PGRL1"] |> Chart.withTemplate ChartTemplates.lightMirrored - |> Chart.saveHtml (outP + @"\5_MAPlot.html") + |> Chart.saveHtml (outP + @"\10_Testing_MAPlot.html") plotVulcano [] finPlot // ["CDJ5";"LON";"PSBQLD";"PGRL1"] |> Chart.withTemplate ChartTemplates.lightMirrored - |> Chart.saveHtml (outP + @"\6_VulcanoPlot.html") + |> Chart.saveHtml (outP + @"\11_Testing_VulcanoPlot.html") toSave.SaveCsv(outP + @"\Analysis_complete.tsv",includeRowKeys=true,keyNames=["Protein"],separator='\t') - + System.IO.File.WriteAllLines(outP + @"\params.txt",fsi.CommandLineArgs) + printfn "Finished Analysis! results can be found at:%s" outP //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////// //////////////////////////////////////// @@ -508,7 +562,7 @@ let allControlSamples = allBaitGroups |> Array.iter (fun g -> doAnalysis allDataForAnalysis quantDataForAnalysis allControlSamples g.BaitGroup g.QNodes) - +printfn "Done!" diff --git a/workflows/EvalTurboID/runWithNorm.cmd b/workflows/EvalTurboID/runWithQuantilNorm.cmd similarity index 100% rename from workflows/EvalTurboID/runWithNorm.cmd rename to workflows/EvalTurboID/runWithQuantilNorm.cmd diff --git a/workflows/EvalTurboID/scripts/MetaxaTemp.fsx b/workflows/EvalTurboID/scripts/MetaxaTemp.fsx new file mode 100644 index 0000000000000000000000000000000000000000..31c405fa6364e6b937293db8c1350c11dacde458 --- /dev/null +++ b/workflows/EvalTurboID/scripts/MetaxaTemp.fsx @@ -0,0 +1,239 @@ +#r "nuget: arcIO.NET, 0.0.1" +#r "nuget: ISADotNet.QueryModel, 0.5.4" +#r "nuget: BioFSharp.IO, 2.0.0-preview.3" +#r "nuget: FSharp.Stats, 0.4.12-preview.1" +#r "nuget: Plotly.NET, 3.0.1" +// #r "nuget: Deedle, 3.0.0-beta.1" +#r "nuget: Deedle" +#r "nuget: DeedleExtensions, 0.2.0" +#r "nuget: metaXa, 0.1.0" +open MetaXa +open MetaFrame +open Stats +open FSharp.Stats +open Plotly.NET +open Deedle +open BioFSharp.IO +open DeedleExtensions +open ISADotNet + + +module Global = + + /// As used by Deseq2, see: https://github.com/hbctraining/DGE_workshop/blob/master/lessons/02_DGE_count_normalization.md + module MedianOfRatios = + + /// expects data frame with shape frame<sampleIds,GeneIds> + let medianOfRatiosWide (data:Frame<string,string>) = + //data frame with shape frame<sampleIds,GeneIds> + let ratios = + data + |> Frame.dropSparseCols + |> Frame.getNumericCols + |> Series.map (fun geneID s -> + let geometricMeanAcrossAllSamples = + s.Values + |> FSharp.Stats.Seq.median + let ratioPerSample = + s + |> Series.map (fun x sample -> sample / geometricMeanAcrossAllSamples) + ratioPerSample + ) + |> Frame.ofColumns + // Series of shape: Series<sampleId,correctionFactor:float> + let correctionFactorPerSample = + ratios + |> Frame.transpose + |> Stats.median + // data frame with frame<sampleIds,GeneIds> + // contains gene data divided by the sample wise estimated correction factor + let correctedData = + data + |> Frame.getNumericCols + |> Series.map (fun geneID s -> + s / correctionFactorPerSample + ) + |> Frame.ofColumns + let abundanceInit = + Chart.Column(data |> Frame.transpose |> Frame.dropSparseRows |> Stats.median |> Series.observations) + |> Chart.withXAxisStyle "SampleId" + |> Chart.withXAxisStyle "Median of Initial Abundances" + let c = + Chart.Column(correctionFactorPerSample |> Series.observations) + |> Chart.withXAxisStyle "SampleId" + |> Chart.withXAxisStyle "Correction Factor." + let abundanceCorrected = + Chart.Column(correctedData |> Frame.transpose |> Frame.dropSparseRows |> Stats.median |> Series.observations) + |> Chart.withXAxisStyle "SampleId" + |> Chart.withXAxisStyle "Median of corrected abundances" + correctedData,abundanceInit,c,abundanceCorrected + + +module Imputation = + + + module SimpleSampling = + /// Expects normalized data. + /// (learnVarianceMeanDependence: seq<seq<float>> -> (float -> float)) function, which learns how to map from a mean expression to an estimator for variance. + let computeColumnWiseUsing (sampleFunction: float -> float -> float) minCountsForVarEst (quantileForLowMean:float option) (learnVarianceMeanDependence: seq<float*float> -> (float -> float)) (grouped:Frame<string*string,string>) = + /// + let meanWithAtLeast k (x:Series<_,float>) = + let tmp = Series.values x + if tmp |> Seq.length < k then + nan + else + x |> Series.values |> Seq.mean + /// + let cvWithAtLeast k (x:Series<_,float>) = + let tmp = Series.values x + if tmp |> Seq.length < k then + nan + else + x |> Series.values |> cv //Seq.disp + + /// Computes means and variances of complete replicates. + let log2MeanToCVEmpirical = + grouped + |> Frame.getNumericCols + |> Series.map (fun geneID s -> + let means = s |> (meanWithAtLeast minCountsForVarEst) + let cols = s |> (cvWithAtLeast minCountsForVarEst) + means, cols + ) + |> Series.values + |> Array.ofSeq + |> Array.filter (fun x -> nan.Equals(fst x) |> not && nan.Equals(snd x) |> not) + |> Array.sortBy fst + |> Array.map (fun (mean,dis) -> log2 mean, dis) + |> fun data -> + let borders = FSharp.Stats.Testing.Outliers.tukey 1.5 (data |> Array.map snd) + data + |> Array.filter (fun x -> snd x > borders.Lower && snd x < borders.Upper ) + /// function mapping from a mean expression to an estimator for dispersion (cv). + let log2MeanToCv = + learnVarianceMeanDependence log2MeanToCVEmpirical + + let log2MeanToCVByFit = + log2MeanToCVEmpirical + |> Array.map (fun x -> fst x, fst x |> log2MeanToCv) + let meanToCvEstChart = + [ + [ + + Chart.PointDensity(log2MeanToCVEmpirical |> Array.map fst,log2MeanToCVEmpirical |> Array.map snd) + // |> Chart.withMarkerStyle (3) + ] + |> Chart.combine + |> Chart.withTraceInfo "mean to CV empirical" + + Chart.Spline(log2MeanToCVByFit) + |> Chart.withTraceInfo "mean Vs. CV fitted" + ] + |> Chart.combine + |> Chart.withXAxisStyle "log2(mean Abundance)" + |> Chart.withYAxisStyle "CV" + + let quantileForLowMean, quantilesToMeansChart = + let data = + Array.map fst log2MeanToCVEmpirical + |> Array.map (fun x -> 2.**x) + match quantileForLowMean with + | None -> + None, + [ + [|0.05 .. 0.05 .. 0.95|] + |> Array.map (fun q -> q,FSharp.Stats.Quantile.compute q data) + |> Chart.Point + |> Chart.withTraceInfo "quantiles VS Abundance" + ] + |> Chart.combine + |> Chart.withXAxisStyle "Quantiles" + |> Chart.withYAxisStyle "Abundance" + | Some quantileForLowMean -> + let q = data |> FSharp.Stats.Quantile.compute quantileForLowMean + Some q, + [ + [|0.00 .. 0.05 .. 0.95|] + |> Array.mapi (fun i q -> if i = 0 then 0.01 else q) + |> Array.map (fun q -> q,FSharp.Stats.Quantile.compute q data) + |> Chart.Point + |> Chart.withTraceInfo "quantiles VS Abundance" + Chart.Point([quantileForLowMean],[q]) + |> Chart.withTraceInfo "selected quantile" + ] + |> Chart.combine + |> Chart.withXAxisStyle "Quantiles" + |> Chart.withYAxisStyle "Abundance" + + /// Imputation by sampling from a gausian normal distribution based on the input vector + /// function is truncated at zero. Can not return negative values. + let initImputeUsing sampleF (estimateCv:float -> float) (lowMean:float option) (d: seq<float>) = + let clean = d |> Seq.filter (fun x -> nan.Equals x |> not) + let sampleSafe mean standardDev = + let rec sample () = + let tmp = sampleF mean standardDev + if tmp < 0. then sample () else tmp + sample() + if clean |> Seq.isEmpty |> not then + let mean = Seq.mean clean + let cvEst = estimateCv mean + let standardDev = cvEst * mean + //printfn "mean %f" mean + //printfn "stndev %f" standardDev + //printfn "before %A" d + let res = + d + |> Seq.map (fun x -> if nan.Equals x then sampleSafe mean standardDev else x) + //printfn "%A" res + res + else + match lowMean with + | Some lowMean -> + let mean = lowMean + let cvEst = estimateCv mean + let standardDev = cvEst * mean + d + |> Seq.map (fun x -> if nan.Equals x then sampleSafe mean standardDev else x) + | None -> d + + let imputeUsing (x:seq<float>) = + initImputeUsing sampleFunction (log2 >> log2MeanToCv) (quantileForLowMean) x + + /// Computes means and variances of complete replicates. + let imputed = + // Step 1: Set genes missing when they are not comparable + // because all sample groups have 1 or zero observations. + grouped + |> Frame.getNumericCols + |> Series.map (fun geneID s -> + let geneDataByFactorLevels = + s + |> Series.applyLevel fst (Series.values >> Array.ofSeq) + let geneData = + geneDataByFactorLevels + |> Series.observations + let present = + geneData + |> Seq.map (fun (sampleID,v) -> + v.Length > 1 + ) + |> Seq.contains true + if present then s else s |> Series.mapValues (fun s -> nan) + ) + |> Frame.ofColumns + // Step 2: + // x present 0 not present *nan* + // [x; x; x] -> [x; x; x] + // [x; x; 0] -> [x; x; x] by sampling, mean from sample, var by est + // Will be biased towards the remaining point. + // [x; 0; 0] -> [x; x; x] by sampling, mean from sample, var by est + // + // [0; 0; 0] -> + //depending on the value of quantileForLowMean : + //Some x -> [x; x; x] + //None -> [0; 0; 0] + |> Map.mapColWiseGroupedByFactorLevels imputeUsing + + // meanToCvEstChart |> Chart.show + // quantilesToMeansChart |> Chart.show + imputed \ No newline at end of file diff --git a/workflows/EvalTurboID/scripts/statistics.fsx b/workflows/EvalTurboID/scripts/statistics.fsx deleted file mode 100644 index fe8be8bf2eae11051a17c9eb33a39c910556cb87..0000000000000000000000000000000000000000 --- a/workflows/EvalTurboID/scripts/statistics.fsx +++ /dev/null @@ -1,393 +0,0 @@ -#r "nuget: arcIO.NET, 0.0.1" -#r "nuget: ISADotNet.QueryModel, 0.5.4" -#r "nuget: BioFSharp.IO, 2.0.0-preview.3" -#r "nuget: FSharp.Stats, 0.4.12-preview.1" -#r "nuget: Plotly.NET, 3.0.1" -// #r "nuget: Deedle, 3.0.0-beta.1" -#r "nuget: Deedle" -#r "nuget: DeedleExtensions, 0.2.0" -#r "nuget: metaXa, 0.1.0" -open MetaXa -open MetaFrame -open Stats -open FSharp.Stats -open Plotly.NET -open Deedle -open BioFSharp.IO -open DeedleExtensions -open ISADotNet - -type UseControl = - | C4 - | C13 - | Both -5+5 -// let batch = 2 -// let conditionGenotype = "UVM_CDJ5_WT_TurboID" -let control1 = "UVM_Cher4" -let control2 = "UVM_Cher13" -let inP = __SOURCE_DIRECTORY__ + @"\..\..\runs\4_CDJ5_MSFraggerCombined_annotated\MSFraggerCombined_annotated.tsv" -let proteinID = "Protein" -let inAssay = "CDJ5Proteomics" -let conditionGenotype = "UVM_TEF_TurboID"//"UVM_CDJ5_WT_TurboID" -let batch = 1//2 -let normGlobal = true -let fdr = 0.01 -// let doAnalysis controlsToUse normGlobal normByBait batch conditionGenotype control1 control2 inP proteinID inAssay = -///////////////////////////////////////////////////// -// / params -///////////////////////////////////////////////////// -// let conditionGenotype = conditionGenotype//"UVM_CDJ5_WT_TurboID" -// let control1 = control1//"UVM_Cher4" -// let control2 = control2//"UVM_Cher13" -// let inP = inP//__SOURCE_DIRECTORY__ + @"\..\..\runs\4_CDJ5_MSFraggerCombined_annotated\MSFraggerCombined_annotated.tsv" -// let proteinID = proteinID//"Protein" -// let inAssay = inAssay//"CDJ5Proteomics" -// let normGlobal = normGlobal -///////////////////////////////////////////////////// -// Isa -///////////////////////////////////////////////////// -let arcPath = __SOURCE_DIRECTORY__ + @"\..\..\" - -let p,a = arcIO.Net.Assay.readByName arcPath inAssay - -let qa = QueryModel.QStudy.fromAssay a - -let allSamples = - qa.LastNodes() - |> Set.ofSeq - -let getBioRep fN = - qa.ValuesOf(fN,ProtocolName = "Growth").WithName("biological replicate").First.ValueWithUnitText - |> int - -let getGenotype fN = - qa.ValuesOf(fN,ProtocolName = "GenotypeLib").WithName("Genotype").First.ValueWithUnitText - |> string - -let getBatch fN = - qa.ValuesOf(fN,ProtocolName = "Growth").WithName("Batch").First.ValueWithUnitText - |> int - -let getTurboIDIdentifier fN = - qa.ValuesOf(fN,ProtocolName = "GenotypeLib").WithName("gene identifier").First.ValueWithUnitText - |> string - -///////////////////////////////////////////////////// -// / Raw Data -///////////////////////////////////////////////////// -let schema = - allSamples - |> Seq.map (fun x -> sprintf "%s=float" x) - |> String.concat "," - -// / -let all :Frame<string,string> = - Frame.ReadCsv(path = inP, hasHeaders=true, separators="\t",schema= schema) - |> Frame.indexRows proteinID - |> Frame.mapValues (fun x -> if x > 0. then x else nan) - -let quantValues :Frame<string,string> = - all - |> Frame.sliceCols allSamples - -let allColumnsCondition = - quantValues.ColumnKeys - |> Seq.filter (fun x -> getGenotype x = conditionGenotype) - |> Array.ofSeq - -let allColumnsCher4Control = - quantValues.ColumnKeys - |> Seq.filter (fun x -> getGenotype x = control1 && getBatch x = batch) - |> Array.ofSeq - -let allColumnsCher13Control = - quantValues.ColumnKeys - |> Seq.filter (fun x -> getGenotype x = control2 && getBatch x = batch) - |> Array.ofSeq - -let allColumnsControl = - (Array.append allColumnsCher4Control allColumnsCher13Control) - -let quantValuesOfInterest = - quantValues - |> Frame.sliceCols (Array.concat [|allColumnsCondition;allColumnsControl|]) - -quantValuesOfInterest.ColumnKeys - -///////////////////////////////////////////////////// -// Normalized Data -///////////////////////////////////////////////////// -let normalizedAndTransposed = - if normGlobal then - let fst = - quantValuesOfInterest - |> Frame.sliceCols allColumnsCondition - |> Frame.transpose - |> Stats.Normalization.Global.MedianOfRatios.medianOfRatiosWide - let snd = - quantValuesOfInterest - |> Frame.sliceCols allColumnsControl - |> Frame.transpose - |> Stats.Normalization.Global.MedianOfRatios.medianOfRatiosWide - Frame.merge fst snd - else - quantValuesOfInterest - |> Frame.transpose - - -///////////////////////////////////////////////////// -// Imputation -///////////////////////////////////////////////////// -let descByISa :Frame<string,string> = - [ - "Genotype", - allSamples - |> Seq.map (fun x -> x,getGenotype x) - |> Series.ofObservations - "biological replicate", - allSamples - |> Seq.map (fun x -> x,getBioRep x |> string) - |> Series.ofObservations - ] - |> Frame.ofColumns - |> Frame.filterRows (fun k s -> [|allColumnsCondition;allColumnsControl|] |> Array.concat |> Array.contains k ) -descByISa.Print() - -let fittingSpline lambda (data:seq<(float*float)>) = - let x,y = data |> Array.ofSeq |> Array.unzip - let xV,yV = x |> vector, y |> vector - let basisP = // xV |> Array.ofSeq - [| - for i = 0 to xV.Length-1 do - if i = 0 then - xV.[i] - elif i % 50 = 0 then xV.[i] - elif i = xV.Length-1 then xV.[xV.Length-1] - |] - - FSharp.Stats.Fitting.Spline.smoothingSpline (data |> Array.ofSeq) basisP lambda - -let fittingSpline' (data:seq<(float*float)>) = - fittingSpline 1000000000000. data - -let sortingInner = - (fun (s:ObjectSeries<string>) -> - let repIdx = s.GetAs<int>("biological replicate") - repIdx - ) - -let sortingOuter = - (fun (s:ObjectSeries<string>) -> - let g = s.GetAs<string>("Genotype") - if g = conditionGenotype then 0 - elif g = control1 then 1 - elif g = control2 then 2 - else failwith "Genotype not supported by design" - ) - -// First we create a new !sorted! description based on a grouping created specifing the columns in the original description which are used to create a grouping. -let descriptionSorted = Group.groupAndSortByFactorLevels sortingInner sortingOuter ["Genotype";] (String.concat "_") descByISa -descriptionSorted.Print() - -// Then, we sort and group the data based on the new description -let dataSorted = Group.groupRowsByGroupedFactors descriptionSorted normalizedAndTransposed -dataSorted.Print() - -let centerData (imputedData:Frame<string,string>) = - let median = - imputedData - |> Frame.toJaggedArray - |> Array.concat - |> Seq.mean - let stabw = - imputedData - |> Frame.toJaggedArray - |> Array.concat - |> Seq.stDev - imputedData - |> Frame.mapValues (fun x -> - (x-median) / stabw - ) - -// Then, we perform an imputation by simple gaussian sampling, if no value was recorded we sample out of the lowest 0.1 quantile of the data set. -let imputeSamplingCondition = - Stats.Imputation.SimpleSampling.computeColumnWiseUsing Distributions.Continuous.Normal.Sample 3 (Some 0.1) (fittingSpline') - (dataSorted |> Frame.filterRows (fun k s -> allColumnsCondition |> Array.contains (snd k))) - |> Frame.mapValues log2 - // |> centerData - -let imputeSamplingControl = - Stats.Imputation.SimpleSampling.computeColumnWiseUsing Distributions.Continuous.Normal.Sample 3 (Some 0.1) (fittingSpline') - (dataSorted |> Frame.filterRows (fun k s -> (Array.append allColumnsCher4Control allColumnsCher13Control) |> Array.contains (snd k))) - |> Frame.mapValues log2 - // |> centerData - -let imputeSampling = - Frame.merge imputeSamplingCondition imputeSamplingControl - - -let imputeSamplingT' = - imputeSampling.Transpose() - -imputeSamplingT' -|> Frame.toJaggedArray -|> Matrix.ofJaggedArray -|> Matrix.mapCols (fun c -> - Chart.Histogram c - ) -|> Chart.combine -|> Chart.show -imputeSamplingT' -|> Frame.toJaggedArray -|> Matrix.ofJaggedArray -// |> FSharp.Stats.Signal.Normalization.quantile -|> Matrix.mapCols (fun c -> - Chart.Histogram c - ) -|> Chart.combine -|> Chart.show - -let imputeSamplingT = - imputeSamplingT' - |> Frame.toJaggedArray - |> Matrix.ofJaggedArray - |> FSharp.Stats.Signal.Normalization.quantile - |> Matrix.toArray2D - |> Frame.ofArray2D - |> Frame.indexColsWith imputeSamplingT'.ColumnKeys - |> Frame.indexRowsWith imputeSamplingT'.RowKeys - - -// to separate control and factor sets (sample1 and sample2, respectively), -// the datasets are chunked by the number of replicates (here: triplicates -> chunkBySize 3). -let (preData1,preData2) :float[][] * float [][]= - imputeSamplingT - |> Frame.mapRows (fun k s -> - allColumnsCondition - |> Array.map (fun c -> s.GetAs<float>(c)), - [| - allColumnsCher4Control - |> Array.map (fun c -> s.GetAs<float>(c)) - allColumnsCher13Control - |> Array.map (fun c -> s.GetAs<float>(c)) - |] - |> Array.concat - ) - |> Series.values - |> Array.ofSeq - |> Array.unzip -// let (preData1Norm,preData2Norm) = centerData preData1, centerData preData2 -// After chunking, the datasets are separated by factor and the rowkey is added to the data for later identification. -let rowheader :string[] = imputeSamplingT.RowKeys |> Seq.toArray -let data1 = Array.zip rowheader preData1//preData1// -let data2 = Array.zip rowheader preData2//preData2// - -let res = FSharp.Stats.Testing.SAM.twoClassUnpaired 100 fdr data2 data1 (System.Random(1234)) - -res.NegSigBioitem.Length - -res.PosSigBioitem.Length - -#load "SamPlotting.fsx" -open SamPlotting -createSAMChart res -|> Chart.show - -let missingValuesCondition = - all - |> Frame.sliceCols allColumnsCondition - |> Frame.mapRows (fun k s -> - allColumnsCondition - |> Array.map (fun x -> - s.GetAs<float>(x,nan) - ) - |> Array.filter (fun x -> nan.Equals x) - |> Array.length - ) - -let missingValuesControl = - all - |> Frame.sliceCols allColumnsControl - |> Frame.mapRows (fun k s -> - allColumnsControl - |> Array.map (fun x -> - s.GetAs<float>(x,nan) - ) - |> Array.filter (fun x -> nan.Equals x) - |> Array.length - ) - -let fc = - imputeSamplingT - |> Frame.mapRows (fun k s -> - let m1 = allColumnsCondition |> Seq.map (fun x -> s.GetAs<float>(x,nan)) |> Seq.mean - let m2 = allColumnsControl |> Seq.map (fun x -> s.GetAs<float>(x,nan)) |> Seq.mean - m1-m2 - ) - -let meanAbundance = - imputeSamplingT - |> Frame.transpose - |> Stats.mean - -let stdev = - imputeSamplingT - |> Frame.sliceCols allColumnsCondition - |> Frame.transpose - |> Stats.stdDev - -let qValue = - [|res.PosSigBioitem;res.NegSigBioitem;res.NonSigBioitem|] - |> Array.concat - |> Array.map (fun x -> x.ID,x.QValue) - |> Series.ofObservations - -let IsCandidate = - [|res.PosSigBioitem;res.NegSigBioitem;res.NonSigBioitem|] - |> Array.concat - |> Array.map (fun x -> x.ID,x.QValue < fdr ) - |> Series.ofObservations - - -let finPlot = - imputeSamplingT - |> Frame.addCol "MeanR" fc - |> Frame.addCol "MeanAbundance" meanAbundance - |> Frame.addCol "StDevR" stdev - |> Frame.addCol "qValue" qValue - |> Frame.addCol "missingValuesCondition" missingValuesCondition - |> Frame.addCol "missingValuesControl" missingValuesControl - |> Frame.addCol "IsCandidate" IsCandidate - |> Frame.addCol "ProtName" (all.GetColumn<string>("Synonym")) - -#load "Plotting.fsx" -open Plotting - -plotMA ["CDJ5";"LON";"PSBQLD";"PGRL1"] finPlot -|> Chart.show -// |> Chart.saveHtml (__SOURCE_DIRECTORY__ + @"\..\..\runs\5_Analysis_SAM\MAPLot_1PercentFDRCutoff.") - - -plotHisto finPlot -|> Chart.show - -plotVulcano ["CDJ5";"LON";"PSBQLD";"PGRL1"] finPlot -|> Chart.show - -finPlot.Print() - -let fin = - imputeSamplingT - |> Frame.addCol "log2FC_Condition_Minus_Control" fc - |> Frame.addCol "AverageLog2Abundance" meanAbundance - |> Frame.addCol "qValue" qValue - |> Frame.addCol "missingValuesCondition" missingValuesCondition - |> Frame.addCol "missingValuesControl" missingValuesControl - |> Frame.addCol "IsCandidate" IsCandidate - |> Frame.addCol "ProtName" (all.GetColumn<string>("Synonym")) - -let toSave = Frame.join JoinKind.Outer fin (Frame.sliceCols ["Synonym"; "MapMan";"MapManDescription"; "GO"; "GODescription"] all) - -// toSave.SaveCsv(__SOURCE_DIRECTORY__ + @"\..\..\runs\5_Analysis_SAM\Analysis_complete.tsv",includeRowKeys=true,keyNames=["Protein"],separator='\t') -