From b34a3340995e83a193a2b8aea5168acd9a1a29a4 Mon Sep 17 00:00:00 2001
From: Alex Zubiaga
Date: Wed, 29 Mar 2017 15:00:44 +0200
Subject: [PATCH] capturing the training set in a closure to avoid retraining for each item

---
 .../MachineLearningInAction/NaiveBayes.fs | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/MachineLearningInAction/MachineLearningInAction/NaiveBayes.fs b/MachineLearningInAction/MachineLearningInAction/NaiveBayes.fs
index e9b1a94..8145a04 100644
--- a/MachineLearningInAction/MachineLearningInAction/NaiveBayes.fs
+++ b/MachineLearningInAction/MachineLearningInAction/NaiveBayes.fs
@@ -101,6 +101,7 @@ module NaiveBayes =
             label,
             prop(total, size),
             Map.map (fun token count -> prop(count, totTokens)) tokenCount)
+        |> Seq.toList

     // Classifier function:
     // the classifier is trained on the dataset,
@@ -112,16 +113,17 @@ module NaiveBayes =
     // and returning the highest scoring label.
     // Probabilities are log-transformed to avoid underflow.
     // See "Chapter4.fsx" for an illustration.
-    let classifier frequency dataset words text =
+    let classifier frequency dataset words =
         let estimator = train frequency dataset words
-        let tokenized = vocabulary text
-        estimator
-        |> Seq.map (fun (label, proba, tokens) ->
-            label,
-            tokens
-            |> Map.fold (fun p token value ->
-                if Set.exists (fun w -> w = token) tokenized
-                then p + log(value)
-                else p) (log proba))
-        |> Seq.maxBy snd
-        |> fst
\ No newline at end of file
+        fun text ->
+            let tokenized = vocabulary text
+            estimator
+            |> Seq.map (fun (label, proba, tokens) ->
+                label,
+                tokens
+                |> Map.fold (fun p token value ->
+                    if Set.exists (fun w -> w = token) tokenized
+                    then p + log(value)
+                    else p) (log proba))
+            |> Seq.maxBy snd
+            |> fst
\ No newline at end of file
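
With this patch, classifier frequency dataset words runs train once and returns a closure over the cached estimator, so classifying many items no longer retrains the model on every call; the added |> Seq.toList forces the sequence built by train, so the per-label probabilities are computed once instead of being re-evaluated on each enumeration. A minimal usage sketch of the new shape, assuming a frequency function, labelled dataset and vocabulary of the kind NaiveBayes.classifier already expects (the names bagOfWords, trainingSet and vocabularyWords below are hypothetical placeholders, not part of this patch):

    // Hypothetical inputs: any frequency function, labelled dataset and token set
    // accepted by NaiveBayes.classifier would behave the same way.
    let spamClassifier = NaiveBayes.classifier bagOfWords trainingSet vocabularyWords // trains once here

    // Each call reuses the estimator captured in the closure;
    // before this patch, train ran again for every classified item.
    let label1 = spamClassifier "free credit report, click now"
    let label2 = spamClassifier "meeting moved to Monday morning"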