From 09d94f1ff1e1446d269bf9f2dbb7bc905f665d66 Mon Sep 17 00:00:00 2001 From: RAditi Date: Mon, 9 Mar 2015 13:08:19 +0530 Subject: [PATCH 01/14] Added the main code for intergration. Code has to be cleaned up. --- src/main/scala/Algorithm.scala | 45 +++++++++++++++++--------------- src/main/scala/DataSource.scala | 46 ++++++++++++++++++++++----------- src/main/scala/Engine.scala | 16 ++++++++---- src/main/scala/Preparator.scala | 17 ++++++------ src/main/scala/Serving.scala | 5 ++-- 5 files changed, 76 insertions(+), 53 deletions(-) diff --git a/src/main/scala/Algorithm.scala b/src/main/scala/Algorithm.scala index d80f5c9..78a7eda 100644 --- a/src/main/scala/Algorithm.scala +++ b/src/main/scala/Algorithm.scala @@ -1,37 +1,40 @@ -package org.template.vanilla +package org.template.classification import io.prediction.controller.P2LAlgorithm import io.prediction.controller.Params -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ -import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.classification.NaiveBayes +import org.apache.spark.mllib.classification.NaiveBayesModel +import org.apache.spark.mllib.regression.LinearRegressionWithSGD +import org.apache.spark.mllib.regression.LinearRegressionModel +import org.apache.spark.mllib.linalg.Vectors import grizzled.slf4j.Logger -case class AlgorithmParams(mult: Int) extends Params +case class AlgorithmParams( + val lambda: Double +) extends Params -class Algorithm(val ap: AlgorithmParams) - // extends PAlgorithm if Model contains RDD[] - extends P2LAlgorithm[PreparedData, Model, Query, PredictedResult] { +// extends P2LAlgorithm because the MLlib's NaiveBayesModel doesn't contain RDD. 
+class NaiveBayesAlgorithm(val ap: AlgorithmParams) + extends P2LAlgorithm[PreparedData, LinearRegressionModel, Query, PredictedResult] { @transient lazy val logger = Logger[this.type] - def train(data: PreparedData): Model = { - // Simply count number of events - // and multiple it by the algorithm parameter - // and store the number as model - val count = data.events.count().toInt * ap.mult - new Model(mc = count) + def train(data: PreparedData): LinearRegressionModel = { + // MLLib NaiveBayes cannot handle empty training data. + require(!data.labeledPoints.take(1).isEmpty, + s"RDD[labeldPoints] in PreparedData cannot be empty." + + " Please check if DataSource generates TrainingData" + + " and Preprator generates PreparedData correctly.") + val lin = new LinearRegressionWithSGD() + lin.run(data.labeledPoints) } - def predict(model: Model, query: Query): PredictedResult = { - // Prefix the query with the model data - val result = s"${model.mc}-${query.q}" - PredictedResult(p = result) + def predict(model: LinearRegressionModel, query: Query): PredictedResult = { + + val label = model.predict(Vectors.dense(query.features)) + new PredictedResult(label) } -} -class Model(val mc: Int) extends Serializable { - override def toString = s"mc=${mc}" } diff --git a/src/main/scala/DataSource.scala b/src/main/scala/DataSource.scala index 6116ee7..fd151b4 100644 --- a/src/main/scala/DataSource.scala +++ b/src/main/scala/DataSource.scala @@ -1,4 +1,4 @@ -package org.template.vanilla +package org.template.classification import io.prediction.controller.PDataSource import io.prediction.controller.EmptyEvaluationInfo @@ -10,10 +10,12 @@ import io.prediction.data.storage.Storage import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.linalg.Vectors import grizzled.slf4j.Logger -case class DataSourceParams(appId: Int) extends Params +case class 
DataSourceParams(val appId: Int) extends Params class DataSource(val dsp: DataSourceParams) extends PDataSource[TrainingData, @@ -24,21 +26,35 @@ class DataSource(val dsp: DataSourceParams) override def readTraining(sc: SparkContext): TrainingData = { val eventsDb = Storage.getPEvents() - // read all events of EVENT involving ENTITY_TYPE and TARGET_ENTITY_TYPE - val eventsRDD: RDD[Event] = eventsDb.find( + val labeledPoints: RDD[LabeledPoint] = eventsDb.aggregateProperties( appId = dsp.appId, - entityType = Some("ENTITY_TYPE"), - eventNames = Some(List("EVENT")), - targetEntityType = Some(Some("TARGET_ENTITY_TYPE")))(sc) - - new TrainingData(eventsRDD) + entityType = "user", + // only keep entities with these required properties defined + required = Some(List("plan", "attr0", "attr1", "attr2")))(sc) + // aggregateProperties() returns RDD pair of + // entity ID and its aggregated properties + .map { case (entityId, properties) => + try { + LabeledPoint(properties.get[Double]("plan"), + Vectors.dense(Array( + properties.get[Double]("attr0"), + properties.get[Double]("attr1"), + properties.get[Double]("attr2") + )) + ) + } catch { + case e: Exception => { + logger.error(s"Failed to get properties ${properties} of" + + s" ${entityId}. 
Exception: ${e}.") + throw e + } + } + } + + new TrainingData(labeledPoints) } } class TrainingData( - val events: RDD[Event] -) extends Serializable { - override def toString = { - s"events: [${events.count()}] (${events.take(2).toList}...)" - } -} + val labeledPoints: RDD[LabeledPoint] +) extends Serializable diff --git a/src/main/scala/Engine.scala b/src/main/scala/Engine.scala index f2b4734..7c75201 100644 --- a/src/main/scala/Engine.scala +++ b/src/main/scala/Engine.scala @@ -1,18 +1,24 @@ -package org.template.vanilla + +package org.template.classification + import io.prediction.controller.IEngineFactory import io.prediction.controller.Engine -case class Query(q: String) extends Serializable +class Query( + val features: Array[Double] +) extends Serializable -case class PredictedResult(p: String) extends Serializable +class PredictedResult( + val label: Double +) extends Serializable -object VanillaEngine extends IEngineFactory { +object ClassificationEngine extends IEngineFactory { def apply() = { new Engine( classOf[DataSource], classOf[Preparator], - Map("algo" -> classOf[Algorithm]), + Map("naive" -> classOf[NaiveBayesAlgorithm]), classOf[Serving]) } } diff --git a/src/main/scala/Preparator.scala b/src/main/scala/Preparator.scala index 552a94d..4e581ea 100644 --- a/src/main/scala/Preparator.scala +++ b/src/main/scala/Preparator.scala @@ -1,20 +1,19 @@ -package org.template.vanilla +package org.template.classification import io.prediction.controller.PPreparator -import io.prediction.data.storage.Event import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.regression.LabeledPoint -class Preparator - extends PPreparator[TrainingData, PreparedData] { +class PreparedData( + val labeledPoints: RDD[LabeledPoint] +) extends Serializable + +class Preparator extends PPreparator[TrainingData, PreparedData] { def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = { 
- new PreparedData(events = trainingData.events) + new PreparedData(trainingData.labeledPoints) } } - -class PreparedData( - val events: RDD[Event] -) extends Serializable diff --git a/src/main/scala/Serving.scala b/src/main/scala/Serving.scala index 0477f08..ef06088 100644 --- a/src/main/scala/Serving.scala +++ b/src/main/scala/Serving.scala @@ -1,9 +1,8 @@ -package org.template.vanilla +package org.template.classification import io.prediction.controller.LServing -class Serving - extends LServing[Query, PredictedResult] { +class Serving extends LServing[Query, PredictedResult] { override def serve(query: Query, From c0c77406161c0aa511a8d0603cb82269310bdd9e Mon Sep 17 00:00:00 2001 From: RAditi Date: Mon, 23 Mar 2015 13:10:21 +0530 Subject: [PATCH 02/14] Integrated the linear regression algorithm from MLLib --- engine.json | 4 ++-- src/main/scala/Algorithm.scala | 36 +++++++++++++++++++++++---------- src/main/scala/DataSource.scala | 18 ++++++++++++----- src/main/scala/Engine.scala | 8 ++++---- src/main/scala/Preparator.scala | 6 +++--- src/main/scala/Serving.scala | 2 +- 6 files changed, 48 insertions(+), 26 deletions(-) diff --git a/engine.json b/engine.json index 9ae7254..c737d28 100644 --- a/engine.json +++ b/engine.json @@ -4,14 +4,14 @@ "engineFactory": "org.template.vanilla.VanillaEngine", "datasource": { "params" : { - "appId": 1 + "appId": 7 } }, "algorithms": [ { "name": "algo", "params": { - "mult" : 1 + "intercept" : 1 } } ] diff --git a/src/main/scala/Algorithm.scala b/src/main/scala/Algorithm.scala index 78a7eda..47c2b42 100644 --- a/src/main/scala/Algorithm.scala +++ b/src/main/scala/Algorithm.scala @@ -1,40 +1,54 @@ -package org.template.classification +package org.template.vanilla import io.prediction.controller.P2LAlgorithm import io.prediction.controller.Params -import org.apache.spark.mllib.classification.NaiveBayes -import org.apache.spark.mllib.classification.NaiveBayesModel + import org.apache.spark.mllib.regression.LinearRegressionWithSGD 
import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ import grizzled.slf4j.Logger + + + case class AlgorithmParams( - val lambda: Double +//Whether the model should train with an intercept + val intercept : Double ) extends Params -// extends P2LAlgorithm because the MLlib's NaiveBayesModel doesn't contain RDD. -class NaiveBayesAlgorithm(val ap: AlgorithmParams) + +// extends P2LAlgorithm if Model contains RDD[] + +class algo(val ap: AlgorithmParams) extends P2LAlgorithm[PreparedData, LinearRegressionModel, Query, PredictedResult] { @transient lazy val logger = Logger[this.type] def train(data: PreparedData): LinearRegressionModel = { - // MLLib NaiveBayes cannot handle empty training data. - require(!data.labeledPoints.take(1).isEmpty, + // MLLib Linear Regression cannot handle empty training data. + require(!data.training_points.take(1).isEmpty, s"RDD[labeldPoints] in PreparedData cannot be empty." 
+ " Please check if DataSource generates TrainingData" + " and Preprator generates PreparedData correctly.") val lin = new LinearRegressionWithSGD() - lin.run(data.labeledPoints) + + +implicit def str2bool(string:String):Boolean = string.toUpperCase.equals("TRUE") + + + + lin.setIntercept(ap.intercept.equals(1)) + lin.run(data.training_points) } def predict(model: LinearRegressionModel, query: Query): PredictedResult = { - val label = model.predict(Vectors.dense(query.features)) - new PredictedResult(label) + val result = model.predict(Vectors.dense(query.features)) + new PredictedResult(result) } } diff --git a/src/main/scala/DataSource.scala b/src/main/scala/DataSource.scala index fd151b4..398dfc9 100644 --- a/src/main/scala/DataSource.scala +++ b/src/main/scala/DataSource.scala @@ -1,4 +1,4 @@ -package org.template.classification +package org.template.vanilla import io.prediction.controller.PDataSource import io.prediction.controller.EmptyEvaluationInfo @@ -26,15 +26,23 @@ class DataSource(val dsp: DataSourceParams) override def readTraining(sc: SparkContext): TrainingData = { val eventsDb = Storage.getPEvents() - val labeledPoints: RDD[LabeledPoint] = eventsDb.aggregateProperties( + +//Read all events involving "point" type + println("Gathering data from the event server") + + + val training_points: RDD[LabeledPoint] = eventsDb.aggregateProperties( + appId = dsp.appId, - entityType = "user", + entityType = "training_point", + // only keep entities with these required properties defined required = Some(List("plan", "attr0", "attr1", "attr2")))(sc) // aggregateProperties() returns RDD pair of // entity ID and its aggregated properties .map { case (entityId, properties) => try { + //Converting to Labeled Point as the LinearRegression Algorithm requires LabeledPoint(properties.get[Double]("plan"), Vectors.dense(Array( properties.get[Double]("attr0"), @@ -51,10 +59,10 @@ class DataSource(val dsp: DataSourceParams) } } - new TrainingData(labeledPoints) + new 
TrainingData(training_points) } } class TrainingData( - val labeledPoints: RDD[LabeledPoint] + val training_points: RDD[LabeledPoint] ) extends Serializable diff --git a/src/main/scala/Engine.scala b/src/main/scala/Engine.scala index 7c75201..fc6e6f6 100644 --- a/src/main/scala/Engine.scala +++ b/src/main/scala/Engine.scala @@ -1,5 +1,5 @@ -package org.template.classification +package org.template.vanilla import io.prediction.controller.IEngineFactory @@ -10,15 +10,15 @@ class Query( ) extends Serializable class PredictedResult( - val label: Double + val prediction: Double ) extends Serializable -object ClassificationEngine extends IEngineFactory { +object VanillaEngine extends IEngineFactory { def apply() = { new Engine( classOf[DataSource], classOf[Preparator], - Map("naive" -> classOf[NaiveBayesAlgorithm]), + Map("algo" -> classOf[algo]), classOf[Serving]) } } diff --git a/src/main/scala/Preparator.scala b/src/main/scala/Preparator.scala index 4e581ea..c1afe16 100644 --- a/src/main/scala/Preparator.scala +++ b/src/main/scala/Preparator.scala @@ -1,4 +1,4 @@ -package org.template.classification +package org.template.vanilla import io.prediction.controller.PPreparator @@ -8,12 +8,12 @@ import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint class PreparedData( - val labeledPoints: RDD[LabeledPoint] + val training_points: RDD[LabeledPoint] ) extends Serializable class Preparator extends PPreparator[TrainingData, PreparedData] { def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = { - new PreparedData(trainingData.labeledPoints) + new PreparedData(trainingData.training_points) } } diff --git a/src/main/scala/Serving.scala b/src/main/scala/Serving.scala index ef06088..7ae3848 100644 --- a/src/main/scala/Serving.scala +++ b/src/main/scala/Serving.scala @@ -1,4 +1,4 @@ -package org.template.classification +package org.template.vanilla import io.prediction.controller.LServing From 
4db83a6b6584ebe16a2b56d576c31f4e165ee3c7 Mon Sep 17 00:00:00 2001 From: RAditi Date: Mon, 23 Mar 2015 13:13:44 +0530 Subject: [PATCH 03/14] Create README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..041ccfc --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# template-scala-parallel-vanilla +PredictionIO vanilla engine template (Scala-based parallelized engine) +This template allows the users to use the Linear Regression Algorithm from MLLib as a prediction algorithm in the Prediction IO engine. From 2e0de4529141d6de1c565001399d63905a2dad98 Mon Sep 17 00:00:00 2001 From: RAditi Date: Mon, 23 Mar 2015 13:17:43 +0530 Subject: [PATCH 04/14] Integrated Linear Regression Algorithm --- src/main/scala/Algorithm.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/scala/Algorithm.scala b/src/main/scala/Algorithm.scala index 47c2b42..de4b232 100644 --- a/src/main/scala/Algorithm.scala +++ b/src/main/scala/Algorithm.scala @@ -37,9 +37,10 @@ class algo(val ap: AlgorithmParams) val lin = new LinearRegressionWithSGD() -implicit def str2bool(string:String):Boolean = string.toUpperCase.equals("TRUE") + //It is set to True only in the intercept field is set to 1 + //Right now, I am inputting this parameter as an integer, could be changed to String or Bool as necessary lin.setIntercept(ap.intercept.equals(1)) lin.run(data.training_points) From 0f164d850e25b6ccce956b4cc005a4814937be4d Mon Sep 17 00:00:00 2001 From: RAditi Date: Wed, 1 Apr 2015 02:07:41 +0530 Subject: [PATCH 05/14] Added the sample data --- src/main/scala/DataSource.scala | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/main/scala/DataSource.scala b/src/main/scala/DataSource.scala index 398dfc9..03cb854 100644 --- a/src/main/scala/DataSource.scala +++ b/src/main/scala/DataSource.scala @@ -37,7 +37,7 @@ class DataSource(val dsp: DataSourceParams) 
entityType = "training_point", // only keep entities with these required properties defined - required = Some(List("plan", "attr0", "attr1", "attr2")))(sc) + required = Some(List("plan", "attr0", "attr1", "attr2", "attr3", "attr4", "attr5", "attr6", "attr7")))(sc) // aggregateProperties() returns RDD pair of // entity ID and its aggregated properties .map { case (entityId, properties) => @@ -47,7 +47,12 @@ class DataSource(val dsp: DataSourceParams) Vectors.dense(Array( properties.get[Double]("attr0"), properties.get[Double]("attr1"), - properties.get[Double]("attr2") + properties.get[Double]("attr2"), + properties.get[Double]("attr3"), + properties.get[Double]("attr4"), + properties.get[Double]("attr5"), + properties.get[Double]("attr6"), + properties.get[Double]("attr7") )) ) } catch { From 8ae6cc66ad07908ebe6862462f39fd09b4640d90 Mon Sep 17 00:00:00 2001 From: RAditi Date: Wed, 1 Apr 2015 02:11:51 +0530 Subject: [PATCH 06/14] Added sample data --- data/import_eventserver.py | 52 +++++++++++++++++++++++ data/import_eventserver.py~ | 52 +++++++++++++++++++++++ data/sample_data.txt | 67 +++++++++++++++++++++++++++++ data/sample_data.txt~ | 70 +++++++++++++++++++++++++++++++ data/sample_naive_bayes_data.txt~ | 4 ++ data/sample_regression_data.txt | 6 +++ data/sample_regression_data.txt~ | 6 +++ 7 files changed, 257 insertions(+) create mode 100644 data/import_eventserver.py create mode 100644 data/import_eventserver.py~ create mode 100644 data/sample_data.txt create mode 100644 data/sample_data.txt~ create mode 100644 data/sample_naive_bayes_data.txt~ create mode 100644 data/sample_regression_data.txt create mode 100644 data/sample_regression_data.txt~ diff --git a/data/import_eventserver.py b/data/import_eventserver.py new file mode 100644 index 0000000..6f6ee7d --- /dev/null +++ b/data/import_eventserver.py @@ -0,0 +1,52 @@ +""" +Import sample data for classification engine +""" + +import predictionio +import argparse + +def import_events(client, file): + f = 
open(file, 'r') + count = 0 + print "Importing data..." + for line in f: + data = line.rstrip('\r\n').split(",") + plan = data[0] + attr = data[1].split(" ") + #print(attr); + client.create_event( + event="$set", + entity_type="training_point", + entity_id=str(count), # use the count num as user ID + properties= { + "attr0" : float(attr[0]), + "attr1" : float(attr[1]), + "attr2" : float(attr[2]), + "attr3" : float(attr[3]), + "attr4" : float(attr[4]), + "attr5" : float(attr[5]), + "attr6" : float(attr[6]), + "attr7" : float(attr[7]), + "plan" : float(plan) + } + ) + count += 1 + f.close() + print "%s events are imported." % count + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Import sample data for classification engine") + parser.add_argument('--access_key', default='invald_access_key') + parser.add_argument('--url', default="http://localhost:7070") + parser.add_argument('--file', default="./data/sample_data.txt") + + args = parser.parse_args() + print args + + client = predictionio.EventClient( + access_key=args.access_key, + url=args.url, + threads=5, + qsize=500) + import_events(client, args.file) diff --git a/data/import_eventserver.py~ b/data/import_eventserver.py~ new file mode 100644 index 0000000..186d40b --- /dev/null +++ b/data/import_eventserver.py~ @@ -0,0 +1,52 @@ +""" +Import sample data for classification engine +""" + +import predictionio +import argparse + +def import_events(client, file): + f = open(file, 'r') + count = 0 + print "Importing data..." 
+ for line in f: + data = line.rstrip('\r\n').split(",") + plan = data[0] + attr = data[1].split(" ") + print(attr); + client.create_event( + event="$set", + entity_type="training_point", + entity_id=str(count), # use the count num as user ID + properties= { + "attr0" : float(attr[0]), + "attr1" : float(attr[1]), + "attr2" : float(attr[2]), + "attr3" : float(attr[3]), + "attr4" : float(attr[4]), + "attr5" : float(attr[5]), + "attr6" : float(attr[6]), + "attr7" : float(attr[7]), + "plan" : float(plan) + } + ) + count += 1 + f.close() + print "%s events are imported." % count + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Import sample data for classification engine") + parser.add_argument('--access_key', default='invald_access_key') + parser.add_argument('--url', default="http://localhost:7070") + parser.add_argument('--file', default="./data/sample_data.txt") + + args = parser.parse_args() + print args + + client = predictionio.EventClient( + access_key=args.access_key, + url=args.url, + threads=5, + qsize=500) + import_events(client, args.file) diff --git a/data/sample_data.txt b/data/sample_data.txt new file mode 100644 index 0000000..fdd16e3 --- /dev/null +++ b/data/sample_data.txt @@ -0,0 +1,67 @@ +-0.4307829,-1.63735562648104 -2.00621178480549 -1.86242597251066 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +-0.1625189,-1.98898046126935 -0.722008756122123 -0.787896192088153 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +-0.1625189,-1.57881887548545 -2.1887840293994 1.36116336875686 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.155348103855541 +-0.1625189,-2.16691708463163 -0.807993896938655 -0.787896192088153 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +0.3715636,-0.507874475300631 -0.458834049396776 -0.250631301876899 -1.02470580167082 -0.522940888712441 
-0.863171185425945 -1.04215728919298 -0.864466507337306 +0.7654678,-2.03612849966376 -0.933954647105133 -1.86242597251066 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +0.8544153,-0.557312518810673 -0.208756571683607 -0.787896192088153 0.990146852537193 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.2669476,-0.929360463147704 -0.0578991819441687 0.152317365781542 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.2669476,-2.28833047634983 -0.0706369432557794 -0.116315079324086 0.80409888772376 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.2669476,0.223498042876113 -1.41471935455355 -0.116315079324086 -1.02470580167082 -0.522940888712441 -0.29928234305568 0.342627053981254 0.199211097885341 +1.3480731,0.107785900236813 -1.47221551299731 0.420949810887169 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.687186906466865 +1.446919,0.162180092313795 -1.32557369901905 0.286633588334355 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.4701758,-1.49795329918548 -0.263601072284232 0.823898478545609 0.788388310173035 -0.522940888712441 -0.29928234305568 0.342627053981254 0.199211097885341 +1.4929041,0.796247055396743 0.0476559407005752 0.286633588334355 -1.02470580167082 -0.522940888712441 0.394013435896129 -1.04215728919298 -0.864466507337306 +1.5581446,-1.62233848461465 -0.843294091975396 -3.07127197548598 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.5993876,-0.990720665490831 0.458513517212311 0.823898478545609 1.07379746308195 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.6389967,-0.171901281967138 -0.489197399065355 -0.65357996953534 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.6956156,-1.60758252338831 
-0.590700340358265 -0.65357996953534 -0.619561070667254 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.7137979,0.366273918511144 -0.414014962912583 -0.116315079324086 0.232904453212813 -0.522940888712441 0.971228997418125 0.342627053981254 1.26288870310799 +1.8000583,-0.710307384579833 0.211731938156277 0.152317365781542 -1.02470580167082 -0.522940888712441 -0.442797990776478 0.342627053981254 1.61744790484887 +1.8484548,-0.262791728113881 -1.16708345615721 0.420949810887169 0.0846342590816532 -0.522940888712441 0.163172393491611 0.342627053981254 1.97200710658975 +1.8946169,0.899043117369237 -0.590700340358265 0.152317365781542 -1.02470580167082 -0.522940888712441 1.28643254437683 -1.04215728919298 -0.864466507337306 +1.9242487,-0.903451690500615 1.07659722048274 0.152317365781542 1.28380453408541 -0.522940888712441 -0.442797990776478 -1.04215728919298 -0.864466507337306 +2.008214,-0.0633337899773081 -1.38088970920094 0.958214701098423 0.80409888772376 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +2.0476928,-1.15393789990757 -0.961853075398404 -0.116315079324086 -1.02470580167082 -0.522940888712441 -0.442797990776478 -1.04215728919298 -0.864466507337306 +2.1575593,0.0620203721138446 0.0657973885499142 1.22684714620405 -0.468824786336838 -0.522940888712441 1.31421001659859 1.72741139715549 -0.332627704725983 +2.1916535,-0.75731027755674 -2.92717970468456 0.018001143228728 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983 +2.2137539,1.11226993252773 1.06484916245061 0.555266033439982 0.877691038550889 1.89254797819741 1.43890404648442 0.342627053981254 0.376490698755783 +2.2772673,-0.468768642850639 -1.43754788774533 -1.05652863719378 0.576050411655607 -0.522940888712441 0.0120483832567209 0.342627053981254 -0.687186906466865 +2.2975726,-0.618884859896728 -1.1366360750781 -0.519263746982526 -1.02470580167082 -0.522940888712441 -0.863171185425945 3.11219574032972 
1.97200710658975 +2.3272777,-0.651431999123483 0.55329161145762 -0.250631301876899 1.11210019001038 -0.522940888712441 -0.179808625688859 -1.04215728919298 -0.864466507337306 +2.5217206,0.115499102435224 -0.512233676577595 0.286633588334355 1.13650173283446 -0.522940888712441 -0.179808625688859 0.342627053981254 -0.155348103855541 +2.5533438,0.266341329949937 -0.551137885443386 -0.384947524429713 0.354857790686005 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983 +2.5687881,1.16902610257751 0.855491905752846 2.03274448152093 1.22628985326088 1.89254797819741 2.02833774827712 3.11219574032972 2.68112551007152 +2.6567569,-0.218972367124187 0.851192298581141 0.555266033439982 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 0.908329501367106 +2.677591,0.263121415733908 1.4142681068416 0.018001143228728 1.35980653053822 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +2.7180005,-0.0704736333296423 1.52000996595417 0.286633588334355 1.39364261119802 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983 +2.7942279,-0.751957286017338 0.316843561689933 -1.99674219506348 0.911736065044475 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +2.8063861,-0.685277652430997 1.28214038482516 0.823898478545609 0.232904453212813 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.155348103855541 +2.8124102,-0.244991501432929 0.51882005949686 -0.384947524429713 0.823246560137838 -0.522940888712441 -0.863171185425945 0.342627053981254 0.553770299626224 +2.8419982,-0.75731027755674 2.09041984898851 1.22684714620405 1.53428167116843 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +2.8535925,1.20962937075363 -0.242882661178889 1.09253092365124 -1.02470580167082 -0.522940888712441 1.24263233939889 3.11219574032972 2.50384590920108 +2.9204698,0.570886990493502 0.58243883987948 0.555266033439982 1.16006887775962 -0.522940888712441 
1.07357183940747 0.342627053981254 1.61744790484887 +2.9626924,0.719758684343624 0.984970304132004 1.09253092365124 1.52137230773457 -0.522940888712441 -0.179808625688859 0.342627053981254 -0.509907305596424 +2.9626924,-1.52406140158064 1.81975700990333 0.689582255992796 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +2.9729753,-0.132431544081234 2.68769877553723 1.09253092365124 1.53428167116843 -0.522940888712441 -0.442797990776478 0.342627053981254 -0.687186906466865 +3.0130809,0.436161292804989 -0.0834447307428255 -0.519263746982526 -1.02470580167082 1.89254797819741 1.07357183940747 0.342627053981254 1.26288870310799 +3.0373539,-0.161195191984091 -0.671900359186746 1.7641120364153 1.13650173283446 -0.522940888712441 -0.863171185425945 0.342627053981254 0.0219314970149 +3.2752562,1.39927182372944 0.513852869452676 0.689582255992796 -1.02470580167082 1.89254797819741 1.49394503405693 0.342627053981254 -0.155348103855541 +3.3375474,1.51967002306341 -0.852203755696565 0.555266033439982 -0.104527297798983 1.89254797819741 1.85927724828569 0.342627053981254 0.908329501367106 +3.3928291,0.560725834706224 1.87867703391426 1.09253092365124 1.39364261119802 -0.522940888712441 0.486423065822545 0.342627053981254 1.26288870310799 +3.4355988,1.00765532502814 1.69426310090641 1.89842825896812 1.53428167116843 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.509907305596424 +3.4578927,1.10152996153577 -0.10927271844907 0.689582255992796 -1.02470580167082 1.89254797819741 1.97630171771485 0.342627053981254 1.61744790484887 +3.5160131,0.100001934217311 -1.30380956369388 0.286633588334355 0.316555063757567 -0.522940888712441 0.28786643052924 0.342627053981254 0.553770299626224 +3.5307626,0.987291634724086 -0.36279314978779 -0.922212414640967 0.232904453212813 -0.522940888712441 1.79270085261407 0.342627053981254 1.26288870310799 +3.5652984,1.07158528137575 0.606453149641961 1.7641120364153 -0.432854616994416 
1.89254797819741 0.528504607720369 0.342627053981254 0.199211097885341 +3.5876769,0.180156323255198 0.188987436375017 -0.519263746982526 1.09956763075594 -0.522940888712441 0.708239632330506 0.342627053981254 0.199211097885341 +3.6309855,1.65687973755377 -0.256675483533719 0.018001143228728 -1.02470580167082 1.89254797819741 1.79270085261407 0.342627053981254 1.26288870310799 +3.6800909,0.5720085322365 0.239854450210939 -0.787896192088153 1.0605418233138 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +3.7123518,0.323806133438225 -0.606717660886078 -0.250631301876899 -1.02470580167082 1.89254797819741 0.342907418101747 0.342627053981254 0.199211097885341 +3.9843437,1.23668206715898 2.54220539083611 0.152317365781542 -1.02470580167082 1.89254797819741 1.89037692416194 0.342627053981254 1.26288870310799 +3.993603,0.180156323255198 0.154448192444669 1.62979581386249 0.576050411655607 1.89254797819741 0.708239632330506 0.342627053981254 1.79472750571931 +4.029806,1.60906277046565 1.10378605019827 0.555266033439982 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +4.1295508,1.0036214996026 0.113496885050331 -0.384947524429713 0.860016436332751 1.89254797819741 -0.863171185425945 0.342627053981254 -0.332627704725983 +4.3851468,1.25591974271076 0.577607033774471 0.555266033439982 -1.02470580167082 1.89254797819741 1.07357183940747 0.342627053981254 1.26288870310799 +4.6844434,2.09650591351268 0.625488598331018 -2.66832330782754 -1.02470580167082 1.89254797819741 1.67954222367555 0.342627053981254 0.553770299626224 +5.477509,1.30028987435881 0.338383613253713 0.555266033439982 1.00481276295349 1.89254797819741 1.24263233939889 0.342627053981254 1.97200710658975 diff --git a/data/sample_data.txt~ b/data/sample_data.txt~ new file mode 100644 index 0000000..79237eb --- /dev/null +++ b/data/sample_data.txt~ @@ -0,0 +1,70 @@ +-0.4307829,-1.63735562648104 -2.00621178480549 -1.86242597251066 -1.02470580167082 
-0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +-0.1625189,-1.98898046126935 -0.722008756122123 -0.787896192088153 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +-0.1625189,-1.57881887548545 -2.1887840293994 1.36116336875686 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.155348103855541 +-0.1625189,-2.16691708463163 -0.807993896938655 -0.787896192088153 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +0.3715636,-0.507874475300631 -0.458834049396776 -0.250631301876899 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +0.7654678,-2.03612849966376 -0.933954647105133 -1.86242597251066 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +0.8544153,-0.557312518810673 -0.208756571683607 -0.787896192088153 0.990146852537193 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.2669476,-0.929360463147704 -0.0578991819441687 0.152317365781542 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.2669476,-2.28833047634983 -0.0706369432557794 -0.116315079324086 0.80409888772376 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.2669476,0.223498042876113 -1.41471935455355 -0.116315079324086 -1.02470580167082 -0.522940888712441 -0.29928234305568 0.342627053981254 0.199211097885341 +1.3480731,0.107785900236813 -1.47221551299731 0.420949810887169 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.687186906466865 +1.446919,0.162180092313795 -1.32557369901905 0.286633588334355 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.4701758,-1.49795329918548 -0.263601072284232 0.823898478545609 0.788388310173035 -0.522940888712441 -0.29928234305568 0.342627053981254 0.199211097885341 
+1.4929041,0.796247055396743 0.0476559407005752 0.286633588334355 -1.02470580167082 -0.522940888712441 0.394013435896129 -1.04215728919298 -0.864466507337306 +1.5581446,-1.62233848461465 -0.843294091975396 -3.07127197548598 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.5993876,-0.990720665490831 0.458513517212311 0.823898478545609 1.07379746308195 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.6389967,-0.171901281967138 -0.489197399065355 -0.65357996953534 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.6956156,-1.60758252338831 -0.590700340358265 -0.65357996953534 -0.619561070667254 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +1.7137979,0.366273918511144 -0.414014962912583 -0.116315079324086 0.232904453212813 -0.522940888712441 0.971228997418125 0.342627053981254 1.26288870310799 +1.8000583,-0.710307384579833 0.211731938156277 0.152317365781542 -1.02470580167082 -0.522940888712441 -0.442797990776478 0.342627053981254 1.61744790484887 +1.8484548,-0.262791728113881 -1.16708345615721 0.420949810887169 0.0846342590816532 -0.522940888712441 0.163172393491611 0.342627053981254 1.97200710658975 +1.8946169,0.899043117369237 -0.590700340358265 0.152317365781542 -1.02470580167082 -0.522940888712441 1.28643254437683 -1.04215728919298 -0.864466507337306 +1.9242487,-0.903451690500615 1.07659722048274 0.152317365781542 1.28380453408541 -0.522940888712441 -0.442797990776478 -1.04215728919298 -0.864466507337306 +2.008214,-0.0633337899773081 -1.38088970920094 0.958214701098423 0.80409888772376 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +2.0476928,-1.15393789990757 -0.961853075398404 -0.116315079324086 -1.02470580167082 -0.522940888712441 -0.442797990776478 -1.04215728919298 -0.864466507337306 +2.1575593,0.0620203721138446 0.0657973885499142 1.22684714620405 -0.468824786336838 
-0.522940888712441 1.31421001659859 1.72741139715549 -0.332627704725983 +2.1916535,-0.75731027755674 -2.92717970468456 0.018001143228728 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983 +2.2137539,1.11226993252773 1.06484916245061 0.555266033439982 0.877691038550889 1.89254797819741 1.43890404648442 0.342627053981254 0.376490698755783 +2.2772673,-0.468768642850639 -1.43754788774533 -1.05652863719378 0.576050411655607 -0.522940888712441 0.0120483832567209 0.342627053981254 -0.687186906466865 +2.2975726,-0.618884859896728 -1.1366360750781 -0.519263746982526 -1.02470580167082 -0.522940888712441 -0.863171185425945 3.11219574032972 1.97200710658975 +2.3272777,-0.651431999123483 0.55329161145762 -0.250631301876899 1.11210019001038 -0.522940888712441 -0.179808625688859 -1.04215728919298 -0.864466507337306 +2.5217206,0.115499102435224 -0.512233676577595 0.286633588334355 1.13650173283446 -0.522940888712441 -0.179808625688859 0.342627053981254 -0.155348103855541 +2.5533438,0.266341329949937 -0.551137885443386 -0.384947524429713 0.354857790686005 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983 +2.5687881,1.16902610257751 0.855491905752846 2.03274448152093 1.22628985326088 1.89254797819741 2.02833774827712 3.11219574032972 2.68112551007152 +2.6567569,-0.218972367124187 0.851192298581141 0.555266033439982 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 0.908329501367106 +2.677591,0.263121415733908 1.4142681068416 0.018001143228728 1.35980653053822 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +2.7180005,-0.0704736333296423 1.52000996595417 0.286633588334355 1.39364261119802 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983 +2.7942279,-0.751957286017338 0.316843561689933 -1.99674219506348 0.911736065044475 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +2.8063861,-0.685277652430997 1.28214038482516 
0.823898478545609 0.232904453212813 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.155348103855541 +2.8124102,-0.244991501432929 0.51882005949686 -0.384947524429713 0.823246560137838 -0.522940888712441 -0.863171185425945 0.342627053981254 0.553770299626224 +2.8419982,-0.75731027755674 2.09041984898851 1.22684714620405 1.53428167116843 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +2.8535925,1.20962937075363 -0.242882661178889 1.09253092365124 -1.02470580167082 -0.522940888712441 1.24263233939889 3.11219574032972 2.50384590920108 +2.9204698,0.570886990493502 0.58243883987948 0.555266033439982 1.16006887775962 -0.522940888712441 1.07357183940747 0.342627053981254 1.61744790484887 +2.9626924,0.719758684343624 0.984970304132004 1.09253092365124 1.52137230773457 -0.522940888712441 -0.179808625688859 0.342627053981254 -0.509907305596424 +2.9626924,-1.52406140158064 1.81975700990333 0.689582255992796 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +2.9729753,-0.132431544081234 2.68769877553723 1.09253092365124 1.53428167116843 -0.522940888712441 -0.442797990776478 0.342627053981254 -0.687186906466865 +3.0130809,0.436161292804989 -0.0834447307428255 -0.519263746982526 -1.02470580167082 1.89254797819741 1.07357183940747 0.342627053981254 1.26288870310799 +3.0373539,-0.161195191984091 -0.671900359186746 1.7641120364153 1.13650173283446 -0.522940888712441 -0.863171185425945 0.342627053981254 0.0219314970149 +3.2752562,1.39927182372944 0.513852869452676 0.689582255992796 -1.02470580167082 1.89254797819741 1.49394503405693 0.342627053981254 -0.155348103855541 +3.3375474,1.51967002306341 -0.852203755696565 0.555266033439982 -0.104527297798983 1.89254797819741 1.85927724828569 0.342627053981254 0.908329501367106 +3.3928291,0.560725834706224 1.87867703391426 1.09253092365124 1.39364261119802 -0.522940888712441 0.486423065822545 0.342627053981254 1.26288870310799 +3.4355988,1.00765532502814 
1.69426310090641 1.89842825896812 1.53428167116843 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.509907305596424 +3.4578927,1.10152996153577 -0.10927271844907 0.689582255992796 -1.02470580167082 1.89254797819741 1.97630171771485 0.342627053981254 1.61744790484887 +3.5160131,0.100001934217311 -1.30380956369388 0.286633588334355 0.316555063757567 -0.522940888712441 0.28786643052924 0.342627053981254 0.553770299626224 +3.5307626,0.987291634724086 -0.36279314978779 -0.922212414640967 0.232904453212813 -0.522940888712441 1.79270085261407 0.342627053981254 1.26288870310799 +3.5652984,1.07158528137575 0.606453149641961 1.7641120364153 -0.432854616994416 1.89254797819741 0.528504607720369 0.342627053981254 0.199211097885341 +3.5876769,0.180156323255198 0.188987436375017 -0.519263746982526 1.09956763075594 -0.522940888712441 0.708239632330506 0.342627053981254 0.199211097885341 +3.6309855,1.65687973755377 -0.256675483533719 0.018001143228728 -1.02470580167082 1.89254797819741 1.79270085261407 0.342627053981254 1.26288870310799 +3.6800909,0.5720085322365 0.239854450210939 -0.787896192088153 1.0605418233138 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +3.7123518,0.323806133438225 -0.606717660886078 -0.250631301876899 -1.02470580167082 1.89254797819741 0.342907418101747 0.342627053981254 0.199211097885341 +3.9843437,1.23668206715898 2.54220539083611 0.152317365781542 -1.02470580167082 1.89254797819741 1.89037692416194 0.342627053981254 1.26288870310799 +3.993603,0.180156323255198 0.154448192444669 1.62979581386249 0.576050411655607 1.89254797819741 0.708239632330506 0.342627053981254 1.79472750571931 +4.029806,1.60906277046565 1.10378605019827 0.555266033439982 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 +4.1295508,1.0036214996026 0.113496885050331 -0.384947524429713 0.860016436332751 1.89254797819741 -0.863171185425945 0.342627053981254 -0.332627704725983 +4.3851468,1.25591974271076 
0.577607033774471 0.555266033439982 -1.02470580167082 1.89254797819741 1.07357183940747 0.342627053981254 1.26288870310799 +4.6844434,2.09650591351268 0.625488598331018 -2.66832330782754 -1.02470580167082 1.89254797819741 1.67954222367555 0.342627053981254 0.553770299626224 +5.477509,1.30028987435881 0.338383613253713 0.555266033439982 1.00481276295349 1.89254797819741 1.24263233939889 0.342627053981254 1.97200710658975 + + + diff --git a/data/sample_naive_bayes_data.txt~ b/data/sample_naive_bayes_data.txt~ new file mode 100644 index 0000000..b7fa7d2 --- /dev/null +++ b/data/sample_naive_bayes_data.txt~ @@ -0,0 +1,4 @@ +3, 1, 1, 1 +60, 20, 30, 40 +37, 15, 12, 20 + diff --git a/data/sample_regression_data.txt b/data/sample_regression_data.txt new file mode 100644 index 0000000..649a820 --- /dev/null +++ b/data/sample_regression_data.txt @@ -0,0 +1,6 @@ +46,12 20 14 +22,22 0 0 +32,30 6 6 +17,7 2 8 +28,13 12 3 +29,10 12 7 diff --git a/data/sample_regression_data.txt~ b/data/sample_regression_data.txt~ new file mode 100644 index 0000000..349b245 --- /dev/null +++ b/data/sample_regression_data.txt~ @@ -0,0 +1,6 @@ +46,12 20 14 +22,22 0 0 +32,30 6 6 +17,7 2 8 +28,13 12 3 +29,10 12 2 From c60c745fbf1de74aaa590d99bcfd34ed07ab170a Mon Sep 17 00:00:00 2001 From: RAditi Date: Wed, 1 Apr 2015 02:21:20 +0530 Subject: [PATCH 07/14] Added sample data from MLLib specification --- data/import_eventserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/import_eventserver.py b/data/import_eventserver.py index 6f6ee7d..f566495 100644 --- a/data/import_eventserver.py +++ b/data/import_eventserver.py @@ -13,7 +13,7 @@ def import_events(client, file): data = line.rstrip('\r\n').split(",") plan = data[0] attr = data[1].split(" ") - #print(attr); + client.create_event( event="$set", entity_type="training_point", From 4b1d26c9c0edea271525bb3b2b2f5a1848395e52 Mon Sep 17 00:00:00 2001 From: RAditi Date: Wed, 1 Apr 2015 02:24:24 +0530 Subject: [PATCH 08/14] Added the
sample dataset for Linear Regression from MLLib documentation, cleared old files --- data/sample_naive_bayes_data.txt~ | 4 ---- data/sample_regression_data.txt | 6 ------ 2 files changed, 10 deletions(-) delete mode 100644 data/sample_naive_bayes_data.txt~ delete mode 100644 data/sample_regression_data.txt diff --git a/data/sample_naive_bayes_data.txt~ b/data/sample_naive_bayes_data.txt~ deleted file mode 100644 index b7fa7d2..0000000 --- a/data/sample_naive_bayes_data.txt~ +++ /dev/null @@ -1,4 +0,0 @@ -3, 1, 1, 1 -60, 20, 30, 40 -37, 15, 12, 20 - diff --git a/data/sample_regression_data.txt b/data/sample_regression_data.txt deleted file mode 100644 index 649a820..0000000 --- a/data/sample_regression_data.txt +++ /dev/null @@ -1,6 +0,0 @@ -46,12 20 14 -22,22 0 0 -32,30 6 6 -17,7 2 8 -28,13 12 3 -29,10 12 7 From ebe8469cb31e4797c5d87ec192629eec39193556 Mon Sep 17 00:00:00 2001 From: RAditi Date: Wed, 1 Apr 2015 15:14:35 +0530 Subject: [PATCH 09/14] Updated to PredictionIO 0.9 --- build.sbt | 2 +- src/main/scala/Algorithm.scala | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index fb84e44..754bf7f 100644 --- a/build.sbt +++ b/build.sbt @@ -7,6 +7,6 @@ name := "template-scala-parallel-vanilla" organization := "io.prediction" libraryDependencies ++= Seq( - "io.prediction" %% "core" % "0.8.6" % "provided", + "io.prediction" %% "core" % pioVersion.value % "provided", "org.apache.spark" %% "spark-core" % "1.2.0" % "provided", "org.apache.spark" %% "spark-mllib" % "1.2.0" % "provided") diff --git a/src/main/scala/Algorithm.scala b/src/main/scala/Algorithm.scala index de4b232..87eaf31 100644 --- a/src/main/scala/Algorithm.scala +++ b/src/main/scala/Algorithm.scala @@ -10,6 +10,7 @@ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ +import org.apache.spark.SparkContext import grizzled.slf4j.Logger @@ -28,7 +29,7 @@ class algo(val ap: AlgorithmParams) 
@transient lazy val logger = Logger[this.type] - def train(data: PreparedData): LinearRegressionModel = { + def train(sc:SparkContext, data: PreparedData): LinearRegressionModel = { // MLLib Linear Regression cannot handle empty training data. require(!data.training_points.take(1).isEmpty, s"RDD[labeldPoints] in PreparedData cannot be empty." + @@ -41,8 +42,10 @@ class algo(val ap: AlgorithmParams) //It is set to True only in the intercept field is set to 1 //Right now, I am inputting this parameter as an integer, could be changed to String or Bool as necessary + + lin.setIntercept(ap.intercept.equals(1.0)) - lin.setIntercept(ap.intercept.equals(1)) + //lin.setIntercept(true) lin.run(data.training_points) } From 6ff81f12c3e4fb125c03437b827ba68ae8dd822c Mon Sep 17 00:00:00 2001 From: RAditi Date: Wed, 1 Apr 2015 15:16:47 +0530 Subject: [PATCH 10/14] Fixed the issue with intercept --- src/main/scala/Algorithm.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/scala/Algorithm.scala b/src/main/scala/Algorithm.scala index 87eaf31..339798a 100644 --- a/src/main/scala/Algorithm.scala +++ b/src/main/scala/Algorithm.scala @@ -44,8 +44,6 @@ class algo(val ap: AlgorithmParams) //Right now, I am inputting this parameter as an integer, could be changed to String or Bool as necessary lin.setIntercept(ap.intercept.equals(1.0)) - - //lin.setIntercept(true) lin.run(data.training_points) } From 65f7770ba5594895b4e14512e16f46a8bdbf712c Mon Sep 17 00:00:00 2001 From: RAditi Date: Wed, 1 Apr 2015 15:18:30 +0530 Subject: [PATCH 11/14] Updated to PredictionIO 0.9 --- project/pio-build.sbt | 1 + 1 file changed, 1 insertion(+) create mode 100644 project/pio-build.sbt diff --git a/project/pio-build.sbt b/project/pio-build.sbt new file mode 100644 index 0000000..878fc0d --- /dev/null +++ b/project/pio-build.sbt @@ -0,0 +1 @@ +addSbtPlugin("io.prediction" % "pio-build" % "0.9.0") \ No newline at end of file From 8cad38f118e663ff6d7925ebdb64720aa9747e64 Mon Sep 17 00:00:00 2001 
From: RAditi Date: Wed, 1 Apr 2015 16:24:23 +0530 Subject: [PATCH 12/14] Edited the README --- README.md | 234 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 231 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 041ccfc..b7e12ba 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,231 @@ -# template-scala-parallel-vanilla -PredictionIO vanilla engine template (Scala-based parallelized engine) -This template allows the users to use the Linear Regression Algorithm from MLLib as a prediction algorithm in the Prediction IO engine. +# PredictionIO-MLlib-Linear Regression-Template +## Overview +An engine template is an almost-complete implementation of an engine. In this Engine Template, we have integrated Apache Spark MLlib's Linear Regression algorithm by default. + +## Linear Regression +In statistics, linear regression is an approach for modeling the relationship between a scalar dependent variable and one or more explanatory variables (or independent variables). Linear Regression is widely used in practice, to learn real-valued outputs. The linear regression model of this template is trained using "Stochastic Gradient Descent", an on-line version of gradient descent where the true gradient is approximated by the gradient at a single training point. + + +## Prediction Engine Template +The default use case of this Prediction Engine Template is to predict the level of prostate specific antigen from a number of clinical measures in men who were about to receive a radical prostatectomy. There are 8 measures that are a part of the clinical measures. This is based on the dataset that is publicly available. + +You can customize it easily to fit your specific use case and needs. + +We are going to show you how to create your own prediction engine for production use based on this template. 
+ +## Usage + +### Event Data Requirements +By default, the template requires the following events to be collected ( we can check this at TemplateFolder/data/import_eventserver.py ): + +- user $set event, which set the attributes of the user + +### Input Query +- array of features values ( 8 features) +``` +{"features": [-1,-2, -1 , -3, 0, 0, -1, 0]} +``` + +### Output Predicted Result +- the predicted label +``` +{"prediction": 0.7915352224221848} +``` + +### Dataset +We will be using the sample data set from https://github.com/apache/spark/blob/master/data/mllib/ridge-data/lpsa.data +The training sample events have the following format (Generated by data/import_eventserver.py): +``` +client.create_event( + event="$set", + entity_type="user", + entity_id=str(count), # use the count num as user ID + properties= { + "attr0" : int(attr[0]), + "attr1" : int(attr[1]), + "attr2" : int(attr[2]), + "attr3" : int(attr[3]), + "attr4" : int(attr[4]), + "attr5" : int(attr[5]), + "attr6" : int(attr[6]), + "attr7" : int(attr[7]), + "attr8" : int(attr[8]), + "plan" : int(plan) + } +``` +## Install and Run PredictionIO +First you need to [install PredictionIO 0.9.1](http://docs.prediction.io/install/) (if you haven't done it). +Let's say you have installed PredictionIO at /home/yourname/PredictionIO/. For convenience, add PredictionIO's binary command path to your PATH, i.e. /home/yourname/PredictionIO/bin +``` +$ PATH=$PATH:/home/yourname/PredictionIO/bin; export PATH +``` +Once you have completed the installation process, please make sure all the components (PredictionIO Event Server, Elasticsearch, and HBase) are up and running. + +``` +$ pio-start-all +``` +For versions before 0.9.1, you need to individually get the PredictionIO Event Server, Elasticsearch, and HBase up and running. 
+ +You can check the status by running: +``` +$ pio status +``` +## Create a new Engine Template +Clone the current repository by executing the following command in the directory where you want the code to reside: + +``` +git clone https://github.com/RAditi/PredictionIO-MLLib-LinReg-Template.git +cd PredictionIO-MLLib-LinReg-Template +``` +## Generate an App ID and Access Key +Let's assume you want to use this engine in an application named "MyApp1". You will need to collect some training data for machine learning modeling. You can generate an App ID and Access Key that represent "MyApp1" on the Event Server easily: +``` +$ pio app new MyApp1 + +``` +You should find the following in the console output: +``` +... +[INFO] [App$] Initialized Event Store for this app ID: 1. +[INFO] [App$] Created new app: +[INFO] [App$] Name: MyApp1 +[INFO] [App$] ID: 1 +[INFO] [App$] Access Key: 3mZWDzci2D5YsqAnqNnXH9SB6Rg3dsTBs8iHkK6X2i54IQsIZI1eEeQQyMfs7b3F +``` +Take note of the Access Key and App ID. You will need the Access Key to refer to "MyApp1" when you collect data. At the same time, you will use App ID to refer to "MyApp1" in engine code. + +$ pio app list will return a list of names and IDs of apps created in the Event Server. + +``` +$ pio app list +[INFO] [App$] Name | ID | Access Key | Allowed Event(s) +[INFO] [App$] MyApp1 | 1 | 3mZWDzci2D5YsqAnqNnXH9SB6Rg3dsTBs8iHkK6X2i54IQsIZI1eEeQQyMfs7b3F | (all) +[INFO] [App$] MyApp2 | 2 | io5lz6Eg4m3Xe4JZTBFE13GMAf1dhFl6ZteuJfrO84XpdOz9wRCrDU44EUaYuXq5 | (all) +[INFO] [App$] Finished listing 2 app(s). +``` + +## Collecting Data + +Next, let's collect some training data. By default, the Prediction Engine Template reads 9 properties of a user record: attr0, attr1, attr2, attr3, attr4, attr5, attr6, attr7 and plan. + +You can send these data to PredictionIO Event Server in real-time easily by making a HTTP request or through the EventClient of an SDK. 
+ +Although you can integrate your app with PredictionIO and collect training data in real-time, we are going to import a sample dataset with the provided scripts for demonstration purpose. + +Execute the following command in the Engine directory to get the sample public dataset. +``` +curl https://raw.githubusercontent.com/apache/spark/master/data/mllib/ridge-data/lpsa.data --create-dirs -o data/sample_data.txt + +``` + +A Python import script import_eventserver.py is provided in the template to import the data to Event Server using Python SDK. +Replace the value of access_key parameter by your Access Key and run: +```python +$ cd MyRecomendation +$ python data/import_eventserver.py --access_key 3mZWDzci2D5YsqAnqNnXH9SB6Rg3dsTBs8iHkK6X2i54IQsIZI1eEeQQyMfs7b3F +``` +You should see the following output: +``` +Importing data... +67 events are imported. +``` +This python script converts the data file to proper events formats as needed by the event server. +Now the training data (the record of the clinical data downloaded above)is stored as events inside the Event Store. + +## Deploy the Engine as a Service +Now you can build, train, and deploy the engine. First, make sure you are under the PredictionIO-MLLib-LinReg-Template. + +### Engine.json + +Under the directory, you should find an engine.json file; this is where you specify parameters for the engine. +Make sure the appId defined in the file match your App ID. (This links the template engine with the App) + +Parameters for the Linear Regression model are to be set here. +If the "intercept" parameter is set to 1, then the linear regression model is trained allowing for an intercept/bias (the constant term in the expression of the target variable as a linear combination of the explanatory variables). If the intercept parameter is 0, the bias term of the linear model is set to 0. 
+ + +``` +{ + "id": "default", + "description": "Default settings", + "engineFactory": "org.template.classification.ClassificationEngine", + "datasource": { + "params": { + "appId": 1 + } + }, + "algorithms": [ + { + "name": "decisiontree", + "params": { + + "intercept": 1, + } + } + ] +} +``` +### Build + +Start with building your PredictionIO-MLLib-LinReg-Template engine. +``` +$ pio build +``` +This command should take few minutes for the first time; all subsequent builds should be less than a minute. You can also run it with --verbose to see all log messages. + +Upon successful build, you should see a console message similar to the following. +``` +[INFO] [Console$] Your engine is ready for training. +``` + +### Training the Predictive Model + +Train your engine. + +``` +$ pio train +``` +When your engine is trained successfully, you should see a console message similar to the following. + +``` +[INFO] [CoreWorkflow$] Training completed successfully. +``` +### Deploying the Engine + +Now your engine is ready to deploy. + +``` +$ pio deploy +``` +This will deploy an engine that binds to http://localhost:8000. You can visit that page in your web browser to check its status. + +## Use the Engine + +Now, You can try to retrieve predicted results. For example, to predict the target variable(i.e. prostate specific antigen) of a user with +attr0 = -1 +attr1 = -2 +attr2 = -1 +attr3 = -3 +attr4 = 0 +attr5 = 0 +attr6 = -1 +attr7 = 0 +(with the attributes here being integers for convenience. They can take float values) +you send this JSON { "features": [-1, -2, -1, -3, 0, 0, -1] } to the deployed engine and it will return a JSON of the predicted level of prostate specific antigen. 
Simply send a query by making a HTTP request or through the EngineClient of an SDK: +```python +import predictionio +engine_client = predictionio.EngineClient(url="http://localhost:8000") +print engine_client.send_query({"features" :[-1, -2, -1, -3, 0, 0, -1, 0]}) + +``` +The following is sample JSON response: + +``` +{"prediction": 0.7915352224221848} +``` + +The sample query can be found in **test.py** + + + + From 595dc80f4e49f318d1adc0fd9967d3e9f646a00d Mon Sep 17 00:00:00 2001 From: RAditi Date: Wed, 1 Apr 2015 16:28:30 +0530 Subject: [PATCH 13/14] Modified README --- README.md | 11 +++++++---- sample.py | 4 ++++ 2 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 sample.py diff --git a/README.md b/README.md index b7e12ba..5de6412 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,6 @@ client.create_event( "attr5" : int(attr[5]), "attr6" : int(attr[6]), "attr7" : int(attr[7]), - "attr8" : int(attr[8]), "plan" : int(plan) } ``` @@ -156,7 +155,7 @@ If the "intercept" parameter is set to 1, then the linear regression model is tr }, "algorithms": [ { - "name": "decisiontree", + "name": "algo", "params": { "intercept": 1, @@ -201,7 +200,7 @@ This will deploy an engine that binds to http://localhost:8000. You can visit th ## Use the Engine -Now, You can try to retrieve predicted results. For example, to predict the target variable(i.e. prostate specific antigen) of a user with +Now, You can try to retrieve predicted results. For example, to predict the target variable(i.e. 
prostate specific antigen) of a patient with attr0 = -1 attr1 = -2 attr2 = -1 @@ -224,7 +223,11 @@ The following is sample JSON response: {"prediction": 0.7915352224221848} ``` -The sample query can be found in **test.py** +The sample query can be found in **sample.py**, which can be executed : + +``` +python sample.py +``` diff --git a/sample.py b/sample.py new file mode 100644 index 0000000..0514419 --- /dev/null +++ b/sample.py @@ -0,0 +1,4 @@ +import predictionio +engine_client = predictionio.EngineClient(url = "http://localhost:8000") +print engine_client.send_query({"features" :[-1, -2, -1, -3, 0, 0, -1, 0]}) + From 63b02b792d92911ee1353481d6540bc78c0d4b2b Mon Sep 17 00:00:00 2001 From: RAditi Date: Thu, 23 Apr 2015 02:32:20 +0530 Subject: [PATCH 14/14] Added the howto --- HOWTO.md | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ HOWTO.md~ | 181 +++++++++++++++++++++++++++++++++++++++++++++++++++ pio.sbt | 4 ++ 3 files changed, 374 insertions(+) create mode 100644 HOWTO.md create mode 100644 HOWTO.md~ create mode 100644 pio.sbt diff --git a/HOWTO.md b/HOWTO.md new file mode 100644 index 0000000..d69b8b6 --- /dev/null +++ b/HOWTO.md @@ -0,0 +1,189 @@ +#Using MLLib for Linear Regression + +This HOWTO describes how the vanilla prediction-io template can be modified to turn it into a regression template with MLLib-Linear Regression integration. You can easily add and use any other MLlib regression algorithms. The following will demonstrate how to add the MLlib Linear REgression algorithm into the engine. + +## Updating Algorithm.scala + +Since we have to include and use an algorithm from a library, this is possibly the most important step in the integration. 
In 'Algorithm.scala' import the MLlib Linear Regression algorithm by adding the following lines: + +```Scala + import org.apache.spark.mllib.regression.LinearRegressionWithSGD + import org.apache.spark.mllib.regression.LinearRegressionModel + import org.apache.spark.mllib.linalg.Vector + import org.apache.spark.mllib.linalg.Vectors +``` + +These are the necessary classes in order to use the MLLib's Linear Regression algorithm. +Modify the AlgorithmParams class for the MLLib Linear Regression algorithm: + +```Scala + case class AlgorithmParams( + val intercept : Double + ) extends Params +``` +The parameters of the Linear Regression algorithm were obtained by referring to the MLLib documentation for the Linear Regression Algorithm. +This class contains the inputs to the training algorithm, other than the training data. In the case of regression, it turns out to be whether the algorithm should train with an intercept or not + +Since we have added some parameters that are specific to the algorithm, the *engine.json* file has to be changed suitably, to include the newly added parameters. + + +Original: +```Javascript + "algorithms": [ + { + "name": "algo", + "params": { + "mult" : 1 + } + } + ] + ``` + Changed to: +```Javascript + "algorithms": [ + { + "name": "algo", + "params": { + "intercept" : 2, + + } + } + ] + ``` +This *engine.json* file can be found in the main directory of the vanilla template. + +After effecting the above changes, we need to change class *Algorithm* because the model in consideration is LinearRegressionModel.
The *Model* that is used has to be replaced by *LinearRegressionModel* + +Original: +```Scala +class Algorithm(val ap: AlgorithmParams) + // extends PAlgorithm if Model contains RDD[] + extends P2LAlgorithm[PreparedData, Model, Query, PredictedResult] { +``` +Changed to: +```Scala +class Algorithm(val ap: AlgorithmParams) + // extends PAlgorithm if Model contains RDD[] + extends P2LAlgorithm[PreparedData, LinearRegressionModel, Query, PredictedResult] { + ``` + +Next, we look at the 2 functions that are implemented in *Algorithm.scala*. These are the *train* and *predict* functions. We look at *train* first. *train* is used to prepare the LinearRegressionModel. + + +The code which accomplishes this is: + +Train: + +```Scala + def train(data: PreparedData): LinearRegressionModel = { + + + // Creates a new LinearRegression class which generates the LinearRegressionModel + val lin = new LinearRegressionWithSGD() + + // Setting the parameters obtained + lin.setIntercept(ap.intercept.equals(1.0)) + //Training the model on the data obtained from the preparator class + lin.run(data.points) + } + ``` +Next we look at the *predict* function. Using the model that has been trained, this function returns the required prediction on a new data point, and sends it to the *serving* class + + + Predict: + ```Scala + def predict(model: LinearRegressionModel, query: Query): PredictedResult = { + // Use the LinearRegressionModel to predict the target value for a new data point + val result = model.predict(Vectors.dense(query.features)) + new PredictedResult(result) + } + ``` +## Updating DataSource.scala + +*DataSource.scala* has to be customised depending on both the input data format and the format required by the *Preparator* and *Algorithm* class. In the linear regression case, the inputs are all double, with a special attribute that we want to predict.
This can be easily obtained in the form of RDD[Vector], which also happens to be the class required by Linear Regression algorithm in MLLib, making the Preparator class simple. We first import the required libraries + + +```Scala + import org.apache.spark.mllib.linalg.Vector + import org.apache.spark.mllib.linalg.Vectors +``` +The main function in the DataSource class is the *readTraining* function. It reads the data points from the prediction-io event server and adds it to the RDD of Vector which the Preparator class is expecting. +For the dataset that this template uses, there are 9 attributes and a value to be predicted. All these features are compulsory. This has to be reflected in *readTraining* function. Also, all these features have to be considered as *double*, because the features take double values + +Original: +```Scala + def readTraining(sc: SparkContext): TrainingData = { + + // read all events of EVENT involving ENTITY_TYPE and TARGET_ENTITY_TYPE + val eventsRDD: RDD[Event] = PEventStore.find( + appName = dsp.appName, + entityType = Some("ENTITY_TYPE"), + eventNames = Some(List("EVENT")), + targetEntityType = Some(Some("TARGET_ENTITY_TYPE")))(sc) + + new TrainingData(eventsRDD) + } + ``` +Changed to: +```Scala + def readTraining(sc: SparkContext): TrainingData = { + val pointsDb = Storage.getPEvents() + // read all events involving "point" type (the only type in our case) + println("Gathering data from event server.") + val pointsRDD: RDD[Vector] = pointsDb.aggregateProperties( + appId = dsp.appId, + entityType = "point", + required = Some(List("plan","attr0","attr1", "attr2", "attr3", "attr4", "attr5", "attr6", "attr7")))(sc) + .map { case (entityId, properties) => + try { + + // Convert the data from an Array to a RDD[vector] which is what KMeans + // expects as input + Vectors.dense(Array( + properties.get[Double]("attr0"), + properties.get[Double]("attr1"), + properties.get[Double]("attr2"), + properties.get[Double]("attr3"), + 
properties.get[Double]("attr4"), + properties.get[Double]("attr5"), + properties.get[Double]("attr6"), + properties.get[Double]("attr7") + )) + + } catch { + case e: Exception => { + logger.error(s"Failed to get properties ${properties} of" + + s" ${entityId}. Exception: ${e}.") + throw e + } + } + } + + new TrainingData(training_points) + } + ``` +Note : + +* The class TrainingData has attribute training_points which is RDD[LabeledPoint] + +* LabeledPoint has a double value : label, which will the target variable in the case of regression + +* LabeledPoint has a Vector[Doubles] as the features, and this is obtained using Vectors.dense() + +* In order to use another dataset, the number of 'required' attributes and expected attributes that are present in the dataset have to be updated accordingly in *DataSource.scala* + + +* A python wrapper code was written to parse the input and pass the values to *eventserver* in the format expected by *DataSource.scala* + +* As mentioned earlier, *preparator.scala* doesn't have to do much in this case. We just have to ensure consistency in the datatypes + +* *Serving.scala* can be personalised further, depending on the requirements. The main step is to ensure consistency among all datatypes used + +We have now succesfully integreated the Linear Regression Algorithm from MLLib. To use the template, + +```Scala +pio build +pio train +pio deploy +``` +For detailed instructions on how to run the template, once created, check the Quixk Start Guide for Vanilla Template diff --git a/HOWTO.md~ b/HOWTO.md~ new file mode 100644 index 0000000..b3ee4c0 --- /dev/null +++ b/HOWTO.md~ @@ -0,0 +1,181 @@ +#Using MLLib for Linear Regression + +This HOWTO describes how the vanilla prediction-io template can be modified to turn it into a regression template with MLLib-Linear Regression integration. You can easily add and use any other MLlib regression algorithms. 
The following will demonstrate how to add the MLlib Linear Regression algorithm into the engine. + +## Updating Algorithm.scala + +Since we have to include and use an algorithm from a library, this is possibly the most important step in the integration. In 'Algorithm.scala' import the MLlib Linear Regression algorithm by adding the following lines: + +```Scala + import org.apache.spark.mllib.regression.LinearRegressionWithSGD + import org.apache.spark.mllib.regression.LinearRegressionModel + import org.apache.spark.mllib.linalg.Vector + import org.apache.spark.mllib.linalg.Vectors +``` + +These are the necessary classes in order to use the MLLib's Linear Regression algorithm. +Modify the AlgorithmParams class for the MLLib Linear Regression algorithm: + +```Scala + case class AlgorithmParams( + val intercept : Double + ) extends Params +``` +The parameters of the Linear Regression algorithm were obtained by referring to the MLLib documentation for the Linear Regression Algorithm. +This class contains the inputs to the training algorithm, other than the training data. In the case of regression, it turns out to be whether the algorithm should train with an intercept or not + +Since we have added some parameters that are specific to the algorithm, the *engine.json* file has to be changed suitably, to include the newly added parameters. + + +Original: +```Javascript + "algorithms": [ + { + "name": "algo", + "params": { + "mult" : 1 + } + } + ] + ``` + Changed to: +```Javascript + "algorithms": [ + { + "name": "algo", + "params": { + "intercept" : 2 + + } + } + ] + ``` +This *engine.json* file can be found in the main directory of the vanilla template. + +After effecting the above changes, we need to change class *Algorithm* because the model in consideration is LinearRegressionModel. 
*Model* that is used has to be replaced by *LinearRegressionModel* + +Original: +```Scala +class Algorithm(val ap: AlgorithmParams) + // extends PAlgorithm if Model contains RDD[] + extends P2LAlgorithm[PreparedData, Model, Query, PredictedResult] { +``` +Changed to: +```Scala +class Algorithm(val ap: AlgorithmParams) + // extends PAlgorithm if Model contains RDD[] + extends P2LAlgorithm[PreparedData, LinearRegressionModel, Query, PredictedResult] { + ``` + +Next, we look at the 2 functions that are implemented in *Algorithm.scala*. These are the *train* and *predict* functions. We look at *train* first. *train* is used to prepare the LinearRegressionModel. + + +The code which accomplishes this is: + +Train: + +```Scala + def train(data: PreparedData): LinearRegressionModel = { + + + // Creates a new LinearRegression class which generates the LinearRegressionModel + val lin = new LinearRegressionWithSGD() + + // Setting the parameters obtained + lin.setIntercept(ap.intercept.equals(1.0)) + //Training the model on the data obtained from the preparator class + lin.run(data.points) + } + ``` +Next we look at the *predict* function. Using the model that has been trained, this function returns the required prediction on a new data point, and sends it to the *serving* class + + + Predict: + ```Scala + def predict(model: LinearRegressionModel, query: Query): PredictedResult = { + // Use the LinearRegressionModel to predict the value for a new data point + val result = model.predict(Vectors.dense(query.features)) + new PredictedResult(result) + } + ``` +## Updating DataSource.scala + +*DataSource.scala* has to be customised depending on both the input data format and the format required by the *Preparator* and *Algorithm* class. In the linear regression case, the inputs are all double, with a special attribute that we want to predict. 
This can be easily obtained in the form of RDD[Vector], which also happens to be the class required by Linear Regression algorithm in MLLib, making the Preparator class simple. We first import the required libraries + + +```Scala + import org.apache.spark.mllib.linalg.Vector + import org.apache.spark.mllib.linalg.Vectors +``` +The main function in the DataSource class is the *readTraining* function. It reads the data points from the prediction-io event server and adds it to the RDD of Vector which the Preparator class is expecting. +For the dataset that this template uses, there are 9 attributes and a value to be predicted. All these features are compulsory. This has to be reflected in *readTraining* function. Also, all these features have to be considered as *double*, because the features take double values + +Original: +```Scala + def readTraining(sc: SparkContext): TrainingData = { + + // read all events of EVENT involving ENTITY_TYPE and TARGET_ENTITY_TYPE + val eventsRDD: RDD[Event] = PEventStore.find( + appName = dsp.appName, + entityType = Some("ENTITY_TYPE"), + eventNames = Some(List("EVENT")), + targetEntityType = Some(Some("TARGET_ENTITY_TYPE")))(sc) + + new TrainingData(eventsRDD) + } + ``` +Changed to: +```Scala + def readTraining(sc: SparkContext): TrainingData = { + val pointsDb = Storage.getPEvents() + // read all events involving "point" type (the only type in our case) + println("Gathering data from event server.") + val pointsRDD: RDD[Vector] = pointsDb.aggregateProperties( + appId = dsp.appId, + entityType = "point", + required = Some(List("plan","attr0","attr1", "attr2", "attr3", "attr4", "attr5", "attr6", "attr7")))(sc) + .map { case (entityId, properties) => + try { + + // Convert the data from an Array to a RDD[vector] which is what Linear Regression + // expects as input + Vectors.dense(Array( + properties.get[Double]("attr0"), + properties.get[Double]("attr1"), + properties.get[Double]("attr2"), + properties.get[Double]("attr3"), + 
properties.get[Double]("attr4"), + properties.get[Double]("attr5"), + properties.get[Double]("attr6"), + properties.get[Double]("attr7") + )) + + } catch { + case e: Exception => { + logger.error(s"Failed to get properties ${properties} of" + + s" ${entityId}. Exception: ${e}.") + throw e + } + } + } + + new TrainingData(training_points) + } + ``` +The main changes are: + +* Instead of creating an RDD of Event we create an RDD of Vector, which is the kind of input which the Linear Regression algorithm expects. + +* The *entity_type* and *properties* of the data points should be made in sync with the type which was inputted to the prediction-io event server. + +* The attribute *plan* represents the target value to be predicted, so it is read separately and not included in the feature vector. + +We have already updated *engine.json*. No updates need to be made to *Preparator.scala* and *Serving.scala*. + +The engine now uses the MLlib Linear Regression algorithm. We are ready to build, train and deploy the engine as described in README.md. + +```Scala +pio build +pio train +pio deploy +``` diff --git a/pio.sbt b/pio.sbt new file mode 100644 index 0000000..28fc722 --- /dev/null +++ b/pio.sbt @@ -0,0 +1,4 @@ +// Generated automatically by pio build. +// Changes in this file will be overridden. + +pioVersion := "0.9.1"