diff --git a/pom.xml b/pom.xml
index 5e3ce0b..0e8ee4b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,16 +1,16 @@
+ http://maven.apache.org/maven-v4_0_0.xsd">
 4.0.0
 com.neilferguson
 popstrat
 0.1-SNAPSHOT
- 1.2.0
- 3.0.0.8
- 1.2.5
- 0.16.0
+ 1.6.1
+ 3.8.2.6
+ 1.6.5
+ 0.19.0
@@ -27,12 +27,12 @@
 org.bdgenomics.adam
- adam-core
+ adam-core_2.10
 ${adam.version}
 org.bdgenomics.adam
- adam-apis
+ adam-apis_2.10
 ${adam.version}
@@ -87,7 +87,7 @@
 org.scala-tools
 maven-scala-plugin
- 2.14.1
+ 2.15.2
diff --git a/src/main/scala/com/neilferguson/PopStrat.scala b/src/main/scala/com/neilferguson/PopStrat.scala
index 286fb39..9fb9eed 100644
--- a/src/main/scala/com/neilferguson/PopStrat.scala
+++ b/src/main/scala/com/neilferguson/PopStrat.scala
@@ -17,6 +17,14 @@ import scala.collection.JavaConverters._
 import scala.collection.immutable.Range.inclusive
 import scala.io.Source
+import org.apache.spark.sql.types.DataTypes
+import hex._
+import water.fvec._
+import water.support._
+import _root_.hex.Distribution.Family
+import _root_.hex.deeplearning.DeepLearningModel
+import _root_.hex.tree.gbm.GBMModel
+import _root_.hex.{Model, ModelMetricsBinomial}
 object PopStrat {
   def main(args: Array[String]): Unit = {
@@ -96,8 +104,8 @@ object PopStrat {
       case (sampleId, variants) => (sampleId, variants.toArray.sortBy(_.variantId))
     }
-    val header = StructType(Array(StructField("Region", StringType)) ++
-      sortedVariantsBySampleId.first()._2.map(variant => {StructField(variant.variantId.toString, IntegerType)}))
+    val header = DataTypes.createStructType(Array(DataTypes.createStructField("Region", DataTypes.StringType,false)) ++
+      sortedVariantsBySampleId.first()._2.map(variant => {DataTypes.createStructField(variant.variantId.toString,DataTypes.IntegerType,false)}))
     val rowRDD: RDD[Row] = sortedVariantsBySampleId.map {
       case (sampleId, sortedVariants) =>
         val region: Array[String] = Array(panel.getOrElse(sampleId, "Unknown"))
@@ -107,13 +115,15 @@ object PopStrat {
     // Create the SchemaRDD from the header and rows and convert the SchemaRDD into a H2O dataframe
     val sqlContext = new org.apache.spark.sql.SQLContext(sc)
+    //val dataFrame=sqlContext.createDataFrame(rowRDD, header)
     val schemaRDD = sqlContext.applySchema(rowRDD, header)
     val h2oContext = new H2OContext(sc).start()
-    import h2oContext._
-    val dataFrame = h2oContext.toDataFrame(schemaRDD)
+    import h2oContext._
+    val dataFrame1 =h2oContext.asH2OFrame(schemaRDD)
+    val dataFrame=H2OFrameSupport.allStringVecToCategorical(dataFrame1)
     // Split the dataframe into 50% training, 30% test, and 20% validation data
-    val frameSplitter = new FrameSplitter(dataFrame, Array(.5, .3), Array("training", "test", "validation").map(Key.make), null)
+    val frameSplitter = new FrameSplitter(dataFrame, Array(.5, .3), Array("training", "test", "validation").map(Key.make[Frame](_)), null)
     water.H2O.submitTask(frameSplitter)
     val splits = frameSplitter.getResult
     val training = splits(0)
@@ -121,8 +131,8 @@ object PopStrat {
     // Set the parameters for our deep learning model.
     val deepLearningParameters = new DeepLearningParameters()
-    deepLearningParameters._train = training
-    deepLearningParameters._valid = validation
+    deepLearningParameters._train = training._key
+    deepLearningParameters._valid = validation._key
     deepLearningParameters._response_column = "Region"
     deepLearningParameters._epochs = 10
     deepLearningParameters._activation = Activation.RectifierWithDropout
@@ -134,7 +144,7 @@ object PopStrat {
     // Score the model against the entire dataset (training, test, and validation data)
     // This causes the confusion matrix to be printed
-    deepLearningModel.score(dataFrame)('predict)
+    deepLearningModel.score(dataFrame)
 }