Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.neilferguson</groupId>
<artifactId>popstrat</artifactId>
<version>0.1-SNAPSHOT</version>
<properties>
<spark.version>1.2.0</spark.version>
<h2o.version>3.0.0.8</h2o.version>
<sparklingwater.version>1.2.5</sparklingwater.version>
<adam.version>0.16.0</adam.version>
<spark.version>1.6.1</spark.version>
<h2o.version>3.8.2.6</h2o.version>
<sparklingwater.version>1.6.5</sparklingwater.version>
<adam.version>0.19.0</adam.version>
</properties>
<pluginRepositories>
<pluginRepository>
Expand All @@ -27,12 +27,12 @@
</dependency>
<dependency>
<groupId>org.bdgenomics.adam</groupId>
<artifactId>adam-core</artifactId>
<artifactId>adam-core_2.10</artifactId>
<version>${adam.version}</version>
</dependency>
<dependency>
<groupId>org.bdgenomics.adam</groupId>
<artifactId>adam-apis</artifactId>
<artifactId>adam-apis_2.10</artifactId>
<version>${adam.version}</version>
</dependency>
<dependency>
Expand Down Expand Up @@ -87,7 +87,7 @@
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.14.1</version>
<version>2.15.2</version>
<executions>
<execution>
<goals>
Expand Down
26 changes: 18 additions & 8 deletions src/main/scala/com/neilferguson/PopStrat.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@ import scala.collection.JavaConverters._
import scala.collection.immutable.Range.inclusive
import scala.io.Source

import org.apache.spark.sql.types.DataTypes
import hex._
import water.fvec._
import water.support._
import _root_.hex.Distribution.Family
import _root_.hex.deeplearning.DeepLearningModel
import _root_.hex.tree.gbm.GBMModel
import _root_.hex.{Model, ModelMetricsBinomial}
object PopStrat {

def main(args: Array[String]): Unit = {
Expand Down Expand Up @@ -96,8 +104,8 @@ object PopStrat {
case (sampleId, variants) =>
(sampleId, variants.toArray.sortBy(_.variantId))
}
val header = StructType(Array(StructField("Region", StringType)) ++
sortedVariantsBySampleId.first()._2.map(variant => {StructField(variant.variantId.toString, IntegerType)}))
val header = DataTypes.createStructType(Array(DataTypes.createStructField("Region", DataTypes.StringType,false)) ++
sortedVariantsBySampleId.first()._2.map(variant => {DataTypes.createStructField(variant.variantId.toString,DataTypes.IntegerType,false)}))
val rowRDD: RDD[Row] = sortedVariantsBySampleId.map {
case (sampleId, sortedVariants) =>
val region: Array[String] = Array(panel.getOrElse(sampleId, "Unknown"))
Expand All @@ -107,22 +115,24 @@ object PopStrat {

// Create the SchemaRDD from the header and rows and convert the SchemaRDD into an H2O dataframe
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
//val dataFrame=sqlContext.createDataFrame(rowRDD, header)
val schemaRDD = sqlContext.applySchema(rowRDD, header)
val h2oContext = new H2OContext(sc).start()
import h2oContext._
val dataFrame = h2oContext.toDataFrame(schemaRDD)
import h2oContext._
val dataFrame1 =h2oContext.asH2OFrame(schemaRDD)
val dataFrame=H2OFrameSupport.allStringVecToCategorical(dataFrame1)

// Split the dataframe into 50% training, 30% test, and 20% validation data
val frameSplitter = new FrameSplitter(dataFrame, Array(.5, .3), Array("training", "test", "validation").map(Key.make), null)
val frameSplitter = new FrameSplitter(dataFrame, Array(.5, .3), Array("training", "test", "validation").map(Key.make[Frame](_)), null)
water.H2O.submitTask(frameSplitter)
val splits = frameSplitter.getResult
val training = splits(0)
val validation = splits(2)

// Set the parameters for our deep learning model.
val deepLearningParameters = new DeepLearningParameters()
deepLearningParameters._train = training
deepLearningParameters._valid = validation
deepLearningParameters._train = training._key
deepLearningParameters._valid = validation._key
deepLearningParameters._response_column = "Region"
deepLearningParameters._epochs = 10
deepLearningParameters._activation = Activation.RectifierWithDropout
Expand All @@ -134,7 +144,7 @@ object PopStrat {

// Score the model against the entire dataset (training, test, and validation data)
// This causes the confusion matrix to be printed
deepLearningModel.score(dataFrame)('predict)
deepLearningModel.score(dataFrame)

}

Expand Down