From f9ffdd5fc5c1a1b539c9e1fc29e1cab1c8829e85 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 7 May 2015 11:43:52 -0300 Subject: [PATCH 001/268] Fixed typo --- src/main/scala/ignition/core/jobs/utils/RDDUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index fc42ded5..7e75d5ec 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -84,7 +84,7 @@ object RDDUtils { rdd.aggregateByKey(List.empty[V])( (lst, v) => if (lst.size >= n) { - logger.warn(s"Ignoring value '$v' due aggregation result of size '${lst.size}' is bigger then n = '$n'") + logger.warn(s"Ignoring value '$v' due aggregation result of size '${lst.size}' is bigger than n=$n") lst } else { v :: lst From b0e168e5d79b2cd46fd9722eca572fb358e3d421 Mon Sep 17 00:00:00 2001 From: ZaGo Date: Fri, 8 May 2015 13:38:26 -0300 Subject: [PATCH 002/268] refactoring to allow changes in ignition.mail --- .../ignition/core/jobs/utils/SparkContextUtils.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 29c32112..a1090d20 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -130,6 +130,13 @@ object SparkContextUtils { } + def getTextFiles(paths: Seq[String], synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { + if (synchLocally) + processTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths) + else + processTextFiles(paths, minimumPaths) + } + def filterAndGetTextFiles(path: String, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, @@ -144,10 +151,7 @@ 
object SparkContextUtils { val paths = getFilteredPaths(Seq(path), requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) if (paths.size < minimumPaths) throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") - else if (synchLocally) - processTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths) - else - processTextFiles(paths, minimumPaths) + getTextFiles(paths, synchLocally, forceSynch, minimumPaths) } private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { From a8e9734b844bf9d5bdbe0572c0a7e247399983de Mon Sep 17 00:00:00 2001 From: Filipe Niero Felisbino Date: Fri, 8 May 2015 15:49:25 -0300 Subject: [PATCH 003/268] Fix ec2 request issue --- tools/spark-ec2/spark_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 5fdf0467..a608f9ce 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -540,7 +540,7 @@ def launch_cluster(conn, opts, cluster_name): (invalid[0].id, invalid[0].status.message)) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves - reservations = conn.get_all_reservations(active_instance_ids) + reservations = conn.get_all_reservations([r.instance_id for r in active_instance_ids]) slave_nodes = [] for r in reservations: slave_nodes += r.instances From 9ae5178549af17b57a19e0ff2fefcb385c5401bf Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 18 May 2015 10:41:19 -0300 Subject: [PATCH 004/268] Minor improvements --- build.sbt | 2 +- src/main/scala/ignition/core/utils/DateUtils.scala | 10 +++++++++- src/main/scala/ignition/core/utils/FutureUtils.scala | 4 +++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index 095c1228..4dfcd1ae 100644 --- a/build.sbt +++ 
b/build.sbt @@ -17,7 +17,7 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "com.github.nscala-time" %% "nscala-time" % "0.8.0" +libraryDependencies += "com.github.nscala-time" %% "nscala-time" % "2.0.0" libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" diff --git a/src/main/scala/ignition/core/utils/DateUtils.scala b/src/main/scala/ignition/core/utils/DateUtils.scala index 231817c7..c3fb5163 100644 --- a/src/main/scala/ignition/core/utils/DateUtils.scala +++ b/src/main/scala/ignition/core/utils/DateUtils.scala @@ -1,6 +1,6 @@ package ignition.core.utils -import org.joda.time.{Period, DateTimeZone, DateTime} +import org.joda.time.{Seconds, Period, DateTimeZone, DateTime} import org.joda.time.format.ISODateTimeFormat object DateUtils { @@ -21,4 +21,12 @@ object DateUtils { def isEqualOrBefore(other: DateTime) = dateTime.isBefore(other) || dateTime.saneEqual(other) } + + implicit class SecondsImprovements(val seconds: Seconds) { + + implicit def toScalaDuration: scala.concurrent.duration.FiniteDuration = { + scala.concurrent.duration.Duration(seconds.getSeconds, scala.concurrent.duration.SECONDS) + } + + } } diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 068d63bc..81b0490e 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,10 +1,12 @@ package ignition.core.utils -import scala.concurrent.{ExecutionContext, Future, Promise} +import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} import scala.util.{Failure, Success} object FutureUtils { + def blockingFuture[T](body: =>T)(implicit ec: ExecutionContext): Future[T] = future { blocking { body } } + implicit class FutureImprovements[V](future: Future[V]) { def 
toOptionOnFailure(errorHandler: (Throwable) => Option[V])(implicit ec: ExecutionContext): Future[Option[V]] = { future.map(Option.apply).recover { case t => errorHandler(t) } From 53cfe885d21307acb4072260f68d6d2f718dc746 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 18 May 2015 17:05:35 -0300 Subject: [PATCH 005/268] remove unused lib --- build.sbt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 4dfcd1ae..046d9503 100644 --- a/build.sbt +++ b/build.sbt @@ -17,8 +17,6 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "com.github.nscala-time" %% "nscala-time" % "2.0.0" - libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" @@ -29,6 +27,10 @@ libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0" libraryDependencies += "net.java.dev.jets3t" % "jets3t" % "0.7.1" +libraryDependencies += "joda-time" % "joda-time" % "2.7" + +libraryDependencies += "org.joda" % "joda-convert" % "1.7" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" From d965fd6ad12bbf2fadf9302837ec7b242661eba8 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 1 Jun 2015 14:10:58 -0300 Subject: [PATCH 006/268] Added utilitary function for better stack traces --- .../scala/ignition/core/utils/BetterTrace.scala | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/main/scala/ignition/core/utils/BetterTrace.scala diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala new file mode 100644 index 00000000..158e261e --- /dev/null +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -0,0 +1,13 @@ 
+package ignition.core.utils + +// Used mainly to augment scalacheck traces in scalatest +trait BetterTrace { + def fail(message: String): Nothing + def withBetterTrace(block: => Unit): Unit = + try { + block + } catch { + case t: Throwable => fail(s"${t.getMessage}: ${t.getStackTraceString}") + } + +} From c89961984bbd4be54c63366d4df5b915a25c89fc Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 8 Jun 2015 18:08:22 -0300 Subject: [PATCH 007/268] Updated scalatest to fix conflicts --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 046d9503..be7e1b12 100644 --- a/build.sbt +++ b/build.sbt @@ -17,7 +17,7 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" +libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.1.3" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" From 82a09c0ff017484bfbada7d1a4b451e7c288a025 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 11 Jun 2015 16:34:05 -0300 Subject: [PATCH 008/268] Improved s3 service --- src/main/scala/ignition/core/utils/S3Client.scala | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala index f02d7acd..a988aa7f 100644 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ b/src/main/scala/ignition/core/utils/S3Client.scala @@ -37,7 +37,14 @@ class S3Client { } def list(bucket: String, key: String): Array[S3Object] = { - service.listObjects(bucket, key, null, 99999L) + service.listObjectsChunked(bucket, key, null, 99999L, null, true).getObjects + } + + def copyFile(sourceBucket: String, sourceKey: String, destBucket: String, destKey: String, destContentType: Option[String] = None): Unit = { + val destFile = new 
S3Object(destKey) + val replaceMetaData = destContentType.isDefined + destContentType.foreach(contentType => destFile.setContentType(contentType)) + service.copyObject(sourceBucket, sourceKey, destBucket, destFile, replaceMetaData) } def fileExists(bucket: String, key: String): Boolean = { From c32cce56fab86bc3372bd17adc1f7745f3d0c797 Mon Sep 17 00:00:00 2001 From: Flavio Sales Truzzi Date: Thu, 11 Jun 2015 19:11:30 -0300 Subject: [PATCH 009/268] Add optinal content type --- src/main/scala/ignition/core/utils/S3Client.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala index a988aa7f..fe509a4b 100644 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ b/src/main/scala/ignition/core/utils/S3Client.scala @@ -26,9 +26,9 @@ class S3Client { null, null, jets3tProperties ) - def writeContent(bucket: String, key: String, content: String): S3Object = { + def writeContent(bucket: String, key: String, content: String, contentType: String = "text/plain"): S3Object = { val obj = new S3Object(key, content) - obj.setContentType("text/plain") + obj.setContentType(contentType) service.putObject(bucket, obj) } From 8f51a86897eb401c9190a8e1fbc2e40359e8a678 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 12 Jun 2015 16:27:35 -0300 Subject: [PATCH 010/268] Added content encoding --- src/main/scala/ignition/core/utils/S3Client.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala index fe509a4b..b806b376 100644 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ b/src/main/scala/ignition/core/utils/S3Client.scala @@ -40,9 +40,13 @@ class S3Client { service.listObjectsChunked(bucket, key, null, 99999L, null, true).getObjects } - def copyFile(sourceBucket: String, sourceKey: String, destBucket: String, destKey: 
String, destContentType: Option[String] = None): Unit = { + def copyFile(sourceBucket: String, sourceKey: String, + destBucket: String, destKey: String, + destContentType: Option[String] = None, + destContentEncoding: Option[String] = None): Unit = { val destFile = new S3Object(destKey) - val replaceMetaData = destContentType.isDefined + val replaceMetaData = destContentType.isDefined || destContentEncoding.isDefined + destContentEncoding.foreach(encoding => destFile.setContentEncoding(encoding)) destContentType.foreach(contentType => destFile.setContentType(contentType)) service.copyObject(sourceBucket, sourceKey, destBucket, destFile, replaceMetaData) } From c752d9379edc91e37c261eed5610dfe09a3a06bf Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 18 Jun 2015 20:01:26 -0300 Subject: [PATCH 011/268] Upgraded scalatest --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index be7e1b12..c4723faf 100644 --- a/build.sbt +++ b/build.sbt @@ -17,7 +17,7 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.1.3" +libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.2.4" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" From 59f818da5aedc7dd919eca2d6e58f21208672316 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 23 Jun 2015 18:01:01 -0300 Subject: [PATCH 012/268] Added removeEmpty to Maps --- src/main/scala/ignition/core/utils/CollectionUtils.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index 27977270..5994b153 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -103,4 +103,10 @@ object 
CollectionUtils { .toList } } + + + implicit class CollectionMap[K, V <: TraversableOnce[Any]](map: Map[K, V]) { + def removeEmpty(): Map[K, V] = + map.filter { case (k, v) => !v.isEmpty } + } } From 842ca9dba49ed76ddedb4990779928b01d46cfc3 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 23 Jun 2015 18:01:47 -0300 Subject: [PATCH 013/268] Added removeEmpty to Maps --- src/main/scala/ignition/core/utils/CollectionUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index 5994b153..52828ca7 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -107,6 +107,6 @@ object CollectionUtils { implicit class CollectionMap[K, V <: TraversableOnce[Any]](map: Map[K, V]) { def removeEmpty(): Map[K, V] = - map.filter { case (k, v) => !v.isEmpty } + map.filter { case (k, v) => v.nonEmpty } } } From d05f836d8967657fd6df96293d65e013f45861e5 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 25 Jun 2015 09:34:14 -0300 Subject: [PATCH 014/268] exclude slf4j-log4j12 backend --- build.sbt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index c4723faf..7eb2bffe 100644 --- a/build.sbt +++ b/build.sbt @@ -13,7 +13,9 @@ ideaExcludeFolders += ".idea_modules" // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided").exclude("org.apache.hadoop", "hadoop-client") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided") + .exclude("org.apache.hadoop", "hadoop-client") + .exclude("org.slf4j", "slf4j-log4j12") libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") From c9abcd53eb2d839d61c017e0b574e00be911be4c Mon Sep 17 00:00:00 2001 
From: sisso Date: Tue, 30 Jun 2015 14:41:15 -0300 Subject: [PATCH 015/268] added method that allow to map future using success/failure --- .../ignition/core/utils/FutureUtils.scala | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 81b0490e..41cf75a3 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,7 +1,7 @@ package ignition.core.utils import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} -import scala.util.{Failure, Success} +import scala.util.{Failure, Success, Try} object FutureUtils { @@ -11,6 +11,23 @@ object FutureUtils { def toOptionOnFailure(errorHandler: (Throwable) => Option[V])(implicit ec: ExecutionContext): Future[Option[V]] = { future.map(Option.apply).recover { case t => errorHandler(t) } } + + /** + * Appear to be redundant. But its the only way to map a future with + * Success and Failure in same algorithm without split it to use map/recover + * or transform. 
+ * + * future.asTry.map { case Success(v) => 1; case Failure(e) => 0 } + * + * instead + * + * future.map(i=>1).recover(case _: Exception => 0) + * future.transform(=> 1, => 0) + * + */ + def asTry()(implicit ec: ExecutionContext) : Future[Try[V]] = { + future.map(v => Success(v)).recover { case e: Exception => Failure(e) } + } } implicit class FutureGeneratorImprovements[V](generator: Iterable[() => Future[V]]){ From 48f4e2c2798c3d5a2262234ae001ae6cb5ad5cc6 Mon Sep 17 00:00:00 2001 From: sisso Date: Tue, 7 Jul 2015 18:09:12 -0300 Subject: [PATCH 016/268] change catch to NonFatal --- src/main/scala/ignition/core/utils/FutureUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 41cf75a3..95b44c2f 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,6 +1,7 @@ package ignition.core.utils import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} +import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} object FutureUtils { @@ -22,11 +23,10 @@ object FutureUtils { * instead * * future.map(i=>1).recover(case _: Exception => 0) - * future.transform(=> 1, => 0) * */ def asTry()(implicit ec: ExecutionContext) : Future[Try[V]] = { - future.map(v => Success(v)).recover { case e: Exception => Failure(e) } + future.map(v => Success(v)).recover { case NonFatal(e) => Failure(e) } } } From bab487acfb4e74c7115a764d5774f5eefaa40630 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 13 Aug 2015 15:54:42 -0300 Subject: [PATCH 017/268] attempt to update spark_ec2.py --- tools/spark-ec2/spark_ec2.py | 708 ++++++++++++++++++++++++----------- 1 file changed, 482 insertions(+), 226 deletions(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index a608f9ce..8cc44d30 100755 --- 
a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -19,9 +19,11 @@ # limitations under the License. # -from __future__ import with_statement +from __future__ import division, print_function, with_statement +import codecs import hashlib +import itertools import logging import os import os.path @@ -36,13 +38,20 @@ import tempfile import textwrap import time -import urllib2 import warnings from datetime import datetime from optparse import OptionParser from sys import stderr -SPARK_EC2_VERSION = "1.3.0" +if sys.version < "3": + from urllib2 import urlopen, Request, HTTPError +else: + from urllib.request import urlopen, Request + from urllib.error import HTTPError + raw_input = input + xrange = range + +SPARK_EC2_VERSION = "1.4.0" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -60,14 +69,84 @@ "1.2.0", "1.2.1", "1.3.0", + "1.3.1", + "1.4.0", ]) +SPARK_TACHYON_MAP = { + "1.0.0": "0.4.1", + "1.0.1": "0.4.1", + "1.0.2": "0.4.1", + "1.1.0": "0.5.0", + "1.1.1": "0.5.0", + "1.2.0": "0.5.0", + "1.2.1": "0.5.0", + "1.3.0": "0.5.0", + "1.3.1": "0.5.0", + "1.4.0": "0.6.4", +} + DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark" # Default location to get the spark-ec2 scripts (and ami-list) from -DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/mesos/spark-ec2" -DEFAULT_SPARK_EC2_BRANCH = "branch-1.3" +DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/amplab/spark-ec2" +DEFAULT_SPARK_EC2_BRANCH = "branch-1.4" + + +def setup_external_libs(libs): + """ + Download external libraries from PyPI to SPARK_EC2_DIR/lib/ and prepend them to our PATH. 
+ """ + PYPI_URL_PREFIX = "https://pypi.python.org/packages/source" + SPARK_EC2_LIB_DIR = os.path.join(SPARK_EC2_DIR, "lib") + + if not os.path.exists(SPARK_EC2_LIB_DIR): + print("Downloading external libraries that spark-ec2 needs from PyPI to {path}...".format( + path=SPARK_EC2_LIB_DIR + )) + print("This should be a one-time operation.") + os.mkdir(SPARK_EC2_LIB_DIR) + + for lib in libs: + versioned_lib_name = "{n}-{v}".format(n=lib["name"], v=lib["version"]) + lib_dir = os.path.join(SPARK_EC2_LIB_DIR, versioned_lib_name) + + if not os.path.isdir(lib_dir): + tgz_file_path = os.path.join(SPARK_EC2_LIB_DIR, versioned_lib_name + ".tar.gz") + print(" - Downloading {lib}...".format(lib=lib["name"])) + download_stream = urlopen( + "{prefix}/{first_letter}/{lib_name}/{lib_name}-{lib_version}.tar.gz".format( + prefix=PYPI_URL_PREFIX, + first_letter=lib["name"][:1], + lib_name=lib["name"], + lib_version=lib["version"] + ) + ) + with open(tgz_file_path, "wb") as tgz_file: + tgz_file.write(download_stream.read()) + with open(tgz_file_path, "rb") as tar: + if hashlib.md5(tar.read()).hexdigest() != lib["md5"]: + print("ERROR: Got wrong md5sum for {lib}.".format(lib=lib["name"]), file=stderr) + sys.exit(1) + tar = tarfile.open(tgz_file_path) + tar.extractall(path=SPARK_EC2_LIB_DIR) + tar.close() + os.remove(tgz_file_path) + print(" - Finished downloading {lib}.".format(lib=lib["name"])) + sys.path.insert(1, lib_dir) + + +# Only PyPI libraries are supported. 
+external_libs = [ + { + "name": "boto", + "version": "2.34.0", + "md5": "5556223d2d0cc4d06dd4829e671dcecd" + } +] + +setup_external_libs(external_libs) import boto from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType @@ -107,7 +186,7 @@ def parse_args(): help="Master instance type (leave empty for same as instance-type)") parser.add_option( "-r", "--region", default="us-east-1", - help="EC2 region zone to launch instances in") + help="EC2 region used to launch instances in, or to find them in (default: %default)") parser.add_option( "-z", "--zone", default="", help="Availability zone to launch instances in, or 'all' to spread " + @@ -133,9 +212,19 @@ def parse_args(): "--spark-ec2-git-branch", default=DEFAULT_SPARK_EC2_BRANCH, help="Github repo branch of spark-ec2 to use (default: %default)") + parser.add_option( + "--deploy-root-dir", + default=None, + help="A directory to copy into / on the first master. " + + "Must be absolute. Note that a trailing slash is handled as per rsync: " + + "If you omit it, the last directory of the --deploy-root-dir path will be created " + + "in / before copying its contents. If you append the trailing slash, " + + "the directory is not created and its contents are copied directly into /. " + + "(default: %default).") parser.add_option( "--hadoop-major-version", default="1", - help="Major version of Hadoop (default: %default)") + help="Major version of Hadoop. Valid options are 1 (Hadoop 1.0.4), 2 (CDH 4.7.1), yarn " + + "(Hadoop 2.4.0) (default: %default)") parser.add_option( "-D", metavar="[ADDRESS:]PORT", dest="proxy_port", help="Use SSH dynamic port forwarding to create a SOCKS proxy at " + @@ -155,7 +244,7 @@ def parse_args(): help="Number of EBS volumes to attach to each node as /vol[x]. " + "The volumes will be deleted when the instances terminate. " + "Only possible on EBS-backed AMIs. " + - "EBS volumes are only attached if --ebs-vol-size > 0." 
+ + "EBS volumes are only attached if --ebs-vol-size > 0. " + "Only support up to 8 EBS volumes.") parser.add_option( "--placement-group", type="string", default=None, @@ -187,14 +276,15 @@ def parse_args(): help="Launch fresh slaves, but use an existing stopped master if possible") parser.add_option( "--worker-instances", type="int", default=1, - help="Number of instances per worker: variable SPARK_WORKER_INSTANCES (default: %default)") + help="Number of instances per worker: variable SPARK_WORKER_INSTANCES. Not used if YARN " + + "is used as Hadoop major version (default: %default)") parser.add_option( "--master-opts", type="string", default="", help="Extra options to give to master through SPARK_MASTER_OPTS variable " + "(e.g -Dspark.worker.timeout=180)") parser.add_option( "--user-data", type="string", default="", - help="Path to a user-data file (most AMI's interpret this as an initialization script)") + help="Path to a user-data file (most AMIs interpret this as an initialization script)") parser.add_option( "--security-group-prefix", type="string", default=None, help="Use this prefix for the security group rather than the cluster name.") @@ -204,6 +294,10 @@ def parse_args(): parser.add_option( "--additional-security-group", type="string", default="", help="Additional security group to place the machines in") + parser.add_option( + "--additional-tags", type="string", default="", + help="Additional tags to set on the machines; tags are comma-separated, while name and " + + "value are colon separated; ex: \"Task:MySparkProject,Env:production\"") parser.add_option( "--copy-aws-credentials", action="store_true", default=False, help="Add AWS credentials to hadoop configuration to allow Spark to access S3") @@ -216,6 +310,17 @@ def parse_args(): parser.add_option( "--spot-timeout", type="int", default=45, help="Maximum amount of time (in minutes) to wait for spot requests to be fulfilled") + parser.add_option( + "--private-ips", action="store_true", default=False, 
+ help="Use private IPs for instances rather than public if VPC/subnet " + + "requires that.") + parser.add_option( + "--instance-initiated-shutdown-behavior", default="terminate", + choices=["stop", "terminate"], + help="Whether instances should terminate when shut down or just stop") + parser.add_option( + "--instance-profile-name", default=None, + help="IAM profile name to launch instances under") (opts, args) = parser.parse_args() if len(args) != 2: @@ -228,14 +333,16 @@ def parse_args(): home_dir = os.getenv('HOME') if home_dir is None or not os.path.isfile(home_dir + '/.boto'): if not os.path.isfile('/etc/boto.cfg'): - if os.getenv('AWS_ACCESS_KEY_ID') is None: - print >> stderr, ("ERROR: The environment variable AWS_ACCESS_KEY_ID " + - "must be set") - sys.exit(1) - if os.getenv('AWS_SECRET_ACCESS_KEY') is None: - print >> stderr, ("ERROR: The environment variable AWS_SECRET_ACCESS_KEY " + - "must be set") - sys.exit(1) + # If there is no boto config, check aws credentials + if not os.path.isfile(home_dir + '/.aws/credentials'): + if os.getenv('AWS_ACCESS_KEY_ID') is None: + print("ERROR: The environment variable AWS_ACCESS_KEY_ID must be set", + file=stderr) + sys.exit(1) + if os.getenv('AWS_SECRET_ACCESS_KEY') is None: + print("ERROR: The environment variable AWS_SECRET_ACCESS_KEY must be set", + file=stderr) + sys.exit(1) return (opts, action, cluster_name) @@ -246,7 +353,7 @@ def get_or_make_group(conn, name, vpc_id): if len(group) > 0: return group[0] else: - print "Creating security group " + name + print("Creating security group " + name) return conn.create_security_group(name, "Spark EC2 group", vpc_id) def check_if_http_resource_exists(resource): @@ -270,12 +377,12 @@ def get_validate_spark_version(version, repo): if check_if_http_resource_exists: return version else: - print >> stderr, "Unable to validate pre-built spark version {version}".format(version=version) + print("Unable to validate pre-built spark version 
{version}".format(version=version), file=stderr) sys.exit(1) elif "." in version: version = version.replace("v", "") if version not in VALID_SPARK_VERSIONS: - print >> stderr, "Don't know about Spark version: {v}".format(v=version) + print("Don't know about Spark version: {v}".format(v=version), file=stderr) sys.exit(1) return version else: @@ -288,84 +395,93 @@ def get_validate_spark_version(version, repo): return version -# Check whether a given EC2 instance object is in a state we consider active, -# i.e. not terminating or terminated. We count both stopping and stopped as -# active since we can restart stopped clusters. -def is_active(instance): - return (instance.state in ['pending', 'running', 'stopping', 'stopped']) - - # Source: http://aws.amazon.com/amazon-linux-ami/instance-type-matrix/ -# Last Updated: 2014-06-20 +# Last Updated: 2015-06-19 # For easy maintainability, please keep this manually-inputted dictionary sorted by key. EC2_INSTANCE_TYPES = { "c1.medium": "pvm", "c1.xlarge": "pvm", + "c3.large": "pvm", + "c3.xlarge": "pvm", "c3.2xlarge": "pvm", "c3.4xlarge": "pvm", "c3.8xlarge": "pvm", - "c3.large": "pvm", - "c3.xlarge": "pvm", + "c4.large": "hvm", + "c4.xlarge": "hvm", + "c4.2xlarge": "hvm", + "c4.4xlarge": "hvm", + "c4.8xlarge": "hvm", "cc1.4xlarge": "hvm", "cc2.8xlarge": "hvm", "cg1.4xlarge": "hvm", "cr1.8xlarge": "hvm", + "d2.xlarge": "hvm", + "d2.2xlarge": "hvm", + "d2.4xlarge": "hvm", + "d2.8xlarge": "hvm", + "g2.2xlarge": "hvm", + "g2.8xlarge": "hvm", "hi1.4xlarge": "pvm", "hs1.8xlarge": "pvm", + "i2.xlarge": "hvm", "i2.2xlarge": "hvm", "i2.4xlarge": "hvm", "i2.8xlarge": "hvm", - "i2.xlarge": "hvm", - "m1.large": "pvm", - "m1.medium": "pvm", "m1.small": "pvm", + "m1.medium": "pvm", + "m1.large": "pvm", "m1.xlarge": "pvm", + "m2.xlarge": "pvm", "m2.2xlarge": "pvm", "m2.4xlarge": "pvm", - "m2.xlarge": "pvm", - "m3.2xlarge": "hvm", - "m3.large": "hvm", "m3.medium": "hvm", + "m3.large": "hvm", "m3.xlarge": "hvm", + "m3.2xlarge": "hvm", + 
"m4.large": "hvm", + "m4.xlarge": "hvm", + "m4.2xlarge": "hvm", + "m4.4xlarge": "hvm", + "m4.10xlarge": "hvm", + "r3.large": "hvm", + "r3.xlarge": "hvm", "r3.2xlarge": "hvm", "r3.4xlarge": "hvm", "r3.8xlarge": "hvm", - "r3.large": "hvm", - "r3.xlarge": "hvm", "t1.micro": "pvm", - "t2.medium": "hvm", "t2.micro": "hvm", "t2.small": "hvm", - "d2.2xlarge": "hvm", - "d2.4xlarge": "hvm", - "d2.8xlarge": "hvm", - "d2.large": "hvm", - "d2.xlarge": "hvm", + "t2.medium": "hvm", + "t2.large": "hvm", } +def get_tachyon_version(spark_version): + return SPARK_TACHYON_MAP.get(spark_version, "") + + # Attempt to resolve an appropriate AMI given the architecture and region of the request. -def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branch): - if instance_type in EC2_INSTANCE_TYPES: - instance_type = EC2_INSTANCE_TYPES[instance_type] +def get_spark_ami(opts): + if opts.instance_type in EC2_INSTANCE_TYPES: + instance_type = EC2_INSTANCE_TYPES[opts.instance_type] else: instance_type = "pvm" - print >> stderr,\ - "Don't recognize %s, assuming type is pvm" % instance_type + print("Don't recognize %s, assuming type is pvm" % opts.instance_type, file=stderr) # URL prefix from which to fetch AMI information ami_prefix = "{r}/{b}/ami-list".format( - r=spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), - b=spark_ec2_git_branch) + r=opts.spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), + b=opts.spark_ec2_git_branch) - ami_path = "%s/%s/%s" % (ami_prefix, region, instance_type) + ami_path = "%s/%s/%s" % (ami_prefix, opts.region, instance_type) + reader = codecs.getreader("ascii") try: - ami = urllib2.urlopen(ami_path).read().strip() - print "Spark AMI for %s: %s" % (instance_type, ami) + ami = reader(urlopen(ami_path)).read().strip() except: - print >> stderr, "Could not resolve AMI at: " + ami_path + print("Could not resolve AMI at: " + ami_path, file=stderr) sys.exit(1) + print("Spark AMI: " + ami) 
return ami @@ -375,11 +491,11 @@ def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branc # Fails if there already instances running in the cluster's groups. def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: - print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." + print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr) sys.exit(1) if opts.key_pair is None: - print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." + print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr) sys.exit(1) user_data_content = None @@ -387,7 +503,7 @@ def launch_cluster(conn, opts, cluster_name): with open(opts.user_data) as user_data_file: user_data_content = user_data_file.read() - print "Setting up security groups..." + print("Setting up security groups...") if opts.security_group_prefix is None: master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id) slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id) @@ -421,6 +537,17 @@ def launch_cluster(conn, opts, cluster_name): master_group.authorize('tcp', 50070, 50070, authorized_address) master_group.authorize('tcp', 60070, 60070, authorized_address) master_group.authorize('tcp', 4040, 4045, authorized_address) + # Rstudio (GUI for R) needs port 8787 for web access + master_group.authorize('tcp', 8787, 8787, authorized_address) + # HDFS NFS gateway requires 111,2049,4242 for tcp & udp + master_group.authorize('tcp', 111, 111, authorized_address) + master_group.authorize('udp', 111, 111, authorized_address) + master_group.authorize('tcp', 2049, 2049, authorized_address) + master_group.authorize('udp', 2049, 2049, authorized_address) + master_group.authorize('tcp', 4242, 4242, authorized_address) + master_group.authorize('udp', 4242, 4242, authorized_address) + # RM in YARN mode uses 8088 + master_group.authorize('tcp', 8088, 8088, 
authorized_address) if opts.ganglia: master_group.authorize('tcp', 5080, 5080, authorized_address) if slave_group.rules == []: # Group was just now created @@ -451,13 +578,13 @@ def launch_cluster(conn, opts, cluster_name): existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): - print >> stderr, ("ERROR: There are already instances running in " + - "group %s or %s" % (master_group.name, slave_group.name)) + print("ERROR: There are already instances running in group %s or %s" % + (master_group.name, slave_group.name), file=stderr) sys.exit(1) # Figure out Spark AMI if opts.ami is None: - opts.ami = get_spark_ami(opts.instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) + opts.ami = get_spark_ami(opts) if opts.master_ami is None: opts.master_ami = get_spark_ami(opts.master_instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) @@ -468,12 +595,12 @@ def launch_cluster(conn, opts, cluster_name): additional_group_ids = [sg.id for sg in conn.get_all_security_groups() if opts.additional_security_group in (sg.name, sg.id)] - print "Launching instances..." 
+ print("Launching instances...") try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: - print >> stderr, "Could not find AMI " + opts.ami + print("Could not find AMI " + opts.ami, file=stderr) sys.exit(1) try: @@ -502,8 +629,8 @@ def launch_cluster(conn, opts, cluster_name): # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price - print ("Requesting %d slaves as spot instances with price $%.3f" % - (opts.slaves, opts.spot_price)) + print("Requesting %d slaves as spot instances with price $%.3f" % + (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 @@ -522,12 +649,13 @@ def launch_cluster(conn, opts, cluster_name): block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, - user_data=user_data_content) + user_data=user_data_content, + instance_profile_name=opts.instance_profile_name) my_req_ids += [req.id for req in slave_reqs] i += 1 start_time = datetime.now() - print "Waiting for spot instances to be granted... Request IDs: %s " % my_req_ids + print("Waiting for spot instances to be granted... 
Request IDs: %s " % my_req_ids) try: while True: time.sleep(10) @@ -539,28 +667,28 @@ def launch_cluster(conn, opts, cluster_name): raise Exception("Invalid state for spot request: %s - status: %s" % (invalid[0].id, invalid[0].status.message)) if len(active_instance_ids) == opts.slaves: - print "All %d slaves granted" % opts.slaves + print("All %d slaves granted" % opts.slaves) reservations = conn.get_all_reservations([r.instance_id for r in active_instance_ids]) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: - print "%d of %d slaves granted, waiting longer" % ( - len(active_instance_ids), opts.slaves) + print("%d of %d slaves granted, waiting longer" % ( + len(active_instance_ids), opts.slaves)) if (datetime.now() - start_time).seconds > opts.spot_timeout * 60: raise Exception("Timed out while waiting for spot instances") except: - print "Error: %s" % sys.exc_info()[1] - print "Canceling spot instance requests" + print("Error: %s" % sys.exc_info()[1]) + print("Canceling spot instance requests") conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: - print >> stderr, ("WARNING: %d instances are still running" % running) + print(("WARNING: %d instances are still running" % running), file=stderr) sys.exit(0) else: # Launch non-spot instances @@ -571,24 +699,30 @@ def launch_cluster(conn, opts, cluster_name): for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: - slave_res = image.run(key_name=opts.key_pair, - security_group_ids=[slave_group.id] + additional_group_ids, - instance_type=opts.instance_type, - placement=zone, - min_count=num_slaves_this_zone, - max_count=num_slaves_this_zone, - block_device_map=block_map, - subnet_id=opts.subnet_id, - 
placement_group=opts.placement_group, - user_data=user_data_content) + slave_res = image.run( + key_name=opts.key_pair, + security_group_ids=[slave_group.id] + additional_group_ids, + instance_type=opts.instance_type, + placement=zone, + min_count=num_slaves_this_zone, + max_count=num_slaves_this_zone, + block_device_map=block_map, + subnet_id=opts.subnet_id, + placement_group=opts.placement_group, + user_data=user_data_content, + instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, + instance_profile_name=opts.instance_profile_name) slave_nodes += slave_res.instances - print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, - zone, slave_res.id) + print("Launched {s} slave{plural_s} in {z}, regid = {r}".format( + s=num_slaves_this_zone, + plural_s=('' if num_slaves_this_zone == 1 else 's'), + z=zone, + r=slave_res.id)) i += 1 # Launch or resume masters if existing_masters: - print "Starting master..." + print("Starting master...") for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() @@ -599,72 +733,92 @@ def launch_cluster(conn, opts, cluster_name): master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name - master_res = master_image.run(key_name=opts.key_pair, - security_group_ids=[master_group.id] + additional_group_ids, - instance_type=master_type, - placement=opts.zone, - min_count=1, - max_count=1, - block_device_map=block_map, - subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content) + master_res = master_image.run( + key_name=opts.key_pair, + security_group_ids=[master_group.id] + additional_group_ids, + instance_type=master_type, + placement=opts.zone, + min_count=1, + max_count=1, + block_device_map=block_map, + subnet_id=opts.subnet_id, + placement_group=opts.placement_group, + user_data=user_data_content, + 
instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, + instance_profile_name=opts.instance_profile_name) master_nodes = master_res.instances - print "Launched master in %s, regid = %s" % (zone, master_res.id) + print("Launched master in %s, regid = %s" % (zone, master_res.id)) # This wait time corresponds to SPARK-4983 - print "Waiting for AWS to propagate instance metadata..." + print("Waiting for AWS to propagate instance metadata...") time.sleep(5) - # Give the instances descriptive names + + # Give the instances descriptive names and set additional tags + additional_tags = {} + if opts.additional_tags.strip(): + additional_tags = dict( + map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',') + ) + for master in master_nodes: - master.add_tag( - key='Name', - value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) + master.add_tags( + dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) + ) + for slave in slave_nodes: - slave.add_tag( - key='Name', - value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) + slave.add_tags( + dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) + ) # Return all the instances return (master_nodes, slave_nodes) -# Get the EC2 instances in an existing cluster if available. -# Returns a tuple of lists of EC2 instance objects for the masters and slaves def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): - print "Searching for existing cluster " + cluster_name + "..." 
- reservations = conn.get_all_reservations() - master_nodes = [] - slave_nodes = [] - for res in reservations: - active = [i for i in res.instances if is_active(i)] - for inst in active: - group_names = [g.name for g in inst.groups] - if (cluster_name + "-master") in group_names: - master_nodes.append(inst) - elif (cluster_name + "-slaves") in group_names: - slave_nodes.append(inst) - if any((master_nodes, slave_nodes)): - print "Found %d master(s), %d slaves" % (len(master_nodes), len(slave_nodes)) - if master_nodes != [] or not die_on_error: - return (master_nodes, slave_nodes) - else: - if master_nodes == [] and slave_nodes != []: - print >> sys.stderr, "ERROR: Could not find master in group " + cluster_name + "-master" - else: - print >> sys.stderr, "ERROR: Could not find any existing cluster" + """ + Get the EC2 instances in an existing cluster if available. + Returns a tuple of lists of EC2 instance objects for the masters and slaves. + """ + print("Searching for existing cluster {c} in region {r}...".format( + c=cluster_name, r=opts.region)) + + def get_instances(group_names): + """ + Get all non-terminated instances that belong to any of the provided security groups. 
+ + EC2 reservation filters and instance states are documented here: + http://docs.aws.amazon.com/cli/latest/reference/ec2/describe-instances.html#options + """ + reservations = conn.get_all_reservations( + filters={"instance.group-name": group_names}) + instances = itertools.chain.from_iterable(r.instances for r in reservations) + return [i for i in instances if i.state not in ["shutting-down", "terminated"]] + + master_instances = get_instances([cluster_name + "-master"]) + slave_instances = get_instances([cluster_name + "-slaves"]) + + if any((master_instances, slave_instances)): + print("Found {m} master{plural_m}, {s} slave{plural_s}.".format( + m=len(master_instances), + plural_m=('' if len(master_instances) == 1 else 's'), + s=len(slave_instances), + plural_s=('' if len(slave_instances) == 1 else 's'))) + + if not master_instances and die_on_error: + print("ERROR: Could not find a master for cluster {c} in region {r}.".format( + c=cluster_name, r=opts.region), file=sys.stderr) sys.exit(1) + return (master_instances, slave_instances) + # Deploy configuration files and run setup scripts on a newly launched # or started EC2 cluster. - - def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): - master = master_nodes[0].public_dns_name + master = get_dns_name(master_nodes[0], opts.private_ips) if deploy_ssh_key: - print "Generating cluster's SSH key on master..." + print("Generating cluster's SSH key on master...") key_setup = """ [ -f ~/.ssh/id_rsa ] || (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa && @@ -672,24 +826,29 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): """ ssh(master, opts, key_setup) dot_ssh_tar = ssh_read(master, opts, ['tar', 'c', '.ssh']) - print "Transferring cluster's SSH key to slaves..." 
+ print("Transferring cluster's SSH key to slaves...") for slave in slave_nodes: - print slave.public_dns_name - ssh_write(slave.public_dns_name, opts, ['tar', 'x'], dot_ssh_tar) + slave_address = get_dns_name(slave, opts.private_ips) + print(slave_address) + ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar) modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', - 'mapreduce', 'spark-standalone', 'tachyon'] + 'mapreduce', 'spark-standalone', 'tachyon', 'rstudio'] if opts.hadoop_major_version == "1": - modules = filter(lambda x: x != "mapreduce", modules) + modules = list(filter(lambda x: x != "mapreduce", modules)) if opts.ganglia: modules.append('ganglia') + # Clear SPARK_WORKER_INSTANCES if running on YARN + if opts.hadoop_major_version == "yarn": + opts.worker_instances = "" + # NOTE: We should clone the repository before running deploy_files to # prevent ec2-variables.sh from being overwritten - print "Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format( - r=opts.spark_ec2_git_repo, b=opts.spark_ec2_git_branch) + print("Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format( + r=opts.spark_ec2_git_repo, b=opts.spark_ec2_git_branch)) ssh( host=master, opts=opts, @@ -699,7 +858,7 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): b=opts.spark_ec2_git_branch) ) - print "Deploying files to master..." + print("Deploying files to master...") deploy_files( conn=conn, root_dir=SPARK_EC2_DIR + "/" + "deploy.generic", @@ -709,18 +868,26 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): modules=modules ) - print "Running setup on master..." + if opts.deploy_root_dir is not None: + print("Deploying {s} to master...".format(s=opts.deploy_root_dir)) + deploy_user_files( + root_dir=opts.deploy_root_dir, + opts=opts, + master_nodes=master_nodes + ) + + print("Running setup on master...") setup_spark_cluster(master, opts) - print "Done!" 
+ print("Done!") def setup_spark_cluster(master, opts): ssh(master, opts, "chmod u+x spark-ec2/setup.sh") ssh(master, opts, "spark-ec2/setup.sh") - print "Spark standalone cluster started at http://%s:8080" % master + print("Spark standalone cluster started at http://%s:8080" % master) if opts.ganglia: - print "Ganglia started at http://%s:5080/ganglia" % master + print("Ganglia started at http://%s:5080/ganglia" % master) def is_ssh_available(host, opts, print_ssh_output=True): @@ -737,7 +904,7 @@ def is_ssh_available(host, opts, print_ssh_output=True): if s.returncode != 0 and print_ssh_output: # extra leading newline is for spacing in wait_for_cluster_state() - print textwrap.dedent("""\n + print(textwrap.dedent("""\n Warning: SSH connection error. (This could be temporary.) Host: {h} SSH return code: {r} @@ -746,7 +913,7 @@ def is_ssh_available(host, opts, print_ssh_output=True): h=host, r=s.returncode, o=cmd_output.strip() - ) + )) return s.returncode == 0 @@ -756,7 +923,8 @@ def is_cluster_ssh_available(cluster_instances, opts): Check if SSH is available on all the instances in a cluster. 
""" for i in cluster_instances: - if not is_ssh_available(host=i.ip_address, opts=opts): + dns_name = get_dns_name(i, opts.private_ips) + if not is_ssh_available(host=dns_name, opts=opts): return False else: return True @@ -786,7 +954,11 @@ def wait_for_cluster_state(conn, opts, cluster_instances, cluster_state): for i in cluster_instances: i.update() - statuses = conn.get_all_instance_status(instance_ids=[i.id for i in cluster_instances]) + max_batch = 100 + statuses = [] + for j in xrange(0, len(cluster_instances), max_batch): + batch = [i.id for i in cluster_instances[j:j + max_batch]] + statuses.extend(conn.get_all_instance_status(instance_ids=batch)) if cluster_state == 'ssh-ready': if all(i.state == 'running' for i in cluster_instances) and \ @@ -806,63 +978,78 @@ def wait_for_cluster_state(conn, opts, cluster_instances, cluster_state): sys.stdout.write("\n") end_time = datetime.now() - print "Cluster is now in '{s}' state. Waited {t} seconds.".format( + print("Cluster is now in '{s}' state. Waited {t} seconds.".format( s=cluster_state, t=(end_time - start_time).seconds - ) + )) # Get number of local disks available for a given EC2 instance type. def get_num_disks(instance_type): # Source: http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html - # Last Updated: 2014-06-20 + # Last Updated: 2015-06-19 # For easy maintainability, please keep this manually-inputted dictionary sorted by key. 
disks_by_instance = { "c1.medium": 1, "c1.xlarge": 4, + "c3.large": 2, + "c3.xlarge": 2, "c3.2xlarge": 2, "c3.4xlarge": 2, "c3.8xlarge": 2, - "c3.large": 2, - "c3.xlarge": 2, + "c4.large": 0, + "c4.xlarge": 0, + "c4.2xlarge": 0, + "c4.4xlarge": 0, + "c4.8xlarge": 0, "cc1.4xlarge": 2, "cc2.8xlarge": 4, "cg1.4xlarge": 2, "cr1.8xlarge": 2, + "d2.xlarge": 3, + "d2.2xlarge": 6, + "d2.4xlarge": 12, + "d2.8xlarge": 24, "g2.2xlarge": 1, + "g2.8xlarge": 2, "hi1.4xlarge": 2, "hs1.8xlarge": 24, + "i2.xlarge": 1, "i2.2xlarge": 2, "i2.4xlarge": 4, "i2.8xlarge": 8, - "i2.xlarge": 1, - "m1.large": 2, - "m1.medium": 1, "m1.small": 1, + "m1.medium": 1, + "m1.large": 2, "m1.xlarge": 4, + "m2.xlarge": 1, "m2.2xlarge": 1, "m2.4xlarge": 2, - "m2.xlarge": 1, - "m3.2xlarge": 2, - "m3.large": 1, "m3.medium": 1, + "m3.large": 1, "m3.xlarge": 2, + "m3.2xlarge": 2, + "m4.large": 0, + "m4.xlarge": 0, + "m4.2xlarge": 0, + "m4.4xlarge": 0, + "m4.10xlarge": 0, + "r3.large": 1, + "r3.xlarge": 1, "r3.2xlarge": 1, "r3.4xlarge": 1, "r3.8xlarge": 2, - "r3.large": 1, - "r3.xlarge": 1, "t1.micro": 0, - 'd2.xlarge': 3, - 'd2.2xlarge': 6, - 'd2.4xlarge': 12, - 'd2.8xlarge': 24, + "t2.micro": 0, + "t2.small": 0, + "t2.medium": 0, + "t2.large": 0, } if instance_type in disks_by_instance: return disks_by_instance[instance_type] else: - print >> stderr, ("WARNING: Don't know number of disks on instance type %s; assuming 1" - % instance_type) + print("WARNING: Don't know number of disks on instance type %s; assuming 1" + % instance_type, file=stderr) return 1 @@ -874,7 +1061,7 @@ def get_num_disks(instance_type): # # root_dir should be an absolute path to the directory with the files we want to deploy. 
def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): - active_master = master_nodes[0].public_dns_name + active_master = get_dns_name(master_nodes[0], opts.private_ips) num_disks = get_num_disks(opts.instance_type) hdfs_data_dirs = "/mnt/ephemeral-hdfs/data" @@ -891,17 +1078,27 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): if opts.spark_version.startswith("http"): # Custom pre-built spark package spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) + tachyon_v = "" + print("Deploying Spark via custom bunlde; Tachyon won't be set up") + modules = filter(lambda x: x != "tachyon", modules) elif "." in opts.spark_version: # Pre-built Spark deploy spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) + tachyon_v = get_tachyon_version(spark_v) else: # Spark-only custom deploy spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version) + tachyon_v = "" + print("Deploying Spark via git hash; Tachyon won't be set up") + modules = filter(lambda x: x != "tachyon", modules) + master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes] + slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes] + worker_instances_str = "%d" % opts.worker_instances if opts.worker_instances else "" template_vars = { - "master_list": '\n'.join([i.public_dns_name for i in master_nodes]), + "master_list": '\n'.join(master_addresses), "active_master": active_master, - "slave_list": '\n'.join([i.public_dns_name for i in slave_nodes]), + "slave_list": '\n'.join(slave_addresses), "cluster_url": cluster_url, "hdfs_data_dirs": hdfs_data_dirs, "mapred_local_dirs": mapred_local_dirs, @@ -909,8 +1106,9 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): "swap": str(opts.swap), "modules": '\n'.join(modules), "spark_version": spark_v, + "tachyon_version": tachyon_v, "hadoop_major_version": opts.hadoop_major_version, - "spark_worker_instances": 
"%d" % opts.worker_instances, + "spark_worker_instances": worker_instances_str, "spark_master_opts": opts.master_opts } @@ -953,6 +1151,23 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): shutil.rmtree(tmp_dir) +# Deploy a given local directory to a cluster, WITHOUT parameter substitution. +# Note that unlike deploy_files, this works for binary files. +# Also, it is up to the user to add (or not) the trailing slash in root_dir. +# Files are only deployed to the first master instance in the cluster. +# +# root_dir should be an absolute path. +def deploy_user_files(root_dir, opts, master_nodes): + active_master = get_dns_name(master_nodes[0], opts.private_ips) + command = [ + 'rsync', '-rv', + '-e', stringify_command(ssh_command(opts)), + "%s" % root_dir, + "%s@%s:/" % (opts.user, active_master) + ] + subprocess.check_call(command) + + def stringify_command(parts): if isinstance(parts, str): return parts @@ -986,13 +1201,13 @@ def ssh(host, opts, command): # If this was an ssh failure, provide the user with hints. if e.returncode == 255: raise UsageError( - "Failed to SSH to remote host {0}.\n" + - "Please check that you have provided the correct --identity-file and " + + "Failed to SSH to remote host {0}.\n" + "Please check that you have provided the correct --identity-file and " "--key-pair parameters and try again.".format(host)) else: raise e - print >> stderr, \ - "Error executing remote command, retrying after 30 seconds: {0}".format(e) + print("Error executing remote command, retrying after 30 seconds: {0}".format(e), + file=stderr) time.sleep(30) tries = tries + 1 @@ -1031,8 +1246,8 @@ def ssh_write(host, opts, command, arguments): elif tries > 5: raise RuntimeError("ssh_write failed with error %s" % proc.returncode) else: - print >> stderr, \ - "Error {0} while executing remote command, retrying after 30 seconds".format(status) + print("Error {0} while executing remote command, retrying after 30 seconds". 
+ format(status), file=stderr) time.sleep(30) tries = tries + 1 @@ -1048,12 +1263,26 @@ def get_zones(conn, opts): # Gets the number of items in a partition def get_partition(total, num_partitions, current_partitions): - num_slaves_this_zone = total / num_partitions + num_slaves_this_zone = total // num_partitions if (total % num_partitions) - current_partitions > 0: num_slaves_this_zone += 1 return num_slaves_this_zone +# Gets the IP address, taking into account the --private-ips flag +def get_ip_address(instance, private_ips=False): + ip = instance.ip_address if not private_ips else \ + instance.private_ip_address + return ip + + +# Gets the DNS name, taking into account the --private-ips flag +def get_dns_name(instance, private_ips=False): + dns = instance.public_dns_name if not private_ips else \ + instance.private_ip_address + return dns + + def real_main(): (opts, action, cluster_name) = parse_args() @@ -1072,28 +1301,28 @@ def real_main(): if opts.identity_file is not None: if not os.path.exists(opts.identity_file): - print >> stderr,\ - "ERROR: The identity file '{f}' doesn't exist.".format(f=opts.identity_file) + print("ERROR: The identity file '{f}' doesn't exist.".format(f=opts.identity_file), + file=stderr) sys.exit(1) file_mode = os.stat(opts.identity_file).st_mode if not (file_mode & S_IRUSR) or not oct(file_mode)[-2:] == '00': - print >> stderr, "ERROR: The identity file must be accessible only by you." 
- print >> stderr, 'You can fix this with: chmod 400 "{f}"'.format(f=opts.identity_file) + print("ERROR: The identity file must be accessible only by you.", file=stderr) + print('You can fix this with: chmod 400 "{f}"'.format(f=opts.identity_file), + file=stderr) sys.exit(1) if opts.instance_type not in EC2_INSTANCE_TYPES: - print >> stderr, "Warning: Unrecognized EC2 instance type for instance-type: {t}".format( - t=opts.instance_type) + print("Warning: Unrecognized EC2 instance type for instance-type: {t}".format( + t=opts.instance_type), file=stderr) if opts.master_instance_type != "": if opts.master_instance_type not in EC2_INSTANCE_TYPES: - print >> stderr, \ - "Warning: Unrecognized EC2 instance type for master-instance-type: {t}".format( - t=opts.master_instance_type) + print("Warning: Unrecognized EC2 instance type for master-instance-type: {t}".format( + t=opts.master_instance_type), file=stderr) if opts.ebs_vol_num > 8: - print >> stderr, "ebs-vol-num cannot be greater than 8" + print("ebs-vol-num cannot be greater than 8", file=stderr) sys.exit(1) # Prevent breaking ami_prefix (/, .git and startswith checks) @@ -1102,15 +1331,22 @@ def real_main(): opts.spark_ec2_git_repo.endswith(".git") or \ not opts.spark_ec2_git_repo.startswith("https://github.com") or \ not opts.spark_ec2_git_repo.endswith("spark-ec2"): - print >> stderr, "spark-ec2-git-repo must be a github repo and it must not have a " \ - "trailing / or .git. " \ - "Furthermore, we currently only support forks named spark-ec2." + print("spark-ec2-git-repo must be a github repo and it must not have a trailing / or .git. 
" + "Furthermore, we currently only support forks named spark-ec2.", file=stderr) + sys.exit(1) + + if not (opts.deploy_root_dir is None or + (os.path.isabs(opts.deploy_root_dir) and + os.path.isdir(opts.deploy_root_dir) and + os.path.exists(opts.deploy_root_dir))): + print("--deploy-root-dir must be an absolute path to a directory that exists " + "on the local file system", file=stderr) sys.exit(1) try: conn = ec2.connect_to_region(opts.region) except Exception as e: - print >> stderr, (e) + print((e), file=stderr) sys.exit(1) # Select an AZ at random if it was not specified. @@ -1119,7 +1355,7 @@ def real_main(): if action == "launch": if opts.slaves <= 0: - print >> sys.stderr, "ERROR: You have to start at least 1 slave" + print("ERROR: You have to start at least 1 slave", file=sys.stderr) sys.exit(1) if opts.resume: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) @@ -1134,26 +1370,27 @@ def real_main(): setup_cluster(conn, master_nodes, slave_nodes, opts, True) elif action == "destroy": - print "Are you sure you want to destroy the cluster %s?" % cluster_name - print "The following instances will be terminated:" (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) - for inst in master_nodes + slave_nodes: - print "> %s" % inst.public_dns_name - msg = "ALL DATA ON ALL NODES WILL BE LOST!!\nDestroy cluster %s (y/N): " % cluster_name + if any(master_nodes + slave_nodes): + print("The following instances will be terminated:") + for inst in master_nodes + slave_nodes: + print("> %s" % get_dns_name(inst, opts.private_ips)) + print("ALL DATA ON ALL NODES WILL BE LOST!!") + + msg = "Are you sure you want to destroy the cluster {c}? (y/N) ".format(c=cluster_name) response = raw_input(msg) if response == "y": - print "Terminating master..." + print("Terminating master...") for inst in master_nodes: inst.terminate() - print "Terminating slaves..." 
+ print("Terminating slaves...") for inst in slave_nodes: inst.terminate() # Delete security groups as well if opts.delete_groups: - print "Deleting security groups (this will take some time)..." group_names = [cluster_name + "-master", cluster_name + "-slaves"] wait_for_cluster_state( conn=conn, @@ -1161,15 +1398,16 @@ def real_main(): cluster_instances=(master_nodes + slave_nodes), cluster_state='terminated' ) + print("Deleting security groups (this will take some time)...") attempt = 1 while attempt <= 3: - print "Attempt %d" % attempt + print("Attempt %d" % attempt) groups = [g for g in conn.get_all_security_groups() if g.name in group_names] success = True # Delete individual rules in all groups before deleting groups to # remove dependencies between them for group in groups: - print "Deleting rules in security group " + group.name + print("Deleting rules in security group " + group.name) for rule in group.rules: for grant in rule.grants: success &= group.revoke(ip_protocol=rule.ip_protocol, @@ -1182,11 +1420,12 @@ def real_main(): time.sleep(30) # Yes, it does have to be this long :-( for group in groups: try: - conn.delete_security_group(group.name) - print "Deleted security group " + group.name + # It is needed to use group_id to make it work with VPC + conn.delete_security_group(group_id=group.id) + print("Deleted security group %s" % group.name) except boto.exception.EC2ResponseError: success = False - print "Failed to delete security group " + group.name + print("Failed to delete security group %s" % group.name) # Unfortunately, group.revoke() returns True even if a rule was not # deleted, so this needs to be rerun if something fails @@ -1196,18 +1435,21 @@ def real_main(): attempt += 1 if not success: - print "Failed to delete all security groups after 3 tries." - print "Try re-running in a few minutes." 
+ print("Failed to delete all security groups after 3 tries.") + print("Try re-running in a few minutes.") elif action == "login": (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - master = master_nodes[0].public_dns_name - print "Logging into master " + master + "..." - proxy_opt = [] - if opts.proxy_port is not None: - proxy_opt = ['-D', opts.proxy_port] - subprocess.check_call( - ssh_command(opts) + proxy_opt + ['-t', '-t', "%s@%s" % (opts.user, master)]) + if not master_nodes[0].public_dns_name and not opts.private_ips: + print("Master has no public DNS name. Maybe you meant to specify --private-ips?") + else: + master = get_dns_name(master_nodes[0], opts.private_ips) + print("Logging into master " + master + "...") + proxy_opt = [] + if opts.proxy_port is not None: + proxy_opt = ['-D', opts.proxy_port] + subprocess.check_call( + ssh_command(opts) + proxy_opt + ['-t', '-t', "%s@%s" % (opts.user, master)]) elif action == "reboot-slaves": response = raw_input( @@ -1217,15 +1459,18 @@ def real_main(): if response == "y": (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) - print "Rebooting slaves..." + print("Rebooting slaves...") for inst in slave_nodes: if inst.state not in ["shutting-down", "terminated"]: - print "Rebooting " + inst.id + print("Rebooting " + inst.id) inst.reboot() elif action == "get-master": (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - print master_nodes[0].public_dns_name + if not master_nodes[0].public_dns_name and not opts.private_ips: + print("Master has no public DNS name. Maybe you meant to specify --private-ips?") + else: + print(get_dns_name(master_nodes[0], opts.private_ips)) elif action == "stop": response = raw_input( @@ -1238,11 +1483,11 @@ def real_main(): if response == "y": (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) - print "Stopping master..." 
+ print("Stopping master...") for inst in master_nodes: if inst.state not in ["shutting-down", "terminated"]: inst.stop() - print "Stopping slaves..." + print("Stopping slaves...") for inst in slave_nodes: if inst.state not in ["shutting-down", "terminated"]: if inst.spot_instance_request_id: @@ -1252,11 +1497,11 @@ def real_main(): elif action == "start": (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - print "Starting slaves..." + print("Starting slaves...") for inst in slave_nodes: if inst.state not in ["shutting-down", "terminated"]: inst.start() - print "Starting master..." + print("Starting master...") for inst in master_nodes: if inst.state not in ["shutting-down", "terminated"]: inst.start() @@ -1266,18 +1511,29 @@ def real_main(): cluster_instances=(master_nodes + slave_nodes), cluster_state='ssh-ready' ) + + # Determine types of running instances + existing_master_type = master_nodes[0].instance_type + existing_slave_type = slave_nodes[0].instance_type + # Setting opts.master_instance_type to the empty string indicates we + # have the same instance type for the master and the slaves + if existing_master_type == existing_slave_type: + existing_master_type = "" + opts.master_instance_type = existing_master_type + opts.instance_type = existing_slave_type + setup_cluster(conn, master_nodes, slave_nodes, opts, False) else: - print >> stderr, "Invalid action: %s" % action + print("Invalid action: %s" % action, file=stderr) sys.exit(1) def main(): try: real_main() - except UsageError, e: - print >> stderr, "\nError:\n", e + except UsageError as e: + print("\nError:\n", e, file=stderr) sys.exit(1) From 83fcbcba62dcec3d50bb768135f8eae888467e49 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 13 Aug 2015 16:16:32 -0300 Subject: [PATCH 018/268] get_spark_ami fix --- tools/spark-ec2/spark_ec2.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tools/spark-ec2/spark_ec2.py 
b/tools/spark-ec2/spark_ec2.py index 8cc44d30..4fbf5bd8 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -459,21 +459,20 @@ def get_validate_spark_version(version, repo): def get_tachyon_version(spark_version): return SPARK_TACHYON_MAP.get(spark_version, "") - # Attempt to resolve an appropriate AMI given the architecture and region of the request. -def get_spark_ami(opts): - if opts.instance_type in EC2_INSTANCE_TYPES: - instance_type = EC2_INSTANCE_TYPES[opts.instance_type] +def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branch): + if instance_type in EC2_INSTANCE_TYPES: + instance_type = EC2_INSTANCE_TYPES[instance_type] else: instance_type = "pvm" - print("Don't recognize %s, assuming type is pvm" % opts.instance_type, file=stderr) + print("Don't recognize %s, assuming type is pvm" % instance_type, file=stderr) # URL prefix from which to fetch AMI information ami_prefix = "{r}/{b}/ami-list".format( - r=opts.spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), - b=opts.spark_ec2_git_branch) + r=spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), + b=spark_ec2_git_branch) - ami_path = "%s/%s/%s" % (ami_prefix, opts.region, instance_type) + ami_path = "%s/%s/%s" % (ami_prefix, region, instance_type) reader = codecs.getreader("ascii") try: ami = reader(urlopen(ami_path)).read().strip() @@ -484,7 +483,6 @@ def get_spark_ami(opts): print("Spark AMI: " + ami) return ami - # Launch a cluster of the given name, by setting up its security groups, # and then starting new instances in them. 
# Returns a tuple of EC2 reservation objects for the master and slaves @@ -584,10 +582,10 @@ def launch_cluster(conn, opts, cluster_name): # Figure out Spark AMI if opts.ami is None: - opts.ami = get_spark_ami(opts) + opts.ami = get_spark_ami(opts.instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) if opts.master_ami is None: - opts.master_ami = get_spark_ami(opts.master_instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) + opts.master_ami = get_spark_ami(opts.master_instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) # we use group ids to work around https://github.com/boto/boto/issues/350 additional_group_ids = [] From 807f0f616973d74a51998caadfc1bc1b17b7a306 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 14 Aug 2015 13:51:39 -0300 Subject: [PATCH 019/268] remove user data, spark-ec2 takes care on formatting disks --- tools/cluster.py | 3 --- tools/scripts/S05mount-disks | 11 ----------- 2 files changed, 14 deletions(-) delete mode 100644 tools/scripts/S05mount-disks diff --git a/tools/cluster.py b/tools/cluster.py index 3cf1828a..6ebd2386 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -51,7 +51,6 @@ default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' default_collect_results_dir = '/tmp' -default_user_data = os.path.join(script_path, 'scripts', 'S05mount-disks') default_defaults_filename = 'cluster_defaults.json' default_spark_ec2_git_repo = 'https://github.com/chaordic/spark-ec2' @@ -202,7 +201,6 @@ def launch(cluster_name, slaves, key_id=default_key_id, region=default_region, zone=default_zone, instance_type=default_instance_type, ondemand=False, spot_price=default_spot_price, - user_data=default_user_data, security_group = None, vpc = None, vpc_subnet = None, @@ -272,7 +270,6 @@ def launch(cluster_name, slaves, '--master-opts', '-Dspark.worker.timeout={0}'.format(worker_timeout), '--spark-git-repo', spark_repo, '-v', 
spark_version, - '--user-data', user_data, 'launch', cluster_name] + spot_params + resume_param + diff --git a/tools/scripts/S05mount-disks b/tools/scripts/S05mount-disks deleted file mode 100644 index 8f129a30..00000000 --- a/tools/scripts/S05mount-disks +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -echo 'Mounting disks' >> /tmp/mount-disks.log -mkdir -p /mnt -mkdir -p /mnt{2,3,4} -chmod -R 777 /mnt* -[ -r /dev/xvdb ] && mkfs.ext4 /dev/xvdb && mount /dev/xvdb /mnt -[ -r /dev/xvdc ] && mkfs.ext4 /dev/xvdc && mount /dev/xvdc /mnt2 -[ -r /dev/xvdd ] && mkfs.ext4 /dev/xvdd && mount /dev/xvdd /mnt3 -[ -r /dev/xvde ] && mkfs.ext4 /dev/xvde && mount /dev/xvde /mnt4 - From 637ab060de8b564d6b7a6021ef493b84152af350 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 14 Aug 2015 14:06:31 -0300 Subject: [PATCH 020/268] fix variable replacement --- .../spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh index 3570891b..4f3e8da8 100644 --- a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh +++ b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh @@ -25,8 +25,10 @@ export MAPRED_LOCAL_DIRS="{{mapred_local_dirs}}" export SPARK_LOCAL_DIRS="{{spark_local_dirs}}" export MODULES="{{modules}}" export SPARK_VERSION="{{spark_version}}" -export SHARK_VERSION="{{shark_version}}" +export TACHYON_VERSION="{{tachyon_version}}" export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" export SWAP_MB="{{swap}}" export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" export SPARK_MASTER_OPTS="{{spark_master_opts}}" +export AWS_ACCESS_KEY_ID="{{aws_access_key_id}}" +export AWS_SECRET_ACCESS_KEY="{{aws_secret_access_key}}" From f6d5d0dd7cebb0bc32a9c13f04959015f3e36427 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 14 Aug 2015 14:07:32 
-0300 Subject: [PATCH 021/268] remove rstudio and some fixes --- tools/spark-ec2/spark_ec2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 4fbf5bd8..f5bbaac1 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -357,15 +357,15 @@ def get_or_make_group(conn, name, vpc_id): return conn.create_security_group(name, "Spark EC2 group", vpc_id) def check_if_http_resource_exists(resource): - request = urllib2.Request(resource) + request = Request(resource) request.get_method = lambda: 'HEAD' try: - response = urllib2.urlopen(request) + response = urlopen(request) if response.getcode() == 200: return True else: raise RuntimeError("Resource {resource} not found. Error: {code}".format(resource, response.getcode())) - except urllib2.HTTPError, e: + except HTTPError, e: print >> stderr, "Unable to check if HTTP resource {url} exists. Error: {code}".format( url=resource, code=e.code) @@ -831,7 +831,7 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar) modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', - 'mapreduce', 'spark-standalone', 'tachyon', 'rstudio'] + 'mapreduce', 'spark-standalone', 'tachyon'] if opts.hadoop_major_version == "1": modules = list(filter(lambda x: x != "mapreduce", modules)) From 7787045de3c1ff132c17f341b3cdecae60ceade0 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 14 Aug 2015 14:49:14 -0300 Subject: [PATCH 022/268] update spark-ec2 version --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 6ebd2386..23b3bed9 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -54,7 +54,7 @@ default_defaults_filename = 'cluster_defaults.json' default_spark_ec2_git_repo = 'https://github.com/chaordic/spark-ec2' -default_spark_ec2_git_branch = 'v4-yarn' 
+default_spark_ec2_git_branch = 'branch-1.4-merge' master_post_create_commands = [ From ccfed3f661b0bae939dae704204767a3ef899ad1 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 17 Aug 2015 11:08:23 -0300 Subject: [PATCH 023/268] pr review, fix removed feature and added noop user-data --- tools/cluster.py | 3 +++ tools/scripts/noop | 1 + 2 files changed, 4 insertions(+) create mode 100644 tools/scripts/noop diff --git a/tools/cluster.py b/tools/cluster.py index 23b3bed9..d6a0263d 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -51,6 +51,7 @@ default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' default_collect_results_dir = '/tmp' +default_user_data = os.path.join(script_path, 'scripts', 'noop') default_defaults_filename = 'cluster_defaults.json' default_spark_ec2_git_repo = 'https://github.com/chaordic/spark-ec2' @@ -201,6 +202,7 @@ def launch(cluster_name, slaves, key_id=default_key_id, region=default_region, zone=default_zone, instance_type=default_instance_type, ondemand=False, spot_price=default_spot_price, + user_data=default_user_data, security_group = None, vpc = None, vpc_subnet = None, @@ -270,6 +272,7 @@ def launch(cluster_name, slaves, '--master-opts', '-Dspark.worker.timeout={0}'.format(worker_timeout), '--spark-git-repo', spark_repo, '-v', spark_version, + '--user-data', user_data, 'launch', cluster_name] + spot_params + resume_param + diff --git a/tools/scripts/noop b/tools/scripts/noop new file mode 100644 index 00000000..cc1f786e --- /dev/null +++ b/tools/scripts/noop @@ -0,0 +1 @@ +#!/bin/bash \ No newline at end of file From 9bbcd181723dcdb1d275a5a6040c0eda2c540569 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 18 Aug 2015 16:43:51 -0300 Subject: [PATCH 024/268] added heap size param for driver --- remote_hook.sh | 3 ++- tools/cluster.py | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 305a0ff6..65e71070 100755 --- 
a/remote_hook.sh +++ b/remote_hook.sh @@ -11,6 +11,7 @@ CONTROL_DIR="${5?Please give the Control Directory}" SPARK_MEM_PARAM="${6?Please give the Job Memory Size to use}" USE_YARN="${7?Please tell if we should use YARN (yes/no)}" NOTIFY_ON_ERRORS="${8?Please tell if we will notify on errors (yes/no)}" +DRIVER_HEAP_SIZE="${9?Please tell driver heap size to use in MB}" JOB_WITH_TAG=${JOB_NAME}.${JOB_TAG} JOB_CONTROL_DIR="${CONTROL_DIR}/${JOB_WITH_TAG}" @@ -80,7 +81,7 @@ if [[ "${JOB_NAME}" == "shell" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory 25000M --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}M" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" diff --git a/tools/cluster.py b/tools/cluster.py index d6a0263d..3ac89be8 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -39,6 +39,7 @@ default_spot_price = '0.10' default_worker_instances = '1' default_master_instance_type = 'm3.xlarge' +default_driver_heap_size = '25000' default_region = 'us-east-1' default_zone = default_region + 'b' default_key_id = 'ignition_key' 
@@ -372,7 +373,9 @@ def job_run(cluster_name, job_name, job_mem, disable_assembly_build=False, run_tests=False, kill_on_failure=False, - destroy_cluster=False, region=default_region): + destroy_cluster=False, + region=default_region, + driver_heap_size=default_driver_heap_size): utc_job_date_example = '2014-05-04T13:13:10Z' if utc_job_date and len(utc_job_date) != len(utc_job_date_example): @@ -394,9 +397,9 @@ def job_run(cluster_name, job_name, job_mem, job_tag = job_tag or job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') tmux_wait_command = ';(echo Press enter to keep the session open && /bin/bash -c "read -t 5" && sleep 7d)' if not detached else '' tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {tmux_wait_command}' >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, tmux_wait_command=tmux_wait_command) + aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, tmux_wait_command=tmux_wait_command) non_tmux_arg = ". /etc/profile; . 
~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param) + aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size) if not disable_assembly_build: From a2d5af977c37bd7e14fa6b304bf17d4ffd25e231 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 18 Aug 2015 17:03:21 -0300 Subject: [PATCH 025/268] parameterized memory unit --- remote_hook.sh | 4 ++-- tools/cluster.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 65e71070..48ba9735 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -11,7 +11,7 @@ CONTROL_DIR="${5?Please give the Control Directory}" SPARK_MEM_PARAM="${6?Please give the Job Memory Size to use}" USE_YARN="${7?Please tell if we should use YARN (yes/no)}" NOTIFY_ON_ERRORS="${8?Please tell if we will notify on errors (yes/no)}" -DRIVER_HEAP_SIZE="${9?Please tell driver heap size to use in MB}" +DRIVER_HEAP_SIZE="${9?Please tell driver heap size to use}" JOB_WITH_TAG=${JOB_NAME}.${JOB_TAG} JOB_CONTROL_DIR="${CONTROL_DIR}/${JOB_WITH_TAG}" @@ -81,7 +81,7 @@ if [[ "${JOB_NAME}" == "shell" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}M" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" 
--runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" diff --git a/tools/cluster.py b/tools/cluster.py index 3ac89be8..81dc9b2d 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -39,7 +39,7 @@ default_spot_price = '0.10' default_worker_instances = '1' default_master_instance_type = 'm3.xlarge' -default_driver_heap_size = '25000' +default_driver_heap_size = '25G' default_region = 'us-east-1' default_zone = default_region + 'b' default_key_id = 'ignition_key' From f165937d608288be7fd673301256f2f6e122bc2e Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 19 Aug 2015 16:12:21 -0300 Subject: [PATCH 026/268] fix default memory size to match default master instance type --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 81dc9b2d..f796f53c 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -39,7 +39,7 @@ default_spot_price = '0.10' default_worker_instances = '1' default_master_instance_type = 'm3.xlarge' -default_driver_heap_size = '25G' +default_driver_heap_size = '12G' default_region = 'us-east-1' default_zone = default_region + 'b' default_key_id = 'ignition_key' From 980a2784ccabcf435d2df575fcf9c650c820349c Mon Sep 17 00:00:00 2001 From: Allan 
Oliveira Date: Wed, 19 Aug 2015 17:39:17 -0300 Subject: [PATCH 027/268] Use the driver heap size param --- tools/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index f796f53c..1f6fdaa5 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -396,9 +396,9 @@ def job_run(cluster_name, job_name, job_mem, job_date = utc_job_date or datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') job_tag = job_tag or job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') tmux_wait_command = ';(echo Press enter to keep the session open && /bin/bash -c "read -t 5" && sleep 7d)' if not detached else '' - tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {tmux_wait_command}' >& /tmp/commandoutput".format( + tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} {tmux_wait_command}' >& /tmp/commandoutput".format( aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, tmux_wait_command=tmux_wait_command) - non_tmux_arg = ". /etc/profile; . ~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} >& /tmp/commandoutput".format( + non_tmux_arg = ". /etc/profile; . 
~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} >& /tmp/commandoutput".format( aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size) From c78c319a5e2c900888ddc512f4166ee3b5f553fc Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 26 Aug 2015 10:56:53 -0300 Subject: [PATCH 028/268] Update spark_ec2.py --- tools/spark-ec2/spark_ec2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index f5bbaac1..c81d794b 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -51,7 +51,7 @@ raw_input = input xrange = range -SPARK_EC2_VERSION = "1.4.0" +SPARK_EC2_VERSION = "1.4.1" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -71,6 +71,7 @@ "1.3.0", "1.3.1", "1.4.0", + "1.4.1, ]) SPARK_TACHYON_MAP = { @@ -84,6 +85,7 @@ "1.3.0": "0.5.0", "1.3.1": "0.5.0", "1.4.0": "0.6.4", + "1.4.1": "0.6.4", } DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION From a5379a0a4a54d2e35b83c2cb4c9b4a467b8091d5 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Wed, 26 Aug 2015 10:57:36 -0300 Subject: [PATCH 029/268] Update spark_ec2.py --- tools/spark-ec2/spark_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index c81d794b..5c6458f9 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -71,7 +71,7 @@ "1.3.0", "1.3.1", "1.4.0", - "1.4.1, + "1.4.1", ]) SPARK_TACHYON_MAP = { From 59ba13280fdc49f95bc1e5f3878c8384a5d3d865 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 27 Aug 2015 10:59:16 -0300 Subject: [PATCH 030/268] Use Spark 1.4.1 --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 7eb2bffe..476dd3bb 100644 --- a/build.sbt +++ b/build.sbt @@ -13,7 +13,7 @@ ideaExcludeFolders += ".idea_modules" // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.4.1" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") From 63e867a9de1a11f48a8a72906b38f693cccee52c Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 28 Aug 2015 16:38:44 -0300 Subject: [PATCH 031/268] Increase group to avoid slowdowns --- src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index a1090d20..baf80bc2 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -52,7 +52,7 @@ object SparkContextUtils { if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - val rdds = 
splittedPaths.grouped(50).map(pathGroup => f(pathGroup.mkString(","))) + val rdds = splittedPaths.grouped(5000).map(pathGroup => f(pathGroup.mkString(","))) new UnionRDD(sc, rdds.toList) } From f12dfdc9d2029941bd293f3aa3ba90c83bbd885a Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 31 Aug 2015 13:52:32 -0300 Subject: [PATCH 032/268] Updated core to ignore spark ec2 boto --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index cfe2c08a..bcf8c0f8 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,6 @@ project/plugins/project/ # Node node_modules + +# Spark-ec2 boto +tools/spark-ec2/lib From cae677fc26ef20ff46f22c098b2cf903db239e5c Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 31 Aug 2015 16:21:58 -0300 Subject: [PATCH 033/268] Make spark 1.4.1 the default --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 1f6fdaa5..e312d842 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -47,7 +47,7 @@ default_ami = None # will be decided based on spark-ec2 list default_master_ami = None default_env = 'dev' -default_spark_version = '1.3.0' +default_spark_version = '1.4.1' default_spark_repo = 'https://github.com/chaordic/spark' default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' From 14324a2f3a7456b6aae993d0b68f02aa9402924a Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 1 Sep 2015 15:44:27 -0300 Subject: [PATCH 034/268] Added IntBag --- .../ignition/core/utils/CollectionUtils.scala | 6 +++ .../scala/ignition/core/utils/IntBag.scala | 42 +++++++++++++++++++ .../core/utils/CollectionUtilsSpec.scala | 2 - .../ignition/core/utils/IntBagSpec.scala | 23 ++++++++++ 4 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 src/main/scala/ignition/core/utils/IntBag.scala create mode 100644 src/test/scala/ignition/core/utils/IntBagSpec.scala diff --git 
a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index 52828ca7..eea4755e 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -22,6 +22,12 @@ object CollectionUtils { } } + + + implicit class TraversableOnceLong(xs: TraversableOnce[Long]) { + def toBag(): IntBag = IntBag.from(xs) + } + implicit class TraversableLikeImprovements[A, Repr](xs: TraversableLike[A, Repr]) { def distinctBy[B, That](f: A => B)(implicit cbf: CanBuildFrom[Repr, A, That]) = { val builder = cbf(xs.repr) diff --git a/src/main/scala/ignition/core/utils/IntBag.scala b/src/main/scala/ignition/core/utils/IntBag.scala new file mode 100644 index 00000000..2a36da6e --- /dev/null +++ b/src/main/scala/ignition/core/utils/IntBag.scala @@ -0,0 +1,42 @@ +package ignition.core.utils + +object IntBag { + def from(numbers: TraversableOnce[Long]): IntBag = { + val histogram = scala.collection.mutable.HashMap.empty[Long, Long] + numbers.foreach(n => histogram += (n -> (histogram.getOrElse(n, 0L) + 1))) + new IntBag(histogram) + } + + val empty = from(Seq.empty) +} + +class IntBag(val histogram: collection.Map[Long, Long]) { + def ++(other: IntBag): IntBag = { + val newHistogram = scala.collection.mutable.HashMap.empty[Long, Long] + (histogram.keySet ++ other.histogram.keySet).foreach(k => newHistogram += (k -> (histogram.getOrElse(k, 0L) + other.histogram.getOrElse(k, 0L)))) + new IntBag(newHistogram) + } + + + def median: Option[Long] = { + if (histogram.nonEmpty) { + val total = histogram.values.sum + val half = total / 2 + val max = histogram.keys.max + + val accumulatedFrequency = (0L to max).scanLeft(0L) { case (sumFreq, k) => sumFreq + histogram.getOrElse(k, 0L) }.zipWithIndex + accumulatedFrequency.collectFirst { case (sum, k) if sum >= half => k } + } else { + None + } + } + + def avg: Option[Long] = { + if (histogram.nonEmpty) { + val sum = 
histogram.map { case (k, f) => k * f }.sum + val count = histogram.values.sum + Option(sum / count) + } else + None + } +} diff --git a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala index c19579ce..f01b8a34 100644 --- a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala @@ -33,6 +33,4 @@ class CollectionUtilsSpec extends FlatSpec with ShouldMatchers { } - - } diff --git a/src/test/scala/ignition/core/utils/IntBagSpec.scala b/src/test/scala/ignition/core/utils/IntBagSpec.scala new file mode 100644 index 00000000..b6694b12 --- /dev/null +++ b/src/test/scala/ignition/core/utils/IntBagSpec.scala @@ -0,0 +1,23 @@ +package ignition.core.utils + +import org.scalatest._ + +import scala.util.Random + +class IntBagSpec extends FlatSpec with ShouldMatchers { + + "IntBag" should "be built from sequence" in { + IntBag.from(Seq(1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4)).histogram shouldBe Map(1 -> 2, 2 -> 3, 3 -> 1, 4 -> 5) + } + + it should "calculate the median and average" in { + val size = 1000 + val numbers = (0 until 1000).map(_ => Random.nextInt(400).toLong).toList + val bag = IntBag.from(numbers) + + bag.avg.get shouldBe numbers.sum / size + + // TODO: the median is only approximate and it could be better, improve it + } + +} From 3cb2ef5d74282391674d1afea9be30f2eb1a5463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matheus=20Weber=20da=20Concei=C3=A7=C3=A3o?= Date: Tue, 1 Sep 2015 17:17:47 -0300 Subject: [PATCH 035/268] Adds an option to launch the cluster master as spot --- tools/cluster.py | 5 +- tools/spark-ec2/spark_ec2.py | 95 +++++++++++++++++++++++++++++------- 2 files changed, 82 insertions(+), 18 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index e312d842..ed348fbb 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -202,7 +202,7 @@ def launch(cluster_name, slaves, tag=[], key_id=default_key_id, 
region=default_region, zone=default_zone, instance_type=default_instance_type, - ondemand=False, spot_price=default_spot_price, + ondemand=False, spot_price=default_spot_price, master_spot=False, user_data=default_user_data, security_group = None, vpc = None, @@ -252,6 +252,8 @@ def launch(cluster_name, slaves, ]) spot_params = ['--spot-price', spot_price] if not ondemand else [] + master_spot_params = ['--master-spot'] if not ondemand and master_spot else [] + ami_params = ['--ami', ami] if ami else [] master_ami_params = ['--master-ami', master_ami] if master_ami else [] @@ -276,6 +278,7 @@ def launch(cluster_name, slaves, '--user-data', user_data, 'launch', cluster_name] + spot_params + + master_spot_params + resume_param + auth_params + ami_params + diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 5c6458f9..3583bf1d 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -260,6 +260,10 @@ def parse_args(): "--spot-price", metavar="PRICE", type="float", help="If specified, launch slaves as spot instances with the given " + "maximum price (in dollars)") + parser.add_option( + "--master-spot", action="store_true", default=False, + help="If specified, launch master as spot instance using the same " + + "bid and instance type of the slave ones") parser.add_option( "--ganglia", action="store_true", default=True, help="Setup Ganglia monitoring on cluster (default: %default). 
NOTE: " + @@ -729,26 +733,83 @@ def launch_cluster(conn, opts, cluster_name): master_nodes = existing_masters else: master_type = opts.master_instance_type - if master_type == "": + if master_type == "" or opts.master_spot: master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name - master_res = master_image.run( - key_name=opts.key_pair, - security_group_ids=[master_group.id] + additional_group_ids, - instance_type=master_type, - placement=opts.zone, - min_count=1, - max_count=1, - block_device_map=block_map, - subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content, - instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, - instance_profile_name=opts.instance_profile_name) - - master_nodes = master_res.instances - print("Launched master in %s, regid = %s" % (zone, master_res.id)) + if opts.master_spot: + # Launch spot master instance with the requested price + # Note: The spot_price*1.5 is present to ensure a higher bid price to + # the master spot instance, so the master instance will be the + # last one to be terminated in a spot market price increase + print("Requesting master as spot instance with price $%.3f" % + (opts.spot_price)) + master_req = conn.request_spot_instances( + price=(opts.spot_price * 1.5), + image_id=opts.master_ami, + placement=opts.zone, + count=1, + key_name=opts.key_pair, + security_group_ids=[master_group.id] + additional_group_ids, + instance_type=master_type, + block_device_map=block_map, + subnet_id=opts.subnet_id, + placement_group=opts.placement_group, + user_data=user_data_content, + instance_profile_name=opts.instance_profile_name) + my_master_req_id = [req.id for req in master_req] + + start_time = datetime.now() + print("Waiting for master spot instance to be granted... 
Request ID: %s " % my_master_req_id) + try: + while True: + time.sleep(10) + reqs = conn.get_all_spot_instance_requests(my_master_req_id) + active_instance_ids = filter(lambda req: req.state == "active", reqs) + invalid_states = ["capacity-not-available", "capacity-oversubscribed", "price-too-low"] + invalid = filter(lambda req: req.status.code in invalid_states, reqs) + if len(invalid) > 0: + raise Exception("Invalid state for spot request: %s - status: %s" % + (invalid[0].id, invalid[0].status.message)) + if len(active_instance_ids) == 1: + print("Master spot instance granted") + master_res = conn.get_all_reservations([r.instance_id for r in active_instance_ids]) + master_nodes = master_res[0].instances + break + else: + print("Master spot instance not granted yet, waiting longer") + + if (datetime.now() - start_time).seconds > opts.spot_timeout * 60: + raise Exception("Timed out while waiting for master spot instance") + except: + print("Error: %s" % sys.exc_info()[1]) + print("Canceling master spot instance requests") + conn.cancel_spot_instance_requests(my_master_req_id) + # Log a warning if any of these requests actually launched instances: + (master_nodes, slave_nodes) = get_existing_cluster( + conn, opts, cluster_name, die_on_error=False) + running = len(master_nodes) + len(slave_nodes) + if running: + print(("WARNING: %d instances are still running" % running), file=stderr) + sys.exit(0) + else: + # Launch ondemand instance + master_res = master_image.run( + key_name=opts.key_pair, + security_group_ids=[master_group.id] + additional_group_ids, + instance_type=master_type, + placement=opts.zone, + min_count=1, + max_count=1, + block_device_map=block_map, + subnet_id=opts.subnet_id, + placement_group=opts.placement_group, + user_data=user_data_content, + instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, + instance_profile_name=opts.instance_profile_name) + + master_nodes = master_res.instances + print("Launched master in %s, 
regid = %s" % (zone, master_res.id)) # This wait time corresponds to SPARK-4983 print("Waiting for AWS to propagate instance metadata...") From 38213b49e13492536ddafe6fe70408552014c52b Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Wed, 2 Sep 2015 16:30:17 -0300 Subject: [PATCH 036/268] Fix serialization --- src/main/scala/ignition/core/utils/IntBag.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/IntBag.scala b/src/main/scala/ignition/core/utils/IntBag.scala index 2a36da6e..a322f6f7 100644 --- a/src/main/scala/ignition/core/utils/IntBag.scala +++ b/src/main/scala/ignition/core/utils/IntBag.scala @@ -4,13 +4,13 @@ object IntBag { def from(numbers: TraversableOnce[Long]): IntBag = { val histogram = scala.collection.mutable.HashMap.empty[Long, Long] numbers.foreach(n => histogram += (n -> (histogram.getOrElse(n, 0L) + 1))) - new IntBag(histogram) + IntBag(histogram) } val empty = from(Seq.empty) } -class IntBag(val histogram: collection.Map[Long, Long]) { +case class IntBag(histogram: collection.Map[Long, Long]) { def ++(other: IntBag): IntBag = { val newHistogram = scala.collection.mutable.HashMap.empty[Long, Long] (histogram.keySet ++ other.histogram.keySet).foreach(k => newHistogram += (k -> (histogram.getOrElse(k, 0L) + other.histogram.getOrElse(k, 0L)))) From d668f40fd93f93a9409d97781e34ce9e1d0d8ea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Guilherme=20Fernandes=20Pereira?= Date: Fri, 4 Sep 2015 15:16:30 -0300 Subject: [PATCH 037/268] Date between helper --- src/main/scala/ignition/core/utils/DateUtils.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/scala/ignition/core/utils/DateUtils.scala b/src/main/scala/ignition/core/utils/DateUtils.scala index c3fb5163..8ebf3b13 100644 --- a/src/main/scala/ignition/core/utils/DateUtils.scala +++ b/src/main/scala/ignition/core/utils/DateUtils.scala @@ -20,6 +20,9 @@ object DateUtils { def isEqualOrBefore(other: DateTime) = 
dateTime.isBefore(other) || dateTime.saneEqual(other) + + def isBetween(start: DateTime, end: DateTime) = + dateTime.isAfter(start) && dateTime.isEqualOrBefore(end) } implicit class SecondsImprovements(val seconds: Seconds) { From b2a602556c699f19da79b4ec6bf442a0a777862a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matheus=20Weber=20da=20Concei=C3=A7=C3=A3o?= Date: Tue, 8 Sep 2015 15:29:16 -0300 Subject: [PATCH 038/268] Adds a TODO! --- tools/spark-ec2/spark_ec2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 3583bf1d..52c21c3f 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -759,6 +759,7 @@ def launch_cluster(conn, opts, cluster_name): instance_profile_name=opts.instance_profile_name) my_master_req_id = [req.id for req in master_req] + # TODO: refactor duplicated spot waiting code start_time = datetime.now() print("Waiting for master spot instance to be granted... Request ID: %s " % my_master_req_id) try: From 08ae1dd35b7540218c2744c259b5d4c9ee6ae9cf Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 9 Sep 2015 16:01:17 -0300 Subject: [PATCH 039/268] some kind of hack to parallel read and list files using spark cluster slaves --- .../core/jobs/utils/SparkContextUtils.scala | 112 +++++++++++++++++- 1 file changed, 107 insertions(+), 5 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index baf80bc2..f421b614 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,21 +1,26 @@ package ignition.core.jobs.utils -import java.util.Date - import ignition.core.utils.ByteUtils +import ignition.core.utils.CollectionUtils._ +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.LongWritable -import org.apache.spark.SparkContext +import 
org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.spark.{Partitioner, SparkContext} import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} import org.apache.spark.rdd.{UnionRDD, RDD} -import org.joda.time.{DateTimeZone, DateTime} +import org.joda.time.DateTime import ignition.core.utils.DateUtils._ +import scala.collection.mutable.ArrayBuffer +import scala.io.Source import scala.reflect.ClassTag import scala.util.Try - object SparkContextUtils { + case class Bucket(var size: Long, paths: ArrayBuffer[String]) + case class S3File(path: String, isDir: Boolean, size: Long) + implicit class SparkContextImprovements(sc: SparkContext) { private def getFileSystem(path: Path): FileSystem = { @@ -194,5 +199,102 @@ object SparkContextUtils { else objectHadoopFile(paths, minimumPaths) } + + def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long): RDD[String] = { + val s3Paths = parallelListFiles(paths) + val buckets = buildBuckets(s3Paths, maxBytesPerPartition) + val files = buckets.flatMap(_.paths) + + val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") + val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") + + val partitionedFiles = sc.parallelize(files).map(file => file -> ()).partitionBy(createPartitioner(buckets)) + + partitionedFiles.mapPartitions { files => + val conf = new Configuration() + conf.set("fs.s3n.awsAccessKeyId", s3Key) + conf.set("fs.s3n.awsSecretAccessKey", s3Secret) + val codecFactory = new CompressionCodecFactory(conf) + files.map { case (path, _) => path } flatMap { s3Path => + val fileSystem = FileSystem.get(new java.net.URI(s3Path), conf) + val path = new Path(s3Path) + val inputStream = Option(codecFactory.getCodec(path)) match { + case Some(compression) => compression.createInputStream(fileSystem.open(path)) + case None => fileSystem.open(path) + } + Source.fromInputStream(inputStream).getLines() + } + } + } + + private def createPartitioner(buckets: Seq[Bucket]): Partitioner = { 
+ val size = buckets.size + val partitions: Map[Any, Int] = buckets.zipWithIndex.flatMap { case (bucket, index) => bucket.paths.map(path => path -> index) }.toMap + new Partitioner { + override def numPartitions: Int = size + override def getPartition(key: Any): Int = partitions(key) + } + } + + private def buildBuckets(files: Seq[S3File], maxBytesPerPartition: Long): Seq[Bucket] = { + val buckets = ArrayBuffer.empty[Bucket] + files.distinctBy(_.path).foreach { file => + val size = file.size + val bucket = buckets.find(bucket => bucket.size + size < maxBytesPerPartition) match { + case Some(bucketFound) => bucketFound + case None => + val newBucket = Bucket(0, ArrayBuffer.empty) + buckets += newBucket + newBucket + } + bucket.size += size + bucket.paths += file.path + } + buckets + } + + def parallelListFiles(paths: Seq[String]): Seq[S3File] = { + val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") + val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") + + val remainingDirectories = new scala.collection.mutable.ArrayBuffer[S3File] + remainingDirectories ++= paths.map(S3File(_, isDir = true, 0)) + val allFiles = new scala.collection.mutable.ArrayBuffer[S3File] + + while (remainingDirectories.nonEmpty) { + val newDirs = sc.parallelize(remainingDirectories.map(_.path)) + val currentBatch = newDirs.flatMap { path => + val conf = new Configuration() + conf.set("fs.s3n.awsAccessKeyId", s3Key) + conf.set("fs.s3n.awsSecretAccessKey", s3Secret) + val fileSystem = FileSystem.get(new java.net.URI(path), conf) + try { + val hadoopPath = new Path(path) + if (fileSystem.isDirectory(hadoopPath)) { + val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } else if (fileSystem.isFile(hadoopPath)) { + val status = fileSystem.getFileStatus(hadoopPath) + Seq(S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } else { 
// Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } + } catch { + case e: java.io.FileNotFoundException => + println(s"File $path not found.") + e.printStackTrace() + Nil + } + }.collect() + val (dirs, files) = currentBatch.partition(_.isDir) + remainingDirectories.clear() + remainingDirectories ++= dirs + allFiles ++= files + } + + allFiles + } + } } From c56c0273b7b875329ff300d93af484f2beaf045f Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 14 Sep 2015 17:31:59 -0300 Subject: [PATCH 040/268] some of pr reivews --- .../core/jobs/utils/SparkContextUtils.scala | 122 +++++++++--------- 1 file changed, 62 insertions(+), 60 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index f421b614..03801857 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -18,7 +18,7 @@ import scala.util.Try object SparkContextUtils { - case class Bucket(var size: Long, paths: ArrayBuffer[String]) + case class S3FilePartition(var size: Long, paths: ArrayBuffer[String]) case class S3File(path: String, isDir: Boolean, size: Long) implicit class SparkContextImprovements(sc: SparkContext) { @@ -201,14 +201,13 @@ object SparkContextUtils { } def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long): RDD[String] = { - val s3Paths = parallelListFiles(paths) - val buckets = buildBuckets(s3Paths, maxBytesPerPartition) - val files = buckets.flatMap(_.paths) + val foundFiles = parallelListFiles(paths) + val files = foundFiles.map(_.path) val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") - val partitionedFiles = 
sc.parallelize(files).map(file => file -> ()).partitionBy(createPartitioner(buckets)) + val partitionedFiles = sc.parallelize(files).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition)) partitionedFiles.mapPartitions { files => val conf = new Configuration() @@ -222,78 +221,81 @@ object SparkContextUtils { case Some(compression) => compression.createInputStream(fileSystem.open(path)) case None => fileSystem.open(path) } - Source.fromInputStream(inputStream).getLines() + try { + Source.fromInputStream(inputStream).getLines().toList + } finally { + Try { inputStream.close() } + } } } } - private def createPartitioner(buckets: Seq[Bucket]): Partitioner = { - val size = buckets.size - val partitions: Map[Any, Int] = buckets.zipWithIndex.flatMap { case (bucket, index) => bucket.paths.map(path => path -> index) }.toMap - new Partitioner { - override def numPartitions: Int = size - override def getPartition(key: Any): Int = partitions(key) - } - } - - private def buildBuckets(files: Seq[S3File], maxBytesPerPartition: Long): Seq[Bucket] = { - val buckets = ArrayBuffer.empty[Bucket] + private def createPartitioner(files: Seq[S3File], maxBytesPerPartition: Long): Partitioner = { + val partitions = ArrayBuffer.empty[S3FilePartition] files.distinctBy(_.path).foreach { file => val size = file.size - val bucket = buckets.find(bucket => bucket.size + size < maxBytesPerPartition) match { - case Some(bucketFound) => bucketFound + val partition = partitions.find(bucket => bucket.size + size < maxBytesPerPartition) match { + case Some(partitionFound) => partitionFound case None => - val newBucket = Bucket(0, ArrayBuffer.empty) - buckets += newBucket - newBucket + val newPartition = S3FilePartition(0, ArrayBuffer.empty) + partitions += newPartition + newPartition } - bucket.size += size - bucket.paths += file.path + partition.size += size + partition.paths += file.path + } + + val indexedPartitions: Map[Any, Int] = partitions.zipWithIndex.flatMap { 
+ case (bucket, index) => bucket.paths.map(path => path -> index) + }.toMap + + new Partitioner { + override def numPartitions: Int = partitions.size + override def getPartition(key: Any): Int = indexedPartitions(key) } - buckets } def parallelListFiles(paths: Seq[String]): Seq[S3File] = { val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") - val remainingDirectories = new scala.collection.mutable.ArrayBuffer[S3File] - remainingDirectories ++= paths.map(S3File(_, isDir = true, 0)) - val allFiles = new scala.collection.mutable.ArrayBuffer[S3File] - - while (remainingDirectories.nonEmpty) { - val newDirs = sc.parallelize(remainingDirectories.map(_.path)) - val currentBatch = newDirs.flatMap { path => - val conf = new Configuration() - conf.set("fs.s3n.awsAccessKeyId", s3Key) - conf.set("fs.s3n.awsSecretAccessKey", s3Secret) - val fileSystem = FileSystem.get(new java.net.URI(path), conf) - try { - val hadoopPath = new Path(path) - if (fileSystem.isDirectory(hadoopPath)) { - val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } else if (fileSystem.isFile(hadoopPath)) { - val status = fileSystem.getFileStatus(hadoopPath) - Seq(S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } else { // Maybe is glob or not found - val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + val directories = paths.map(S3File(_, isDir = true, 0)) + + def innerListFiles(remainingDirectories: Seq[S3File]): Seq[S3File] = { + if (remainingDirectories.isEmpty) { + Nil + } else { + val newDirs = sc.parallelize(remainingDirectories.map(_.path)) + val currentBatch = newDirs.flatMap { path => + val conf = new Configuration() + conf.set("fs.s3n.awsAccessKeyId", 
s3Key) + conf.set("fs.s3n.awsSecretAccessKey", s3Secret) + val fileSystem = FileSystem.get(new java.net.URI(path), conf) + try { + val hadoopPath = new Path(path) + if (fileSystem.isDirectory(hadoopPath)) { + val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } else if (fileSystem.isFile(hadoopPath)) { + val status = fileSystem.getFileStatus(hadoopPath) + Seq(S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } else { + // Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } + } catch { + case e: java.io.FileNotFoundException => + println(s"File $path not found.") + Nil } - } catch { - case e: java.io.FileNotFoundException => - println(s"File $path not found.") - e.printStackTrace() - Nil - } - }.collect() - val (dirs, files) = currentBatch.partition(_.isDir) - remainingDirectories.clear() - remainingDirectories ++= dirs - allFiles ++= files - } + }.collect() - allFiles + val (dirs, files) = currentBatch.partition(_.isDir) + files ++ innerListFiles(dirs) + } + } + innerListFiles(directories) } } From 8ffee27d402a33bffd54fad2b11d4b092709d9f4 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 15 Sep 2015 14:26:20 -0300 Subject: [PATCH 041/268] pr review --- .../core/jobs/utils/SparkContextUtils.scala | 107 ++++++++---------- 1 file changed, 49 insertions(+), 58 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 03801857..3fdfea09 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,16 +1,17 @@ package ignition.core.jobs.utils import 
ignition.core.utils.ByteUtils -import ignition.core.utils.CollectionUtils._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.spark.broadcast.Broadcast import org.apache.spark.{Partitioner, SparkContext} import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} import org.apache.spark.rdd.{UnionRDD, RDD} import org.joda.time.DateTime import ignition.core.utils.DateUtils._ +import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer import scala.io.Source import scala.reflect.ClassTag @@ -18,8 +19,9 @@ import scala.util.Try object SparkContextUtils { - case class S3FilePartition(var size: Long, paths: ArrayBuffer[String]) - case class S3File(path: String, isDir: Boolean, size: Long) + case class HadoopFile(path: String, isDir: Boolean, size: Long) + + private case class HadoopFilePartition(size: Long, paths: Seq[String]) implicit class SparkContextImprovements(sc: SparkContext) { @@ -201,18 +203,13 @@ object SparkContextUtils { } def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long): RDD[String] = { - val foundFiles = parallelListFiles(paths) - val files = foundFiles.map(_.path) - - val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") - val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") + val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) - val partitionedFiles = sc.parallelize(files).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition)) + val foundFiles = parallelListFiles(paths) + val partitionedFiles = sc.parallelize(foundFiles.map(_.path)).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition)) partitionedFiles.mapPartitions { files => - val conf = new Configuration() - conf.set("fs.s3n.awsAccessKeyId", s3Key) - 
conf.set("fs.s3n.awsSecretAccessKey", s3Secret) + val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) files.map { case (path, _) => path } flatMap { s3Path => val fileSystem = FileSystem.get(new java.net.URI(s3Path), conf) @@ -222,7 +219,7 @@ object SparkContextUtils { case None => fileSystem.open(path) } try { - Source.fromInputStream(inputStream).getLines().toList + Source.fromInputStream(inputStream).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } finally { Try { inputStream.close() } } @@ -230,19 +227,15 @@ object SparkContextUtils { } } - private def createPartitioner(files: Seq[S3File], maxBytesPerPartition: Long): Partitioner = { - val partitions = ArrayBuffer.empty[S3FilePartition] - files.distinctBy(_.path).foreach { file => - val size = file.size - val partition = partitions.find(bucket => bucket.size + size < maxBytesPerPartition) match { - case Some(partitionFound) => partitionFound - case None => - val newPartition = S3FilePartition(0, ArrayBuffer.empty) - partitions += newPartition - newPartition - } - partition.size += size - partition.paths += file.path + private def createPartitioner(files: Seq[HadoopFile], maxBytesPerPartition: Long): Partitioner = { + val partitions = files.foldLeft(Seq.empty[HadoopFilePartition]) { + case (acc, file) => + acc.find(bucket => bucket.size + file.size < maxBytesPerPartition) match { + case Some(found) => + val updated = found.copy(size = found.size + file.size, paths = file.path +: found.paths) + acc.updated(acc.indexOf(found), updated) + case None => acc :+ HadoopFilePartition(file.size, Seq(file.path)) + } } val indexedPartitions: Map[Any, Int] = partitions.zipWithIndex.flatMap { @@ -255,43 +248,41 @@ object SparkContextUtils { } } - def parallelListFiles(paths: Seq[String]): Seq[S3File] = { - val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") - val s3Secret = 
sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") + private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): Seq[HadoopFile] = { + paths.flatMap { path => + val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } + val fileSystem = FileSystem.get(new java.net.URI(path), conf) + try { + val hadoopPath = new Path(path) + if (fileSystem.isDirectory(hadoopPath)) { + val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + } else if (fileSystem.isFile(hadoopPath)) { + val status = fileSystem.getFileStatus(hadoopPath) + Seq(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + } else { + // Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + } + } catch { + case e: java.io.FileNotFoundException => + println(s"File $path not found.") + Nil + } + }.collect().toSeq + } - val directories = paths.map(S3File(_, isDir = true, 0)) + def parallelListFiles(paths: Seq[String]): Seq[HadoopFile] = { + val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + val directories = paths.map(HadoopFile(_, isDir = true, 0)) - def innerListFiles(remainingDirectories: Seq[S3File]): Seq[S3File] = { + def innerListFiles(remainingDirectories: Seq[HadoopFile]): Seq[HadoopFile] = { if (remainingDirectories.isEmpty) { Nil } else { - val newDirs = sc.parallelize(remainingDirectories.map(_.path)) - val currentBatch = newDirs.flatMap { path => - val conf = new Configuration() - conf.set("fs.s3n.awsAccessKeyId", s3Key) - conf.set("fs.s3n.awsSecretAccessKey", s3Secret) - val fileSystem = FileSystem.get(new java.net.URI(path), conf) - try { - val 
hadoopPath = new Path(path) - if (fileSystem.isDirectory(hadoopPath)) { - val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } else if (fileSystem.isFile(hadoopPath)) { - val status = fileSystem.getFileStatus(hadoopPath) - Seq(S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } else { - // Maybe is glob or not found - val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } - } catch { - case e: java.io.FileNotFoundException => - println(s"File $path not found.") - Nil - } - }.collect() - - val (dirs, files) = currentBatch.partition(_.isDir) + val pathsRDD = sc.parallelize(remainingDirectories.map(_.path)) + val (dirs, files) = executeListOnWorkers(hadoopConf, pathsRDD).partition(_.isDir) files ++ innerListFiles(dirs) } } From 7234254729377453b7750166765554cb3eb22951 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 16 Sep 2015 10:14:09 -0300 Subject: [PATCH 042/268] logging input stream close failure --- .../core/jobs/utils/SparkContextUtils.scala | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 3fdfea09..2259d622 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -16,6 +16,7 @@ import scala.collection.mutable.ArrayBuffer import scala.io.Source import scala.reflect.ClassTag import scala.util.Try +import scala.util.control.NonFatal object SparkContextUtils { @@ -211,17 +212,23 @@ object SparkContextUtils { partitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, 
v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) - files.map { case (path, _) => path } flatMap { s3Path => - val fileSystem = FileSystem.get(new java.net.URI(s3Path), conf) - val path = new Path(s3Path) - val inputStream = Option(codecFactory.getCodec(path)) match { - case Some(compression) => compression.createInputStream(fileSystem.open(path)) - case None => fileSystem.open(path) + files.map { case (path, _) => path } flatMap { path => + val fileSystem = FileSystem.get(new java.net.URI(path), conf) + val hadoopPath = new Path(path) + val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { + case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) + case None => fileSystem.open(hadoopPath) } try { Source.fromInputStream(inputStream).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } finally { - Try { inputStream.close() } + try { + inputStream.close() + } catch { + case NonFatal(ex) => + println(s"Fail to close resource from '$path'") + ex.printStackTrace() + } } } } From af00eefa0975159ae760bbfec4444638b9862293 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 16 Sep 2015 10:34:33 -0300 Subject: [PATCH 043/268] better exception report --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 2259d622..0d7f0742 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -226,8 +226,7 @@ object SparkContextUtils { inputStream.close() } catch { case NonFatal(ex) => - println(s"Fail to close resource from '$path'") - ex.printStackTrace() + println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") } } } From 
89630eb7f990c36ae749cc8312e6bc199b8b15e7 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 17 Sep 2015 13:53:36 -0300 Subject: [PATCH 044/268] setting UTF-8 codec to read file content (same behavior of hadoop client) --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 0d7f0742..ab20c83b 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -13,7 +13,7 @@ import ignition.core.utils.DateUtils._ import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer -import scala.io.Source +import scala.io.{Codec, Source} import scala.reflect.ClassTag import scala.util.Try import scala.util.control.NonFatal @@ -220,7 +220,7 @@ object SparkContextUtils { case None => fileSystem.open(hadoopPath) } try { - Source.fromInputStream(inputStream).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) + Source.fromInputStream(inputStream)(Codec.UTF8).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } finally { try { inputStream.close() From 06ac774d980a47c05670760bd9c3d8725aabc45f Mon Sep 17 00:00:00 2001 From: Fernando Rodrigues Date: Mon, 21 Sep 2015 19:31:45 -0300 Subject: [PATCH 045/268] will delete SG's after cluster destroy --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index ed348fbb..3f4065e3 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -313,7 +313,7 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {} after failures'.format(cluster_name)) -def destroy(cluster_name, delete_groups=False, region=default_region): +def destroy(cluster_name, delete_groups=True, region=default_region): delete_sg_param = 
['--delete-groups'] if delete_groups else [] ec2_script_path = chdir_to_ec2_script_and_get_path() From 10b086e745f9d304df3e77af1215c34cf0c5b59c Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 15 Oct 2015 14:41:58 -0300 Subject: [PATCH 046/268] spark 1.5.1 update --- build.sbt | 2 +- tools/cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 476dd3bb..acdef9cb 100644 --- a/build.sbt +++ b/build.sbt @@ -13,7 +13,7 @@ ideaExcludeFolders += ".idea_modules" // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.4.1" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") diff --git a/tools/cluster.py b/tools/cluster.py index 3f4065e3..cd972951 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -47,7 +47,7 @@ default_ami = None # will be decided based on spark-ec2 list default_master_ami = None default_env = 'dev' -default_spark_version = '1.4.1' +default_spark_version = '1.5.1' default_spark_repo = 'https://github.com/chaordic/spark' default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' From a59f2eb92416915304edad9fd5f72f5c048e78a5 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 15 Oct 2015 15:34:38 -0300 Subject: [PATCH 047/268] fix spark_ec2.py --- tools/spark-ec2/spark_ec2.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 52c21c3f..89ade820 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -51,7 +51,7 @@ raw_input = input xrange = range -SPARK_EC2_VERSION = "1.4.1" +SPARK_EC2_VERSION = "1.5.1" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -72,6 +72,8 @@ "1.3.1", 
"1.4.0", "1.4.1", + "1.5.0", + "1.5.1", ]) SPARK_TACHYON_MAP = { @@ -86,6 +88,8 @@ "1.3.1": "0.5.0", "1.4.0": "0.6.4", "1.4.1": "0.6.4", + "1.5.0": "0.7.1", + "1.5.1": "0.7.1", } DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION From b176cc51a513ac6c1d8155ce98e6ea345f6d9abd Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 15 Oct 2015 17:03:58 -0300 Subject: [PATCH 048/268] Added executor instances option --- tools/cluster.py | 6 +++++- .../deploy.generic/root/spark-ec2/ec2-variables.sh | 1 + tools/spark-ec2/spark_ec2.py | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index cd972951..0af46ebe 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -38,6 +38,7 @@ default_instance_type = 'r3.xlarge' default_spot_price = '0.10' default_worker_instances = '1' +default_executor_instances = '1' default_master_instance_type = 'm3.xlarge' default_driver_heap_size = '12G' default_region = 'us-east-1' @@ -209,7 +210,9 @@ def launch(cluster_name, slaves, vpc_subnet = None, master_instance_type=default_master_instance_type, wait_time='180', hadoop_major_version='2', - worker_instances=default_worker_instances, retries_on_same_cluster=5, + worker_instances=default_worker_instances, + executor_instances=default_executor_instances, + retries_on_same_cluster=5, max_clusters_to_create=5, minimum_percentage_healthy_slaves=0.9, remote_user=default_remote_user, @@ -272,6 +275,7 @@ def launch(cluster_name, slaves, '--spark-ec2-git-repo', spark_ec2_git_repo, '--spark-ec2-git-branch', spark_ec2_git_branch, '--worker-instances', worker_instances, + '--executor-instances', executor_instances, '--master-opts', '-Dspark.worker.timeout={0}'.format(worker_timeout), '--spark-git-repo', spark_repo, '-v', spark_version, diff --git a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh index 4f3e8da8..bd3b656f 100644 --- 
a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh +++ b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh @@ -29,6 +29,7 @@ export TACHYON_VERSION="{{tachyon_version}}" export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" export SWAP_MB="{{swap}}" export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" +export SPARK_EXECUTOR_INSTANCES="{{spark_executor_instances}}" export SPARK_MASTER_OPTS="{{spark_master_opts}}" export AWS_ACCESS_KEY_ID="{{aws_access_key_id}}" export AWS_SECRET_ACCESS_KEY="{{aws_secret_access_key}}" diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 89ade820..e9442448 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -288,6 +288,10 @@ def parse_args(): "--worker-instances", type="int", default=1, help="Number of instances per worker: variable SPARK_WORKER_INSTANCES. Not used if YARN " + "is used as Hadoop major version (default: %default)") + parser.add_option( + "--executor-instances", type="int", default=1, + help="Number of executor instances per worker: variable SPARK_EXECUTOR_INSTANCES. 
Not used if YARN " + + "is used as Hadoop major version (default: %default)") parser.add_option( "--master-opts", type="string", default="", help="Extra options to give to master through SPARK_MASTER_OPTS variable " + @@ -1161,6 +1165,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes] slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes] worker_instances_str = "%d" % opts.worker_instances if opts.worker_instances else "" + executor_instances_str = "%d" % opts.executor_instances if opts.executor_instances else "" template_vars = { "master_list": '\n'.join(master_addresses), "active_master": active_master, @@ -1175,6 +1180,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): "tachyon_version": tachyon_v, "hadoop_major_version": opts.hadoop_major_version, "spark_worker_instances": worker_instances_str, + "spark_executor_instances": executor_instances_str, "spark_master_opts": opts.master_opts } From 437e2644463157d5df0328df63de8cb8d68bdef7 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 27 Oct 2015 16:50:59 -0200 Subject: [PATCH 049/268] Adding filterAndGetParallelTextFiles --- .../core/jobs/utils/SparkContextUtils.scala | 42 +++++++++++++++++-- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index ab20c83b..00cbb347 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -36,7 +36,7 @@ object SparkContextUtils { for { path <- paths status <- Option(fs.globStatus(path)).getOrElse(Array.empty).toSeq - if status.isDirectory || !removeEmpty || status.getLen > 0 // remove empty files if necessary + if !removeEmpty || status.getLen > 0 || status.isDirectory // remove empty 
files if necessary } yield status } @@ -69,6 +69,14 @@ object SparkContextUtils { processPaths((p) => sc.textFile(p), paths, minimumPaths) } + private def processParallelTextFiles(paths: Seq[String], minimumPaths: Int, maxBytesPerPartition: Long): RDD[String] = { + val splittedPaths = paths.flatMap(ignition.core.utils.HadoopUtils.getPathStrings) + if (splittedPaths.size < minimumPaths) + throw new Exception(s"Not enough paths found for $paths") + + parallelTextFiles(splittedPaths, maxBytesPerPartition) + } + private def filterPaths(paths: Seq[String], requireSuccess: Boolean, inclusiveStartDate: Boolean, @@ -145,6 +153,14 @@ object SparkContextUtils { processTextFiles(paths, minimumPaths) } + def getParallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long, synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { + if (synchLocally) + processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition) + else + processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition) + } + + @deprecated("It may incur in heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") def filterAndGetTextFiles(path: String, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, @@ -162,6 +178,24 @@ object SparkContextUtils { getTextFiles(paths, synchLocally, forceSynch, minimumPaths) } + def filterAndGetParallelTextFiles(path: String, + maxBytesPerPartition: Long = 64 * 1000 * 1000, + requireSuccess: Boolean = false, + inclusiveStartDate: Boolean = true, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = true, + endDate: Option[DateTime] = None, + lastN: Option[Int] = None, + synchLocally: Boolean = false, + forceSynch: Boolean = false, + ignoreMalformedDates: Boolean = false, + minimumPaths: Int = 1)(implicit dateExtractor: PathDateExtractor): RDD[String] = { + val paths = getFilteredPaths(Seq(path), 
requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) + if (paths.size < minimumPaths) + throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") + getParallelTextFiles(paths, maxBytesPerPartition, synchLocally, forceSynch, minimumPaths) + } + private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { processPaths((p) => sc.sequenceFile(p, classOf[LongWritable], classOf[org.apache.hadoop.io.BytesWritable]) .map({ case (k, v) => Try { ByteUtils.toString(v.getBytes, 0, v.getLength, "UTF-8") } }), paths, minimumPaths) @@ -260,11 +294,11 @@ object SparkContextUtils { val fileSystem = FileSystem.get(new java.net.URI(path), conf) try { val hadoopPath = new Path(path) - if (fileSystem.isDirectory(hadoopPath)) { + val status = fileSystem.getFileStatus(hadoopPath) + if (status.isDirectory) { val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) - } else if (fileSystem.isFile(hadoopPath)) { - val status = fileSystem.getFileStatus(hadoopPath) + } else if (status.isFile) { Seq(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) } else { // Maybe is glob or not found From 637b80d9995e646b5260605c049a76487a054d03 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 27 Oct 2015 19:57:00 -0200 Subject: [PATCH 050/268] Many improvements --- .../core/jobs/utils/SparkContextUtils.scala | 45 ++++++++++++------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 00cbb347..490cda2c 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ 
b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -13,6 +13,7 @@ import ignition.core.utils.DateUtils._ import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable import scala.io.{Codec, Source} import scala.reflect.ClassTag import scala.util.Try @@ -69,12 +70,12 @@ object SparkContextUtils { processPaths((p) => sc.textFile(p), paths, minimumPaths) } - private def processParallelTextFiles(paths: Seq[String], minimumPaths: Int, maxBytesPerPartition: Long): RDD[String] = { + private def processParallelTextFiles(paths: Seq[String], minimumPaths: Int, maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { val splittedPaths = paths.flatMap(ignition.core.utils.HadoopUtils.getPathStrings) if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelTextFiles(splittedPaths, maxBytesPerPartition) + parallelTextFiles(splittedPaths, maxBytesPerPartition, minPartitions) } private def filterPaths(paths: Seq[String], @@ -146,6 +147,7 @@ object SparkContextUtils { } + @deprecated("It may incur heavy S3 costs and/or be slow with small files, use getParallelTextFiles instead", "2015-10-27") def getTextFiles(paths: Seq[String], synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { if (synchLocally) processTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths) @@ -153,14 +155,17 @@ object SparkContextUtils { processTextFiles(paths, minimumPaths) } - def getParallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long, synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { + def getParallelTextFiles(paths: Seq[String], + maxBytesPerPartition: Long = 64 * 1000 * 1000, + minPartitions: Int = 500, + synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { if (synchLocally) - 
processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition) + processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, minPartitions) else - processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition) + processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition, minPartitions) } - @deprecated("It may incur in heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") + @deprecated("It may incur heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") def filterAndGetTextFiles(path: String, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, @@ -180,6 +185,7 @@ object SparkContextUtils { def filterAndGetParallelTextFiles(path: String, maxBytesPerPartition: Long = 64 * 1000 * 1000, + minPartitions: Int = 500, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, startDate: Option[DateTime] = None, @@ -193,7 +199,7 @@ object SparkContextUtils { val paths = getFilteredPaths(Seq(path), requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) if (paths.size < minimumPaths) throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") - getParallelTextFiles(paths, maxBytesPerPartition, synchLocally, forceSynch, minimumPaths) + getParallelTextFiles(paths, maxBytesPerPartition, minPartitions, synchLocally, forceSynch, minimumPaths) } private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { @@ -237,17 +243,18 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long): RDD[String] = { + def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long, minPartitions: 
Int): RDD[String] = { + require(paths.nonEmpty, "At least one path is required") val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) val foundFiles = parallelListFiles(paths) - val partitionedFiles = sc.parallelize(foundFiles.map(_.path)).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition)) + val partitionedFiles = sc.parallelize(foundFiles.map(_.path)).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition, minPartitions)) partitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) + val fileSystem = FileSystem.get(new java.net.URI(paths.head), conf) files.map { case (path, _) => path } flatMap { path => - val fileSystem = FileSystem.get(new java.net.URI(path), conf) val hadoopPath = new Path(path) val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) @@ -267,16 +274,22 @@ object SparkContextUtils { } } - private def createPartitioner(files: Seq[HadoopFile], maxBytesPerPartition: Long): Partitioner = { - val partitions = files.foldLeft(Seq.empty[HadoopFilePartition]) { + private def createPartitioner(files: Seq[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long): Partitioner = { + implicit val ordering: Ordering[HadoopFilePartition] = Ordering.by(p => -p.size) // Small partitions come first (highest priority) + + val pq: mutable.PriorityQueue[HadoopFilePartition] = mutable.PriorityQueue.empty + + (0L until minPartitions).foreach(_ => pq += HadoopFilePartition(0, Seq.empty)) + + val partitions = files.foldLeft(pq) { case (acc, file) => - acc.find(bucket => bucket.size + file.size < maxBytesPerPartition) match { + acc.headOption.filter(bucket => bucket.size + file.size < 
maxBytesPerPartition) match { case Some(found) => val updated = found.copy(size = found.size + file.size, paths = file.path +: found.paths) - acc.updated(acc.indexOf(found), updated) - case None => acc :+ HadoopFilePartition(file.size, Seq(file.path)) + acc.tail += updated + case None => acc += HadoopFilePartition(file.size, Seq(file.path)) } - } + }.filter(_.size > 0).toList // Remove empty partitions val indexedPartitions: Map[Any, Int] = partitions.zipWithIndex.flatMap { case (bucket, index) => bucket.paths.map(path => path -> index) From 0563fab5fa7bbf900436a0bb95b303f62a22bf12 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Wed, 28 Oct 2015 15:53:45 -0200 Subject: [PATCH 051/268] Small improvements --- .../ignition/core/jobs/utils/RDDUtils.scala | 4 ++ .../core/jobs/utils/SparkContextUtils.scala | 37 ++++++++++--------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index 7e75d5ec..57069bae 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -80,6 +80,10 @@ object RDDUtils { }, preservesPartitioning = true) } + def collectValues[U: ClassTag](f: PartialFunction[V, U]): RDD[(K, U)] = { + rdd.filter { case (k, v) => f.isDefinedAt(v) }.mapValues(f) + } + def groupByKeyAndTake(n: Int): RDD[(K, List[V])] = rdd.aggregateByKey(List.empty[V])( (lst, v) => diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 490cda2c..8e4ec35c 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -75,7 +75,7 @@ object SparkContextUtils { if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelTextFiles(splittedPaths, maxBytesPerPartition, 
minPartitions) + parallelTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions) } private def filterPaths(paths: Seq[String], @@ -243,7 +243,7 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { + def parallelTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { require(paths.nonEmpty, "At least one path is required") val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) @@ -274,7 +274,7 @@ object SparkContextUtils { } } - private def createPartitioner(files: Seq[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long): Partitioner = { + private def createPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long): Partitioner = { implicit val ordering: Ordering[HadoopFilePartition] = Ordering.by(p => -p.size) // Small partitions come first (highest priority) val pq: mutable.PriorityQueue[HadoopFilePartition] = mutable.PriorityQueue.empty @@ -289,7 +289,7 @@ object SparkContextUtils { acc.tail += updated case None => acc += HadoopFilePartition(file.size, Seq(file.path)) } - }.filter(_.size > 0).toList // Remove empty partitions + }.filter(_.paths.nonEmpty).toList // Remove empty partitions val indexedPartitions: Map[Any, Int] = partitions.zipWithIndex.flatMap { case (bucket, index) => bucket.paths.map(path => path -> index) @@ -301,36 +301,39 @@ object SparkContextUtils { } } - private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): Seq[HadoopFile] = { + private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): List[HadoopFile] = { paths.flatMap { path => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val fileSystem = FileSystem.get(new 
java.net.URI(path), conf) - try { - val hadoopPath = new Path(path) + val hadoopPath = new Path(path) + val tryFind = try { val status = fileSystem.getFileStatus(hadoopPath) if (status.isDirectory) { val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + Option(sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList) } else if (status.isFile) { - Seq(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + Option(List(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen))) } else { - // Maybe is glob or not found - val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + None } } catch { case e: java.io.FileNotFoundException => - println(s"File $path not found.") - Nil + None + } + + tryFind.getOrElse { + // Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList } - }.collect().toSeq + }.collect().toList } - def parallelListFiles(paths: Seq[String]): Seq[HadoopFile] = { + def parallelListFiles(paths: List[String]): List[HadoopFile] = { val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) val directories = paths.map(HadoopFile(_, isDir = true, 0)) - def innerListFiles(remainingDirectories: Seq[HadoopFile]): Seq[HadoopFile] = { + def innerListFiles(remainingDirectories: List[HadoopFile]): List[HadoopFile] = { if (remainingDirectories.isEmpty) { Nil } else { From cc4f716f7485e6ca314318006fb3fb943937e3e1 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Wed, 28 Oct 2015 17:29:44 -0200 Subject: [PATCH 052/268] Fix file 
system issues in corner cases --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 8e4ec35c..6e4d0bc8 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -253,9 +253,9 @@ object SparkContextUtils { partitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) - val fileSystem = FileSystem.get(new java.net.URI(paths.head), conf) files.map { case (path, _) => path } flatMap { path => val hadoopPath = new Path(path) + val fileSystem = hadoopPath.getFileSystem(conf) val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) case None => fileSystem.open(hadoopPath) @@ -304,8 +304,8 @@ object SparkContextUtils { private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): List[HadoopFile] = { paths.flatMap { path => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } - val fileSystem = FileSystem.get(new java.net.URI(path), conf) val hadoopPath = new Path(path) + val fileSystem = hadoopPath.getFileSystem(conf) val tryFind = try { val status = fileSystem.getFileStatus(hadoopPath) if (status.isDirectory) { From 5a4916489e43d7d3e67818e83969a17c25e6aecb Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 3 Nov 2015 09:09:23 -0200 Subject: [PATCH 053/268] Make it faster in some situations --- .../core/jobs/utils/SparkContextUtils.scala | 90 +++++++++++++++---- 1 file changed, 74 insertions(+), 16 deletions(-) diff --git 
a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 6e4d0bc8..842ced37 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -27,6 +27,8 @@ object SparkContextUtils { implicit class SparkContextImprovements(sc: SparkContext) { + lazy val _hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + private def getFileSystem(path: Path): FileSystem = { path.getFileSystem(sc.hadoopConfiguration) } @@ -70,12 +72,16 @@ object SparkContextUtils { processPaths((p) => sc.textFile(p), paths, minimumPaths) } - private def processParallelTextFiles(paths: Seq[String], minimumPaths: Int, maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { + private def processParallelTextFiles(paths: Seq[String], + minimumPaths: Int, + maxBytesPerPartition: Long, + minPartitions: Int, + listOnWorkers: Boolean): RDD[String] = { val splittedPaths = paths.flatMap(ignition.core.utils.HadoopUtils.getPathStrings) if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions) + parallelTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) } private def filterPaths(paths: Seq[String], @@ -158,11 +164,11 @@ object SparkContextUtils { def getParallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long = 64 * 1000 * 1000, minPartitions: Int = 500, - synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { + synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1, listOnWorkers: Boolean = false): RDD[String] = { if (synchLocally) - processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, 
minPartitions) + processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) else - processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition, minPartitions) + processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) } @deprecated("It may incur heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") @@ -195,11 +201,12 @@ object SparkContextUtils { synchLocally: Boolean = false, forceSynch: Boolean = false, ignoreMalformedDates: Boolean = false, - minimumPaths: Int = 1)(implicit dateExtractor: PathDateExtractor): RDD[String] = { + minimumPaths: Int = 1, + listOnWorkers: Boolean = false)(implicit dateExtractor: PathDateExtractor): RDD[String] = { val paths = getFilteredPaths(Seq(path), requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) if (paths.size < minimumPaths) throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") - getParallelTextFiles(paths, maxBytesPerPartition, minPartitions, synchLocally, forceSynch, minimumPaths) + getParallelTextFiles(paths, maxBytesPerPartition, minPartitions, synchLocally, forceSynch, minimumPaths, listOnWorkers) } private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { @@ -243,13 +250,12 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - def parallelTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { - require(paths.nonEmpty, "At least one path is required") - val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + def parallelTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int, listOnWorkers: Boolean): 
RDD[String] = { - val foundFiles = parallelListFiles(paths) - val partitionedFiles = sc.parallelize(foundFiles.map(_.path)).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition, minPartitions)) + val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) + val partitionedFiles = sc.parallelize(foundFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition, minPartitions)) + val hadoopConf = _hadoopConf partitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) @@ -262,6 +268,10 @@ object SparkContextUtils { } try { Source.fromInputStream(inputStream)(Codec.UTF8).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) + } catch { + case NonFatal(ex) => + println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") + throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") } finally { try { inputStream.close() @@ -301,7 +311,9 @@ object SparkContextUtils { } } - private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): List[HadoopFile] = { + + private def executeListOnWorkers(paths: RDD[String]): List[HadoopFile] = { + val hadoopConf = _hadoopConf paths.flatMap { path => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val hadoopPath = new Path(path) @@ -329,16 +341,62 @@ object SparkContextUtils { }.collect().toList } + def parallelListFiles(paths: List[String]): List[HadoopFile] = { - val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + + val directories = paths.map(HadoopFile(_, isDir = true, 0)) + + def innerListFiles(remainingDirectories: 
List[HadoopFile]): List[HadoopFile] = { + if (remainingDirectories.isEmpty) { + Nil + } else { + val remainingPaths = remainingDirectories.map(_.path) + val pathsRDD = sc.parallelize(remainingPaths, remainingPaths.size / 2) + val (dirs, files) = executeListOnWorkers(pathsRDD).partition(_.isDir) + files ++ innerListFiles(dirs) + } + } + innerListFiles(directories) + } + + + private def executeDriverList(paths: Seq[String]): List[HadoopFile] = { + val conf = _hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } + paths.flatMap { path => + val hadoopPath = new Path(path) + val fileSystem = hadoopPath.getFileSystem(conf) + val tryFind = try { + val status = fileSystem.getFileStatus(hadoopPath) + if (status.isDirectory) { + val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) + Option(sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList) + } else if (status.isFile) { + Option(List(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen))) + } else { + None + } + } catch { + case e: java.io.FileNotFoundException => + None + } + + tryFind.getOrElse { + // Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList + } + }.toList + } + + def driverListFiles(paths: List[String]): List[HadoopFile] = { + val directories = paths.map(HadoopFile(_, isDir = true, 0)) def innerListFiles(remainingDirectories: List[HadoopFile]): List[HadoopFile] = { if (remainingDirectories.isEmpty) { Nil } else { - val pathsRDD = sc.parallelize(remainingDirectories.map(_.path)) - val (dirs, files) = executeListOnWorkers(hadoopConf, pathsRDD).partition(_.isDir) + val (dirs, files) = executeDriverList(remainingDirectories.map(_.path)).partition(_.isDir) files ++ innerListFiles(dirs) } } From 
506bd1c72affb05c4ebfc001440cfe178e1d30ba Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 9 Nov 2015 13:47:19 -0200 Subject: [PATCH 054/268] Split gzip files and other improvements --- build.sbt | 2 + .../ignition/core/jobs/CoreJobRunner.scala | 2 + .../core/jobs/utils/SparkContextUtils.scala | 93 +++++++++++++++---- tools/cluster.py | 5 + 4 files changed, 85 insertions(+), 17 deletions(-) diff --git a/build.sbt b/build.sbt index acdef9cb..528d30cf 100644 --- a/build.sbt +++ b/build.sbt @@ -19,6 +19,8 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") +libraryDependencies += "nl.basjes.hadoop" % "splittablegzip" % "1.2" + libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.2.4" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index aa4dcc76..ec5d9039 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -67,6 +67,8 @@ object CoreJobRunner { val sparkConf = new SparkConf() sparkConf.set("spark.executor.memory", config.executorMemory) + sparkConf.set("spark.hadoop.io.compression.codecs", + "org.apache.hadoop.io.compress.DefaultCodec,nl.basjes.hadoop.io.compress.SplittableGzipCodec,org.apache.hadoop.io.compress.BZip2Codec") sparkConf.setMaster(config.master) sparkConf.setAppName(appName) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 842ced37..78b6ec9b 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -2,8 +2,9 @@ package ignition.core.jobs.utils import ignition.core.utils.ByteUtils import 
org.apache.hadoop.conf.Configuration -import org.apache.hadoop.io.LongWritable +import org.apache.hadoop.io.{Text, LongWritable} import org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.broadcast.Broadcast import org.apache.spark.{Partitioner, SparkContext} import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} @@ -21,6 +22,10 @@ import scala.util.control.NonFatal object SparkContextUtils { + case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { + override def getPartition(key: Any): Int = index(key) + } + case class HadoopFile(path: String, isDir: Boolean, size: Long) private case class HadoopFilePartition(size: Long, paths: Seq[String]) @@ -162,8 +167,8 @@ object SparkContextUtils { } def getParallelTextFiles(paths: Seq[String], - maxBytesPerPartition: Long = 64 * 1000 * 1000, - minPartitions: Int = 500, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1, listOnWorkers: Boolean = false): RDD[String] = { if (synchLocally) processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) @@ -190,8 +195,8 @@ object SparkContextUtils { } def filterAndGetParallelTextFiles(path: String, - maxBytesPerPartition: Long = 64 * 1000 * 1000, - minPartitions: Int = 500, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, startDate: Option[DateTime] = None, @@ -250,13 +255,27 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - def parallelTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int, listOnWorkers: Boolean): RDD[String] = { + case class SizeBasedFileHandling(averageEstimatedCompressionRatio: Int = 8, + 
compressedExtensions: Set[String] = Set(".gz")) { + + def isBig(f: HadoopFile, uncompressedBigSize: Long): Boolean = estimatedSize(f) >= uncompressedBigSize + + def estimatedSize(f: HadoopFile) = if (isCompressed(f)) + f.size * averageEstimatedCompressionRatio + else + f.size + + def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) + } - val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) - val partitionedFiles = sc.parallelize(foundFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition, minPartitions)) + def readSmallFiles(smallFiles: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf - partitionedFiles.mapPartitions { files => + smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) files.map { case (path, _) => path } flatMap { path => @@ -284,7 +303,48 @@ object SparkContextUtils { } } - private def createPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long): Partitioner = { + def readBigFiles(bigFiles: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { + def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq("mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) + .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } + + def read(file: HadoopFile, conf: Configuration) = 
sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](conf = conf, fClass = classOf[TextInputFormat], + kClass = classOf[LongWritable], vClass = classOf[Text], path = file.path).map(pair => pair._2.toString) + + val confCompressed = confWith(maxBytesPerPartition / sizeBasedFileHandling.averageEstimatedCompressionRatio) + val confUncompressed = confWith(maxBytesPerPartition) + + val union = new UnionRDD(sc, bigFiles.map { file => + val conf = if (sizeBasedFileHandling.isCompressed(file)) + confCompressed + else + confUncompressed + read(file, conf) + }) + + if (union.partitions.size < minPartitions) + union.coalesce(minPartitions) + else + union + } + + def parallelTextFiles(paths: List[String], + maxBytesPerPartition: Long, + minPartitions: Int, + listOnWorkers: Boolean, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + + val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) + val (bigFiles, smallFiles) = foundFiles.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) + + sc.union( + readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), + readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + } + + private def createPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long, sizeBasedFileHandling: SizeBasedFileHandling): Partitioner = { implicit val ordering: Ordering[HadoopFilePartition] = Ordering.by(p => -p.size) // Small partitions come first (highest priority) val pq: mutable.PriorityQueue[HadoopFilePartition] = mutable.PriorityQueue.empty @@ -293,11 +353,13 @@ object SparkContextUtils { val partitions = files.foldLeft(pq) { case (acc, file) => - acc.headOption.filter(bucket => bucket.size + file.size < maxBytesPerPartition) match { + val fileSize = sizeBasedFileHandling.estimatedSize(file) + + acc.headOption.filter(bucket => bucket.size + fileSize < 
maxBytesPerPartition) match { case Some(found) => - val updated = found.copy(size = found.size + file.size, paths = file.path +: found.paths) + val updated = found.copy(size = found.size + fileSize, paths = file.path +: found.paths) acc.tail += updated - case None => acc += HadoopFilePartition(file.size, Seq(file.path)) + case None => acc += HadoopFilePartition(fileSize, Seq(file.path)) } }.filter(_.paths.nonEmpty).toList // Remove empty partitions @@ -305,10 +367,7 @@ object SparkContextUtils { case (bucket, index) => bucket.paths.map(path => path -> index) }.toMap - new Partitioner { - override def numPartitions: Int = partitions.size - override def getPartition(key: Any): Int = indexedPartitions(key) - } + IndexedPartitioner(partitions.size, indexedPartitions) } diff --git a/tools/cluster.py b/tools/cluster.py index 0af46ebe..7daf9617 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,6 +49,9 @@ default_master_ami = None default_env = 'dev' default_spark_version = '1.5.1' +custom_builds = { + '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' +} default_spark_repo = 'https://github.com/chaordic/spark' default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' @@ -260,6 +263,8 @@ def launch(cluster_name, slaves, ami_params = ['--ami', ami] if ami else [] master_ami_params = ['--master-ami', master_ami] if master_ami else [] + spark_version = custom_builds.get(spark_version, spark_version) + for i in range(retries_on_same_cluster): log.info('Running script, try %d of %d', i + 1, retries_on_same_cluster) try: From dc12d2a696e5cf847360097f86b33588b8b4cf84 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 10 Nov 2015 17:06:14 -0200 Subject: [PATCH 055/268] Use SplittableGzipCodec only for big files --- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 2 -- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git 
a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index ec5d9039..aa4dcc76 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -67,8 +67,6 @@ object CoreJobRunner { val sparkConf = new SparkConf() sparkConf.set("spark.executor.memory", config.executorMemory) - sparkConf.set("spark.hadoop.io.compression.codecs", - "org.apache.hadoop.io.compress.DefaultCodec,nl.basjes.hadoop.io.compress.SplittableGzipCodec,org.apache.hadoop.io.compress.BZip2Codec") sparkConf.setMaster(config.master) sparkConf.setAppName(appName) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 78b6ec9b..d18d5f76 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -307,7 +307,9 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq("mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) + def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq( + "io.compression.codecs" -> "org.apache.hadoop.io.compress.DefaultCodec,nl.basjes.hadoop.io.compress.SplittableGzipCodec,org.apache.hadoop.io.compress.BZip2Codec", + "mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } def read(file: HadoopFile, conf: Configuration) = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](conf = conf, fClass = classOf[TextInputFormat], @@ -317,10 +319,12 @@ object SparkContextUtils { val confUncompressed = confWith(maxBytesPerPartition) val union = new UnionRDD(sc, bigFiles.map { file => + val conf 
= if (sizeBasedFileHandling.isCompressed(file)) confCompressed else confUncompressed + read(file, conf) }) From b52eceea89e95ba3416ba1f8c13d77249129e9d6 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Wed, 11 Nov 2015 16:39:09 -0200 Subject: [PATCH 056/268] Dont use build with updated hadoop client --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 7daf9617..2fe6b245 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -50,7 +50,7 @@ default_env = 'dev' default_spark_version = '1.5.1' custom_builds = { - '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' +# '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' } default_spark_repo = 'https://github.com/chaordic/spark' default_remote_user = 'ec2-user' From f1075e8d5acd6bc9b03d8974c32adf3d146e2d9b Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 17 Nov 2015 17:27:00 -0200 Subject: [PATCH 057/268] s3 list --- build.sbt | 2 + .../core/jobs/utils/SparkContextUtils.scala | 49 +++++++--- .../scala/ignition/core/utils/S3Utils.scala | 91 +++++++++++++++++++ 3 files changed, 131 insertions(+), 11 deletions(-) create mode 100644 src/main/scala/ignition/core/utils/S3Utils.scala diff --git a/build.sbt b/build.sbt index 528d30cf..7231704b 100644 --- a/build.sbt +++ b/build.sbt @@ -35,6 +35,8 @@ libraryDependencies += "joda-time" % "joda-time" % "2.7" libraryDependencies += "org.joda" % "joda-convert" % "1.7" +libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.9.6" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index d18d5f76..96f2341d 100644 --- 
a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,16 +1,18 @@ package ignition.core.jobs.utils +import com.amazonaws.services.s3.AmazonS3Client +import com.amazonaws.services.s3.model.{S3ObjectSummary, S3Object} import ignition.core.utils.ByteUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.{Text, LongWritable} import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.hadoop.mapreduce.lib.input.TextInputFormat -import org.apache.spark.broadcast.Broadcast import org.apache.spark.{Partitioner, SparkContext} import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} import org.apache.spark.rdd.{UnionRDD, RDD} import org.joda.time.DateTime import ignition.core.utils.DateUtils._ +import ignition.core.utils.S3Utils._ import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer @@ -26,6 +28,11 @@ object SparkContextUtils { override def getPartition(key: Any): Int = index(key) } + implicit class S3ObjectSummaryExtensions(s3Object: S3ObjectSummary) { + def toHadoopFile: HadoopFile = + HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) + } + case class HadoopFile(path: String, isDir: Boolean, size: Long) private case class HadoopFilePartition(size: Long, paths: Seq[String]) @@ -86,7 +93,7 @@ object SparkContextUtils { if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) + parallelListEndReadTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) } private def filterPaths(paths: Seq[String], @@ -257,14 +264,14 @@ object SparkContextUtils { case class SizeBasedFileHandling(averageEstimatedCompressionRatio: Int = 8, compressedExtensions: Set[String] = Set(".gz")) { - + def isBig(f: HadoopFile, 
uncompressedBigSize: Long): Boolean = estimatedSize(f) >= uncompressedBigSize - + def estimatedSize(f: HadoopFile) = if (isCompressed(f)) f.size * averageEstimatedCompressionRatio else f.size - + def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) } @@ -334,15 +341,21 @@ object SparkContextUtils { union } - def parallelTextFiles(paths: List[String], - maxBytesPerPartition: Long, - minPartitions: Int, - listOnWorkers: Boolean, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + def parallelListEndReadTextFiles(paths: List[String], + maxBytesPerPartition: Long, + minPartitions: Int, + listOnWorkers: Boolean, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) - val (bigFiles, smallFiles) = foundFiles.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) + parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) + } + def parallelReadTextFiles(files: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + val (bigFiles, smallFiles) = files.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) sc.union( readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) @@ -466,5 +479,19 @@ object SparkContextUtils { innerListFiles(directories) } + def s3FilterAndGetParallelTextFiles(bucket: String, + prefix: String, + startDate: Option[DateTime] = None, + endDate: Option[DateTime] = None, + endsWith: Option[String] = None, + predicate: S3ObjectSummary => Boolean = _ => true, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, + 
sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) + (implicit s3Client: AmazonS3Client, dateExtractor: PathDateExtractor): RDD[String] = { + val foundFiles = s3ListAndFilterFiles(bucket, prefix, startDate, endDate, predicate = predicate)(s3Client, dateExtractor).map(_.toHadoopFile) + parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) + } + } } diff --git a/src/main/scala/ignition/core/utils/S3Utils.scala b/src/main/scala/ignition/core/utils/S3Utils.scala new file mode 100644 index 00000000..28866c4c --- /dev/null +++ b/src/main/scala/ignition/core/utils/S3Utils.scala @@ -0,0 +1,91 @@ +package ignition.core.utils + +import com.amazonaws.auth.EnvironmentVariableCredentialsProvider +import com.amazonaws.services.s3.AmazonS3Client +import com.amazonaws.services.s3.model.{S3ObjectSummary, ObjectListing} +import ignition.core.jobs.utils.PathDateExtractor +import ignition.core.utils.DateUtils._ +import org.joda.time.DateTime + +import scala.collection.JavaConversions._ +import scala.collection.mutable +import scala.util.Try + +object S3Utils { + + implicit lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) + + def s3List(bucket: String, prefix: String, predicate: S3ObjectSummary => Boolean = _ => true) + (implicit s3: AmazonS3Client): List[S3ObjectSummary] = { + def inner(acc: mutable.ArrayBuffer[S3ObjectSummary], listing: ObjectListing): List[S3ObjectSummary] = { + acc ++= listing.getObjectSummaries.toList.filter(predicate) + if (listing.isTruncated) + inner(acc, s3.listNextBatchOfObjects(listing)) + else + acc.toList + } + + inner(new mutable.ArrayBuffer[S3ObjectSummary], s3.listObjects(bucket, prefix)) + } + + def s3ListAndFilterFiles(bucket: String, + prefix: String, + start: Option[DateTime] = None, + end: Option[DateTime] = None, + endsWith: Option[String] = None, + exclusionPattern: Option[String] = Option("_$folder$"), + predicate: 
S3ObjectSummary => Boolean = _ => true) + (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): List[S3ObjectSummary] = { + + def excludePatternValidation(s3Object: S3ObjectSummary, exclusionPatternOption: Option[String]): Option[S3ObjectSummary] = + exclusionPatternOption match { + case Some(pattern) if s3Object.getKey.contains(pattern) => None + case Some(_) | None => Option(s3Object) + } + + def endsWithValidation(s3Object: S3ObjectSummary, endsWithOption: Option[String]): Option[S3ObjectSummary] = + endsWithOption match { + case Some(pattern) if s3Object.getKey.endsWith(pattern) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def extractDateFromKey(s3Object: S3ObjectSummary): Option[DateTime] = + Try(pathDateExtractor.extractFromPath(s"s3://$bucket/${s3Object.getKey}")).toOption + + def startValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, startOption: Option[DateTime]): Option[S3ObjectSummary] = + startOption match { + case Some(startDate) if startDate.isEqualOrBefore(extractedDate) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def endValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, endOption: Option[DateTime]): Option[S3ObjectSummary] = + endOption match { + case Some(endDate) if endDate.isEqualOrAfter(extractedDate) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def applyPredicate(s3Object: S3ObjectSummary): Option[S3ObjectSummary] = + if (predicate(s3Object)) + Option(s3Object) + else + None + + val allValidations: S3ObjectSummary => Boolean = s3Object => { + val validatedS3Object = for { + withValidPattern <- excludePatternValidation(s3Object, exclusionPattern) + withValidEndsWith <- endsWithValidation(withValidPattern, endsWith) + extractedDate <- extractDateFromKey(withValidEndsWith) + withValidStart <- startValidation(withValidEndsWith, extractedDate, start) + withValidEnd <- endValidation(withValidStart, 
extractedDate, end) + valid <- applyPredicate(withValidEnd) + } yield valid + validatedS3Object.isDefined + } + + s3List(bucket, prefix, allValidations)(s3) + } + +} From 909136626ecf4daf300a66893087d8c06609c7e1 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 20 Nov 2015 18:26:42 -0200 Subject: [PATCH 058/268] Split compressed big files --- .../core/jobs/utils/SparkContextUtils.scala | 92 ++++++++++++++----- .../core/utils/AutoCloseableIterator.scala | 67 ++++++++++++++ 2 files changed, 138 insertions(+), 21 deletions(-) create mode 100644 src/main/scala/ignition/core/utils/AutoCloseableIterator.scala diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index d18d5f76..dec5ca13 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,20 +1,21 @@ package ignition.core.jobs.utils -import ignition.core.utils.ByteUtils +import java.io.InputStream + +import ignition.core.utils.DateUtils._ +import ignition.core.utils.{AutoCloseableIterator, ByteUtils} import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.io.{Text, LongWritable} +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapreduce.lib.input.TextInputFormat -import org.apache.spark.broadcast.Broadcast +import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.{Partitioner, SparkContext} -import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} -import org.apache.spark.rdd.{UnionRDD, RDD} import org.joda.time.DateTime -import ignition.core.utils.DateUtils._ import scala.collection.JavaConversions._ -import scala.collection.mutable.ArrayBuffer import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import scala.io.{Codec, Source} 
import scala.reflect.ClassTag import scala.util.Try @@ -22,6 +23,17 @@ import scala.util.control.NonFatal object SparkContextUtils { + def close(inputStream: InputStream, path: String): Unit = { + try { + inputStream.close() + } catch { + case NonFatal(ex) => + println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") + } + } + + case class BigFileSlice(index: Int) + case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { override def getPartition(key: Any): Int = index(key) } @@ -273,7 +285,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -292,13 +304,54 @@ object SparkContextUtils { println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") } finally { - try { - inputStream.close() - } catch { - case NonFatal(ex) => - println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") - } + close(inputStream, path) + } + } + } + } + + def readCompressedBigFile(file: HadoopFile, maxBytesPerPartition: Long, minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling, sampleCount: Int = 100): RDD[String] = { + val estimatedSize = 
sizeBasedFileHandling.estimatedSize(file) + val totalSlices = (estimatedSize / maxBytesPerPartition + 1).toInt + val slices = (0 until totalSlices).map(BigFileSlice.apply) + + val partitioner = { + val indexedPartitions: Map[Any, Int] = slices.map(s => s -> s.index).toMap + IndexedPartitioner(totalSlices, indexedPartitions) + } + val hadoopConf = _hadoopConf + + val partitionedSlices = sc.parallelize(slices.map(s => s -> ()), 2).partitionBy(partitioner) + + partitionedSlices.mapPartitions { slices => + val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } + val codecFactory = new CompressionCodecFactory(conf) + val hadoopPath = new Path(file.path) + val fileSystem = hadoopPath.getFileSystem(conf) + slices.flatMap { case (slice, _) => + val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { + case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) + case None => fileSystem.open(hadoopPath) } + val lines = Source.fromInputStream(inputStream)(Codec.UTF8).getLines() + + val lineSample = lines.take(sampleCount).toList + val linesPerSlice = { + val sampleSize = lineSample.map(_.size).sum + val estimatedAverageLineSize = Math.round(sampleSize / sampleCount.toFloat) + val estimatedTotalLines = Math.round(estimatedSize / estimatedAverageLineSize.toFloat) + estimatedTotalLines / totalSlices + 1 + } + + val linesAfterSeek = (lineSample.toIterator ++ lines).drop(linesPerSlice * slice.index) + + val finalLines = if (slice.index + 1 == totalSlices) // last slice, read until the end + linesAfterSeek + else + linesAfterSeek.take(linesPerSlice) + + AutoCloseableIterator.wrap(finalLines, () => close(inputStream, s"${file.path}, slice $slice")) } } } @@ -315,17 +368,14 @@ object SparkContextUtils { def read(file: HadoopFile, conf: Configuration) = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](conf = conf, fClass = classOf[TextInputFormat], kClass = classOf[LongWritable], vClass = 
classOf[Text], path = file.path).map(pair => pair._2.toString) - val confCompressed = confWith(maxBytesPerPartition / sizeBasedFileHandling.averageEstimatedCompressionRatio) val confUncompressed = confWith(maxBytesPerPartition) val union = new UnionRDD(sc, bigFiles.map { file => - val conf = if (sizeBasedFileHandling.isCompressed(file)) - confCompressed + if (sizeBasedFileHandling.isCompressed(file)) + readCompressedBigFile(file, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) else - confUncompressed - - read(file, conf) + read(file, confUncompressed) }) if (union.partitions.size < minPartitions) @@ -348,7 +398,7 @@ object SparkContextUtils { readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) } - private def createPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long, sizeBasedFileHandling: SizeBasedFileHandling): Partitioner = { + private def createSmallFilesPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long, sizeBasedFileHandling: SizeBasedFileHandling): Partitioner = { implicit val ordering: Ordering[HadoopFilePartition] = Ordering.by(p => -p.size) // Small partitions come first (highest priority) val pq: mutable.PriorityQueue[HadoopFilePartition] = mutable.PriorityQueue.empty diff --git a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala new file mode 100644 index 00000000..b3f054ba --- /dev/null +++ b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala @@ -0,0 +1,67 @@ +package ignition.core.utils + +import scala.util.Try +import scala.util.control.NonFatal + +object AutoCloseableIterator { + case object empty extends AutoCloseableIterator[Nothing] { + override def naiveHasNext() = false + override def naiveNext() = throw new Exception("Empty AutoCloseableIterator") + override def naiveClose() = {} + } + + def wrap[T](iterator: Iterator[T], close: () => Unit = () 
=> ()): AutoCloseableIterator[T] = new AutoCloseableIterator[T] { + override def naiveClose(): Unit = close() + override def naiveHasNext(): Boolean = iterator.hasNext + override def naiveNext(): T = iterator.next() + } +} + +trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { + // Naive functions should be implemented by the user as in a standard Iterator/AutoCloseable + def naiveHasNext(): Boolean + def naiveNext(): T + def naiveClose(): Unit + + var closed = false + + // hasNext closes the iterator and handles the case where it is already closed + override def hasNext(): Boolean = if (closed) + false + else { + val naiveResult = try { + naiveHasNext + } catch { + case NonFatal(e) => + Try { close } + throw e + } + if (naiveResult) + true + else { + close // auto close when exhausted + false + } + } + + // next closes the iterator and handles the case where it is already closed + override def next(): T = if (closed) + throw new RuntimeException("Trying to get next element on a closed iterator") + else if (hasNext()) + try { + naiveNext + } catch { + case NonFatal(e) => + Try { close } + throw e + } + else + throw new RuntimeException("Trying to get next element on an exhausted iterator") + + override def close() = if (!closed) { + closed = true + naiveClose + } + + override def finalize() = Try { close } +} From 368a9986fa019f15d7f303b403cf96821899609f Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 20 Nov 2015 18:28:33 -0200 Subject: [PATCH 059/268] Removed unused dependency --- build.sbt | 2 -- src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala | 1 - 2 files changed, 3 deletions(-) diff --git a/build.sbt b/build.sbt index 528d30cf..acdef9cb 100644 --- a/build.sbt +++ b/build.sbt @@ -19,8 +19,6 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "nl.basjes.hadoop" % 
"splittablegzip" % "1.2" - libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.2.4" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index dec5ca13..06ea71ee 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -361,7 +361,6 @@ object SparkContextUtils { minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq( - "io.compression.codecs" -> "org.apache.hadoop.io.compress.DefaultCodec,nl.basjes.hadoop.io.compress.SplittableGzipCodec,org.apache.hadoop.io.compress.BZip2Codec", "mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } From 016de5b8e9db0e8aa51bf6c00f1de880938de1e3 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 23 Nov 2015 10:48:56 -0200 Subject: [PATCH 060/268] pr review --- .../core/jobs/utils/SparkContextUtils.scala | 110 +++++++++++++++--- .../scala/ignition/core/utils/S3Utils.scala | 91 --------------- 2 files changed, 93 insertions(+), 108 deletions(-) delete mode 100644 src/main/scala/ignition/core/utils/S3Utils.scala diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 96f2341d..de5cdfca 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,22 +1,22 @@ package ignition.core.jobs.utils +import com.amazonaws.auth.EnvironmentVariableCredentialsProvider import com.amazonaws.services.s3.AmazonS3Client -import com.amazonaws.services.s3.model.{S3ObjectSummary, S3Object} +import 
com.amazonaws.services.s3.model.{ObjectListing, S3ObjectSummary} import ignition.core.utils.ByteUtils +import ignition.core.utils.DateUtils._ import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.io.{Text, LongWritable} +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapreduce.lib.input.TextInputFormat +import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.{Partitioner, SparkContext} -import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} -import org.apache.spark.rdd.{UnionRDD, RDD} import org.joda.time.DateTime -import ignition.core.utils.DateUtils._ -import ignition.core.utils.S3Utils._ import scala.collection.JavaConversions._ -import scala.collection.mutable.ArrayBuffer import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import scala.io.{Codec, Source} import scala.reflect.ClassTag import scala.util.Try @@ -24,15 +24,12 @@ import scala.util.control.NonFatal object SparkContextUtils { + implicit lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) + case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { override def getPartition(key: Any): Int = index(key) } - implicit class S3ObjectSummaryExtensions(s3Object: S3ObjectSummary) { - def toHadoopFile: HadoopFile = - HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) - } - case class HadoopFile(path: String, isDir: Boolean, size: Long) private case class HadoopFilePartition(size: Long, paths: Seq[String]) @@ -93,7 +90,7 @@ object SparkContextUtils { if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelListEndReadTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) + 
parallelListAndReadTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) } private def filterPaths(paths: Seq[String], @@ -341,7 +338,7 @@ object SparkContextUtils { union } - def parallelListEndReadTextFiles(paths: List[String], + def parallelListAndReadTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int, listOnWorkers: Boolean, @@ -479,17 +476,96 @@ object SparkContextUtils { innerListFiles(directories) } + private def s3List(bucket: String, prefix: String, predicate: S3ObjectSummary => Boolean = _ => true) + (implicit s3: AmazonS3Client): List[S3ObjectSummary] = { + def inner(acc: mutable.ArrayBuffer[S3ObjectSummary], listing: ObjectListing): List[S3ObjectSummary] = { + acc ++= listing.getObjectSummaries.toList.filter(predicate) + if (listing.isTruncated) + inner(acc, s3.listNextBatchOfObjects(listing)) + else + acc.toList + } + + inner(new mutable.ArrayBuffer[S3ObjectSummary], s3.listObjects(bucket, prefix)) + } + + def s3ListAndFilterFiles(bucket: String, + prefix: String, + start: Option[DateTime] = None, + end: Option[DateTime] = None, + endsWith: Option[String] = None, + exclusionPattern: Option[String] = Option("_$folder$"), + predicate: HadoopFile => Boolean = _ => true) + (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): List[HadoopFile] = { + + def excludePatternValidation(s3Object: S3ObjectSummary, exclusionPatternOption: Option[String]): Option[S3ObjectSummary] = + exclusionPatternOption match { + case Some(pattern) if s3Object.getKey.contains(pattern) => None + case Some(_) | None => Option(s3Object) + } + + def endsWithValidation(s3Object: S3ObjectSummary, endsWithOption: Option[String]): Option[S3ObjectSummary] = + endsWithOption match { + case Some(pattern) if s3Object.getKey.endsWith(pattern) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def extractDateFromKey(s3Object: S3ObjectSummary): Option[DateTime] = + 
Try(pathDateExtractor.extractFromPath(s"s3n://$bucket/${s3Object.getKey}")).toOption + + def startValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, startOption: Option[DateTime]): Option[S3ObjectSummary] = + startOption match { + case Some(startDate) if startDate.isEqualOrBefore(extractedDate) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def endValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, endOption: Option[DateTime]): Option[S3ObjectSummary] = + endOption match { + case Some(endDate) if endDate.isEqualOrAfter(extractedDate) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def applyPredicate(file: HadoopFile): Option[HadoopFile] = + if (predicate(file)) + Option(file) + else + None + + def toHadoopFile(s3Object: S3ObjectSummary): HadoopFile = + HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) + + val allValidations: S3ObjectSummary => Boolean = s3Object => { + val validatedFile = for { + withValidPattern <- excludePatternValidation(s3Object, exclusionPattern) + withValidEndsWith <- endsWithValidation(withValidPattern, endsWith) + extractedDate <- extractDateFromKey(withValidEndsWith) + withValidStart <- startValidation(withValidEndsWith, extractedDate, start) + withValidEnd <- endValidation(withValidStart, extractedDate, end) + hadoopFile = toHadoopFile(withValidEnd) + valid <- applyPredicate(hadoopFile) + } yield valid + validatedFile.isDefined + } + + s3List(bucket, prefix, allValidations)(s3).map(toHadoopFile) + } + + def s3FilterAndGetParallelTextFiles(bucket: String, prefix: String, startDate: Option[DateTime] = None, endDate: Option[DateTime] = None, endsWith: Option[String] = None, - predicate: S3ObjectSummary => Boolean = _ => true, + predicate: HadoopFile => Boolean = _ => true, maxBytesPerPartition: Long = 256 * 1000 * 1000, minPartitions: Int = 100, sizeBasedFileHandling: SizeBasedFileHandling = 
SizeBasedFileHandling()) - (implicit s3Client: AmazonS3Client, dateExtractor: PathDateExtractor): RDD[String] = { - val foundFiles = s3ListAndFilterFiles(bucket, prefix, startDate, endDate, predicate = predicate)(s3Client, dateExtractor).map(_.toHadoopFile) + (implicit s3Client: AmazonS3Client = amazonS3ClientFromEnvironmentVariables, + dateExtractor: PathDateExtractor): RDD[String] = { + val foundFiles = s3ListAndFilterFiles(bucket, prefix, startDate, endDate, predicate = predicate)(s3Client, dateExtractor) parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) } diff --git a/src/main/scala/ignition/core/utils/S3Utils.scala b/src/main/scala/ignition/core/utils/S3Utils.scala deleted file mode 100644 index 28866c4c..00000000 --- a/src/main/scala/ignition/core/utils/S3Utils.scala +++ /dev/null @@ -1,91 +0,0 @@ -package ignition.core.utils - -import com.amazonaws.auth.EnvironmentVariableCredentialsProvider -import com.amazonaws.services.s3.AmazonS3Client -import com.amazonaws.services.s3.model.{S3ObjectSummary, ObjectListing} -import ignition.core.jobs.utils.PathDateExtractor -import ignition.core.utils.DateUtils._ -import org.joda.time.DateTime - -import scala.collection.JavaConversions._ -import scala.collection.mutable -import scala.util.Try - -object S3Utils { - - implicit lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) - - def s3List(bucket: String, prefix: String, predicate: S3ObjectSummary => Boolean = _ => true) - (implicit s3: AmazonS3Client): List[S3ObjectSummary] = { - def inner(acc: mutable.ArrayBuffer[S3ObjectSummary], listing: ObjectListing): List[S3ObjectSummary] = { - acc ++= listing.getObjectSummaries.toList.filter(predicate) - if (listing.isTruncated) - inner(acc, s3.listNextBatchOfObjects(listing)) - else - acc.toList - } - - inner(new mutable.ArrayBuffer[S3ObjectSummary], s3.listObjects(bucket, prefix)) - } - - def s3ListAndFilterFiles(bucket: 
String, - prefix: String, - start: Option[DateTime] = None, - end: Option[DateTime] = None, - endsWith: Option[String] = None, - exclusionPattern: Option[String] = Option("_$folder$"), - predicate: S3ObjectSummary => Boolean = _ => true) - (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): List[S3ObjectSummary] = { - - def excludePatternValidation(s3Object: S3ObjectSummary, exclusionPatternOption: Option[String]): Option[S3ObjectSummary] = - exclusionPatternOption match { - case Some(pattern) if s3Object.getKey.contains(pattern) => None - case Some(_) | None => Option(s3Object) - } - - def endsWithValidation(s3Object: S3ObjectSummary, endsWithOption: Option[String]): Option[S3ObjectSummary] = - endsWithOption match { - case Some(pattern) if s3Object.getKey.endsWith(pattern) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) - } - - def extractDateFromKey(s3Object: S3ObjectSummary): Option[DateTime] = - Try(pathDateExtractor.extractFromPath(s"s3://$bucket/${s3Object.getKey}")).toOption - - def startValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, startOption: Option[DateTime]): Option[S3ObjectSummary] = - startOption match { - case Some(startDate) if startDate.isEqualOrBefore(extractedDate) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) - } - - def endValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, endOption: Option[DateTime]): Option[S3ObjectSummary] = - endOption match { - case Some(endDate) if endDate.isEqualOrAfter(extractedDate) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) - } - - def applyPredicate(s3Object: S3ObjectSummary): Option[S3ObjectSummary] = - if (predicate(s3Object)) - Option(s3Object) - else - None - - val allValidations: S3ObjectSummary => Boolean = s3Object => { - val validatedS3Object = for { - withValidPattern <- excludePatternValidation(s3Object, exclusionPattern) - withValidEndsWith <- 
endsWithValidation(withValidPattern, endsWith) - extractedDate <- extractDateFromKey(withValidEndsWith) - withValidStart <- startValidation(withValidEndsWith, extractedDate, start) - withValidEnd <- endValidation(withValidStart, extractedDate, end) - valid <- applyPredicate(withValidEnd) - } yield valid - validatedS3Object.isDefined - } - - s3List(bucket, prefix, allValidations)(s3) - } - -} From 7c23316dd058acce0a607b6e16ba0ff35460a28a Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 25 Nov 2015 14:10:27 -0200 Subject: [PATCH 061/268] fix lambda ref to close resources --- .../scala/ignition/core/utils/AutoCloseableIterator.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala index b3f054ba..bc294f6f 100644 --- a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala +++ b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala @@ -10,8 +10,8 @@ object AutoCloseableIterator { override def naiveClose() = {} } - def wrap[T](iterator: Iterator[T], close: () => Unit = () => ()): AutoCloseableIterator[T] = new AutoCloseableIterator[T] { - override def naiveClose(): Unit = close() + def wrap[T](iterator: Iterator[T], doClose: () => Unit = () => ()): AutoCloseableIterator[T] = new AutoCloseableIterator[T] { + override def naiveClose(): Unit = doClose() override def naiveHasNext(): Boolean = iterator.hasNext override def naiveNext(): T = iterator.next() } From 358459f9ea32143d2c63ef460986cef0d75345d7 Mon Sep 17 00:00:00 2001 From: Leandro Date: Fri, 4 Dec 2015 19:43:09 -0200 Subject: [PATCH 062/268] Small Xlint fixes --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- .../scala/ignition/core/utils/AutoCloseableIterator.scala | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala 
b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 97aed619..de3bf3ae 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -289,7 +289,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> Unit ), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -326,7 +326,7 @@ object SparkContextUtils { } val hadoopConf = _hadoopConf - val partitionedSlices = sc.parallelize(slices.map(s => s -> ()), 2).partitionBy(partitioner) + val partitionedSlices = sc.parallelize(slices.map(s => s -> Unit ), 2).partitionBy(partitioner) partitionedSlices.mapPartitions { slices => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } diff --git a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala index bc294f6f..4e3db808 100644 --- a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala +++ b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala @@ -26,7 +26,7 @@ trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { var closed = false // hasNext closes the iterator and handles the case where it is already closed - override def hasNext(): Boolean = if (closed) + override def hasNext: Boolean = if 
(closed) false else { val naiveResult = try { @@ -47,7 +47,7 @@ trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { // next closes the iterator and handles the case where it is already closed override def next(): T = if (closed) throw new RuntimeException("Trying to get next element on a closed iterator") - else if (hasNext()) + else if (hasNext) try { naiveNext } catch { From 5f54641cb7d6448148a0570599504125fb976eaa Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Dec 2015 15:40:17 -0200 Subject: [PATCH 063/268] Make it partially compatible with scala 2.11 and Xlint free and minor cleanups --- build.sbt | 8 ++++---- .../ignition/core/jobs/utils/SparkContextUtils.scala | 12 +++++++----- .../ignition/core/utils/AutoCloseableIterator.scala | 4 ++-- src/main/scala/ignition/core/utils/BetterTrace.scala | 3 ++- src/main/scala/ignition/core/utils/FutureUtils.scala | 2 +- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/build.sbt b/build.sbt index ec4d70bf..d0e2b029 100644 --- a/build.sbt +++ b/build.sbt @@ -4,7 +4,7 @@ version := "1.0" scalaVersion := "2.10.4" -scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings") +scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code") ideaExcludeFolders += ".idea" @@ -19,9 +19,7 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.2.4" - -libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" +libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.6" @@ -35,6 +33,8 @@ libraryDependencies += "org.joda" % "joda-convert" % "1.7" libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.9.6" +libraryDependencies += "commons-lang" % 
"commons-lang" % "2.6" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 97aed619..08f4a39d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -23,6 +23,8 @@ import scala.io.{Codec, Source} import scala.reflect.ClassTag import scala.util.Try import scala.util.control.NonFatal +import ignition.core.utils.ExceptionUtils._ + object SparkContextUtils { @@ -31,7 +33,7 @@ object SparkContextUtils { inputStream.close() } catch { case NonFatal(ex) => - println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") + println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") } } @@ -289,7 +291,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> Unit), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -305,8 +307,8 @@ object SparkContextUtils { Source.fromInputStream(inputStream)(Codec.UTF8).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } catch { case NonFatal(ex) => - println(s"Failed to read resource from '$path': 
${ex.getMessage} -- ${ex.getStackTraceString}") - throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") + println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") + throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") } finally { close(inputStream, path) } @@ -326,7 +328,7 @@ object SparkContextUtils { } val hadoopConf = _hadoopConf - val partitionedSlices = sc.parallelize(slices.map(s => s -> ()), 2).partitionBy(partitioner) + val partitionedSlices = sc.parallelize(slices.map(s => s -> Unit), 2).partitionBy(partitioner) partitionedSlices.mapPartitions { slices => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } diff --git a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala index bc294f6f..4e3db808 100644 --- a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala +++ b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala @@ -26,7 +26,7 @@ trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { var closed = false // hasNext closes the iterator and handles the case where it is already closed - override def hasNext(): Boolean = if (closed) + override def hasNext: Boolean = if (closed) false else { val naiveResult = try { @@ -47,7 +47,7 @@ trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { // next closes the iterator and handles the case where it is already closed override def next(): T = if (closed) throw new RuntimeException("Trying to get next element on a closed iterator") - else if (hasNext()) + else if (hasNext) try { naiveNext } catch { diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala index 158e261e..32d5ea5f 100644 --- 
a/src/main/scala/ignition/core/utils/BetterTrace.scala +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -1,5 +1,6 @@ package ignition.core.utils +import ignition.core.utils.ExceptionUtils._ // Used mainly to augment scalacheck traces in scalatest trait BetterTrace { def fail(message: String): Nothing @@ -7,7 +8,7 @@ trait BetterTrace { try { block } catch { - case t: Throwable => fail(s"${t.getMessage}: ${t.getStackTraceString}") + case t: Throwable => fail(s"${t.getMessage}: ${t.getFullStacktraceString}") } } diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 95b44c2f..4523a94f 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -6,7 +6,7 @@ import scala.util.{Failure, Success, Try} object FutureUtils { - def blockingFuture[T](body: =>T)(implicit ec: ExecutionContext): Future[T] = future { blocking { body } } + def blockingFuture[T](body: =>T)(implicit ec: ExecutionContext): Future[T] = Future { blocking { body } } implicit class FutureImprovements[V](future: Future[V]) { def toOptionOnFailure(errorHandler: (Throwable) => Option[V])(implicit ec: ExecutionContext): Future[Option[V]] = { From 0ec37240db44a57e0a2117f53bf8d577b4a71037 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Dec 2015 15:41:30 -0200 Subject: [PATCH 064/268] Make it partially compatible with scala 2.11 and Xlint free and minor cleanups --- src/main/scala/ignition/core/utils/ExceptionUtils.scala | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 src/main/scala/ignition/core/utils/ExceptionUtils.scala diff --git a/src/main/scala/ignition/core/utils/ExceptionUtils.scala b/src/main/scala/ignition/core/utils/ExceptionUtils.scala new file mode 100644 index 00000000..e2626764 --- /dev/null +++ b/src/main/scala/ignition/core/utils/ExceptionUtils.scala @@ -0,0 +1,9 @@ +package ignition.core.utils + +object 
ExceptionUtils { + + implicit class ExceptionImprovements(e: Throwable) { + def getFullStacktraceString(): String = org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(e) + } + +} From b66d05dfa1cd328442c52c80d07b19f7643b67a5 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Dec 2015 15:48:44 -0200 Subject: [PATCH 065/268] Renaming --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 6 +++--- src/main/scala/ignition/core/utils/BetterTrace.scala | 2 +- src/main/scala/ignition/core/utils/ExceptionUtils.scala | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 08f4a39d..6765009d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -33,7 +33,7 @@ object SparkContextUtils { inputStream.close() } catch { case NonFatal(ex) => - println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") + println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getFullStackTraceString}") } } @@ -307,8 +307,8 @@ object SparkContextUtils { Source.fromInputStream(inputStream)(Codec.UTF8).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } catch { case NonFatal(ex) => - println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") - throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") + println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStackTraceString}") + throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStackTraceString}") } finally { close(inputStream, path) } diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala index 
32d5ea5f..387f49f7 100644 --- a/src/main/scala/ignition/core/utils/BetterTrace.scala +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -8,7 +8,7 @@ trait BetterTrace { try { block } catch { - case t: Throwable => fail(s"${t.getMessage}: ${t.getFullStacktraceString}") + case t: Throwable => fail(s"${t.getMessage}: ${t.getFullStackTraceString}") } } diff --git a/src/main/scala/ignition/core/utils/ExceptionUtils.scala b/src/main/scala/ignition/core/utils/ExceptionUtils.scala index e2626764..1ae33568 100644 --- a/src/main/scala/ignition/core/utils/ExceptionUtils.scala +++ b/src/main/scala/ignition/core/utils/ExceptionUtils.scala @@ -3,7 +3,7 @@ package ignition.core.utils object ExceptionUtils { implicit class ExceptionImprovements(e: Throwable) { - def getFullStacktraceString(): String = org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(e) + def getFullStackTraceString(): String = org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(e) } } From 2f6741dfd3d5ff30738cda6c2cc3279cda06fe0e Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Dec 2015 16:11:10 -0200 Subject: [PATCH 066/268] Use null instead of Unit because Unit isnt serialiable --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 6765009d..648da060 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -291,7 +291,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> Unit), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, 
sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> null), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -328,7 +328,7 @@ object SparkContextUtils { } val hadoopConf = _hadoopConf - val partitionedSlices = sc.parallelize(slices.map(s => s -> Unit), 2).partitionBy(partitioner) + val partitionedSlices = sc.parallelize(slices.map(s => s -> null), 2).partitionBy(partitioner) partitionedSlices.mapPartitions { slices => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } From 84e98f490d409f8b9de741eada91d932979b1eff Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 27 Nov 2015 10:54:12 -0200 Subject: [PATCH 067/268] new filter and get text files --- .../core/jobs/utils/SparkContextUtils.scala | 448 ++++++++++-------- 1 file changed, 250 insertions(+), 198 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 648da060..1afbd74f 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -4,9 +4,9 @@ import java.io.InputStream import com.amazonaws.auth.EnvironmentVariableCredentialsProvider import com.amazonaws.services.s3.AmazonS3Client -import com.amazonaws.services.s3.model.{ObjectListing, S3ObjectSummary} -import ignition.core.utils.{AutoCloseableIterator, ByteUtils} +import com.amazonaws.services.s3.model.{ListObjectsRequest, ObjectListing, S3ObjectSummary} import ignition.core.utils.DateUtils._ +import ignition.core.utils.{AutoCloseableIterator, ByteUtils, HadoopUtils} import 
org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.io.compress.CompressionCodecFactory @@ -21,14 +21,24 @@ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.io.{Codec, Source} import scala.reflect.ClassTag -import scala.util.Try +import scala.util.{Failure, Success, Try} import scala.util.control.NonFatal import ignition.core.utils.ExceptionUtils._ object SparkContextUtils { - def close(inputStream: InputStream, path: String): Unit = { + private case class BigFileSlice(index: Int) + + private case class HadoopFilePartition(size: Long, paths: Seq[String]) + + private case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { + override def getPartition(key: Any): Int = index(key) + } + + private lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) + + private def close(inputStream: InputStream, path: String): Unit = { try { inputStream.close() } catch { @@ -37,17 +47,8 @@ object SparkContextUtils { } } - case class BigFileSlice(index: Int) - implicit lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) - - case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { - override def getPartition(key: Any): Int = index(key) - } - case class HadoopFile(path: String, isDir: Boolean, size: Long) - private case class HadoopFilePartition(size: Long, paths: Seq[String]) - implicit class SparkContextImprovements(sc: SparkContext) { lazy val _hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) @@ -95,18 +96,6 @@ object SparkContextUtils { processPaths((p) => sc.textFile(p), paths, minimumPaths) } - private def processParallelTextFiles(paths: Seq[String], - minimumPaths: Int, - maxBytesPerPartition: Long, - 
minPartitions: Int, - listOnWorkers: Boolean): RDD[String] = { - val splittedPaths = paths.flatMap(ignition.core.utils.HadoopUtils.getPathStrings) - if (splittedPaths.size < minimumPaths) - throw new Exception(s"Not enough paths found for $paths") - - parallelListAndReadTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) - } - private def filterPaths(paths: Seq[String], requireSuccess: Boolean, inclusiveStartDate: Boolean, @@ -141,7 +130,6 @@ object SparkContextUtils { } - def getFilteredPaths(paths: Seq[String], requireSuccess: Boolean, inclusiveStartDate: Boolean, @@ -154,7 +142,6 @@ object SparkContextUtils { filterPaths(paths, requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) } - lazy val hdfsPathPrefix = sc.master.replaceFirst("spark://(.*):7077", "hdfs://$1:9000/") def synchToHdfs(paths: Seq[String], pathsToRdd: (Seq[String], Int) => RDD[String], forceSynch: Boolean): Seq[String] = { @@ -184,16 +171,6 @@ object SparkContextUtils { processTextFiles(paths, minimumPaths) } - def getParallelTextFiles(paths: Seq[String], - maxBytesPerPartition: Long = 256 * 1000 * 1000, - minPartitions: Int = 100, - synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1, listOnWorkers: Boolean = false): RDD[String] = { - if (synchLocally) - processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) - else - processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) - } - @deprecated("It may incur heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") def filterAndGetTextFiles(path: String, requireSuccess: Boolean = false, @@ -212,26 +189,6 @@ object SparkContextUtils { getTextFiles(paths, synchLocally, forceSynch, minimumPaths) } - def filterAndGetParallelTextFiles(path: String, - maxBytesPerPartition: 
Long = 256 * 1000 * 1000, - minPartitions: Int = 100, - requireSuccess: Boolean = false, - inclusiveStartDate: Boolean = true, - startDate: Option[DateTime] = None, - inclusiveEndDate: Boolean = true, - endDate: Option[DateTime] = None, - lastN: Option[Int] = None, - synchLocally: Boolean = false, - forceSynch: Boolean = false, - ignoreMalformedDates: Boolean = false, - minimumPaths: Int = 1, - listOnWorkers: Boolean = false)(implicit dateExtractor: PathDateExtractor): RDD[String] = { - val paths = getFilteredPaths(Seq(path), requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) - if (paths.size < minimumPaths) - throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") - getParallelTextFiles(paths, maxBytesPerPartition, minPartitions, synchLocally, forceSynch, minimumPaths, listOnWorkers) - } - private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { processPaths((p) => sc.sequenceFile(p, classOf[LongWritable], classOf[org.apache.hadoop.io.BytesWritable]) .map({ case (k, v) => Try { ByteUtils.toString(v.getBytes, 0, v.getLength, "UTF-8") } }), paths, minimumPaths) @@ -287,11 +244,11 @@ object SparkContextUtils { } - def readSmallFiles(smallFiles: List[HadoopFile], - maxBytesPerPartition: Long, - minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> null), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + private def readSmallFiles(smallFiles: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file ->()), 
2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -316,8 +273,8 @@ object SparkContextUtils { } } - def readCompressedBigFile(file: HadoopFile, maxBytesPerPartition: Long, minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling, sampleCount: Int = 100): RDD[String] = { + private def readCompressedBigFile(file: HadoopFile, maxBytesPerPartition: Long, minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling, sampleCount: Int = 100): RDD[String] = { val estimatedSize = sizeBasedFileHandling.estimatedSize(file) val totalSlices = (estimatedSize / maxBytesPerPartition + 1).toInt val slices = (0 until totalSlices).map(BigFileSlice.apply) @@ -362,10 +319,10 @@ object SparkContextUtils { } } - def readBigFiles(bigFiles: List[HadoopFile], - maxBytesPerPartition: Long, - minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { + private def readBigFiles(bigFiles: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq( "mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -392,21 +349,27 @@ object SparkContextUtils { def parallelListAndReadTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int, - listOnWorkers: Boolean, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { - - val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) - parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, 
sizeBasedFileHandling) + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) + (implicit dateExtractor: PathDateExtractor): RDD[String] = { + val foundFiles = paths.flatMap(smartList(_)).filter(_.size > 0) + parallelReadTextFiles(foundFiles, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, sizeBasedFileHandling = sizeBasedFileHandling) } def parallelReadTextFiles(files: List[HadoopFile], - maxBytesPerPartition: Long, - minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { - val (bigFiles, smallFiles) = files.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) - sc.union( - readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), - readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling(), + synchLocally: Option[String] = None, + forceSynch: Boolean = false): RDD[String] = { + if (synchLocally.isDefined) + doSync(files, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, synchLocally = synchLocally.get, + sizeBasedFileHandling = sizeBasedFileHandling, forceSynch = forceSynch) + else { + val (bigFiles, smallFiles) = files.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) + sc.union( + readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), + readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + } } private def createSmallFilesPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long, sizeBasedFileHandling: SizeBasedFileHandling): Partitioner = { @@ -435,11 +398,9 @@ object SparkContextUtils { IndexedPartitioner(partitions.size, indexedPartitions) } - - private def executeListOnWorkers(paths: RDD[String]): 
List[HadoopFile] = { - val hadoopConf = _hadoopConf + private def executeDriverList(paths: Seq[String]): List[HadoopFile] = { + val conf = _hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } paths.flatMap { path => - val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val hadoopPath = new Path(path) val fileSystem = hadoopPath.getFileSystem(conf) val tryFind = try { @@ -462,162 +423,253 @@ object SparkContextUtils { val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList } - }.collect().toList + }.toList } - - def parallelListFiles(paths: List[String]): List[HadoopFile] = { - - val directories = paths.map(HadoopFile(_, isDir = true, 0)) - + private def driverListFiles(path: String): List[HadoopFile] = { def innerListFiles(remainingDirectories: List[HadoopFile]): List[HadoopFile] = { if (remainingDirectories.isEmpty) { Nil } else { - val remainingPaths = remainingDirectories.map(_.path) - val pathsRDD = sc.parallelize(remainingPaths, remainingPaths.size / 2) - val (dirs, files) = executeListOnWorkers(pathsRDD).partition(_.isDir) + val (dirs, files) = executeDriverList(remainingDirectories.map(_.path)).partition(_.isDir) files ++ innerListFiles(dirs) } } - innerListFiles(directories) + innerListFiles(List(HadoopFile(path, isDir = true, 0))) } + def s3ListCommonPrefixes(bucket: String, prefix: String, delimiter: String = "/") + (implicit s3: AmazonS3Client): Stream[String] = { + def inner(current: ObjectListing): Stream[String] = + if (current.isTruncated) + current.getCommonPrefixes.toStream ++ inner(s3.listNextBatchOfObjects(current)) + else + current.getCommonPrefixes.toStream - private def executeDriverList(paths: Seq[String]): List[HadoopFile] = { - val conf = _hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc 
} - paths.flatMap { path => - val hadoopPath = new Path(path) - val fileSystem = hadoopPath.getFileSystem(conf) - val tryFind = try { - val status = fileSystem.getFileStatus(hadoopPath) - if (status.isDirectory) { - val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) - Option(sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList) - } else if (status.isFile) { - Option(List(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen))) - } else { - None - } - } catch { - case e: java.io.FileNotFoundException => - None - } - - tryFind.getOrElse { - // Maybe is glob or not found - val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList - } - }.toList + val request = new ListObjectsRequest(bucket, prefix, null, delimiter, 1000) + inner(s3.listObjects(request)) } - def driverListFiles(paths: List[String]): List[HadoopFile] = { - - val directories = paths.map(HadoopFile(_, isDir = true, 0)) + def s3ListObjects(bucket: String, prefix: String) + (implicit s3: AmazonS3Client): Stream[S3ObjectSummary] = { + def inner(current: ObjectListing): Stream[S3ObjectSummary] = + if (current.isTruncated) + current.getObjectSummaries.toStream ++ inner(s3.listNextBatchOfObjects(current)) + else + current.getObjectSummaries.toStream + + inner(s3.listObjects(bucket, prefix)) + } + + def s3NarrowPaths(bucket: String, + prefix: String, + delimiter: String = "/", + inclusiveStartDate: Boolean = true, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = true, + endDate: Option[DateTime] = None, + ignoreHours: Boolean = true) + (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): Stream[String] = { + + def isGoodDate(date: DateTime): Boolean = { + val startDateToCompare = startDate.map(date => if (ignoreHours) date.withTimeAtStartOfDay() else date) + val 
endDateToCompare = endDate.map(date => if (ignoreHours) date.withTime(23, 59, 59, 999) else date) + val goodStartDate = startDateToCompare.isEmpty || (inclusiveStartDate && date.saneEqual(startDateToCompare.get) || date.isAfter(startDateToCompare.get)) + val goodEndDate = endDateToCompare.isEmpty || (inclusiveEndDate && date.saneEqual(endDateToCompare.get) || date.isBefore(endDateToCompare.get)) + goodStartDate && goodEndDate + } - def innerListFiles(remainingDirectories: List[HadoopFile]): List[HadoopFile] = { - if (remainingDirectories.isEmpty) { - Nil - } else { - val (dirs, files) = executeDriverList(remainingDirectories.map(_.path)).partition(_.isDir) - files ++ innerListFiles(dirs) + def classifyPath(path: String): Either[String, (String, DateTime)] = + Try(pathDateExtractor.extractFromPath(s"s3n://$bucket/$path")) match { + case Success(date) => Right(path -> date) + case Failure(_) => Left(path) } + + s3ListCommonPrefixes(bucket, prefix, delimiter).map(classifyPath).flatMap { + case Left(prefixWithoutDate) => s3NarrowPaths(bucket, prefixWithoutDate, delimiter, inclusiveStartDate, + startDate, inclusiveEndDate, endDate, ignoreHours) + case Right((prefixWithDate, date)) if isGoodDate(date) => List(s"s3n://$bucket/$prefixWithDate") + case Right(_) => List.empty } - innerListFiles(directories) } - private def s3List(bucket: String, prefix: String, predicate: S3ObjectSummary => Boolean = _ => true) - (implicit s3: AmazonS3Client): List[S3ObjectSummary] = { - def inner(acc: mutable.ArrayBuffer[S3ObjectSummary], listing: ObjectListing): List[S3ObjectSummary] = { - acc ++= listing.getObjectSummaries.toList.filter(predicate) - if (listing.isTruncated) - inner(acc, s3.listNextBatchOfObjects(listing)) - else - acc.toList + private def s3List(path: String, + inclusiveStartDate: Boolean, + startDate: Option[DateTime], + inclusiveEndDate: Boolean, + endDate: Option[DateTime], + exclusionPattern: Option[String]) + (implicit s3: AmazonS3Client, dateExtractor: 
PathDateExtractor): Stream[S3ObjectSummary] = { + + val s3Pattern = "s3n?://([^/]+)(.+)".r + + def extractBucketAndPrefix(path: String): Option[(String, String)] = path match { + case s3Pattern(bucket, prefix) => Option(bucket -> prefix.dropWhile(_ == '/')) + case _ => None } - inner(new mutable.ArrayBuffer[S3ObjectSummary], s3.listObjects(bucket, prefix)) + extractBucketAndPrefix(path) match { + case Some((pathBucket, pathPrefix)) => + s3NarrowPaths(pathBucket, pathPrefix, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, + startDate = startDate, endDate = endDate).flatMap(extractBucketAndPrefix).flatMap { + case (bucket, prefix) => s3ListObjects(bucket, prefix) + } + case _ => Stream.empty + } } - def s3ListAndFilterFiles(bucket: String, - prefix: String, - start: Option[DateTime] = None, - end: Option[DateTime] = None, - endsWith: Option[String] = None, - exclusionPattern: Option[String] = Option("_$folder$"), - predicate: HadoopFile => Boolean = _ => true) - (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): List[HadoopFile] = { - - def excludePatternValidation(s3Object: S3ObjectSummary, exclusionPatternOption: Option[String]): Option[S3ObjectSummary] = - exclusionPatternOption match { - case Some(pattern) if s3Object.getKey.contains(pattern) => None - case Some(_) | None => Option(s3Object) + def listAndFilterFiles(path: String, + requireSuccess: Boolean = false, + inclusiveStartDate: Boolean = true, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = true, + endDate: Option[DateTime] = None, + lastN: Option[Int] = None, + ignoreMalformedDates: Boolean = false, + endsWith: Option[String] = None, + exclusionPattern: Option[String] = Option(".*_temporary.*|.*_\\$folder.*"), + predicate: HadoopFile => Boolean = _ => true) + (implicit dateExtractor: PathDateExtractor): List[HadoopFile] = { + + def isSuccessFile(file: HadoopFile): Boolean = + file.path.endsWith("_SUCCESS") || 
file.path.endsWith("_FINISHED") + + def extractDateFromFile(file: HadoopFile): Option[DateTime] = + Try(dateExtractor.extractFromPath(file.path)).toOption + + def excludePatternValidation(file: HadoopFile): Option[HadoopFile] = + exclusionPattern match { + case Some(pattern) if file.path.matches(pattern) => None + case Some(_) | None => Option(file) } - def endsWithValidation(s3Object: S3ObjectSummary, endsWithOption: Option[String]): Option[S3ObjectSummary] = - endsWithOption match { - case Some(pattern) if s3Object.getKey.endsWith(pattern) => Option(s3Object) + def endsWithValidation(file: HadoopFile): Option[HadoopFile] = + endsWith match { + case Some(pattern) if file.path.endsWith(pattern) => Option(file) + case Some(_) if isSuccessFile(file) => Option(file) case Some(_) => None - case None => Option(s3Object) + case None => Option(file) } - def extractDateFromKey(s3Object: S3ObjectSummary): Option[DateTime] = - Try(pathDateExtractor.extractFromPath(s"s3n://$bucket/${s3Object.getKey}")).toOption + def applyPredicate(file: HadoopFile): Option[HadoopFile] = + if (predicate(file)) Option(file) else None - def startValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, startOption: Option[DateTime]): Option[S3ObjectSummary] = - startOption match { - case Some(startDate) if startDate.isEqualOrBefore(extractedDate) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) + def dateValidation(file: HadoopFile): Option[HadoopFile] = { + val tryDate = extractDateFromFile(file) + if (tryDate.isEmpty && ignoreMalformedDates) + None + else { + val date = tryDate.get + val goodStartDate = startDate.isEmpty || (inclusiveStartDate && date.saneEqual(startDate.get) || date.isAfter(startDate.get)) + val goodEndDate = endDate.isEmpty || (inclusiveEndDate && date.saneEqual(endDate.get) || date.isBefore(endDate.get)) + if (goodStartDate && goodEndDate) Some(file) else None } + } - def endValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, 
endOption: Option[DateTime]): Option[S3ObjectSummary] = - endOption match { - case Some(endDate) if endDate.isEqualOrAfter(extractedDate) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) - } + val preValidations: HadoopFile => Boolean = hadoopFile => { + val validatedFile = for { + _ <- excludePatternValidation(hadoopFile) + _ <- endsWithValidation(hadoopFile) + _ <- dateValidation(hadoopFile) + valid <- applyPredicate(hadoopFile) + } yield valid + validatedFile.isDefined + } - def applyPredicate(file: HadoopFile): Option[HadoopFile] = - if (predicate(file)) - Option(file) + val preFilteredFiles = smartList(path, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, + startDate = startDate, endDate = endDate, exclusionPattern = exclusionPattern).filter(preValidations) + + val filesByDate = preFilteredFiles.groupBy(extractDateFromFile).collect { + case (Some(date), files) => date -> files + } + + val posFilteredFiles = + if (requireSuccess) + filesByDate.filter { case (_, files) => files.exists(isSuccessFile) } else - None + filesByDate + + val allFiles = if (lastN.isDefined) + posFilteredFiles.toList.sortBy(_._1).reverse.take(lastN.get).flatMap(_._2) + else + posFilteredFiles.toList.flatMap(_._2) + + allFiles.sortBy(_.path) + } + + def smartList(path: String, + inclusiveStartDate: Boolean = false, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = false, + endDate: Option[DateTime] = None, + exclusionPattern: Option[String] = None)(implicit pathDateExtractor: PathDateExtractor): Stream[HadoopFile] = { def toHadoopFile(s3Object: S3ObjectSummary): HadoopFile = HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) - val allValidations: S3ObjectSummary => Boolean = s3Object => { - val validatedFile = for { - withValidPattern <- excludePatternValidation(s3Object, exclusionPattern) - withValidEndsWith <- endsWithValidation(withValidPattern, endsWith) - 
extractedDate <- extractDateFromKey(withValidEndsWith) - withValidStart <- startValidation(withValidEndsWith, extractedDate, start) - withValidEnd <- endValidation(withValidStart, extractedDate, end) - hadoopFile = toHadoopFile(withValidEnd) - valid <- applyPredicate(hadoopFile) - } yield valid - validatedFile.isDefined + def listPath(path: String): Stream[HadoopFile] = { + if (path.startsWith("s3")) { + s3List(path, inclusiveStartDate = inclusiveStartDate, startDate = startDate, inclusiveEndDate = inclusiveEndDate, + endDate = endDate, exclusionPattern = exclusionPattern)(amazonS3ClientFromEnvironmentVariables, pathDateExtractor ).map(toHadoopFile) + } else { + driverListFiles(path).toStream + } } - s3List(bucket, prefix, allValidations)(s3).map(toHadoopFile) + HadoopUtils.getPathStrings(path).toStream.flatMap(listPath) } + def filterAndGetParallelTextFiles(path: String, + requireSuccess: Boolean = false, + inclusiveStartDate: Boolean = true, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = true, + endDate: Option[DateTime] = None, + lastN: Option[Int] = None, + ignoreMalformedDates: Boolean = false, + endsWith: Option[String] = None, + predicate: HadoopFile => Boolean = _ => true, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling(), + minimumFiles: Int = 1, + synchLocally: Option[String] = None, + forceSynch: Boolean = false) + (implicit dateExtractor: PathDateExtractor): RDD[String] = { + + val foundFiles = listAndFilterFiles(path, requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, + endDate, lastN, ignoreMalformedDates, endsWith, predicate = predicate) + + if (foundFiles.size < minimumFiles) + throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of files $foundFiles is less than the required") + + parallelReadTextFiles(foundFiles, maxBytesPerPartition = 
maxBytesPerPartition, minPartitions = minPartitions, + sizeBasedFileHandling = sizeBasedFileHandling, synchLocally = synchLocally, forceSynch = forceSynch) + } + + private def doSync(hadoopFiles: List[HadoopFile], + synchLocally: String, + forceSynch: Boolean, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + require(!synchLocally.contains("*"), "Globs are not supported on the sync key") + + def syncPath(suffix: String) = s"$hdfsPathPrefix/_core_ignition_sync_hdfs_cache/$suffix" + + val hashKey = Integer.toHexString(hadoopFiles.toSet.hashCode()) + + lazy val foundLocalPaths = getStatus(syncPath(s"$synchLocally/$hashKey/{_SUCCESS,_FINISHED}"), removeEmpty = false) + + val cacheKey = syncPath(s"$synchLocally/$hashKey") + + if (forceSynch || foundLocalPaths.isEmpty) { + delete(new Path(syncPath(s"$synchLocally/"))) + val data = parallelReadTextFiles(hadoopFiles, maxBytesPerPartition, minPartitions, synchLocally = None) + data.saveAsTextFile(cacheKey) + } - def s3FilterAndGetParallelTextFiles(bucket: String, - prefix: String, - startDate: Option[DateTime] = None, - endDate: Option[DateTime] = None, - endsWith: Option[String] = None, - predicate: HadoopFile => Boolean = _ => true, - maxBytesPerPartition: Long = 256 * 1000 * 1000, - minPartitions: Int = 100, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) - (implicit s3Client: AmazonS3Client = amazonS3ClientFromEnvironmentVariables, - dateExtractor: PathDateExtractor): RDD[String] = { - val foundFiles = s3ListAndFilterFiles(bucket, prefix, startDate, endDate, predicate = predicate)(s3Client, dateExtractor) - parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) + sc.textFile(cacheKey) } } From a1d226a8cdf018f0652d06de85ffa11b632531a7 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 8 Dec 2015 09:55:21 -0200 Subject: 
[PATCH 068/268] merge --- src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 1afbd74f..9a96f78d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -248,7 +248,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file ->()), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> null), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } From f5ad7f29afdd1040d0ca54e94ebf44137dd286f9 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 8 Dec 2015 13:38:16 -0200 Subject: [PATCH 069/268] fix empty file filter --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 9a96f78d..bed7e8f0 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -351,7 +351,7 @@ object SparkContextUtils { minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) (implicit 
dateExtractor: PathDateExtractor): RDD[String] = { - val foundFiles = paths.flatMap(smartList(_)).filter(_.size > 0) + val foundFiles = paths.flatMap(smartList(_)) parallelReadTextFiles(foundFiles, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, sizeBasedFileHandling = sizeBasedFileHandling) } @@ -361,11 +361,12 @@ object SparkContextUtils { sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling(), synchLocally: Option[String] = None, forceSynch: Boolean = false): RDD[String] = { + val filteredFiles = files.filter(_.size > 0) if (synchLocally.isDefined) - doSync(files, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, synchLocally = synchLocally.get, + doSync(filteredFiles, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, synchLocally = synchLocally.get, sizeBasedFileHandling = sizeBasedFileHandling, forceSynch = forceSynch) else { - val (bigFiles, smallFiles) = files.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) + val (bigFiles, smallFiles) = filteredFiles.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) sc.union( readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) From 5587537b7e42136daf6ffcae53a9754c19b55fd2 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 10 Dec 2015 09:51:40 -0200 Subject: [PATCH 070/268] fix narrow paths for paths without common prefixes (like final folders) --- .../core/jobs/utils/SparkContextUtils.scala | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index bed7e8f0..4eab7baf 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ 
b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -486,12 +486,16 @@ object SparkContextUtils { case Failure(_) => Left(path) } - s3ListCommonPrefixes(bucket, prefix, delimiter).map(classifyPath).flatMap { - case Left(prefixWithoutDate) => s3NarrowPaths(bucket, prefixWithoutDate, delimiter, inclusiveStartDate, - startDate, inclusiveEndDate, endDate, ignoreHours) - case Right((prefixWithDate, date)) if isGoodDate(date) => List(s"s3n://$bucket/$prefixWithDate") - case Right(_) => List.empty - } + val commonPrefixes = s3ListCommonPrefixes(bucket, prefix, delimiter).map(classifyPath) + + if (commonPrefixes.isEmpty) + Stream(s"s3n://$bucket/$prefix") + else + commonPrefixes.toStream.flatMap { + case Left(prefixWithoutDate) => s3NarrowPaths(bucket, prefixWithoutDate, delimiter, inclusiveStartDate, startDate, inclusiveEndDate, endDate, ignoreHours) + case Right((prefixWithDate, date)) if isGoodDate(date) => Stream(s"s3n://$bucket/$prefixWithDate") + case Right(_) => Stream.empty + } } private def s3List(path: String, From b253f29f66b2ba6858c46e681237e5f1f6c1cf1c Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 21 Dec 2015 13:34:59 -0200 Subject: [PATCH 071/268] Added some new utils --- .../ignition/core/utils/CollectionUtils.scala | 26 +++++++++++++++++++ .../core/utils/CollectionUtilsSpec.scala | 13 ++++++++++ 2 files changed, 39 insertions(+) diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index eea4755e..f98fb7ec 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -6,7 +6,32 @@ import scalaz.Validation object CollectionUtils { + + + implicit class SeqImprovements[A](xs: Seq[A]) { + def orElseIfEmpty[B >: A](alternative: => Seq[B]): Seq[B] = { + if (xs.nonEmpty) + xs + else + alternative + } + } + implicit class TraversableOnceImprovements[A](xs: TraversableOnce[A]) { + def 
maxOption(implicit cmp: Ordering[A]): Option[A] = { + if (xs.isEmpty) + None + else + Option(xs.max) + } + + def minOption(implicit cmp: Ordering[A]): Option[A] = { + if (xs.isEmpty) + None + else + Option(xs.min) + } + def maxByOption[B](f: A => B)(implicit cmp: Ordering[B]): Option[A] = { if (xs.isEmpty) None @@ -65,6 +90,7 @@ object CollectionUtils { builder.result } + } implicit class ValidatedIterableLike[T, R, Repr <: IterableLike[Validation[R, T], Repr]](seq: IterableLike[Validation[R, T], Repr]) { diff --git a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala index f01b8a34..548b2423 100644 --- a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala @@ -32,5 +32,18 @@ class CollectionUtilsSpec extends FlatSpec with ShouldMatchers { list.compressBy(_.value) shouldBe List(MyObj("p1", "v1"), MyObj("p1", "v2")) } + it should "provide orElseIfEmpty" in { + Seq.empty[String].orElseIfEmpty(Seq("something")) shouldBe Seq("something") + Seq("not empty").orElseIfEmpty(Seq("something")) shouldBe Seq("not empty") + } + + it should "provide maxOption and minOption" in { + Seq.empty[Int].maxOption shouldBe None + Seq(1, 3, 2).maxOption shouldBe Some(3) + + Seq.empty[Int].minOption shouldBe None + Seq(1, 3, 2).minOption shouldBe Some(1) + } + } From 352ee0b4d584c4d38ef8bf3bd1b4d8320f0adf4a Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 22 Dec 2015 11:26:40 -0200 Subject: [PATCH 072/268] Minor change --- build.sbt | 2 +- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/build.sbt b/build.sbt index d0e2b029..5de79888 100644 --- a/build.sbt +++ b/build.sbt @@ -4,7 +4,7 @@ version := "1.0" scalaVersion := "2.10.4" -scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code") +scalacOptions 
++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") ideaExcludeFolders += ".idea" diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index aa4dcc76..8430d4ef 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -13,9 +13,11 @@ object CoreJobRunner { // Used to provide contextual logging def setLoggingContextValues(config: RunnerConfig): Unit = { - org.slf4j.MDC.put("setupName", config.setupName) - org.slf4j.MDC.put("tag", config.tag) - org.slf4j.MDC.put("user", config.user) + Try { // yes, this may fail but we don't want everything to shut down + org.slf4j.MDC.put("setupName", config.setupName) + org.slf4j.MDC.put("tag", config.tag) + org.slf4j.MDC.put("user", config.user) + } } case class RunnerConfig(setupName: String = "nosetup", From d780ea589d90f4d5683de05a8ca3339ce66a1fd1 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 15 Jan 2016 14:03:32 -0200 Subject: [PATCH 073/268] Make try work even if the exception is fatall --- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index 8430d4ef..bbede553 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -13,10 +13,13 @@ object CoreJobRunner { // Used to provide contextual logging def setLoggingContextValues(config: RunnerConfig): Unit = { - Try { // yes, this may fail but we don't want everything to shut down + try { // yes, this may fail but we don't want everything to shut down org.slf4j.MDC.put("setupName", config.setupName) org.slf4j.MDC.put("tag", config.tag) org.slf4j.MDC.put("user", config.user) + } catch { + case e: 
Throwable => + // cry } } From 400b1f0d9cfdfb54183f744e9a5f5cf3f3a03df9 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 1 Feb 2016 10:37:44 -0200 Subject: [PATCH 074/268] zeppelin setup --- remote_hook.sh | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 48ba9735..5d4bbad1 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -49,6 +49,23 @@ on_trap_exit() { rm -f "${RUNNING_FILE}" } +install_and_run_zeppelin() { + if [[ ! -d "zeppelin" ]]; then + wget "http://www.us.apache.org/dist/incubator/zeppelin/0.5.6-incubating/zeppelin-0.5.6-incubating-bin-all.tgz" -O zeppelin.tar.gz + tar xvzf zeppelin.tar.gz > /tmp/zeppelin_install.log + mv `ls -d zeppelin-*` zeppelin + fi + if [[ -f "zeppelin/bin/zeppelin.sh" ]]; then + export MASTER="${JOB_MASTER}" + export ZEPPELIN_PORT="8081" + export SPARK_HOME="/root/spark" + export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH}" + sudo -E zeppelin/bin/zeppelin.sh + else + notify_error_and_exit "Not found zeppelin installation" + fi +} + trap "on_trap_exit" EXIT @@ -74,10 +91,11 @@ if [[ "${USE_YARN}" == "yes" ]]; then export SPARK_WORKER_MEMORY=${SPARK_MEM_PARAM} fi - if [[ "${JOB_NAME}" == "shell" ]]; then export ADD_JARS=${JAR_PATH} sudo -E ${SPARK_HOME}/bin/spark-shell || notify_error_and_exit "Execution failed for shell" +elif [[ "${JOB_NAME}" == "zeppelin" ]]; then + install_and_run_zeppelin else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & From 333127927fe9581228a12f57f2c8d1a29c474908 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 2 Feb 2016 09:36:49 -0200 Subject: [PATCH 075/268] pr review --- remote_hook.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 5d4bbad1..7d8ed36e 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -52,8 +52,8 @@ on_trap_exit() { install_and_run_zeppelin() { if [[ ! 
-d "zeppelin" ]]; then wget "http://www.us.apache.org/dist/incubator/zeppelin/0.5.6-incubating/zeppelin-0.5.6-incubating-bin-all.tgz" -O zeppelin.tar.gz - tar xvzf zeppelin.tar.gz > /tmp/zeppelin_install.log - mv `ls -d zeppelin-*` zeppelin + mkdir zepplin + tar xvzf zeppelin.tar.gz -C zeppelin --strip-components 1 > /tmp/zeppelin_install.log fi if [[ -f "zeppelin/bin/zeppelin.sh" ]]; then export MASTER="${JOB_MASTER}" @@ -62,7 +62,7 @@ install_and_run_zeppelin() { export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH}" sudo -E zeppelin/bin/zeppelin.sh else - notify_error_and_exit "Not found zeppelin installation" + notify_error_and_exit "Zepellin installation not found" fi } From 33aa47e2cde896bfd32feaa2e9726c9cd3475871 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 3 Feb 2016 15:09:53 -0200 Subject: [PATCH 076/268] rdd.filterNot --- src/main/scala/ignition/core/jobs/utils/RDDUtils.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index 57069bae..60bddc9a 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -57,6 +57,8 @@ object RDDUtils { def incrementCounterIf(cond: (V) => Boolean, acc: spark.Accumulator[Int]): RDD[V] = { rdd.map(x => { if (cond(x)) acc += 1; x }) } + + def filterNot(p: V => Boolean): RDD[V] = rdd.filter(!p(_)) } implicit class PairRDDImprovements[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) { From 93964db2d79c6b84f172712b9ce62eaa9fa44687 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 3 Feb 2016 15:45:24 -0200 Subject: [PATCH 077/268] open a browser for zepplin web ui --- tools/cluster.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/cluster.py b/tools/cluster.py index 2fe6b245..4a81eaa9 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -436,6 +436,9 @@ def job_run(cluster_name, job_name, job_mem, 
src_local=remote_hook_local, remote_path=with_leading_slash(remote_path)) + if job_name == "zeppelin": + subprocess.Popen(["xdg-open", "http://{master}:8081".format(master=master)]) + log.info('Will run job in remote host') if disable_tmux: ssh_call(user=remote_user, host=master, key_file=key_file, args=[non_tmux_arg], allocate_terminal=False) From 5137e43546660658dfe17beb0ff54c80877f16b1 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 3 Feb 2016 18:10:59 -0200 Subject: [PATCH 078/268] using webbrowser lib --- tools/cluster.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 4a81eaa9..daf03d91 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -23,6 +23,7 @@ import getpass import json import glob +import webbrowser log = logging.getLogger() @@ -437,7 +438,7 @@ def job_run(cluster_name, job_name, job_mem, remote_path=with_leading_slash(remote_path)) if job_name == "zeppelin": - subprocess.Popen(["xdg-open", "http://{master}:8081".format(master=master)]) + webbrowser.open("http://{master}:8081".format(master=master)) log.info('Will run job in remote host') if disable_tmux: From b0c323c3f283f4514a644b222a8c2a07dbb6c52c Mon Sep 17 00:00:00 2001 From: Leandro Date: Mon, 22 Feb 2016 10:54:23 -0300 Subject: [PATCH 079/268] Do not delete the security group by default --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 2fe6b245..d9e37533 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -322,7 +322,7 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {} after failures'.format(cluster_name)) -def destroy(cluster_name, delete_groups=True, region=default_region): +def destroy(cluster_name, delete_groups=False, region=default_region): delete_sg_param = ['--delete-groups'] if delete_groups else [] ec2_script_path = chdir_to_ec2_script_and_get_path() From 
0d5b6615c1039d52e529d27d5a28f7838b35359c Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 23 Feb 2016 19:24:09 -0300 Subject: [PATCH 080/268] Added most frequent function to sequences --- src/main/scala/ignition/core/utils/CollectionUtils.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index f98fb7ec..01960d3d 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -15,6 +15,10 @@ object CollectionUtils { else alternative } + + def mostFrequentOption: Option[A] = { + xs.groupBy(identity).maxByOption(_._2.size).map(_._1) + } } implicit class TraversableOnceImprovements[A](xs: TraversableOnce[A]) { @@ -45,6 +49,7 @@ object CollectionUtils { else Option(xs.minBy(f)) } + } From ce911f6153d238f1db4c74c056c590ad730d636d Mon Sep 17 00:00:00 2001 From: Leandro Date: Wed, 24 Feb 2016 17:10:44 -0300 Subject: [PATCH 081/268] Fixing typo and adding driver heap param --- remote_hook.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 7d8ed36e..dd76933a 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -52,14 +52,14 @@ on_trap_exit() { install_and_run_zeppelin() { if [[ ! 
-d "zeppelin" ]]; then wget "http://www.us.apache.org/dist/incubator/zeppelin/0.5.6-incubating/zeppelin-0.5.6-incubating-bin-all.tgz" -O zeppelin.tar.gz - mkdir zepplin + mkdir zeppelin tar xvzf zeppelin.tar.gz -C zeppelin --strip-components 1 > /tmp/zeppelin_install.log fi if [[ -f "zeppelin/bin/zeppelin.sh" ]]; then export MASTER="${JOB_MASTER}" export ZEPPELIN_PORT="8081" export SPARK_HOME="/root/spark" - export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH}" + export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH} --runner-executor-memory ${SPARK_MEM_PARAM}" sudo -E zeppelin/bin/zeppelin.sh else notify_error_and_exit "Zepellin installation not found" From 99df346f866c674e2595772b94dcdc75fd64ff42 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 3 Mar 2016 18:52:07 -0300 Subject: [PATCH 082/268] Added new method --- src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala index 548b2423..c800c0f2 100644 --- a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala @@ -45,5 +45,10 @@ class CollectionUtilsSpec extends FlatSpec with ShouldMatchers { Seq(1, 3, 2).minOption shouldBe Some(1) } + it should "provide mostFrequentOption" in { + Seq.empty[String].mostFrequentOption shouldBe None + Seq("a", "b", "b", "c", "a", "b").mostFrequentOption shouldBe Option("b") + } + } From 736e82af6eccd843e173717e747d72f597c66756 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Mar 2016 14:35:21 -0300 Subject: [PATCH 083/268] Added flatten to rdd of sets --- src/main/scala/ignition/core/jobs/utils/RDDUtils.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index 60bddc9a..e70d8476 100644 --- 
a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -29,6 +29,12 @@ object RDDUtils { } } + implicit class SetRDDImprovements[V: ClassTag](rdd: RDD[Set[V]]) { + def flatten: RDD[V] = { + rdd.flatMap(x => x) + } + } + implicit class ValidatedRDDImprovements[A: ClassTag, B: ClassTag](rdd: RDD[Validation[A, B]]) { def mapSuccess(f: B => Validation[A, B]): RDD[Validation[A, B]] = { From 950d577600f997ea64875e61596241d847c8c36d Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 18 Mar 2016 11:38:05 -0300 Subject: [PATCH 084/268] Remove plugin specific configurations --- build.sbt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/build.sbt b/build.sbt index 5de79888..5ae4552b 100644 --- a/build.sbt +++ b/build.sbt @@ -6,10 +6,6 @@ scalaVersion := "2.10.4" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") -ideaExcludeFolders += ".idea" - -ideaExcludeFolders += ".idea_modules" - // Because we can't run two spark contexts on same VM parallelExecution in Test := false From 603dae78b8b4c633bfce46655f19e34249d32810 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 14 Jun 2016 19:52:08 -0300 Subject: [PATCH 085/268] Added percentile do IntBag --- .../scala/ignition/core/utils/IntBag.scala | 24 ++++++++++++++----- .../ignition/core/utils/IntBagSpec.scala | 16 ++++++++++--- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/main/scala/ignition/core/utils/IntBag.scala b/src/main/scala/ignition/core/utils/IntBag.scala index a322f6f7..38cb3836 100644 --- a/src/main/scala/ignition/core/utils/IntBag.scala +++ b/src/main/scala/ignition/core/utils/IntBag.scala @@ -1,5 +1,7 @@ package ignition.core.utils +import ignition.core.utils.CollectionUtils._ + object IntBag { def from(numbers: TraversableOnce[Long]): IntBag = { val histogram = scala.collection.mutable.HashMap.empty[Long, Long] @@ -19,15 +21,17 @@ case class IntBag(histogram: collection.Map[Long, Long]) { def median: Option[Long] = { - if (histogram.nonEmpty) { + percentile(50) + } + + def percentile(n: Double): Option[Long] = { + require(n > 0 && n <= 100) + histogram.keys.maxOption.flatMap { max => val total = histogram.values.sum - val half = total / 2 - val max = histogram.keys.max + val position = total * (n / 100) val accumulatedFrequency = (0L to max).scanLeft(0L) { case (sumFreq, k) => sumFreq + histogram.getOrElse(k, 0L) }.zipWithIndex - accumulatedFrequency.collectFirst { case (sum, k) if sum >= half => k } - } else { - None + accumulatedFrequency.collectFirst { case (sum, k) if sum >= position => k - 1 } } } @@ -39,4 +43,12 @@ case class IntBag(histogram: collection.Map[Long, Long]) { } else None } + + def min: Option[Long] = { + histogram.keys.minOption + } + + def max: Option[Long] = { + histogram.keys.maxOption + } } diff --git a/src/test/scala/ignition/core/utils/IntBagSpec.scala b/src/test/scala/ignition/core/utils/IntBagSpec.scala index b6694b12..76d37a35 100644 --- a/src/test/scala/ignition/core/utils/IntBagSpec.scala +++ b/src/test/scala/ignition/core/utils/IntBagSpec.scala @@ -10,14 +10,24 @@ class IntBagSpec extends 
FlatSpec with ShouldMatchers { IntBag.from(Seq(1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4)).histogram shouldBe Map(1 -> 2, 2 -> 3, 3 -> 1, 4 -> 5) } - it should "calculate the median and average" in { + it should "calculate the average" in { val size = 1000 - val numbers = (0 until 1000).map(_ => Random.nextInt(400).toLong).toList + val numbers = (0 until size).map(_ => Random.nextInt(400).toLong).toList val bag = IntBag.from(numbers) bag.avg.get shouldBe numbers.sum / size + } + + it should "calculate the percentile, min and max" in { + val size = 3 // anything different is hard to guess because of the approximation + val numbers = (0 until size).map(_ => Random.nextInt(400).toLong).toList + val bag = IntBag.from(numbers) - // TODO: the median is only approximate and it could be better, improve it + bag.min.get shouldBe numbers.min + bag.percentile(0.1).get shouldBe numbers.min + bag.median.get shouldBe numbers.sorted.apply(1) + bag.percentile(99.9).get shouldBe numbers.max + bag.max.get shouldBe numbers.max } } From 7f709e4b1ce52981966a64cac336fb530a5e4db6 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 22 Jul 2016 10:06:25 -0300 Subject: [PATCH 086/268] log error on compressed big file read --- .../core/jobs/utils/SparkContextUtils.scala | 47 ++++++++++--------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 4eab7baf..552da25d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -293,28 +293,33 @@ object SparkContextUtils { val hadoopPath = new Path(file.path) val fileSystem = hadoopPath.getFileSystem(conf) slices.flatMap { case (slice, _) => - val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { - case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) - case None => 
fileSystem.open(hadoopPath) - } - val lines = Source.fromInputStream(inputStream)(Codec.UTF8).getLines() - - val lineSample = lines.take(sampleCount).toList - val linesPerSlice = { - val sampleSize = lineSample.map(_.size).sum - val estimatedAverageLineSize = Math.round(sampleSize / sampleCount.toFloat) - val estimatedTotalLines = Math.round(estimatedSize / estimatedAverageLineSize.toFloat) - estimatedTotalLines / totalSlices + 1 + try { + val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { + case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) + case None => fileSystem.open(hadoopPath) + } + val lines = Source.fromInputStream(inputStream)(Codec.UTF8).getLines() + + val lineSample = lines.take(sampleCount).toList + val linesPerSlice = { + val sampleSize = lineSample.map(_.size).sum + val estimatedAverageLineSize = Math.round(sampleSize / sampleCount.toFloat) + val estimatedTotalLines = Math.round(estimatedSize / estimatedAverageLineSize.toFloat) + estimatedTotalLines / totalSlices + 1 + } + + val linesAfterSeek = (lineSample.toIterator ++ lines).drop(linesPerSlice * slice.index) + + val finalLines = if (slice.index + 1 == totalSlices) // last slice, read until the end + linesAfterSeek + else + linesAfterSeek.take(linesPerSlice) + + AutoCloseableIterator.wrap(finalLines, () => close(inputStream, s"${file.path}, slice $slice")) + } catch { + case NonFatal(e) => + throw new Exception(s"Error on read compressed big file, slice=$slice, file=$file", e) } - - val linesAfterSeek = (lineSample.toIterator ++ lines).drop(linesPerSlice * slice.index) - - val finalLines = if (slice.index + 1 == totalSlices) // last slice, read until the end - linesAfterSeek - else - linesAfterSeek.take(linesPerSlice) - - AutoCloseableIterator.wrap(finalLines, () => close(inputStream, s"${file.path}, slice $slice")) } } } From 6e6ceac12b311ede633ed54eabb82f6889820828 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Mon, 25 Jul 2016 18:46:01 -0300 Subject: [PATCH 087/268] Improve spark shell job run --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index dd76933a..309f85f5 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -93,7 +93,7 @@ fi if [[ "${JOB_NAME}" == "shell" ]]; then export ADD_JARS=${JAR_PATH} - sudo -E ${SPARK_HOME}/bin/spark-shell || notify_error_and_exit "Execution failed for shell" + sudo -E ${SPARK_HOME}/bin/spark-shell --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" elif [[ "${JOB_NAME}" == "zeppelin" ]]; then install_and_run_zeppelin else From c73bbc13c2082b7b160518c62dfce60bf1e2bf45 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 16 Aug 2016 15:03:15 -0300 Subject: [PATCH 088/268] Smaller partitions are safer --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 552da25d..645b218e 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -361,7 +361,7 @@ object SparkContextUtils { } def parallelReadTextFiles(files: List[HadoopFile], - maxBytesPerPartition: Long = 256 * 1000 * 1000, + maxBytesPerPartition: Long = 128 * 1000 * 1000, minPartitions: Int = 100, sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling(), synchLocally: Option[String] = None, @@ -639,7 +639,7 @@ object SparkContextUtils { ignoreMalformedDates: Boolean = false, endsWith: Option[String] = None, predicate: HadoopFile => Boolean = _ => true, - maxBytesPerPartition: Long = 256 * 
1000 * 1000, + maxBytesPerPartition: Long = 128 * 1000 * 1000, minPartitions: Int = 100, sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling(), minimumFiles: Int = 1, @@ -660,7 +660,7 @@ object SparkContextUtils { private def doSync(hadoopFiles: List[HadoopFile], synchLocally: String, forceSynch: Boolean, - maxBytesPerPartition: Long = 256 * 1000 * 1000, + maxBytesPerPartition: Long = 128 * 1000 * 1000, minPartitions: Int = 100, sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { require(!synchLocally.contains("*"), "Globs are not supported on the sync key") From 800044312ae657a633488f0d8c1c9f55475b4ab9 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 18 Aug 2016 19:11:34 -0300 Subject: [PATCH 089/268] Use mutable List on groupByKeyAndTake --- .../ignition/core/jobs/utils/RDDUtils.scala | 19 ++++++++++++------- .../core/jobs/utils/RDDUtilsSpec.scala | 17 +++++++++-------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index e70d8476..fd1d74ce 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -3,13 +3,14 @@ package ignition.core.jobs.utils import org.slf4j.LoggerFactory import scala.reflect._ -import org.apache.spark.rdd.{PairRDDFunctions, CoGroupedRDD, RDD} +import org.apache.spark.rdd.{CoGroupedRDD, PairRDDFunctions, RDD} import org.apache.spark.SparkContext._ import org.apache.spark.Partitioner import org.apache.spark import org.joda.time.DateTime import org.joda.time.format.DateTimeFormat +import scala.collection.mutable import scalaz.{Success, Validation} object RDDUtils { @@ -93,13 +94,14 @@ object RDDUtils { } def groupByKeyAndTake(n: Int): RDD[(K, List[V])] = - rdd.aggregateByKey(List.empty[V])( + rdd.aggregateByKey(mutable.ListBuffer.empty[V])( (lst, v) => if (lst.size >= 
n) { logger.warn(s"Ignoring value '$v' due aggregation result of size '${lst.size}' is bigger than n=$n") lst } else { - v :: lst + lst += v + lst }, (lstA, lstB) => if (lstA.size >= n) @@ -109,11 +111,14 @@ object RDDUtils { else { if (lstA.size + lstB.size > n) { logger.warn(s"Merging partition1=${lstA.size} with partition2=${lstB.size} and taking the first n=$n, sample1='${lstA.take(5)}', sample2='${lstB.take(5)}'") - (lstA ++ lstB).take(n) - } else - lstA ++ lstB + lstA ++= lstB + lstA.take(n) + } else { + lstA ++= lstB + lstA + } } - ) + ).mapValues(_.toList) // Note: completely unoptimized. We could use instead for better performance: // 1) sortByKey diff --git a/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala b/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala index a00e5de8..705ba398 100644 --- a/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala +++ b/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala @@ -9,17 +9,18 @@ import scala.util.Random class RDDUtilsSpec extends FlatSpec with ShouldMatchers with SharedSparkContext { "RDDUtils" should "provide groupByKeyAndTake" in { - val take = 5 - val rdd = sc.parallelize((1 to Random.nextInt(40) + 10).map(x => "a" -> Random.nextInt()) ++ (1 to Random.nextInt(40) + 10).map(x => "b" -> Random.nextInt())) - val result = rdd.groupByKeyAndTake(take).collect().toMap - result("a").length shouldBe take - result("b").length shouldBe take + (10 to 60 by 10).foreach { take => + val rdd = sc.parallelize((1 to 400).map(x => "a" -> Random.nextInt()) ++ (1 to 400).map(x => "b" -> Random.nextInt()), 60) + val result = rdd.groupByKeyAndTake(take).collect().toMap + result("a").length shouldBe take + result("b").length shouldBe take + } } it should "provide groupByKeyAndTakeOrdered" in { - val take = 5 - val aList = (1 to Random.nextInt(40) + 10).map(x => "a" -> Random.nextInt()).toList - val bList = (1 to Random.nextInt(40) + 10).map(x => "b" -> Random.nextInt()).toList + val take = 50 + val 
aList = (1 to Random.nextInt(400) + 100).map(x => "a" -> Random.nextInt()).toList + val bList = (1 to Random.nextInt(400) + 100).map(x => "b" -> Random.nextInt()).toList val rdd = sc.parallelize(aList ++ bList) val result = rdd.groupByKeyAndTakeOrdered(take).collect().toMap result("a") shouldBe aList.map(_._2).sorted.take(take) From 64c8cd98cf59efcc6a0b5bcae69f8e5f65d18f85 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Fri, 19 Aug 2016 18:58:43 -0300 Subject: [PATCH 090/268] Preparing por spark 2.0 --- tools/cluster.py | 4 +-- tools/spark-ec2/spark_ec2.py | 53 +++++++++++++++++++++++++----------- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index e9ad90f3..642d2d98 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,7 +49,7 @@ default_ami = None # will be decided based on spark-ec2 list default_master_ami = None default_env = 'dev' -default_spark_version = '1.5.1' +default_spark_version = '2.0.0' custom_builds = { # '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' } @@ -61,7 +61,7 @@ default_defaults_filename = 'cluster_defaults.json' default_spark_ec2_git_repo = 'https://github.com/chaordic/spark-ec2' -default_spark_ec2_git_branch = 'branch-1.4-merge' +default_spark_ec2_git_branch = 'branch-2.0' master_post_create_commands = [ diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index e9442448..50d67b9b 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -51,7 +51,7 @@ raw_input = input xrange = range -SPARK_EC2_VERSION = "1.5.1" +SPARK_EC2_VERSION = "2.0.0" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -74,6 +74,9 @@ "1.4.1", "1.5.0", "1.5.1", + "1.5.2", + "1.6.0", + "2.0.0", ]) SPARK_TACHYON_MAP = { @@ -90,6 +93,9 @@ "1.4.1": "0.6.4", "1.5.0": "0.7.1", "1.5.1": "0.7.1", + "1.5.2": "0.7.1", + "1.6.0": "0.8.2", + "2.0.0": "", } DEFAULT_SPARK_VERSION 
= SPARK_EC2_VERSION @@ -97,7 +103,7 @@ # Default location to get the spark-ec2 scripts (and ami-list) from DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/amplab/spark-ec2" -DEFAULT_SPARK_EC2_BRANCH = "branch-1.4" +DEFAULT_SPARK_EC2_BRANCH = "branch-2.0" def setup_external_libs(libs): @@ -183,6 +189,10 @@ def parse_args(): parser.add_option( "-i", "--identity-file", help="SSH private key file to use for logging into instances") + parser.add_option( + "-p", "--profile", default=None, + help="If you have multiple profiles (AWS or boto config), you can configure " + + "additional, named profiles by using this option (default: %default)") parser.add_option( "-t", "--instance-type", default="m1.large", help="Type of instance to launch (default: %default). " + @@ -329,7 +339,7 @@ def parse_args(): help="Use private IPs for instances rather than public if VPC/subnet " + "requires that.") parser.add_option( - "--instance-initiated-shutdown-behavior", default="terminate", + "--instance-initiated-shutdown-behavior", default="stop", choices=["stop", "terminate"], help="Whether instances should terminate when shut down or just stop") parser.add_option( @@ -415,11 +425,11 @@ def get_validate_spark_version(version, repo): EC2_INSTANCE_TYPES = { "c1.medium": "pvm", "c1.xlarge": "pvm", - "c3.large": "pvm", - "c3.xlarge": "pvm", - "c3.2xlarge": "pvm", - "c3.4xlarge": "pvm", - "c3.8xlarge": "pvm", + "c3.large": "hvm", + "c3.xlarge": "hvm", + "c3.2xlarge": "hvm", + "c3.4xlarge": "hvm", + "c3.8xlarge": "hvm", "c4.large": "hvm", "c4.xlarge": "hvm", "c4.2xlarge": "hvm", @@ -497,6 +507,7 @@ def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branc print("Spark AMI: " + ami) return ami + # Launch a cluster of the given name, by setting up its security groups, # and then starting new instances in them. 
# Returns a tuple of EC2 reservation objects for the master and slaves @@ -632,11 +643,14 @@ def launch_cluster(conn, opts, cluster_name): device.delete_on_termination = True block_map["/dev/sd" + chr(ord('s') + i)] = device - for i in range(get_num_disks(opts.instance_type)): - dev = BlockDeviceType() - dev.ephemeral_name = 'ephemeral%d' % i - name = '/dev/xvd' + string.letters[i + 1] - block_map[name] = dev + # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342). + if opts.instance_type.startswith('m3.'): + for i in range(get_num_disks(opts.instance_type)): + dev = BlockDeviceType() + dev.ephemeral_name = 'ephemeral%d' % i + # The first ephemeral drive is /dev/sdb. + name = '/dev/sd' + string.ascii_letters[i + 1] + block_map[name] = dev # Launch slaves if opts.spot_price is not None: @@ -822,7 +836,7 @@ def launch_cluster(conn, opts, cluster_name): # This wait time corresponds to SPARK-4983 print("Waiting for AWS to propagate instance metadata...") - time.sleep(5) + time.sleep(15) # Give the instances descriptive names and set additional tags additional_tags = {} @@ -903,7 +917,7 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar) modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', - 'mapreduce', 'spark-standalone', 'tachyon'] + 'mapreduce', 'spark-standalone', 'tachyon', 'rstudio'] if opts.hadoop_major_version == "1": modules = list(filter(lambda x: x != "mapreduce", modules)) @@ -1352,6 +1366,10 @@ def get_ip_address(instance, private_ips=False): def get_dns_name(instance, private_ips=False): dns = instance.public_dns_name if not private_ips else \ instance.private_ip_address + if not dns: + raise UsageError("Failed to determine hostname of {0}.\n" + "Please check that you provided --private-ips if " + "necessary".format(instance)) return dns @@ -1416,7 +1434,10 @@ def real_main(): sys.exit(1) try: - conn = ec2.connect_to_region(opts.region) + if 
opts.profile is None: + conn = ec2.connect_to_region(opts.region) + else: + conn = ec2.connect_to_region(opts.region, profile_name=opts.profile) except Exception as e: print((e), file=stderr) sys.exit(1) From ee10100061dd87a382ab71295b04d78f6d8e1d6c Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 23 Aug 2016 17:10:26 -0300 Subject: [PATCH 091/268] Preparing por spark 2.0 --- tools/cluster.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 642d2d98..c4a2f681 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -227,7 +227,8 @@ def launch(cluster_name, slaves, spark_version=default_spark_version, spark_ec2_git_repo=default_spark_ec2_git_repo, spark_ec2_git_branch=default_spark_ec2_git_branch, - ami=default_ami, master_ami=default_master_ami): + ami=default_ami, master_ami=default_master_ami, + instance_profile_name=None): all_args = locals() @@ -264,6 +265,8 @@ def launch(cluster_name, slaves, ami_params = ['--ami', ami] if ami else [] master_ami_params = ['--master-ami', master_ami] if master_ami else [] + iam_params = ['--instance-profile-name', instance_profile_name] if instance_profile_name else [] + spark_version = custom_builds.get(spark_version, spark_version) for i in range(retries_on_same_cluster): @@ -292,7 +295,8 @@ def launch(cluster_name, slaves, resume_param + auth_params + ami_params + - master_ami_params, + master_ami_params + + iam_params, timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) success = True From 4dbbac0370019b69ce9831ed5297553c7bf673d0 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 25 Aug 2016 17:06:26 -0300 Subject: [PATCH 092/268] Merging --- build.sbt | 14 +++++++------- .../scala/ignition/core/jobs/utils/RDDUtils.scala | 15 --------------- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/build.sbt b/build.sbt index 5ae4552b..adb6ae01 100644 --- a/build.sbt +++ b/build.sbt @@ -2,32 +2,32 @@ name := "Ignition-Core" version := "1.0" -scalaVersion := "2.10.4" +scalaVersion := "2.11.8" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.0" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") -libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" +libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.0" -libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.6" +libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.9" libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0" libraryDependencies += "net.java.dev.jets3t" % "jets3t" % "0.7.1" -libraryDependencies += "joda-time" % "joda-time" % "2.7" +libraryDependencies += "joda-time" % "joda-time" % "2.9.4" libraryDependencies += "org.joda" % "joda-convert" % "1.7" -libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.9.6" +libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.7.4" libraryDependencies += "commons-lang" % "commons-lang" % "2.6" diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala 
b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index fd1d74ce..b0b3bc86 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -57,25 +57,10 @@ object RDDUtils { } implicit class RDDImprovements[V: ClassTag](rdd: RDD[V]) { - def incrementCounter(acc: spark.Accumulator[Int]): RDD[V] = { - rdd.map(x => { acc += 1; x }) - } - - def incrementCounterIf(cond: (V) => Boolean, acc: spark.Accumulator[Int]): RDD[V] = { - rdd.map(x => { if (cond(x)) acc += 1; x }) - } - def filterNot(p: V => Boolean): RDD[V] = rdd.filter(!p(_)) } implicit class PairRDDImprovements[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) { - def incrementCounter(acc: spark.Accumulator[Int]): RDD[(K, V)] = { - rdd.mapValues(x => { acc += 1; x }) - } - - def incrementCounterIf(cond: (K, V) => Boolean, acc: spark.Accumulator[Int]): RDD[(K, V)] = { - rdd.mapPreservingPartitions(x => { if(cond(x._1, x._2)) acc += 1; x._2 }) - } def flatMapPreservingPartitions[U: ClassTag](f: ((K, V)) => Seq[U]): RDD[(K, U)] = { rdd.mapPartitions[(K, U)](kvs => { From 11db14ab4c61b3acde269f9d9959682251335c4c Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 25 Aug 2016 18:42:32 -0300 Subject: [PATCH 093/268] Try to fix disks creation --- tools/spark-ec2/spark_ec2.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 50d67b9b..a89dab8f 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -643,14 +643,19 @@ def launch_cluster(conn, opts, cluster_name): device.delete_on_termination = True block_map["/dev/sd" + chr(ord('s') + i)] = device + for i in range(get_num_disks(opts.instance_type)): + dev = BlockDeviceType() + dev.ephemeral_name = 'ephemeral%d' % i + name = '/dev/xvd' + string.letters[i + 1] + block_map[name] = dev # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342). - if opts.instance_type.startswith('m3.'): - for i in range(get_num_disks(opts.instance_type)): - dev = BlockDeviceType() - dev.ephemeral_name = 'ephemeral%d' % i - # The first ephemeral drive is /dev/sdb. - name = '/dev/sd' + string.ascii_letters[i + 1] - block_map[name] = dev + #if opts.instance_type.startswith('m3.'): + # for i in range(get_num_disks(opts.instance_type)): + # dev = BlockDeviceType() + # dev.ephemeral_name = 'ephemeral%d' % i + # # The first ephemeral drive is /dev/sdb. + # name = '/dev/sd' + string.ascii_letters[i + 1] + # block_map[name] = dev # Launch slaves if opts.spot_price is not None: From 8109f79e98105decab9b34283c7d9d891a8d85cf Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 25 Aug 2016 18:44:11 -0300 Subject: [PATCH 094/268] Making stuff ready for spark 2.0 --- build.sbt | 10 ++++++---- project/plugins.sbt | 2 +- remote_hook.sh | 3 +-- src/main/scala/ignition/core/utils/S3Client.scala | 4 ++-- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/build.sbt b/build.sbt index adb6ae01..a3f9fdf6 100644 --- a/build.sbt +++ b/build.sbt @@ -13,22 +13,24 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.0" % "provided .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") + libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") + .exclude("org.apache.htrace", "htrace-core") + .exclude("commons-beanutils", "commons-beanutils") + .exclude("org.slf4j", "slf4j-log4j12") + libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.0" libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.9" libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0" -libraryDependencies += "net.java.dev.jets3t" % "jets3t" % "0.7.1" - libraryDependencies += "joda-time" % "joda-time" % "2.9.4" libraryDependencies += "org.joda" % "joda-convert" % "1.7" -libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.7.4" - libraryDependencies += "commons-lang" % "commons-lang" % "2.6" resolvers += "Akka Repository" at "http://repo.akka.io/releases/" diff --git a/project/plugins.sbt b/project/plugins.sbt index d5f371ab..f6f3b939 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,5 +1,5 @@ addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3") addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") diff --git a/remote_hook.sh b/remote_hook.sh index 309f85f5..10902c46 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -92,8 +92,7 
@@ if [[ "${USE_YARN}" == "yes" ]]; then fi if [[ "${JOB_NAME}" == "shell" ]]; then - export ADD_JARS=${JAR_PATH} - sudo -E ${SPARK_HOME}/bin/spark-shell --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" + sudo -E ${SPARK_HOME}/bin/spark-shell --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" elif [[ "${JOB_NAME}" == "zeppelin" ]]; then install_and_run_zeppelin else diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala index b806b376..020ab6f4 100644 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ b/src/main/scala/ignition/core/utils/S3Client.scala @@ -3,7 +3,7 @@ package ignition.core.utils import java.util.Properties import org.jets3t.service.impl.rest.httpclient.RestS3Service -import org.jets3t.service.model.S3Object +import org.jets3t.service.model.{S3Object, StorageObject} import org.jets3t.service.security.AWSCredentials import org.jets3t.service.{Constants, Jets3tProperties} @@ -36,7 +36,7 @@ class S3Client { service.getObject(bucket, key, null, null, null, null, null, null) } - def list(bucket: String, key: String): Array[S3Object] = { + def list(bucket: String, key: String): Array[StorageObject] = { service.listObjectsChunked(bucket, key, null, 99999L, null, true).getObjects } From 4f910ea68bbbcb2543c9bc21c7b5b3bd8cad3c46 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 25 Aug 2016 18:44:11 -0300 Subject: [PATCH 095/268] Making stuff ready for spark 2.0 --- build.sbt | 16 +++++++++------- project/plugins.sbt | 2 +- remote_hook.sh | 3 +-- .../scala/ignition/core/utils/S3Client.scala | 4 ++-- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/build.sbt b/build.sbt index 5ae4552b..f452fcef 100644 --- a/build.sbt +++ b/build.sbt @@ -13,22 +13,24 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" +libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") + +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") + .exclude("org.apache.htrace", "htrace-core") + .exclude("commons-beanutils", "commons-beanutils") + .exclude("org.slf4j", "slf4j-log4j12") + +libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.0" libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.6" libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0" -libraryDependencies += "net.java.dev.jets3t" % "jets3t" % "0.7.1" - -libraryDependencies += "joda-time" % "joda-time" % "2.7" +libraryDependencies += "joda-time" % "joda-time" % "2.9.4" libraryDependencies += "org.joda" % "joda-convert" % "1.7" -libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.9.6" - libraryDependencies += "commons-lang" % "commons-lang" % "2.6" resolvers += "Akka Repository" at "http://repo.akka.io/releases/" diff --git a/project/plugins.sbt b/project/plugins.sbt index d5f371ab..f6f3b939 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,5 +1,5 @@ addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2") +addSbtPlugin("com.eed3si9n" 
% "sbt-assembly" % "0.14.3") addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") diff --git a/remote_hook.sh b/remote_hook.sh index 309f85f5..10902c46 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -92,8 +92,7 @@ if [[ "${USE_YARN}" == "yes" ]]; then fi if [[ "${JOB_NAME}" == "shell" ]]; then - export ADD_JARS=${JAR_PATH} - sudo -E ${SPARK_HOME}/bin/spark-shell --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" + sudo -E ${SPARK_HOME}/bin/spark-shell --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" elif [[ "${JOB_NAME}" == "zeppelin" ]]; then install_and_run_zeppelin else diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala index b806b376..020ab6f4 100644 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ b/src/main/scala/ignition/core/utils/S3Client.scala @@ -3,7 +3,7 @@ package ignition.core.utils import java.util.Properties import org.jets3t.service.impl.rest.httpclient.RestS3Service -import org.jets3t.service.model.S3Object +import org.jets3t.service.model.{S3Object, StorageObject} import org.jets3t.service.security.AWSCredentials import org.jets3t.service.{Constants, Jets3tProperties} @@ -36,7 +36,7 @@ class S3Client { service.getObject(bucket, key, null, null, null, null, null, null) } - def list(bucket: String, key: String): Array[S3Object] = { + def list(bucket: String, key: String): Array[StorageObject] = { service.listObjectsChunked(bucket, key, null, 99999L, null, true).getObjects } From 61829644b42fa2feaa02d98e677ce4093b9416c5 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Fri, 26 Aug 2016 15:44:43 -0300 Subject: [PATCH 096/268] Permission fix --- remote_hook.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/remote_hook.sh b/remote_hook.sh index 10902c46..c0cd8da0 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -20,6 +20,7 @@ MY_USER=$(whoami) # Avoids problems when another user created our control dir sudo mkdir -p "${JOB_CONTROL_DIR}" sudo chown $MY_USER "${JOB_CONTROL_DIR}" +sudo chown -R o+rx /root RUNNING_FILE="${JOB_CONTROL_DIR}/RUNNING" From 3a44a4e248f0cc5410b1002ad94ff451fe7ec9c1 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Fri, 26 Aug 2016 15:51:16 -0300 Subject: [PATCH 097/268] Permission fix --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index c0cd8da0..688bfbc1 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -20,7 +20,7 @@ MY_USER=$(whoami) # Avoids problems when another user created our control dir sudo mkdir -p "${JOB_CONTROL_DIR}" sudo chown $MY_USER "${JOB_CONTROL_DIR}" -sudo chown -R o+rx /root +sudo chmod -R o+rx /root RUNNING_FILE="${JOB_CONTROL_DIR}/RUNNING" From 0ce1847ad4b7a76edd50240b0af529de4a786245 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Fri, 26 Aug 2016 16:11:50 -0300 Subject: [PATCH 098/268] Make possible to process files without dates --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 645b218e..73f7f332 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -567,12 +567,12 @@ object SparkContextUtils { def dateValidation(file: HadoopFile): Option[HadoopFile] = { val tryDate = extractDateFromFile(file) if (tryDate.isEmpty && ignoreMalformedDates) - None + Option(file) else { val date = tryDate.get val goodStartDate = startDate.isEmpty || (inclusiveStartDate && date.saneEqual(startDate.get) || date.isAfter(startDate.get)) val goodEndDate = endDate.isEmpty || (inclusiveEndDate && date.saneEqual(endDate.get) || date.isBefore(endDate.get)) - if (goodStartDate && goodEndDate) Some(file) else None + if (goodStartDate && goodEndDate) Option(file) else None } } @@ -590,7 +590,7 @@ object SparkContextUtils { startDate = startDate, endDate = endDate, exclusionPattern = exclusionPattern).filter(preValidations) val filesByDate = preFilteredFiles.groupBy(extractDateFromFile).collect { - case (Some(date), files) => date -> files + case (date, files) => date.getOrElse(new DateTime(1970, 1, 1, 1, 1)) -> files } val posFilteredFiles = From 34f6bc2f53031f9f27090af1a61cb134baf1ccfc Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Fri, 26 Aug 2016 16:32:59 -0300 Subject: [PATCH 099/268] Supporting s3a on our utils --- .../core/jobs/utils/SparkContextUtils.scala | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 73f7f332..81e1d355 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -148,7 +148,7 @@ object SparkContextUtils { val filesToOutput = 1500 def mapPaths(actionWhenNeedsSynching: (String, String) => Unit): Seq[String] = { paths.map(p => { - val hdfsPath = p.replace("s3n://", hdfsPathPrefix) + val hdfsPath = p.replaceFirst("s3(a|n)://", hdfsPathPrefix) if (forceSynch || getStatus(hdfsPath, false).isEmpty || getStatus(s"$hdfsPath/*", true).filterNot(_.isDirectory).size != filesToOutput) { val _hdfsPath = new Path(hdfsPath) actionWhenNeedsSynching(p, hdfsPath) @@ -486,7 +486,7 @@ object SparkContextUtils { } def classifyPath(path: String): Either[String, (String, DateTime)] = - Try(pathDateExtractor.extractFromPath(s"s3n://$bucket/$path")) match { + Try(pathDateExtractor.extractFromPath(s"s3a://$bucket/$path")) match { case Success(date) => Right(path -> date) case Failure(_) => Left(path) } @@ -494,11 +494,11 @@ object SparkContextUtils { val commonPrefixes = s3ListCommonPrefixes(bucket, prefix, delimiter).map(classifyPath) if (commonPrefixes.isEmpty) - Stream(s"s3n://$bucket/$prefix") + Stream(s"s3a://$bucket/$prefix") else commonPrefixes.toStream.flatMap { case Left(prefixWithoutDate) => s3NarrowPaths(bucket, prefixWithoutDate, delimiter, inclusiveStartDate, startDate, inclusiveEndDate, endDate, ignoreHours) - case Right((prefixWithDate, date)) if isGoodDate(date) => Stream(s"s3n://$bucket/$prefixWithDate") + case Right((prefixWithDate, date)) if isGoodDate(date) => Stream(s"s3a://$bucket/$prefixWithDate") 
case Right(_) => Stream.empty } } @@ -511,10 +511,10 @@ object SparkContextUtils { exclusionPattern: Option[String]) (implicit s3: AmazonS3Client, dateExtractor: PathDateExtractor): Stream[S3ObjectSummary] = { - val s3Pattern = "s3n?://([^/]+)(.+)".r + val s3Pattern = "s3(a|n)?://([^/]+)(.+)".r def extractBucketAndPrefix(path: String): Option[(String, String)] = path match { - case s3Pattern(bucket, prefix) => Option(bucket -> prefix.dropWhile(_ == '/')) + case s3Pattern(_, bucket, prefix) => Option(bucket -> prefix.dropWhile(_ == '/')) case _ => None } @@ -615,7 +615,7 @@ object SparkContextUtils { exclusionPattern: Option[String] = None)(implicit pathDateExtractor: PathDateExtractor): Stream[HadoopFile] = { def toHadoopFile(s3Object: S3ObjectSummary): HadoopFile = - HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) + HadoopFile(s"s3a://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) def listPath(path: String): Stream[HadoopFile] = { if (path.startsWith("s3")) { From 8393d54ba57273a4aa14d76ad4685566247c2067 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Wed, 31 Aug 2016 15:08:41 -0300 Subject: [PATCH 100/268] Preparing for new Spark/Scala/Hadoop --- build.sbt | 9 ++------- .../core/jobs/utils/SparkContextUtils.scala | 15 ++++++++------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/build.sbt b/build.sbt index 2061fe3a..b8765224 100644 --- a/build.sbt +++ b/build.sbt @@ -23,13 +23,6 @@ libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.0" -libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") - .exclude("org.apache.htrace", "htrace-core") - .exclude("commons-beanutils", "commons-beanutils") - .exclude("org.slf4j", "slf4j-log4j12") - -libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.0" - libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.9" libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0" @@ -40,6 +33,8 @@ libraryDependencies += "org.joda" % "joda-convert" % "1.7" libraryDependencies += "commons-lang" % "commons-lang" % "2.6" +libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.5" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 81e1d355..7588deaa 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -24,10 +24,10 @@ import scala.reflect.ClassTag import scala.util.{Failure, Success, Try} import scala.util.control.NonFatal import ignition.core.utils.ExceptionUtils._ +import org.slf4j.LoggerFactory object SparkContextUtils { - private case class BigFileSlice(index: Int) private case class HadoopFilePartition(size: Long, paths: Seq[String]) @@ -51,6 +51,8 @@ object SparkContextUtils { implicit 
class SparkContextImprovements(sc: SparkContext) { + private lazy val logger = LoggerFactory.getLogger(getClass) + lazy val _hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) private def getFileSystem(path: Path): FileSystem = { @@ -73,7 +75,7 @@ object SparkContextUtils { } // This call is equivalent to a ls -d in shell, but won't fail if part of a path matches nothing, - // For instance, given path = s3n://bucket/{a,b}, it will work fine if a exists but b is missing + // For instance, given path = s3a://bucket/{a,b}, it will work fine if a exists but b is missing def sortedGlobPath(_paths: Seq[String], removeEmpty: Boolean = true): Seq[String] = { val paths = _paths.flatMap(path => ignition.core.utils.HadoopUtils.getPathStrings(path)) paths.flatMap(p => getStatus(p, removeEmpty)).map(_.getPath.toString).distinct.sorted @@ -148,7 +150,7 @@ object SparkContextUtils { val filesToOutput = 1500 def mapPaths(actionWhenNeedsSynching: (String, String) => Unit): Seq[String] = { paths.map(p => { - val hdfsPath = p.replaceFirst("s3(a|n)://", hdfsPathPrefix) + val hdfsPath = p.replaceFirst("s3[an]://", hdfsPathPrefix) if (forceSynch || getStatus(hdfsPath, false).isEmpty || getStatus(s"$hdfsPath/*", true).filterNot(_.isDirectory).size != filesToOutput) { val _hdfsPath = new Path(hdfsPath) actionWhenNeedsSynching(p, hdfsPath) @@ -286,7 +288,6 @@ object SparkContextUtils { val hadoopConf = _hadoopConf val partitionedSlices = sc.parallelize(slices.map(s => s -> null), 2).partitionBy(partitioner) - partitionedSlices.mapPartitions { slices => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) @@ -511,10 +512,10 @@ object SparkContextUtils { exclusionPattern: Option[String]) (implicit s3: AmazonS3Client, dateExtractor: PathDateExtractor): Stream[S3ObjectSummary] = { - val s3Pattern = "s3(a|n)?://([^/]+)(.+)".r 
+ val s3Pattern = "s3[an]?://([^/]+)(.+)".r def extractBucketAndPrefix(path: String): Option[(String, String)] = path match { - case s3Pattern(_, bucket, prefix) => Option(bucket -> prefix.dropWhile(_ == '/')) + case s3Pattern(bucket, prefix) => Option(bucket -> prefix.dropWhile(_ == '/')) case _ => None } @@ -620,7 +621,7 @@ object SparkContextUtils { def listPath(path: String): Stream[HadoopFile] = { if (path.startsWith("s3")) { s3List(path, inclusiveStartDate = inclusiveStartDate, startDate = startDate, inclusiveEndDate = inclusiveEndDate, - endDate = endDate, exclusionPattern = exclusionPattern)(amazonS3ClientFromEnvironmentVariables, pathDateExtractor ).map(toHadoopFile) + endDate = endDate, exclusionPattern = exclusionPattern)(amazonS3ClientFromEnvironmentVariables, pathDateExtractor).map(toHadoopFile) } else { driverListFiles(path).toStream } From b57d2e4cba3298e0c4cf626697a87bfaed503e38 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 31 Aug 2016 16:27:17 -0300 Subject: [PATCH 101/268] Making tests pass --- src/main/scala/ignition/core/utils/BetterTrace.scala | 4 +++- src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala | 2 +- src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala | 2 +- src/test/scala/ignition/core/utils/FutureUtilsSpec.scala | 2 +- src/test/scala/ignition/core/utils/IntBagSpec.scala | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala index 387f49f7..49c74606 100644 --- a/src/main/scala/ignition/core/utils/BetterTrace.scala +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -1,9 +1,11 @@ package ignition.core.utils import ignition.core.utils.ExceptionUtils._ +import org.scalactic.source // Used mainly to augment scalacheck traces in scalatest trait BetterTrace { - def fail(message: String): Nothing + def fail(message: String)(implicit pos: source.Position): Nothing + def 
withBetterTrace(block: => Unit): Unit = try { block diff --git a/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala b/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala index 705ba398..eed298b6 100644 --- a/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala +++ b/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala @@ -6,7 +6,7 @@ import org.scalatest._ import scala.util.Random -class RDDUtilsSpec extends FlatSpec with ShouldMatchers with SharedSparkContext { +class RDDUtilsSpec extends FlatSpec with Matchers with SharedSparkContext { "RDDUtils" should "provide groupByKeyAndTake" in { (10 to 60 by 10).foreach { take => diff --git a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala index c800c0f2..26757c26 100644 --- a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala @@ -3,7 +3,7 @@ package ignition.core.utils import org.scalatest._ import CollectionUtils._ -class CollectionUtilsSpec extends FlatSpec with ShouldMatchers { +class CollectionUtilsSpec extends FlatSpec with Matchers { case class MyObj(property: String, value: String) "CollectionUtils" should "provide distinctBy" in { diff --git a/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala b/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala index 8c2b3270..4649fcfc 100644 --- a/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala @@ -6,7 +6,7 @@ import scala.concurrent.{Await, Future} import scala.concurrent.duration._ import scala.concurrent.ExecutionContext.Implicits.global -class FutureUtilsSpec extends FlatSpec with ShouldMatchers { +class FutureUtilsSpec extends FlatSpec with Matchers { "FutureUtils" should "provide toLazyIterable" in { val timesCalled = collection.mutable.Map.empty[Int, Int].withDefaultValue(0) diff --git 
a/src/test/scala/ignition/core/utils/IntBagSpec.scala b/src/test/scala/ignition/core/utils/IntBagSpec.scala index 76d37a35..f577237e 100644 --- a/src/test/scala/ignition/core/utils/IntBagSpec.scala +++ b/src/test/scala/ignition/core/utils/IntBagSpec.scala @@ -4,7 +4,7 @@ import org.scalatest._ import scala.util.Random -class IntBagSpec extends FlatSpec with ShouldMatchers { +class IntBagSpec extends FlatSpec with Matchers { "IntBag" should "be built from sequence" in { IntBag.from(Seq(1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4)).histogram shouldBe Map(1 -> 2, 2 -> 3, 3 -> 1, 4 -> 5) From 7029f03122fb25494a64c71ded621fff4d73335b Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 6 Sep 2016 15:21:20 -0300 Subject: [PATCH 102/268] Use older scalatest which is compatible with current scalamock --- build.sbt | 2 +- src/main/scala/ignition/core/utils/BetterTrace.scala | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index b8765224..3dd8d22a 100644 --- a/build.sbt +++ b/build.sbt @@ -21,7 +21,7 @@ libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") .exclude("commons-beanutils", "commons-beanutils") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.0" +libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.9" diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala index 49c74606..09de73aa 100644 --- a/src/main/scala/ignition/core/utils/BetterTrace.scala +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -1,10 +1,9 @@ package ignition.core.utils import ignition.core.utils.ExceptionUtils._ -import org.scalactic.source // Used mainly to augment scalacheck traces in scalatest trait BetterTrace { - def fail(message: String)(implicit pos: source.Position): Nothing + def fail(message: String): Nothing def 
withBetterTrace(block: => Unit): Unit = try { From cfa7017cba8ba2fcb4a4aa1cfc187a6ee2619d0d Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 6 Sep 2016 18:40:05 -0300 Subject: [PATCH 103/268] Avoid temporary files on s3 --- .../ignition/core/jobs/CoreJobRunner.scala | 5 +- .../core/jobs/DirectOutputCommitter.scala | 75 +++++++++++++++++++ .../core/jobs/utils/SparkContextUtils.scala | 1 + 3 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 src/main/scala/ignition/core/jobs/DirectOutputCommitter.scala diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index bbede553..0dec0896 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -75,11 +75,14 @@ object CoreJobRunner { sparkConf.setMaster(config.master) sparkConf.setAppName(appName) - + + sparkConf.set("spark.hadoop.mapred.output.committer.class", classOf[DirectOutputCommitter].getName()) + defaultSparkConfMap.foreach { case (k, v) => sparkConf.set(k, v) } jobConf.foreach { case (k, v) => sparkConf.set(k, v) } + // Add logging context to driver setLoggingContextValues(config) diff --git a/src/main/scala/ignition/core/jobs/DirectOutputCommitter.scala b/src/main/scala/ignition/core/jobs/DirectOutputCommitter.scala new file mode 100644 index 00000000..63611cf4 --- /dev/null +++ b/src/main/scala/ignition/core/jobs/DirectOutputCommitter.scala @@ -0,0 +1,75 @@ +package ignition.core.jobs + +// Code from: https://gist.github.com/aarondav/c513916e72101bbe14ec +// Suggested by: http://tech.grammarly.com/blog/posts/Petabyte-Scale-Text-Processing-with-Spark.html + +/* + * Copyright 2015 Databricks, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
You may obtain + * a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapred._ + +/** + * OutputCommitter suitable for S3 workloads. Unlike the usual FileOutputCommitter, which + * writes files to a _temporary/ directory before renaming them to their final location, this + * simply writes directly to the final location. + * + * The FileOutputCommitter is required for HDFS + speculation, which allows only one writer at + * a time for a file (so two people racing to write the same file would not work). However, S3 + * supports multiple writers outputting to the same file, where visibility is guaranteed to be + * atomic. This is a monotonic operation: all writers should be writing the same data, so which + * one wins is immaterial. + * + * Code adapted from Ian Hummel's code from this PR: + * https://github.com/themodernlife/spark/commit/4359664b1d557d55b0579023df809542386d5b8c + */ +class DirectOutputCommitter extends OutputCommitter { + override def setupJob(jobContext: JobContext): Unit = { } + + override def setupTask(taskContext: TaskAttemptContext): Unit = { } + + override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = { + // We return true here to guard against implementations that do not handle false correctly. + // The meaning of returning false is not entirely clear, so it's possible to be interpreted + // as an error. Returning true just means that commitTask() will be called, which is a no-op. 
+ true + } + + override def commitTask(taskContext: TaskAttemptContext): Unit = { } + + override def abortTask(taskContext: TaskAttemptContext): Unit = { } + + /** + * Creates a _SUCCESS file to indicate the entire job was successful. + * This mimics the behavior of FileOutputCommitter, reusing the same file name and conf option. + */ + override def commitJob(context: JobContext): Unit = { + val conf = context.getJobConf + if (shouldCreateSuccessFile(conf)) { + val outputPath = FileOutputFormat.getOutputPath(conf) + if (outputPath != null) { + val fileSys = outputPath.getFileSystem(conf) + val filePath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME) + fileSys.create(filePath).close() + } + } + } + + /** By default, we do create the _SUCCESS file, but we allow it to be turned off. */ + private def shouldCreateSuccessFile(conf: JobConf): Boolean = { + conf.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true) + } +} \ No newline at end of file diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 7588deaa..dddd51a6 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -330,6 +330,7 @@ object SparkContextUtils { minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq( + "mapreduce.input.fileinputformat.split.minsize" -> maxSplitSize.toString, "mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } From 30ea5440cae326ded9995c00822441ad5241aa3c Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 6 Sep 2016 19:19:13 -0300 Subject: [PATCH 104/268] Fixed doc error --- .../ignition/core/testsupport/spark/LocalSparkContext.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala b/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala index 2edb28e7..a272edaa 100644 --- a/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala +++ b/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala @@ -21,7 +21,6 @@ import _root_.io.netty.util.internal.logging.{InternalLoggerFactory, Slf4JLogger import org.apache.spark.SparkContext import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, Suite} -/** Manages a local `sc` {@link SparkContext} variable, correctly stopping it after each test. */ trait LocalSparkContext extends BeforeAndAfterEach with BeforeAndAfterAll { self: Suite => @transient var sc: SparkContext = _ From 85f920dc5a95af9c03fd851ca29ec55443ca4347 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 20 Sep 2016 15:36:49 -0300 Subject: [PATCH 105/268] Added logging factor to groupByKeyAndTake --- .../scala/ignition/core/jobs/utils/RDDUtils.scala | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index b0b3bc86..e04dd118 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -11,6 +11,7 @@ import org.joda.time.DateTime import org.joda.time.format.DateTimeFormat import scala.collection.mutable +import scala.util.Random import scalaz.{Success, Validation} object RDDUtils { @@ -78,11 +79,14 @@ object RDDUtils { rdd.filter { case (k, v) => f.isDefinedAt(v) }.mapValues(f) } - def groupByKeyAndTake(n: Int): RDD[(K, List[V])] = + // loggingFactor: percentage of the potential logging that will be really printed + // Big jobs will have too much logging and my eat up cluster disk space + def groupByKeyAndTake(n: Int, loggingFactor: Double = 0.5): RDD[(K, List[V])] = rdd.aggregateByKey(mutable.ListBuffer.empty[V])( (lst, v) => if (lst.size >= n) { - logger.warn(s"Ignoring value '$v' due aggregation result of size '${lst.size}' is bigger than n=$n") + if (Random.nextDouble() < loggingFactor) + logger.warn(s"Ignoring value '$v' due aggregation result of size '${lst.size}' is bigger than n=$n") lst } else { lst += v @@ -95,7 +99,8 @@ object RDDUtils { lstB else { if (lstA.size + lstB.size > n) { - logger.warn(s"Merging partition1=${lstA.size} with partition2=${lstB.size} and taking the first n=$n, sample1='${lstA.take(5)}', sample2='${lstB.take(5)}'") + if (Random.nextDouble() < loggingFactor) + logger.warn(s"Merging partition1=${lstA.size} with partition2=${lstB.size} and taking the first n=$n, sample1='${lstA.take(5)}', sample2='${lstB.take(5)}'") lstA ++= lstB lstA.take(n) } else { @@ -115,4 +120,4 @@ object RDDUtils { (lstA, lstB) => 
(lstA ++ lstB).sorted(ord).take(n)) } } -} +} \ No newline at end of file From 2b3231c7dbc0ea808f47ef7c5c05244f5534b507 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 22 Sep 2016 13:56:32 -0300 Subject: [PATCH 106/268] make provided s3 dependencies --- build.sbt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/build.sbt b/build.sbt index 3dd8d22a..5cddf64a 100644 --- a/build.sbt +++ b/build.sbt @@ -13,13 +13,13 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.0" % "provided .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") - libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") -libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") - .exclude("org.apache.htrace", "htrace-core") - .exclude("commons-beanutils", "commons-beanutils") - .exclude("org.slf4j", "slf4j-log4j12") +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2" % "provided") + +libraryDependencies += ("com.amazonaws" % "aws-java-sdk" % "1.7.4" % "provided") + +libraryDependencies += ("net.java.dev.jets3t" % "jets3t" % "0.9.0" % "provided") libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" From ee0ed1ef8e7626023e2ca10960640d39a734c5d3 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 22 Sep 2016 18:22:40 -0300 Subject: [PATCH 107/268] classpath fix revert --- build.sbt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/build.sbt b/build.sbt index 5cddf64a..3dd8d22a 100644 --- a/build.sbt +++ b/build.sbt @@ -13,13 +13,13 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.0" % "provided .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") - -libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2" % "provided") -libraryDependencies += 
("com.amazonaws" % "aws-java-sdk" % "1.7.4" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") -libraryDependencies += ("net.java.dev.jets3t" % "jets3t" % "0.9.0" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") + .exclude("org.apache.htrace", "htrace-core") + .exclude("commons-beanutils", "commons-beanutils") + .exclude("org.slf4j", "slf4j-log4j12") libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" From 98ef7243a53be75255b79b8e8735c04aa5d597fd Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 27 Sep 2016 15:48:53 -0300 Subject: [PATCH 108/268] Added Future.withTimeout --- build.sbt | 2 ++ src/main/scala/ignition/core/utils/FutureUtils.scala | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/build.sbt b/build.sbt index 3dd8d22a..711d3798 100644 --- a/build.sbt +++ b/build.sbt @@ -35,6 +35,8 @@ libraryDependencies += "commons-lang" % "commons-lang" % "2.6" libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.5" +libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.3.4" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 4523a94f..55853826 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,5 +1,8 @@ package ignition.core.utils +import akka.actor.ActorSystem + +import scala.concurrent.duration.FiniteDuration import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} @@ -28,6 +31,10 @@ object FutureUtils { def asTry()(implicit ec: ExecutionContext) : Future[Try[V]] = { future.map(v => Success(v)).recover { case 
NonFatal(e) => Failure(e) } } + + def withTimeout(timeout: => Throwable)(implicit duration: FiniteDuration, system: ActorSystem): Future[V] = { + Future.firstCompletedOf(Seq(future, akka.pattern.after(duration, system.scheduler)(Future.failed(timeout))(system.dispatcher)))(system.dispatcher) + } } implicit class FutureGeneratorImprovements[V](generator: Iterable[() => Future[V]]){ From 34afa426e491ba0566e669cff2ba876ce25206c6 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 29 Sep 2016 19:50:32 -0300 Subject: [PATCH 109/268] Moved async http to core --- build.sbt | 8 + .../core/http/AsyncHttpClientStreamApi.scala | 69 ++++ .../core/http/AsyncSprayHttpClient.scala | 302 ++++++++++++++++++ .../ignition/core/http/ByteStorage.scala | 114 +++++++ .../scala/ignition/core/http/Caching.scala | 22 ++ src/main/scala/ignition/core/http/Retry.scala | 65 ++++ 6 files changed, 580 insertions(+) create mode 100644 src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala create mode 100644 src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala create mode 100644 src/main/scala/ignition/core/http/ByteStorage.scala create mode 100644 src/main/scala/ignition/core/http/Caching.scala create mode 100644 src/main/scala/ignition/core/http/Retry.scala diff --git a/build.sbt b/build.sbt index 711d3798..6ffe0e85 100644 --- a/build.sbt +++ b/build.sbt @@ -37,6 +37,14 @@ libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.5" libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.3.4" +libraryDependencies += "io.spray" %% "spray-json" % "1.3.2" + +libraryDependencies += "io.spray" %% "spray-client" % "1.3.2" + +libraryDependencies += "io.spray" %% "spray-http" % "1.3.2" + +libraryDependencies += "io.spray" %% "spray-caching" % "1.3.2" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" diff --git 
a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala new file mode 100644 index 00000000..52d97810 --- /dev/null +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -0,0 +1,69 @@ +package ignition.core.http + +import java.io.InputStream +import java.util.concurrent.TimeUnit + +import akka.util.Timeout +import spray.http.{HttpEntity, HttpMethod, HttpMethods} + +import scala.concurrent.Future +import scala.concurrent.duration._ +import scala.language.postfixOps + + +object AsyncHttpClientStreamApi { + + case class Credentials(user: String, password: String) { + def isEmpty = user.isEmpty && password.isEmpty + + def toOption = Some(this).filter(!_.isEmpty) + } + + object Credentials { + val empty = Credentials("", "") + } + + // TODO: return a stream is dangerous because implies into a lock + case class StreamResponse(status: Int, content: InputStream) + + case class RequestConfiguration(maxRedirects: Int = 15, + maxConnectionsPerHost: Int = 500, + pipelining: Boolean = false, + idleTimeout: Duration = Duration(30, TimeUnit.SECONDS), + requestTimeout: Duration = Duration(20, TimeUnit.SECONDS), + connectingTimeout: Duration = Duration(10, TimeUnit.SECONDS)) + + case class Request(url: String, + params: Map[String, String] = Map.empty, + credentials: Option[Credentials] = None, + method: HttpMethod = HttpMethods.GET, + body: HttpEntity = HttpEntity.Empty, + requestConfiguration: Option[RequestConfiguration] = Option(RequestConfiguration())) + + case class RequestException(message: String, response: StreamResponse) extends RuntimeException(message) + + object NoOpReporter extends ReporterCallback { + def onRequest(request: Request): Unit = {} + def onResponse(request: Request, status: Int): Unit = {} + def onFailure(request: Request, status: Int): Unit = {} + def onRetry(request: Request): Unit = {} + def onGiveUp(request: Request): Unit = {} + def onError(request: 
Request, error: Any): Unit = {} + } + + abstract class ReporterCallback { + def onRequest(request: Request): Unit + def onResponse(request: Request, status: Int): Unit + def onFailure(request: Request, status: Int): Unit + def onRetry(request: Request): Unit + def onGiveUp(request: Request): Unit + def onError(request: Request, error: Any): Unit + } +} + +trait AsyncHttpClientStreamApi { + + def makeRequest(request: AsyncHttpClientStreamApi.Request, initialBackoff: FiniteDuration = 100 milliseconds, retryOnHttpStatus: Seq[Int] = List.empty) + (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback = AsyncHttpClientStreamApi.NoOpReporter): Future[AsyncHttpClientStreamApi.StreamResponse] + +} \ No newline at end of file diff --git a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala new file mode 100644 index 00000000..54247a04 --- /dev/null +++ b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala @@ -0,0 +1,302 @@ +package ignition.core.http + +import java.net.URL +import java.util.concurrent.TimeoutException + +import akka.actor._ +import akka.io.IO +import akka.pattern.ask +import akka.util.Timeout + +import spray.can.Http +import spray.can.Http.HostConnectorSetup +import spray.can.client.{ClientConnectionSettings, HostConnectorSettings} +import spray.http.HttpHeaders.Authorization +import spray.http.StatusCodes.Redirection +import spray.http._ + + +import scala.concurrent.duration._ +import scala.concurrent.{ExecutionContext, Future} +import scala.language.postfixOps +import scala.util.control.NonFatal + +import ignition.core.http.AsyncHttpClientStreamApi.{Request, RequestConfiguration} + + +trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { + + implicit def actorRefFactory: ActorRefFactory + def executionContext: ExecutionContext = actorRefFactory.dispatcher + + override def makeRequest(request: AsyncHttpClientStreamApi.Request, backoff: 
FiniteDuration, retryOnHttpStatus: Seq[Int]) + (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback): Future[AsyncHttpClientStreamApi.StreamResponse] = { + val processor = actorRefFactory.actorOf(Props(new RequestProcessorActor(timeout, reporter, backoff, retryOnHttpStatus))) + (processor ? request).mapTo[AsyncHttpClientStreamApi.StreamResponse] + } + + private class RequestProcessorActor(timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback, backoff: FiniteDuration, retryOnHttpStatus: Seq[Int]) + extends Actor with ActorLogging { + + + import context.system + + import scala.language.implicitConversions + + def isRedirection(status: StatusCode): Boolean = status match { + case r: Redirection => true + case _ => false + } + + private def toUriString(url: String, params: Map[String, String] = Map.empty) = { + def encode(content: String) = java.net.URLEncoder.encode(content, "UTF-8") + def encodeParams = params.map { case (k, v) => s"${encode(k)}=${encode(v)}" }.mkString("&") + if (params.isEmpty) url else s"$url?${encodeParams}" + } + + private implicit def toAuthHeader(credentials: AsyncHttpClientStreamApi.Credentials): List[Authorization] = + List(Authorization(credentials = BasicHttpCredentials(username = credentials.user, password = credentials.password))) + + private def toSprayRequest(request: Request): HttpRequest = request match { + case Request(uri, params, Some(credentials), method, body, _) if params.isEmpty => + HttpRequest(method = method, uri = request.url, headers = credentials, entity = body) + + case Request(uri, params, Some(credentials), method, body, _) => + HttpRequest(method = method, uri = toUriString(request.url, params), headers = credentials, entity = body) + + case Request(uri, params, None, method, body, _) if params.isEmpty => + HttpRequest(method = method, uri = toUriString(request.url), entity = body) + + case Request(uri, params, None, method, body, _) => + HttpRequest(method = method, uri = 
toUriString(request.url, params), entity = body) + } + + private def toSprayHostConnectorSetup(host: String, configuration: AsyncHttpClientStreamApi.RequestConfiguration): HostConnectorSetup = { + // Create based on defaults, change some of them + val ccs: ClientConnectionSettings = ClientConnectionSettings(system) + val hcs: HostConnectorSettings = HostConnectorSettings(system) + + val updatedCcs = ccs.copy( + responseChunkAggregationLimit = 0, // makes our client ineffective if non zero + idleTimeout = configuration.idleTimeout, + connectingTimeout = configuration.connectingTimeout, + requestTimeout = configuration.requestTimeout + ) + + val updatedHcs = hcs.copy( + connectionSettings = updatedCcs, + maxRetries = 0, // We have our own retry mechanism + maxRedirects = 0, // We do our own redirect following + maxConnections = configuration.maxConnectionsPerHost, + pipelining = configuration.pipelining + ) + HostConnectorSetup(host = host, settings = Option(updatedHcs)) + } + + private def executeSprayRequest(request: Request): Unit = request.requestConfiguration match { + case Some(configuration) => + val url = new URL(request.url) + val message = (toSprayRequest(request), toSprayHostConnectorSetup(url.getHost, configuration)) + IO(Http) ! message + case None => + IO(Http) ! toSprayRequest(request) + } + + def handleErrors(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, remainingRedirects: Int): Receive = { + case ev @ Http.SendFailed(_) => + log.debug("Communication error, cause: {}", ev) + reporter.onError(request, ev) + storage.close() + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + + case ev @ Timedout(_) => + log.debug("Communication error, cause: {}", ev) + reporter.onError(request, ev) + storage.close() + context.become(retrying(commander, request, remainingRedirects)) + self ! 
retry.onTimeout + + case Status.Failure(NonFatal(exception)) => + reporter.onError(request, exception) + storage.close() + exception match { + case ex: Http.RequestTimeoutException => + log.warning("Request {} timeout, details: {}", request, ex.getMessage) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onTimeout + + case ex: Http.ConnectionException => + log.warning("Connection error on {}, details: {}", request, ex.getMessage) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + + case unknownException => + log.error(unknownException, "Unknown error on {}", request) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + } + + case unknownMessage => + log.debug("Unknown message: {}", unknownMessage) + reporter.onError(request, unknownMessage) + storage.close() + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + } + + def receive: Receive = { + case request: Request => + log.debug("Starting request {}", request) + reporter.onRequest(request) + executeSprayRequest(request) + val retry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, timeoutBackoff = backoff) + val storage = new ByteStorage() + val maxRedirects = request.requestConfiguration.getOrElse(RequestConfiguration()).maxRedirects + context.become(waitingForResponse(sender, request, retry, storage, maxRedirects) + .orElse(handleErrors(sender, request, retry, storage, maxRedirects))) + } + + def retrying(commander: ActorRef, request: Request, remainingRedirects: Int): Receive = { + case retry: Retry => + if (retry.shouldGiveUp) { + reporter.onGiveUp(request) + log.warning("Error to get {}, no more retries {}, accepting failure", request, retry) + commander ! 
Status.Failure(new TimeoutException(s"Failed to get '${request.url}'")) + context.stop(self) + } else { + reporter.onRetry(request) + log.info("Retrying {}, retry status {}, backing off for {} millis", request, retry, retry.backoff.toMillis) + system.scheduler.scheduleOnce(retry.backoff) { + log.debug("Waking from backoff, retrying request {}", request) + executeSprayRequest(request) + }(executionContext) + val storage = new ByteStorage() + context.become(waitingForResponse(commander, request, retry, storage, remainingRedirects) + .orElse(handleErrors(commander, request, retry, storage, remainingRedirects))) + } + } + + def waitingForResponse(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, remainingRedirects: Int): Receive = { + case response@HttpResponse(status, entity, headers, _) => try { + storage.write(response.entity.data.toByteArray) + if (isRedirection(status)) + handleRedirect(commander, storage, retry, request, status, response, remainingRedirects) + else if (status.isSuccess) { + reporter.onResponse(request, status.intValue) + commander ! Status.Success(AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream())) + context.stop(self) + } else if (retryOnHttpStatus.contains(status.intValue)) { + storage.close() + log.debug("HttpResponse: Status {}, retrying...", status) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + } else { + val message = s"HTTP response status ${status.intValue}, on request ${request}, ${status.defaultMessage}" + log.debug("HttpResponse: {}", message) + reporter.onFailure(request, status.intValue) + reporter.onGiveUp(request) + commander ! 
Status.Failure(new AsyncHttpClientStreamApi.RequestException(message = message, + response = AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream()))) + context.stop(self) + } + } catch { + case NonFatal(ex) => + storage.close() + log.error(ex, "HttpResponse: Failure on creating HttpResponse") + reporter.onError(request, ex) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + } + + case chunkStart@ChunkedResponseStart(HttpResponse(status, entity, headers, _)) => try { + storage.write(entity.data.toByteArray) + if (isRedirection(status)) + handleRedirect(commander, storage, retry, request, status, chunkStart, remainingRedirects) + else if (status.isSuccess) { + context.become(accumulateChunks(commander, request, retry, storage, status, remainingRedirects) + .orElse(handleErrors(commander, request, retry, storage, remainingRedirects))) + } else if (retryOnHttpStatus.contains(status.intValue)) { + storage.close() + log.debug("ChunkedResponseStart: Status {}, retrying...", status) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + } else { + val message = s"HTTP response status ${status.intValue}, on request ${request}, ${status.defaultMessage}" + log.debug("ChunkedResponseStart: {}", message) + reporter.onFailure(request, status.intValue) + reporter.onGiveUp(request) + commander ! Status.Failure(new AsyncHttpClientStreamApi.RequestException(message = message, + response = AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream()))) + context.stop(self) + } + } catch { + case NonFatal(ex) => + log.error(ex, "ChunkedResponseStart: Failure on creating ChunkedHttpResponse") + reporter.onError(request, ex) + context.become(retrying(commander, request, remainingRedirects)) + self ! 
retry.onError + } + } + + def accumulateChunks(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, status: StatusCode, remainingRedirects: Int): Receive = { + case message@MessageChunk(data, _) => try { + storage.write(data.toByteArray) + } catch { + case NonFatal(ex) => + storage.close() + log.error(ex, "MessageChunk: Failure on accumulate chunk data") + reporter.onError(request, ex) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + } + + case chunkEnd: ChunkedMessageEnd => + log.debug("ChunkedMessageEnd: all data was received for request {}, status {}", request, status) + reporter.onResponse(request, status.intValue) + commander ! Status.Success(AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream())) + context.stop(self) + } + + def handleRedirect(commander: ActorRef, oldStorage: ByteStorage, oldRetry: Retry, oldRequest: Request, status: StatusCode, rawResponse: HttpResponsePart, remainingRedirects: Int): Unit = { + if (remainingRedirects <= 0) { + val message = s"HandleRedirect: exceeded redirection limit on $oldRequest with status $status" + log.warning(message) + reporter.onGiveUp(oldRequest) + commander ! 
Status.Failure(new Exception(message)) + context.stop(self) + } else { + def makeRequest(headers: List[HttpHeader]): Receive = { + oldStorage.close() + val newRemainingRedirects = remainingRedirects - 1 + headers.find(_.is("location")).map(_.value).map { newLocation => + log.debug("Making redirect to {}", newLocation) + val newRequest = oldRequest.copy(url = newLocation) + executeSprayRequest(newRequest) + val newRetry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, timeoutBackoff = backoff) + val newStorage = new ByteStorage() + waitingForResponse(commander, newRequest, newRetry, newStorage, newRemainingRedirects) + .orElse(handleErrors(commander, newRequest, newRetry, newStorage, newRemainingRedirects)) + }.getOrElse { + log.warning("Received redirect for request {} with headers {} without location, retrying...", oldRequest, headers) + retrying(commander, oldRequest, newRemainingRedirects) + } + } + context.become(rawResponse match { + case response@HttpResponse(status, entity, headers, _) => + makeRequest(headers) + case chunkStart@ChunkedResponseStart(HttpResponse(status, entity, headers, _)) => { + case message@MessageChunk(data, _) => + // do nothing + case chunkEnd: ChunkedMessageEnd => + context.become(makeRequest(headers)) + } + case other => + throw new Exception(s"Bug, called on $other") + }) + } + } + + } + +} diff --git a/src/main/scala/ignition/core/http/ByteStorage.scala b/src/main/scala/ignition/core/http/ByteStorage.scala new file mode 100644 index 00000000..c137a5fe --- /dev/null +++ b/src/main/scala/ignition/core/http/ByteStorage.scala @@ -0,0 +1,114 @@ +package ignition.core.http + +import java.io._ +import java.nio.file.{Files, Paths} +import java.util.UUID + +import org.slf4j.LoggerFactory + +import scala.util.control.NonFatal +import scala.util.{Failure, Success, Try} + +class ByteStorage(memoryThreshold: Int = 1024 * 1024 * 5) extends AutoCloseable { + + lazy val log = LoggerFactory.getLogger(getClass) + + lazy 
val tempDirPath = Files.createDirectories(Paths.get(System.getProperty("java.io.tmpdir"), "ByteStorage")) + + lazy val buffer = new ByteArrayOutputStream + + var fileStorage: Option[(File, FileOutputStream)] = None + + def write(bytes: Array[Byte]): Unit = try { + if (fileStorage.isDefined) { + writeOnFile(bytes) + } else if (buffer.size() + bytes.length > memoryThreshold) { + log.debug("Memory threshold {} reach, going to file storage", memoryThreshold) + setupFileStorage() + writeOnFile(buffer.toByteArray) + writeOnFile(bytes) + // on ByteArrayOutputStream close() takes not effect, + // but if we change the buffer impl this is the a good moment to free resources + buffer.close() + } else { + buffer.write(bytes) + } + } catch { + case NonFatal(ex) => + close() + throw ex + } + + override def close(): Unit = fileStorage match { + case Some((file, outputStream)) => try { + log.debug("Cleaning up temp file {}", file.getAbsolutePath) + outputStream.close() + file.delete() + } catch { + case NonFatal(ex) => log.warn(s"Fail to cleanup temp file ${file.getAbsolutePath}", ex) + } + case None => + log.debug("Cleaning up memory buffer") + buffer.close() + } + + private def setupFileStorage(): Unit = if (fileStorage.isEmpty) { + tryCreateTempFile match { + case Success(storage) => fileStorage = Option(storage) + case Failure(ex) => throw ex + } + } else { + throw new IllegalStateException("File storage already setup") + } + + private def tryCreateTempFile: Try[(File, FileOutputStream)] = Try { + val tempFile = File.createTempFile(s"temp_byte_storage_${UUID.randomUUID().toString}", ".temp", tempDirPath.toFile) + tempFile.deleteOnExit() + log.debug("Creating temp file {}", tempFile.getAbsolutePath) + (tempFile, new FileOutputStream(tempFile)) + } + + private def writeOnFile(bytes: Array[Byte]): Unit = fileStorage match { + case Some((_, outputStream)) => outputStream.write(bytes) + case None => throw new IllegalStateException("File storage not initialized") + } + + def 
getInputStream(): InputStream = fileStorage match { + case Some((file, outputStream)) => try { + outputStream.close() + new DeleteOnCloseFileInputStream(file) + } catch { + case NonFatal(ex) => + log.error("Fail to create InputStream", ex) + close() + throw ex + } + case None => new ByteArrayInputStream(buffer.toByteArray) + } + + override def finalize() = try { + fileStorage match { + case Some((file, outputStream)) => + log.debug("Cleaning up temp file {}", file.getAbsolutePath) + outputStream.close() + file.delete() + case None => + } + } finally { + super.finalize() + } + +} + +private class DeleteOnCloseFileInputStream(file: File) extends FileInputStream(file) { + lazy val log = LoggerFactory.getLogger(getClass) + override def close() = try { + log.debug("Cleaning up file {}", file.getAbsolutePath) + file.delete() + } catch { + case NonFatal(ex) => + log.warn(s"Failed to clean up file ${file.getAbsolutePath}", ex) + } finally { + super.close() + } +} \ No newline at end of file diff --git a/src/main/scala/ignition/core/http/Caching.scala b/src/main/scala/ignition/core/http/Caching.scala new file mode 100644 index 00000000..112791a5 --- /dev/null +++ b/src/main/scala/ignition/core/http/Caching.scala @@ -0,0 +1,22 @@ +package ignition.core.http + +import org.slf4j.LoggerFactory +import spray.caching.Cache + +import scala.concurrent._ +import scala.util.Failure + +trait Caching[T] { + val log = LoggerFactory.getLogger(classOf[Caching[T]]) + + val cache: Cache[T] + import ExecutionContext.Implicits.global + def fetchCache[K](key: K, f: () => Future[T]): Future[T] = cache(key) { + f.apply andThen { + case Failure(e) => { + cache.remove(key) + log.info(s"Removed $key from cache due to an exception: $e") + } + } + } +} diff --git a/src/main/scala/ignition/core/http/Retry.scala b/src/main/scala/ignition/core/http/Retry.scala new file mode 100644 index 00000000..03d86db6 --- /dev/null +++ b/src/main/scala/ignition/core/http/Retry.scala @@ -0,0 +1,65 @@ +package 
ignition.core.http + +import java.util.Random +import java.util.concurrent.TimeUnit + +import org.joda.time.DateTime + +import scala.concurrent.duration.{Duration, FiniteDuration, _} +import scala.language.postfixOps + +object Retry { + + sealed trait State + case object Timeout extends State + case object Error extends State + + val random = new Random + + val _maxWaitForNextRetry = 10 + + def exponentialBackOff(r: Int): FiniteDuration = { + val exponent: Double = scala.math.min(r, _maxWaitForNextRetry) + scala.math.pow(2, exponent).round * (random.nextInt(30) + 100 milliseconds) + } + +} + +case class Retry(startTime: DateTime, + timeout: FiniteDuration, + state: Retry.State = Retry.Timeout, + timeoutCount: Int = 0, + timeoutBackoff: FiniteDuration = 100 milliseconds, + maxErrors: Int = 10, + errorsCount: Int = 0, + backoffOnError: FiniteDuration = 100 milliseconds) { + + import Retry._ + + def onError(): Retry = + copy(errorsCount = errorsCount + 1, backoffOnError = exponentialBackOff(errorsCount + 1), state = Retry.Error) + + def onTimeout(): Retry = copy(timeoutCount = timeoutCount + 1, timeoutBackoff = exponentialBackOff(timeoutCount + 1), state = Retry.Timeout) + + def backoff(): FiniteDuration = state match { + case Timeout => timeoutBackoff + case Error => backoffOnError + } + + private def canRetryMore(durations: FiniteDuration*): Boolean = { + val maxTime = startTime.plusMillis(timeout.toMillis.toInt) + val nextEstimatedTime = DateTime.now.plusMillis(durations.map(_.toMillis.toInt).sum) + nextEstimatedTime.isBefore(maxTime) + } + + // This is an approximation and we are ignoring the time waiting on backoff. 
+ // In this way we are overestimating the average request duration, which is fine because it's better to abort early than wait too much time exceed AskTimeouts + private def averageRequestDuration = + Duration((DateTime.now.getMillis - startTime.getMillis) / Math.max(timeoutCount + errorsCount, 1), TimeUnit.MILLISECONDS) + + def shouldGiveUp(): Boolean = state match { + case Timeout => !canRetryMore(averageRequestDuration, timeoutBackoff) + case Error => !canRetryMore(averageRequestDuration, backoffOnError) || errorsCount > maxErrors + } + +} \ No newline at end of file From 733731b3001695a416a96d35e28818a577589b73 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 29 Sep 2016 21:07:48 -0300 Subject: [PATCH 110/268] Add test script --- src/main/scala/TestHttp.scala | 49 +++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 src/main/scala/TestHttp.scala diff --git a/src/main/scala/TestHttp.scala b/src/main/scala/TestHttp.scala new file mode 100644 index 00000000..355575a0 --- /dev/null +++ b/src/main/scala/TestHttp.scala @@ -0,0 +1,49 @@ + +object TestHttp extends App{ + + def goTest(): Unit = { + import java.util.concurrent.TimeUnit + + import akka.actor.{ActorRefFactory, ActorSystem} + import akka.util.Timeout + import ignition.core.http.AsyncHttpClientStreamApi._ + import ignition.core.http.AsyncSprayHttpClient + import ignition.core.utils.ExceptionUtils._ + import org.joda.time.DateTime + + import scala.concurrent.ExecutionContext.Implicits.global + import scala.concurrent.duration.Duration + import scala.io.Source + import scala.util.{Failure, Success} + def now = DateTime.now() + + val system = ActorSystem("http") + val client = new AsyncSprayHttpClient { + override implicit def actorRefFactory: ActorRefFactory = system + } + val url = "http://httpbin.org/delay/10" // "http://127.0.0.1:8081/" + val conf = RequestConfiguration(requestTimeout = Duration(12, TimeUnit.SECONDS), idleTimeout = Duration(5, 
TimeUnit.SECONDS)) + implicit val reporter = NoOpReporter + implicit val timeout = Timeout(30, TimeUnit.SECONDS) + + println(s"Starting $now") + + // Should complete ok + val request1 = client.makeRequest(Request(url, requestConfiguration = Option(conf))) + request1.onComplete { + case Success(t) => println(s"request1 finished $now with Success: ${Source.fromInputStream(t.content).mkString}") + case Failure(t) => println(s"request1 finished $now with failure: ${t.getFullStackTraceString()}") + } + + //Should time out and keep retrying + val tightConf = conf.copy(requestTimeout = Duration(3, TimeUnit.SECONDS)) + val request2 = client.makeRequest(Request(url, requestConfiguration = Option(tightConf))) + + request2.onComplete { + case Success(t) => println(s"request2 finished $now with Success: ${Source.fromInputStream(t.content).mkString}") + case Failure(t) => println(s"request2 finished $now with failure: ${t.getFullStackTraceString()}") + } + } + + goTest() +} From a1f1c38a4dce2d037fb6726c281f05aad4c7468d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 30 Sep 2016 11:39:39 -0300 Subject: [PATCH 111/268] retry on cluster health check --- tools/cluster.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index c4a2f681..5f59edad 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -24,7 +24,7 @@ import json import glob import webbrowser - +import ssl log = logging.getLogger() log.setLevel(logging.INFO) @@ -506,15 +506,23 @@ def job_attach(cluster_name, key_file=default_key_file, job_name=None, job_tag=N class NotHealthyCluster(Exception): pass @named('health-check') -def health_check(cluster_name, key_file=default_key_file, master=None, remote_user=default_remote_user, region=default_region): - master = master or get_master(cluster_name, region=region) - all_args = load_cluster_args(master, key_file, remote_user) - nslaves = int(all_args['slaves']) - 
minimum_percentage_healthy_slaves = all_args['minimum_percentage_healthy_slaves'] - masters, slaves = get_active_nodes(cluster_name, region=region) - if nslaves == 0 or float(len(slaves)) / nslaves < minimum_percentage_healthy_slaves: - raise NotHealthyCluster('Not enough healthy slaves: {0}/{1}'.format(len(slaves), nslaves)) - +def health_check(cluster_name, key_file=default_key_file, master=None, remote_user=default_remote_user, region=default_region, retries=3): + for i in range(retries): + try: + master = master or get_master(cluster_name, region=region) + all_args = load_cluster_args(master, key_file, remote_user) + nslaves = int(all_args['slaves']) + minimum_percentage_healthy_slaves = all_args['minimum_percentage_healthy_slaves'] + masters, slaves = get_active_nodes(cluster_name, region=region) + if nslaves == 0 or float(len(slaves)) / nslaves < minimum_percentage_healthy_slaves: + raise NotHealthyCluster('Not enough healthy slaves: {0}/{1}'.format(len(slaves), nslaves)) + except NotHealthyCluster, e: + raise e + except Exception, e: + log.warning("Failed to check cluster health, cluster: %s, retries %s" % (cluster_name, i), exc_info=True) + if i >= retries - 1: + log.critical("Failed to check cluster health, cluster: %s, giveup!" % (cluster_name)) + raise e class JobFailure(Exception): pass @@ -645,7 +653,7 @@ def collect(show_tail): failures += 1 last_failure = 'Unexpected response: {}'.format(output) health_check(cluster_name=cluster_name, key_file=key_file, master=master, remote_user=remote_user, region=region) - except subprocess.CalledProcessError as e: + except (subprocess.CalledProcessError, ssl.SSLError) as e: failures += 1 log.exception('Got exception') last_failure = 'Exception: {}'.format(e) From b667ce686a505930eec55a25e3a6eb105d3b260a Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Fri, 30 Sep 2016 20:05:42 -0300 Subject: [PATCH 112/268] Set host with correct port and ssl support --- src/main/scala/TestHttp.scala | 2 +- .../ignition/core/http/AsyncSprayHttpClient.scala | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/main/scala/TestHttp.scala b/src/main/scala/TestHttp.scala index 355575a0..f11f35ca 100644 --- a/src/main/scala/TestHttp.scala +++ b/src/main/scala/TestHttp.scala @@ -21,7 +21,7 @@ object TestHttp extends App{ val client = new AsyncSprayHttpClient { override implicit def actorRefFactory: ActorRefFactory = system } - val url = "http://httpbin.org/delay/10" // "http://127.0.0.1:8081/" + val url = "https://httpbin.org/delay/10" // "http://127.0.0.1:8081/" val conf = RequestConfiguration(requestTimeout = Duration(12, TimeUnit.SECONDS), idleTimeout = Duration(5, TimeUnit.SECONDS)) implicit val reporter = NoOpReporter implicit val timeout = Timeout(30, TimeUnit.SECONDS) diff --git a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala index 54247a04..c6adbbbd 100644 --- a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala +++ b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala @@ -71,7 +71,7 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { HttpRequest(method = method, uri = toUriString(request.url, params), entity = body) } - private def toSprayHostConnectorSetup(host: String, configuration: AsyncHttpClientStreamApi.RequestConfiguration): HostConnectorSetup = { + private def toSprayHostConnectorSetup(uri: Uri, configuration: AsyncHttpClientStreamApi.RequestConfiguration): HostConnectorSetup = { // Create based on defaults, change some of them val ccs: ClientConnectionSettings = ClientConnectionSettings(system) val hcs: HostConnectorSettings = HostConnectorSettings(system) @@ -90,13 +90,15 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { maxConnections = 
configuration.maxConnectionsPerHost, pipelining = configuration.pipelining ) - HostConnectorSetup(host = host, settings = Option(updatedHcs)) + + val host = uri.authority.host + HostConnectorSetup(host.toString, uri.effectivePort, sslEncryption = uri.scheme == "https", settings = Option(updatedHcs)) } private def executeSprayRequest(request: Request): Unit = request.requestConfiguration match { case Some(configuration) => - val url = new URL(request.url) - val message = (toSprayRequest(request), toSprayHostConnectorSetup(url.getHost, configuration)) + val url = Uri(request.url) + val message = (toSprayRequest(request), toSprayHostConnectorSetup(url, configuration)) IO(Http) ! message case None => IO(Http) ! toSprayRequest(request) From e0457bb692801aa477e4f4f3a5d82072340af098 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Mon, 3 Oct 2016 18:42:40 -0300 Subject: [PATCH 113/268] Allow a granular fallback to external configuration --- src/main/scala/TestHttp.scala | 4 +-- .../core/http/AsyncHttpClientStreamApi.scala | 24 +++++++++---- .../core/http/AsyncSprayHttpClient.scala | 36 +++++++++++-------- 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/src/main/scala/TestHttp.scala b/src/main/scala/TestHttp.scala index f11f35ca..901516e0 100644 --- a/src/main/scala/TestHttp.scala +++ b/src/main/scala/TestHttp.scala @@ -22,7 +22,7 @@ object TestHttp extends App{ override implicit def actorRefFactory: ActorRefFactory = system } val url = "https://httpbin.org/delay/10" // "http://127.0.0.1:8081/" - val conf = RequestConfiguration(requestTimeout = Duration(12, TimeUnit.SECONDS), idleTimeout = Duration(5, TimeUnit.SECONDS)) + val conf = RequestConfiguration(requestTimeout = Option(Duration(12, TimeUnit.SECONDS)), idleTimeout = Option(Duration(5, TimeUnit.SECONDS))) implicit val reporter = NoOpReporter implicit val timeout = Timeout(30, TimeUnit.SECONDS) @@ -36,7 +36,7 @@ object TestHttp extends App{ } //Should time out and keep retrying - val 
tightConf = conf.copy(requestTimeout = Duration(3, TimeUnit.SECONDS)) + val tightConf = conf.copy(requestTimeout = Option(Duration(3, TimeUnit.SECONDS))) val request2 = client.makeRequest(Request(url, requestConfiguration = Option(tightConf))) request2.onComplete { diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index 52d97810..5ec528ae 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -26,19 +26,29 @@ object AsyncHttpClientStreamApi { // TODO: return a stream is dangerous because implies into a lock case class StreamResponse(status: Int, content: InputStream) - case class RequestConfiguration(maxRedirects: Int = 15, - maxConnectionsPerHost: Int = 500, - pipelining: Boolean = false, - idleTimeout: Duration = Duration(30, TimeUnit.SECONDS), - requestTimeout: Duration = Duration(20, TimeUnit.SECONDS), - connectingTimeout: Duration = Duration(10, TimeUnit.SECONDS)) + // If any value is None, it will fallback to the implementation's default + object RequestConfiguration { + val defaultMaxRedirects: Int = 15 + val defaultMaxConnectionsPerHost: Int = 500 + val defaultPipelining: Boolean = false + val defaultIdleTimeout: FiniteDuration = Duration(30, TimeUnit.SECONDS) + val defaultRequestTimeout: FiniteDuration = Duration(20, TimeUnit.SECONDS) + val defaultConnectingTimeout: FiniteDuration = Duration(10, TimeUnit.SECONDS) + } + + case class RequestConfiguration(maxRedirects: Option[Int] = Option(RequestConfiguration.defaultMaxRedirects), + maxConnectionsPerHost: Option[Int] = Option(RequestConfiguration.defaultMaxConnectionsPerHost), + pipelining: Option[Boolean] = Option(RequestConfiguration.defaultPipelining), + idleTimeout: Option[Duration] = Option(RequestConfiguration.defaultIdleTimeout), + requestTimeout: Option[Duration] = 
Option(RequestConfiguration.defaultRequestTimeout), + connectingTimeout: Option[Duration] = Option(RequestConfiguration.defaultConnectingTimeout)) case class Request(url: String, params: Map[String, String] = Map.empty, credentials: Option[Credentials] = None, method: HttpMethod = HttpMethods.GET, body: HttpEntity = HttpEntity.Empty, - requestConfiguration: Option[RequestConfiguration] = Option(RequestConfiguration())) + requestConfiguration: Option[RequestConfiguration] = None) case class RequestException(message: String, response: StreamResponse) extends RuntimeException(message) diff --git a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala index c6adbbbd..a2dc312d 100644 --- a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala +++ b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala @@ -71,37 +71,44 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { HttpRequest(method = method, uri = toUriString(request.url, params), entity = body) } - private def toSprayHostConnectorSetup(uri: Uri, configuration: AsyncHttpClientStreamApi.RequestConfiguration): HostConnectorSetup = { + private def toSprayHostConnectorSetup(uri: Uri, conf: Option[AsyncHttpClientStreamApi.RequestConfiguration]): HostConnectorSetup = { // Create based on defaults, change some of them val ccs: ClientConnectionSettings = ClientConnectionSettings(system) val hcs: HostConnectorSettings = HostConnectorSettings(system) val updatedCcs = ccs.copy( responseChunkAggregationLimit = 0, // makes our client ineffective if non zero - idleTimeout = configuration.idleTimeout, - connectingTimeout = configuration.connectingTimeout, - requestTimeout = configuration.requestTimeout + idleTimeout = conf.flatMap(_.idleTimeout).getOrElse(ccs.idleTimeout), + connectingTimeout = conf.flatMap(_.connectingTimeout).getOrElse(ccs.connectingTimeout), + requestTimeout = 
conf.flatMap(_.requestTimeout).getOrElse(ccs.requestTimeout) ) + val maxConnections = conf.flatMap(_.maxConnectionsPerHost).getOrElse { + // Let's avoid someone shooting themselves in the foot + if (hcs.maxConnections == 4) // Spray's default is stupidly low + // Use the API's default, which is more reasonable + RequestConfiguration.defaultMaxConnectionsPerHost + else + // If the conf is a non-default value, then someone knows what they're doing; use that configured value + hcs.maxConnections + } + val updatedHcs = hcs.copy( connectionSettings = updatedCcs, maxRetries = 0, // We have our own retry mechanism maxRedirects = 0, // We do our own redirect following - maxConnections = configuration.maxConnectionsPerHost, - pipelining = configuration.pipelining + maxConnections = maxConnections, + pipelining = conf.flatMap(_.pipelining).getOrElse(hcs.pipelining) ) val host = uri.authority.host HostConnectorSetup(host.toString, uri.effectivePort, sslEncryption = uri.scheme == "https", settings = Option(updatedHcs)) } - private def executeSprayRequest(request: Request): Unit = request.requestConfiguration match { - case Some(configuration) => - val url = Uri(request.url) - val message = (toSprayRequest(request), toSprayHostConnectorSetup(url, configuration)) - IO(Http) ! message - case None => - IO(Http) ! toSprayRequest(request) + private def executeSprayRequest(request: Request): Unit = { + val url = Uri(request.url) + val message = (toSprayRequest(request), toSprayHostConnectorSetup(url, request.requestConfiguration)) + IO(Http) !
message } def handleErrors(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, remainingRedirects: Int): Receive = { @@ -154,7 +161,8 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { executeSprayRequest(request) val retry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, timeoutBackoff = backoff) val storage = new ByteStorage() - val maxRedirects = request.requestConfiguration.getOrElse(RequestConfiguration()).maxRedirects + val maxRedirects = + request.requestConfiguration.flatMap(_.maxRedirects).getOrElse(RequestConfiguration.defaultMaxRedirects) context.become(waitingForResponse(sender, request, retry, storage, maxRedirects) .orElse(handleErrors(sender, request, retry, storage, maxRedirects))) } From 52699436ed6e84a1f771db1574e295cf2e421e1a Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Mon, 3 Oct 2016 19:10:26 -0300 Subject: [PATCH 114/268] Move telemetry cache to core --- .../ignition/core/utils/TelemetryCache.scala | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 src/main/scala/ignition/core/utils/TelemetryCache.scala diff --git a/src/main/scala/ignition/core/utils/TelemetryCache.scala b/src/main/scala/ignition/core/utils/TelemetryCache.scala new file mode 100644 index 00000000..d86f98bc --- /dev/null +++ b/src/main/scala/ignition/core/utils/TelemetryCache.scala @@ -0,0 +1,45 @@ +package ignition.core.utils + +import ignition.core.utils.TelemetryCache.TelemetryCacheReporter +import spray.caching.Cache + +import scala.concurrent.{ExecutionContext, Future} + +object TelemetryCache { + + def apply[V](cacheName: String, wrapped: Cache[V], reporter: TelemetryCacheReporter): Cache[V] = + new TelemetryCache[V](cacheName, wrapped, reporter) + + trait TelemetryCacheReporter { + def onHit(name: String): Unit + def onMiss(name: String): Unit + } + +} + +class TelemetryCache[V](cacheName: String, wrapped: Cache[V], reporter: TelemetryCacheReporter) extends 
Cache[V] { + + override def apply(key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { + val value = wrapped.get(key) + if (value.isDefined) { + reporter.onHit(cacheName) + value.get + } else { + reporter.onMiss(cacheName) + wrapped.apply(key, genValue) + } + } + + override def get(key: Any): Option[Future[V]] = wrapped.get(key) + + override def clear(): Unit = wrapped.clear() + + override def size: Int = wrapped.size + + override def remove(key: Any): Option[Future[V]] = wrapped.remove(key) + + override def keys: Set[Any] = wrapped.keys + + override def ascendingKeys(limit: Option[Int]): Iterator[Any] = wrapped.ascendingKeys(limit) + +} From d89a08e0a07720c704803b5d20542bd8948f3207 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 19 Oct 2016 17:09:12 -0200 Subject: [PATCH 115/268] Make retry configurable --- .../core/http/AsyncHttpClientStreamApi.scala | 4 +- .../core/http/AsyncSprayHttpClient.scala | 13 +++-- src/main/scala/ignition/core/http/Retry.scala | 57 ++++++++++++------- .../scala/ignition/core/http/RetrySpec.scala | 39 +++++++++++++ 4 files changed, 87 insertions(+), 26 deletions(-) create mode 100644 src/test/scala/ignition/core/http/RetrySpec.scala diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index 5ec528ae..4910c98a 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -73,7 +73,7 @@ object AsyncHttpClientStreamApi { trait AsyncHttpClientStreamApi { - def makeRequest(request: AsyncHttpClientStreamApi.Request, initialBackoff: FiniteDuration = 100 milliseconds, retryOnHttpStatus: Seq[Int] = List.empty) - (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback = AsyncHttpClientStreamApi.NoOpReporter): Future[AsyncHttpClientStreamApi.StreamResponse] + def makeRequest(request: 
AsyncHttpClientStreamApi.Request, retryConf: RetryConf = RetryConf(), retryOnHttpStatus: Seq[Int] = List.empty) + (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback = AsyncHttpClientStreamApi.NoOpReporter): Future[AsyncHttpClientStreamApi.StreamResponse] } \ No newline at end of file diff --git a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala index a2dc312d..0565fe2f 100644 --- a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala +++ b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala @@ -29,13 +29,16 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { implicit def actorRefFactory: ActorRefFactory def executionContext: ExecutionContext = actorRefFactory.dispatcher - override def makeRequest(request: AsyncHttpClientStreamApi.Request, backoff: FiniteDuration, retryOnHttpStatus: Seq[Int]) + override def makeRequest(request: AsyncHttpClientStreamApi.Request, retryConf: RetryConf, retryOnHttpStatus: Seq[Int]) (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback): Future[AsyncHttpClientStreamApi.StreamResponse] = { - val processor = actorRefFactory.actorOf(Props(new RequestProcessorActor(timeout, reporter, backoff, retryOnHttpStatus))) + val processor = actorRefFactory.actorOf(Props(new RequestProcessorActor(timeout, reporter, retryConf, retryOnHttpStatus))) (processor ? 
request).mapTo[AsyncHttpClientStreamApi.StreamResponse] } - private class RequestProcessorActor(timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback, backoff: FiniteDuration, retryOnHttpStatus: Seq[Int]) + private class RequestProcessorActor(timeout: Timeout, + reporter: AsyncHttpClientStreamApi.ReporterCallback, + retryConf: RetryConf, + retryOnHttpStatus: Seq[Int]) extends Actor with ActorLogging { @@ -159,7 +162,7 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { log.debug("Starting request {}", request) reporter.onRequest(request) executeSprayRequest(request) - val retry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, timeoutBackoff = backoff) + val retry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, conf = retryConf) val storage = new ByteStorage() val maxRedirects = request.requestConfiguration.flatMap(_.maxRedirects).getOrElse(RequestConfiguration.defaultMaxRedirects) @@ -283,7 +286,7 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { log.debug("Making redirect to {}", newLocation) val newRequest = oldRequest.copy(url = newLocation) executeSprayRequest(newRequest) - val newRetry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, timeoutBackoff = backoff) + val newRetry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, conf = retryConf) val newStorage = new ByteStorage() waitingForResponse(commander, newRequest, newRetry, newStorage, newRemainingRedirects) .orElse(handleErrors(commander, newRequest, newRetry, newStorage, newRemainingRedirects)) diff --git a/src/main/scala/ignition/core/http/Retry.scala b/src/main/scala/ignition/core/http/Retry.scala index 03d86db6..1c94828b 100644 --- a/src/main/scala/ignition/core/http/Retry.scala +++ b/src/main/scala/ignition/core/http/Retry.scala @@ -1,12 +1,12 @@ package ignition.core.http -import java.util.Random import java.util.concurrent.TimeUnit import 
org.joda.time.DateTime import scala.concurrent.duration.{Duration, FiniteDuration, _} import scala.language.postfixOps +import scala.util.Random object Retry { @@ -14,52 +14,71 @@ object Retry { case object Timeout extends State case object Error extends State - val random = new Random - - val _maxWaitForNextRetry = 10 - - def exponentialBackOff(r: Int): FiniteDuration = { - val exponent: Double = scala.math.min(r, _maxWaitForNextRetry) - scala.math.pow(2, exponent).round * (random.nextInt(30) + 100 milliseconds) + def exponentialBackOff(base: Int, + exponent: Int, + initialBackoff: FiniteDuration, + maxBackoff: FiniteDuration, + maxRandom: FiniteDuration): FiniteDuration = { + val randomMillis = maxRandom.toMillis.toInt + val random = if (randomMillis > 0) + FiniteDuration(Random.nextInt(randomMillis), TimeUnit.MILLISECONDS) + else + FiniteDuration(0, TimeUnit.MILLISECONDS) + + val calculated = scala.math.pow(base, exponent).round * (random + initialBackoff) + calculated.min(maxBackoff) } } -case class Retry(startTime: DateTime, +case class RetryConf(initialTimeoutBackoff: FiniteDuration = 100 milliseconds, + maxErrors: Int = 10, + initialBackoffOnError: FiniteDuration = 100 milliseconds, + timeoutMultiplicationFactor: Int = 2, + errorMultiplicationFactor: Int = 2, + maxBackoff: FiniteDuration = 1 minute, + maxRandom: FiniteDuration = 30 milliseconds) + +case class Retry(conf: RetryConf, + startTime: DateTime, timeout: FiniteDuration, state: Retry.State = Retry.Timeout, timeoutCount: Int = 0, - timeoutBackoff: FiniteDuration = 100 milliseconds, - maxErrors: Int = 10, - errorsCount: Int = 0, - backoffOnError: FiniteDuration = 100 milliseconds) { + errorsCount: Int = 0) { import Retry._ + protected def now = DateTime.now + + private def errorBackoff = + exponentialBackOff(conf.errorMultiplicationFactor, Math.max(errorsCount - 1, 0), conf.initialBackoffOnError, conf.maxBackoff, conf.maxRandom) + private def timeoutBackoff = + 
exponentialBackOff(conf.timeoutMultiplicationFactor, Math.max(timeoutCount - 1, 0), conf.initialTimeoutBackoff, conf.maxBackoff, conf.maxRandom) + def onError(): Retry = - copy(errorsCount = errorsCount + 1, backoffOnError = exponentialBackOff(errorsCount + 1), state = Retry.Error) + copy(errorsCount = errorsCount + 1, state = Retry.Error) - def onTimeout(): Retry = copy(timeoutCount = timeoutCount + 1, timeoutBackoff = exponentialBackOff(timeoutCount + 1), state = Retry.Timeout) + def onTimeout(): Retry = copy(timeoutCount = timeoutCount + 1, state = Retry.Timeout) def backoff(): FiniteDuration = state match { case Timeout => timeoutBackoff - case Error => backoffOnError + case Error => errorBackoff } private def canRetryMore(durations: FiniteDuration*): Boolean = { val maxTime = startTime.plusMillis(timeout.toMillis.toInt) - val nextEstimatedTime = DateTime.now.plusMillis(durations.map(_.toMillis.toInt).sum) + val nextEstimatedTime = now.plusMillis(durations.map(_.toMillis.toInt).sum) nextEstimatedTime.isBefore(maxTime) } // This is an approximation and we are ignoring the time waiting on backoff. 
// In this way we are overestimating the average request duration, which is fine because it's better to abort early than wait too much time exceed AskTimeouts private def averageRequestDuration = - Duration((DateTime.now.getMillis - startTime.getMillis) / Math.max(timeoutCount + errorsCount, 1), TimeUnit.MILLISECONDS) + Duration((now.getMillis - startTime.getMillis) / Math.max(timeoutCount + errorsCount, 1), TimeUnit.MILLISECONDS) def shouldGiveUp(): Boolean = state match { case Timeout => !canRetryMore(averageRequestDuration, timeoutBackoff) - case Error => !canRetryMore(averageRequestDuration, backoffOnError) || errorsCount > maxErrors + case Error => !canRetryMore(averageRequestDuration, errorBackoff) || errorsCount > conf.maxErrors } } \ No newline at end of file diff --git a/src/test/scala/ignition/core/http/RetrySpec.scala b/src/test/scala/ignition/core/http/RetrySpec.scala new file mode 100644 index 00000000..88528568 --- /dev/null +++ b/src/test/scala/ignition/core/http/RetrySpec.scala @@ -0,0 +1,39 @@ +package ignition.core.http + +import org.joda.time.DateTime +import org.scalatest.{FlatSpec, Matchers} + +import scala.concurrent.duration._ + +class RetrySpec extends FlatSpec with Matchers { + "Retry" should "return the initial backoff" in { + val now = DateTime.now + val timeout = 60.seconds + + val retry = Retry(RetryConf(initialBackoffOnError = 123.milliseconds, initialTimeoutBackoff = 456.milliseconds, maxRandom = 0.seconds), now, timeout) + + retry.onError().backoff() shouldBe 123.millisecond + retry.onTimeout().backoff() shouldBe 456.millisecond + } + + it should "multiply by the factor on second time" in { + + val now = DateTime.now + val timeout = 60.seconds + + val retry = Retry(RetryConf(initialBackoffOnError = 123.milliseconds, initialTimeoutBackoff = 456.milliseconds, maxRandom = 0.seconds, timeoutMultiplicationFactor = 3, errorMultiplicationFactor = 5), now, timeout) + + retry.onError().onError().backoff() shouldBe (123 * 5).millisecond + 
retry.onTimeout().onTimeout().backoff() shouldBe (456 * 3).millisecond + } + + it should "not explode if called with no errors or timeouts" in { + val now = DateTime.now + val timeout = 60.seconds + + val retry = Retry(RetryConf(maxRandom = 0.seconds), now, timeout) + + retry.backoff() shouldBe 100.milliseconds + } + +} From a7ee8d415127420babb85324e028ca0f215390a5 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Fri, 21 Oct 2016 16:53:46 -0200 Subject: [PATCH 116/268] Multiple level cache: local with remote fallback --- .../core/cache/MultipleLevelCache.scala | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 src/main/scala/ignition/core/cache/MultipleLevelCache.scala diff --git a/src/main/scala/ignition/core/cache/MultipleLevelCache.scala b/src/main/scala/ignition/core/cache/MultipleLevelCache.scala new file mode 100644 index 00000000..83a5c0b8 --- /dev/null +++ b/src/main/scala/ignition/core/cache/MultipleLevelCache.scala @@ -0,0 +1,152 @@ +package ignition.core.cache + +import ignition.core.utils.FutureUtils._ + +import scala.concurrent.{ExecutionContext, Future} +import scala.util.{Failure, Success, Try} + +trait SimpleCache[V] { + def apply(key: Any, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] +} + +trait LocalCache[V] extends SimpleCache[V] { + def get(key: Any): Option[Future[V]] + def set(key: Any, value: Try[V]): Boolean +} + +trait RemoteWritableCache[V] { + def set(key: Any, value: Try[V])(implicit ec: ExecutionContext): Future[Boolean] +} + +trait RemoteReadableCache[V] { + def get(key: Any)(implicit ec: ExecutionContext): Future[Option[V]] +} + +trait RemoteCacheRW[V] extends SimpleCache[V] with RemoteReadableCache[V] with RemoteWritableCache[V] + + +case class LocalAsRemote[V](local: LocalCache[V]) extends RemoteCacheRW[V] { + override def get(key: Any)(implicit ec: ExecutionContext): Future[Option[V]] = + local.get(key).map(_.map(Option.apply)).getOrElse(Future.successful(None)) + 
+ override def set(key: Any, value: Try[V])(implicit ec: ExecutionContext): Future[Boolean] = + Future.successful(local.set(key, value)) + + override def apply(key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = + apply(key, genValue) +} + +case class MultipleLevelCache[V](localCache: LocalCache[V], + remoteRW: List[RemoteCacheRW[V]], + remoteReadOnly: List[RemoteReadableCache[V]]) extends SimpleCache[V] { + val allReadableCaches: Array[RemoteReadableCache[V]] = + (LocalAsRemote(localCache) +: (remoteRW ++ remoteReadOnly)).toArray + + // This can be called by multiple instances simultaneously but in the end + // only the one that wins the race will create the final value that will be set in + // the remote caches and read by the other instances + // Unless of course there is some error getting stuff from remote cache + // in which case the local value may be returned + def canonicalValueGenerator(key: Any, genValue: () => Future[V])() = { + val fLocalValue = genValue() + val finalValue: Future[V] = fLocalValue.asTry().flatMap { + case tLocalValue @ Success(localValue) => + // Successfully generated value, try to set it in the first remote Writable cache + remoteRW match { + // No remote cache available, just return this value to be set on local cache + case Nil => + Future.successful(localValue) + // We have at least one remote RW cache + case first :: others => + first.set(key, tLocalValue).asTry().flatMap { + case Success(true) => + // Successfully inserted on first remote store, propagate value to other remote rw caches + // We do it in a fire and forget approach, we only guarantee the data is in the first cache + others.foreach(_.set(key, tLocalValue)) + // Return this value to be set on the local cache + Future.successful(localValue) + case Success(false) => + // There is already a value there, we lost the race, ours won't be the canonical one, try to get it + first.get(key).asTry().flatMap { + case Success(Some(remoteValue)) => + 
// Just return it + Future.successful(remoteValue) + case Success(None) => + // WTF? the set operation said it was there but now the value disappeared?! + // So return our value which is good and hope for the best + // TODO: generate metric and log here + Future.successful(localValue) + case Failure(_) => + // Oh noes, we failed to get the canonical value + // We are supposing any retries have already been done by the cache implementation + // So return our value which is good and hope for the best + // TODO: generate metric and log here + Future.successful(localValue) + } + case Failure(_) => + // Oh noes, we failed to set the canonical value + // We are supposing any retries have already been done by the cache implementation + // So return our value which is good and hope for the best + // TODO: generate metric and log here + Future.successful(localValue) + } + } + case Failure(eLocal) => + // We failed to generate the value ourselves, our only hope is if someone else successfully did it in the meantime + remoteRW match { + case Nil => + // There are no remote RW caches + // FIXME: perhaps try the read only caches (but we can just be wasting time doing that) + // TODO: generate metric and log here + Future.failed(eLocal) + case first :: others => + first.get(key).asTry().flatMap { + case Success(Some(remoteValue)) => + // Hooray, someone calculated and set the value, return it + Future.successful(remoteValue) + case Success(None) => + // Sadly, there is no value on remote, we failed! 
+ // FIXME: perhaps try other caches (but we can just be wasting time doing that) + // TODO: generate metric and log here + Future.failed(eLocal) + case Failure(eRemote) => + // Oh noes, this failed + // We are supposing any retries have already been done by the cache implementation + // And to make things worse, we don't have a good value + // So return a failure + // FIXME: perhaps try other caches (but we can just be wasting time doing that) + // TODO: generate metric and log here + Future.failed(eLocal) + } + } + } + finalValue + } + + def indexedApply(index: Int, key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { + if (index >= allReadableCaches.size) { // nothing found on our caches, calculate value + // We could generate the value then set on local cache, but calling apply guarantees + // canonicalValueGenerator will be called only once in this instance (supposing LocalCache works like Spray Cache) + localCache(key, canonicalValueGenerator(key, genValue)) + } else { + allReadableCaches(index).get(key).asTry().flatMap { + case Success(None) => + // Try next cache + indexedApply(index + 1, key, genValue) + case Success(Some(value)) => + Future.successful(value) + case Failure(e) => + // Oh noes, this failed + // We are supposing any retries have already been done by the cache implementation + // So try the next one, we don't have many options + // TODO: generate metric and log here + indexedApply(index + 1, key, genValue) + } + } + } + + override def apply(key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = + indexedApply(0, key, genValue) +} + +case class ExpiringRedisWithAsyncUpdate() \ No newline at end of file From e07fe16fe0d78fa91fa0ecc2f4964056379dc8c6 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Wed, 26 Oct 2016 15:56:52 -0200 Subject: [PATCH 117/268] Now it implements async updates --- .../core/cache/MultipleLevelCache.scala | 326 ++++++++++++------ .../ignition/core/utils/FutureUtils.scala | 13 + 2 files changed, 232 insertions(+), 107 deletions(-) diff --git a/src/main/scala/ignition/core/cache/MultipleLevelCache.scala b/src/main/scala/ignition/core/cache/MultipleLevelCache.scala index 83a5c0b8..6747b4cf 100644 --- a/src/main/scala/ignition/core/cache/MultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/MultipleLevelCache.scala @@ -1,120 +1,187 @@ package ignition.core.cache +import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap import ignition.core.utils.FutureUtils._ +import org.joda.time.{DateTime, Period} +import org.slf4j.LoggerFactory -import scala.concurrent.{ExecutionContext, Future} +import scala.concurrent.duration._ +import scala.concurrent.{ExecutionContext, Future, Promise} import scala.util.{Failure, Success, Try} -trait SimpleCache[V] { - def apply(key: Any, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] -} -trait LocalCache[V] extends SimpleCache[V] { - def get(key: Any): Option[Future[V]] - def set(key: Any, value: Try[V]): Boolean -} +object ExpiringMultipleLevelCache { + case class TimestampedValue[V](date: DateTime, value: V) { + def hasExpired(ttl: Period, now: DateTime): Boolean = { + date.plus(ttl).isBefore(now) + } + } -trait RemoteWritableCache[V] { - def set(key: Any, value: Try[V])(implicit ec: ExecutionContext): Future[Boolean] -} + trait GenericCache[V] { + def apply(key: String, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] + } -trait RemoteReadableCache[V] { - def get(key: Any)(implicit ec: ExecutionContext): Future[Option[V]] -} + trait LocalCache[V] extends GenericCache[V] { + def get(key: Any): Option[Future[V]] + def set(key: Any, value: Try[V]): Unit + } -trait RemoteCacheRW[V] extends SimpleCache[V] with RemoteReadableCache[V] 
with RemoteWritableCache[V] + trait RemoteWritableCache[V] { + def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] + def setLock(key: String, ttl: FiniteDuration): Future[Boolean] + } + trait RemoteReadableCache[V] { + def get(key: String)(implicit ec: ExecutionContext): Future[Option[V]] + } -case class LocalAsRemote[V](local: LocalCache[V]) extends RemoteCacheRW[V] { - override def get(key: Any)(implicit ec: ExecutionContext): Future[Option[V]] = - local.get(key).map(_.map(Option.apply)).getOrElse(Future.successful(None)) + trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] - override def set(key: Any, value: Try[V])(implicit ec: ExecutionContext): Future[Boolean] = - Future.successful(local.set(key, value)) + trait ReporterCallback { + def onError(key: String, t: Throwable): Unit + def onRemoteGiveup(key: String): Unit + } + + object NoOpReporter extends ReporterCallback { + def onError(key: String, t: Throwable): Unit = {} + def onRemoteGiveup(key: String): Unit = {} + } - override def apply(key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = - apply(key, genValue) } -case class MultipleLevelCache[V](localCache: LocalCache[V], - remoteRW: List[RemoteCacheRW[V]], - remoteReadOnly: List[RemoteReadableCache[V]]) extends SimpleCache[V] { - val allReadableCaches: Array[RemoteReadableCache[V]] = - (LocalAsRemote(localCache) +: (remoteRW ++ remoteReadOnly)).toArray + +import ExpiringMultipleLevelCache._ + + + +case class ExpiringMultipleLevelCache[V](ttl: Period, + localCache: LocalCache[TimestampedValue[V]], + remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, + reporter: ExpiringMultipleLevelCache.ReporterCallback = ExpiringMultipleLevelCache.NoOpReporter, + maxErrorsToRetryOnRemote: Int = 5) extends GenericCache[V] { + + private val logger = LoggerFactory.getLogger(getClass) + + private val tempUpdate = new ConcurrentLinkedHashMap.Builder[Any, 
Future[TimestampedValue[V]]].build() + + protected def now = DateTime.now + + private def timestamp(v: V) = TimestampedValue(now, v) + + private def remoteLockKey(key: Any) = s"$key-emlc-lock" + + private val remoteLockTTL = 10.seconds + + // This method tries to guarantee that everyone that calls it in + // a given moment will be left with the same value in the end + private def remoteSetOrGet(key: String, + calculatedValue: TimestampedValue[V], + remote: RemoteCacheRW[TimestampedValue[V]], + currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + if (currentRetry > maxErrorsToRetryOnRemote) { + reporter.onRemoteGiveup(key) + // TODO: generate metric and log here + // Use our calculated value as it's the best we can do + Future.successful(calculatedValue) + } else { + remote.setLock(remoteLockKey(key), remoteLockTTL).asTry().flatMap { + case Success(true) => + // Lock acquired, get the current value and replace it + remote.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + // Current value is good, just return it + Future.successful(remoteValue) + case Success(_) => + // The remote value is missing or has expired + // We have the lock to replace this value. Our calculated value will be the canonical one!
+ remote.set(key, calculatedValue).asTry().flatMap { + case Success(_) => + // Flawless victory + Future.successful(calculatedValue) + case Failure(e) => + // TODO: generate metric and log here + // Retry failure + remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + } + case Failure(_) => + // TODO: generate metric and log here + // Retry failure + remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + } + case Success(false) => + // Someone got the lock, let's take a look at the value + remote.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + // Current value is good, just return it + Future.successful(remoteValue) + case Success(_) => + // The value is missing or has expired + // Let's start from scratch because we need to be able to set or get a good value + // Note: do not increment retry because this isn't an error + remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry) + case Failure(e) => + // TODO: generate metric and log here + // Retry + remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + } + case Failure(_) => + // TODO: generate metric and log here + // Retry failure + remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + } + } + } + + private def remoteGetWithRetryOnError(key: String, + remote: RemoteCacheRW[TimestampedValue[V]], + currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + remote.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + Future.successful(remoteValue) + case Success(_) => + Future.failed(new Exception("No good value found on remote")) + case Failure(e) => + if (currentRetry >= maxErrorsToRetryOnRemote) { + // TODO: generate metric and log here + Future.failed(e) + } else { + // Retry + remoteGetWithRetryOnError(key, remote, currentRetry = currentRetry + 1) + } + } + } 
// This can be called by multiple instances simultaneously but in the end // only the one that wins the race will create the final value that will be set in // the remote caches and read by the other instances // Unless of course there is some error getting stuff from remote cache - // in which case the local value may be returned - def canonicalValueGenerator(key: Any, genValue: () => Future[V])() = { - val fLocalValue = genValue() - val finalValue: Future[V] = fLocalValue.asTry().flatMap { - case tLocalValue @ Success(localValue) => - // Successfully generated value, try to set it in the first remote Writable cache + // in which case the locally generated value may be returned + protected def canonicalValueGenerator(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext) = { + val fGeneratedValue = Try { genValue().map(timestamp) }.asFutureTry() + val finalValue: Future[TimestampedValue[V]] = fGeneratedValue.flatMap { + case Success(generatedValue) => + // Successfully generated value, try to set it in the remote writable cache remoteRW match { // No remote cache available, just return this value to be set on local cache - case Nil => - Future.successful(localValue) - // We have at least one remote RW cache - case first :: others => - first.set(key, tLocalValue).asTry().flatMap { - case Success(true) => - // Successfully inserted on first remote store, propagate value to other remote rw caches - // We do it in a fire and forget approach, we only guarantee the data is in the first cache - others.foreach(_.set(key, tLocalValue)) - // Return this value to be set on the local cache - Future.successful(localValue) - case Success(false) => - // There is already a value there, we lost the race, ours won't be the canonical one, try to get it - first.get(key).asTry().flatMap { - case Success(Some(remoteValue)) => - // Just return it - Future.successful(remoteValue) - case Success(None) => - // WTF? 
the set operation said it was there but now the value disappeared?! - // So return our value which is good and hope for the best - // TODO: generate metric and log here - Future.successful(localValue) - case Failure(_) => - // Oh noes, we failed to get the canonical value - // We are supposing any retries have already been done by the cache implementation - // So return our value which is good and hope for the best - // TODO: generate metric and log here - Future.successful(localValue) - } - case Failure(_) => - // Oh noes, we failed to set the canonical value - // We are supposing any retries have already been done by the cache implementation - // So return our value which is good and hope for the best - // TODO: generate metric and log here - Future.successful(localValue) - } + case None => + Future.successful(generatedValue) + case Some(remote) => + remoteSetOrGet(key, generatedValue, remote) } case Failure(eLocal) => - // We failed to generate the value ourselves, our only hope is if someone else successfully did it in the meantime + // We failed to generate the value ourselves, our hope is if someone else successfully did it in the meantime remoteRW match { - case Nil => + case None => // There are no remote RW caches - // FIXME: perhaps try the read only caches (but we can just be wasting time doing that) // TODO: generate metric and log here Future.failed(eLocal) - case first :: others => - first.get(key).asTry().flatMap { - case Success(Some(remoteValue)) => - // Hooray, someone calculated and set the value, return it - Future.successful(remoteValue) - case Success(None) => - // Sadly, there is no value on remote, we failed! 
- // FIXME: perhaps try other caches (but we can just be wasting time doing that) + case Some(remote) => + remoteGetWithRetryOnError(key, remote).asTry().flatMap { + case Success(v) => // TODO: generate metric and log here - Future.failed(eLocal) + Future.successful(v) case Failure(eRemote) => - // Oh noes, this failed - // We are supposing any retries have already been done by the cache implementation - // And to make things worse, we don't have a good value - // So return a failure - // FIXME: perhaps try other caches (but we can just be wasting time doing that) + // The real error is the eLocal, return it // TODO: generate metric and log here Future.failed(eLocal) } @@ -123,30 +190,75 @@ case class MultipleLevelCache[V](localCache: LocalCache[V], finalValue } - def indexedApply(index: Int, key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { - if (index >= allReadableCaches.size) { // nothing found on our caches, calculate value - // We could generate the value then set on local cache, but calling apply guarantees - // canonicalValueGenerator will be called only once in this instance (supposing LocalCache works like Spray Cache) - localCache(key, canonicalValueGenerator(key, genValue)) - } else { - allReadableCaches(index).get(key).asTry().flatMap { - case Success(None) => - // Try next cache - indexedApply(index + 1, key, genValue) - case Success(Some(value)) => - Future.successful(value) - case Failure(e) => - // Oh noes, this failed - // We are supposing any retries have already been done by the cache implementation - // So try the next one, we don't have many options - // TODO: generate metric and log here - indexedApply(index + 1, key, genValue) - } + // Note: this method may return a failed future, but it will never cache it + private def tryGenerateAndSet(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + val promise = Promise[TimestampedValue[V]]() + 
tempUpdate.putIfAbsent(key, promise.future) match { + case null => + canonicalValueGenerator(key, genValue).onComplete { + case Success(v) => + localCache.set(key, Success(v)) + promise.trySuccess(v) + tempUpdate.remove(key) + case Failure(e) => + // Note: we don't save failures to cache + promise.tryFailure(e) + tempUpdate.remove(key) + } + promise.future + case fTrying => fTrying } } - override def apply(key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = - indexedApply(0, key, genValue) + override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = + localCache.get(key).map(_.asTry()) match { + case Some(future) => + future.flatMap { + case Success(localValue) if !localValue.hasExpired(ttl, now) => + // We have locally a good value, just return it + Future.successful(localValue.value) + case Success(expiredLocalValue) if remoteRW.nonEmpty => + // We have locally an expired value, but we can check a remote cache for better value + remoteRW.get.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + // Remote is good, set locally and return it + localCache.set(key, Success(remoteValue)) + Future.successful(remoteValue.value) + case Success(Some(_)) | Success(None) => + // No good remote, return local, async update both + tryGenerateAndSet(key, genValue) + Future.successful(expiredLocalValue.value) + case Failure(e) => + // TODO: log, generate metrics + tryGenerateAndSet(key, genValue) + Future.successful(expiredLocalValue.value) + } + case Success(expiredLocalValue) if remoteRW.isEmpty => + tryGenerateAndSet(key, genValue) + Future.successful(expiredLocalValue.value) + case Failure(e) => + // This is almost impossible to happen because it's local and we don't save failed values + // TODO: log, generate metrics + tryGenerateAndSet(key, genValue).map(_.value) + } + case None if remoteRW.nonEmpty => + // No local, let's try remote + 
remoteRW.get.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + // Remote is good, set locally and return it + localCache.set(key, Success(remoteValue)) + Future.successful(remoteValue.value) + case Success(Some(_)) | Success(None) => + // No good remote, sync generate + tryGenerateAndSet(key, genValue).map(_.value) + case Failure(e) => + // TODO: log, generate metrics + tryGenerateAndSet(key, genValue).map(_.value) + } + case None if remoteRW.isEmpty => + // No local and no remote to look, just generate it + tryGenerateAndSet(key, genValue).map(_.value) + } } case class ExpiringRedisWithAsyncUpdate() \ No newline at end of file diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 55853826..684c950b 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -11,6 +11,7 @@ object FutureUtils { def blockingFuture[T](body: =>T)(implicit ec: ExecutionContext): Future[T] = Future { blocking { body } } + implicit class FutureImprovements[V](future: Future[V]) { def toOptionOnFailure(errorHandler: (Throwable) => Option[V])(implicit ec: ExecutionContext): Future[Option[V]] = { future.map(Option.apply).recover { case t => errorHandler(t) } @@ -37,6 +38,18 @@ object FutureUtils { } } + implicit class TryFutureImprovements[V](future: Try[Future[V]]) { + // Works like asTry(), but will also wrap the outer Try inside the Future + def asFutureTry()(implicit ec: ExecutionContext): Future[Try[V]] = { + future match { + case Success(f) => + f.asTry() + case Failure(e) => + Future.successful(Failure(e)) + } + } + } + implicit class FutureGeneratorImprovements[V](generator: Iterable[() => Future[V]]){ def toLazyIterable(batchSize: Int = 1)(implicit ec: ExecutionContext): Iterable[Future[V]] = new Iterable[Future[V]] { override def iterator = new Iterator[Future[V]] { From 
5018bf9b951128e137897d05ecad00989238990c Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 26 Oct 2016 19:35:35 -0200 Subject: [PATCH 118/268] Added spray cache and minor improvements --- ...scala => ExpiringMultipleLevelCache.scala} | 377 +++++++++++------- .../scala/spray/cache/ExpiringLruCache.scala | 139 +++++++ 2 files changed, 367 insertions(+), 149 deletions(-) rename src/main/scala/ignition/core/cache/{MultipleLevelCache.scala => ExpiringMultipleLevelCache.scala} (54%) create mode 100644 src/main/scala/spray/cache/ExpiringLruCache.scala diff --git a/src/main/scala/ignition/core/cache/MultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala similarity index 54% rename from src/main/scala/ignition/core/cache/MultipleLevelCache.scala rename to src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index 6747b4cf..5469e308 100644 --- a/src/main/scala/ignition/core/cache/MultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -2,9 +2,11 @@ package ignition.core.cache import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap import ignition.core.utils.FutureUtils._ +import ignition.core.utils.DateUtils._ import org.joda.time.{DateTime, Period} import org.slf4j.LoggerFactory + import scala.concurrent.duration._ import scala.concurrent.{ExecutionContext, Future, Promise} import scala.util.{Failure, Success, Try} @@ -23,7 +25,7 @@ object ExpiringMultipleLevelCache { trait LocalCache[V] extends GenericCache[V] { def get(key: Any): Option[Future[V]] - def set(key: Any, value: Try[V]): Unit + def set(key: Any, value: V): Unit } trait RemoteWritableCache[V] { @@ -38,22 +40,42 @@ object ExpiringMultipleLevelCache { trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] trait ReporterCallback { - def onError(key: String, t: Throwable): Unit - def onRemoteGiveup(key: String): Unit + def onCacheMissNothingFound() + 
def onCacheMissButFoundExpiredLocal() + def onCacheMissButFoundExpiredRemote() + def onRemoteCacheHit() + def onLocalCacheHit() + def onUnexpectedBehaviour() + def onStillTryingToLockOrGet() + def onSuccessfullyRemoteSetValue() + def onRemoteCacheHitAfterGenerating() + def onErrorGeneratingValue(key: String, eLocal: Throwable) + def onLocalError(key: String, e: Throwable) + def onRemoteError(key: String, t: Throwable): Unit + def onRemoteGiveUp(key: String): Unit } object NoOpReporter extends ReporterCallback { - def onError(key: String, t: Throwable): Unit = {} - def onRemoteGiveup(key: String): Unit = {} + override def onCacheMissNothingFound(): Unit = {} + override def onUnexpectedBehaviour(): Unit = {} + override def onSuccessfullyRemoteSetValue(): Unit = {} + override def onRemoteError(key: String, t: Throwable): Unit = {} + override def onRemoteGiveUp(key: String): Unit = {} + override def onLocalError(key: String, e: Throwable): Unit = {} + override def onErrorGeneratingValue(key: String, eLocal: Throwable): Unit = {} + override def onRemoteCacheHitAfterGenerating(): Unit = {} + override def onCacheMissButFoundExpiredRemote(): Unit = {} + override def onStillTryingToLockOrGet(): Unit = {} + override def onLocalCacheHit(): Unit = {} + override def onRemoteCacheHit(): Unit = {} + override def onCacheMissButFoundExpiredLocal(): Unit = {} } - } import ExpiringMultipleLevelCache._ - case class ExpiringMultipleLevelCache[V](ttl: Period, localCache: LocalCache[TimestampedValue[V]], remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, @@ -70,7 +92,182 @@ case class ExpiringMultipleLevelCache[V](ttl: Period, private def remoteLockKey(key: Any) = s"$key-emlc-lock" - private val remoteLockTTL = 10.seconds + private val remoteLockTTL = 5.seconds + + + // The idea is simple, have two caches: remote and local + // with values that will eventually expire but still be left on the cache + // while a new value is asynchronously being calculated/retrieved + override 
def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = + // The local cache is always the first try. We'll only look the remote if the local value is missing or has expired + localCache.get(key).map(_.asTry()) match { + case Some(future) => + future.flatMap { + case Success(localValue) if !localValue.hasExpired(ttl, now) => + // We have locally a good value, just return it + reporter.onLocalCacheHit() + Future.successful(localValue.value) + case Success(expiredLocalValue) if remoteRW.nonEmpty => + // We have locally an expired value, but we can check a remote cache for better value + remoteRW.get.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + // Remote is good, set locally and return it + reporter.onRemoteCacheHit() + localCache.set(key, remoteValue) + Future.successful(remoteValue.value) + case Success(Some(expiredRemote)) => + // Expired local and expired remote, return the most recent of them, async update both + reporter.onCacheMissButFoundExpiredRemote() + tryGenerateAndSet(key, genValue) + val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) + Future.successful(mostRecent.value) + case Success(None) => + // No remote found, return local, async update both + reporter.onCacheMissButFoundExpiredLocal() + tryGenerateAndSet(key, genValue) + Future.successful(expiredLocalValue.value) + case Failure(e) => + reporter.onRemoteError(key, e) + logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key expired local value and failed to get remote", e) + tryGenerateAndSet(key, genValue) + Future.successful(expiredLocalValue.value) + } + case Success(expiredLocalValue) if remoteRW.isEmpty => + // There is no remote cache configured, we'are on our own + // Return expired value and try to generate a new one for the future + reporter.onCacheMissButFoundExpiredLocal() + tryGenerateAndSet(key, genValue) + Future.successful(expiredLocalValue.value) + case Failure(e) => + 
// This is almost impossible to happen because it's local and we don't save failed values + reporter.onLocalError(key, e) + logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key got a failed future from cache!? This is almost impossible!", e) + tryGenerateAndSet(key, genValue).map(_.value) + } + case None if remoteRW.nonEmpty => + // No local, let's try remote + remoteRW.get.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + // Remote is good, set locally and return it + reporter.onRemoteCacheHit() + localCache.set(key, remoteValue) + Future.successful(remoteValue.value) + case Success(Some(expiredRemote)) => + // Expired remote, return the it, async update + reporter.onCacheMissButFoundExpiredRemote() + tryGenerateAndSet(key, genValue).map(_.value) + Future.successful(expiredRemote.value) + case Success(None) => + // No good remote, sync generate + reporter.onCacheMissNothingFound() + tryGenerateAndSet(key, genValue).map(_.value) + case Failure(e) => + reporter.onRemoteError(key, e) + logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key expired local value and no remote configured", e) + tryGenerateAndSet(key, genValue).map(_.value) + } + case None if remoteRW.isEmpty => + // No local and no remote to look, just generate it + // The caller will need to wait for the value generation + reporter.onCacheMissNothingFound() + tryGenerateAndSet(key, genValue).map(_.value) + } + + // Note: this method may return a failed future, but it will never cache it + // Our main purpose here is to avoid multiple local calls to generate new promises/futures in parallel, + // so we use this Map keep everyone in sync + // This is similar to how spray cache works + private def tryGenerateAndSet(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + val promise = Promise[TimestampedValue[V]]() + tempUpdate.putIfAbsent(key, promise.future) match { + case null => + 
canonicalValueGenerator(key, genValue).onComplete { + case Success(v) if !v.hasExpired(ttl, now) => + localCache.set(key, v) + promise.trySuccess(v) + tempUpdate.remove(key) + case Success(v) => + // Have we generated/got an expired value!? + reporter.onUnexpectedBehaviour() + logger.warn(s"tryGenerateAndSet, key $key: unexpectedly generated/got an expired value: $v") + localCache.set(key, v) + promise.trySuccess(v) + tempUpdate.remove(key) + case Failure(e) => + // We don't save failures to cache + // There is no need to log here, canonicalValueGenerator will log everything already + promise.tryFailure(e) + tempUpdate.remove(key) + } + promise.future + case fTrying => + // If someone call us while a future is running, we return the running future + fTrying + } + } + + // This can be called by multiple instances/hosts simultaneously but in the end + // only the one that wins the race will create the final value that will be set in + // the remote cache and read by the other instances + // Unless of course there is some error getting stuff from remote cache + // in which case the locally generated value may be returned to avoid further delays + protected def canonicalValueGenerator(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext) = { + val fGeneratedValue = Try { genValue().map(timestamp) }.asFutureTry() + val finalValue: Future[TimestampedValue[V]] = fGeneratedValue.flatMap { + case Success(generatedValue) => + // Successfully generated value, try to set it in the remote writable cache + remoteRW match { + // No remote cache available, just return this value to be set on local cache + case None => + Future.successful(generatedValue) + case Some(remote) => + remoteSetOrGet(key, generatedValue, remote) + } + case Failure(eLocal) => + // We failed to generate the value ourselves, our hope is if someone else successfully did it in the meantime + reporter.onErrorGeneratingValue(key, eLocal) + remoteRW match { + case None => + // There are no remote 
RW caches + logger.error(s"canonicalValueGenerator, key $key: failed to generate value and no remote cache configured", eLocal) + Future.failed(eLocal) + case Some(remote) => + remoteGetNonExpiredValue(key, remote).asTry().flatMap { + case Success(v) => + logger.warn(s"canonicalValueGenerator, key $key: failed to generate value but got one from remote", eLocal) + Future.successful(v) + case Failure(eRemote) => + // The real error is the eLocal, return it + logger.error(s"canonicalValueGenerator, key $key: failed to generate value and failed to get remote", eLocal) + Future.failed(eLocal) + } + } + } + finalValue + } + + // Auxiliary method, only makes sense to be used by canonicalValueGenerator + private def remoteGetNonExpiredValue(key: String, + remote: RemoteCacheRW[TimestampedValue[V]], + currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + remote.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + Future.successful(remoteValue) + case Success(_) => + Future.failed(new Exception("No good value found on remote")) + case Failure(e) => + if (currentRetry >= maxErrorsToRetryOnRemote) { + reporter.onRemoteGiveUp(key) + logger.error(s"remoteGetWithRetryOnError, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors", e) + Future.failed(e) + } else { + reporter.onRemoteError(key, e) + logger.warn(s"remoteGetWithRetryOnError, key $key: got error trying to get value, retry $currentRetry of $maxErrorsToRetryOnRemote", e) + // Retry + remoteGetNonExpiredValue(key, remote, currentRetry = currentRetry + 1) + } + } + } // This methods tries to guarantee that everyone that calls it in // a given moment will be left with the same value in the end @@ -79,32 +276,39 @@ case class ExpiringMultipleLevelCache[V](ttl: Period, remote: RemoteCacheRW[TimestampedValue[V]], currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { if 
(currentRetry > maxErrorsToRetryOnRemote) { - reporter.onRemoteGiveup(key) - // TODO: generate metric and log here // Use our calculated value as it's the best we can do + reporter.onRemoteGiveUp(key) + logger.error(s"remoteSetOrGet, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors") Future.successful(calculatedValue) } else { remote.setLock(remoteLockKey(key), remoteLockTTL).asTry().flatMap { case Success(true) => + logger.info(s"remoteSetOrGet got lock for key $key") // Lock acquired, get the current value and replace it remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it + reporter.onRemoteCacheHitAfterGenerating() + logger.info(s"remoteSetOrGet got lock for $key but found already a good value on remote") Future.successful(remoteValue) case Success(_) => - // The remote value is missing or has expired + // The remote value is missing or has expired. This is what we were expecting // We have the lock to replace this value. Our calculated value will be the canonical one! remote.set(key, calculatedValue).asTry().flatMap { case Success(_) => - // Flawless victory + // Flawless victory! 
+ reporter.onSuccessfullyRemoteSetValue() + logger.info(s"remoteSetOrGet successfully set key $key while under lock") Future.successful(calculatedValue) case Failure(e) => - // TODO: generate metric and log here + reporter.onRemoteError(key, e) + logger.warn(s"remoteSetOrGet, key $key: got error setting the value, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry failure remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) } - case Failure(_) => - // TODO: generate metric and log here + case Failure(e) => + reporter.onRemoteError(key, e) + logger.warn(s"remoteSetOrGet, key $key: got error getting remote value with lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry failure remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) } @@ -113,152 +317,27 @@ case class ExpiringMultipleLevelCache[V](ttl: Period, remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it + reporter.onRemoteCacheHitAfterGenerating() Future.successful(remoteValue) case Success(_) => // The value is missing or has expired // Let's start from scratch because we need to be able to set or get a good value // Note: do not increment retry because this isn't an error + reporter.onStillTryingToLockOrGet() + logger.info(s"remoteSetOrGet couldn't lock key $key and didn't found good value on remote") remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry) case Failure(e) => - // TODO: generate metric and log here + reporter.onRemoteError(key, e) + logger.warn(s"remoteSetOrGet, key $key: got error getting remote value without lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) } - case Failure(_) => - // TODO: generate metric and log here + case Failure(e) => // Retry failure + reporter.onRemoteError(key, e) + 
logger.warn(s"remoteSetOrGet, key $key: got error trying to set lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) } } } - - private def remoteGetWithRetryOnError(key: String, - remote: RemoteCacheRW[TimestampedValue[V]], - currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { - remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => - Future.successful(remoteValue) - case Success(_) => - Future.failed(new Exception("No good value found on remote")) - case Failure(e) => - if (currentRetry >= maxErrorsToRetryOnRemote) { - // TODO: generate metric and log here - Future.failed(e) - } else { - // Retry - remoteGetWithRetryOnError(key, remote, currentRetry = currentRetry + 1) - } - } - } - - // This can be called by multiple instances simultaneously but in the end - // only the one that wins the race will create the final value that will be set in - // the remote caches and read by the other instances - // Unless of course there is some error getting stuff from remote cache - // in which case the locally generated value may be returned - protected def canonicalValueGenerator(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext) = { - val fGeneratedValue = Try { genValue().map(timestamp) }.asFutureTry() - val finalValue: Future[TimestampedValue[V]] = fGeneratedValue.flatMap { - case Success(generatedValue) => - // Successfully generated value, try to set it in the remote writable cache - remoteRW match { - // No remote cache available, just return this value to be set on local cache - case None => - Future.successful(generatedValue) - case Some(remote) => - remoteSetOrGet(key, generatedValue, remote) - } - case Failure(eLocal) => - // We failed to generate the value ourselves, our hope is if someone else successfully did it in the meantime - remoteRW match { - case None => - // There 
are no remote RW caches - // TODO: generate metric and log here - Future.failed(eLocal) - case Some(remote) => - remoteGetWithRetryOnError(key, remote).asTry().flatMap { - case Success(v) => - // TODO: generate metric and log here - Future.successful(v) - case Failure(eRemote) => - // The real error is the eLocal, return it - // TODO: generate metric and log here - Future.failed(eLocal) - } - } - } - finalValue - } - - // Note: this method may return a failed future, but it will never cache it - private def tryGenerateAndSet(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { - val promise = Promise[TimestampedValue[V]]() - tempUpdate.putIfAbsent(key, promise.future) match { - case null => - canonicalValueGenerator(key, genValue).onComplete { - case Success(v) => - localCache.set(key, Success(v)) - promise.trySuccess(v) - tempUpdate.remove(key) - case Failure(e) => - // Note: we don't save failures to cache - promise.tryFailure(e) - tempUpdate.remove(key) - } - promise.future - case fTrying => fTrying - } - } - - override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = - localCache.get(key).map(_.asTry()) match { - case Some(future) => - future.flatMap { - case Success(localValue) if !localValue.hasExpired(ttl, now) => - // We have locally a good value, just return it - Future.successful(localValue.value) - case Success(expiredLocalValue) if remoteRW.nonEmpty => - // We have locally an expired value, but we can check a remote cache for better value - remoteRW.get.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => - // Remote is good, set locally and return it - localCache.set(key, Success(remoteValue)) - Future.successful(remoteValue.value) - case Success(Some(_)) | Success(None) => - // No good remote, return local, async update both - tryGenerateAndSet(key, genValue) - Future.successful(expiredLocalValue.value) - case 
Failure(e) => - // TODO: log, generate metrics - tryGenerateAndSet(key, genValue) - Future.successful(expiredLocalValue.value) - } - case Success(expiredLocalValue) if remoteRW.isEmpty => - tryGenerateAndSet(key, genValue) - Future.successful(expiredLocalValue.value) - case Failure(e) => - // This is almost impossible to happen because it's local and we don't save failed values - // TODO: log, generate metrics - tryGenerateAndSet(key, genValue).map(_.value) - } - case None if remoteRW.nonEmpty => - // No local, let's try remote - remoteRW.get.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => - // Remote is good, set locally and return it - localCache.set(key, Success(remoteValue)) - Future.successful(remoteValue.value) - case Success(Some(_)) | Success(None) => - // No good remote, sync generate - tryGenerateAndSet(key, genValue).map(_.value) - case Failure(e) => - // TODO: log, generate metrics - tryGenerateAndSet(key, genValue).map(_.value) - } - case None if remoteRW.isEmpty => - // No local and no remote to look, just generate it - tryGenerateAndSet(key, genValue).map(_.value) - } -} - -case class ExpiringRedisWithAsyncUpdate() \ No newline at end of file +} \ No newline at end of file diff --git a/src/main/scala/spray/cache/ExpiringLruCache.scala b/src/main/scala/spray/cache/ExpiringLruCache.scala new file mode 100644 index 00000000..b1f461f0 --- /dev/null +++ b/src/main/scala/spray/cache/ExpiringLruCache.scala @@ -0,0 +1,139 @@ +// Note: +// For ignition.core we added two methods to satisfy ExpiringMultipleLevelCache.LocalCache[V] + +/* + * Copyright © 2011-2013 the spray project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package spray.caching + +import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap +import ignition.core.cache.ExpiringMultipleLevelCache + +import scala.annotation.tailrec +import scala.collection.JavaConverters._ +import scala.concurrent.duration.Duration +import scala.concurrent.{ExecutionContext, Future, Promise} +import scala.util.{Failure, Success, Try} +import spray.util.Timestamp + +final class ExpiringLruCache[V](maxCapacity: Long, initialCapacity: Int, + timeToLive: Duration, timeToIdle: Duration) extends Cache[V] with ignition.core.cache.ExpiringMultipleLevelCache.LocalCache[V] { + require(!timeToLive.isFinite || !timeToIdle.isFinite || timeToLive > timeToIdle, + s"timeToLive($timeToLive) must be greater than timeToIdle($timeToIdle)") + + private[caching] val store = new ConcurrentLinkedHashMap.Builder[Any, Entry[V]] + .initialCapacity(initialCapacity) + .maximumWeightedCapacity(maxCapacity) + .build() + + @tailrec + def get(key: Any): Option[Future[V]] = store.get(key) match { + case null ⇒ None + case entry if (isAlive(entry)) ⇒ + entry.refresh() + Some(entry.future) + case entry ⇒ + // remove entry, but only if it hasn't been removed and reinserted in the meantime + if (store.remove(key, entry)) None // successfully removed + else get(key) // nope, try again + } + + def apply(key: Any, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] = { + def insert() = { + val newEntry = new Entry(Promise[V]()) + val valueFuture = + store.put(key, newEntry) match { + case null ⇒ genValue() + case entry ⇒ + if 
(isAlive(entry)) { + // we date back the new entry we just inserted + // in the meantime someone might have already seen the too fresh timestamp we just put in, + // but since the original entry is also still alive this doesn't matter + newEntry.created = entry.created + entry.future + } else genValue() + } + valueFuture.onComplete { value ⇒ + newEntry.promise.tryComplete(value) + // in case of exceptions we remove the cache entry (i.e. try again later) + if (value.isFailure) store.remove(key, newEntry) + } + newEntry.promise.future + } + store.get(key) match { + case null ⇒ insert() + case entry if (isAlive(entry)) ⇒ + entry.refresh() + entry.future + case entry ⇒ insert() + } + } + + def remove(key: Any) = store.remove(key) match { + case null ⇒ None + case entry if (isAlive(entry)) ⇒ Some(entry.future) + case entry ⇒ None + } + + def clear(): Unit = { store.clear() } + + def keys: Set[Any] = store.keySet().asScala.toSet + + def ascendingKeys(limit: Option[Int] = None) = + limit.map { lim ⇒ store.ascendingKeySetWithLimit(lim) } + .getOrElse(store.ascendingKeySet()) + .iterator().asScala + + def size = store.size + + private def isAlive(entry: Entry[V]) = + (entry.created + timeToLive).isFuture && + (entry.lastAccessed + timeToIdle).isFuture + + // Method required by ExpiringMultipleLevelCache.LocalCache + override def set(key: Any, value: V): Unit = { + val newEntry = new Entry(Promise[V]()) + newEntry.promise.trySuccess(value) + store.put(key, newEntry) match { + case null => + // Nothing to do + case oldEntry => + // If the old promise is pending, complete it with our future + oldEntry.promise.trySuccess(value) + } + } + + // Method required by ExpiringMultipleLevelCache.LocalCache + override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { + val sprayCache: Cache[V] = this + sprayCache.apply(key, genValue) + } +} + +private[caching] class Entry[T](val promise: Promise[T]) { + @volatile var created = Timestamp.now 
+ @volatile var lastAccessed = Timestamp.now + def future = promise.future + def refresh(): Unit = { + // we dont care whether we overwrite a potentially newer value + lastAccessed = Timestamp.now + } + override def toString = future.value match { + case Some(Success(value)) ⇒ value.toString + case Some(Failure(exception)) ⇒ exception.toString + case None ⇒ "pending" + } +} From 75182789b0df31142689fd7e8411fa3a83733f2d Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 27 Oct 2016 15:03:21 -0200 Subject: [PATCH 119/268] Make remoteLockTTL a parameter, change ttl to FiniteDuration and explicitly define the reporter return type --- .../cache/ExpiringMultipleLevelCache.scala | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index 5469e308..85e728b9 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -1,12 +1,11 @@ package ignition.core.cache import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap -import ignition.core.utils.FutureUtils._ import ignition.core.utils.DateUtils._ -import org.joda.time.{DateTime, Period} +import ignition.core.utils.FutureUtils._ +import org.joda.time.DateTime import org.slf4j.LoggerFactory - import scala.concurrent.duration._ import scala.concurrent.{ExecutionContext, Future, Promise} import scala.util.{Failure, Success, Try} @@ -14,8 +13,8 @@ import scala.util.{Failure, Success, Try} object ExpiringMultipleLevelCache { case class TimestampedValue[V](date: DateTime, value: V) { - def hasExpired(ttl: Period, now: DateTime): Boolean = { - date.plus(ttl).isBefore(now) + def hasExpired(ttl: FiniteDuration, now: DateTime): Boolean = { + date.plus(ttl.toMillis).isBefore(now) } } @@ -40,17 +39,17 @@ object ExpiringMultipleLevelCache { 
trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] trait ReporterCallback { - def onCacheMissNothingFound() - def onCacheMissButFoundExpiredLocal() - def onCacheMissButFoundExpiredRemote() - def onRemoteCacheHit() - def onLocalCacheHit() - def onUnexpectedBehaviour() - def onStillTryingToLockOrGet() - def onSuccessfullyRemoteSetValue() - def onRemoteCacheHitAfterGenerating() - def onErrorGeneratingValue(key: String, eLocal: Throwable) - def onLocalError(key: String, e: Throwable) + def onCacheMissNothingFound(): Unit + def onCacheMissButFoundExpiredLocal(): Unit + def onCacheMissButFoundExpiredRemote(): Unit + def onRemoteCacheHit(): Unit + def onLocalCacheHit(): Unit + def onUnexpectedBehaviour(): Unit + def onStillTryingToLockOrGet(): Unit + def onSuccessfullyRemoteSetValue(): Unit + def onRemoteCacheHitAfterGenerating(): Unit + def onErrorGeneratingValue(key: String, eLocal: Throwable): Unit + def onLocalError(key: String, e: Throwable): Unit def onRemoteError(key: String, t: Throwable): Unit def onRemoteGiveUp(key: String): Unit } @@ -73,12 +72,13 @@ object ExpiringMultipleLevelCache { } -import ExpiringMultipleLevelCache._ +import ignition.core.cache.ExpiringMultipleLevelCache._ -case class ExpiringMultipleLevelCache[V](ttl: Period, +case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, localCache: LocalCache[TimestampedValue[V]], remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, + remoteLockTTL: FiniteDuration = 5.seconds, reporter: ExpiringMultipleLevelCache.ReporterCallback = ExpiringMultipleLevelCache.NoOpReporter, maxErrorsToRetryOnRemote: Int = 5) extends GenericCache[V] { @@ -92,8 +92,6 @@ case class ExpiringMultipleLevelCache[V](ttl: Period, private def remoteLockKey(key: Any) = s"$key-emlc-lock" - private val remoteLockTTL = 5.seconds - // The idea is simple, have two caches: remote and local // with values that will eventually expire but still be left on the cache From 
14ff519d28fdd9abdc8c7da1810a62b6cf95eef0 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 27 Oct 2016 15:32:04 -0200 Subject: [PATCH 120/268] Add ec to setLock --- .../scala/ignition/core/cache/ExpiringMultipleLevelCache.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index 85e728b9..1ddd92b1 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -29,7 +29,7 @@ object ExpiringMultipleLevelCache { trait RemoteWritableCache[V] { def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] - def setLock(key: String, ttl: FiniteDuration): Future[Boolean] + def setLock(key: String, ttl: FiniteDuration)(implicit ec: ExecutionContext): Future[Boolean] } trait RemoteReadableCache[V] { From 60db77caa522c8860f2edc88d4e5b71dd0a6b254 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 27 Oct 2016 16:44:53 -0200 Subject: [PATCH 121/268] Minor stuff --- .../cache/ExpiringMultipleLevelCache.scala | 87 ++++++++++++------- ...ache.scala => ExpiringLruLocalCache.scala} | 6 +- 2 files changed, 58 insertions(+), 35 deletions(-) rename src/main/scala/spray/cache/{ExpiringLruCache.scala => ExpiringLruLocalCache.scala} (93%) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index 1ddd92b1..f41f7a35 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -5,9 +5,11 @@ import ignition.core.utils.DateUtils._ import ignition.core.utils.FutureUtils._ import org.joda.time.DateTime import org.slf4j.LoggerFactory +import spray.caching.ValueMagnet import scala.concurrent.duration._ import scala.concurrent.{ExecutionContext, Future, Promise} +import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} @@ -18,7 +20,26 @@ object ExpiringMultipleLevelCache { } } - trait GenericCache[V] { + trait GenericCache[V] { cache => + // Keep compatible with Spray Cache + def apply(key: String) = new Keyed(key) + + class Keyed(key: String) { + /** + * Returns either the cached Future for the key or evaluates the given call-by-name argument + * which produces either a value instance of type `V` or a `Future[V]`. + */ + def apply(magnet: ⇒ ValueMagnet[V])(implicit ec: ExecutionContext): Future[V] = + cache.apply(key, () ⇒ try magnet.future catch { case NonFatal(e) ⇒ Future.failed(e) }) + + /** + * Returns either the cached Future for the key or evaluates the given function which + * should lead to eventual completion of the promise. 
+ */ + def apply[U](f: Promise[V] ⇒ U)(implicit ec: ExecutionContext): Future[V] = + cache.apply(key, () ⇒ { val p = Promise[V](); f(p); p.future }) + } + def apply(key: String, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] } @@ -39,15 +60,15 @@ object ExpiringMultipleLevelCache { trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] trait ReporterCallback { - def onCacheMissNothingFound(): Unit - def onCacheMissButFoundExpiredLocal(): Unit - def onCacheMissButFoundExpiredRemote(): Unit - def onRemoteCacheHit(): Unit - def onLocalCacheHit(): Unit - def onUnexpectedBehaviour(): Unit - def onStillTryingToLockOrGet(): Unit - def onSuccessfullyRemoteSetValue(): Unit - def onRemoteCacheHitAfterGenerating(): Unit + def onCacheMissNothingFound(key: String): Unit + def onCacheMissButFoundExpiredLocal(key: String): Unit + def onCacheMissButFoundExpiredRemote(key: String): Unit + def onRemoteCacheHit(key: String): Unit + def onLocalCacheHit(key: String): Unit + def onUnexpectedBehaviour(key: String): Unit + def onStillTryingToLockOrGet(key: String): Unit + def onSuccessfullyRemoteSetValue(key: String): Unit + def onRemoteCacheHitAfterGenerating(key: String): Unit def onErrorGeneratingValue(key: String, eLocal: Throwable): Unit def onLocalError(key: String, e: Throwable): Unit def onRemoteError(key: String, t: Throwable): Unit @@ -55,19 +76,19 @@ object ExpiringMultipleLevelCache { } object NoOpReporter extends ReporterCallback { - override def onCacheMissNothingFound(): Unit = {} - override def onUnexpectedBehaviour(): Unit = {} - override def onSuccessfullyRemoteSetValue(): Unit = {} + override def onCacheMissNothingFound(key: String): Unit = {} + override def onUnexpectedBehaviour(key: String): Unit = {} + override def onSuccessfullyRemoteSetValue(key: String): Unit = {} override def onRemoteError(key: String, t: Throwable): Unit = {} override def onRemoteGiveUp(key: String): Unit = {} override def onLocalError(key: String, 
e: Throwable): Unit = {} override def onErrorGeneratingValue(key: String, eLocal: Throwable): Unit = {} - override def onRemoteCacheHitAfterGenerating(): Unit = {} - override def onCacheMissButFoundExpiredRemote(): Unit = {} - override def onStillTryingToLockOrGet(): Unit = {} - override def onLocalCacheHit(): Unit = {} - override def onRemoteCacheHit(): Unit = {} - override def onCacheMissButFoundExpiredLocal(): Unit = {} + override def onRemoteCacheHitAfterGenerating(key: String): Unit = {} + override def onCacheMissButFoundExpiredRemote(key: String): Unit = {} + override def onStillTryingToLockOrGet(key: String): Unit = {} + override def onLocalCacheHit(key: String): Unit = {} + override def onRemoteCacheHit(key: String): Unit = {} + override def onCacheMissButFoundExpiredLocal(key: String): Unit = {} } } @@ -103,25 +124,25 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, future.flatMap { case Success(localValue) if !localValue.hasExpired(ttl, now) => // We have locally a good value, just return it - reporter.onLocalCacheHit() + reporter.onLocalCacheHit(key) Future.successful(localValue.value) case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Remote is good, set locally and return it - reporter.onRemoteCacheHit() + reporter.onRemoteCacheHit(key) localCache.set(key, remoteValue) Future.successful(remoteValue.value) case Success(Some(expiredRemote)) => // Expired local and expired remote, return the most recent of them, async update both - reporter.onCacheMissButFoundExpiredRemote() + reporter.onCacheMissButFoundExpiredRemote(key) tryGenerateAndSet(key, genValue) val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) Future.successful(mostRecent.value) case Success(None) => // No remote found, return local, async update both - 
reporter.onCacheMissButFoundExpiredLocal() + reporter.onCacheMissButFoundExpiredLocal(key) tryGenerateAndSet(key, genValue) Future.successful(expiredLocalValue.value) case Failure(e) => @@ -133,7 +154,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case Success(expiredLocalValue) if remoteRW.isEmpty => // There is no remote cache configured, we'are on our own // Return expired value and try to generate a new one for the future - reporter.onCacheMissButFoundExpiredLocal() + reporter.onCacheMissButFoundExpiredLocal(key) tryGenerateAndSet(key, genValue) Future.successful(expiredLocalValue.value) case Failure(e) => @@ -147,17 +168,17 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, remoteRW.get.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Remote is good, set locally and return it - reporter.onRemoteCacheHit() + reporter.onRemoteCacheHit(key) localCache.set(key, remoteValue) Future.successful(remoteValue.value) case Success(Some(expiredRemote)) => // Expired remote, return the it, async update - reporter.onCacheMissButFoundExpiredRemote() + reporter.onCacheMissButFoundExpiredRemote(key) tryGenerateAndSet(key, genValue).map(_.value) Future.successful(expiredRemote.value) case Success(None) => // No good remote, sync generate - reporter.onCacheMissNothingFound() + reporter.onCacheMissNothingFound(key) tryGenerateAndSet(key, genValue).map(_.value) case Failure(e) => reporter.onRemoteError(key, e) @@ -167,7 +188,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case None if remoteRW.isEmpty => // No local and no remote to look, just generate it // The caller will need to wait for the value generation - reporter.onCacheMissNothingFound() + reporter.onCacheMissNothingFound(key) tryGenerateAndSet(key, genValue).map(_.value) } @@ -186,7 +207,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, tempUpdate.remove(key) case Success(v) => // Have we 
generated/got an expired value!? - reporter.onUnexpectedBehaviour() + reporter.onUnexpectedBehaviour(key) logger.warn(s"tryGenerateAndSet, key $key: unexpectedly generated/got an expired value: $v") localCache.set(key, v) promise.trySuccess(v) @@ -286,7 +307,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it - reporter.onRemoteCacheHitAfterGenerating() + reporter.onRemoteCacheHitAfterGenerating(key) logger.info(s"remoteSetOrGet got lock for $key but found already a good value on remote") Future.successful(remoteValue) case Success(_) => @@ -295,7 +316,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, remote.set(key, calculatedValue).asTry().flatMap { case Success(_) => // Flawless victory! - reporter.onSuccessfullyRemoteSetValue() + reporter.onSuccessfullyRemoteSetValue(key) logger.info(s"remoteSetOrGet successfully set key $key while under lock") Future.successful(calculatedValue) case Failure(e) => @@ -315,13 +336,13 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it - reporter.onRemoteCacheHitAfterGenerating() + reporter.onRemoteCacheHitAfterGenerating(key) Future.successful(remoteValue) case Success(_) => // The value is missing or has expired // Let's start from scratch because we need to be able to set or get a good value // Note: do not increment retry because this isn't an error - reporter.onStillTryingToLockOrGet() + reporter.onStillTryingToLockOrGet(key) logger.info(s"remoteSetOrGet couldn't lock key $key and didn't found good value on remote") remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry) case Failure(e) => diff --git a/src/main/scala/spray/cache/ExpiringLruCache.scala 
b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala similarity index 93% rename from src/main/scala/spray/cache/ExpiringLruCache.scala rename to src/main/scala/spray/cache/ExpiringLruLocalCache.scala index b1f461f0..8c403be9 100644 --- a/src/main/scala/spray/cache/ExpiringLruCache.scala +++ b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala @@ -29,8 +29,10 @@ import scala.concurrent.{ExecutionContext, Future, Promise} import scala.util.{Failure, Success, Try} import spray.util.Timestamp -final class ExpiringLruCache[V](maxCapacity: Long, initialCapacity: Int, - timeToLive: Duration, timeToIdle: Duration) extends Cache[V] with ignition.core.cache.ExpiringMultipleLevelCache.LocalCache[V] { +final class ExpiringLruLocalCache[V](maxCapacity: Long, + initialCapacity: Int = 16, + timeToLive: Duration = Duration.Inf, + timeToIdle: Duration = Duration.Inf) extends Cache[V] with ignition.core.cache.ExpiringMultipleLevelCache.LocalCache[V] { require(!timeToLive.isFinite || !timeToIdle.isFinite || timeToLive > timeToIdle, s"timeToLive($timeToLive) must be greater than timeToIdle($timeToIdle)") From 5f6ace3b8821a516644de4e5f9adc9bde7930eb4 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 27 Oct 2016 18:06:43 -0200 Subject: [PATCH 122/268] Add sanity test --- .../cache/ExpiringMultipleLevelCache.scala | 4 ++- .../cache/ExpiringMultipleLevelCache.scala | 29 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index f41f7a35..0c5ada3e 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -105,7 +105,9 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, private val logger = LoggerFactory.getLogger(getClass) - private val tempUpdate = new ConcurrentLinkedHashMap.Builder[Any, Future[TimestampedValue[V]]].build() + private val tempUpdate = new ConcurrentLinkedHashMap.Builder[Any, Future[TimestampedValue[V]]] + .maximumWeightedCapacity(Long.MaxValue) + .build() protected def now = DateTime.now diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala new file mode 100644 index 00000000..d602a736 --- /dev/null +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -0,0 +1,29 @@ +package ignition.core.cache + +import ignition.core.cache.ExpiringMultipleLevelCache.TimestampedValue +import org.scalatest.{FlatSpec, Matchers} +import spray.caching.ExpiringLruLocalCache + +import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.duration._ +import scala.concurrent.{Await, Future} + +class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers { + case class Data(s: String) + "ExpiringMultipleLevelCache" should "calculate a value on cache miss and return it" in { + val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) 
+ val cache = ExpiringMultipleLevelCache[Data](1.minute, local) + Await.result(cache("key", () => Future.successful(Data("success"))), 1.minute) shouldBe Data("success") + } + + it should "calculate a value on cache miss and return a failed future of the calculation" in { + val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) + val cache = ExpiringMultipleLevelCache[Data](1.minute, local) + + class MyException(s: String) extends Exception(s) + + intercept[MyException ] { + Await.result(cache("key", () => Future.failed(new MyException("some failure"))), 1.minute) + } + } +} From c8638492347b3d688599200959345f4050c8cc77 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 31 Oct 2016 11:46:18 -0200 Subject: [PATCH 123/268] support for setting headers in http client request --- .../core/http/AsyncHttpClientStreamApi.scala | 3 ++- .../core/http/AsyncSprayHttpClient.scala | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index 4910c98a..30f46c53 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -4,7 +4,7 @@ import java.io.InputStream import java.util.concurrent.TimeUnit import akka.util.Timeout -import spray.http.{HttpEntity, HttpMethod, HttpMethods} +import spray.http.{HttpEntity, HttpHeader, HttpMethod, HttpMethods} import scala.concurrent.Future import scala.concurrent.duration._ @@ -48,6 +48,7 @@ object AsyncHttpClientStreamApi { credentials: Option[Credentials] = None, method: HttpMethod = HttpMethods.GET, body: HttpEntity = HttpEntity.Empty, + headers: List[HttpHeader] = List.empty, requestConfiguration: Option[RequestConfiguration] = None) case class RequestException(message: String, response: StreamResponse) extends RuntimeException(message) diff --git 
a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala index 0565fe2f..405457ea 100644 --- a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala +++ b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala @@ -61,17 +61,17 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { List(Authorization(credentials = BasicHttpCredentials(username = credentials.user, password = credentials.password))) private def toSprayRequest(request: Request): HttpRequest = request match { - case Request(uri, params, Some(credentials), method, body, _) if params.isEmpty => - HttpRequest(method = method, uri = request.url, headers = credentials, entity = body) + case Request(uri, params, Some(credentials), method, body, headers, _) if params.isEmpty => + HttpRequest(method = method, uri = request.url, headers = credentials ++ headers, entity = body) - case Request(uri, params, Some(credentials), method, body, _) => - HttpRequest(method = method, uri = toUriString(request.url, params), headers = credentials, entity = body) + case Request(uri, params, Some(credentials), method, body, headers, _) => + HttpRequest(method = method, uri = toUriString(request.url, params), headers = credentials ++ headers, entity = body) - case Request(uri, params, None, method, body, _) if params.isEmpty => - HttpRequest(method = method, uri = toUriString(request.url), entity = body) + case Request(uri, params, None, method, body, headers, _) if params.isEmpty => + HttpRequest(method = method, uri = toUriString(request.url), entity = body, headers = headers) - case Request(uri, params, None, method, body, _) => - HttpRequest(method = method, uri = toUriString(request.url, params), entity = body) + case Request(uri, params, None, method, body, headers, _) => + HttpRequest(method = method, uri = toUriString(request.url, params), entity = body, headers = headers) } private def toSprayHostConnectorSetup(uri: Uri, conf: 
Option[AsyncHttpClientStreamApi.RequestConfiguration]): HostConnectorSetup = { From 046e4a88c19d6d1db029c2d423a2992b5a25096d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 31 Oct 2016 13:18:51 -0200 Subject: [PATCH 124/268] enable new spark version 2.0.1 --- tools/spark-ec2/spark_ec2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index a89dab8f..b1f4e709 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -77,6 +77,7 @@ "1.5.2", "1.6.0", "2.0.0", + "2.0.1", ]) SPARK_TACHYON_MAP = { From 3bfe4d886d7b31f5c06d227bf6f4d7fa05655a18 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 31 Oct 2016 14:30:46 -0200 Subject: [PATCH 125/268] update to spark-2.0.1 --- build.sbt | 2 +- tools/cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 6ffe0e85..c0f4bf77 100644 --- a/build.sbt +++ b/build.sbt @@ -9,7 +9,7 @@ scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warning // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.0" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.1" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") diff --git a/tools/cluster.py b/tools/cluster.py index 5f59edad..7d77e8c4 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,7 +49,7 @@ default_ami = None # will be decided based on spark-ec2 list default_master_ami = None default_env = 'dev' -default_spark_version = '2.0.0' +default_spark_version = '2.0.1' custom_builds = { # '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' } From 612c6428e4c431d8b36ef858183f8e3234872933 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Mon, 31 Oct 2016 16:20:40 -0200 Subject: [PATCH 126/268] Added elapsed time and some new metrics --- .../cache/ExpiringMultipleLevelCache.scala | 173 ++++++++++-------- 1 file changed, 99 insertions(+), 74 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index 0c5ada3e..6de11fdd 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -1,5 +1,7 @@ package ignition.core.cache +import java.util.concurrent.TimeUnit + import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap import ignition.core.utils.DateUtils._ import ignition.core.utils.FutureUtils._ @@ -12,7 +14,6 @@ import scala.concurrent.{ExecutionContext, Future, Promise} import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} - object ExpiringMultipleLevelCache { case class TimestampedValue[V](date: DateTime, value: V) { def hasExpired(ttl: FiniteDuration, now: DateTime): Boolean = { @@ -60,35 +61,44 @@ object ExpiringMultipleLevelCache { trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] trait ReporterCallback { - def onCacheMissNothingFound(key: String): Unit - def onCacheMissButFoundExpiredLocal(key: String): Unit - def onCacheMissButFoundExpiredRemote(key: String): Unit - def onRemoteCacheHit(key: String): Unit - def onLocalCacheHit(key: String): Unit - def onUnexpectedBehaviour(key: String): Unit - def onStillTryingToLockOrGet(key: String): Unit - def onSuccessfullyRemoteSetValue(key: String): Unit - def onRemoteCacheHitAfterGenerating(key: String): Unit - def onErrorGeneratingValue(key: String, eLocal: Throwable): Unit - def onLocalError(key: String, e: Throwable): Unit - def onRemoteError(key: String, t: Throwable): Unit - def onRemoteGiveUp(key: String): Unit + def onCompletedWithFailure(key: String, e: 
Throwable, elapsedTime: FiniteDuration): Unit + def onGeneratedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit + def onCompletedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit + def onGeneratedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit + def onCacheMissNothingFound(key: String, elapsedTime: FiniteDuration): Unit + def onCacheMissButFoundExpiredLocal(key: String, elapsedTime: FiniteDuration): Unit + def onCacheMissButFoundExpiredRemote(key: String, elapsedTime: FiniteDuration): Unit + def onRemoteCacheHit(key: String, elapsedTime: FiniteDuration): Unit + def onLocalCacheHit(key: String, elapsedTime: FiniteDuration): Unit + def onUnexpectedBehaviour(key: String, elapsedTime: FiniteDuration): Unit + def onStillTryingToLockOrGet(key: String, elapsedTime: FiniteDuration): Unit + def onSuccessfullyRemoteSetValue(key: String, elapsedTime: FiniteDuration): Unit + def onRemoteCacheHitAfterGenerating(key: String, elapsedTime: FiniteDuration): Unit + def onErrorGeneratingValue(key: String, eLocal: Throwable, elapsedTime: FiniteDuration): Unit + def onLocalError(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit + def onRemoteError(key: String, t: Throwable, elapsedTime: FiniteDuration): Unit + def onRemoteGiveUp(key: String, elapsedTime: FiniteDuration): Unit } object NoOpReporter extends ReporterCallback { - override def onCacheMissNothingFound(key: String): Unit = {} - override def onUnexpectedBehaviour(key: String): Unit = {} - override def onSuccessfullyRemoteSetValue(key: String): Unit = {} - override def onRemoteError(key: String, t: Throwable): Unit = {} - override def onRemoteGiveUp(key: String): Unit = {} - override def onLocalError(key: String, e: Throwable): Unit = {} - override def onErrorGeneratingValue(key: String, eLocal: Throwable): Unit = {} - override def onRemoteCacheHitAfterGenerating(key: String): Unit = {} - override def onCacheMissButFoundExpiredRemote(key: String): Unit = {} - override 
def onStillTryingToLockOrGet(key: String): Unit = {} - override def onLocalCacheHit(key: String): Unit = {} - override def onRemoteCacheHit(key: String): Unit = {} - override def onCacheMissButFoundExpiredLocal(key: String): Unit = {} + override def onCacheMissNothingFound(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onUnexpectedBehaviour(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onSuccessfullyRemoteSetValue(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onRemoteError(key: String, t: Throwable, elapsedTime: FiniteDuration): Unit = {} + override def onRemoteGiveUp(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onLocalError(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} + override def onErrorGeneratingValue(key: String, eLocal: Throwable, elapsedTime: FiniteDuration): Unit = {} + override def onRemoteCacheHitAfterGenerating(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onCacheMissButFoundExpiredRemote(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onStillTryingToLockOrGet(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onLocalCacheHit(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onRemoteCacheHit(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onCacheMissButFoundExpiredLocal(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onCompletedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} + override def onCompletedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onGeneratedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} + override def onGeneratedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit = {} + } } @@ -113,103 +123,115 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, private def timestamp(v: V) = TimestampedValue(now, v) 
+ private def elapsedTime(startNanoTime: Long) = FiniteDuration(System.nanoTime() - startNanoTime, TimeUnit.NANOSECONDS) + private def remoteLockKey(key: Any) = s"$key-emlc-lock" // The idea is simple, have two caches: remote and local // with values that will eventually expire but still be left on the cache // while a new value is asynchronously being calculated/retrieved - override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = + override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { // The local cache is always the first try. We'll only look the remote if the local value is missing or has expired - localCache.get(key).map(_.asTry()) match { + val startTime = System.nanoTime() + val result = localCache.get(key).map(_.asTry()) match { case Some(future) => future.flatMap { case Success(localValue) if !localValue.hasExpired(ttl, now) => // We have locally a good value, just return it - reporter.onLocalCacheHit(key) + reporter.onLocalCacheHit(key, elapsedTime(startTime)) Future.successful(localValue.value) case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Remote is good, set locally and return it - reporter.onRemoteCacheHit(key) + reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.set(key, remoteValue) Future.successful(remoteValue.value) case Success(Some(expiredRemote)) => // Expired local and expired remote, return the most recent of them, async update both - reporter.onCacheMissButFoundExpiredRemote(key) - tryGenerateAndSet(key, genValue) + reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) + tryGenerateAndSet(key, genValue, startTime) val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) 
Future.successful(mostRecent.value) case Success(None) => // No remote found, return local, async update both - reporter.onCacheMissButFoundExpiredLocal(key) - tryGenerateAndSet(key, genValue) + reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) + tryGenerateAndSet(key, genValue, startTime) Future.successful(expiredLocalValue.value) case Failure(e) => - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key expired local value and failed to get remote", e) - tryGenerateAndSet(key, genValue) + tryGenerateAndSet(key, genValue, startTime) Future.successful(expiredLocalValue.value) } case Success(expiredLocalValue) if remoteRW.isEmpty => // There is no remote cache configured, we'are on our own // Return expired value and try to generate a new one for the future - reporter.onCacheMissButFoundExpiredLocal(key) - tryGenerateAndSet(key, genValue) + reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) + tryGenerateAndSet(key, genValue, startTime) Future.successful(expiredLocalValue.value) case Failure(e) => // This is almost impossible to happen because it's local and we don't save failed values - reporter.onLocalError(key, e) + reporter.onLocalError(key, e, elapsedTime(startTime)) logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key got a failed future from cache!? 
This is almost impossible!", e) - tryGenerateAndSet(key, genValue).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.value) } case None if remoteRW.nonEmpty => // No local, let's try remote remoteRW.get.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Remote is good, set locally and return it - reporter.onRemoteCacheHit(key) + reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.set(key, remoteValue) Future.successful(remoteValue.value) case Success(Some(expiredRemote)) => // Expired remote, return the it, async update - reporter.onCacheMissButFoundExpiredRemote(key) - tryGenerateAndSet(key, genValue).map(_.value) + reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) + tryGenerateAndSet(key, genValue, startTime).map(_.value) Future.successful(expiredRemote.value) case Success(None) => // No good remote, sync generate - reporter.onCacheMissNothingFound(key) - tryGenerateAndSet(key, genValue).map(_.value) + reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) + tryGenerateAndSet(key, genValue, startTime).map(_.value) case Failure(e) => - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key expired local value and no remote configured", e) - tryGenerateAndSet(key, genValue).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.value) } case None if remoteRW.isEmpty => // No local and no remote to look, just generate it // The caller will need to wait for the value generation - reporter.onCacheMissNothingFound(key) - tryGenerateAndSet(key, genValue).map(_.value) + reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) + tryGenerateAndSet(key, genValue, startTime).map(_.value) } + result.onComplete { + case Success(_) => + reporter.onCompletedWithSuccess(key, elapsedTime(startTime)) + case Failure(e) => + reporter.onCompletedWithFailure(key, e, 
elapsedTime(startTime)) + } + result + } // Note: this method may return a failed future, but it will never cache it // Our main purpose here is to avoid multiple local calls to generate new promises/futures in parallel, // so we use this Map keep everyone in sync // This is similar to how spray cache works - private def tryGenerateAndSet(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + private def tryGenerateAndSet(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { val promise = Promise[TimestampedValue[V]]() tempUpdate.putIfAbsent(key, promise.future) match { case null => - canonicalValueGenerator(key, genValue).onComplete { + canonicalValueGenerator(key, genValue, nanoStartTime).onComplete { case Success(v) if !v.hasExpired(ttl, now) => + reporter.onGeneratedWithSuccess(key, elapsedTime(nanoStartTime)) localCache.set(key, v) promise.trySuccess(v) tempUpdate.remove(key) case Success(v) => // Have we generated/got an expired value!? 
- reporter.onUnexpectedBehaviour(key) + reporter.onUnexpectedBehaviour(key, elapsedTime(nanoStartTime)) logger.warn(s"tryGenerateAndSet, key $key: unexpectedly generated/got an expired value: $v") localCache.set(key, v) promise.trySuccess(v) @@ -217,6 +239,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case Failure(e) => // We don't save failures to cache // There is no need to log here, canonicalValueGenerator will log everything already + reporter.onGeneratedWithFailure(key, e, elapsedTime(nanoStartTime)) promise.tryFailure(e) tempUpdate.remove(key) } @@ -232,7 +255,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, // the remote cache and read by the other instances // Unless of course there is some error getting stuff from remote cache // in which case the locally generated value may be returned to avoid further delays - protected def canonicalValueGenerator(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext) = { + protected def canonicalValueGenerator(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext) = { val fGeneratedValue = Try { genValue().map(timestamp) }.asFutureTry() val finalValue: Future[TimestampedValue[V]] = fGeneratedValue.flatMap { case Success(generatedValue) => @@ -242,18 +265,18 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case None => Future.successful(generatedValue) case Some(remote) => - remoteSetOrGet(key, generatedValue, remote) + remoteSetOrGet(key, generatedValue, remote, nanoStartTime) } case Failure(eLocal) => // We failed to generate the value ourselves, our hope is if someone else successfully did it in the meantime - reporter.onErrorGeneratingValue(key, eLocal) + reporter.onErrorGeneratingValue(key, eLocal, elapsedTime(nanoStartTime)) remoteRW match { case None => // There are no remote RW caches logger.error(s"canonicalValueGenerator, key $key: failed to generate value and no remote cache configured", eLocal) 
Future.failed(eLocal) case Some(remote) => - remoteGetNonExpiredValue(key, remote).asTry().flatMap { + remoteGetNonExpiredValue(key, remote, nanoStartTime).asTry().flatMap { case Success(v) => logger.warn(s"canonicalValueGenerator, key $key: failed to generate value but got one from remote", eLocal) Future.successful(v) @@ -270,6 +293,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, // Auxiliary method, only makes sense to be used by canonicalValueGenerator private def remoteGetNonExpiredValue(key: String, remote: RemoteCacheRW[TimestampedValue[V]], + nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => @@ -278,14 +302,14 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, Future.failed(new Exception("No good value found on remote")) case Failure(e) => if (currentRetry >= maxErrorsToRetryOnRemote) { - reporter.onRemoteGiveUp(key) + reporter.onRemoteGiveUp(key, elapsedTime(nanoStartTime)) logger.error(s"remoteGetWithRetryOnError, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors", e) Future.failed(e) } else { - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) logger.warn(s"remoteGetWithRetryOnError, key $key: got error trying to get value, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry - remoteGetNonExpiredValue(key, remote, currentRetry = currentRetry + 1) + remoteGetNonExpiredValue(key, remote, nanoStartTime, currentRetry = currentRetry + 1) } } } @@ -295,10 +319,11 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, private def remoteSetOrGet(key: String, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], + nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { if (currentRetry 
> maxErrorsToRetryOnRemote) { // Use our calculated value as it's the best we can do - reporter.onRemoteGiveUp(key) + reporter.onRemoteGiveUp(key, elapsedTime(nanoStartTime)) logger.error(s"remoteSetOrGet, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors") Future.successful(calculatedValue) } else { @@ -309,7 +334,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it - reporter.onRemoteCacheHitAfterGenerating(key) + reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) logger.info(s"remoteSetOrGet got lock for $key but found already a good value on remote") Future.successful(remoteValue) case Success(_) => @@ -318,46 +343,46 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, remote.set(key, calculatedValue).asTry().flatMap { case Success(_) => // Flawless victory! 
- reporter.onSuccessfullyRemoteSetValue(key) + reporter.onSuccessfullyRemoteSetValue(key, elapsedTime(nanoStartTime)) logger.info(s"remoteSetOrGet successfully set key $key while under lock") Future.successful(calculatedValue) case Failure(e) => - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) logger.warn(s"remoteSetOrGet, key $key: got error setting the value, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry failure - remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) } case Failure(e) => - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) logger.warn(s"remoteSetOrGet, key $key: got error getting remote value with lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry failure - remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) } case Success(false) => // Someone got the lock, let's take a look at the value remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it - reporter.onRemoteCacheHitAfterGenerating(key) + reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) Future.successful(remoteValue) case Success(_) => // The value is missing or has expired // Let's start from scratch because we need to be able to set or get a good value // Note: do not increment retry because this isn't an error - reporter.onStillTryingToLockOrGet(key) + reporter.onStillTryingToLockOrGet(key, elapsedTime(nanoStartTime)) logger.info(s"remoteSetOrGet couldn't lock key $key and didn't found good value on remote") - remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry) + remoteSetOrGet(key, calculatedValue, 
remote, nanoStartTime, currentRetry = currentRetry) case Failure(e) => - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) logger.warn(s"remoteSetOrGet, key $key: got error getting remote value without lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry - remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) } case Failure(e) => // Retry failure - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) logger.warn(s"remoteSetOrGet, key $key: got error trying to set lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) - remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) } } } From b47a54b46e018e53bc62d270ae697cb891bb7279 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 1 Nov 2016 14:04:18 -0200 Subject: [PATCH 127/268] Make local cache optional, do a backoff on retries --- .../cache/ExpiringMultipleLevelCache.scala | 66 ++++++++++++------- .../cache/ExpiringMultipleLevelCache.scala | 7 +- 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index 6de11fdd..d4147761 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -2,6 +2,8 @@ package ignition.core.cache import java.util.concurrent.TimeUnit +import akka.actor.Scheduler +import akka.pattern.after import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap import ignition.core.utils.DateUtils._ import ignition.core.utils.FutureUtils._ @@ -107,11 +109,13 @@ import ignition.core.cache.ExpiringMultipleLevelCache._ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, - localCache: LocalCache[TimestampedValue[V]], + localCache: Option[LocalCache[TimestampedValue[V]]], remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, remoteLockTTL: FiniteDuration = 5.seconds, reporter: ExpiringMultipleLevelCache.ReporterCallback = ExpiringMultipleLevelCache.NoOpReporter, - maxErrorsToRetryOnRemote: Int = 5) extends GenericCache[V] { + maxErrorsToRetryOnRemote: Int = 5, + backoffOnLockAcquire: FiniteDuration = 50.milliseconds, + backoffOnError: FiniteDuration = 50.milliseconds)(implicit scheduler: Scheduler) extends GenericCache[V] { private val logger = LoggerFactory.getLogger(getClass) @@ -134,7 +138,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { // The local cache is always the first try. 
We'll only look the remote if the local value is missing or has expired val startTime = System.nanoTime() - val result = localCache.get(key).map(_.asTry()) match { + val result = localCache.flatMap(_.get(key).map(_.asTry())) match { case Some(future) => future.flatMap { case Success(localValue) if !localValue.hasExpired(ttl, now) => @@ -147,7 +151,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) - localCache.set(key, remoteValue) + localCache.foreach(_.set(key, remoteValue)) Future.successful(remoteValue.value) case Success(Some(expiredRemote)) => // Expired local and expired remote, return the most recent of them, async update both @@ -162,7 +166,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, Future.successful(expiredLocalValue.value) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) - logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key expired local value and failed to get remote", e) + logger.warn(s"apply, key: $key expired local value and failed to get remote", e) tryGenerateAndSet(key, genValue, startTime) Future.successful(expiredLocalValue.value) } @@ -175,7 +179,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case Failure(e) => // This is almost impossible to happen because it's local and we don't save failed values reporter.onLocalError(key, e, elapsedTime(startTime)) - logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key got a failed future from cache!? This is almost impossible!", e) + logger.warn(s"apply, key: $key got a failed future from cache!? 
This is almost impossible!", e) tryGenerateAndSet(key, genValue, startTime).map(_.value) } case None if remoteRW.nonEmpty => @@ -184,7 +188,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) - localCache.set(key, remoteValue) + localCache.foreach(_.set(key, remoteValue)) Future.successful(remoteValue.value) case Success(Some(expiredRemote)) => // Expired remote, return the it, async update @@ -197,7 +201,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, tryGenerateAndSet(key, genValue, startTime).map(_.value) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) - logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key expired local value and no remote configured", e) + logger.warn(s"apply, key: $key expired local value and remote error", e) tryGenerateAndSet(key, genValue, startTime).map(_.value) } case None if remoteRW.isEmpty => @@ -223,17 +227,18 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, val promise = Promise[TimestampedValue[V]]() tempUpdate.putIfAbsent(key, promise.future) match { case null => + logger.info(s"tryGenerateAndSet, key $key: got request for generating and none in progress found, calling canonicalValueGenerator") canonicalValueGenerator(key, genValue, nanoStartTime).onComplete { case Success(v) if !v.hasExpired(ttl, now) => reporter.onGeneratedWithSuccess(key, elapsedTime(nanoStartTime)) - localCache.set(key, v) + localCache.foreach(_.set(key, v)) promise.trySuccess(v) tempUpdate.remove(key) case Success(v) => // Have we generated/got an expired value!? 
reporter.onUnexpectedBehaviour(key, elapsedTime(nanoStartTime)) logger.warn(s"tryGenerateAndSet, key $key: unexpectedly generated/got an expired value: $v") - localCache.set(key, v) + localCache.foreach(_.set(key, v)) promise.trySuccess(v) tempUpdate.remove(key) case Failure(e) => @@ -246,6 +251,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, promise.future case fTrying => // If someone call us while a future is running, we return the running future + logger.info(s"tryGenerateAndSet, key $key: got request for generating but an existing one is current in progress") fTrying } } @@ -297,19 +303,22 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + logger.info(s"remoteGetNonExpiredValue, key $key: got a good value") Future.successful(remoteValue) case Success(_) => Future.failed(new Exception("No good value found on remote")) case Failure(e) => if (currentRetry >= maxErrorsToRetryOnRemote) { reporter.onRemoteGiveUp(key, elapsedTime(nanoStartTime)) - logger.error(s"remoteGetWithRetryOnError, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors", e) + logger.error(s"remoteGetNonExpiredValue, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors", e) Future.failed(e) } else { reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteGetWithRetryOnError, key $key: got error trying to get value, retry $currentRetry of $maxErrorsToRetryOnRemote", e) + logger.warn(s"remoteGetNonExpiredValue, key $key: got error trying to get value, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry - remoteGetNonExpiredValue(key, remote, nanoStartTime, currentRetry = currentRetry + 1) + after(backoffOnError, scheduler) { + 
remoteGetNonExpiredValue(key, remote, nanoStartTime, currentRetry = currentRetry + 1) + } } } } @@ -348,21 +357,26 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, Future.successful(calculatedValue) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error setting the value, retry $currentRetry of $maxErrorsToRetryOnRemote", e) + logger.warn(s"remoteSetOrGet, key $key: got error setting the value, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry failure - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + after(backoffOnError, scheduler) { + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + } } case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error getting remote value with lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) + logger.warn(s"remoteSetOrGet, key $key: got error getting remote value with lock, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry failure - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + after(backoffOnError, scheduler) { + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + } } case Success(false) => // Someone got the lock, let's take a look at the value remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it + logger.info(s"remoteSetOrGet couldn't lock key $key but found a good on remote afterwards") reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) Future.successful(remoteValue) case Success(_) => @@ -370,19 +384,25 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, // Let's start from scratch because we need to be 
able to set or get a good value // Note: do not increment retry because this isn't an error reporter.onStillTryingToLockOrGet(key, elapsedTime(nanoStartTime)) - logger.info(s"remoteSetOrGet couldn't lock key $key and didn't found good value on remote") - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry) + logger.info(s"remoteSetOrGet couldn't lock key $key and didn't found good value on remote, scheduling retry") + after(backoffOnLockAcquire, scheduler) { + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry) + } case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error getting remote value without lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) + logger.warn(s"remoteSetOrGet, key $key: got error getting remote value without lock, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + after(backoffOnError, scheduler) { + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + } } case Failure(e) => // Retry failure reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error trying to set lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + logger.warn(s"remoteSetOrGet, key $key: got error trying to set lock, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) + after(backoffOnError, scheduler) { + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + } } } } diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index d602a736..c5b81e8c 100644 --- 
a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -1,5 +1,6 @@ package ignition.core.cache +import akka.actor.ActorSystem import ignition.core.cache.ExpiringMultipleLevelCache.TimestampedValue import org.scalatest.{FlatSpec, Matchers} import spray.caching.ExpiringLruLocalCache @@ -10,15 +11,17 @@ import scala.concurrent.{Await, Future} class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers { case class Data(s: String) + implicit val scheduler = ActorSystem().scheduler + "ExpiringMultipleLevelCache" should "calculate a value on cache miss and return it" in { val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultipleLevelCache[Data](1.minute, local) + val cache = ExpiringMultipleLevelCache[Data](1.minute, Option(local)) Await.result(cache("key", () => Future.successful(Data("success"))), 1.minute) shouldBe Data("success") } it should "calculate a value on cache miss and return a failed future of the calculation" in { val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultipleLevelCache[Data](1.minute, local) + val cache = ExpiringMultipleLevelCache[Data](1.minute, Option(local)) class MyException(s: String) extends Exception(s) From 5b3cfa0528342f3d651504d6421d9f65026313a0 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 1 Nov 2016 18:07:27 -0200 Subject: [PATCH 128/268] Added set method --- .../cache/ExpiringMultipleLevelCache.scala | 59 +++++++++++++++++-- .../spray/cache/ExpiringLruLocalCache.scala | 6 -- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index d4147761..2fbe6a48 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -44,9 +44,10 @@ object ExpiringMultipleLevelCache { } def apply(key: String, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] + def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] } - trait LocalCache[V] extends GenericCache[V] { + trait LocalCache[V] { def get(key: Any): Option[Future[V]] def set(key: Any, value: V): Unit } @@ -219,13 +220,57 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, result } + // This should be used carefully because it will overwrite the remote value without + // any lock, which may cause a desynchronization between the local and remote cache on other instances + // Note that if any tryGenerateAndSet is in progress, this will wait until it's finished before setting local/remote + override def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] = { + logger.info(s"set, key $key: got a call to overwrite local and remote values") + val startTime = System.nanoTime() + val promise = Promise[TimestampedValue[V]]() + val future = promise.future + def doIt() = { + val tValue = timestamp(value) + localCache.foreach(_.set(key, tValue)) + val result = remoteRW.map(remote => remoteOverwrite(key, tValue, remote, startTime)).getOrElse(Future.successful(tValue)) + promise.completeWith(result) + tempUpdate.remove(key, future) + } + tempUpdate.put(key, future) match { + case null => + 
doIt() + future.map(_ => ()) + case fTrying => + fTrying.onComplete { case _ => doIt() } + future.map(_ => ()) + } + } + + // Overwrite remote value without lock, retrying on error + private def remoteOverwrite(key: String, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + remote.set(key, calculatedValue).asTry().flatMap { + case Success(_) => + reporter.onSuccessfullyRemoteSetValue(key, elapsedTime(nanoStartTime)) + logger.info(s"remoteForceSet successfully overwritten key $key") + Future.successful(calculatedValue) + case Failure(e) => + reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) + logger.warn(s"remoteForceSet, key $key: got error setting the value, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) + // Retry failure + after(backoffOnError, scheduler) { + remoteOverwrite(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + } + } + } + + // Note: this method may return a failed future, but it will never cache it // Our main purpose here is to avoid multiple local calls to generate new promises/futures in parallel, // so we use this Map keep everyone in sync // This is similar to how spray cache works private def tryGenerateAndSet(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { val promise = Promise[TimestampedValue[V]]() - tempUpdate.putIfAbsent(key, promise.future) match { + val future = promise.future + tempUpdate.putIfAbsent(key, future) match { case null => logger.info(s"tryGenerateAndSet, key $key: got request for generating and none in progress found, calling canonicalValueGenerator") canonicalValueGenerator(key, genValue, nanoStartTime).onComplete { @@ -233,22 +278,22 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, reporter.onGeneratedWithSuccess(key, 
elapsedTime(nanoStartTime)) localCache.foreach(_.set(key, v)) promise.trySuccess(v) - tempUpdate.remove(key) + tempUpdate.remove(key, future) case Success(v) => // Have we generated/got an expired value!? reporter.onUnexpectedBehaviour(key, elapsedTime(nanoStartTime)) logger.warn(s"tryGenerateAndSet, key $key: unexpectedly generated/got an expired value: $v") localCache.foreach(_.set(key, v)) promise.trySuccess(v) - tempUpdate.remove(key) + tempUpdate.remove(key, future) case Failure(e) => // We don't save failures to cache // There is no need to log here, canonicalValueGenerator will log everything already reporter.onGeneratedWithFailure(key, e, elapsedTime(nanoStartTime)) promise.tryFailure(e) - tempUpdate.remove(key) + tempUpdate.remove(key, future) } - promise.future + future case fTrying => // If someone call us while a future is running, we return the running future logger.info(s"tryGenerateAndSet, key $key: got request for generating but an existing one is current in progress") @@ -406,4 +451,6 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, } } } + + } \ No newline at end of file diff --git a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala index 8c403be9..ac7f6e42 100644 --- a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala +++ b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala @@ -117,12 +117,6 @@ final class ExpiringLruLocalCache[V](maxCapacity: Long, oldEntry.promise.trySuccess(value) } } - - // Method required by ExpiringMultipleLevelCache.LocalCache - override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { - val sprayCache: Cache[V] = this - sprayCache.apply(key, genValue) - } } private[caching] class Entry[T](val promise: Promise[T]) { From 6b450cbf3931f26d765119cd8a7c6fe179c65be3 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 1 Nov 2016 19:09:03 -0200 Subject: [PATCH 129/268] Rename ExpiringMultipleLevelCache to ExpiringMultiLevelCache --- ...he.scala => ExpiringMultiLevelCache.scala} | 20 +++++++++---------- .../spray/cache/ExpiringLruLocalCache.scala | 4 ++-- .../cache/ExpiringMultipleLevelCache.scala | 6 +++--- 3 files changed, 15 insertions(+), 15 deletions(-) rename src/main/scala/ignition/core/cache/{ExpiringMultipleLevelCache.scala => ExpiringMultiLevelCache.scala} (96%) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala similarity index 96% rename from src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala rename to src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 2fbe6a48..31ebb015 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -16,7 +16,7 @@ import scala.concurrent.{ExecutionContext, Future, Promise} import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} -object ExpiringMultipleLevelCache { +object ExpiringMultiLevelCache { case class TimestampedValue[V](date: DateTime, value: V) { def hasExpired(ttl: FiniteDuration, now: DateTime): Boolean = { date.plus(ttl.toMillis).isBefore(now) @@ -106,17 +106,17 @@ object ExpiringMultipleLevelCache { } -import ignition.core.cache.ExpiringMultipleLevelCache._ +import ignition.core.cache.ExpiringMultiLevelCache._ -case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, - localCache: Option[LocalCache[TimestampedValue[V]]], - remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, - remoteLockTTL: FiniteDuration = 5.seconds, - reporter: ExpiringMultipleLevelCache.ReporterCallback = ExpiringMultipleLevelCache.NoOpReporter, - maxErrorsToRetryOnRemote: Int = 5, - backoffOnLockAcquire: FiniteDuration = 50.milliseconds, - backoffOnError: FiniteDuration = 
50.milliseconds)(implicit scheduler: Scheduler) extends GenericCache[V] { +case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, + localCache: Option[LocalCache[TimestampedValue[V]]], + remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, + remoteLockTTL: FiniteDuration = 5.seconds, + reporter: ExpiringMultiLevelCache.ReporterCallback = ExpiringMultiLevelCache.NoOpReporter, + maxErrorsToRetryOnRemote: Int = 5, + backoffOnLockAcquire: FiniteDuration = 50.milliseconds, + backoffOnError: FiniteDuration = 50.milliseconds)(implicit scheduler: Scheduler) extends GenericCache[V] { private val logger = LoggerFactory.getLogger(getClass) diff --git a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala index ac7f6e42..33d2b4d9 100644 --- a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala +++ b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala @@ -20,7 +20,7 @@ package spray.caching import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap -import ignition.core.cache.ExpiringMultipleLevelCache +import ignition.core.cache.ExpiringMultiLevelCache import scala.annotation.tailrec import scala.collection.JavaConverters._ @@ -32,7 +32,7 @@ import spray.util.Timestamp final class ExpiringLruLocalCache[V](maxCapacity: Long, initialCapacity: Int = 16, timeToLive: Duration = Duration.Inf, - timeToIdle: Duration = Duration.Inf) extends Cache[V] with ignition.core.cache.ExpiringMultipleLevelCache.LocalCache[V] { + timeToIdle: Duration = Duration.Inf) extends Cache[V] with ignition.core.cache.ExpiringMultiLevelCache.LocalCache[V] { require(!timeToLive.isFinite || !timeToIdle.isFinite || timeToLive > timeToIdle, s"timeToLive($timeToLive) must be greater than timeToIdle($timeToIdle)") diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index c5b81e8c..c321f794 100644 --- 
a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -1,7 +1,7 @@ package ignition.core.cache import akka.actor.ActorSystem -import ignition.core.cache.ExpiringMultipleLevelCache.TimestampedValue +import ignition.core.cache.ExpiringMultiLevelCache.TimestampedValue import org.scalatest.{FlatSpec, Matchers} import spray.caching.ExpiringLruLocalCache @@ -15,13 +15,13 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers { "ExpiringMultipleLevelCache" should "calculate a value on cache miss and return it" in { val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultipleLevelCache[Data](1.minute, Option(local)) + val cache = ExpiringMultiLevelCache[Data](1.minute, Option(local)) Await.result(cache("key", () => Future.successful(Data("success"))), 1.minute) shouldBe Data("success") } it should "calculate a value on cache miss and return a failed future of the calculation" in { val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultipleLevelCache[Data](1.minute, Option(local)) + val cache = ExpiringMultiLevelCache[Data](1.minute, Option(local)) class MyException(s: String) extends Exception(s) From b421352ccf422a2a577fb0b2509ac3b29662ee78 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 1 Nov 2016 20:19:17 -0200 Subject: [PATCH 130/268] Better put the scheduler on each method than on constructor --- .../core/cache/ExpiringMultiLevelCache.scala | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 31ebb015..be6b6c49 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -32,19 +32,19 @@ object ExpiringMultiLevelCache { * Returns either the cached Future for the key or evaluates the given call-by-name argument * which produces either a value instance of type `V` or a `Future[V]`. */ - def apply(magnet: ⇒ ValueMagnet[V])(implicit ec: ExecutionContext): Future[V] = + def apply(magnet: ⇒ ValueMagnet[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = cache.apply(key, () ⇒ try magnet.future catch { case NonFatal(e) ⇒ Future.failed(e) }) /** * Returns either the cached Future for the key or evaluates the given function which * should lead to eventual completion of the promise. 
*/ - def apply[U](f: Promise[V] ⇒ U)(implicit ec: ExecutionContext): Future[V] = + def apply[U](f: Promise[V] ⇒ U)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = cache.apply(key, () ⇒ { val p = Promise[V](); f(p); p.future }) } - def apply(key: String, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] - def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] + def apply(key: String, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] + def set(key: String, value: V)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Unit] } trait LocalCache[V] { @@ -53,12 +53,12 @@ object ExpiringMultiLevelCache { } trait RemoteWritableCache[V] { - def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] - def setLock(key: String, ttl: FiniteDuration)(implicit ec: ExecutionContext): Future[Boolean] + def set(key: String, value: V)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Unit] + def setLock(key: String, ttl: FiniteDuration)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Boolean] } trait RemoteReadableCache[V] { - def get(key: String)(implicit ec: ExecutionContext): Future[Option[V]] + def get(key: String)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Option[V]] } trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] @@ -116,7 +116,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, reporter: ExpiringMultiLevelCache.ReporterCallback = ExpiringMultiLevelCache.NoOpReporter, maxErrorsToRetryOnRemote: Int = 5, backoffOnLockAcquire: FiniteDuration = 50.milliseconds, - backoffOnError: FiniteDuration = 50.milliseconds)(implicit scheduler: Scheduler) extends GenericCache[V] { + backoffOnError: FiniteDuration = 50.milliseconds) extends GenericCache[V] { private val logger = LoggerFactory.getLogger(getClass) @@ -136,7 +136,7 @@ case class ExpiringMultiLevelCache[V](ttl: 
FiniteDuration, // The idea is simple, have two caches: remote and local // with values that will eventually expire but still be left on the cache // while a new value is asynchronously being calculated/retrieved - override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { + override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { // The local cache is always the first try. We'll only look the remote if the local value is missing or has expired val startTime = System.nanoTime() val result = localCache.flatMap(_.get(key).map(_.asTry())) match { @@ -223,7 +223,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // This should be used carefully because it will overwrite the remote value without // any lock, which may cause a desynchronization between the local and remote cache on other instances // Note that if any tryGenerateAndSet is in progress, this will wait until it's finished before setting local/remote - override def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] = { + override def set(key: String, value: V)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Unit] = { logger.info(s"set, key $key: got a call to overwrite local and remote values") val startTime = System.nanoTime() val promise = Promise[TimestampedValue[V]]() @@ -246,7 +246,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, } // Overwrite remote value without lock, retrying on error - private def remoteOverwrite(key: String, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + private def remoteOverwrite(key: String, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: 
Scheduler): Future[TimestampedValue[V]] = { remote.set(key, calculatedValue).asTry().flatMap { case Success(_) => reporter.onSuccessfullyRemoteSetValue(key, elapsedTime(nanoStartTime)) @@ -267,7 +267,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Our main purpose here is to avoid multiple local calls to generate new promises/futures in parallel, // so we use this Map keep everyone in sync // This is similar to how spray cache works - private def tryGenerateAndSet(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + private def tryGenerateAndSet(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { val promise = Promise[TimestampedValue[V]]() val future = promise.future tempUpdate.putIfAbsent(key, future) match { @@ -306,7 +306,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // the remote cache and read by the other instances // Unless of course there is some error getting stuff from remote cache // in which case the locally generated value may be returned to avoid further delays - protected def canonicalValueGenerator(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext) = { + protected def canonicalValueGenerator(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler) = { val fGeneratedValue = Try { genValue().map(timestamp) }.asFutureTry() val finalValue: Future[TimestampedValue[V]] = fGeneratedValue.flatMap { case Success(generatedValue) => @@ -345,7 +345,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, private def remoteGetNonExpiredValue(key: String, remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, - currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + currentRetry: Int = 0)(implicit ec: 
ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => logger.info(s"remoteGetNonExpiredValue, key $key: got a good value") @@ -374,7 +374,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, - currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { if (currentRetry > maxErrorsToRetryOnRemote) { // Use our calculated value as it's the best we can do reporter.onRemoteGiveUp(key, elapsedTime(nanoStartTime)) From 1b1ad65bab256ab579b59e9e31f837c8e43b9312 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 3 Nov 2016 18:21:10 -0200 Subject: [PATCH 131/268] Added sanity check feature --- .../core/cache/ExpiringMultiLevelCache.scala | 51 +++++++++++++++++-- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index be6b6c49..911a0e6a 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -7,7 +7,7 @@ import akka.pattern.after import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap import ignition.core.utils.DateUtils._ import ignition.core.utils.FutureUtils._ -import org.joda.time.DateTime +import org.joda.time.{DateTime, DateTimeZone} import org.slf4j.LoggerFactory import spray.caching.ValueMagnet @@ -81,6 +81,7 @@ object ExpiringMultiLevelCache { def onLocalError(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit def onRemoteError(key: String, t: Throwable, elapsedTime: FiniteDuration): Unit def onRemoteGiveUp(key: 
String, elapsedTime: FiniteDuration): Unit + def onSanityLocalValueCheckFailedResult(key: String, result: String, elapsedTime: FiniteDuration): Unit } object NoOpReporter extends ReporterCallback { @@ -101,7 +102,7 @@ object ExpiringMultiLevelCache { override def onCompletedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit = {} override def onGeneratedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} override def onGeneratedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit = {} - + override def onSanityLocalValueCheckFailedResult(key: String, result: String, elapsedTime: FiniteDuration): Unit = {} } } @@ -116,7 +117,8 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, reporter: ExpiringMultiLevelCache.ReporterCallback = ExpiringMultiLevelCache.NoOpReporter, maxErrorsToRetryOnRemote: Int = 5, backoffOnLockAcquire: FiniteDuration = 50.milliseconds, - backoffOnError: FiniteDuration = 50.milliseconds) extends GenericCache[V] { + backoffOnError: FiniteDuration = 50.milliseconds, + sanityLocalValueCheck: Boolean = false) extends GenericCache[V] { private val logger = LoggerFactory.getLogger(getClass) @@ -145,7 +147,11 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case Success(localValue) if !localValue.hasExpired(ttl, now) => // We have locally a good value, just return it reporter.onLocalCacheHit(key, elapsedTime(startTime)) - Future.successful(localValue.value) + // But if we're paranoid, let's check if the local value is consistent with remote + if (sanityLocalValueCheck) + remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, startTime)).getOrElse(Future.successful(localValue.value)) + else + Future.successful(localValue.value) case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { @@ -245,6 +251,43 @@ case class ExpiringMultiLevelCache[V](ttl: 
FiniteDuration, } } + private def sanityLocalValueCheck(key: String, localValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], startTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { + remote.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if remoteValue == localValue => + // Remote is the same as local, return any of them + Future.successful(remoteValue.value) + case Success(Some(remoteValue)) => + // Something is different, try to figure it out + val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" + val dateResult = if (remoteValue.date.isAfter(localValue.date)) + "remote-is-older-than-local" + else if (localValue.date.isAfter(remoteValue.date)) + "local-is-older-than-remote" + else if (localValue.date.isEqual(localValue.date)) + "same-date" + else if (localValue.date.withZone(DateTimeZone.UTC).isEqual(localValue.date.withZone(DateTimeZone.UTC))) + "same-date-on-utc" + else + "impossible-dates" + val remoteExpired = remoteValue.hasExpired(ttl, now) + val localExpired = localValue.hasExpired(ttl, now) + val finalResult = s"$valuesResult-$dateResult-remote-expired-${remoteExpired}-local-expired-${localExpired}" + logger.warn(s"sanityLocalValueCheck, key $key: got different results for local $localValue and remote $remoteValue ($finalResult)") + reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) + // return remote to keep everyone consistent + Future.successful(remoteValue.value) + case Success(None) => + val localExpired = localValue.hasExpired(ttl, now) + val finalResult = s"missing-remote-local-expired-${localExpired}" + logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") + Future.successful(localValue.value) + case Failure(e) => + reporter.onRemoteError(key, e, elapsedTime(startTime)) + logger.warn(s"sanityLocalValueCheck, key: $key failed to get remote", e) + 
Future.successful(localValue.value) + } + } + // Overwrite remote value without lock, retrying on error private def remoteOverwrite(key: String, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { remote.set(key, calculatedValue).asTry().flatMap { From 1f7dbbf749647849e1e6cbe45debf64ca9a06866 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Fri, 4 Nov 2016 17:37:00 -0200 Subject: [PATCH 132/268] Improve sanity check and use UTC dates on timestamped values --- .../ignition/core/cache/ExpiringMultiLevelCache.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 911a0e6a..d20a3c4f 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -7,7 +7,7 @@ import akka.pattern.after import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap import ignition.core.utils.DateUtils._ import ignition.core.utils.FutureUtils._ -import org.joda.time.{DateTime, DateTimeZone} +import org.joda.time.{DateTime, DateTimeZone, Interval} import org.slf4j.LoggerFactory import spray.caching.ValueMagnet @@ -126,7 +126,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, .maximumWeightedCapacity(Long.MaxValue) .build() - protected def now = DateTime.now + protected def now = DateTime.now.withZone(DateTimeZone.UTC) private def timestamp(v: V) = TimestampedValue(now, v) @@ -257,12 +257,14 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is the same as local, return any of them Future.successful(remoteValue.value) case Success(Some(remoteValue)) => + def datesAreClose(date1: DateTime, date2: DateTime): Boolean = 
Math.abs(new Interval(date1, date2).toDurationMillis) <= 5000 // Something is different, try to figure it out val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" + val closeDatesSuffix = if (datesAreClose(remoteValue.date, localValue.date)) "-but-close-dates" else "" val dateResult = if (remoteValue.date.isAfter(localValue.date)) - "remote-is-older-than-local" + s"remote-is-newer-than-local$closeDatesSuffix" else if (localValue.date.isAfter(remoteValue.date)) - "local-is-older-than-remote" + s"local-is-newer-than-remote$closeDatesSuffix" else if (localValue.date.isEqual(localValue.date)) "same-date" else if (localValue.date.withZone(DateTimeZone.UTC).isEqual(localValue.date.withZone(DateTimeZone.UTC))) From daf6eab6966b5436f4d0fefc156bc614d5d0291e Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Mon, 7 Nov 2016 18:05:19 -0200 Subject: [PATCH 133/268] Fallback on remote not found while in sanity check --- .../ignition/core/cache/ExpiringMultiLevelCache.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index d20a3c4f..472b88b7 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -149,7 +149,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, reporter.onLocalCacheHit(key, elapsedTime(startTime)) // But if we're paranoid, let's check if the local value is consistent with remote if (sanityLocalValueCheck) - remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, startTime)).getOrElse(Future.successful(localValue.value)) + remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(Future.successful(localValue.value)) else Future.successful(localValue.value) case 
Success(expiredLocalValue) if remoteRW.nonEmpty => @@ -251,7 +251,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, } } - private def sanityLocalValueCheck(key: String, localValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], startTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { + private def sanityLocalValueCheck(key: String, localValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], genValue: () => Future[V], startTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if remoteValue == localValue => // Remote is the same as local, return any of them @@ -282,7 +282,9 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, val localExpired = localValue.hasExpired(ttl, now) val finalResult = s"missing-remote-local-expired-${localExpired}" logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") - Future.successful(localValue.value) + reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) + // Try generate it to keep a behaviour equivalent to remote only + tryGenerateAndSet(key, genValue, startTime).map(_.value) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"sanityLocalValueCheck, key: $key failed to get remote", e) From b6fbd2195e99e4891eb00d4830cfa9655d36871d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 7 Nov 2016 18:05:57 -0200 Subject: [PATCH 134/268] suport for x1 instance type --- tools/spark-ec2/spark_ec2.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index b1f4e709..909c284c 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -478,6 +478,8 @@ def get_validate_spark_version(version, repo): "t2.small": "hvm", "t2.medium": "hvm", "t2.large": "hvm", + 
"x1.16xlarge": "hvm", + "x1.32xlarge": "hvm", } @@ -1134,6 +1136,9 @@ def get_num_disks(instance_type): "t2.small": 0, "t2.medium": 0, "t2.large": 0, + "x1.16xlarge": 1, + "x1.32xlarge": 2, + } if instance_type in disks_by_instance: return disks_by_instance[instance_type] From aa55e1140d42c4b534c9f150645a25ca71985ab4 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 8 Nov 2016 18:48:50 -0200 Subject: [PATCH 135/268] rename private Entry class to avoid assembly issues --- src/main/scala/spray/cache/ExpiringLruLocalCache.scala | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala index 33d2b4d9..9fa476f9 100644 --- a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala +++ b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala @@ -20,14 +20,13 @@ package spray.caching import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap -import ignition.core.cache.ExpiringMultiLevelCache +import spray.util.Timestamp import scala.annotation.tailrec import scala.collection.JavaConverters._ import scala.concurrent.duration.Duration import scala.concurrent.{ExecutionContext, Future, Promise} -import scala.util.{Failure, Success, Try} -import spray.util.Timestamp +import scala.util.{Failure, Success} final class ExpiringLruLocalCache[V](maxCapacity: Long, initialCapacity: Int = 16, @@ -119,7 +118,7 @@ final class ExpiringLruLocalCache[V](maxCapacity: Long, } } -private[caching] class Entry[T](val promise: Promise[T]) { +private[caching] class ExpiringLruLocalCacheEntry[T](val promise: Promise[T]) { @volatile var created = Timestamp.now @volatile var lastAccessed = Timestamp.now def future = promise.future From 9b08575fc097ac00625c60d8bccd75363422f5be Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Fri, 11 Nov 2016 21:58:27 -0200 Subject: [PATCH 136/268] Remove close dates report because it is too much trouble for nothing --- .../scala/ignition/core/cache/ExpiringMultiLevelCache.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 472b88b7..138d6cbd 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -257,14 +257,12 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is the same as local, return any of them Future.successful(remoteValue.value) case Success(Some(remoteValue)) => - def datesAreClose(date1: DateTime, date2: DateTime): Boolean = Math.abs(new Interval(date1, date2).toDurationMillis) <= 5000 // Something is different, try to figure it out val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" - val closeDatesSuffix = if (datesAreClose(remoteValue.date, localValue.date)) "-but-close-dates" else "" val dateResult = if (remoteValue.date.isAfter(localValue.date)) - s"remote-is-newer-than-local$closeDatesSuffix" + s"remote-is-newer-than-local" else if (localValue.date.isAfter(remoteValue.date)) - s"local-is-newer-than-remote$closeDatesSuffix" + s"local-is-newer-than-remote" else if (localValue.date.isEqual(localValue.date)) "same-date" else if (localValue.date.withZone(DateTimeZone.UTC).isEqual(localValue.date.withZone(DateTimeZone.UTC))) From 64c65a6af8756b8a0fbdf3b354f2b1cc0e8364e5 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 17 Nov 2016 23:43:18 -0200 Subject: [PATCH 137/268] Smart and lazy s3 list --- .../core/jobs/utils/SparkContextUtils.scala | 213 +++++++++--------- 1 file changed, 112 insertions(+), 101 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index dddd51a6..b1994e29 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -24,6 +24,7 @@ import scala.reflect.ClassTag import scala.util.{Failure, Success, Try} import scala.util.control.NonFatal import ignition.core.utils.ExceptionUtils._ +import ignition.core.utils.CollectionUtils._ import org.slf4j.LoggerFactory @@ -47,8 +48,24 @@ object SparkContextUtils { } } + object S3SplittedPath { + val s3Pattern = "s3[an]?://([^/]+)(.+)".r + + def from(fullPath: String): Option[S3SplittedPath] = + fullPath match { + case s3Pattern(bucket, prefix) => Option(S3SplittedPath(bucket, prefix.dropWhile(_ == '/'))) + case _ => None + } + } + + case class S3SplittedPath(bucket: String, key: String) { + def join: String = s"s3a://$bucket/$key" + } + case class HadoopFile(path: String, isDir: Boolean, size: Long) + case class WithOptDate[E](date: Option[DateTime], value: E) + implicit class SparkContextImprovements(sc: SparkContext) { private lazy val logger = LoggerFactory.getLogger(getClass) @@ -353,15 +370,6 @@ object SparkContextUtils { union } - def parallelListAndReadTextFiles(paths: List[String], - maxBytesPerPartition: Long, - minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) - (implicit dateExtractor: PathDateExtractor): RDD[String] = { - val foundFiles = paths.flatMap(smartList(_)) - parallelReadTextFiles(foundFiles, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, sizeBasedFileHandling = sizeBasedFileHandling) - } - def 
parallelReadTextFiles(files: List[HadoopFile], maxBytesPerPartition: Long = 128 * 1000 * 1000, minPartitions: Int = 100, @@ -446,38 +454,42 @@ object SparkContextUtils { innerListFiles(List(HadoopFile(path, isDir = true, 0))) } - def s3ListCommonPrefixes(bucket: String, prefix: String, delimiter: String = "/") - (implicit s3: AmazonS3Client): Stream[String] = { + def s3ListCommonPrefixes(path: S3SplittedPath, delimiter: String = "/") + (implicit s3: AmazonS3Client): Stream[S3SplittedPath] = { def inner(current: ObjectListing): Stream[String] = - if (current.isTruncated) + if (current.isTruncated) { + logger.trace(s"list common prefixed truncated for ${path.bucket} ${path.key}: ${current.getCommonPrefixes}") current.getCommonPrefixes.toStream ++ inner(s3.listNextBatchOfObjects(current)) - else + } else { + logger.trace(s"list common prefixed finished for ${path.bucket} ${path.key}: ${current.getCommonPrefixes}") current.getCommonPrefixes.toStream + } - val request = new ListObjectsRequest(bucket, prefix, null, delimiter, 1000) - inner(s3.listObjects(request)) + val request = new ListObjectsRequest(path.bucket, path.key, null, delimiter, 1000) + inner(s3.listObjects(request)).map(prefix => path.copy(key = prefix)) } - def s3ListObjects(bucket: String, prefix: String) + def s3ListObjects(path: S3SplittedPath) (implicit s3: AmazonS3Client): Stream[S3ObjectSummary] = { def inner(current: ObjectListing): Stream[S3ObjectSummary] = - if (current.isTruncated) + if (current.isTruncated) { + logger.trace(s"list objects truncated for ${path.bucket} ${path.key}: $current") current.getObjectSummaries.toStream ++ inner(s3.listNextBatchOfObjects(current)) - else + } else { + logger.trace(s"list objects finished for ${path.bucket} ${path.key}") current.getObjectSummaries.toStream + } - inner(s3.listObjects(bucket, prefix)) + inner(s3.listObjects(path.bucket, path.key)) } - def s3NarrowPaths(bucket: String, - prefix: String, - delimiter: String = "/", + def 
s3NarrowPaths(splittedPath: S3SplittedPath, inclusiveStartDate: Boolean = true, startDate: Option[DateTime] = None, inclusiveEndDate: Boolean = true, endDate: Option[DateTime] = None, ignoreHours: Boolean = true) - (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): Stream[String] = { + (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): Stream[WithOptDate[S3SplittedPath]] = { def isGoodDate(date: DateTime): Boolean = { val startDateToCompare = startDate.map(date => if (ignoreHours) date.withTimeAtStartOfDay() else date) @@ -487,49 +499,54 @@ object SparkContextUtils { goodStartDate && goodEndDate } - def classifyPath(path: String): Either[String, (String, DateTime)] = - Try(pathDateExtractor.extractFromPath(s"s3a://$bucket/$path")) match { + def classifyPath(path: S3SplittedPath): Either[S3SplittedPath, (S3SplittedPath, DateTime)] = + Try(pathDateExtractor.extractFromPath(path.join)) match { case Success(date) => Right(path -> date) case Failure(_) => Left(path) } - val commonPrefixes = s3ListCommonPrefixes(bucket, prefix, delimiter).map(classifyPath) + val commonPrefixes = s3ListCommonPrefixes(splittedPath).map(classifyPath) + logger.trace(s"s3NarrowPaths for $splittedPath, common prefixes: $commonPrefixes") if (commonPrefixes.isEmpty) - Stream(s"s3a://$bucket/$prefix") + Stream(WithOptDate(None, splittedPath)) else commonPrefixes.toStream.flatMap { - case Left(prefixWithoutDate) => s3NarrowPaths(bucket, prefixWithoutDate, delimiter, inclusiveStartDate, startDate, inclusiveEndDate, endDate, ignoreHours) - case Right((prefixWithDate, date)) if isGoodDate(date) => Stream(s"s3a://$bucket/$prefixWithDate") + case Left(prefixWithoutDate) => + logger.trace(s"s3NarrowPaths prefixWithoutDate: $prefixWithoutDate") + s3NarrowPaths(prefixWithoutDate, inclusiveStartDate, startDate, inclusiveEndDate, endDate, ignoreHours) + case Right((prefixWithDate, date)) if isGoodDate(date) => Stream(WithOptDate(Option(date), prefixWithDate)) case 
Right(_) => Stream.empty } } - private def s3List(path: String, - inclusiveStartDate: Boolean, - startDate: Option[DateTime], - inclusiveEndDate: Boolean, - endDate: Option[DateTime], - exclusionPattern: Option[String]) - (implicit s3: AmazonS3Client, dateExtractor: PathDateExtractor): Stream[S3ObjectSummary] = { + // Sorted from most recent to least recent path + private def sortPaths[P](paths: Stream[WithOptDate[P]]): Stream[WithOptDate[P]] = { + paths.sortBy { p => p.date.getOrElse(new DateTime(1970, 1, 1, 1, 1)) }(Ordering[DateTime].reverse) + } - val s3Pattern = "s3[an]?://([^/]+)(.+)".r + private def sortedS3List(path: String, + inclusiveStartDate: Boolean, + startDate: Option[DateTime], + inclusiveEndDate: Boolean, + endDate: Option[DateTime], + exclusionPattern: Option[String]) + (implicit s3: AmazonS3Client, dateExtractor: PathDateExtractor): Stream[WithOptDate[Array[S3ObjectSummary]]] = { - def extractBucketAndPrefix(path: String): Option[(String, String)] = path match { - case s3Pattern(bucket, prefix) => Option(bucket -> prefix.dropWhile(_ == '/')) - case _ => None - } - extractBucketAndPrefix(path) match { - case Some((pathBucket, pathPrefix)) => - s3NarrowPaths(pathBucket, pathPrefix, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, - startDate = startDate, endDate = endDate).flatMap(extractBucketAndPrefix).flatMap { - case (bucket, prefix) => s3ListObjects(bucket, prefix) - } + S3SplittedPath.from(path) match { + case Some(splittedPath) => + val prefixes: Stream[WithOptDate[S3SplittedPath]] = + s3NarrowPaths(splittedPath, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, + startDate = startDate, endDate = endDate) + + sortPaths(prefixes) + .map { case WithOptDate(date, path) => WithOptDate(date, s3ListObjects(path).toArray) } // Will list the most recent path first and only if needed the others case _ => Stream.empty } } + def listAndFilterFiles(path: String, requireSuccess: Boolean = false, 
inclusiveStartDate: Boolean = true, @@ -546,85 +563,79 @@ object SparkContextUtils { def isSuccessFile(file: HadoopFile): Boolean = file.path.endsWith("_SUCCESS") || file.path.endsWith("_FINISHED") - def extractDateFromFile(file: HadoopFile): Option[DateTime] = - Try(dateExtractor.extractFromPath(file.path)).toOption + def excludePatternValidation(file: HadoopFile): Boolean = + exclusionPattern.map(pattern => !file.path.matches(pattern)).getOrElse(true) - def excludePatternValidation(file: HadoopFile): Option[HadoopFile] = - exclusionPattern match { - case Some(pattern) if file.path.matches(pattern) => None - case Some(_) | None => Option(file) - } + def endsWithValidation(file: HadoopFile): Boolean = + endsWith.map { pattern => + file.path.endsWith(pattern) || isSuccessFile(file) + }.getOrElse(true) - def endsWithValidation(file: HadoopFile): Option[HadoopFile] = - endsWith match { - case Some(pattern) if file.path.endsWith(pattern) => Option(file) - case Some(_) if isSuccessFile(file) => Option(file) - case Some(_) => None - case None => Option(file) - } - - def applyPredicate(file: HadoopFile): Option[HadoopFile] = - if (predicate(file)) Option(file) else None - - def dateValidation(file: HadoopFile): Option[HadoopFile] = { - val tryDate = extractDateFromFile(file) + def dateValidation(tryDate: Option[DateTime]): Boolean = { if (tryDate.isEmpty && ignoreMalformedDates) - Option(file) + true else { val date = tryDate.get val goodStartDate = startDate.isEmpty || (inclusiveStartDate && date.saneEqual(startDate.get) || date.isAfter(startDate.get)) - val goodEndDate = endDate.isEmpty || (inclusiveEndDate && date.saneEqual(endDate.get) || date.isBefore(endDate.get)) - if (goodStartDate && goodEndDate) Option(file) else None + def goodEndDate = endDate.isEmpty || (inclusiveEndDate && date.saneEqual(endDate.get) || date.isBefore(endDate.get)) + goodStartDate && goodEndDate } } - val preValidations: HadoopFile => Boolean = hadoopFile => { - val validatedFile = for { - _ 
<- excludePatternValidation(hadoopFile) - _ <- endsWithValidation(hadoopFile) - _ <- dateValidation(hadoopFile) - valid <- applyPredicate(hadoopFile) - } yield valid - validatedFile.isDefined + def successFileValidation(files: WithOptDate[Array[HadoopFile]]): Boolean = { + if (requireSuccess) + files.value.exists(isSuccessFile) + else + true } - val preFilteredFiles = smartList(path, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, - startDate = startDate, endDate = endDate, exclusionPattern = exclusionPattern).filter(preValidations) - - val filesByDate = preFilteredFiles.groupBy(extractDateFromFile).collect { - case (date, files) => date.getOrElse(new DateTime(1970, 1, 1, 1, 1)) -> files + def preValidations(files: WithOptDate[Array[HadoopFile]]): Option[WithOptDate[Array[HadoopFile]]] = { + if (!dateValidation(files.date) || !successFileValidation(files)) + None + else { + val filtered = files.copy(value = files.value + .filter(excludePatternValidation).filter(endsWithValidation).filter(predicate)) + if (filtered.value.isEmpty) + None + else + Option(filtered) + } } - val posFilteredFiles = - if (requireSuccess) - filesByDate.filter { case (_, files) => files.exists(isSuccessFile) } - else - filesByDate + val groupedAndSortedByDateFiles = sortedSmartList(path, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, + startDate = startDate, endDate = endDate, exclusionPattern = exclusionPattern).flatMap(preValidations) val allFiles = if (lastN.isDefined) - posFilteredFiles.toList.sortBy(_._1).reverse.take(lastN.get).flatMap(_._2) + groupedAndSortedByDateFiles.take(lastN.get).flatMap(_.value) else - posFilteredFiles.toList.flatMap(_._2) + groupedAndSortedByDateFiles.flatMap(_.value) - allFiles.sortBy(_.path) + allFiles.sortBy(_.path).toList } - def smartList(path: String, - inclusiveStartDate: Boolean = false, - startDate: Option[DateTime] = None, - inclusiveEndDate: Boolean = false, - endDate: Option[DateTime] = 
None, - exclusionPattern: Option[String] = None)(implicit pathDateExtractor: PathDateExtractor): Stream[HadoopFile] = { + def sortedSmartList(path: String, + inclusiveStartDate: Boolean = false, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = false, + endDate: Option[DateTime] = None, + exclusionPattern: Option[String] = None)(implicit pathDateExtractor: PathDateExtractor): Stream[WithOptDate[Array[HadoopFile]]] = { def toHadoopFile(s3Object: S3ObjectSummary): HadoopFile = HadoopFile(s"s3a://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) - def listPath(path: String): Stream[HadoopFile] = { + def listPath(path: String): Stream[WithOptDate[Array[HadoopFile]]] = { if (path.startsWith("s3")) { - s3List(path, inclusiveStartDate = inclusiveStartDate, startDate = startDate, inclusiveEndDate = inclusiveEndDate, - endDate = endDate, exclusionPattern = exclusionPattern)(amazonS3ClientFromEnvironmentVariables, pathDateExtractor).map(toHadoopFile) + sortedS3List(path, inclusiveStartDate = inclusiveStartDate, startDate = startDate, inclusiveEndDate = inclusiveEndDate, + endDate = endDate, exclusionPattern = exclusionPattern)(amazonS3ClientFromEnvironmentVariables, pathDateExtractor).map { + case WithOptDate(date, paths) => WithOptDate(date, paths.map(toHadoopFile).toArray) + } } else { - driverListFiles(path).toStream + val pathsWithDate: Stream[WithOptDate[Iterable[HadoopFile]]] = driverListFiles(path) + .map(p => (Try { pathDateExtractor.extractFromPath(p.path) }.toOption, p)) + .groupByKey() + .map { case (date, path) => WithOptDate(date, path) } + .toStream + sortPaths(pathsWithDate).map { case WithOptDate(date, paths) => WithOptDate(date, paths.toArray) } } } From 5881afe87e7d01f7576000b34b79025e4c5c40dc Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Sat, 19 Nov 2016 19:59:15 -0200 Subject: [PATCH 138/268] Removed custom boto because having 2 boto versions is confusing and unnecessary --- tools/spark-ec2/spark_ec2.py | 54 ------------------------------------ 1 file changed, 54 deletions(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 909c284c..79e81484 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -107,60 +107,6 @@ DEFAULT_SPARK_EC2_BRANCH = "branch-2.0" -def setup_external_libs(libs): - """ - Download external libraries from PyPI to SPARK_EC2_DIR/lib/ and prepend them to our PATH. - """ - PYPI_URL_PREFIX = "https://pypi.python.org/packages/source" - SPARK_EC2_LIB_DIR = os.path.join(SPARK_EC2_DIR, "lib") - - if not os.path.exists(SPARK_EC2_LIB_DIR): - print("Downloading external libraries that spark-ec2 needs from PyPI to {path}...".format( - path=SPARK_EC2_LIB_DIR - )) - print("This should be a one-time operation.") - os.mkdir(SPARK_EC2_LIB_DIR) - - for lib in libs: - versioned_lib_name = "{n}-{v}".format(n=lib["name"], v=lib["version"]) - lib_dir = os.path.join(SPARK_EC2_LIB_DIR, versioned_lib_name) - - if not os.path.isdir(lib_dir): - tgz_file_path = os.path.join(SPARK_EC2_LIB_DIR, versioned_lib_name + ".tar.gz") - print(" - Downloading {lib}...".format(lib=lib["name"])) - download_stream = urlopen( - "{prefix}/{first_letter}/{lib_name}/{lib_name}-{lib_version}.tar.gz".format( - prefix=PYPI_URL_PREFIX, - first_letter=lib["name"][:1], - lib_name=lib["name"], - lib_version=lib["version"] - ) - ) - with open(tgz_file_path, "wb") as tgz_file: - tgz_file.write(download_stream.read()) - with open(tgz_file_path, "rb") as tar: - if hashlib.md5(tar.read()).hexdigest() != lib["md5"]: - print("ERROR: Got wrong md5sum for {lib}.".format(lib=lib["name"]), file=stderr) - sys.exit(1) - tar = tarfile.open(tgz_file_path) - tar.extractall(path=SPARK_EC2_LIB_DIR) - tar.close() - os.remove(tgz_file_path) - print(" - Finished downloading 
{lib}.".format(lib=lib["name"])) - sys.path.insert(1, lib_dir) - - -# Only PyPI libraries are supported. -external_libs = [ - { - "name": "boto", - "version": "2.34.0", - "md5": "5556223d2d0cc4d06dd4829e671dcecd" - } -] - -setup_external_libs(external_libs) - import boto from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType from boto import ec2 From 002c38f9fd0ef013c3d8ba7f4fdd981029133645 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Sun, 20 Nov 2016 13:21:14 -0200 Subject: [PATCH 139/268] fix date filter --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index b1994e29..8fec24dd 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -574,12 +574,13 @@ object SparkContextUtils { def dateValidation(tryDate: Option[DateTime]): Boolean = { if (tryDate.isEmpty && ignoreMalformedDates) true - else { + else if (tryDate.isDefined) { val date = tryDate.get val goodStartDate = startDate.isEmpty || (inclusiveStartDate && date.saneEqual(startDate.get) || date.isAfter(startDate.get)) def goodEndDate = endDate.isEmpty || (inclusiveEndDate && date.saneEqual(endDate.get) || date.isBefore(endDate.get)) goodStartDate && goodEndDate - } + } else + false } def successFileValidation(files: WithOptDate[Array[HadoopFile]]): Boolean = { From 31ace49364059b7b2ec6e6d400de35841a06aef2 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Mon, 21 Nov 2016 17:08:20 -0200 Subject: [PATCH 140/268] Fix date validation for paths without files --- .../core/jobs/utils/SparkContextUtils.scala | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 8fec24dd..d301a0a8 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -509,7 +509,7 @@ object SparkContextUtils { logger.trace(s"s3NarrowPaths for $splittedPath, common prefixes: $commonPrefixes") if (commonPrefixes.isEmpty) - Stream(WithOptDate(None, splittedPath)) + Stream(WithOptDate(Try(pathDateExtractor.extractFromPath(splittedPath.join)).toOption, splittedPath)) else commonPrefixes.toStream.flatMap { case Left(prefixWithoutDate) => @@ -571,16 +571,18 @@ object SparkContextUtils { file.path.endsWith(pattern) || isSuccessFile(file) }.getOrElse(true) - def dateValidation(tryDate: Option[DateTime]): Boolean = { + def dateValidation(files: WithOptDate[Array[HadoopFile]]): Boolean = { + val tryDate = files.date if (tryDate.isEmpty && ignoreMalformedDates) true - else if (tryDate.isDefined) { + else if (tryDate.isEmpty) + throw new Exception(s"Not date found for path $path, expanded files: ${files.value.toList}, consider using ignoreMalformedDates=true if not date is expected on this path") + else { val date = tryDate.get val goodStartDate = startDate.isEmpty || (inclusiveStartDate && date.saneEqual(startDate.get) || date.isAfter(startDate.get)) def goodEndDate = endDate.isEmpty || (inclusiveEndDate && date.saneEqual(endDate.get) || date.isBefore(endDate.get)) goodStartDate && goodEndDate - } else - false + } } def successFileValidation(files: WithOptDate[Array[HadoopFile]]): Boolean = { @@ -591,7 +593,7 @@ object SparkContextUtils { } def preValidations(files: 
WithOptDate[Array[HadoopFile]]): Option[WithOptDate[Array[HadoopFile]]] = { - if (!dateValidation(files.date) || !successFileValidation(files)) + if (!dateValidation(files) || !successFileValidation(files)) None else { val filtered = files.copy(value = files.value From 4ecd942b1150d303dc42eac2ae13dff563a58591 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 22 Nov 2016 17:08:50 -0200 Subject: [PATCH 141/268] Performs date validation only if there are files to be validated --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index d301a0a8..1de12dd6 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -593,12 +593,12 @@ object SparkContextUtils { } def preValidations(files: WithOptDate[Array[HadoopFile]]): Option[WithOptDate[Array[HadoopFile]]] = { - if (!dateValidation(files) || !successFileValidation(files)) + if (!successFileValidation(files)) None else { val filtered = files.copy(value = files.value .filter(excludePatternValidation).filter(endsWithValidation).filter(predicate)) - if (filtered.value.isEmpty) + if (filtered.value.isEmpty || !dateValidation(filtered)) None else Option(filtered) From 6e1cb5e2c4fd07ab20890636853d95b6b8ff0189 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 24 Nov 2016 11:30:39 -0200 Subject: [PATCH 142/268] to string for IntBag --- src/main/scala/ignition/core/utils/IntBag.scala | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/utils/IntBag.scala b/src/main/scala/ignition/core/utils/IntBag.scala index 38cb3836..a53d2d8f 100644 --- a/src/main/scala/ignition/core/utils/IntBag.scala +++ b/src/main/scala/ignition/core/utils/IntBag.scala 
@@ -35,12 +35,14 @@ case class IntBag(histogram: collection.Map[Long, Long]) { } } + def count: Long = histogram.values.sum + + def sum: Long = histogram.map { case (k, f) => k * f }.sum + def avg: Option[Long] = { - if (histogram.nonEmpty) { - val sum = histogram.map { case (k, f) => k * f }.sum - val count = histogram.values.sum + if (histogram.nonEmpty) Option(sum / count) - } else + else None } @@ -51,4 +53,7 @@ case class IntBag(histogram: collection.Map[Long, Long]) { def max: Option[Long] = { histogram.keys.maxOption } + + override def toString: String = s"IntBag(median=$median, count=$count, sum=$sum, avg=$avg, min=$min, max=$max)" + } From 135a753751524d62050d13f03b2e464400e69311 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 29 Nov 2016 17:11:50 -0200 Subject: [PATCH 143/268] Avoid checking too early the job --- remote_hook.sh | 8 ++++++-- tools/cluster.py | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 688bfbc1..3635951e 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -20,12 +20,16 @@ MY_USER=$(whoami) # Avoids problems when another user created our control dir sudo mkdir -p "${JOB_CONTROL_DIR}" sudo chown $MY_USER "${JOB_CONTROL_DIR}" -sudo chmod -R o+rx /root - RUNNING_FILE="${JOB_CONTROL_DIR}/RUNNING" +# This should be the first thing in the script to avoid the wait remote job thinking we died echo $$ > "${RUNNING_FILE}" + + +# Let us read the spark home even when the image doesn't give us the permission +sudo chmod -R o+rx /root + notify_error_and_exit() { description="${1}" echo "Exiting because: ${description}" diff --git a/tools/cluster.py b/tools/cluster.py index 7d77e8c4..080dcedc 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -451,6 +451,7 @@ def job_run(cluster_name, job_name, job_mem, ssh_call(user=remote_user, host=master, key_file=key_file, args=[tmux_arg], allocate_terminal=True) if wait_completion: + time.sleep(5) # wait job to set up before 
checking it failed = False failed_exception = None try: From 8b91ee353716ba4db6efba0612d5c2f2206d881c Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 7 Dec 2016 16:09:46 -0200 Subject: [PATCH 144/268] Updated spark --- build.sbt | 2 +- tools/cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index c0f4bf77..ad80612f 100644 --- a/build.sbt +++ b/build.sbt @@ -9,7 +9,7 @@ scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warning // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.1" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.2" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") diff --git a/tools/cluster.py b/tools/cluster.py index 080dcedc..c2762f3a 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,7 +49,7 @@ default_ami = None # will be decided based on spark-ec2 list default_master_ami = None default_env = 'dev' -default_spark_version = '2.0.1' +default_spark_version = '2.0.2' custom_builds = { # '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' } From f712f0650c5e19533f75c1acb5282b717baee455 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 7 Dec 2016 16:28:28 -0200 Subject: [PATCH 145/268] Added spark 2.0.2 --- tools/spark-ec2/spark_ec2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 79e81484..1b405d47 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -78,6 +78,7 @@ "1.6.0", "2.0.0", "2.0.1", + "2.0.2", ]) SPARK_TACHYON_MAP = { From 5fb406739caf7b198214d2402e855fa4aae4ab0d Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 29 Dec 2016 19:45:00 -0200 Subject: [PATCH 146/268] Create an uri along the request to guarantee the request is valid --- .../core/http/AsyncHttpClientStreamApi.scala | 12 ++++++-- .../core/http/AsyncSprayHttpClient.scala | 30 ++++--------------- 2 files changed, 16 insertions(+), 26 deletions(-) diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index 30f46c53..131d2a05 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -4,7 +4,7 @@ import java.io.InputStream import java.util.concurrent.TimeUnit import akka.util.Timeout -import spray.http.{HttpEntity, HttpHeader, HttpMethod, HttpMethods} +import spray.http._ import scala.concurrent.Future import scala.concurrent.duration._ @@ -49,7 +49,15 @@ object AsyncHttpClientStreamApi { method: HttpMethod = HttpMethods.GET, body: HttpEntity = HttpEntity.Empty, headers: List[HttpHeader] = List.empty, - requestConfiguration: Option[RequestConfiguration] = None) + requestConfiguration: Option[RequestConfiguration] = None) { + val uri: Uri = { + // Note: This will guarantee we create a valid request (one with a valid uri). 
Will throw an exception if invalid + if (params.nonEmpty) + Uri(url).withQuery(params) + else + Uri(url) + } + } case class RequestException(message: String, response: StreamResponse) extends RuntimeException(message) diff --git a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala index 405457ea..af40c25a 100644 --- a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala +++ b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala @@ -1,13 +1,12 @@ package ignition.core.http -import java.net.URL import java.util.concurrent.TimeoutException import akka.actor._ import akka.io.IO import akka.pattern.ask import akka.util.Timeout - +import ignition.core.http.AsyncHttpClientStreamApi.{Request, RequestConfiguration} import spray.can.Http import spray.can.Http.HostConnectorSetup import spray.can.client.{ClientConnectionSettings, HostConnectorSettings} @@ -15,14 +14,10 @@ import spray.http.HttpHeaders.Authorization import spray.http.StatusCodes.Redirection import spray.http._ - -import scala.concurrent.duration._ import scala.concurrent.{ExecutionContext, Future} import scala.language.postfixOps import scala.util.control.NonFatal -import ignition.core.http.AsyncHttpClientStreamApi.{Request, RequestConfiguration} - trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { @@ -51,27 +46,15 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { case _ => false } - private def toUriString(url: String, params: Map[String, String] = Map.empty) = { - def encode(content: String) = java.net.URLEncoder.encode(content, "UTF-8") - def encodeParams = params.map { case (k, v) => s"${encode(k)}=${encode(v)}" }.mkString("&") - if (params.isEmpty) url else s"$url?${encodeParams}" - } - private implicit def toAuthHeader(credentials: AsyncHttpClientStreamApi.Credentials): List[Authorization] = List(Authorization(credentials = BasicHttpCredentials(username = credentials.user, password = 
credentials.password))) private def toSprayRequest(request: Request): HttpRequest = request match { - case Request(uri, params, Some(credentials), method, body, headers, _) if params.isEmpty => - HttpRequest(method = method, uri = request.url, headers = credentials ++ headers, entity = body) - - case Request(uri, params, Some(credentials), method, body, headers, _) => - HttpRequest(method = method, uri = toUriString(request.url, params), headers = credentials ++ headers, entity = body) - - case Request(uri, params, None, method, body, headers, _) if params.isEmpty => - HttpRequest(method = method, uri = toUriString(request.url), entity = body, headers = headers) + case Request(_, params, Some(credentials), method, body, headers, _) => + HttpRequest(method = method, uri = request.uri, headers = credentials ++ headers, entity = body) - case Request(uri, params, None, method, body, headers, _) => - HttpRequest(method = method, uri = toUriString(request.url, params), entity = body, headers = headers) + case Request(_, params, None, method, body, headers, _) => + HttpRequest(method = method, uri = request.uri, entity = body, headers = headers) } private def toSprayHostConnectorSetup(uri: Uri, conf: Option[AsyncHttpClientStreamApi.RequestConfiguration]): HostConnectorSetup = { @@ -109,8 +92,7 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { } private def executeSprayRequest(request: Request): Unit = { - val url = Uri(request.url) - val message = (toSprayRequest(request), toSprayHostConnectorSetup(url, request.requestConfiguration)) + val message = (toSprayRequest(request), toSprayHostConnectorSetup(request.uri, request.requestConfiguration)) IO(Http) ! 
message } From b2437133518175e1c8c33f654c727a2e22c7690f Mon Sep 17 00:00:00 2001 From: Fernando Date: Thu, 5 Jan 2017 17:20:41 -0200 Subject: [PATCH 147/268] request uri sanitization effort --- .../core/http/AsyncHttpClientStreamApi.scala | 32 +++++++++++-- .../http/AsyncHttpClientStreamApiSpec.scala | 47 +++++++++++++++++++ 2 files changed, 75 insertions(+), 4 deletions(-) create mode 100644 src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index 131d2a05..9760e100 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -1,17 +1,40 @@ package ignition.core.http import java.io.InputStream +import java.net.{URL, URLDecoder, URLEncoder} import java.util.concurrent.TimeUnit import akka.util.Timeout +import spray.http.Uri.Query import spray.http._ import scala.concurrent.Future import scala.concurrent.duration._ import scala.language.postfixOps - object AsyncHttpClientStreamApi { + + // Due to ancient standards, Java will encode space as + instead of using percent. 
+ // + // See: + // http://stackoverflow.com/questions/1634271/url-encoding-the-space-character-or-20 + // https://docs.oracle.com/javase/7/docs/api/java/net/URLEncoder.html#encode(java.lang.String,%20java.lang.String) + private def sanitizePathSegment(segment: String) = + URLEncoder.encode(URLDecoder.decode(segment, "UTF-8"), "UTF-8").replace("+", "%20") + + def sanitizeUrl(strUrl: String) = { + val url = new URL(strUrl) + val sanePath = url.getPath.split("/").map(sanitizePathSegment).mkString("/") + + Uri.from( + scheme = url.getProtocol, + userinfo = Option(url.getUserInfo).getOrElse(""), + host = url.getHost, + port = Seq(url.getPort, 0).max, + path = sanePath, + query = Query(Option(url.getQuery)), + fragment = Option(url.getRef)) + } case class Credentials(user: String, password: String) { def isEmpty = user.isEmpty && password.isEmpty @@ -50,12 +73,13 @@ object AsyncHttpClientStreamApi { body: HttpEntity = HttpEntity.Empty, headers: List[HttpHeader] = List.empty, requestConfiguration: Option[RequestConfiguration] = None) { - val uri: Uri = { + + def uri: Uri = { // Note: This will guarantee we create a valid request (one with a valid uri). 
Will throw an exception if invalid if (params.nonEmpty) - Uri(url).withQuery(params) + sanitizeUrl(url).withQuery(params) else - Uri(url) + sanitizeUrl(url) } } diff --git a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala new file mode 100644 index 00000000..ebb5dade --- /dev/null +++ b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala @@ -0,0 +1,47 @@ +package ignition.core.http + +import ignition.core.http.AsyncHttpClientStreamApi.Request +import org.scalatest.{FunSpec, Matchers} + +class AsyncHttpClientStreamApiSpec extends FunSpec with Matchers { + + describe(".sanitizeUrl") { + it("should percent encode url paths") { + val tests = Seq( + "http://images1.petlove.com.br/products/170301/small/Ração-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-Cães.jpg", + "http://images0.petlove.com.br/products/175408/small/Ração-Nestlé-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sensível.jpg", + "http://images3.petlove.com.br/products/171539/small/Ração-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-Cálculos-Urinários.jpg" + ) + + val expectations = Seq( + "http://images1.petlove.com.br/products/170301/small/Ra%C3%A7%C3%A3o-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-C%C3%A3es.jpg", + "http://images0.petlove.com.br/products/175408/small/Ra%C3%A7%C3%A3o-Nestl%C3%A9-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sens%C3%ADvel.jpg", + "http://images3.petlove.com.br/products/171539/small/Ra%C3%A7%C3%A3o-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-C%C3%A1lculos-Urin%C3%A1rios.jpg" + ) + + tests.zip(expectations).foreach { + case (url, expected) => AsyncHttpClientStreamApi.sanitizeUrl(url).toString shouldBe expected + } + } + + it("should not encode percent characters in url path") { + val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São 
Paulo/Qualquer%20Arquivo%20Encodado.pdf" + val sane = AsyncHttpClientStreamApi.sanitizeUrl(url).toString + sane shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" + } + + it("should encode space characters with percent in URL path") { + val url = "http://www.example.com/Pentagrama+Invertido.xml?q=blah+bleh" + val sane = AsyncHttpClientStreamApi.sanitizeUrl(url).toString + sane shouldBe "http://www.example.com/Pentagrama%20Invertido.xml?q=blah+bleh" + } + } + + describe("Request") { + it("should do the best to parse the provided uri") { + val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" + val request = Request(url) + request.uri.toString shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" + } + } +} From 2a8ff5ff167ec97f49c994b056b5f4924973026a Mon Sep 17 00:00:00 2001 From: Fernando Date: Tue, 17 Jan 2017 14:18:47 -0200 Subject: [PATCH 148/268] URLUtils --- .../core/http/AsyncHttpClientStreamApi.scala | 29 +------ .../scala/ignition/core/utils/URLUtils.scala | 38 ++++++++++ .../http/AsyncHttpClientStreamApiSpec.scala | 42 +---------- .../ignition/core/utils/URLUtilsSpec.scala | 75 +++++++++++++++++++ 4 files changed, 120 insertions(+), 64 deletions(-) create mode 100644 src/main/scala/ignition/core/utils/URLUtils.scala create mode 100644 src/test/scala/ignition/core/utils/URLUtilsSpec.scala diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index 9760e100..e95e4811 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -1,11 +1,10 @@ package ignition.core.http import java.io.InputStream -import java.net.{URL, URLDecoder, URLEncoder} import java.util.concurrent.TimeUnit 
import akka.util.Timeout -import spray.http.Uri.Query +import ignition.core.utils.URLUtils import spray.http._ import scala.concurrent.Future @@ -13,28 +12,6 @@ import scala.concurrent.duration._ import scala.language.postfixOps object AsyncHttpClientStreamApi { - - // Due to ancient standards, Java will encode space as + instead of using percent. - // - // See: - // http://stackoverflow.com/questions/1634271/url-encoding-the-space-character-or-20 - // https://docs.oracle.com/javase/7/docs/api/java/net/URLEncoder.html#encode(java.lang.String,%20java.lang.String) - private def sanitizePathSegment(segment: String) = - URLEncoder.encode(URLDecoder.decode(segment, "UTF-8"), "UTF-8").replace("+", "%20") - - def sanitizeUrl(strUrl: String) = { - val url = new URL(strUrl) - val sanePath = url.getPath.split("/").map(sanitizePathSegment).mkString("/") - - Uri.from( - scheme = url.getProtocol, - userinfo = Option(url.getUserInfo).getOrElse(""), - host = url.getHost, - port = Seq(url.getPort, 0).max, - path = sanePath, - query = Query(Option(url.getQuery)), - fragment = Option(url.getRef)) - } case class Credentials(user: String, password: String) { def isEmpty = user.isEmpty && password.isEmpty @@ -77,9 +54,9 @@ object AsyncHttpClientStreamApi { def uri: Uri = { // Note: This will guarantee we create a valid request (one with a valid uri). 
Will throw an exception if invalid if (params.nonEmpty) - sanitizeUrl(url).withQuery(params) + URLUtils.parseUri(url).withQuery(params) else - sanitizeUrl(url) + URLUtils.parseUri(url) } } diff --git a/src/main/scala/ignition/core/utils/URLUtils.scala b/src/main/scala/ignition/core/utils/URLUtils.scala new file mode 100644 index 00000000..800a3a1a --- /dev/null +++ b/src/main/scala/ignition/core/utils/URLUtils.scala @@ -0,0 +1,38 @@ +package ignition.core.utils + +import java.net.{URL, URLDecoder, URLEncoder} + +import org.apache.http.client.utils.URIBuilder +import spray.http.Uri +import spray.http.Uri.Query + +object URLUtils { + + // Due to ancient standards, Java will encode space as + instead of using percent. + // + // See: + // http://stackoverflow.com/questions/1634271/url-encoding-the-space-character-or-20 + // https://docs.oracle.com/javase/7/docs/api/java/net/URLEncoder.html#encode(java.lang.String,%20java.lang.String) + def sanitizePathSegment(segment: String) = + URLEncoder.encode(URLDecoder.decode(segment, "UTF-8"), "UTF-8").replace("+", "%20") + + def parseUri(urlStr: String): Uri = { + val url = new URL(urlStr) + val sanePath = url.getPath.split("/").map(sanitizePathSegment).mkString("/") + + Uri.from( + scheme = url.getProtocol, + userinfo = Option(url.getUserInfo).getOrElse(""), + host = url.getHost, + port = Seq(url.getPort, 0).max, + path = sanePath, + query = Query(Option(url.getQuery)), + fragment = Option(url.getRef)) + } + + def addParametersToUrl(url: String, partnerParams: Map[String, String]): String = { + val builder = new URIBuilder(url.trim) + partnerParams.foreach { case (k, v) => builder.addParameter(k, v) } + builder.build().toString + } +} diff --git a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala index ebb5dade..37accf5b 100644 --- a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala +++ 
b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala @@ -5,43 +5,9 @@ import org.scalatest.{FunSpec, Matchers} class AsyncHttpClientStreamApiSpec extends FunSpec with Matchers { - describe(".sanitizeUrl") { - it("should percent encode url paths") { - val tests = Seq( - "http://images1.petlove.com.br/products/170301/small/Ração-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-Cães.jpg", - "http://images0.petlove.com.br/products/175408/small/Ração-Nestlé-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sensível.jpg", - "http://images3.petlove.com.br/products/171539/small/Ração-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-Cálculos-Urinários.jpg" - ) - - val expectations = Seq( - "http://images1.petlove.com.br/products/170301/small/Ra%C3%A7%C3%A3o-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-C%C3%A3es.jpg", - "http://images0.petlove.com.br/products/175408/small/Ra%C3%A7%C3%A3o-Nestl%C3%A9-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sens%C3%ADvel.jpg", - "http://images3.petlove.com.br/products/171539/small/Ra%C3%A7%C3%A3o-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-C%C3%A1lculos-Urin%C3%A1rios.jpg" - ) - - tests.zip(expectations).foreach { - case (url, expected) => AsyncHttpClientStreamApi.sanitizeUrl(url).toString shouldBe expected - } - } - - it("should not encode percent characters in url path") { - val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" - val sane = AsyncHttpClientStreamApi.sanitizeUrl(url).toString - sane shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" - } - - it("should encode space characters with percent in URL path") { - val url = "http://www.example.com/Pentagrama+Invertido.xml?q=blah+bleh" - val sane = AsyncHttpClientStreamApi.sanitizeUrl(url).toString - sane shouldBe 
"http://www.example.com/Pentagrama%20Invertido.xml?q=blah+bleh" - } - } - - describe("Request") { - it("should do the best to parse the provided uri") { - val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" - val request = Request(url) - request.uri.toString shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" - } + it("should do the best to parse the provided uri") { + val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" + val request = Request(url) + request.uri.toString shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" } } diff --git a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala new file mode 100644 index 00000000..6665e3ec --- /dev/null +++ b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala @@ -0,0 +1,75 @@ +package ignition.core.utils + +import org.scalatest.{FlatSpec, Matchers} + +class URLUtilsSpec extends FlatSpec with Matchers { + + "URLUtils" should "add parameters to url with encoded params in base url and not be double encoded" in { + val baseUrl: String = "https://tracker.client.com/product=1?email=user%40mail.com" + val params = Map("cc" -> "second@mail.com") + + val result: String = URLUtils.addParametersToUrl(baseUrl, params) + result shouldEqual "https://tracker.client.com/product=1?email=user%40mail.com&cc=second%40mail.com" + } + + it should "add multiples params with the same name" in { + val baseUrl: String = "https://tracker.client.com/product=1?email=user%40mail.com&cc=second%40mail.com" + val params = Map("cc" -> "third@mail.com") + + val result: String = URLUtils.addParametersToUrl(baseUrl, params) + result shouldEqual 
"https://tracker.client.com/product=1?email=user%40mail.com&cc=second%40mail.com&cc=third%40mail.com" + } + + it should "works with Fragment in original URL" in { + + val baseUrl = "https://www.petlove.com.br/carrinho?utm_campanha=internalmkt#/add/variant_sku/310178,31012214/quantity/1?t=1" + val params: Map[String, String] = Map( + "utm_campaign" -> "abandonodecarrinho", + "utm_source" -> "chaordic-mail", + "utm_medium" -> "emailmkt", + "cc" -> "second@mail.com" + ) + + val result = URLUtils.addParametersToUrl(baseUrl, params) + + val expected = "https://www.petlove.com.br/carrinho?utm_campanha=internalmkt&utm_campaign=abandonodecarrinho&utm_source=chaordic-mail&utm_medium=emailmkt&cc=second%40mail.com#/add/variant_sku/310178,31012214/quantity/1?t=1" + + result shouldEqual expected + } + + it should "handle urls with new line character at the edges" in { + val url = "\n\t\n\thttps://www.petlove.com.br/carrinho#/add/variant_sku/3105748-1,3107615/quantity/1?t=1\n\t" + val finalUrl = URLUtils.addParametersToUrl(url, Map("test" -> "true")) + finalUrl shouldEqual "https://www.petlove.com.br/carrinho?test=true#/add/variant_sku/3105748-1,3107615/quantity/1?t=1" + } + + it should "percent encode url paths" in { + val tests = Seq( + "http://images1.petlove.com.br/products/170301/small/Ração-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-Cães.jpg", + "http://images0.petlove.com.br/products/175408/small/Ração-Nestlé-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sensível.jpg", + "http://images3.petlove.com.br/products/171539/small/Ração-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-Cálculos-Urinários.jpg" + ) + + val expectations = Seq( + "http://images1.petlove.com.br/products/170301/small/Ra%C3%A7%C3%A3o-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-C%C3%A3es.jpg", + 
"http://images0.petlove.com.br/products/175408/small/Ra%C3%A7%C3%A3o-Nestl%C3%A9-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sens%C3%ADvel.jpg", + "http://images3.petlove.com.br/products/171539/small/Ra%C3%A7%C3%A3o-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-C%C3%A1lculos-Urin%C3%A1rios.jpg" + ) + + tests.zip(expectations).foreach { + case (url, expected) => URLUtils.parseUri(url).toString shouldBe expected + } + } + + it should "not encode percent characters in url path" in { + val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" + val sane = URLUtils.parseUri(url).toString + sane shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" + } + + it should "encode space characters with percent in URL path" in { + val url = "http://www.example.com/Pentagrama+Invertido.xml?q=blah+bleh" + val sane = URLUtils.parseUri(url).toString + sane shouldBe "http://www.example.com/Pentagrama%20Invertido.xml?q=blah+bleh" + } +} From 51119b1daad3969d6116cf8d2bff05e0276754aa Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 2 Feb 2017 15:17:17 -0200 Subject: [PATCH 149/268] Replace spark-ec2 by flintrock as cluster launcher --- .gitmodules | 4 + remote_hook.sh | 8 +- tools/cluster.py | 188 +- tools/flintrock | 1 + tools/spark-ec2/README | 4 - .../root/spark-ec2/ec2-variables.sh | 35 - tools/spark-ec2/spark-ec2 | 22 - tools/spark-ec2/spark_ec2.py | 1593 ----------------- tools/utils.py | 6 +- 9 files changed, 120 insertions(+), 1741 deletions(-) create mode 100644 .gitmodules create mode 160000 tools/flintrock delete mode 100644 tools/spark-ec2/README delete mode 100644 tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh delete mode 100755 tools/spark-ec2/spark-ec2 delete mode 100755 tools/spark-ec2/spark_ec2.py diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..35ab3b28 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "tools/flintrock"] + path = tools/flintrock + url = git@github.com:chaordic/flintrock.git + branch = ignition_v1 diff --git a/remote_hook.sh b/remote_hook.sh index 3635951e..86f1f56b 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -86,7 +86,9 @@ JAR_PATH="${JOB_CONTROL_DIR}/Ignition.jar" cp ${JAR_PATH_SRC} ${JAR_PATH} -export JOB_MASTER=${MASTER} +# If no $MASTER, then build a url using $SPARK_MASTER_HOST +export JOB_MASTER=${MASTER:-spark://${SPARK_MASTER_HOST}:7077} + if [[ "${USE_YARN}" == "yes" ]]; then export YARN_MODE=true @@ -97,13 +99,13 @@ if [[ "${USE_YARN}" == "yes" ]]; then fi if [[ "${JOB_NAME}" == "shell" ]]; then - sudo -E ${SPARK_HOME}/bin/spark-shell --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" + sudo -E ${SPARK_HOME}/bin/spark-shell --master "${JOB_MASTER}" --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc 
-XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" elif [[ "${JOB_NAME}" == "zeppelin" ]]; then install_and_run_zeppelin else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" diff --git a/tools/cluster.py b/tools/cluster.py index c2762f3a..4a99a214 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -12,9 +12,8 @@ from argh.decorators import named, arg import subprocess from subprocess import check_output, check_call -from itertools import chain from utils import tag_instances, get_masters, get_active_nodes -from utils import check_call_with_timeout, ProcessTimeoutException +from utils import check_call_with_timeout import os import sys from datetime import datetime @@ -40,32 +39,29 @@ default_spot_price = '0.10' default_worker_instances = '1' default_executor_instances = '1' -default_master_instance_type = 'm3.xlarge' 
+default_master_instance_type = '' default_driver_heap_size = '12G' default_region = 'us-east-1' default_zone = default_region + 'b' default_key_id = 'ignition_key' default_key_file = os.path.expanduser('~/.ssh/ignition_key.pem') -default_ami = None # will be decided based on spark-ec2 list -default_master_ami = None +default_ami = 'ami-611e7976' +default_master_ami = '' default_env = 'dev' default_spark_version = '2.0.2' -custom_builds = { -# '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' -} -default_spark_repo = 'https://github.com/chaordic/spark' +default_hdfs_version = '2.7.2' +default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' +default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' +default_installation_user = 'root' default_remote_control_dir = '/tmp/Ignition' default_collect_results_dir = '/tmp' default_user_data = os.path.join(script_path, 'scripts', 'noop') default_defaults_filename = 'cluster_defaults.json' -default_spark_ec2_git_repo = 'https://github.com/chaordic/spark-ec2' -default_spark_ec2_git_branch = 'branch-2.0' - master_post_create_commands = [ - 'sudo', 'yum', '-y', 'install', 'tmux' + ['sudo', 'yum', '-y', 'install', 'tmux'], ] @@ -130,17 +126,19 @@ def ssh_call(user, host, key_file, args=(), allocate_terminal=True, get_output=F else: return logged_call(base) +def ec2_script_base_path(): + return os.path.join(script_path, 'flintrock') def chdir_to_ec2_script_and_get_path(): - ec2_script_base = os.path.join(script_path, 'spark-ec2') + ec2_script_base = ec2_script_base_path() os.chdir(ec2_script_base) - ec2_script_path = os.path.join(ec2_script_base, 'spark_ec2.py') + ec2_script_path = os.path.join(ec2_script_base, 'standalone.py') return ec2_script_path def call_ec2_script(args, timeout_total_minutes, timeout_inactivity_minutes): ec2_script_path = 
chdir_to_ec2_script_and_get_path() - return check_call_with_timeout(['/usr/bin/env', 'python', '-u', + return check_call_with_timeout(['/usr/bin/env', 'python3', '-u', ec2_script_path] + args, timeout_total_minutes=timeout_total_minutes, timeout_inactivity_minutes=timeout_inactivity_minutes) @@ -207,102 +205,101 @@ def launch(cluster_name, slaves, tag=[], key_id=default_key_id, region=default_region, zone=default_zone, instance_type=default_instance_type, - ondemand=False, spot_price=default_spot_price, master_spot=False, + # TODO: implement it in flintrock + ondemand=False, + spot_price=default_spot_price, + # TODO: implement it in flintrock + master_spot=False, user_data=default_user_data, - security_group = None, - vpc = None, - vpc_subnet = None, + security_group=None, + vpc=None, + vpc_subnet=None, + # TODO: consider implementing in flintrock master_instance_type=default_master_instance_type, - wait_time='180', hadoop_major_version='2', - worker_instances=default_worker_instances, executor_instances=default_executor_instances, retries_on_same_cluster=5, max_clusters_to_create=5, minimum_percentage_healthy_slaves=0.9, remote_user=default_remote_user, + installation_user=default_installation_user, script_timeout_total_minutes=55, script_timeout_inactivity_minutes=10, - resume=False, just_ignore_existing=False, worker_timeout=240, - spark_repo=default_spark_repo, + just_ignore_existing=False, + spark_download_source=default_spark_download_source, spark_version=default_spark_version, - spark_ec2_git_repo=default_spark_ec2_git_repo, - spark_ec2_git_branch=default_spark_ec2_git_branch, - ami=default_ami, master_ami=default_master_ami, + hdfs_download_source=default_hdfs_download_source, + hdfs_version=default_hdfs_version, + ami=default_ami, + # TODO: consider implementing in flintrock + master_ami=default_master_ami, instance_profile_name=None): + assert not master_instance_type or master_instance_type == instance_type, 'Different master instance type is currently 
unsupported' + assert not master_ami or master_ami == ami, 'Different master ami is currently unsupported' + assert not ondemand, 'On demand is unsupported' + assert master_spot, 'On demand master is currently unsupported' + all_args = locals() - if cluster_exists(cluster_name, region=region) and not resume: + if cluster_exists(cluster_name, region=region): if just_ignore_existing: log.info('Cluster exists but that is ok') return '' else: - raise CommandError('Cluster already exists, pick another name or resume the setup using --resume') + raise CommandError('Cluster already exists, pick another name') for j in range(max_clusters_to_create): log.info('Creating new cluster {0}, try {1}'.format(cluster_name, j+1)) success = False - resume_param = ['--resume'] if resume else [] auth_params = [] - if security_group: - auth_params.extend([ - '--authorized-address', '127.0.0.1/32', - '--additional-security-group', security_group - ]) # '--vpc-id', default_vpc, # '--subnet-id', default_vpc_subnet, if vpc and vpc_subnet: auth_params.extend([ - '--vpc-id', vpc, - '--subnet-id', vpc_subnet, + '--ec2-vpc-id', vpc, + '--ec2-subnet-id', vpc_subnet, ]) - spot_params = ['--spot-price', spot_price] if not ondemand else [] - master_spot_params = ['--master-spot'] if not ondemand and master_spot else [] - - ami_params = ['--ami', ami] if ami else [] - master_ami_params = ['--master-ami', master_ami] if master_ami else [] + spot_params = ['--ec2-spot-price', spot_price] if not ondemand else [] + #master_spot_params = ['--master-spot'] if not ondemand and master_spot else [] - iam_params = ['--instance-profile-name', instance_profile_name] if instance_profile_name else [] + ami_params = ['--ec2-ami', ami] if ami else [] + #master_ami_params = ['--master-ami', master_ami] if master_ami else [] - spark_version = custom_builds.get(spark_version, spark_version) + iam_params = ['--ec2-instance-profile-name', instance_profile_name] if instance_profile_name else [] for i in 
range(retries_on_same_cluster): log.info('Running script, try %d of %d', i + 1, retries_on_same_cluster) try: - call_ec2_script(['--identity-file', key_file, - '--key-pair', key_id, - '--slaves', slaves, - '--region', region, - '--zone', zone, - '--instance-type', instance_type, - '--master-instance-type', master_instance_type, - '--wait', wait_time, - '--hadoop-major-version', hadoop_major_version, - '--spark-ec2-git-repo', spark_ec2_git_repo, - '--spark-ec2-git-branch', spark_ec2_git_branch, - '--worker-instances', worker_instances, - '--executor-instances', executor_instances, - '--master-opts', '-Dspark.worker.timeout={0}'.format(worker_timeout), - '--spark-git-repo', spark_repo, - '-v', spark_version, - '--user-data', user_data, - 'launch', cluster_name] + + call_ec2_script(['--debug', + 'launch', + '--ec2-identity-file', key_file, + '--ec2-key-name', key_id, + '--num-slaves', slaves, + '--ec2-region', region, + '--ec2-availability-zone', zone, + '--ec2-instance-type', instance_type, + '--assume-yes', + '--install-spark', + '--install-hdfs', + '--spark-version', spark_version, + '--hdfs-version', hdfs_version, + '--spark-download-source', spark_download_source, + '--hdfs-download-source', hdfs_download_source, + '--spark-executor-instances', executor_instances, + '--ec2-security-group', security_group, + '--ec2-user', installation_user, + '--ec2-user-data', user_data, + cluster_name] + spot_params + - master_spot_params + - resume_param + auth_params + ami_params + - master_ami_params + iam_params, timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) success = True - except subprocess.CalledProcessError as e: - resume_param = ['--resume'] - log.warn('Failed with: %s', e) except Exception as e: # Probably a timeout log.exception('Fatal error calling EC2 script') @@ -318,7 +315,8 @@ def launch(cluster_name, slaves, master = get_master(cluster_name, region=region) save_cluster_args(master, key_file, 
remote_user, all_args) health_check(cluster_name=cluster_name, key_file=key_file, master=master, remote_user=remote_user, region=region) - ssh_call(user=remote_user, host=master, key_file=key_file, args=master_post_create_commands) + for command in master_post_create_commands: + ssh_call(user=remote_user, host=master, key_file=key_file, args=command) return master except Exception as e: log.exception('Got exception on last steps of cluster configuration') @@ -328,16 +326,22 @@ def launch(cluster_name, slaves, def destroy(cluster_name, delete_groups=False, region=default_region): - delete_sg_param = ['--delete-groups'] if delete_groups else [] + assert not delete_groups, 'Delete groups is deprecated and unsupported' + masters, slaves = get_active_nodes(cluster_name, region=region) - ec2_script_path = chdir_to_ec2_script_and_get_path() - p = subprocess.Popen(['/usr/bin/env', 'python', '-u', - ec2_script_path, - 'destroy', cluster_name, - '--region', region] + delete_sg_param, - stdin=subprocess.PIPE, - stdout=sys.stdout, universal_newlines=True) - p.communicate('y') + all_instances = masters + slaves + if all_instances: + log.info('The following instances will be terminated:') + for i in all_instances: + log.info('-> %s' % i.public_dns_name) + + log.info('Terminating master...') + for i in masters: + i.terminate() + log.info('Terminating slaves...') + for i in slaves: + i.terminate() + log.info('Done.') def get_master(cluster_name, region=default_region): @@ -388,7 +392,6 @@ def job_run(cluster_name, job_name, job_mem, remote_control_dir = default_remote_control_dir, remote_path=None, master=None, disable_assembly_build=False, - run_tests=False, kill_on_failure=False, destroy_cluster=False, region=default_region, @@ -403,7 +406,6 @@ def job_run(cluster_name, job_name, job_mem, project_path = get_project_path() project_name = os.path.basename(project_path) - module_name = os.path.basename(get_module_path()) # Use job user on remote path to avoid too many conflicts for 
different local users remote_path = remote_path or '/home/%s/%s.%s' % (default_remote_user, job_user, project_name) remote_hook_local = '{module_path}/remote_hook.sh'.format(module_path=get_module_path()) @@ -517,6 +519,8 @@ def health_check(cluster_name, key_file=default_key_file, master=None, remote_us masters, slaves = get_active_nodes(cluster_name, region=region) if nslaves == 0 or float(len(slaves)) / nslaves < minimum_percentage_healthy_slaves: raise NotHealthyCluster('Not enough healthy slaves: {0}/{1}'.format(len(slaves), nslaves)) + if not masters: + raise NotHealthyCluster('No master found') except NotHealthyCluster, e: raise e except Exception, e: @@ -703,7 +707,28 @@ def killall_jobs(cluster_name, key_file=default_key_file, done >& /dev/null || true'''.format(remote_control_dir=remote_control_dir) ]) - +def check_flintrock_installation(): + try: + call_ec2_script(['--help'], 1 , 1) + except: + setup = os.path.join(ec2_script_base_path(), 'setup.py') + if not os.path.exists(setup): + log.error(''' +Flintrock is missing (or the wrong version is being used). +Check if you have checked out the submodule. Try: + git submode update --init --recursive +Or checkout ignition with: + git clone --recursive .... +''') + else: + log.error(''' +Some dependencies are missing. 
For an Ubuntu system, try the following: +sudo apt-get install python3-yaml libyaml-dev +sudo python3 -m pip install -U pip packaging setuptools +cd {flintrock} +sudo pip3 -r requirements/user.pip + '''.format(flintrock=ec2_script_base_path())) + sys.exit(1) parser = ArghParser() @@ -712,4 +737,5 @@ def killall_jobs(cluster_name, key_file=default_key_file, kill_job, killall_jobs, collect_job_results], namespace="jobs") if __name__ == '__main__': + parser.dispatch() diff --git a/tools/flintrock b/tools/flintrock new file mode 160000 index 00000000..541697fb --- /dev/null +++ b/tools/flintrock @@ -0,0 +1 @@ +Subproject commit 541697fb11912df6298d588b845809966e94d280 diff --git a/tools/spark-ec2/README b/tools/spark-ec2/README deleted file mode 100644 index 72434f24..00000000 --- a/tools/spark-ec2/README +++ /dev/null @@ -1,4 +0,0 @@ -This folder contains a script, spark-ec2, for launching Spark clusters on -Amazon EC2. Usage instructions are available online at: - -http://spark.apache.org/docs/latest/ec2-scripts.html diff --git a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh deleted file mode 100644 index bd3b656f..00000000 --- a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# These variables are automatically filled in by the spark-ec2 script. -export MASTERS="{{master_list}}" -export SLAVES="{{slave_list}}" -export HDFS_DATA_DIRS="{{hdfs_data_dirs}}" -export MAPRED_LOCAL_DIRS="{{mapred_local_dirs}}" -export SPARK_LOCAL_DIRS="{{spark_local_dirs}}" -export MODULES="{{modules}}" -export SPARK_VERSION="{{spark_version}}" -export TACHYON_VERSION="{{tachyon_version}}" -export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" -export SWAP_MB="{{swap}}" -export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" -export SPARK_EXECUTOR_INSTANCES="{{spark_executor_instances}}" -export SPARK_MASTER_OPTS="{{spark_master_opts}}" -export AWS_ACCESS_KEY_ID="{{aws_access_key_id}}" -export AWS_SECRET_ACCESS_KEY="{{aws_secret_access_key}}" diff --git a/tools/spark-ec2/spark-ec2 b/tools/spark-ec2/spark-ec2 deleted file mode 100755 index 31f97712..00000000 --- a/tools/spark-ec2/spark-ec2 +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/sh - -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -cd "`dirname $0`" -PYTHONPATH="./third_party/boto-2.4.1.zip/boto-2.4.1:$PYTHONPATH" python ./spark_ec2.py "$@" diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py deleted file mode 100755 index 1b405d47..00000000 --- a/tools/spark-ec2/spark_ec2.py +++ /dev/null @@ -1,1593 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from __future__ import division, print_function, with_statement - -import codecs -import hashlib -import itertools -import logging -import os -import os.path -import pipes -import random -import shutil -import string -from stat import S_IRUSR -import subprocess -import sys -import tarfile -import tempfile -import textwrap -import time -import warnings -from datetime import datetime -from optparse import OptionParser -from sys import stderr - -if sys.version < "3": - from urllib2 import urlopen, Request, HTTPError -else: - from urllib.request import urlopen, Request - from urllib.error import HTTPError - raw_input = input - xrange = range - -SPARK_EC2_VERSION = "2.0.0" -SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) - -VALID_SPARK_VERSIONS = set([ - "0.7.3", - "0.8.0", - "0.8.1", - "0.9.0", - "0.9.1", - "0.9.2", - "1.0.0", - "1.0.1", - "1.0.2", - "1.1.0", - "1.1.1", - "1.2.0", - "1.2.1", - "1.3.0", - "1.3.1", - "1.4.0", - "1.4.1", - "1.5.0", - "1.5.1", - "1.5.2", - "1.6.0", - "2.0.0", - "2.0.1", - "2.0.2", -]) - -SPARK_TACHYON_MAP = { - "1.0.0": "0.4.1", - "1.0.1": "0.4.1", - "1.0.2": "0.4.1", - "1.1.0": "0.5.0", - "1.1.1": "0.5.0", - "1.2.0": "0.5.0", - "1.2.1": "0.5.0", - "1.3.0": "0.5.0", - "1.3.1": "0.5.0", - "1.4.0": "0.6.4", - "1.4.1": "0.6.4", - "1.5.0": "0.7.1", - "1.5.1": "0.7.1", - "1.5.2": "0.7.1", - "1.6.0": "0.8.2", - "2.0.0": "", -} - -DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION -DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark" - -# Default location to get the spark-ec2 scripts (and ami-list) from -DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/amplab/spark-ec2" -DEFAULT_SPARK_EC2_BRANCH = "branch-2.0" - - -import boto -from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType -from boto import ec2 - - -class UsageError(Exception): - pass - - -# Configure and parse our command-line arguments -def parse_args(): - parser = OptionParser( - prog="spark-ec2", - version="%prog 
{v}".format(v=SPARK_EC2_VERSION), - usage="%prog [options] \n\n" - + " can be: launch, destroy, login, stop, start, get-master, reboot-slaves") - - parser.add_option( - "-s", "--slaves", type="int", default=1, - help="Number of slaves to launch (default: %default)") - parser.add_option( - "-w", "--wait", type="int", - help="DEPRECATED (no longer necessary) - Seconds to wait for nodes to start") - parser.add_option( - "-k", "--key-pair", - help="Key pair to use on instances") - parser.add_option( - "-i", "--identity-file", - help="SSH private key file to use for logging into instances") - parser.add_option( - "-p", "--profile", default=None, - help="If you have multiple profiles (AWS or boto config), you can configure " + - "additional, named profiles by using this option (default: %default)") - parser.add_option( - "-t", "--instance-type", default="m1.large", - help="Type of instance to launch (default: %default). " + - "WARNING: must be 64-bit; small instances won't work") - parser.add_option( - "-m", "--master-instance-type", default="", - help="Master instance type (leave empty for same as instance-type)") - parser.add_option( - "-r", "--region", default="us-east-1", - help="EC2 region used to launch instances in, or to find them in (default: %default)") - parser.add_option( - "-z", "--zone", default="", - help="Availability zone to launch instances in, or 'all' to spread " + - "slaves across multiple (an additional $0.01/Gb for bandwidth" + - "between zones applies) (default: a single zone chosen at random)") - parser.add_option( - "-a", "--ami", - help="Amazon Machine Image ID to use") - parser.add_option("--master-ami", - help="Amazon Machine Image ID to use for the Master") - parser.add_option( - "-v", "--spark-version", default=DEFAULT_SPARK_VERSION, - help="Version of Spark to use: 'X.Y.Z' or a specific git hash (default: %default)") - parser.add_option( - "--spark-git-repo", - default=DEFAULT_SPARK_GITHUB_REPO, - help="Github repo from which to checkout 
supplied commit hash (default: %default)") - parser.add_option( - "--spark-ec2-git-repo", - default=DEFAULT_SPARK_EC2_GITHUB_REPO, - help="Github repo from which to checkout spark-ec2 (default: %default)") - parser.add_option( - "--spark-ec2-git-branch", - default=DEFAULT_SPARK_EC2_BRANCH, - help="Github repo branch of spark-ec2 to use (default: %default)") - parser.add_option( - "--deploy-root-dir", - default=None, - help="A directory to copy into / on the first master. " + - "Must be absolute. Note that a trailing slash is handled as per rsync: " + - "If you omit it, the last directory of the --deploy-root-dir path will be created " + - "in / before copying its contents. If you append the trailing slash, " + - "the directory is not created and its contents are copied directly into /. " + - "(default: %default).") - parser.add_option( - "--hadoop-major-version", default="1", - help="Major version of Hadoop. Valid options are 1 (Hadoop 1.0.4), 2 (CDH 4.7.1), yarn " + - "(Hadoop 2.4.0) (default: %default)") - parser.add_option( - "-D", metavar="[ADDRESS:]PORT", dest="proxy_port", - help="Use SSH dynamic port forwarding to create a SOCKS proxy at " + - "the given local address (for use with login)") - parser.add_option( - "--resume", action="store_true", default=False, - help="Resume installation on a previously launched cluster " + - "(for debugging)") - parser.add_option( - "--ebs-vol-size", metavar="SIZE", type="int", default=0, - help="Size (in GB) of each EBS volume.") - parser.add_option( - "--ebs-vol-type", default="standard", - help="EBS volume type (e.g. 'gp2', 'standard').") - parser.add_option( - "--ebs-vol-num", type="int", default=1, - help="Number of EBS volumes to attach to each node as /vol[x]. " + - "The volumes will be deleted when the instances terminate. " + - "Only possible on EBS-backed AMIs. " + - "EBS volumes are only attached if --ebs-vol-size > 0. 
" + - "Only support up to 8 EBS volumes.") - parser.add_option( - "--placement-group", type="string", default=None, - help="Which placement group to try and launch " + - "instances into. Assumes placement group is already " + - "created.") - parser.add_option( - "--swap", metavar="SWAP", type="int", default=1024, - help="Swap space to set up per node, in MB (default: %default)") - parser.add_option( - "--spot-price", metavar="PRICE", type="float", - help="If specified, launch slaves as spot instances with the given " + - "maximum price (in dollars)") - parser.add_option( - "--master-spot", action="store_true", default=False, - help="If specified, launch master as spot instance using the same " + - "bid and instance type of the slave ones") - parser.add_option( - "--ganglia", action="store_true", default=True, - help="Setup Ganglia monitoring on cluster (default: %default). NOTE: " + - "the Ganglia page will be publicly accessible") - parser.add_option( - "--no-ganglia", action="store_false", dest="ganglia", - help="Disable Ganglia monitoring for the cluster") - parser.add_option( - "-u", "--user", default="root", - help="The SSH user you want to connect as (default: %default)") - parser.add_option( - "--delete-groups", action="store_true", default=False, - help="When destroying a cluster, delete the security groups that were created") - parser.add_option( - "--use-existing-master", action="store_true", default=False, - help="Launch fresh slaves, but use an existing stopped master if possible") - parser.add_option( - "--worker-instances", type="int", default=1, - help="Number of instances per worker: variable SPARK_WORKER_INSTANCES. Not used if YARN " + - "is used as Hadoop major version (default: %default)") - parser.add_option( - "--executor-instances", type="int", default=1, - help="Number of executor instances per worker: variable SPARK_EXECUTOR_INSTANCES. 
Not used if YARN " + - "is used as Hadoop major version (default: %default)") - parser.add_option( - "--master-opts", type="string", default="", - help="Extra options to give to master through SPARK_MASTER_OPTS variable " + - "(e.g -Dspark.worker.timeout=180)") - parser.add_option( - "--user-data", type="string", default="", - help="Path to a user-data file (most AMIs interpret this as an initialization script)") - parser.add_option( - "--security-group-prefix", type="string", default=None, - help="Use this prefix for the security group rather than the cluster name.") - parser.add_option( - "--authorized-address", type="string", default="0.0.0.0/0", - help="Address to authorize on created security groups (default: %default)") - parser.add_option( - "--additional-security-group", type="string", default="", - help="Additional security group to place the machines in") - parser.add_option( - "--additional-tags", type="string", default="", - help="Additional tags to set on the machines; tags are comma-separated, while name and " + - "value are colon separated; ex: \"Task:MySparkProject,Env:production\"") - parser.add_option( - "--copy-aws-credentials", action="store_true", default=False, - help="Add AWS credentials to hadoop configuration to allow Spark to access S3") - parser.add_option( - "--subnet-id", default=None, - help="VPC subnet to launch instances in") - parser.add_option( - "--vpc-id", default=None, - help="VPC to launch instances in") - parser.add_option( - "--spot-timeout", type="int", default=45, - help="Maximum amount of time (in minutes) to wait for spot requests to be fulfilled") - parser.add_option( - "--private-ips", action="store_true", default=False, - help="Use private IPs for instances rather than public if VPC/subnet " + - "requires that.") - parser.add_option( - "--instance-initiated-shutdown-behavior", default="stop", - choices=["stop", "terminate"], - help="Whether instances should terminate when shut down or just stop") - parser.add_option( - 
"--instance-profile-name", default=None, - help="IAM profile name to launch instances under") - - (opts, args) = parser.parse_args() - if len(args) != 2: - parser.print_help() - sys.exit(1) - (action, cluster_name) = args - - # Boto config check - # http://boto.cloudhackers.com/en/latest/boto_config_tut.html - home_dir = os.getenv('HOME') - if home_dir is None or not os.path.isfile(home_dir + '/.boto'): - if not os.path.isfile('/etc/boto.cfg'): - # If there is no boto config, check aws credentials - if not os.path.isfile(home_dir + '/.aws/credentials'): - if os.getenv('AWS_ACCESS_KEY_ID') is None: - print("ERROR: The environment variable AWS_ACCESS_KEY_ID must be set", - file=stderr) - sys.exit(1) - if os.getenv('AWS_SECRET_ACCESS_KEY') is None: - print("ERROR: The environment variable AWS_SECRET_ACCESS_KEY must be set", - file=stderr) - sys.exit(1) - return (opts, action, cluster_name) - - -# Get the EC2 security group of the given name, creating it if it doesn't exist -def get_or_make_group(conn, name, vpc_id): - groups = conn.get_all_security_groups() - group = [g for g in groups if g.name == name] - if len(group) > 0: - return group[0] - else: - print("Creating security group " + name) - return conn.create_security_group(name, "Spark EC2 group", vpc_id) - -def check_if_http_resource_exists(resource): - request = Request(resource) - request.get_method = lambda: 'HEAD' - try: - response = urlopen(request) - if response.getcode() == 200: - return True - else: - raise RuntimeError("Resource {resource} not found. Error: {code}".format(resource, response.getcode())) - except HTTPError, e: - print >> stderr, "Unable to check if HTTP resource {url} exists. 
Error: {code}".format( - url=resource, - code=e.code) - return False - -def get_validate_spark_version(version, repo): - if version.startswith("http"): - #check if custom package URL exists - if check_if_http_resource_exists: - return version - else: - print("Unable to validate pre-built spark version {version}".format(version=version), file=stderr) - sys.exit(1) - elif "." in version: - version = version.replace("v", "") - if version not in VALID_SPARK_VERSIONS: - print("Don't know about Spark version: {v}".format(v=version), file=stderr) - sys.exit(1) - return version - else: - github_commit_url = "{repo}/commit/{commit_hash}".format(repo=repo, commit_hash=version) - if not check_if_http_resource_exists(github_commit_url): - print >> stderr, "Couldn't validate Spark commit: {repo} / {commit}".format( - repo=repo, commit=version) - sys.exit(1) - else: - return version - - -# Source: http://aws.amazon.com/amazon-linux-ami/instance-type-matrix/ -# Last Updated: 2015-06-19 -# For easy maintainability, please keep this manually-inputted dictionary sorted by key. 
-EC2_INSTANCE_TYPES = { - "c1.medium": "pvm", - "c1.xlarge": "pvm", - "c3.large": "hvm", - "c3.xlarge": "hvm", - "c3.2xlarge": "hvm", - "c3.4xlarge": "hvm", - "c3.8xlarge": "hvm", - "c4.large": "hvm", - "c4.xlarge": "hvm", - "c4.2xlarge": "hvm", - "c4.4xlarge": "hvm", - "c4.8xlarge": "hvm", - "cc1.4xlarge": "hvm", - "cc2.8xlarge": "hvm", - "cg1.4xlarge": "hvm", - "cr1.8xlarge": "hvm", - "d2.xlarge": "hvm", - "d2.2xlarge": "hvm", - "d2.4xlarge": "hvm", - "d2.8xlarge": "hvm", - "g2.2xlarge": "hvm", - "g2.8xlarge": "hvm", - "hi1.4xlarge": "pvm", - "hs1.8xlarge": "pvm", - "i2.xlarge": "hvm", - "i2.2xlarge": "hvm", - "i2.4xlarge": "hvm", - "i2.8xlarge": "hvm", - "m1.small": "pvm", - "m1.medium": "pvm", - "m1.large": "pvm", - "m1.xlarge": "pvm", - "m2.xlarge": "pvm", - "m2.2xlarge": "pvm", - "m2.4xlarge": "pvm", - "m3.medium": "hvm", - "m3.large": "hvm", - "m3.xlarge": "hvm", - "m3.2xlarge": "hvm", - "m4.large": "hvm", - "m4.xlarge": "hvm", - "m4.2xlarge": "hvm", - "m4.4xlarge": "hvm", - "m4.10xlarge": "hvm", - "r3.large": "hvm", - "r3.xlarge": "hvm", - "r3.2xlarge": "hvm", - "r3.4xlarge": "hvm", - "r3.8xlarge": "hvm", - "t1.micro": "pvm", - "t2.micro": "hvm", - "t2.small": "hvm", - "t2.medium": "hvm", - "t2.large": "hvm", - "x1.16xlarge": "hvm", - "x1.32xlarge": "hvm", -} - - -def get_tachyon_version(spark_version): - return SPARK_TACHYON_MAP.get(spark_version, "") - -# Attempt to resolve an appropriate AMI given the architecture and region of the request. 
-def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branch): - if instance_type in EC2_INSTANCE_TYPES: - instance_type = EC2_INSTANCE_TYPES[instance_type] - else: - instance_type = "pvm" - print("Don't recognize %s, assuming type is pvm" % instance_type, file=stderr) - - # URL prefix from which to fetch AMI information - ami_prefix = "{r}/{b}/ami-list".format( - r=spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), - b=spark_ec2_git_branch) - - ami_path = "%s/%s/%s" % (ami_prefix, region, instance_type) - reader = codecs.getreader("ascii") - try: - ami = reader(urlopen(ami_path)).read().strip() - except: - print("Could not resolve AMI at: " + ami_path, file=stderr) - sys.exit(1) - - print("Spark AMI: " + ami) - return ami - - -# Launch a cluster of the given name, by setting up its security groups, -# and then starting new instances in them. -# Returns a tuple of EC2 reservation objects for the master and slaves -# Fails if there already instances running in the cluster's groups. 
-def launch_cluster(conn, opts, cluster_name): - if opts.identity_file is None: - print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr) - sys.exit(1) - - if opts.key_pair is None: - print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr) - sys.exit(1) - - user_data_content = None - if opts.user_data: - with open(opts.user_data) as user_data_file: - user_data_content = user_data_file.read() - - print("Setting up security groups...") - if opts.security_group_prefix is None: - master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id) - slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id) - else: - master_group = get_or_make_group(conn, opts.security_group_prefix + "-master", opts.vpc_id) - slave_group = get_or_make_group(conn, opts.security_group_prefix + "-slaves", opts.vpc_id) - - authorized_address = opts.authorized_address - if master_group.rules == []: # Group was just now created - if opts.vpc_id is None: - master_group.authorize(src_group=master_group) - master_group.authorize(src_group=slave_group) - else: - master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, - src_group=master_group) - master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, - src_group=master_group) - master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, - src_group=master_group) - master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, - src_group=slave_group) - master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, - src_group=slave_group) - master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, - src_group=slave_group) - master_group.authorize('tcp', 22, 22, authorized_address) - master_group.authorize('tcp', 8080, 8081, authorized_address) - master_group.authorize('tcp', 18080, 18080, authorized_address) - master_group.authorize('tcp', 19999, 19999, authorized_address) - 
master_group.authorize('tcp', 50030, 50030, authorized_address) - master_group.authorize('tcp', 50070, 50070, authorized_address) - master_group.authorize('tcp', 60070, 60070, authorized_address) - master_group.authorize('tcp', 4040, 4045, authorized_address) - # Rstudio (GUI for R) needs port 8787 for web access - master_group.authorize('tcp', 8787, 8787, authorized_address) - # HDFS NFS gateway requires 111,2049,4242 for tcp & udp - master_group.authorize('tcp', 111, 111, authorized_address) - master_group.authorize('udp', 111, 111, authorized_address) - master_group.authorize('tcp', 2049, 2049, authorized_address) - master_group.authorize('udp', 2049, 2049, authorized_address) - master_group.authorize('tcp', 4242, 4242, authorized_address) - master_group.authorize('udp', 4242, 4242, authorized_address) - # RM in YARN mode uses 8088 - master_group.authorize('tcp', 8088, 8088, authorized_address) - if opts.ganglia: - master_group.authorize('tcp', 5080, 5080, authorized_address) - if slave_group.rules == []: # Group was just now created - if opts.vpc_id is None: - slave_group.authorize(src_group=master_group) - slave_group.authorize(src_group=slave_group) - else: - slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, - src_group=master_group) - slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, - src_group=master_group) - slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, - src_group=master_group) - slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, - src_group=slave_group) - slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, - src_group=slave_group) - slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, - src_group=slave_group) - slave_group.authorize('tcp', 22, 22, authorized_address) - slave_group.authorize('tcp', 8080, 8081, authorized_address) - slave_group.authorize('tcp', 50060, 50060, authorized_address) - slave_group.authorize('tcp', 50075, 50075, 
authorized_address) - slave_group.authorize('tcp', 60060, 60060, authorized_address) - slave_group.authorize('tcp', 60075, 60075, authorized_address) - - # Check if instances are already running in our groups - existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, - die_on_error=False) - if existing_slaves or (existing_masters and not opts.use_existing_master): - print("ERROR: There are already instances running in group %s or %s" % - (master_group.name, slave_group.name), file=stderr) - sys.exit(1) - - # Figure out Spark AMI - if opts.ami is None: - opts.ami = get_spark_ami(opts.instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) - - if opts.master_ami is None: - opts.master_ami = get_spark_ami(opts.master_instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) - - # we use group ids to work around https://github.com/boto/boto/issues/350 - additional_group_ids = [] - if opts.additional_security_group: - additional_group_ids = [sg.id - for sg in conn.get_all_security_groups() - if opts.additional_security_group in (sg.name, sg.id)] - print("Launching instances...") - - try: - image = conn.get_all_images(image_ids=[opts.ami])[0] - except: - print("Could not find AMI " + opts.ami, file=stderr) - sys.exit(1) - - try: - master_image = conn.get_all_images(image_ids=[opts.master_ami])[0] - except: - print >> stderr, "Could not find AMI " + opts.master_ami - sys.exit(1) - - # Create block device mapping so that we can add EBS volumes if asked to. - # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... 
/dev/sdz - block_map = BlockDeviceMapping() - if opts.ebs_vol_size > 0: - for i in range(opts.ebs_vol_num): - device = EBSBlockDeviceType() - device.size = opts.ebs_vol_size - device.volume_type = opts.ebs_vol_type - device.delete_on_termination = True - block_map["/dev/sd" + chr(ord('s') + i)] = device - - for i in range(get_num_disks(opts.instance_type)): - dev = BlockDeviceType() - dev.ephemeral_name = 'ephemeral%d' % i - name = '/dev/xvd' + string.letters[i + 1] - block_map[name] = dev - # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342). - #if opts.instance_type.startswith('m3.'): - # for i in range(get_num_disks(opts.instance_type)): - # dev = BlockDeviceType() - # dev.ephemeral_name = 'ephemeral%d' % i - # # The first ephemeral drive is /dev/sdb. - # name = '/dev/sd' + string.ascii_letters[i + 1] - # block_map[name] = dev - - # Launch slaves - if opts.spot_price is not None: - # Launch spot instances with the requested price - print("Requesting %d slaves as spot instances with price $%.3f" % - (opts.slaves, opts.spot_price)) - zones = get_zones(conn, opts) - num_zones = len(zones) - i = 0 - my_req_ids = [] - for zone in zones: - num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) - slave_reqs = conn.request_spot_instances( - price=opts.spot_price, - image_id=opts.ami, - launch_group="launch-group-%s" % cluster_name, - placement=zone, - count=num_slaves_this_zone, - key_name=opts.key_pair, - security_group_ids=[slave_group.id] + additional_group_ids, - instance_type=opts.instance_type, - block_device_map=block_map, - subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content, - instance_profile_name=opts.instance_profile_name) - my_req_ids += [req.id for req in slave_reqs] - i += 1 - - start_time = datetime.now() - print("Waiting for spot instances to be granted... 
Request IDs: %s " % my_req_ids) - try: - while True: - time.sleep(10) - reqs = conn.get_all_spot_instance_requests(my_req_ids) - active_instance_ids = filter(lambda req: req.state == "active", reqs) - invalid_states = ["capacity-not-available", "capacity-oversubscribed", "price-too-low"] - invalid = filter(lambda req: req.status.code in invalid_states, reqs) - if len(invalid) > 0: - raise Exception("Invalid state for spot request: %s - status: %s" % - (invalid[0].id, invalid[0].status.message)) - if len(active_instance_ids) == opts.slaves: - print("All %d slaves granted" % opts.slaves) - reservations = conn.get_all_reservations([r.instance_id for r in active_instance_ids]) - slave_nodes = [] - for r in reservations: - slave_nodes += r.instances - break - else: - print("%d of %d slaves granted, waiting longer" % ( - len(active_instance_ids), opts.slaves)) - - if (datetime.now() - start_time).seconds > opts.spot_timeout * 60: - raise Exception("Timed out while waiting for spot instances") - except: - print("Error: %s" % sys.exc_info()[1]) - print("Canceling spot instance requests") - conn.cancel_spot_instance_requests(my_req_ids) - # Log a warning if any of these requests actually launched instances: - (master_nodes, slave_nodes) = get_existing_cluster( - conn, opts, cluster_name, die_on_error=False) - running = len(master_nodes) + len(slave_nodes) - if running: - print(("WARNING: %d instances are still running" % running), file=stderr) - sys.exit(0) - else: - # Launch non-spot instances - zones = get_zones(conn, opts) - num_zones = len(zones) - i = 0 - slave_nodes = [] - for zone in zones: - num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) - if num_slaves_this_zone > 0: - slave_res = image.run( - key_name=opts.key_pair, - security_group_ids=[slave_group.id] + additional_group_ids, - instance_type=opts.instance_type, - placement=zone, - min_count=num_slaves_this_zone, - max_count=num_slaves_this_zone, - block_device_map=block_map, - 
subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content, - instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, - instance_profile_name=opts.instance_profile_name) - slave_nodes += slave_res.instances - print("Launched {s} slave{plural_s} in {z}, regid = {r}".format( - s=num_slaves_this_zone, - plural_s=('' if num_slaves_this_zone == 1 else 's'), - z=zone, - r=slave_res.id)) - i += 1 - - # Launch or resume masters - if existing_masters: - print("Starting master...") - for inst in existing_masters: - if inst.state not in ["shutting-down", "terminated"]: - inst.start() - master_nodes = existing_masters - else: - master_type = opts.master_instance_type - if master_type == "" or opts.master_spot: - master_type = opts.instance_type - if opts.zone == 'all': - opts.zone = random.choice(conn.get_all_zones()).name - if opts.master_spot: - # Launch spot master instance with the requested price - # Note: The spot_price*1.5 is present to ensure a higher bid price to - # the master spot instance, so the master instance will be the - # last one to be terminated in a spot market price increase - print("Requesting master as spot instance with price $%.3f" % - (opts.spot_price)) - master_req = conn.request_spot_instances( - price=(opts.spot_price * 1.5), - image_id=opts.master_ami, - placement=opts.zone, - count=1, - key_name=opts.key_pair, - security_group_ids=[master_group.id] + additional_group_ids, - instance_type=master_type, - block_device_map=block_map, - subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content, - instance_profile_name=opts.instance_profile_name) - my_master_req_id = [req.id for req in master_req] - - # TODO: refactor duplicated spot waiting code - start_time = datetime.now() - print("Waiting for master spot instance to be granted... 
Request ID: %s " % my_master_req_id) - try: - while True: - time.sleep(10) - reqs = conn.get_all_spot_instance_requests(my_master_req_id) - active_instance_ids = filter(lambda req: req.state == "active", reqs) - invalid_states = ["capacity-not-available", "capacity-oversubscribed", "price-too-low"] - invalid = filter(lambda req: req.status.code in invalid_states, reqs) - if len(invalid) > 0: - raise Exception("Invalid state for spot request: %s - status: %s" % - (invalid[0].id, invalid[0].status.message)) - if len(active_instance_ids) == 1: - print("Master spot instance granted") - master_res = conn.get_all_reservations([r.instance_id for r in active_instance_ids]) - master_nodes = master_res[0].instances - break - else: - print("Master spot instance not granted yet, waiting longer") - - if (datetime.now() - start_time).seconds > opts.spot_timeout * 60: - raise Exception("Timed out while waiting for master spot instance") - except: - print("Error: %s" % sys.exc_info()[1]) - print("Canceling master spot instance requests") - conn.cancel_spot_instance_requests(my_master_req_id) - # Log a warning if any of these requests actually launched instances: - (master_nodes, slave_nodes) = get_existing_cluster( - conn, opts, cluster_name, die_on_error=False) - running = len(master_nodes) + len(slave_nodes) - if running: - print(("WARNING: %d instances are still running" % running), file=stderr) - sys.exit(0) - else: - # Launch ondemand instance - master_res = master_image.run( - key_name=opts.key_pair, - security_group_ids=[master_group.id] + additional_group_ids, - instance_type=master_type, - placement=opts.zone, - min_count=1, - max_count=1, - block_device_map=block_map, - subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content, - instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, - instance_profile_name=opts.instance_profile_name) - - master_nodes = master_res.instances - print("Launched master in %s, 
regid = %s" % (zone, master_res.id)) - - # This wait time corresponds to SPARK-4983 - print("Waiting for AWS to propagate instance metadata...") - time.sleep(15) - - # Give the instances descriptive names and set additional tags - additional_tags = {} - if opts.additional_tags.strip(): - additional_tags = dict( - map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',') - ) - - for master in master_nodes: - master.add_tags( - dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) - ) - - for slave in slave_nodes: - slave.add_tags( - dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) - ) - - # Return all the instances - return (master_nodes, slave_nodes) - - -def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): - """ - Get the EC2 instances in an existing cluster if available. - Returns a tuple of lists of EC2 instance objects for the masters and slaves. - """ - print("Searching for existing cluster {c} in region {r}...".format( - c=cluster_name, r=opts.region)) - - def get_instances(group_names): - """ - Get all non-terminated instances that belong to any of the provided security groups. 
- - EC2 reservation filters and instance states are documented here: - http://docs.aws.amazon.com/cli/latest/reference/ec2/describe-instances.html#options - """ - reservations = conn.get_all_reservations( - filters={"instance.group-name": group_names}) - instances = itertools.chain.from_iterable(r.instances for r in reservations) - return [i for i in instances if i.state not in ["shutting-down", "terminated"]] - - master_instances = get_instances([cluster_name + "-master"]) - slave_instances = get_instances([cluster_name + "-slaves"]) - - if any((master_instances, slave_instances)): - print("Found {m} master{plural_m}, {s} slave{plural_s}.".format( - m=len(master_instances), - plural_m=('' if len(master_instances) == 1 else 's'), - s=len(slave_instances), - plural_s=('' if len(slave_instances) == 1 else 's'))) - - if not master_instances and die_on_error: - print("ERROR: Could not find a master for cluster {c} in region {r}.".format( - c=cluster_name, r=opts.region), file=sys.stderr) - sys.exit(1) - - return (master_instances, slave_instances) - - -# Deploy configuration files and run setup scripts on a newly launched -# or started EC2 cluster. 
-def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): - master = get_dns_name(master_nodes[0], opts.private_ips) - if deploy_ssh_key: - print("Generating cluster's SSH key on master...") - key_setup = """ - [ -f ~/.ssh/id_rsa ] || - (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa && - cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys) - """ - ssh(master, opts, key_setup) - dot_ssh_tar = ssh_read(master, opts, ['tar', 'c', '.ssh']) - print("Transferring cluster's SSH key to slaves...") - for slave in slave_nodes: - slave_address = get_dns_name(slave, opts.private_ips) - print(slave_address) - ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar) - - modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', - 'mapreduce', 'spark-standalone', 'tachyon', 'rstudio'] - - if opts.hadoop_major_version == "1": - modules = list(filter(lambda x: x != "mapreduce", modules)) - - if opts.ganglia: - modules.append('ganglia') - - # Clear SPARK_WORKER_INSTANCES if running on YARN - if opts.hadoop_major_version == "yarn": - opts.worker_instances = "" - - # NOTE: We should clone the repository before running deploy_files to - # prevent ec2-variables.sh from being overwritten - print("Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format( - r=opts.spark_ec2_git_repo, b=opts.spark_ec2_git_branch)) - ssh( - host=master, - opts=opts, - command="rm -rf spark-ec2" - + " && " - + "git clone {r} -b {b} spark-ec2".format(r=opts.spark_ec2_git_repo, - b=opts.spark_ec2_git_branch) - ) - - print("Deploying files to master...") - deploy_files( - conn=conn, - root_dir=SPARK_EC2_DIR + "/" + "deploy.generic", - opts=opts, - master_nodes=master_nodes, - slave_nodes=slave_nodes, - modules=modules - ) - - if opts.deploy_root_dir is not None: - print("Deploying {s} to master...".format(s=opts.deploy_root_dir)) - deploy_user_files( - root_dir=opts.deploy_root_dir, - opts=opts, - master_nodes=master_nodes - ) - - print("Running setup on master...") - 
setup_spark_cluster(master, opts) - print("Done!") - - -def setup_spark_cluster(master, opts): - ssh(master, opts, "chmod u+x spark-ec2/setup.sh") - ssh(master, opts, "spark-ec2/setup.sh") - print("Spark standalone cluster started at http://%s:8080" % master) - - if opts.ganglia: - print("Ganglia started at http://%s:5080/ganglia" % master) - - -def is_ssh_available(host, opts, print_ssh_output=True): - """ - Check if SSH is available on a host. - """ - s = subprocess.Popen( - ssh_command(opts) + ['-t', '-t', '-o', 'ConnectTimeout=3', - '%s@%s' % (opts.user, host), stringify_command('true')], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT # we pipe stderr through stdout to preserve output order - ) - cmd_output = s.communicate()[0] # [1] is stderr, which we redirected to stdout - - if s.returncode != 0 and print_ssh_output: - # extra leading newline is for spacing in wait_for_cluster_state() - print(textwrap.dedent("""\n - Warning: SSH connection error. (This could be temporary.) - Host: {h} - SSH return code: {r} - SSH output: {o} - """).format( - h=host, - r=s.returncode, - o=cmd_output.strip() - )) - - return s.returncode == 0 - - -def is_cluster_ssh_available(cluster_instances, opts): - """ - Check if SSH is available on all the instances in a cluster. - """ - for i in cluster_instances: - dns_name = get_dns_name(i, opts.private_ips) - if not is_ssh_available(host=dns_name, opts=opts): - return False - else: - return True - - -def wait_for_cluster_state(conn, opts, cluster_instances, cluster_state): - """ - Wait for all the instances in the cluster to reach a designated state. - - cluster_instances: a list of boto.ec2.instance.Instance - cluster_state: a string representing the desired state of all the instances in the cluster - value can be 'ssh-ready' or a valid value from boto.ec2.instance.InstanceState such as - 'running', 'terminated', etc. 
- (would be nice to replace this with a proper enum: http://stackoverflow.com/a/1695250) - """ - sys.stdout.write( - "Waiting for cluster to enter '{s}' state.".format(s=cluster_state) - ) - sys.stdout.flush() - - start_time = datetime.now() - num_attempts = 0 - - while True: - time.sleep(5 * num_attempts) # seconds - - for i in cluster_instances: - i.update() - - max_batch = 100 - statuses = [] - for j in xrange(0, len(cluster_instances), max_batch): - batch = [i.id for i in cluster_instances[j:j + max_batch]] - statuses.extend(conn.get_all_instance_status(instance_ids=batch)) - - if cluster_state == 'ssh-ready': - if all(i.state == 'running' for i in cluster_instances) and \ - all(s.system_status.status == 'ok' for s in statuses) and \ - all(s.instance_status.status == 'ok' for s in statuses) and \ - is_cluster_ssh_available(cluster_instances, opts): - break - else: - if all(i.state == cluster_state for i in cluster_instances): - break - - num_attempts += 1 - - sys.stdout.write(".") - sys.stdout.flush() - - sys.stdout.write("\n") - - end_time = datetime.now() - print("Cluster is now in '{s}' state. Waited {t} seconds.".format( - s=cluster_state, - t=(end_time - start_time).seconds - )) - - -# Get number of local disks available for a given EC2 instance type. -def get_num_disks(instance_type): - # Source: http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html - # Last Updated: 2015-06-19 - # For easy maintainability, please keep this manually-inputted dictionary sorted by key. 
- disks_by_instance = { - "c1.medium": 1, - "c1.xlarge": 4, - "c3.large": 2, - "c3.xlarge": 2, - "c3.2xlarge": 2, - "c3.4xlarge": 2, - "c3.8xlarge": 2, - "c4.large": 0, - "c4.xlarge": 0, - "c4.2xlarge": 0, - "c4.4xlarge": 0, - "c4.8xlarge": 0, - "cc1.4xlarge": 2, - "cc2.8xlarge": 4, - "cg1.4xlarge": 2, - "cr1.8xlarge": 2, - "d2.xlarge": 3, - "d2.2xlarge": 6, - "d2.4xlarge": 12, - "d2.8xlarge": 24, - "g2.2xlarge": 1, - "g2.8xlarge": 2, - "hi1.4xlarge": 2, - "hs1.8xlarge": 24, - "i2.xlarge": 1, - "i2.2xlarge": 2, - "i2.4xlarge": 4, - "i2.8xlarge": 8, - "m1.small": 1, - "m1.medium": 1, - "m1.large": 2, - "m1.xlarge": 4, - "m2.xlarge": 1, - "m2.2xlarge": 1, - "m2.4xlarge": 2, - "m3.medium": 1, - "m3.large": 1, - "m3.xlarge": 2, - "m3.2xlarge": 2, - "m4.large": 0, - "m4.xlarge": 0, - "m4.2xlarge": 0, - "m4.4xlarge": 0, - "m4.10xlarge": 0, - "r3.large": 1, - "r3.xlarge": 1, - "r3.2xlarge": 1, - "r3.4xlarge": 1, - "r3.8xlarge": 2, - "t1.micro": 0, - "t2.micro": 0, - "t2.small": 0, - "t2.medium": 0, - "t2.large": 0, - "x1.16xlarge": 1, - "x1.32xlarge": 2, - - } - if instance_type in disks_by_instance: - return disks_by_instance[instance_type] - else: - print("WARNING: Don't know number of disks on instance type %s; assuming 1" - % instance_type, file=stderr) - return 1 - - -# Deploy the configuration file templates in a given local directory to -# a cluster, filling in any template parameters with information about the -# cluster (e.g. lists of masters and slaves). Files are only deployed to -# the first master instance in the cluster, and we expect the setup -# script to be run on that instance to copy them to other nodes. -# -# root_dir should be an absolute path to the directory with the files we want to deploy. 
-def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): - active_master = get_dns_name(master_nodes[0], opts.private_ips) - - num_disks = get_num_disks(opts.instance_type) - hdfs_data_dirs = "/mnt/ephemeral-hdfs/data" - mapred_local_dirs = "/mnt/hadoop/mrlocal" - spark_local_dirs = "/mnt/spark" - if num_disks > 1: - for i in range(2, num_disks + 1): - hdfs_data_dirs += ",/mnt%d/ephemeral-hdfs/data" % i - mapred_local_dirs += ",/mnt%d/hadoop/mrlocal" % i - spark_local_dirs += ",/mnt%d/spark" % i - - cluster_url = "%s:7077" % active_master - - if opts.spark_version.startswith("http"): - # Custom pre-built spark package - spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) - tachyon_v = "" - print("Deploying Spark via custom bunlde; Tachyon won't be set up") - modules = filter(lambda x: x != "tachyon", modules) - elif "." in opts.spark_version: - # Pre-built Spark deploy - spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) - tachyon_v = get_tachyon_version(spark_v) - else: - # Spark-only custom deploy - spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version) - tachyon_v = "" - print("Deploying Spark via git hash; Tachyon won't be set up") - modules = filter(lambda x: x != "tachyon", modules) - - master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes] - slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes] - worker_instances_str = "%d" % opts.worker_instances if opts.worker_instances else "" - executor_instances_str = "%d" % opts.executor_instances if opts.executor_instances else "" - template_vars = { - "master_list": '\n'.join(master_addresses), - "active_master": active_master, - "slave_list": '\n'.join(slave_addresses), - "cluster_url": cluster_url, - "hdfs_data_dirs": hdfs_data_dirs, - "mapred_local_dirs": mapred_local_dirs, - "spark_local_dirs": spark_local_dirs, - "swap": str(opts.swap), - "modules": '\n'.join(modules), - "spark_version": 
spark_v, - "tachyon_version": tachyon_v, - "hadoop_major_version": opts.hadoop_major_version, - "spark_worker_instances": worker_instances_str, - "spark_executor_instances": executor_instances_str, - "spark_master_opts": opts.master_opts - } - - if opts.copy_aws_credentials: - template_vars["aws_access_key_id"] = conn.aws_access_key_id - template_vars["aws_secret_access_key"] = conn.aws_secret_access_key - else: - template_vars["aws_access_key_id"] = "" - template_vars["aws_secret_access_key"] = "" - - # Create a temp directory in which we will place all the files to be - # deployed after we substitue template parameters in them - tmp_dir = tempfile.mkdtemp() - for path, dirs, files in os.walk(root_dir): - if path.find(".svn") == -1: - dest_dir = os.path.join('/', path[len(root_dir):]) - local_dir = tmp_dir + dest_dir - if not os.path.exists(local_dir): - os.makedirs(local_dir) - for filename in files: - if filename[0] not in '#.~' and filename[-1] != '~': - dest_file = os.path.join(dest_dir, filename) - local_file = tmp_dir + dest_file - with open(os.path.join(path, filename)) as src: - with open(local_file, "w") as dest: - text = src.read() - for key in template_vars: - text = text.replace("{{" + key + "}}", template_vars[key]) - dest.write(text) - dest.close() - # rsync the whole directory over to the master machine - command = [ - 'rsync', '-rv', - '-e', stringify_command(ssh_command(opts)), - "%s/" % tmp_dir, - "%s@%s:/" % (opts.user, active_master) - ] - subprocess.check_call(command) - # Remove the temp directory we created above - shutil.rmtree(tmp_dir) - - -# Deploy a given local directory to a cluster, WITHOUT parameter substitution. -# Note that unlike deploy_files, this works for binary files. -# Also, it is up to the user to add (or not) the trailing slash in root_dir. -# Files are only deployed to the first master instance in the cluster. -# -# root_dir should be an absolute path. 
-def deploy_user_files(root_dir, opts, master_nodes): - active_master = get_dns_name(master_nodes[0], opts.private_ips) - command = [ - 'rsync', '-rv', - '-e', stringify_command(ssh_command(opts)), - "%s" % root_dir, - "%s@%s:/" % (opts.user, active_master) - ] - subprocess.check_call(command) - - -def stringify_command(parts): - if isinstance(parts, str): - return parts - else: - return ' '.join(map(pipes.quote, parts)) - - -def ssh_args(opts): - parts = ['-o', 'StrictHostKeyChecking=no'] - parts += ['-o', 'UserKnownHostsFile=/dev/null'] - if opts.identity_file is not None: - parts += ['-i', opts.identity_file] - return parts - - -def ssh_command(opts): - return ['ssh'] + ssh_args(opts) - - -# Run a command on a host through ssh, retrying up to five times -# and then throwing an exception if ssh continues to fail. -def ssh(host, opts, command): - tries = 0 - while True: - try: - return subprocess.check_call( - ssh_command(opts) + ['-t', '-t', '%s@%s' % (opts.user, host), - stringify_command(command)]) - except subprocess.CalledProcessError as e: - if tries > 5: - # If this was an ssh failure, provide the user with hints. 
- if e.returncode == 255: - raise UsageError( - "Failed to SSH to remote host {0}.\n" - "Please check that you have provided the correct --identity-file and " - "--key-pair parameters and try again.".format(host)) - else: - raise e - print("Error executing remote command, retrying after 30 seconds: {0}".format(e), - file=stderr) - time.sleep(30) - tries = tries + 1 - - -# Backported from Python 2.7 for compatiblity with 2.6 (See SPARK-1990) -def _check_output(*popenargs, **kwargs): - if 'stdout' in kwargs: - raise ValueError('stdout argument not allowed, it will be overridden.') - process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) - output, unused_err = process.communicate() - retcode = process.poll() - if retcode: - cmd = kwargs.get("args") - if cmd is None: - cmd = popenargs[0] - raise subprocess.CalledProcessError(retcode, cmd, output=output) - return output - - -def ssh_read(host, opts, command): - return _check_output( - ssh_command(opts) + ['%s@%s' % (opts.user, host), stringify_command(command)]) - - -def ssh_write(host, opts, command, arguments): - tries = 0 - while True: - proc = subprocess.Popen( - ssh_command(opts) + ['%s@%s' % (opts.user, host), stringify_command(command)], - stdin=subprocess.PIPE) - proc.stdin.write(arguments) - proc.stdin.close() - status = proc.wait() - if status == 0: - break - elif tries > 5: - raise RuntimeError("ssh_write failed with error %s" % proc.returncode) - else: - print("Error {0} while executing remote command, retrying after 30 seconds". 
- format(status), file=stderr) - time.sleep(30) - tries = tries + 1 - - -# Gets a list of zones to launch instances in -def get_zones(conn, opts): - if opts.zone == 'all': - zones = [z.name for z in conn.get_all_zones()] - else: - zones = [opts.zone] - return zones - - -# Gets the number of items in a partition -def get_partition(total, num_partitions, current_partitions): - num_slaves_this_zone = total // num_partitions - if (total % num_partitions) - current_partitions > 0: - num_slaves_this_zone += 1 - return num_slaves_this_zone - - -# Gets the IP address, taking into account the --private-ips flag -def get_ip_address(instance, private_ips=False): - ip = instance.ip_address if not private_ips else \ - instance.private_ip_address - return ip - - -# Gets the DNS name, taking into account the --private-ips flag -def get_dns_name(instance, private_ips=False): - dns = instance.public_dns_name if not private_ips else \ - instance.private_ip_address - if not dns: - raise UsageError("Failed to determine hostname of {0}.\n" - "Please check that you provided --private-ips if " - "necessary".format(instance)) - return dns - - -def real_main(): - (opts, action, cluster_name) = parse_args() - - # Input parameter validation - get_validate_spark_version(opts.spark_version, opts.spark_git_repo) - - if opts.wait is not None: - # NOTE: DeprecationWarnings are silent in 2.7+ by default. - # To show them, run Python with the -Wdefault switch. - # See: https://docs.python.org/3.5/whatsnew/2.7.html - warnings.warn( - "This option is deprecated and has no effect. 
" - "spark-ec2 automatically waits as long as necessary for clusters to start up.", - DeprecationWarning - ) - - if opts.identity_file is not None: - if not os.path.exists(opts.identity_file): - print("ERROR: The identity file '{f}' doesn't exist.".format(f=opts.identity_file), - file=stderr) - sys.exit(1) - - file_mode = os.stat(opts.identity_file).st_mode - if not (file_mode & S_IRUSR) or not oct(file_mode)[-2:] == '00': - print("ERROR: The identity file must be accessible only by you.", file=stderr) - print('You can fix this with: chmod 400 "{f}"'.format(f=opts.identity_file), - file=stderr) - sys.exit(1) - - if opts.instance_type not in EC2_INSTANCE_TYPES: - print("Warning: Unrecognized EC2 instance type for instance-type: {t}".format( - t=opts.instance_type), file=stderr) - - if opts.master_instance_type != "": - if opts.master_instance_type not in EC2_INSTANCE_TYPES: - print("Warning: Unrecognized EC2 instance type for master-instance-type: {t}".format( - t=opts.master_instance_type), file=stderr) - - if opts.ebs_vol_num > 8: - print("ebs-vol-num cannot be greater than 8", file=stderr) - sys.exit(1) - - # Prevent breaking ami_prefix (/, .git and startswith checks) - # Prevent forks with non spark-ec2 names for now. - if opts.spark_ec2_git_repo.endswith("/") or \ - opts.spark_ec2_git_repo.endswith(".git") or \ - not opts.spark_ec2_git_repo.startswith("https://github.com") or \ - not opts.spark_ec2_git_repo.endswith("spark-ec2"): - print("spark-ec2-git-repo must be a github repo and it must not have a trailing / or .git. 
" - "Furthermore, we currently only support forks named spark-ec2.", file=stderr) - sys.exit(1) - - if not (opts.deploy_root_dir is None or - (os.path.isabs(opts.deploy_root_dir) and - os.path.isdir(opts.deploy_root_dir) and - os.path.exists(opts.deploy_root_dir))): - print("--deploy-root-dir must be an absolute path to a directory that exists " - "on the local file system", file=stderr) - sys.exit(1) - - try: - if opts.profile is None: - conn = ec2.connect_to_region(opts.region) - else: - conn = ec2.connect_to_region(opts.region, profile_name=opts.profile) - except Exception as e: - print((e), file=stderr) - sys.exit(1) - - # Select an AZ at random if it was not specified. - if opts.zone == "": - opts.zone = random.choice(conn.get_all_zones()).name - - if action == "launch": - if opts.slaves <= 0: - print("ERROR: You have to start at least 1 slave", file=sys.stderr) - sys.exit(1) - if opts.resume: - (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - else: - (master_nodes, slave_nodes) = launch_cluster(conn, opts, cluster_name) - wait_for_cluster_state( - conn=conn, - opts=opts, - cluster_instances=(master_nodes + slave_nodes), - cluster_state='ssh-ready' - ) - setup_cluster(conn, master_nodes, slave_nodes, opts, True) - - elif action == "destroy": - (master_nodes, slave_nodes) = get_existing_cluster( - conn, opts, cluster_name, die_on_error=False) - - if any(master_nodes + slave_nodes): - print("The following instances will be terminated:") - for inst in master_nodes + slave_nodes: - print("> %s" % get_dns_name(inst, opts.private_ips)) - print("ALL DATA ON ALL NODES WILL BE LOST!!") - - msg = "Are you sure you want to destroy the cluster {c}? 
(y/N) ".format(c=cluster_name) - response = raw_input(msg) - if response == "y": - print("Terminating master...") - for inst in master_nodes: - inst.terminate() - print("Terminating slaves...") - for inst in slave_nodes: - inst.terminate() - - # Delete security groups as well - if opts.delete_groups: - group_names = [cluster_name + "-master", cluster_name + "-slaves"] - wait_for_cluster_state( - conn=conn, - opts=opts, - cluster_instances=(master_nodes + slave_nodes), - cluster_state='terminated' - ) - print("Deleting security groups (this will take some time)...") - attempt = 1 - while attempt <= 3: - print("Attempt %d" % attempt) - groups = [g for g in conn.get_all_security_groups() if g.name in group_names] - success = True - # Delete individual rules in all groups before deleting groups to - # remove dependencies between them - for group in groups: - print("Deleting rules in security group " + group.name) - for rule in group.rules: - for grant in rule.grants: - success &= group.revoke(ip_protocol=rule.ip_protocol, - from_port=rule.from_port, - to_port=rule.to_port, - src_group=grant) - - # Sleep for AWS eventual-consistency to catch up, and for instances - # to terminate - time.sleep(30) # Yes, it does have to be this long :-( - for group in groups: - try: - # It is needed to use group_id to make it work with VPC - conn.delete_security_group(group_id=group.id) - print("Deleted security group %s" % group.name) - except boto.exception.EC2ResponseError: - success = False - print("Failed to delete security group %s" % group.name) - - # Unfortunately, group.revoke() returns True even if a rule was not - # deleted, so this needs to be rerun if something fails - if success: - break - - attempt += 1 - - if not success: - print("Failed to delete all security groups after 3 tries.") - print("Try re-running in a few minutes.") - - elif action == "login": - (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - if not master_nodes[0].public_dns_name 
and not opts.private_ips: - print("Master has no public DNS name. Maybe you meant to specify --private-ips?") - else: - master = get_dns_name(master_nodes[0], opts.private_ips) - print("Logging into master " + master + "...") - proxy_opt = [] - if opts.proxy_port is not None: - proxy_opt = ['-D', opts.proxy_port] - subprocess.check_call( - ssh_command(opts) + proxy_opt + ['-t', '-t', "%s@%s" % (opts.user, master)]) - - elif action == "reboot-slaves": - response = raw_input( - "Are you sure you want to reboot the cluster " + - cluster_name + " slaves?\n" + - "Reboot cluster slaves " + cluster_name + " (y/N): ") - if response == "y": - (master_nodes, slave_nodes) = get_existing_cluster( - conn, opts, cluster_name, die_on_error=False) - print("Rebooting slaves...") - for inst in slave_nodes: - if inst.state not in ["shutting-down", "terminated"]: - print("Rebooting " + inst.id) - inst.reboot() - - elif action == "get-master": - (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - if not master_nodes[0].public_dns_name and not opts.private_ips: - print("Master has no public DNS name. 
Maybe you meant to specify --private-ips?") - else: - print(get_dns_name(master_nodes[0], opts.private_ips)) - - elif action == "stop": - response = raw_input( - "Are you sure you want to stop the cluster " + - cluster_name + "?\nDATA ON EPHEMERAL DISKS WILL BE LOST, " + - "BUT THE CLUSTER WILL KEEP USING SPACE ON\n" + - "AMAZON EBS IF IT IS EBS-BACKED!!\n" + - "All data on spot-instance slaves will be lost.\n" + - "Stop cluster " + cluster_name + " (y/N): ") - if response == "y": - (master_nodes, slave_nodes) = get_existing_cluster( - conn, opts, cluster_name, die_on_error=False) - print("Stopping master...") - for inst in master_nodes: - if inst.state not in ["shutting-down", "terminated"]: - inst.stop() - print("Stopping slaves...") - for inst in slave_nodes: - if inst.state not in ["shutting-down", "terminated"]: - if inst.spot_instance_request_id: - inst.terminate() - else: - inst.stop() - - elif action == "start": - (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - print("Starting slaves...") - for inst in slave_nodes: - if inst.state not in ["shutting-down", "terminated"]: - inst.start() - print("Starting master...") - for inst in master_nodes: - if inst.state not in ["shutting-down", "terminated"]: - inst.start() - wait_for_cluster_state( - conn=conn, - opts=opts, - cluster_instances=(master_nodes + slave_nodes), - cluster_state='ssh-ready' - ) - - # Determine types of running instances - existing_master_type = master_nodes[0].instance_type - existing_slave_type = slave_nodes[0].instance_type - # Setting opts.master_instance_type to the empty string indicates we - # have the same instance type for the master and the slaves - if existing_master_type == existing_slave_type: - existing_master_type = "" - opts.master_instance_type = existing_master_type - opts.instance_type = existing_slave_type - - setup_cluster(conn, master_nodes, slave_nodes, opts, False) - - else: - print("Invalid action: %s" % action, file=stderr) - sys.exit(1) 
- - -def main(): - try: - real_main() - except UsageError as e: - print("\nError:\n", e, file=stderr) - sys.exit(1) - - -if __name__ == "__main__": - logging.basicConfig() - main() diff --git a/tools/utils.py b/tools/utils.py index bac56029..39d6129f 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -20,9 +20,10 @@ def parse_nodes(active_instances, cluster_name): slave_nodes = [] for instance in active_instances: group_names = [g.name for g in instance.groups] - if (cluster_name + "-master") in group_names: + # This can handle both spark-ec2 and flintrock clusters + if (cluster_name + "-master") in group_names or (("flintrock-" + cluster_name) in group_names and instance.tags.get('flintrock-role') == 'master'): master_nodes.append(instance) - elif (cluster_name + "-slaves") in group_names: + elif (cluster_name + "-slaves") in group_names or (("flintrock-" + cluster_name) in group_names and instance.tags.get('flintrock-role') in ('slave', None)): slave_nodes.append(instance) return (master_nodes, slave_nodes) @@ -121,4 +122,3 @@ def check_call_with_timeout(args, stdin=None, stdout=None, if p.returncode != 0: raise subprocess.CalledProcessError(p.returncode, args) return p.returncode - From 2b3c7a39cc58b028579582404750cd06aa4aeeff Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Mon, 6 Feb 2017 18:17:29 -0200 Subject: [PATCH 150/268] Avoid changing permissions of .ssh directory --- remote_hook.sh | 3 ++- tools/flintrock | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 86f1f56b..081ca880 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -28,7 +28,8 @@ echo $$ > "${RUNNING_FILE}" # Let us read the spark home even when the image doesn't give us the permission -sudo chmod -R o+rx /root +sudo chmod o+rx /root +sudo chmod -R o+rx /root/spark notify_error_and_exit() { description="${1}" diff --git a/tools/flintrock b/tools/flintrock index 541697fb..325d3eb1 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 541697fb11912df6298d588b845809966e94d280 +Subproject commit 325d3eb12a2c7a732a7ebd7d1a5d806803216d03 From 58610661d80c9c79edc3a768ee951a0a2e78628e Mon Sep 17 00:00:00 2001 From: Fernando Rodrigues da Silva Date: Tue, 14 Feb 2017 16:54:58 -0200 Subject: [PATCH 151/268] make parseUri return a Try (#116) --- .../core/http/AsyncHttpClientStreamApi.scala | 6 +++--- .../scala/ignition/core/utils/URLUtils.scala | 19 ++++++++++++------- .../http/AsyncHttpClientStreamApiSpec.scala | 2 ++ .../ignition/core/utils/URLUtilsSpec.scala | 12 +++++++----- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index e95e4811..6868f0b7 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -10,6 +10,7 @@ import spray.http._ import scala.concurrent.Future import scala.concurrent.duration._ import scala.language.postfixOps +import scala.util.Try object AsyncHttpClientStreamApi { @@ -52,11 +53,10 @@ object AsyncHttpClientStreamApi { requestConfiguration: Option[RequestConfiguration] = None) { def uri: Uri = { - // Note: 
This will guarantee we create a valid request (one with a valid uri). Will throw an exception if invalid if (params.nonEmpty) - URLUtils.parseUri(url).withQuery(params) + URLUtils.parseUri(url).map(_.withQuery(params)).get else - URLUtils.parseUri(url) + URLUtils.parseUri(url).get } } diff --git a/src/main/scala/ignition/core/utils/URLUtils.scala b/src/main/scala/ignition/core/utils/URLUtils.scala index 800a3a1a..f66a3f03 100644 --- a/src/main/scala/ignition/core/utils/URLUtils.scala +++ b/src/main/scala/ignition/core/utils/URLUtils.scala @@ -6,6 +6,8 @@ import org.apache.http.client.utils.URIBuilder import spray.http.Uri import spray.http.Uri.Query +import scala.util.Try + object URLUtils { // Due to ancient standards, Java will encode space as + instead of using percent. @@ -13,14 +15,17 @@ object URLUtils { // See: // http://stackoverflow.com/questions/1634271/url-encoding-the-space-character-or-20 // https://docs.oracle.com/javase/7/docs/api/java/net/URLEncoder.html#encode(java.lang.String,%20java.lang.String) - def sanitizePathSegment(segment: String) = - URLEncoder.encode(URLDecoder.decode(segment, "UTF-8"), "UTF-8").replace("+", "%20") - - def parseUri(urlStr: String): Uri = { - val url = new URL(urlStr) - val sanePath = url.getPath.split("/").map(sanitizePathSegment).mkString("/") + def sanitizePathSegment(segment: String): Try[String] = + Try { URLEncoder.encode(URLDecoder.decode(segment, "UTF-8"), "UTF-8").replace("+", "%20") } - Uri.from( + def parseUri(urlStr: String): Try[Uri] = { + for { + url <- Try(new URL(urlStr)) + rawSegments = url.getPath.split("/") + saneSegments = rawSegments.map(sanitizePathSegment) + if saneSegments.forall(_.isSuccess) + sanePath = saneSegments.map(_.get).mkString("/") + } yield Uri.from( scheme = url.getProtocol, userinfo = Option(url.getUserInfo).getOrElse(""), host = url.getHost, diff --git a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala 
b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala index 37accf5b..fb774b6e 100644 --- a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala +++ b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala @@ -3,6 +3,8 @@ package ignition.core.http import ignition.core.http.AsyncHttpClientStreamApi.Request import org.scalatest.{FunSpec, Matchers} +import scala.util.Success + class AsyncHttpClientStreamApiSpec extends FunSpec with Matchers { it("should do the best to parse the provided uri") { diff --git a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala index 6665e3ec..114da15f 100644 --- a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala @@ -2,6 +2,8 @@ package ignition.core.utils import org.scalatest.{FlatSpec, Matchers} +import scala.util.Success + class URLUtilsSpec extends FlatSpec with Matchers { "URLUtils" should "add parameters to url with encoded params in base url and not be double encoded" in { @@ -57,19 +59,19 @@ class URLUtilsSpec extends FlatSpec with Matchers { ) tests.zip(expectations).foreach { - case (url, expected) => URLUtils.parseUri(url).toString shouldBe expected + case (url, expected) => URLUtils.parseUri(url).map(_.toString) shouldBe Success(expected) } } it should "not encode percent characters in url path" in { val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" - val sane = URLUtils.parseUri(url).toString - sane shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" + val sane = URLUtils.parseUri(url).map(_.toString) + sane shouldBe Success("http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf") } it should "encode space characters with percent in URL path" in { val url = 
"http://www.example.com/Pentagrama+Invertido.xml?q=blah+bleh" - val sane = URLUtils.parseUri(url).toString - sane shouldBe "http://www.example.com/Pentagrama%20Invertido.xml?q=blah+bleh" + val sane = URLUtils.parseUri(url).map(_.toString) + sane shouldBe Success("http://www.example.com/Pentagrama%20Invertido.xml?q=blah+bleh") } } From 21f9136cba067b5f2b6b53dff4e548e552e229a6 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 1 Mar 2017 18:09:39 -0300 Subject: [PATCH 152/268] Make it possible to save job execution --- remote_hook.sh | 2 ++ src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 2 ++ 2 files changed, 4 insertions(+) diff --git a/remote_hook.sh b/remote_hook.sh index 081ca880..6648ccd8 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -31,6 +31,8 @@ echo $$ > "${RUNNING_FILE}" sudo chmod o+rx /root sudo chmod -R o+rx /root/spark +sudo mkdir -p /media/tmp/spark-events + notify_error_and_exit() { description="${1}" echo "Exiting because: ${description}" diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index 0dec0896..ab47ee12 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -73,6 +73,8 @@ object CoreJobRunner { val sparkConf = new SparkConf() sparkConf.set("spark.executor.memory", config.executorMemory) + sparkConf.set("spark.eventLog.dir", "file:///media/tmp/spark-events") + sparkConf.setMaster(config.master) sparkConf.setAppName(appName) From efbf31b1ca8d19def8de0b8c4fc895f595241a57 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 14 Mar 2017 17:49:11 -0300 Subject: [PATCH 153/268] Make EBS root size be configurable --- tools/cluster.py | 3 +++ tools/flintrock | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 4a99a214..83e2b81d 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -41,6 +41,7 @@ default_executor_instances = '1' default_master_instance_type = '' default_driver_heap_size = '12G' +default_min_root_ebs_size_gb = '30' default_region = 'us-east-1' default_zone = default_region + 'b' default_key_id = 'ignition_key' @@ -217,6 +218,7 @@ def launch(cluster_name, slaves, # TODO: consider implementing in flintrock master_instance_type=default_master_instance_type, executor_instances=default_executor_instances, + min_root_ebs_size_gb=default_min_root_ebs_size_gb, retries_on_same_cluster=5, max_clusters_to_create=5, minimum_percentage_healthy_slaves=0.9, @@ -281,6 +283,7 @@ def launch(cluster_name, slaves, '--ec2-region', region, '--ec2-availability-zone', zone, '--ec2-instance-type', instance_type, + '--ec2-min-root-ebs-size-gb', min_root_ebs_size_gb, '--assume-yes', '--install-spark', '--install-hdfs', diff --git a/tools/flintrock b/tools/flintrock index 325d3eb1..ecee499a 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 325d3eb12a2c7a732a7ebd7d1a5d806803216d03 +Subproject commit ecee499a762aa0dc5e5a875f096f8f606f0e79ea From 2f57ac4caa8b3042734998ed5cee648b78142ceb Mon Sep 17 00:00:00 2001 From: Fernando Rodrigues da Silva Date: Mon, 3 Apr 2017 14:45:27 -0300 Subject: [PATCH 154/268] updating flintrock version (#118) --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index ecee499a..f9091b3c 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit ecee499a762aa0dc5e5a875f096f8f606f0e79ea +Subproject commit f9091b3ce508c814fd97ab3936ae77335feafff8 From 
d15fca4a505ed8fee35d4c2837d04f7e18cffa7d Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Mon, 3 Apr 2017 20:54:05 -0300 Subject: [PATCH 155/268] Improve flintrock sanity check and minor stuff --- src/main/scala/ignition/core/utils/IntBag.scala | 4 ++++ tools/cluster.py | 16 +++++++++------- tools/flintrock | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/main/scala/ignition/core/utils/IntBag.scala b/src/main/scala/ignition/core/utils/IntBag.scala index a53d2d8f..1dfce82a 100644 --- a/src/main/scala/ignition/core/utils/IntBag.scala +++ b/src/main/scala/ignition/core/utils/IntBag.scala @@ -13,6 +13,10 @@ object IntBag { } case class IntBag(histogram: collection.Map[Long, Long]) { + + def +(n: Long) = + this ++ IntBag.from(n :: Nil) + def ++(other: IntBag): IntBag = { val newHistogram = scala.collection.mutable.HashMap.empty[Long, Long] (histogram.keySet ++ other.histogram.keySet).foreach(k => newHistogram += (k -> (histogram.getOrElse(k, 0L) + other.histogram.getOrElse(k, 0L)))) diff --git a/tools/cluster.py b/tools/cluster.py index 83e2b81d..29d75ac2 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -137,12 +137,13 @@ def chdir_to_ec2_script_and_get_path(): return ec2_script_path -def call_ec2_script(args, timeout_total_minutes, timeout_inactivity_minutes): +def call_ec2_script(args, timeout_total_minutes, timeout_inactivity_minutes, stdout=None): ec2_script_path = chdir_to_ec2_script_and_get_path() return check_call_with_timeout(['/usr/bin/env', 'python3', '-u', ec2_script_path] + args, - timeout_total_minutes=timeout_total_minutes, - timeout_inactivity_minutes=timeout_inactivity_minutes) + stdout=stdout, + timeout_total_minutes=timeout_total_minutes, + timeout_inactivity_minutes=timeout_inactivity_minutes) def cluster_exists(cluster_name, region): @@ -712,7 +713,8 @@ def killall_jobs(cluster_name, key_file=default_key_file, def check_flintrock_installation(): try: - call_ec2_script(['--help'], 1 , 1) + with 
file('/dev/null', 'w') as devnull: + call_ec2_script(['--help'], 1 , 1, stdout=devnull) except: setup = os.path.join(ec2_script_base_path(), 'setup.py') if not os.path.exists(setup): @@ -726,10 +728,10 @@ def check_flintrock_installation(): else: log.error(''' Some dependencies are missing. For an Ubuntu system, try the following: -sudo apt-get install python3-yaml libyaml-dev +sudo apt-get install python3-yaml libyaml-dev python3-pip sudo python3 -m pip install -U pip packaging setuptools cd {flintrock} -sudo pip3 -r requirements/user.pip +sudo pip3 install -r requirements/user.pip '''.format(flintrock=ec2_script_base_path())) sys.exit(1) @@ -740,5 +742,5 @@ def check_flintrock_installation(): kill_job, killall_jobs, collect_job_results], namespace="jobs") if __name__ == '__main__': - + check_flintrock_installation() parser.dispatch() diff --git a/tools/flintrock b/tools/flintrock index f9091b3c..dd7354ac 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit f9091b3ce508c814fd97ab3936ae77335feafff8 +Subproject commit dd7354ac8319ecbc6240ef5542ecfeeb4c0f55a6 From a8d8a226f80f03a561a3565cbb192bc024d7d3f3 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 4 Apr 2017 15:27:54 -0300 Subject: [PATCH 156/268] Use master flintrock --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index dd7354ac..4629fe4b 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit dd7354ac8319ecbc6240ef5542ecfeeb4c0f55a6 +Subproject commit 4629fe4bc1f333dd149a44dcc5d9b8775186b324 From 66b16c51a95279d2ebcd52b72632f85c9058f32f Mon Sep 17 00:00:00 2001 From: Rafael Zimmermann Date: Thu, 15 Jun 2017 16:40:52 +0200 Subject: [PATCH 157/268] Create README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8f30027e..8b395319 100644 --- a/README.md +++ b/README.md @@ -8,4 +8,4 @@ It also provides many utilities for Spark jobs and Scala programs in general. It should be used inside a project as a submodule. See https://github.com/chaordic/ignition-template for an example. # Getting started -See http://monkeys.chaordic.com.br/start-using-spark-with-ignition/ for quick-start tutorial +See [Start using Spark with Ignition!](http://monkeys.chaordic.com.br/2015/03/22/start-using-spark-with-ignition.html) for quick-start tutorial From 4861be7fc664dcc3abb4963d1d3dfd84f392dc8c Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 7 Jul 2017 10:01:12 -0300 Subject: [PATCH 158/268] disable verbose spark logging for tests --- .../ignition/core/testsupport/spark/SharedSparkContext.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/ignition/core/testsupport/spark/SharedSparkContext.scala b/src/main/scala/ignition/core/testsupport/spark/SharedSparkContext.scala index 314d5442..4fa5756b 100644 --- a/src/main/scala/ignition/core/testsupport/spark/SharedSparkContext.scala +++ b/src/main/scala/ignition/core/testsupport/spark/SharedSparkContext.scala @@ -33,6 +33,7 @@ trait SharedSparkContext extends BeforeAndAfterAll { self: Suite => 
//Logger.getRootLogger().removeAllAppenders(); //Logger.getRootLogger().addAppender(new NullAppender()); _sc = new SparkContext("local", "test", conf) + _sc.setLogLevel("OFF") super.beforeAll() } From c628ed529e796cafc2ca981fd8e7183d3646e3b1 Mon Sep 17 00:00:00 2001 From: Henrique Goulart Date: Wed, 19 Jul 2017 16:09:54 -0300 Subject: [PATCH 159/268] Fix weird await on test --- .../ignition/core/cache/ExpiringMultipleLevelCache.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index c321f794..7f2101c3 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -2,6 +2,7 @@ package ignition.core.cache import akka.actor.ActorSystem import ignition.core.cache.ExpiringMultiLevelCache.TimestampedValue +import org.scalatest.concurrent.ScalaFutures import org.scalatest.{FlatSpec, Matchers} import spray.caching.ExpiringLruLocalCache @@ -9,7 +10,7 @@ import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent.duration._ import scala.concurrent.{Await, Future} -class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers { +class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFutures { case class Data(s: String) implicit val scheduler = ActorSystem().scheduler @@ -25,8 +26,9 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers { class MyException(s: String) extends Exception(s) - intercept[MyException ] { - Await.result(cache("key", () => Future.failed(new MyException("some failure"))), 1.minute) + val eventualCache = cache("key", () => Future.failed(new MyException("some failure"))) + whenReady(eventualCache.failed) { failure => + failure shouldBe a [MyException] } } } From c8fcc3ed88488bf48c97bb6c1dbaf709bb9e1125 Mon Sep 17 00:00:00 2001 
From: Henrique Goulart Date: Wed, 19 Jul 2017 16:10:01 -0300 Subject: [PATCH 160/268] Add log4j configuration to avoid annoying test log --- src/test/resources/log4j.properties | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 src/test/resources/log4j.properties diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties new file mode 100644 index 00000000..8455c4cf --- /dev/null +++ b/src/test/resources/log4j.properties @@ -0,0 +1,21 @@ +log4j.rootCategory=ERROR, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Ignition! +log4j.logger.ignition=ERROR + +# Disable annoying logger that is always logging an error message on ExpiringMultipleLevelCacheSpec test +log4j.logger.ignition.core.cache.ExpiringMultiLevelCache=OFF + +# Spark, Hadoop, etc +log4j.logger.org.apache=ERROR + +# Akka +log4j.logger.Remoting=ERROR + +# Jetty +log4j.logger.org.eclipse.jetty=ERROR +log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR +org.eclipse.jetty.LEVEL=ERROR From ce4cfb50f87dbcb5375aa265472b12d25cc12a70 Mon Sep 17 00:00:00 2001 From: Henrique Goulart Date: Wed, 19 Jul 2017 16:20:13 -0300 Subject: [PATCH 161/268] Fix wrong file name --- ...tipleLevelCache.scala => ExpiringMultipleLevelCacheSpec.scala} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/test/scala/ignition/core/cache/{ExpiringMultipleLevelCache.scala => ExpiringMultipleLevelCacheSpec.scala} (100%) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala similarity index 100% rename from src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala rename to src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala From 
34a13f84a2d030ff161eb710472471050cf6c4e3 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 25 Jul 2017 10:18:30 -0300 Subject: [PATCH 162/268] update dependencies --- build.sbt | 27 +++++++++---------- .../testsupport/spark/LocalSparkContext.scala | 2 +- .../ignition/core/utils/BetterTrace.scala | 2 +- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/build.sbt b/build.sbt index ad80612f..63b5c2e2 100644 --- a/build.sbt +++ b/build.sbt @@ -2,18 +2,17 @@ name := "Ignition-Core" version := "1.0" -scalaVersion := "2.11.8" +scalaVersion := "2.11.11" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.2" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.2.0" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") - libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") @@ -21,29 +20,29 @@ libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") .exclude("commons-beanutils", "commons-beanutils") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" +libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.3" -libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.9" +libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.2.14" -libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0" +libraryDependencies += "com.github.scopt" %% "scopt" % "3.6.0" -libraryDependencies += "joda-time" % "joda-time" % "2.9.4" +libraryDependencies += "joda-time" % "joda-time" % "2.9.9" -libraryDependencies += "org.joda" % "joda-convert" % "1.7" +libraryDependencies += 
"org.joda" % "joda-convert" % "1.8.2" libraryDependencies += "commons-lang" % "commons-lang" % "2.6" -libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.5" +libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.25" -libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.3.4" +libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.5.3" -libraryDependencies += "io.spray" %% "spray-json" % "1.3.2" +libraryDependencies += "io.spray" %% "spray-json" % "1.3.3" -libraryDependencies += "io.spray" %% "spray-client" % "1.3.2" +libraryDependencies += "io.spray" %% "spray-client" % "1.3.4" -libraryDependencies += "io.spray" %% "spray-http" % "1.3.2" +libraryDependencies += "io.spray" %% "spray-http" % "1.3.4" -libraryDependencies += "io.spray" %% "spray-caching" % "1.3.2" +libraryDependencies += "io.spray" %% "spray-caching" % "1.3.4" resolvers += "Akka Repository" at "http://repo.akka.io/releases/" diff --git a/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala b/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala index a272edaa..814f565d 100644 --- a/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala +++ b/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala @@ -26,7 +26,7 @@ trait LocalSparkContext extends BeforeAndAfterEach with BeforeAndAfterAll { self @transient var sc: SparkContext = _ override def beforeAll() { - InternalLoggerFactory.setDefaultFactory(new Slf4JLoggerFactory()) + InternalLoggerFactory.setDefaultFactory(Slf4JLoggerFactory.INSTANCE) super.beforeAll() } diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala index 09de73aa..9c91ca05 100644 --- a/src/main/scala/ignition/core/utils/BetterTrace.scala +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -3,7 +3,7 @@ package ignition.core.utils import ignition.core.utils.ExceptionUtils._ // Used mainly to augment scalacheck traces in 
scalatest trait BetterTrace { - def fail(message: String): Nothing + def fail(message: String): Nothing = throw new NotImplementedError(message) def withBetterTrace(block: => Unit): Unit = try { From 2fd48757063e1146c0e85b4bc39e49454049df2d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 27 Jul 2017 16:16:01 -0300 Subject: [PATCH 163/268] update spark and flintrock --- tools/cluster.py | 2 +- tools/flintrock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 29d75ac2..5efefeb5 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,7 +49,7 @@ default_ami = 'ami-611e7976' default_master_ami = '' default_env = 'dev' -default_spark_version = '2.0.2' +default_spark_version = '2.2.0' default_hdfs_version = '2.7.2' default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' diff --git a/tools/flintrock b/tools/flintrock index 4629fe4b..2cc5ddaf 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 4629fe4bc1f333dd149a44dcc5d9b8775186b324 +Subproject commit 2cc5ddaf12a5850a710c168c9b52def6cfdcadd0 From 766ab9cec3aae0f2a13e9df5bf2ffcfc39f7993d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 1 Aug 2017 09:58:02 -0300 Subject: [PATCH 164/268] update flintrock --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index 2cc5ddaf..eba6ab1d 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 2cc5ddaf12a5850a710c168c9b52def6cfdcadd0 +Subproject commit eba6ab1dceb942937bdc9610736e70d72e2a6579 From 5141aa7724284f324c583eee720fd3b4c9bc4390 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 1 Aug 2017 11:24:33 -0300 Subject: [PATCH 165/268] fix versions --- build.sbt | 12 ++++++------ 1 file 
changed, 6 insertions(+), 6 deletions(-) diff --git a/build.sbt b/build.sbt index 63b5c2e2..f4fa51d0 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ name := "Ignition-Core" version := "1.0" -scalaVersion := "2.11.11" +scalaVersion := "2.11.8" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") @@ -34,15 +34,15 @@ libraryDependencies += "commons-lang" % "commons-lang" % "2.6" libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.25" -libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.5.3" +libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.3.4" -libraryDependencies += "io.spray" %% "spray-json" % "1.3.3" +libraryDependencies += "io.spray" %% "spray-json" % "1.3.2" -libraryDependencies += "io.spray" %% "spray-client" % "1.3.4" +libraryDependencies += "io.spray" %% "spray-client" % "1.3.2" -libraryDependencies += "io.spray" %% "spray-http" % "1.3.4" +libraryDependencies += "io.spray" %% "spray-http" % "1.3.2" -libraryDependencies += "io.spray" %% "spray-caching" % "1.3.4" +libraryDependencies += "io.spray" %% "spray-caching" % "1.3.2" resolvers += "Akka Repository" at "http://repo.akka.io/releases/" From 0c0d986a05e5e79259b5dd4ae607f8fe5b8a7663 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Fri, 18 Aug 2017 18:15:17 -0400 Subject: [PATCH 166/268] ExpiringMultiLevelCache: caching 404 --- .../core/cache/ExpiringMultiLevelCache.scala | 102 ++++++++++++++---- 1 file changed, 79 insertions(+), 23 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 138d6cbd..8f953550 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -17,10 +17,22 @@ import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} object 
ExpiringMultiLevelCache { - case class TimestampedValue[V](date: DateTime, value: V) { + case class TimestampedValue[V](date: DateTime, + value: Option[V] = None, + status4XX: Boolean = false, + status5XX: Boolean = false, + error: Option[Throwable] = None) { def hasExpired(ttl: FiniteDuration, now: DateTime): Boolean = { + // TODO: Cached Error should have little ttl date.plus(ttl.toMillis).isBefore(now) } + def getValue: V = { + this.value match { + case Some(x) => x + // We should never try to get a value that was saved as None, probably from an error + case None => throw new Exception("Trying to get None value") + } + } } trait GenericCache[V] { cache => @@ -109,7 +121,6 @@ object ExpiringMultiLevelCache { import ignition.core.cache.ExpiringMultiLevelCache._ - case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, localCache: Option[LocalCache[TimestampedValue[V]]], remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, @@ -128,12 +139,34 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, protected def now = DateTime.now.withZone(DateTimeZone.UTC) - private def timestamp(v: V) = TimestampedValue(now, v) + private def timestamp(v: V) = TimestampedValue(date = now, value = Some(v)) + + private def timestamp(status4XX: Boolean, status5XX: Boolean, error: Throwable): TimestampedValue[V] = { + TimestampedValue(value = None, date = now, status4XX = status4XX, status5XX = status5XX, error = Some(error)) + } private def elapsedTime(startNanoTime: Long) = FiniteDuration(System.nanoTime() - startNanoTime, TimeUnit.NANOSECONDS) private def remoteLockKey(key: Any) = s"$key-emlc-lock" + case class SavedErrorCache(ttl: FiniteDuration = 1.minutes, + status4XX: Boolean = false, + status5XX: Boolean = false, + error: Throwable) + + case class CustomException(private val message: String = "", private val cause: Throwable = None.orNull) extends Exception(message, cause) + + private def checkSavedErrorCache(key: String, genValue: () => Future[V], startTime: 
Long, v: TimestampedValue[V]): Future[V] = { + val promise = Promise[V]() + val future = promise.future + if (v.status4XX || v.status5XX) { + promise.tryFailure(v.error.getOrElse(None.orNull)) + } + else { + promise.trySuccess(v.getValue) + } + future + } // The idea is simple, have two caches: remote and local // with values that will eventually expire but still be left on the cache @@ -141,7 +174,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { // The local cache is always the first try. We'll only look the remote if the local value is missing or has expired val startTime = System.nanoTime() - val result = localCache.flatMap(_.get(key).map(_.asTry())) match { + val result: Future[V] = localCache.flatMap(_.get(key).map(_.asTry())) match { case Some(future) => future.flatMap { case Success(localValue) if !localValue.hasExpired(ttl, now) => @@ -149,9 +182,10 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, reporter.onLocalCacheHit(key, elapsedTime(startTime)) // But if we're paranoid, let's check if the local value is consistent with remote if (sanityLocalValueCheck) - remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(Future.successful(localValue.value)) + remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(checkSavedErrorCache(key, genValue, startTime, localValue)) else - Future.successful(localValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, localValue) case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { @@ -159,35 +193,40 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is good, set locally and return it 
reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) - Future.successful(remoteValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, remoteValue) case Success(Some(expiredRemote)) => // Expired local and expired remote, return the most recent of them, async update both reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) - Future.successful(mostRecent.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, mostRecent) case Success(None) => // No remote found, return local, async update both reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) - Future.successful(expiredLocalValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, expiredLocalValue) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key expired local value and failed to get remote", e) tryGenerateAndSet(key, genValue, startTime) - Future.successful(expiredLocalValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, expiredLocalValue) } case Success(expiredLocalValue) if remoteRW.isEmpty => // There is no remote cache configured, we'are on our own // Return expired value and try to generate a new one for the future reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) - Future.successful(expiredLocalValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, expiredLocalValue) case Failure(e) => // This is almost impossible to happen because it's local and we don't save failed values reporter.onLocalError(key, e, elapsedTime(startTime)) 
logger.warn(s"apply, key: $key got a failed future from cache!? This is almost impossible!", e) - tryGenerateAndSet(key, genValue, startTime).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.getValue) } case None if remoteRW.nonEmpty => // No local, let's try remote @@ -196,26 +235,28 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) - Future.successful(remoteValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, remoteValue) case Success(Some(expiredRemote)) => // Expired remote, return the it, async update reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.value) - Future.successful(expiredRemote.value) + tryGenerateAndSet(key, genValue, startTime).map(_.getValue) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, expiredRemote) case Success(None) => // No good remote, sync generate reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.getValue) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key expired local value and remote error", e) - tryGenerateAndSet(key, genValue, startTime).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.getValue) } case None if remoteRW.isEmpty => // No local and no remote to look, just generate it // The caller will need to wait for the value generation reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.getValue) } result.onComplete { case Success(_) => @@ -255,7 +296,8 @@ case class 
ExpiringMultiLevelCache[V](ttl: FiniteDuration, remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if remoteValue == localValue => // Remote is the same as local, return any of them - Future.successful(remoteValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, remoteValue) case Success(Some(remoteValue)) => // Something is different, try to figure it out val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" @@ -275,18 +317,20 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, logger.warn(s"sanityLocalValueCheck, key $key: got different results for local $localValue and remote $remoteValue ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) // return remote to keep everyone consistent - Future.successful(remoteValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, remoteValue) case Success(None) => val localExpired = localValue.hasExpired(ttl, now) val finalResult = s"missing-remote-local-expired-${localExpired}" logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) // Try generate it to keep a behaviour equivalent to remote only - tryGenerateAndSet(key, genValue, startTime).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.getValue) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"sanityLocalValueCheck, key: $key failed to get remote", e) - Future.successful(localValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, localValue) } } @@ -379,7 +423,19 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case Failure(eRemote) => // The real error is the eLocal, return it logger.error(s"canonicalValueGenerator, key 
$key: failed to generate value and failed to get remote", eLocal) - Future.failed(eLocal) + eLocal match { + case NonFatal(e) => { + // if error was nonFatal (404) then saves it to cache + // TODO: check if it is actually a 4XX error, or something else + // TODO: handle 5XX errors as well? + val timestampedValue = timestamp(status4XX = true, status5XX = false, error = e) + remoteSetOrGet(key, timestampedValue, remote, nanoStartTime) + Future.failed(eLocal) + } + case _ => { + Future.failed(eLocal) + } + } } } } From 36c1448b4e270adf5da4f03d0f07ba1899dda611 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Tue, 22 Aug 2017 14:48:43 -0400 Subject: [PATCH 167/268] ExpiringMultiLevelCache: caching errors when only LocalCache --- .../core/cache/ExpiringMultiLevelCache.scala | 17 ++++++++--- .../ExpiringMultipleLevelCacheSpec.scala | 29 +++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 8f953550..8674c174 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -414,7 +414,18 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case None => // There are no remote RW caches logger.error(s"canonicalValueGenerator, key $key: failed to generate value and no remote cache configured", eLocal) - Future.failed(eLocal) + eLocal match { + case NonFatal(e) => { + // if error was nonFatal (404) then saves it to cache + // TODO: check if it is actually a 4XX error, or something else + // TODO: handle 5XX errors as well? 
+ val timestampedValue = timestamp(status4XX = true, status5XX = false, error = e) + // Saved it only in localCache + localCache.foreach(_.set(key, timestampedValue)) + Future.failed(eLocal) + } + case _ => Future.failed(eLocal) + } case Some(remote) => remoteGetNonExpiredValue(key, remote, nanoStartTime).asTry().flatMap { case Success(v) => @@ -432,9 +443,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, remoteSetOrGet(key, timestampedValue, remote, nanoStartTime) Future.failed(eLocal) } - case _ => { - Future.failed(eLocal) - } + case _ => Future.failed(eLocal) } } } diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index 7f2101c3..bd932868 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -31,4 +31,33 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu failure shouldBe a [MyException] } } + + it should "calculate a value on cache miss just once, the second call should be from cache hit" in { + var myFailedRequestCount: Int = 0 + + // TODO: Throw a 404 error + class MyException(s: String) extends ArithmeticException(s) // Some NonFatal Exception + def myFailedRequest(): Future[Nothing] = { + println("calling myFailedRequest()") + myFailedRequestCount = myFailedRequestCount + 1 + Future.failed(new MyException("some failure")) + } + + val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) + val cache = ExpiringMultiLevelCache[Data](1.minute, Option(local)) + + val eventualCache = cache("key", myFailedRequest) + whenReady(eventualCache.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + + val eventualCache2 = cache("key", myFailedRequest) + whenReady(eventualCache2.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount 
shouldBe 1 + } + + } + } From 131f6baf94b40ba4246fe0a26d88f49434670253 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Thu, 24 Aug 2017 16:25:02 -0400 Subject: [PATCH 168/268] ExpiringMultiLevelCache: some improvement in tests --- .../ExpiringMultipleLevelCacheSpec.scala | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index bd932868..3a02d903 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -1,5 +1,7 @@ package ignition.core.cache +import java.io.FileNotFoundException + import akka.actor.ActorSystem import ignition.core.cache.ExpiringMultiLevelCache.TimestampedValue import org.scalatest.concurrent.ScalaFutures @@ -35,8 +37,7 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu it should "calculate a value on cache miss just once, the second call should be from cache hit" in { var myFailedRequestCount: Int = 0 - // TODO: Throw a 404 error - class MyException(s: String) extends ArithmeticException(s) // Some NonFatal Exception + class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception def myFailedRequest(): Future[Nothing] = { println("calling myFailedRequest()") myFailedRequestCount = myFailedRequestCount + 1 @@ -58,6 +59,24 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu myFailedRequestCount shouldBe 1 } + val eventualCache3 = cache("key", myFailedRequest) + whenReady(eventualCache3.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + + val eventualCache4 = cache("key", myFailedRequest) + whenReady(eventualCache4.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + + val 
eventualCache5 = cache("key", myFailedRequest) + whenReady(eventualCache5.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + } } From bf04dc6593d0d5b70fa09b51536d922edfb97c33 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Thu, 24 Aug 2017 17:17:14 -0400 Subject: [PATCH 169/268] ExpiringMultiLevelCache: optional support to cache error with differrent ttl --- .../core/cache/ExpiringMultiLevelCache.scala | 75 +++++++-------- .../ExpiringMultipleLevelCacheSpec.scala | 96 ++++++++++++++++++- 2 files changed, 130 insertions(+), 41 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 8674c174..b5c10667 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -19,12 +19,14 @@ import scala.util.{Failure, Success, Try} object ExpiringMultiLevelCache { case class TimestampedValue[V](date: DateTime, value: Option[V] = None, - status4XX: Boolean = false, - status5XX: Boolean = false, - error: Option[Throwable] = None) { - def hasExpired(ttl: FiniteDuration, now: DateTime): Boolean = { - // TODO: Cached Error should have little ttl - date.plus(ttl.toMillis).isBefore(now) + hasError: Boolean = false, + throwable: Option[Throwable] = None) { + def hasExpired(ttl: FiniteDuration, now: DateTime, ttlCachedErrors: FiniteDuration = 1.minute): Boolean = { + if (!hasError) { + date.plus(ttl.toMillis).isBefore(now) + } else { + date.plus(ttlCachedErrors.toMillis).isBefore(now) + } } def getValue: V = { this.value match { @@ -129,7 +131,9 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, maxErrorsToRetryOnRemote: Int = 5, backoffOnLockAcquire: FiniteDuration = 50.milliseconds, backoffOnError: FiniteDuration = 50.milliseconds, - sanityLocalValueCheck: Boolean = false) extends GenericCache[V] { + sanityLocalValueCheck: Boolean 
= false, + cacheErrors: Boolean = false, + ttlCachedErrors: FiniteDuration = 1.minute) extends GenericCache[V] { private val logger = LoggerFactory.getLogger(getClass) @@ -141,26 +145,19 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, private def timestamp(v: V) = TimestampedValue(date = now, value = Some(v)) - private def timestamp(status4XX: Boolean, status5XX: Boolean, error: Throwable): TimestampedValue[V] = { - TimestampedValue(value = None, date = now, status4XX = status4XX, status5XX = status5XX, error = Some(error)) + private def timestamp(hasError: Boolean, throwable: Throwable): TimestampedValue[V] = { + TimestampedValue(value = None, date = now, hasError = hasError, throwable = Some(throwable)) } private def elapsedTime(startNanoTime: Long) = FiniteDuration(System.nanoTime() - startNanoTime, TimeUnit.NANOSECONDS) private def remoteLockKey(key: Any) = s"$key-emlc-lock" - case class SavedErrorCache(ttl: FiniteDuration = 1.minutes, - status4XX: Boolean = false, - status5XX: Boolean = false, - error: Throwable) - - case class CustomException(private val message: String = "", private val cause: Throwable = None.orNull) extends Exception(message, cause) - private def checkSavedErrorCache(key: String, genValue: () => Future[V], startTime: Long, v: TimestampedValue[V]): Future[V] = { val promise = Promise[V]() val future = promise.future - if (v.status4XX || v.status5XX) { - promise.tryFailure(v.error.getOrElse(None.orNull)) + if (v.hasError) { + promise.tryFailure(v.throwable.getOrElse(None.orNull)) } else { promise.trySuccess(v.getValue) @@ -177,7 +174,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, val result: Future[V] = localCache.flatMap(_.get(key).map(_.asTry())) match { case Some(future) => future.flatMap { - case Success(localValue) if !localValue.hasExpired(ttl, now) => + case Success(localValue) if !localValue.hasExpired(ttl, now, ttlCachedErrors) => // We have locally a good value, just return it 
reporter.onLocalCacheHit(key, elapsedTime(startTime)) // But if we're paranoid, let's check if the local value is consistent with remote @@ -189,7 +186,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) @@ -231,7 +228,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case None if remoteRW.nonEmpty => // No local, let's try remote remoteRW.get.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) @@ -311,8 +308,8 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, "same-date-on-utc" else "impossible-dates" - val remoteExpired = remoteValue.hasExpired(ttl, now) - val localExpired = localValue.hasExpired(ttl, now) + val remoteExpired = remoteValue.hasExpired(ttl, now, ttlCachedErrors) + val localExpired = localValue.hasExpired(ttl, now, ttlCachedErrors) val finalResult = s"$valuesResult-$dateResult-remote-expired-${remoteExpired}-local-expired-${localExpired}" logger.warn(s"sanityLocalValueCheck, key $key: got different results for local $localValue and remote $remoteValue ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) @@ -320,7 +317,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // We can even get a 
SavedErrorCache checkSavedErrorCache(key, genValue, startTime, remoteValue) case Success(None) => - val localExpired = localValue.hasExpired(ttl, now) + val localExpired = localValue.hasExpired(ttl, now, ttlCachedErrors) val finalResult = s"missing-remote-local-expired-${localExpired}" logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) @@ -363,7 +360,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case null => logger.info(s"tryGenerateAndSet, key $key: got request for generating and none in progress found, calling canonicalValueGenerator") canonicalValueGenerator(key, genValue, nanoStartTime).onComplete { - case Success(v) if !v.hasExpired(ttl, now) => + case Success(v) if !v.hasExpired(ttl, now, ttlCachedErrors) => reporter.onGeneratedWithSuccess(key, elapsedTime(nanoStartTime)) localCache.foreach(_.set(key, v)) promise.trySuccess(v) @@ -416,12 +413,12 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, logger.error(s"canonicalValueGenerator, key $key: failed to generate value and no remote cache configured", eLocal) eLocal match { case NonFatal(e) => { - // if error was nonFatal (404) then saves it to cache - // TODO: check if it is actually a 4XX error, or something else - // TODO: handle 5XX errors as well? 
- val timestampedValue = timestamp(status4XX = true, status5XX = false, error = e) - // Saved it only in localCache - localCache.foreach(_.set(key, timestampedValue)) + if (cacheErrors) { + // if error was NonFatal Error then saves it to cache + val timestampedValue = timestamp(hasError = true, throwable = e) + // Saved it only in localCache + localCache.foreach(_.set(key, timestampedValue)) + } Future.failed(eLocal) } case _ => Future.failed(eLocal) @@ -436,11 +433,11 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, logger.error(s"canonicalValueGenerator, key $key: failed to generate value and failed to get remote", eLocal) eLocal match { case NonFatal(e) => { - // if error was nonFatal (404) then saves it to cache - // TODO: check if it is actually a 4XX error, or something else - // TODO: handle 5XX errors as well? - val timestampedValue = timestamp(status4XX = true, status5XX = false, error = e) - remoteSetOrGet(key, timestampedValue, remote, nanoStartTime) + if (cacheErrors) { + // if error was NonFatal Error then saves it to cache + val timestampedValue = timestamp(hasError = true, throwable = e) + remoteSetOrGet(key, timestampedValue, remote, nanoStartTime) + } Future.failed(eLocal) } case _ => Future.failed(eLocal) @@ -457,7 +454,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => logger.info(s"remoteGetNonExpiredValue, key $key: got a good value") Future.successful(remoteValue) case Success(_) => @@ -496,7 +493,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, logger.info(s"remoteSetOrGet got lock for key $key") // Lock acquired, get the current value and replace it 
remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => // Current value is good, just return it reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) logger.info(s"remoteSetOrGet got lock for $key but found already a good value on remote") @@ -529,7 +526,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case Success(false) => // Someone got the lock, let's take a look at the value remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => // Current value is good, just return it logger.info(s"remoteSetOrGet couldn't lock key $key but found a good on remote afterwards") reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index 3a02d903..ecf99a0d 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -39,13 +39,12 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception def myFailedRequest(): Future[Nothing] = { - println("calling myFailedRequest()") myFailedRequestCount = myFailedRequestCount + 1 Future.failed(new MyException("some failure")) } val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](1.minute, Option(local)) + val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = true, ttlCachedErrors = 9.seconds) val eventualCache = cache("key", 
myFailedRequest) whenReady(eventualCache.failed) { failure => @@ -79,4 +78,97 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } + it should "calculate a value on cache miss on every request" in { + var myFailedRequestCount: Int = 0 + + class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception + def myFailedRequest(): Future[Nothing] = { + myFailedRequestCount = myFailedRequestCount + 1 + Future.failed(new MyException("some failure")) + } + + val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) + val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = false) + + val eventualCache = cache("key", myFailedRequest) + whenReady(eventualCache.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + + val eventualCache2 = cache("key", myFailedRequest) + whenReady(eventualCache2.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 2 + } + + val eventualCache3 = cache("key", myFailedRequest) + whenReady(eventualCache3.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 3 + } + + val eventualCache4 = cache("key", myFailedRequest) + whenReady(eventualCache4.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 4 + } + + val eventualCache5 = cache("key", myFailedRequest) + whenReady(eventualCache5.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 5 + } + + } + + it should "calculate a value on cache miss, then wait ttlCachedError to get a cache miss again" in { + var myFailedRequestCount: Int = 0 + + class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception + def myFailedRequest(): Future[Nothing] = { + myFailedRequestCount = myFailedRequestCount + 1 + Future.failed(new MyException("some failure")) + } + + val local = new 
ExpiringLruLocalCache[TimestampedValue[Data]](100) + val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = true, ttlCachedErrors = 9.seconds) + + val eventualCache = cache("key", myFailedRequest) + whenReady(eventualCache.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + + val eventualCache2 = cache("key", myFailedRequest) + whenReady(eventualCache2.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + + Thread.sleep(10000) + + val eventualCache3 = cache("key", myFailedRequest) + whenReady(eventualCache3.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 2 + } + + val eventualCache4 = cache("key", myFailedRequest) + whenReady(eventualCache4.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 2 + } + + Thread.sleep(1000) + + val eventualCache5 = cache("key", myFailedRequest) + whenReady(eventualCache5.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 2 + } + + } + + } From 44651b4d8aeccd7e42dd324d9af386a1f699d180 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Thu, 24 Aug 2017 17:30:29 -0400 Subject: [PATCH 170/268] ExpiringMultiLevelCache: testing ttl --- .../ExpiringMultipleLevelCacheSpec.scala | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index ecf99a0d..0a21ff6c 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -34,6 +34,30 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } } + it should "calculate a value on cache miss after ttl" in { + var myRequestCount: Int = 0 + + def myRequest(): 
Future[Data] = { + myRequestCount = myRequestCount + 1 + Future.successful(Data("success")) + } + + val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) + val cache = ExpiringMultiLevelCache[Data](ttl = 9.seconds, localCache = Option(local)) + + Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") + myRequestCount shouldBe 1 + Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") + myRequestCount shouldBe 1 + + Thread.sleep(10000) + + Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") + myRequestCount shouldBe 2 + Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") + myRequestCount shouldBe 2 + } + it should "calculate a value on cache miss just once, the second call should be from cache hit" in { var myFailedRequestCount: Int = 0 @@ -170,5 +194,4 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } - } From 14a5ca29c54795f8cebfe476d87ca199e83263d1 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Mon, 28 Aug 2017 13:18:05 -0400 Subject: [PATCH 171/268] ExpiringMultiLevelCache: refactoring after code review --- .../core/cache/ExpiringMultiLevelCache.scala | 91 ++++++------------- 1 file changed, 29 insertions(+), 62 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index b5c10667..bfe392e2 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -17,22 +17,11 @@ import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} object ExpiringMultiLevelCache { - case class TimestampedValue[V](date: DateTime, - value: Option[V] = None, - hasError: Boolean = false, - throwable: Option[Throwable] = None) { + case class TimestampedValue[V](date: DateTime, value: Try[V]) { def hasExpired(ttl: FiniteDuration, now: DateTime, 
ttlCachedErrors: FiniteDuration = 1.minute): Boolean = { - if (!hasError) { - date.plus(ttl.toMillis).isBefore(now) - } else { - date.plus(ttlCachedErrors.toMillis).isBefore(now) - } - } - def getValue: V = { - this.value match { - case Some(x) => x - // We should never try to get a value that was saved as None, probably from an error - case None => throw new Exception("Trying to get None value") + value match { + case Success(_) => date.plus(ttl.toMillis).isBefore(now) + case Failure(_) => date.plus(ttlCachedErrors.toMillis).isBefore(now) } } } @@ -123,6 +112,7 @@ object ExpiringMultiLevelCache { import ignition.core.cache.ExpiringMultiLevelCache._ + case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, localCache: Option[LocalCache[TimestampedValue[V]]], remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, @@ -143,27 +133,14 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, protected def now = DateTime.now.withZone(DateTimeZone.UTC) - private def timestamp(v: V) = TimestampedValue(date = now, value = Some(v)) + private def timestamp(v: V): TimestampedValue[V] = TimestampedValue(now, Try(v)) - private def timestamp(hasError: Boolean, throwable: Throwable): TimestampedValue[V] = { - TimestampedValue(value = None, date = now, hasError = hasError, throwable = Some(throwable)) - } + private def timestampError(e: Throwable): TimestampedValue[V] = TimestampedValue(now, Failure(e)) private def elapsedTime(startNanoTime: Long) = FiniteDuration(System.nanoTime() - startNanoTime, TimeUnit.NANOSECONDS) private def remoteLockKey(key: Any) = s"$key-emlc-lock" - private def checkSavedErrorCache(key: String, genValue: () => Future[V], startTime: Long, v: TimestampedValue[V]): Future[V] = { - val promise = Promise[V]() - val future = promise.future - if (v.hasError) { - promise.tryFailure(v.throwable.getOrElse(None.orNull)) - } - else { - promise.trySuccess(v.getValue) - } - future - } // The idea is simple, have two caches: remote and local // with 
values that will eventually expire but still be left on the cache @@ -179,10 +156,9 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, reporter.onLocalCacheHit(key, elapsedTime(startTime)) // But if we're paranoid, let's check if the local value is consistent with remote if (sanityLocalValueCheck) - remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(checkSavedErrorCache(key, genValue, startTime, localValue)) + remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(Future.successful(localValue.value.get)) else - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, localValue) + Future.successful(localValue.value.get) case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { @@ -190,40 +166,35 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, remoteValue) + Future.successful(remoteValue.value.get) case Success(Some(expiredRemote)) => // Expired local and expired remote, return the most recent of them, async update both reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, mostRecent) + Future.successful(mostRecent.value.get) case Success(None) => // No remote found, return local, async update both reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) - // We can even get a SavedErrorCache - 
checkSavedErrorCache(key, genValue, startTime, expiredLocalValue) + Future.successful(expiredLocalValue.value.get) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key expired local value and failed to get remote", e) tryGenerateAndSet(key, genValue, startTime) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, expiredLocalValue) + Future.successful(expiredLocalValue.value.get) } case Success(expiredLocalValue) if remoteRW.isEmpty => // There is no remote cache configured, we'are on our own // Return expired value and try to generate a new one for the future reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, expiredLocalValue) + Future.successful(expiredLocalValue.value.get) case Failure(e) => // This is almost impossible to happen because it's local and we don't save failed values reporter.onLocalError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key got a failed future from cache!? 
This is almost impossible!", e) - tryGenerateAndSet(key, genValue, startTime).map(_.getValue) + tryGenerateAndSet(key, genValue, startTime).map(_.value.get) } case None if remoteRW.nonEmpty => // No local, let's try remote @@ -232,28 +203,26 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, remoteValue) + Future.successful(remoteValue.value.get) case Success(Some(expiredRemote)) => // Expired remote, return the it, async update reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.getValue) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, expiredRemote) + tryGenerateAndSet(key, genValue, startTime).map(_.value) + Future.successful(expiredRemote.value.get) case Success(None) => // No good remote, sync generate reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.getValue) + tryGenerateAndSet(key, genValue, startTime).map(_.value.get) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key expired local value and remote error", e) - tryGenerateAndSet(key, genValue, startTime).map(_.getValue) + tryGenerateAndSet(key, genValue, startTime).map(_.value.get) } case None if remoteRW.isEmpty => // No local and no remote to look, just generate it // The caller will need to wait for the value generation reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.getValue) + tryGenerateAndSet(key, genValue, startTime).map(_.value.get) } result.onComplete { case Success(_) => @@ -293,8 +262,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, 
remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if remoteValue == localValue => // Remote is the same as local, return any of them - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, remoteValue) + Future.successful(remoteValue.value.get) case Success(Some(remoteValue)) => // Something is different, try to figure it out val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" @@ -314,20 +282,18 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, logger.warn(s"sanityLocalValueCheck, key $key: got different results for local $localValue and remote $remoteValue ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) // return remote to keep everyone consistent - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, remoteValue) + Future.successful(remoteValue.value.get) case Success(None) => val localExpired = localValue.hasExpired(ttl, now, ttlCachedErrors) val finalResult = s"missing-remote-local-expired-${localExpired}" logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) // Try generate it to keep a behaviour equivalent to remote only - tryGenerateAndSet(key, genValue, startTime).map(_.getValue) + tryGenerateAndSet(key, genValue, startTime).map(_.value.get) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"sanityLocalValueCheck, key: $key failed to get remote", e) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, localValue) + Future.successful(localValue.value.get) } } @@ -415,7 +381,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case NonFatal(e) => { if (cacheErrors) { // if error was NonFatal Error then saves it to cache - val timestampedValue = 
timestamp(hasError = true, throwable = e) + val timestampedValue: TimestampedValue[V] = timestampError(e) // Saved it only in localCache localCache.foreach(_.set(key, timestampedValue)) } @@ -435,8 +401,9 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case NonFatal(e) => { if (cacheErrors) { // if error was NonFatal Error then saves it to cache - val timestampedValue = timestamp(hasError = true, throwable = e) - remoteSetOrGet(key, timestampedValue, remote, nanoStartTime) + val timestampedValue = timestampError(e) + // Saved it only in localCache + localCache.foreach(_.set(key, timestampedValue)) } Future.failed(eLocal) } From 4952a61bae51b24ca62bfe3bf875fabf3b096115 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Thu, 31 Aug 2017 17:24:07 -0400 Subject: [PATCH 172/268] ExpiringMultiLevelCache: Using Try properly with Future.fromTry --- .../core/cache/ExpiringMultiLevelCache.scala | 53 ++++++++++++------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index bfe392e2..6ac0f626 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -156,9 +156,9 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, reporter.onLocalCacheHit(key, elapsedTime(startTime)) // But if we're paranoid, let's check if the local value is consistent with remote if (sanityLocalValueCheck) - remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(Future.successful(localValue.value.get)) + remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(Future.fromTry(localValue.value)) else - Future.successful(localValue.value.get) + Future.fromTry(localValue.value) case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an 
expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { @@ -166,35 +166,39 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) - Future.successful(remoteValue.value.get) + Future.fromTry(remoteValue.value) case Success(Some(expiredRemote)) => // Expired local and expired remote, return the most recent of them, async update both reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) - Future.successful(mostRecent.value.get) + Future.fromTry(mostRecent.value) case Success(None) => // No remote found, return local, async update both reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) - Future.successful(expiredLocalValue.value.get) + Future.fromTry(expiredLocalValue.value) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key expired local value and failed to get remote", e) tryGenerateAndSet(key, genValue, startTime) - Future.successful(expiredLocalValue.value.get) + Future.fromTry(expiredLocalValue.value) } case Success(expiredLocalValue) if remoteRW.isEmpty => // There is no remote cache configured, we'are on our own // Return expired value and try to generate a new one for the future reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) - Future.successful(expiredLocalValue.value.get) + Future.fromTry(expiredLocalValue.value) case Failure(e) => // This is almost impossible to happen because it's local and we don't save failed values + // Failed values are stored into property "value", not as the value itself reporter.onLocalError(key, e, elapsedTime(startTime)) 
logger.warn(s"apply, key: $key got a failed future from cache!? This is almost impossible!", e) - tryGenerateAndSet(key, genValue, startTime).map(_.value.get) + for { + tsv <- tryGenerateAndSet(key, genValue, startTime) + value <- Future.fromTry(tsv.value) + } yield value } case None if remoteRW.nonEmpty => // No local, let's try remote @@ -203,26 +207,35 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) - Future.successful(remoteValue.value.get) + Future.fromTry(remoteValue.value) case Success(Some(expiredRemote)) => // Expired remote, return the it, async update reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.value) - Future.successful(expiredRemote.value.get) + tryGenerateAndSet(key, genValue, startTime) + Future.fromTry(expiredRemote.value) case Success(None) => // No good remote, sync generate reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.value.get) + for { + tsv <- tryGenerateAndSet(key, genValue, startTime) + value <- Future.fromTry(tsv.value) + } yield value case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key expired local value and remote error", e) - tryGenerateAndSet(key, genValue, startTime).map(_.value.get) + for { + tsv <- tryGenerateAndSet(key, genValue, startTime) + value <- Future.fromTry(tsv.value) + } yield value } case None if remoteRW.isEmpty => // No local and no remote to look, just generate it // The caller will need to wait for the value generation reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.value.get) + for { + tsv <- tryGenerateAndSet(key, genValue, startTime) + value <- Future.fromTry(tsv.value) + } yield value } 
result.onComplete { case Success(_) => @@ -262,7 +275,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if remoteValue == localValue => // Remote is the same as local, return any of them - Future.successful(remoteValue.value.get) + Future.fromTry(remoteValue.value) case Success(Some(remoteValue)) => // Something is different, try to figure it out val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" @@ -282,18 +295,22 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, logger.warn(s"sanityLocalValueCheck, key $key: got different results for local $localValue and remote $remoteValue ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) // return remote to keep everyone consistent - Future.successful(remoteValue.value.get) + Future.fromTry(remoteValue.value) case Success(None) => val localExpired = localValue.hasExpired(ttl, now, ttlCachedErrors) val finalResult = s"missing-remote-local-expired-${localExpired}" logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) // Try generate it to keep a behaviour equivalent to remote only - tryGenerateAndSet(key, genValue, startTime).map(_.value.get) + for { + tsv <- tryGenerateAndSet(key, genValue, startTime) + value <- Future.fromTry(tsv.value) + } yield value + case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"sanityLocalValueCheck, key: $key failed to get remote", e) - Future.successful(localValue.value.get) + Future.fromTry(localValue.value) } } From 4cff4323b4cd305d2231538d17696151164e5211 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 17 Oct 2017 17:28:27 -0200 Subject: [PATCH 173/268] I hope now we fixed those tests random failures --- 
.../ExpiringMultipleLevelCacheSpec.scala | 35 +++++++++++++------ .../ignition/core/utils/FutureUtilsSpec.scala | 35 +++++++++++++------ 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index 0a21ff6c..dec108b4 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -1,6 +1,7 @@ package ignition.core.cache import java.io.FileNotFoundException +import java.util.concurrent.atomic.AtomicInteger import akka.actor.ActorSystem import ignition.core.cache.ExpiringMultiLevelCache.TimestampedValue @@ -35,27 +36,41 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } it should "calculate a value on cache miss after ttl" in { - var myRequestCount: Int = 0 + val myRequestCount = new AtomicInteger() def myRequest(): Future[Data] = { - myRequestCount = myRequestCount + 1 + myRequestCount.incrementAndGet() Future.successful(Data("success")) } val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) val cache = ExpiringMultiLevelCache[Data](ttl = 9.seconds, localCache = Option(local)) - Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") - myRequestCount shouldBe 1 - Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") - myRequestCount shouldBe 1 + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + myRequestCount.get() shouldBe 1 + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + myRequestCount.get() shouldBe 1 Thread.sleep(10000) - Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") - myRequestCount shouldBe 2 - Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") - 
myRequestCount shouldBe 2 + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + myRequestCount.get() shouldBe 2 + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + myRequestCount.get() shouldBe 2 } it should "calculate a value on cache miss just once, the second call should be from cache hit" in { diff --git a/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala b/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala index 4649fcfc..bb47e196 100644 --- a/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala @@ -1,43 +1,56 @@ package ignition.core.utils -import FutureUtils._ +import ignition.core.utils.FutureUtils._ import org.scalatest._ +import org.scalatest.concurrent.ScalaFutures -import scala.concurrent.{Await, Future} -import scala.concurrent.duration._ import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.Future +import scala.concurrent.duration._ -class FutureUtilsSpec extends FlatSpec with Matchers { +class FutureUtilsSpec extends FlatSpec with Matchers with ScalaFutures { "FutureUtils" should "provide toLazyIterable" in { val timesCalled = collection.mutable.Map.empty[Int, Int].withDefaultValue(0) val generators = (0 until 20).map { i => () => Future { timesCalled(i) += 1 ; i } } val iterable = generators.toLazyIterable() val iterator = iterable.toIterator - timesCalled.forall { case (key, count) => count == 0 } shouldBe true + timesCalled.forall { case (_, count) => count == 0 } shouldBe true - Await.result(iterator.next(), 2.seconds) + whenReady(iterator.next(), timeout(2.seconds)) { _ => () } timesCalled(0) shouldBe 1 (1 until 20).foreach { i => timesCalled(i) shouldBe 0 } - Await.result(Future.sequence(iterator), 5.seconds).toList shouldBe (1 until 20).toList + whenReady(Future.sequence(iterator), timeout(5.seconds)) { result => + 
result.toList shouldBe (1 until 20).toList + } (0 until 20).foreach { i => timesCalled(i) shouldBe 1 } } it should "provide collectAndTake" in { val timesCalled = collection.mutable.Map.empty[Int, Int].withDefaultValue(0) - val iterable = (0 until 30).map { i => () => Future { timesCalled(i) += 1 ; i } }.toLazyIterable() + val iterable = (0 until 30).map { i => + () => + Future { + synchronized { + timesCalled(i) += 1 + } + i + } + }.toLazyIterable() val expectedRange = Range(5, 15) - val result = Await.result(iterable.collectAndTake({ case i if expectedRange.contains(i) => i }, n = expectedRange.size), 5.seconds) - result shouldBe expectedRange.toList + val f: Future[List[Int]] = iterable.collectAndTake({ case i if expectedRange.contains(i) => i }, n = expectedRange.size) + + whenReady(f, timeout(5.seconds)) { result => + result shouldBe expectedRange.toList + } (0 until 20).foreach { i => timesCalled(i) shouldBe 1 } // 2 batches of size 10 (20 until 30).foreach { i => timesCalled(i) shouldBe 0 } // last batch won't be ran - } } From ce0e6f26e469eb9fdb4e2b626cf83d4784c7c70d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 17 Oct 2017 18:03:50 -0200 Subject: [PATCH 174/268] and here we go... 
--- .../ExpiringMultipleLevelCacheSpec.scala | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index dec108b4..121f34a8 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -70,6 +70,34 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu result shouldBe Data("success") } + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + myRequestCount.get() shouldBe 2 } From 3f885eb342a78ae43d032088d219b84f1acf1139 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 18 Oct 2017 09:21:45 -0200 Subject: [PATCH 175/268] random programming... 
--- .../ExpiringMultipleLevelCacheSpec.scala | 59 ++++++------------- 1 file changed, 18 insertions(+), 41 deletions(-) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index 121f34a8..d202e4e7 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -36,6 +36,7 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } it should "calculate a value on cache miss after ttl" in { + val cacheTtl = 3.seconds val myRequestCount = new AtomicInteger() def myRequest(): Future[Data] = { @@ -44,61 +45,37 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](ttl = 9.seconds, localCache = Option(local)) + val cache = ExpiringMultiLevelCache[Data](ttl = cacheTtl, localCache = Option(local)) - whenReady(cache("key", myRequest), timeout(1.minute)) { result => + whenReady(cache("key", myRequest)) { result => result shouldBe Data("success") } myRequestCount.get() shouldBe 1 - whenReady(cache("key", myRequest), timeout(1.minute)) { result => + whenReady(cache("key", myRequest)) { result => result shouldBe Data("success") } myRequestCount.get() shouldBe 1 - Thread.sleep(10000) - - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } - - myRequestCount.get() shouldBe 2 - - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } - - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } - - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") + val f = Future { + Thread.sleep(cacheTtl.toMillis + 10) } - 
whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } - - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } + whenReady(f, timeout(cacheTtl + 20.milli)) { _ => + whenReady(cache("key", myRequest)) { result => + result shouldBe Data("success") + } - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } + myRequestCount.get() shouldBe 2 - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } + whenReady(cache("key", myRequest)) { result => + result shouldBe Data("success") + } - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") + myRequestCount.get() shouldBe 2 } - - myRequestCount.get() shouldBe 2 } it should "calculate a value on cache miss just once, the second call should be from cache hit" in { @@ -199,7 +176,7 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = true, ttlCachedErrors = 9.seconds) + val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = true, ttlCachedErrors = 4.seconds) val eventualCache = cache("key", myFailedRequest) whenReady(eventualCache.failed) { failure => @@ -213,7 +190,7 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu myFailedRequestCount shouldBe 1 } - Thread.sleep(10000) + Thread.sleep(5000) val eventualCache3 = cache("key", myFailedRequest) whenReady(eventualCache3.failed) { failure => @@ -227,7 +204,7 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu myFailedRequestCount shouldBe 2 } - Thread.sleep(1000) + Thread.sleep(500) val eventualCache5 = cache("key", 
myFailedRequest) whenReady(eventualCache5.failed) { failure => From e9d4703576d5256ed4677e0fb397351ebca64241 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 18 Oct 2017 10:25:41 -0200 Subject: [PATCH 176/268] simple version... --- .../ExpiringMultipleLevelCacheSpec.scala | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index d202e4e7..9fd77d78 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -59,23 +59,19 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu myRequestCount.get() shouldBe 1 - val f = Future { - Thread.sleep(cacheTtl.toMillis + 10) - } - - whenReady(f, timeout(cacheTtl + 20.milli)) { _ => - whenReady(cache("key", myRequest)) { result => - result shouldBe Data("success") - } + Thread.sleep(cacheTtl.toMillis + 10) - myRequestCount.get() shouldBe 2 + whenReady(cache("key", myRequest)) { result => + result shouldBe Data("success") + } - whenReady(cache("key", myRequest)) { result => - result shouldBe Data("success") - } + myRequestCount.get() shouldBe 2 - myRequestCount.get() shouldBe 2 + whenReady(cache("key", myRequest)) { result => + result shouldBe Data("success") } + + myRequestCount.get() shouldBe 2 } it should "calculate a value on cache miss just once, the second call should be from cache hit" in { From 6a52e1980d438f5e25a63ff401f39c488c08adf8 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 18 Oct 2017 13:48:37 -0200 Subject: [PATCH 177/268] sync this too --- src/test/scala/ignition/core/utils/FutureUtilsSpec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala 
b/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala index bb47e196..c10b50d5 100644 --- a/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala @@ -11,7 +11,7 @@ class FutureUtilsSpec extends FlatSpec with Matchers with ScalaFutures { "FutureUtils" should "provide toLazyIterable" in { val timesCalled = collection.mutable.Map.empty[Int, Int].withDefaultValue(0) - val generators = (0 until 20).map { i => () => Future { timesCalled(i) += 1 ; i } } + val generators = (0 until 20).map { i => () => Future { synchronized { timesCalled(i) += 1 } ; i } } val iterable = generators.toLazyIterable() val iterator = iterable.toIterator timesCalled.forall { case (_, count) => count == 0 } shouldBe true From 979a8995e8d74e6b1709c42c1934f4d3fb350db1 Mon Sep 17 00:00:00 2001 From: Fernando Rodrigues da Silva Date: Mon, 6 Nov 2017 20:05:33 -0200 Subject: [PATCH 178/268] fix too many open files (#126) --- tools/scripts/noop | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/scripts/noop b/tools/scripts/noop index cc1f786e..ed32eb97 100644 --- a/tools/scripts/noop +++ b/tools/scripts/noop @@ -1 +1,3 @@ -#!/bin/bash \ No newline at end of file +#!/bin/bash + +echo '* - nofile 256000' >> /etc/security/limits.conf \ No newline at end of file From 6a9f7a84737b4317d4b87f96567ad46ee0f2dae7 Mon Sep 17 00:00:00 2001 From: Fernando Rodrigues da Silva Date: Wed, 8 Nov 2017 11:33:54 -0200 Subject: [PATCH 179/268] changing ulimit -n to an usual value (#127) --- tools/scripts/noop | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/scripts/noop b/tools/scripts/noop index ed32eb97..eb34279f 100644 --- a/tools/scripts/noop +++ b/tools/scripts/noop @@ -1,3 +1,3 @@ #!/bin/bash -echo '* - nofile 256000' >> /etc/security/limits.conf \ No newline at end of file +echo '* - nofile 65535' >> /etc/security/limits.conf \ No newline at end of file From 7c5ebfd64ca74698e7d6d58e52f1e93235941ed6 Mon 
Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 23 Nov 2017 11:58:26 -0200 Subject: [PATCH 180/268] moving this class to a better scope --- .../core/jobs/utils/SparkContextUtils.scala | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 1de12dd6..cd362de0 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -37,6 +37,18 @@ object SparkContextUtils { override def getPartition(key: Any): Int = index(key) } + case class SizeBasedFileHandling(averageEstimatedCompressionRatio: Int = 8, compressedExtensions: Set[String] = Set(".gz")) { + + def isBig(f: HadoopFile, uncompressedBigSize: Long): Boolean = estimatedSize(f) >= uncompressedBigSize + + def estimatedSize(f: HadoopFile): Long = if (isCompressed(f)) + f.size * averageEstimatedCompressionRatio + else + f.size + + def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) + } + private lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) private def close(inputStream: InputStream, path: String): Unit = { @@ -249,20 +261,6 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - case class SizeBasedFileHandling(averageEstimatedCompressionRatio: Int = 8, - compressedExtensions: Set[String] = Set(".gz")) { - - def isBig(f: HadoopFile, uncompressedBigSize: Long): Boolean = estimatedSize(f) >= uncompressedBigSize - - def estimatedSize(f: HadoopFile) = if (isCompressed(f)) - f.size * averageEstimatedCompressionRatio - else - f.size - - def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) - } - - private def readSmallFiles(smallFiles: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Int, From 
ce9d356932ceb845dcb231acf2c13296bda1c811 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 23 Nov 2017 15:22:11 -0200 Subject: [PATCH 181/268] using java8 --- circle.yml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 circle.yml diff --git a/circle.yml b/circle.yml new file mode 100644 index 00000000..abd78de2 --- /dev/null +++ b/circle.yml @@ -0,0 +1,3 @@ +machine: + java: + version: oraclejdk8 From cc08dcefdfbf59307b9d1819e92b6d138d6e0b25 Mon Sep 17 00:00:00 2001 From: Henrique Goulart Date: Mon, 19 Feb 2018 09:24:25 -0300 Subject: [PATCH 182/268] Update zeppelin to latest version and spark memory parameter --- remote_hook.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 6648ccd8..cb43904f 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -59,7 +59,7 @@ on_trap_exit() { install_and_run_zeppelin() { if [[ ! -d "zeppelin" ]]; then - wget "http://www.us.apache.org/dist/incubator/zeppelin/0.5.6-incubating/zeppelin-0.5.6-incubating-bin-all.tgz" -O zeppelin.tar.gz + wget "http://www-us.apache.org/dist/zeppelin/zeppelin-0.7.3/zeppelin-0.7.3-bin-all.tgz" -O zeppelin.tar.gz mkdir zeppelin tar xvzf zeppelin.tar.gz -C zeppelin --strip-components 1 > /tmp/zeppelin_install.log fi @@ -67,7 +67,7 @@ install_and_run_zeppelin() { export MASTER="${JOB_MASTER}" export ZEPPELIN_PORT="8081" export SPARK_HOME="/root/spark" - export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH} --runner-executor-memory ${SPARK_MEM_PARAM}" + export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH} --executor-memory ${SPARK_MEM_PARAM}" sudo -E zeppelin/bin/zeppelin.sh else notify_error_and_exit "Zepellin installation not found" From 92599ce2defaac6685eb3d3ed590059f8616627b Mon Sep 17 00:00:00 2001 From: Henrique Goulart Date: Thu, 8 Mar 2018 10:09:59 -0300 Subject: [PATCH 183/268] Workaround to use private vpc --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py 
b/tools/cluster.py index 5efefeb5..7b5863ef 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -352,7 +352,7 @@ def get_master(cluster_name, region=default_region): masters = get_masters(cluster_name, region=region) if not masters: raise CommandError("No master on {}".format(cluster_name)) - return masters[0].public_dns_name + return masters[0].private_dns_name def ssh_master(cluster_name, key_file=default_key_file, user=default_remote_user, region=default_region, *args): From 4c118a107d36651133c35ac39934632ca82af593 Mon Sep 17 00:00:00 2001 From: Henrique Goulart Date: Thu, 8 Mar 2018 10:17:20 -0300 Subject: [PATCH 184/268] Fix DNS names for launching on private subnet --- tools/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 7b5863ef..c56a803a 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -337,7 +337,7 @@ def destroy(cluster_name, delete_groups=False, region=default_region): if all_instances: log.info('The following instances will be terminated:') for i in all_instances: - log.info('-> %s' % i.public_dns_name) + log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) log.info('Terminating master...') for i in masters: @@ -352,7 +352,7 @@ def get_master(cluster_name, region=default_region): masters = get_masters(cluster_name, region=region) if not masters: raise CommandError("No master on {}".format(cluster_name)) - return masters[0].private_dns_name + return masters[0].public_dns_name or masters[0].private_dns_name def ssh_master(cluster_name, key_file=default_key_file, user=default_remote_user, region=default_region, *args): From 67ac87a62c2fd4e56b65365fe64abf9c41ae7ebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Roeck?= Date: Wed, 14 Mar 2018 13:49:08 -0300 Subject: [PATCH 185/268] Improve stderr and cluster destroy (#132) * Return stdout and stderr as subprocess output * Add wait_termination param to destroy method * Add log and timetout to cluster 
termination * Rename wait_timeout param --- tools/cluster.py | 14 +++++++++++++- tools/utils.py | 3 ++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 5efefeb5..6e475300 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -329,7 +329,7 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {} after failures'.format(cluster_name)) -def destroy(cluster_name, delete_groups=False, region=default_region): +def destroy(cluster_name, delete_groups=False, region=default_region, wait_termination=False, wait_timeout_minutes=10): assert not delete_groups, 'Delete groups is deprecated and unsupported' masters, slaves = get_active_nodes(cluster_name, region=region) @@ -342,9 +342,21 @@ def destroy(cluster_name, delete_groups=False, region=default_region): log.info('Terminating master...') for i in masters: i.terminate() + log.info('Terminating slaves...') for i in slaves: i.terminate() + + if wait_termination: + log.info('Waiting for instances termination...') + termination_timeout = wait_timeout_minutes*60 + termination_start = time.time() + while wait_termination and all_instances and time.time() < termination_start+termination_timeout: + all_instances = [i for i in all_instances if i.state != 'terminated'] + time.sleep(5) + for i in all_instances: + i.update() + log.info('Done.') diff --git a/tools/utils.py b/tools/utils.py index 39d6129f..88a236cd 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -120,5 +120,6 @@ def check_call_with_timeout(args, stdin=None, stdout=None, read_from_to(p.stdout, stdout) read_from_to(p.stderr, stderr) if p.returncode != 0: - raise subprocess.CalledProcessError(p.returncode, args) + stdall = 'STDOUT:\n{}\nSTDERR:\n{}'.format(stdout, stderr) + raise subprocess.CalledProcessError(p.returncode, args, output=stdall) return p.returncode From d30f140dbcc87e1cd3c87b5ef28898bab9dd5a0c Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 3 Apr 2018 
14:46:24 -0300 Subject: [PATCH 186/268] flintrok with private vpc support --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index eba6ab1d..b4bd82cc 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit eba6ab1dceb942937bdc9610736e70d72e2a6579 +Subproject commit b4bd82cc3cb5e72c2fd301510db7570326ce3086 From 306cd39b50e98e3080a8c9fbbfcd6efb0eaad17f Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 10 Apr 2018 10:42:57 -0300 Subject: [PATCH 187/268] update flintrock --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index 67721e89..239fec7e 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 67721e8997b1bf85a7ec1283714039959e9f6c63 +Subproject commit 239fec7eb5c81ad428c1ce7aafd66998bc887a10 From 6ced247c1071f1a43e5e4636aa79361f4cc62e5c Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 24 Apr 2018 11:14:19 -0300 Subject: [PATCH 188/268] update spark and hadoop --- build.sbt | 6 +++--- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 4 ++-- tools/cluster.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/build.sbt b/build.sbt index f4fa51d0..39bbd8b6 100644 --- a/build.sbt +++ b/build.sbt @@ -9,13 +9,13 @@ scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warning // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.2.0" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.0" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") -libraryDependencies 
+= ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.6") .exclude("org.apache.htrace", "htrace-core") .exclude("commons-beanutils", "commons-beanutils") .exclude("org.slf4j", "slf4j-log4j12") diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index ab47ee12..0fa12c9c 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -92,8 +92,8 @@ object CoreJobRunner { // Also try to propagate logging context to workers // TODO: find a more efficient and bullet-proof way - val configBroadCast = sc.broadcast(config) - sc.parallelize(Range(1, 2000), numSlices = 2000).foreachPartition(_ => setLoggingContextValues(configBroadCast.value)) +// val configBroadCast = sc.broadcast(config) +// sc.parallelize(Range(1, 2000), numSlices = 2000).foreachPartition(_ => setLoggingContextValues(configBroadCast.value)) val context = RunnerContext(sc, config) diff --git a/tools/cluster.py b/tools/cluster.py index 83a289ec..5bdcb4da 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -46,11 +46,11 @@ default_zone = default_region + 'b' default_key_id = 'ignition_key' default_key_file = os.path.expanduser('~/.ssh/ignition_key.pem') -default_ami = 'ami-611e7976' +default_ami = 'ami-5679a229' default_master_ami = '' default_env = 'dev' -default_spark_version = '2.2.0' -default_hdfs_version = '2.7.2' +default_spark_version = '2.3.0' +default_hdfs_version = '2.7.6' default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' From 3591fdb4f19d6800b1d83cdee7f47cae69e1cb1f Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 25 Apr 2018 15:06:03 -0300 Subject: [PATCH 189/268] script to create ami 
--- tools/create_image.sh | 47 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 tools/create_image.sh diff --git a/tools/create_image.sh b/tools/create_image.sh new file mode 100644 index 00000000..5f807365 --- /dev/null +++ b/tools/create_image.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Creates an AMI for the Spark EC2 scripts starting with a stock Amazon Linux AMI. + +# This script was adapted from: +# https://github.com/amplab/spark-ec2/blob/branch-1.6/create_image.sh + +set -e + +if [ "$(id -u)" != "0" ]; then + echo "This script must be run as root" 1>&2 + exit 1 +fi + +# Dev tools +sudo yum install -y java-1.8.0-openjdk-devel +# Perf tools +sudo yum install -y dstat iotop strace sysstat htop perf +sudo debuginfo-install -q -y glibc +sudo debuginfo-install -q -y kernel +sudo yum --enablerepo='*-debug*' install -q -y java-1.8.0-openjdk-debuginfo.x86_64 + +# Root ssh config +sudo sed -i 's/PermitRootLogin.*/PermitRootLogin without-password/g' \ + /etc/ssh/sshd_config +sudo sed -i 's/disable_root.*/disable_root: 0/g' /etc/cloud/cloud.cfg + +# Edit bash profile +echo "export PS1=\"\\u@\\h \\W]\\$ \"" >> ~/.bash_profile +echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> ~/.bash_profile + +source ~/.bash_profile + +# Global JAVA_HOME env +echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> /etc/environment + +# Install Snappy lib (for Hadoop) +yum install -y snappy + +# Install netlib-java native dependencies +yum install -y blas atlas lapack + +# Create /usr/bin/realpath which is used by R to find Java installations +# NOTE: /usr/bin/realpath is missing in CentOS AMIs. 
See +# http://superuser.com/questions/771104/usr-bin-realpath-not-found-in-centos-6-5 +echo '#!/bin/bash' > /usr/bin/realpath +echo 'readlink -e "$@"' >> /usr/bin/realpath +chmod a+x /usr/bin/realpath \ No newline at end of file From 040b7e6df351523c2a1ba72eebf26910a892fa6e Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 25 Apr 2018 15:07:13 -0300 Subject: [PATCH 190/268] update to a new ami --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 5bdcb4da..bd114e68 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -46,7 +46,7 @@ default_zone = default_region + 'b' default_key_id = 'ignition_key' default_key_file = os.path.expanduser('~/.ssh/ignition_key.pem') -default_ami = 'ami-5679a229' +default_ami = 'ami-60b6001f' default_master_ami = '' default_env = 'dev' default_spark_version = '2.3.0' From 144f2403c6498b02c1de6d3ee184676724a591b4 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 25 Apr 2018 15:12:11 -0300 Subject: [PATCH 191/268] update flintrock --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index 239fec7e..787faa4d 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 239fec7eb5c81ad428c1ce7aafd66998bc887a10 +Subproject commit 787faa4d1b7708e0a387c7243723eddd2b1a33cb From fedf0eb3e4c99d2adb09c7e7afa60c189f84e387 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 25 Apr 2018 15:16:27 -0300 Subject: [PATCH 192/268] reverting commented lines by mistake --- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index 0fa12c9c..ab47ee12 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ 
b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -92,8 +92,8 @@ object CoreJobRunner { // Also try to propagate logging context to workers // TODO: find a more efficient and bullet-proof way -// val configBroadCast = sc.broadcast(config) -// sc.parallelize(Range(1, 2000), numSlices = 2000).foreachPartition(_ => setLoggingContextValues(configBroadCast.value)) + val configBroadCast = sc.broadcast(config) + sc.parallelize(Range(1, 2000), numSlices = 2000).foreachPartition(_ => setLoggingContextValues(configBroadCast.value)) val context = RunnerContext(sc, config) From 78a5ce3b20a944e7ccc08de92920c592818b3b54 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 2 May 2018 15:58:32 -0300 Subject: [PATCH 193/268] flintrock updates --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index 787faa4d..f9304910 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 787faa4d1b7708e0a387c7243723eddd2b1a33cb +Subproject commit f9304910e69bbf858f95abeb6be1204a38c169d5 From ea1fc349de5c869b01c565163f06874c64b0be91 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 3 May 2018 15:56:30 -0300 Subject: [PATCH 194/268] flintrock update --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index f9304910..39d2c249 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit f9304910e69bbf858f95abeb6be1204a38c169d5 +Subproject commit 39d2c249b08fbd1c3869e7d435d66b7a97de8cc3 From 37735f7f22c928f9dd4279a3ba155f12a6f591d1 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 7 Jun 2018 08:16:14 -0300 Subject: [PATCH 195/268] passing forward to sync the file size estimator --- src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index cd362de0..2d6bf6b5 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -689,7 +689,7 @@ object SparkContextUtils { if (forceSynch || foundLocalPaths.isEmpty) { delete(new Path(syncPath(s"$synchLocally/"))) - val data = parallelReadTextFiles(hadoopFiles, maxBytesPerPartition, minPartitions, synchLocally = None) + val data = parallelReadTextFiles(hadoopFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling = sizeBasedFileHandling, synchLocally = None) data.saveAsTextFile(cacheKey) } From db7125d4870e6a25eaff6c66779dc7b50541f765 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 15 Jun 2018 17:03:46 -0300 Subject: [PATCH 196/268] update spark 2.3.1, hadoop 2.8.4, flintrock with support for spark without hadoop build --- build.sbt | 6 +++--- tools/cluster.py | 6 +++--- tools/flintrock | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/build.sbt b/build.sbt index 39bbd8b6..7271b924 100644 --- a/build.sbt +++ b/build.sbt @@ -9,13 +9,13 @@ scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warning // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.0" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.1" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.8.4" % "provided") -libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.6") +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.8.4") .exclude("org.apache.htrace", 
"htrace-core") .exclude("commons-beanutils", "commons-beanutils") .exclude("org.slf4j", "slf4j-log4j12") diff --git a/tools/cluster.py b/tools/cluster.py index bd114e68..99a63b28 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,9 +49,9 @@ default_ami = 'ami-60b6001f' default_master_ami = '' default_env = 'dev' -default_spark_version = '2.3.0' -default_hdfs_version = '2.7.6' -default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' +default_spark_version = '2.3.1' +default_hdfs_version = '2.8.4' +default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-without-hadoop.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' default_installation_user = 'root' diff --git a/tools/flintrock b/tools/flintrock index 39d2c249..9b560c7a 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 39d2c249b08fbd1c3869e7d435d66b7a97de8cc3 +Subproject commit 9b560c7a54f898bd3924a55410c1ed2509c97152 From 74c3cb32c6147870b832a42d51f4f59a4fc0df71 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 19 Jun 2018 10:55:48 -0300 Subject: [PATCH 197/268] update flintrock --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index 9b560c7a..0e540aef 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 9b560c7a54f898bd3924a55410c1ed2509c97152 +Subproject commit 0e540aef41632c43db7db6387b8e22dd07a791d9 From 6350c67a3cd280dfb5160ef1cd907a863b7af01d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 22 Jun 2018 16:46:41 -0300 Subject: [PATCH 198/268] increase ulimit open files, to help in big shuffles --- tools/scripts/noop | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/scripts/noop b/tools/scripts/noop index eb34279f..0e872836 100644 --- 
a/tools/scripts/noop +++ b/tools/scripts/noop @@ -1,3 +1,3 @@ #!/bin/bash -echo '* - nofile 65535' >> /etc/security/limits.conf \ No newline at end of file +echo '* - nofile 1000000' >> /etc/security/limits.conf \ No newline at end of file From 8b58235d043314fd9982583d27cc374b0ac98a5d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 22 Jun 2018 18:23:28 -0300 Subject: [PATCH 199/268] update flintrok with nvme support --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index 39d2c249..d2318e99 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 39d2c249b08fbd1c3869e7d435d66b7a97de8cc3 +Subproject commit d2318e99dd972765673a0b8d716d3409d337e2da From 0b6c0c5f9fd46605bc5f7f050088ccc75f588d11 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 26 Jun 2018 09:21:08 -0300 Subject: [PATCH 200/268] rollback hadoop to 2.7.6, because of issues with spark sql --- build.sbt | 4 ++-- tools/cluster.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/build.sbt b/build.sbt index 7271b924..81f585b6 100644 --- a/build.sbt +++ b/build.sbt @@ -13,9 +13,9 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.1" % "provided .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.8.4" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") -libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.8.4") +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.6") .exclude("org.apache.htrace", "htrace-core") .exclude("commons-beanutils", "commons-beanutils") .exclude("org.slf4j", "slf4j-log4j12") diff --git a/tools/cluster.py b/tools/cluster.py index 99a63b28..34272f4f 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -50,8 +50,8 @@ default_master_ami = '' 
default_env = 'dev' default_spark_version = '2.3.1' -default_hdfs_version = '2.8.4' -default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-without-hadoop.tgz' +default_hdfs_version = '2.7.6' +default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' default_installation_user = 'root' From 83ed71d6894bf95bbd9d904fe78ecea4e668ef31 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 2 Jul 2018 11:49:23 -0300 Subject: [PATCH 201/268] update zeppeling 0.8.0 --- remote_hook.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index cb43904f..3821d92e 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -59,7 +59,7 @@ on_trap_exit() { install_and_run_zeppelin() { if [[ ! -d "zeppelin" ]]; then - wget "http://www-us.apache.org/dist/zeppelin/zeppelin-0.7.3/zeppelin-0.7.3-bin-all.tgz" -O zeppelin.tar.gz + wget "http://www-us.apache.org/dist/zeppelin/zeppelin-0.8.0/zeppelin-0.8.0-bin-all.tgz" -O zeppelin.tar.gz mkdir zeppelin tar xvzf zeppelin.tar.gz -C zeppelin --strip-components 1 > /tmp/zeppelin_install.log fi @@ -70,7 +70,7 @@ install_and_run_zeppelin() { export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH} --executor-memory ${SPARK_MEM_PARAM}" sudo -E zeppelin/bin/zeppelin.sh else - notify_error_and_exit "Zepellin installation not found" + notify_error_and_exit "Zeppelin installation not found" fi } From 6f9f219888aa6f7fa5f191ec725b9c55584f00e4 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 4 Jul 2018 09:42:20 -0300 Subject: [PATCH 202/268] update flintrock --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index d2318e99..c9f58f54 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ 
-Subproject commit d2318e99dd972765673a0b8d716d3409d337e2da +Subproject commit c9f58f547adaa57401e910df78c5986e76b8a155 From cb3f518cf78389d98397877473a81fa01a99fe02 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 19 Sep 2018 18:27:03 -0300 Subject: [PATCH 203/268] Allow to IAM and other credentials to work on s3 listing --- .../ignition/core/jobs/utils/SparkContextUtils.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 2d6bf6b5..1d06505d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -2,10 +2,12 @@ package ignition.core.jobs.utils import java.io.InputStream -import com.amazonaws.auth.EnvironmentVariableCredentialsProvider +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain import com.amazonaws.services.s3.AmazonS3Client import com.amazonaws.services.s3.model.{ListObjectsRequest, ObjectListing, S3ObjectSummary} +import ignition.core.utils.CollectionUtils._ import ignition.core.utils.DateUtils._ +import ignition.core.utils.ExceptionUtils._ import ignition.core.utils.{AutoCloseableIterator, ByteUtils, HadoopUtils} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} @@ -15,17 +17,15 @@ import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.{Partitioner, SparkContext} import org.joda.time.DateTime +import org.slf4j.LoggerFactory import scala.collection.JavaConversions._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.io.{Codec, Source} import scala.reflect.ClassTag -import scala.util.{Failure, Success, Try} import scala.util.control.NonFatal -import ignition.core.utils.ExceptionUtils._ -import 
ignition.core.utils.CollectionUtils._ -import org.slf4j.LoggerFactory +import scala.util.{Failure, Success, Try} object SparkContextUtils { @@ -49,7 +49,7 @@ object SparkContextUtils { def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) } - private lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) + private lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new DefaultAWSCredentialsProviderChain()) private def close(inputStream: InputStream, path: String): Unit = { try { From 8aaf8602018f3fe7c61ece74d8bc7ba8b5591f65 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 20 Sep 2018 16:45:05 -0300 Subject: [PATCH 204/268] yarn support --- tools/cluster.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 34272f4f..cb5d1798 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -510,6 +510,73 @@ def job_run(cluster_name, job_name, job_mem, return (job_name, job_tag) +@arg('job-mem', help='The amount of memory to use for this job (like: 80G)') +@named('local-yarn-run') +def job_local_yarn_run(job_name, job_mem, queue, + job_user=getpass.getuser(), + utc_job_date=None, job_tag=None, + disable_assembly_build=False, + spark_submit='spark-submit', + deploy_mode='cluster', + yarn_memory_overhead=0.3, + driver_heap_size=default_driver_heap_size): + + def parse_memory(s): + import re + match = re.match(r'([0-9]+)([a-zA-Z]+)', s) + if match is None or len(match.groups()) != 2: + raise Exception('Invalid memory size: ' + s) + return match.groups() + + def calculate_overhead(s): + from math import ceil + (n, unit) = parse_memory(s) + return str(int(ceil(float(n) * (1 + yarn_memory_overhead)))) + unit + + driver_overhead = calculate_overhead(driver_heap_size) + executor_overhead = calculate_overhead(job_mem) + + utc_job_date_example = 
'2014-05-04T13:13:10Z' + if utc_job_date and len(utc_job_date) != len(utc_job_date_example): + raise CommandError('UTC Job Date should be given as in the following example: {}'.format(utc_job_date_example)) + + project_path = get_project_path() + project_name = os.path.basename(project_path) + # Use job user on remote path to avoid too many conflicts for different local users + job_date = utc_job_date or datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') + job_tag = job_tag or job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') + + if not disable_assembly_build: + build_assembly() + + assembly_path = get_assembly_path() + if assembly_path is None: + raise Exception('Something is wrong: no assembly found') + + + log.info('Will run job using local installation of yarn') + + check_call([ + spark_submit, + '--class', 'ignition.jobs.Runner', + '--master', 'yarn', + '--deploy-mode', deploy_mode, + '--queue', queue, + '--driver-memory', driver_heap_size, + '--conf', 'spark.yarn.am.memory', driver_heap_size, + '--executor-memory', job_mem, + '--conf', 'spark.yarn.am.memoryOverhead', driver_overhead, + '--conf', 'spark.driver.memoryOverhead', driver_overhead, + '--conf', 'spark.executor.memoryOverhead', executor_overhead, + assembly_path, + job_name, + '--runner-master', 'yarn', + '--runner-executor-memory', job_mem + # add job tag, date, etc + + ]) + + @named('attach') def job_attach(cluster_name, key_file=default_key_file, job_name=None, job_tag=None, master=None, remote_user=default_remote_user, region=default_region): @@ -750,7 +817,7 @@ def check_flintrock_installation(): parser = ArghParser() parser.add_commands([launch, destroy, get_master, ssh_master, tag_cluster_instances, health_check]) -parser.add_commands([job_run, job_attach, wait_for_job, +parser.add_commands([job_run, job_local_yarn_run, job_attach, wait_for_job, kill_job, killall_jobs, collect_job_results], namespace="jobs") if __name__ == '__main__': From 465286559d12dc8d15f6f48581e2073dc0d88a93 
Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 20 Sep 2018 21:40:21 -0300 Subject: [PATCH 205/268] Added remaining options --- tools/cluster.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index cb5d1798..3f0edcbf 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -518,8 +518,9 @@ def job_local_yarn_run(job_name, job_mem, queue, disable_assembly_build=False, spark_submit='spark-submit', deploy_mode='cluster', - yarn_memory_overhead=0.3, - driver_heap_size=default_driver_heap_size): + yarn_memory_overhead=0.2, + driver_heap_size=default_driver_heap_size, + driver_java_options='-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps'): def parse_memory(s): import re @@ -540,9 +541,6 @@ def calculate_overhead(s): if utc_job_date and len(utc_job_date) != len(utc_job_date_example): raise CommandError('UTC Job Date should be given as in the following example: {}'.format(utc_job_date_example)) - project_path = get_project_path() - project_name = os.path.basename(project_path) - # Use job user on remote path to avoid too many conflicts for different local users job_date = utc_job_date or datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') job_tag = job_tag or job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') @@ -555,25 +553,26 @@ def calculate_overhead(s): log.info('Will run job using local installation of yarn') - check_call([ spark_submit, '--class', 'ignition.jobs.Runner', '--master', 'yarn', + '--driver-java-options', driver_java_options, '--deploy-mode', deploy_mode, '--queue', queue, '--driver-memory', driver_heap_size, - '--conf', 'spark.yarn.am.memory', driver_heap_size, + '--conf', 'spark.yarn.am.memory=' + driver_heap_size, '--executor-memory', job_mem, - '--conf', 'spark.yarn.am.memoryOverhead', driver_overhead, - '--conf', 'spark.driver.memoryOverhead', driver_overhead, - '--conf', 'spark.executor.memoryOverhead', executor_overhead, + 
'--conf', 'spark.yarn.am.memoryOverhead=' + driver_overhead, + '--conf', 'spark.driver.memoryOverhead=' + driver_overhead, + '--conf', 'spark.executor.memoryOverhead=' + executor_overhead, assembly_path, job_name, '--runner-master', 'yarn', - '--runner-executor-memory', job_mem - # add job tag, date, etc - + '--runner-executor-memory', job_mem, + '--runner-user', job_user, + '--runner-tag', job_tag, + '--runner-date', job_date ]) From 77fb7a7b77ce509e368ed482d26f6a98566664c9 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Fri, 21 Sep 2018 14:04:42 -0300 Subject: [PATCH 206/268] Make YARN jobs finish successfully --- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index ab47ee12..7200d747 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -1,8 +1,9 @@ package ignition.core.jobs import org.apache.spark.{SparkConf, SparkContext} -import org.joda.time.{DateTimeZone, DateTime} +import org.joda.time.{DateTime, DateTimeZone} +import scala.concurrent.Future import scala.util.Try object CoreJobRunner { @@ -105,7 +106,13 @@ object CoreJobRunner { System.exit(1) // force exit of all threads } Try { sc.stop() } - System.exit(0) // force exit of all threads + import scala.concurrent.ExecutionContext.Implicits.global + Future { + // If everything is fine, the system will shut down without the help of this thread and YARN will report success + // But sometimes it gets stuck, then it's necessary to use the force, but this may finish the job as failed on YARN + Thread.sleep(30 * 1000) + System.exit(0) // force exit of all threads + } } } } From fa6d32f40afbca2111e0a051649edf12dc61f354 Mon Sep 17 00:00:00 2001 From: Henrique Goulart Date: Mon, 24 Sep 2018 20:07:08 -0300 Subject: [PATCH 207/268] 
Update AMI I deleted the other AMI. Now we will use the one that have been used in datalake production for a long time (platform AMI) --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 34272f4f..37b327c0 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -46,7 +46,7 @@ default_zone = default_region + 'b' default_key_id = 'ignition_key' default_key_file = os.path.expanduser('~/.ssh/ignition_key.pem') -default_ami = 'ami-60b6001f' +default_ami = 'ami-611e7976' default_master_ami = '' default_env = 'dev' default_spark_version = '2.3.1' From 709d508ca69dcc864ef6957515580c8ff1d4ab79 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 26 Sep 2018 10:25:17 -0300 Subject: [PATCH 208/268] support for jupyter with pyspark --- remote_hook.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 3821d92e..eb9b3616 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -66,7 +66,7 @@ install_and_run_zeppelin() { if [[ -f "zeppelin/bin/zeppelin.sh" ]]; then export MASTER="${JOB_MASTER}" export ZEPPELIN_PORT="8081" - export SPARK_HOME="/root/spark" + export SPARK_HOME=$(get_first_present /root/spark /opt/spark ~/spark*/) export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH} --executor-memory ${SPARK_MEM_PARAM}" sudo -E zeppelin/bin/zeppelin.sh else @@ -74,6 +74,19 @@ install_and_run_zeppelin() { fi } +install_and_run_jupyter() { + sudo yum -y install python3 python3-pip + sudo pip3 install jupyter pandas boto3 matplotlib numpy sklearn scipy + export SPARK_HOME=$(get_first_present /root/spark /opt/spark ~/spark*/) + export HADOOP_HOME=$(get_first_present /root/hadoop /opt/hadoop ~/hadoop*/) + export SPARK_CONF_DIR="${SPARK_HOME}/conf" + export HADOOP_CONF_DIR="${HADOOP_HOME}/conf" + export JOB_MASTER=${MASTER:-spark://${SPARK_MASTER_HOST}:7077} + export PYSPARK_PYTHON=$(which python3) + export PYSPARK_DRIVER_PYTHON=$(which 
jupyter) + export PYSPARK_DRIVER_PYTHON_OPTS="notebook --allow-root --ip=${SPARK_MASTER_HOST} --no-browser --port=8888" + sudo -E "${SPARK_HOME}/bin/pyspark" --master "${JOB_MASTER}" --executor-memory "${SPARK_MEM_PARAM}" --driver-memory "${DRIVER_HEAP_SIZE}" +} trap "on_trap_exit" EXIT @@ -105,6 +118,8 @@ if [[ "${JOB_NAME}" == "shell" ]]; then sudo -E ${SPARK_HOME}/bin/spark-shell --master "${JOB_MASTER}" --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" elif [[ "${JOB_NAME}" == "zeppelin" ]]; then install_and_run_zeppelin +elif [[ "${JOB_NAME}" == "jupyter" ]]; then + install_and_run_jupyter else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & From 9ba1e71da5ee34ea333db6a95874621605d730b3 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 26 Sep 2018 14:37:17 -0300 Subject: [PATCH 209/268] Avoid explicit spark context stopping --- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index 7200d747..c1d0541f 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -105,7 +105,7 @@ object CoreJobRunner { t.printStackTrace() System.exit(1) // force exit of all threads } - Try { sc.stop() } + import scala.concurrent.ExecutionContext.Implicits.global Future { // If everything is fine, the system will shut down without the help of this thread and YARN will report success From cc71992d5c849e4ec9106f93e42432f1bdcca670 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Wed, 26 Sep 2018 15:38:57 -0300 Subject: [PATCH 210/268] fixed memory calculation and added executor cores --- tools/cluster.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 3f0edcbf..682dd97e 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -516,6 +516,7 @@ def job_local_yarn_run(job_name, job_mem, queue, job_user=getpass.getuser(), utc_job_date=None, job_tag=None, disable_assembly_build=False, + executor_cores=5, spark_submit='spark-submit', deploy_mode='cluster', yarn_memory_overhead=0.2, @@ -532,7 +533,7 @@ def parse_memory(s): def calculate_overhead(s): from math import ceil (n, unit) = parse_memory(s) - return str(int(ceil(float(n) * (1 + yarn_memory_overhead)))) + unit + return str(int(ceil(float(n) * yarn_memory_overhead))) + unit driver_overhead = calculate_overhead(driver_heap_size) executor_overhead = calculate_overhead(job_mem) @@ -560,6 +561,7 @@ def calculate_overhead(s): '--driver-java-options', driver_java_options, '--deploy-mode', deploy_mode, '--queue', queue, + '--conf', 'spark.executor.cores=' + str(executor_cores), '--driver-memory', driver_heap_size, '--conf', 'spark.yarn.am.memory=' + driver_heap_size, '--executor-memory', job_mem, From bfb364ab1e4ac016717bebdab635198ae57e4138 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 27 Sep 2018 16:51:35 -0300 Subject: [PATCH 211/268] tail it :) --- remote_hook.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 3821d92e..68412ca9 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -x + # We suppose we are in a subdirectory of the root project DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" @@ -84,7 +86,7 @@ MAIN_CLASS="ignition.jobs.Runner" cd "${DIR}" || notify_error_and_exit "Internal script error for job ${JOB_WITH_TAG}" -JAR_PATH_SRC=$(echo "${DIR}"/*assembly*.jar) +JAR_PATH_SRC=$(ls 
"${DIR}"/*assembly*.jar | tail -1) JAR_PATH="${JOB_CONTROL_DIR}/Ignition.jar" cp ${JAR_PATH_SRC} ${JAR_PATH} From 438297459b28959e14046fd8738dcc3ad5e904e4 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 28 Sep 2018 10:23:10 -0300 Subject: [PATCH 212/268] exec shell in cluster --- tools/cluster.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index aeb5eb6a..b3780ba3 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -372,6 +372,14 @@ def ssh_master(cluster_name, key_file=default_key_file, user=default_remote_user ssh_call(user=user, host=master, key_file=key_file, args=args) +def exec_shell(cluster_name, command, key_file=default_key_file, user=default_remote_user, region=default_region): + masters, slaves = get_active_nodes(cluster_name, region=region) + for node in masters + slaves: + host = node.public_dns_name or node.private_dns_name + output = ssh_call(user=user, host=host, key_file=key_file, args=[command], allocate_terminal=True, get_output=True) + log.info("exec output of host %s:\n%s", host, output) + + def rsync_call(user, host, key_file, args=[], src_local='', dest_local='', remote_path='', tries=3): rsync_args = ['rsync', '--timeout', '60', '-azvP'] rsync_args += ['-e', 'ssh -i {} -o StrictHostKeyChecking=no'.format(key_file)] @@ -817,7 +825,7 @@ def check_flintrock_installation(): parser = ArghParser() -parser.add_commands([launch, destroy, get_master, ssh_master, tag_cluster_instances, health_check]) +parser.add_commands([launch, destroy, get_master, ssh_master, tag_cluster_instances, health_check, exec_shell]) parser.add_commands([job_run, job_local_yarn_run, job_attach, wait_for_job, kill_job, killall_jobs, collect_job_results], namespace="jobs") From 4e900d913af7c5ea3e593d15f762a1ea87c916ee Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 28 Sep 2018 10:56:22 -0300 Subject: [PATCH 213/268] by default, install python3 and pip also tmux --- 
tools/create_image.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/create_image.sh b/tools/create_image.sh index 5f807365..5b2af793 100644 --- a/tools/create_image.sh +++ b/tools/create_image.sh @@ -37,7 +37,13 @@ echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> /etc/environment yum install -y snappy # Install netlib-java native dependencies -yum install -y blas atlas lapack +yum install -y blas atlas lapack + +# Install python3 and pip3 +yum install -y python3 python3-pip + +# Install python3 and pip3 +yum install -y tmux # Create /usr/bin/realpath which is used by R to find Java installations # NOTE: /usr/bin/realpath is missing in CentOS AMIs. See From 9bfa59b3736cfba61262066f2ebf94d6b0784124 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 28 Sep 2018 13:27:15 -0300 Subject: [PATCH 214/268] fix commentary description --- tools/create_image.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/create_image.sh b/tools/create_image.sh index 5b2af793..852b861c 100644 --- a/tools/create_image.sh +++ b/tools/create_image.sh @@ -42,7 +42,7 @@ yum install -y blas atlas lapack # Install python3 and pip3 yum install -y python3 python3-pip -# Install python3 and pip3 +# Install tmux yum install -y tmux # Create /usr/bin/realpath which is used by R to find Java installations From 881892b2a726e1737aeea475cba5d4877c47b866 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Mon, 1 Oct 2018 18:15:43 -0300 Subject: [PATCH 215/268] Get latest asssembly (lexicographically) --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index aeb5eb6a..9f4eb18b 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -387,7 +387,7 @@ def build_assembly(): def get_assembly_path(): paths = glob.glob(get_project_path() + '/target/scala-*/*assembly*.jar') if paths: - return paths[0] + return paths[-1] else: return None From 2d5c63b9249d88c94a18da35e36b0adb42f31aa4 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 2 Oct 2018 15:04:17 -0300 Subject: [PATCH 216/268] Enable hive support --- build.sbt | 2 + .../ignition/core/jobs/CoreJobRunner.scala | 38 ++++++++++++------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/build.sbt b/build.sbt index 81f585b6..b85671d8 100644 --- a/build.sbt +++ b/build.sbt @@ -13,6 +13,8 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.1" % "provided .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") +libraryDependencies += ("org.apache.spark" %% "spark-sql" % "2.3.1" % "provided") + libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.6") diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index c1d0541f..4e7c27fe 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -1,14 +1,19 @@ package ignition.core.jobs -import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.SparkContext +import org.apache.spark.sql.SparkSession import org.joda.time.{DateTime, DateTimeZone} +import org.slf4j.{Logger, LoggerFactory} import scala.concurrent.Future import scala.util.Try object CoreJobRunner { + val 
logger: Logger = LoggerFactory.getLogger(getClass) + case class RunnerContext(sparkContext: SparkContext, + sparkSession: SparkSession, config: RunnerConfig) @@ -71,32 +76,39 @@ object CoreJobRunner { val appName = s"${config.setupName}.${config.tag}" - val sparkConf = new SparkConf() - sparkConf.set("spark.executor.memory", config.executorMemory) - - sparkConf.set("spark.eventLog.dir", "file:///media/tmp/spark-events") + val builder = SparkSession.builder + builder.config("spark.executor.memory", config.executorMemory) - sparkConf.setMaster(config.master) - sparkConf.setAppName(appName) + builder.config("spark.eventLog.dir", "file:///media/tmp/spark-events") - sparkConf.set("spark.hadoop.mapred.output.committer.class", classOf[DirectOutputCommitter].getName()) + builder.master(config.master) + builder.appName(appName) - defaultSparkConfMap.foreach { case (k, v) => sparkConf.set(k, v) } + builder.config("spark.hadoop.mapred.output.committer.class", classOf[DirectOutputCommitter].getName()) - jobConf.foreach { case (k, v) => sparkConf.set(k, v) } + defaultSparkConfMap.foreach { case (k, v) => builder.config(k, v) } + jobConf.foreach { case (k, v) => builder.config(k, v) } // Add logging context to driver setLoggingContextValues(config) - - val sc = new SparkContext(sparkConf) + try { + builder.enableHiveSupport() + } catch { + case t: Throwable => logger.warn("Failed to enable HIVE support", t) + } + + val session = builder.getOrCreate() + + val sc = session.sparkContext // Also try to propagate logging context to workers // TODO: find a more efficient and bullet-proof way val configBroadCast = sc.broadcast(config) + sc.parallelize(Range(1, 2000), numSlices = 2000).foreachPartition(_ => setLoggingContextValues(configBroadCast.value)) - val context = RunnerContext(sc, config) + val context = RunnerContext(sc, session, config) try { jobSetup.apply(context) From d67b7f478fd1e9d18141ab02b0832c9e552186e0 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 4 Oct 2018 16:34:07 -0300 Subject: [PATCH 217/268] Get latest assembly by time --- remote_hook.sh | 2 +- tools/cluster.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index ddfe80a2..f30879e3 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -99,7 +99,7 @@ MAIN_CLASS="ignition.jobs.Runner" cd "${DIR}" || notify_error_and_exit "Internal script error for job ${JOB_WITH_TAG}" -JAR_PATH_SRC=$(ls "${DIR}"/*assembly*.jar | tail -1) +JAR_PATH_SRC=$(ls -t "${DIR}"/*assembly*.jar | head -1) # most recent jar JAR_PATH="${JOB_CONTROL_DIR}/Ignition.jar" cp ${JAR_PATH_SRC} ${JAR_PATH} diff --git a/tools/cluster.py b/tools/cluster.py index 9f4eb18b..f22107d5 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -387,6 +387,7 @@ def build_assembly(): def get_assembly_path(): paths = glob.glob(get_project_path() + '/target/scala-*/*assembly*.jar') if paths: + paths.sort(key=os.path.getmtime) return paths[-1] else: return None From d7e635a61a664f6378cfb5e59d1012e95217eea2 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 9 Oct 2018 15:14:49 -0300 Subject: [PATCH 218/268] Added options to conf yarn --- remote_hook.sh | 1 - tools/cluster.py | 40 ++++++++++++++++++++++++---------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index f30879e3..0a5a2cb8 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -1,6 +1,5 @@ #!/bin/bash -set -x # We suppose we are in a subdirectory of the root project DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" diff --git a/tools/cluster.py b/tools/cluster.py index f22107d5..eac73844 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -510,7 +510,7 @@ def job_run(cluster_name, job_name, job_mem, raise failed_exception or Exception('Failed!?') return (job_name, job_tag) - +@argh.arg('-c', '--conf', action='append', type=str) @arg('job-mem', help='The amount of memory to use for this job (like: 80G)') @named('local-yarn-run') def job_local_yarn_run(job_name, job_mem, queue, @@ -522,7 +522,8 @@ def job_local_yarn_run(job_name, job_mem, queue, deploy_mode='cluster', yarn_memory_overhead=0.2, driver_heap_size=default_driver_heap_size, - driver_java_options='-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps'): + driver_java_options='-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps', + conf=[]): def parse_memory(s): import re @@ -555,20 +556,27 @@ def calculate_overhead(s): log.info('Will run job using local installation of yarn') - check_call([ - spark_submit, - '--class', 'ignition.jobs.Runner', - '--master', 'yarn', - '--driver-java-options', driver_java_options, - '--deploy-mode', deploy_mode, - '--queue', queue, - '--conf', 'spark.executor.cores=' + str(executor_cores), - '--driver-memory', driver_heap_size, - '--conf', 'spark.yarn.am.memory=' + driver_heap_size, - '--executor-memory', job_mem, - '--conf', 'spark.yarn.am.memoryOverhead=' + driver_overhead, - '--conf', 'spark.driver.memoryOverhead=' + driver_overhead, - '--conf', 
'spark.executor.memoryOverhead=' + executor_overhead, + confs = [ + spark_submit, + '--class', 'ignition.jobs.Runner', + '--master', 'yarn', + '--driver-java-options', driver_java_options, + '--deploy-mode', deploy_mode, + '--queue', queue, + '--conf', 'spark.executor.cores=' + str(executor_cores), + '--driver-memory', driver_heap_size, + '--conf', 'spark.yarn.am.memory=' + driver_heap_size, + '--executor-memory', job_mem, + '--conf', 'spark.yarn.am.memoryOverhead=' + driver_overhead, + '--conf', 'spark.driver.memoryOverhead=' + driver_overhead, + '--conf', 'spark.executor.memoryOverhead=' + executor_overhead + ] + + for c in conf: + confs.extend(['--conf', c]) + + check_call( + confs + [ assembly_path, job_name, '--runner-master', 'yarn', From f7468e1551342da13194296cda6332c0dd9a6850 Mon Sep 17 00:00:00 2001 From: Felipe Mafatti Date: Wed, 10 Oct 2018 11:54:20 -0300 Subject: [PATCH 219/268] Update submodule - Change ebs to delete on termination (#152) --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index c9f58f54..e5b3b9b2 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit c9f58f547adaa57401e910df78c5986e76b8a155 +Subproject commit e5b3b9b2a6ac66536ba6e105cd42f988f9d8bb7e From 1c3644f689f37b4d01c563b7564921ddeebc3266 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 16 Oct 2018 23:35:53 -0300 Subject: [PATCH 220/268] Support sudo, avoid loop break on failures --- tools/cluster.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index fddabd65..7cb07a83 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -114,8 +114,10 @@ def logged_call(args, tries=1): return logged_call_base(check_call, args, tries) -def ssh_call(user, host, key_file, args=(), allocate_terminal=True, get_output=False): - base = ['ssh', '-q'] +def ssh_call(user, host, key_file, args=(), allocate_terminal=True, get_output=False, quiet=True): + base = ['ssh'] + if quiet: + base += ['-q'] if allocate_terminal: base += ['-tt'] base += ['-i', key_file, @@ -372,12 +374,22 @@ def ssh_master(cluster_name, key_file=default_key_file, user=default_remote_user ssh_call(user=user, host=master, key_file=key_file, args=args) -def exec_shell(cluster_name, command, key_file=default_key_file, user=default_remote_user, region=default_region): +def exec_shell(cluster_name, command, key_file=default_key_file, user=default_remote_user, region=default_region, sudo=False): + import subprocess masters, slaves = get_active_nodes(cluster_name, region=region) + if not masters: + log.warn('No master found') for node in masters + slaves: host = node.public_dns_name or node.private_dns_name - output = ssh_call(user=user, host=host, key_file=key_file, args=[command], allocate_terminal=True, get_output=True) - log.info("exec output of host %s:\n%s", host, output) + log.info("exec output of host %s\n", host) + cmd = ['ssh', '-t', '-o', 'StrictHostKeyChecking=no', user + '@' + host ,'-i', key_file] + if sudo: + cmd += ['sudo'] + cmd += ['bash'] + p = subprocess.Popen(cmd, stdin=subprocess.PIPE) + p.communicate(command) + if p.wait() != 0: + log.warn('\nError executing command on host: %s', host) def rsync_call(user, host, key_file, args=[], src_local='', dest_local='', remote_path='', 
tries=3): From bb5e6adf85cbb35d1bb12726776b7eb2a7cfdb29 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 26 Oct 2018 10:19:34 -0300 Subject: [PATCH 221/268] update to spark 2.3.2 --- build.sbt | 4 ++-- tools/cluster.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index b85671d8..03f135d5 100644 --- a/build.sbt +++ b/build.sbt @@ -9,11 +9,11 @@ scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warning // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.1" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.2" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.spark" %% "spark-sql" % "2.3.1" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-sql" % "2.3.2" % "provided") libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") diff --git a/tools/cluster.py b/tools/cluster.py index eac73844..4d7e7dd3 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,7 +49,7 @@ default_ami = 'ami-611e7976' default_master_ami = '' default_env = 'dev' -default_spark_version = '2.3.1' +default_spark_version = '2.3.2' default_hdfs_version = '2.7.6' default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' From f489fd2d50b65c1f43b43f7afb0b2459145485fb Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Wed, 21 Nov 2018 19:16:28 -0200 Subject: [PATCH 222/268] Make cluster.py compatible with python3 --- tools/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 4d7e7dd3..38b7a19f 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -614,9 +614,9 @@ def health_check(cluster_name, key_file=default_key_file, master=None, remote_us raise NotHealthyCluster('Not enough healthy slaves: {0}/{1}'.format(len(slaves), nslaves)) if not masters: raise NotHealthyCluster('No master found') - except NotHealthyCluster, e: + except NotHealthyCluster as e: raise e - except Exception, e: + except Exception as e: log.warning("Failed to check cluster health, cluster: %s, retries %s" % (cluster_name, i), exc_info=True) if i >= retries - 1: log.critical("Failed to check cluster health, cluster: %s, giveup!" % (cluster_name)) From cc510d89ad22f0824d98a618095aa738d0ec7643 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 22 Nov 2018 16:34:36 -0200 Subject: [PATCH 223/268] making compatible --- tools/cluster.py | 2 +- tools/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 38b7a19f..060bce71 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -123,7 +123,7 @@ def ssh_call(user, host, key_file, args=(), allocate_terminal=True, get_output=F '{0}@{1}'.format(user, host)] base += args if get_output: - return logged_call_output(base) + return logged_call_output(base).decode("utf-8") else: return logged_call(base) diff --git a/tools/utils.py b/tools/utils.py index 88a236cd..5064be61 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -79,7 +79,7 @@ def read_non_blocking(f): while select.select([f], [], [], 0)[0]: c = f.read(1) if c: - result.append(c) + result.append(c.decode('utf-8')) else: break return ''.join(result) if result else None From e8ec4d6c58ecb352865755e2a98692b922d4ec60 Mon Sep 17 00:00:00 2001 
From: Fernando Luiz Parisotto Date: Tue, 27 Nov 2018 17:41:22 -0200 Subject: [PATCH 224/268] Lightweight core (#155) * remove a lot of stuff * spark 2.4 --- build.sbt | 37 +- project/build.properties | 2 +- project/plugins.sbt | 5 - src/main/scala/TestHttp.scala | 49 -- .../core/cache/ExpiringMultiLevelCache.scala | 547 ------------------ .../core/http/AsyncHttpClientStreamApi.scala | 89 --- .../core/http/AsyncSprayHttpClient.scala | 297 ---------- .../ignition/core/http/ByteStorage.scala | 114 ---- .../scala/ignition/core/http/Caching.scala | 22 - src/main/scala/ignition/core/http/Retry.scala | 84 --- .../core/jobs/utils/SparkContextUtils.scala | 13 +- .../ignition/core/utils/FutureUtils.scala | 5 - .../scala/ignition/core/utils/S3Client.scala | 62 -- .../ignition/core/utils/TelemetryCache.scala | 45 -- .../scala/ignition/core/utils/URLUtils.scala | 21 +- .../spray/cache/ExpiringLruLocalCache.scala | 134 ----- .../ExpiringMultipleLevelCacheSpec.scala | 213 ------- .../http/AsyncHttpClientStreamApiSpec.scala | 15 - .../scala/ignition/core/http/RetrySpec.scala | 39 -- .../ignition/core/utils/URLUtilsSpec.scala | 29 - tools/cluster.py | 2 +- 21 files changed, 16 insertions(+), 1808 deletions(-) delete mode 100644 project/plugins.sbt delete mode 100644 src/main/scala/TestHttp.scala delete mode 100644 src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala delete mode 100644 src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala delete mode 100644 src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala delete mode 100644 src/main/scala/ignition/core/http/ByteStorage.scala delete mode 100644 src/main/scala/ignition/core/http/Caching.scala delete mode 100644 src/main/scala/ignition/core/http/Retry.scala delete mode 100644 src/main/scala/ignition/core/utils/S3Client.scala delete mode 100644 src/main/scala/ignition/core/utils/TelemetryCache.scala delete mode 100644 src/main/scala/spray/cache/ExpiringLruLocalCache.scala delete mode 100644 
src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala delete mode 100644 src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala delete mode 100644 src/test/scala/ignition/core/http/RetrySpec.scala diff --git a/build.sbt b/build.sbt index 03f135d5..b27321ea 100644 --- a/build.sbt +++ b/build.sbt @@ -2,29 +2,20 @@ name := "Ignition-Core" version := "1.0" -scalaVersion := "2.11.8" +scalaVersion := "2.11.12" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.2" % "provided") - .exclude("org.apache.hadoop", "hadoop-client") - .exclude("org.slf4j", "slf4j-log4j12") +test in assembly := {} -libraryDependencies += ("org.apache.spark" %% "spark-sql" % "2.3.2" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-sql" % "2.4.0" % "provided") libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") -libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.6") - .exclude("org.apache.htrace", "htrace-core") - .exclude("commons-beanutils", "commons-beanutils") - .exclude("org.slf4j", "slf4j-log4j12") - -libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.3" - -libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.2.14" +libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.2.27" libraryDependencies += "com.github.scopt" %% "scopt" % "3.6.0" @@ -32,24 +23,8 @@ libraryDependencies += "joda-time" % "joda-time" % "2.9.9" libraryDependencies += "org.joda" % "joda-convert" % "1.8.2" -libraryDependencies += "commons-lang" % "commons-lang" % "2.6" - libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.25" -libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.3.4" - -libraryDependencies += "io.spray" %% "spray-json" % "1.3.2" - 
-libraryDependencies += "io.spray" %% "spray-client" % "1.3.2" +libraryDependencies += "com.amazonaws" % "aws-java-sdk-s3" % "1.11.456" -libraryDependencies += "io.spray" %% "spray-http" % "1.3.2" - -libraryDependencies += "io.spray" %% "spray-caching" % "1.3.2" - -resolvers += "Akka Repository" at "http://repo.akka.io/releases/" - -resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" - -resolvers += "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/" - -resolvers += Resolver.sonatypeRepo("public") +libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.3" diff --git a/project/build.properties b/project/build.properties index be6c454f..7c58a83a 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=0.13.5 +sbt.version=1.2.6 diff --git a/project/plugins.sbt b/project/plugins.sbt deleted file mode 100644 index f6f3b939..00000000 --- a/project/plugins.sbt +++ /dev/null @@ -1,5 +0,0 @@ -addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") - -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3") - -addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") diff --git a/src/main/scala/TestHttp.scala b/src/main/scala/TestHttp.scala deleted file mode 100644 index 901516e0..00000000 --- a/src/main/scala/TestHttp.scala +++ /dev/null @@ -1,49 +0,0 @@ - -object TestHttp extends App{ - - def goTest(): Unit = { - import java.util.concurrent.TimeUnit - - import akka.actor.{ActorRefFactory, ActorSystem} - import akka.util.Timeout - import ignition.core.http.AsyncHttpClientStreamApi._ - import ignition.core.http.AsyncSprayHttpClient - import ignition.core.utils.ExceptionUtils._ - import org.joda.time.DateTime - - import scala.concurrent.ExecutionContext.Implicits.global - import scala.concurrent.duration.Duration - import scala.io.Source - import scala.util.{Failure, Success} - def now = DateTime.now() - - val system = 
ActorSystem("http") - val client = new AsyncSprayHttpClient { - override implicit def actorRefFactory: ActorRefFactory = system - } - val url = "https://httpbin.org/delay/10" // "http://127.0.0.1:8081/" - val conf = RequestConfiguration(requestTimeout = Option(Duration(12, TimeUnit.SECONDS)), idleTimeout = Option(Duration(5, TimeUnit.SECONDS))) - implicit val reporter = NoOpReporter - implicit val timeout = Timeout(30, TimeUnit.SECONDS) - - println(s"Starting $now") - - // Should complete ok - val request1 = client.makeRequest(Request(url, requestConfiguration = Option(conf))) - request1.onComplete { - case Success(t) => println(s"request1 finished $now with Success: ${Source.fromInputStream(t.content).mkString}") - case Failure(t) => println(s"request1 finished $now with failure: ${t.getFullStackTraceString()}") - } - - //Should time out and keep retrying - val tightConf = conf.copy(requestTimeout = Option(Duration(3, TimeUnit.SECONDS))) - val request2 = client.makeRequest(Request(url, requestConfiguration = Option(tightConf))) - - request2.onComplete { - case Success(t) => println(s"request2 finished $now with Success: ${Source.fromInputStream(t.content).mkString}") - case Failure(t) => println(s"request2 finished $now with failure: ${t.getFullStackTraceString()}") - } - } - - goTest() -} diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala deleted file mode 100644 index 6ac0f626..00000000 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ /dev/null @@ -1,547 +0,0 @@ -package ignition.core.cache - -import java.util.concurrent.TimeUnit - -import akka.actor.Scheduler -import akka.pattern.after -import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap -import ignition.core.utils.DateUtils._ -import ignition.core.utils.FutureUtils._ -import org.joda.time.{DateTime, DateTimeZone, Interval} -import org.slf4j.LoggerFactory -import 
spray.caching.ValueMagnet - -import scala.concurrent.duration._ -import scala.concurrent.{ExecutionContext, Future, Promise} -import scala.util.control.NonFatal -import scala.util.{Failure, Success, Try} - -object ExpiringMultiLevelCache { - case class TimestampedValue[V](date: DateTime, value: Try[V]) { - def hasExpired(ttl: FiniteDuration, now: DateTime, ttlCachedErrors: FiniteDuration = 1.minute): Boolean = { - value match { - case Success(_) => date.plus(ttl.toMillis).isBefore(now) - case Failure(_) => date.plus(ttlCachedErrors.toMillis).isBefore(now) - } - } - } - - trait GenericCache[V] { cache => - // Keep compatible with Spray Cache - def apply(key: String) = new Keyed(key) - - class Keyed(key: String) { - /** - * Returns either the cached Future for the key or evaluates the given call-by-name argument - * which produces either a value instance of type `V` or a `Future[V]`. - */ - def apply(magnet: ⇒ ValueMagnet[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = - cache.apply(key, () ⇒ try magnet.future catch { case NonFatal(e) ⇒ Future.failed(e) }) - - /** - * Returns either the cached Future for the key or evaluates the given function which - * should lead to eventual completion of the promise. 
- */ - def apply[U](f: Promise[V] ⇒ U)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = - cache.apply(key, () ⇒ { val p = Promise[V](); f(p); p.future }) - } - - def apply(key: String, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] - def set(key: String, value: V)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Unit] - } - - trait LocalCache[V] { - def get(key: Any): Option[Future[V]] - def set(key: Any, value: V): Unit - } - - trait RemoteWritableCache[V] { - def set(key: String, value: V)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Unit] - def setLock(key: String, ttl: FiniteDuration)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Boolean] - } - - trait RemoteReadableCache[V] { - def get(key: String)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Option[V]] - } - - trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] - - trait ReporterCallback { - def onCompletedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit - def onGeneratedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit - def onCompletedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit - def onGeneratedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit - def onCacheMissNothingFound(key: String, elapsedTime: FiniteDuration): Unit - def onCacheMissButFoundExpiredLocal(key: String, elapsedTime: FiniteDuration): Unit - def onCacheMissButFoundExpiredRemote(key: String, elapsedTime: FiniteDuration): Unit - def onRemoteCacheHit(key: String, elapsedTime: FiniteDuration): Unit - def onLocalCacheHit(key: String, elapsedTime: FiniteDuration): Unit - def onUnexpectedBehaviour(key: String, elapsedTime: FiniteDuration): Unit - def onStillTryingToLockOrGet(key: String, elapsedTime: FiniteDuration): Unit - def onSuccessfullyRemoteSetValue(key: String, elapsedTime: FiniteDuration): Unit - def 
onRemoteCacheHitAfterGenerating(key: String, elapsedTime: FiniteDuration): Unit - def onErrorGeneratingValue(key: String, eLocal: Throwable, elapsedTime: FiniteDuration): Unit - def onLocalError(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit - def onRemoteError(key: String, t: Throwable, elapsedTime: FiniteDuration): Unit - def onRemoteGiveUp(key: String, elapsedTime: FiniteDuration): Unit - def onSanityLocalValueCheckFailedResult(key: String, result: String, elapsedTime: FiniteDuration): Unit - } - - object NoOpReporter extends ReporterCallback { - override def onCacheMissNothingFound(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onUnexpectedBehaviour(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onSuccessfullyRemoteSetValue(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onRemoteError(key: String, t: Throwable, elapsedTime: FiniteDuration): Unit = {} - override def onRemoteGiveUp(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onLocalError(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} - override def onErrorGeneratingValue(key: String, eLocal: Throwable, elapsedTime: FiniteDuration): Unit = {} - override def onRemoteCacheHitAfterGenerating(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onCacheMissButFoundExpiredRemote(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onStillTryingToLockOrGet(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onLocalCacheHit(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onRemoteCacheHit(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onCacheMissButFoundExpiredLocal(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onCompletedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} - override def onCompletedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit = {} - 
override def onGeneratedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} - override def onGeneratedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onSanityLocalValueCheckFailedResult(key: String, result: String, elapsedTime: FiniteDuration): Unit = {} - } -} - - -import ignition.core.cache.ExpiringMultiLevelCache._ - - -case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, - localCache: Option[LocalCache[TimestampedValue[V]]], - remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, - remoteLockTTL: FiniteDuration = 5.seconds, - reporter: ExpiringMultiLevelCache.ReporterCallback = ExpiringMultiLevelCache.NoOpReporter, - maxErrorsToRetryOnRemote: Int = 5, - backoffOnLockAcquire: FiniteDuration = 50.milliseconds, - backoffOnError: FiniteDuration = 50.milliseconds, - sanityLocalValueCheck: Boolean = false, - cacheErrors: Boolean = false, - ttlCachedErrors: FiniteDuration = 1.minute) extends GenericCache[V] { - - private val logger = LoggerFactory.getLogger(getClass) - - private val tempUpdate = new ConcurrentLinkedHashMap.Builder[Any, Future[TimestampedValue[V]]] - .maximumWeightedCapacity(Long.MaxValue) - .build() - - protected def now = DateTime.now.withZone(DateTimeZone.UTC) - - private def timestamp(v: V): TimestampedValue[V] = TimestampedValue(now, Try(v)) - - private def timestampError(e: Throwable): TimestampedValue[V] = TimestampedValue(now, Failure(e)) - - private def elapsedTime(startNanoTime: Long) = FiniteDuration(System.nanoTime() - startNanoTime, TimeUnit.NANOSECONDS) - - private def remoteLockKey(key: Any) = s"$key-emlc-lock" - - - // The idea is simple, have two caches: remote and local - // with values that will eventually expire but still be left on the cache - // while a new value is asynchronously being calculated/retrieved - override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { - // The local cache is 
always the first try. We'll only look the remote if the local value is missing or has expired - val startTime = System.nanoTime() - val result: Future[V] = localCache.flatMap(_.get(key).map(_.asTry())) match { - case Some(future) => - future.flatMap { - case Success(localValue) if !localValue.hasExpired(ttl, now, ttlCachedErrors) => - // We have locally a good value, just return it - reporter.onLocalCacheHit(key, elapsedTime(startTime)) - // But if we're paranoid, let's check if the local value is consistent with remote - if (sanityLocalValueCheck) - remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(Future.fromTry(localValue.value)) - else - Future.fromTry(localValue.value) - case Success(expiredLocalValue) if remoteRW.nonEmpty => - // We have locally an expired value, but we can check a remote cache for better value - remoteRW.get.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => - // Remote is good, set locally and return it - reporter.onRemoteCacheHit(key, elapsedTime(startTime)) - localCache.foreach(_.set(key, remoteValue)) - Future.fromTry(remoteValue.value) - case Success(Some(expiredRemote)) => - // Expired local and expired remote, return the most recent of them, async update both - reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime) - val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) - Future.fromTry(mostRecent.value) - case Success(None) => - // No remote found, return local, async update both - reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime) - Future.fromTry(expiredLocalValue.value) - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(startTime)) - logger.warn(s"apply, key: $key expired local value and failed to get remote", e) - tryGenerateAndSet(key, genValue, startTime) - 
Future.fromTry(expiredLocalValue.value) - } - case Success(expiredLocalValue) if remoteRW.isEmpty => - // There is no remote cache configured, we'are on our own - // Return expired value and try to generate a new one for the future - reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime) - Future.fromTry(expiredLocalValue.value) - case Failure(e) => - // This is almost impossible to happen because it's local and we don't save failed values - // Failed values are stored into property "value", not as the value itself - reporter.onLocalError(key, e, elapsedTime(startTime)) - logger.warn(s"apply, key: $key got a failed future from cache!? This is almost impossible!", e) - for { - tsv <- tryGenerateAndSet(key, genValue, startTime) - value <- Future.fromTry(tsv.value) - } yield value - } - case None if remoteRW.nonEmpty => - // No local, let's try remote - remoteRW.get.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => - // Remote is good, set locally and return it - reporter.onRemoteCacheHit(key, elapsedTime(startTime)) - localCache.foreach(_.set(key, remoteValue)) - Future.fromTry(remoteValue.value) - case Success(Some(expiredRemote)) => - // Expired remote, return the it, async update - reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime) - Future.fromTry(expiredRemote.value) - case Success(None) => - // No good remote, sync generate - reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - for { - tsv <- tryGenerateAndSet(key, genValue, startTime) - value <- Future.fromTry(tsv.value) - } yield value - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(startTime)) - logger.warn(s"apply, key: $key expired local value and remote error", e) - for { - tsv <- tryGenerateAndSet(key, genValue, startTime) - value <- Future.fromTry(tsv.value) - } yield value - } - 
case None if remoteRW.isEmpty => - // No local and no remote to look, just generate it - // The caller will need to wait for the value generation - reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - for { - tsv <- tryGenerateAndSet(key, genValue, startTime) - value <- Future.fromTry(tsv.value) - } yield value - } - result.onComplete { - case Success(_) => - reporter.onCompletedWithSuccess(key, elapsedTime(startTime)) - case Failure(e) => - reporter.onCompletedWithFailure(key, e, elapsedTime(startTime)) - } - result - } - - // This should be used carefully because it will overwrite the remote value without - // any lock, which may cause a desynchronization between the local and remote cache on other instances - // Note that if any tryGenerateAndSet is in progress, this will wait until it's finished before setting local/remote - override def set(key: String, value: V)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Unit] = { - logger.info(s"set, key $key: got a call to overwrite local and remote values") - val startTime = System.nanoTime() - val promise = Promise[TimestampedValue[V]]() - val future = promise.future - def doIt() = { - val tValue = timestamp(value) - localCache.foreach(_.set(key, tValue)) - val result = remoteRW.map(remote => remoteOverwrite(key, tValue, remote, startTime)).getOrElse(Future.successful(tValue)) - promise.completeWith(result) - tempUpdate.remove(key, future) - } - tempUpdate.put(key, future) match { - case null => - doIt() - future.map(_ => ()) - case fTrying => - fTrying.onComplete { case _ => doIt() } - future.map(_ => ()) - } - } - - private def sanityLocalValueCheck(key: String, localValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], genValue: () => Future[V], startTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { - remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if remoteValue == localValue => - // Remote is the same as local, return any 
of them - Future.fromTry(remoteValue.value) - case Success(Some(remoteValue)) => - // Something is different, try to figure it out - val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" - val dateResult = if (remoteValue.date.isAfter(localValue.date)) - s"remote-is-newer-than-local" - else if (localValue.date.isAfter(remoteValue.date)) - s"local-is-newer-than-remote" - else if (localValue.date.isEqual(localValue.date)) - "same-date" - else if (localValue.date.withZone(DateTimeZone.UTC).isEqual(localValue.date.withZone(DateTimeZone.UTC))) - "same-date-on-utc" - else - "impossible-dates" - val remoteExpired = remoteValue.hasExpired(ttl, now, ttlCachedErrors) - val localExpired = localValue.hasExpired(ttl, now, ttlCachedErrors) - val finalResult = s"$valuesResult-$dateResult-remote-expired-${remoteExpired}-local-expired-${localExpired}" - logger.warn(s"sanityLocalValueCheck, key $key: got different results for local $localValue and remote $remoteValue ($finalResult)") - reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) - // return remote to keep everyone consistent - Future.fromTry(remoteValue.value) - case Success(None) => - val localExpired = localValue.hasExpired(ttl, now, ttlCachedErrors) - val finalResult = s"missing-remote-local-expired-${localExpired}" - logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") - reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) - // Try generate it to keep a behaviour equivalent to remote only - for { - tsv <- tryGenerateAndSet(key, genValue, startTime) - value <- Future.fromTry(tsv.value) - } yield value - - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(startTime)) - logger.warn(s"sanityLocalValueCheck, key: $key failed to get remote", e) - Future.fromTry(localValue.value) - } - } - - // Overwrite remote value without lock, retrying on error - 
private def remoteOverwrite(key: String, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { - remote.set(key, calculatedValue).asTry().flatMap { - case Success(_) => - reporter.onSuccessfullyRemoteSetValue(key, elapsedTime(nanoStartTime)) - logger.info(s"remoteForceSet successfully overwritten key $key") - Future.successful(calculatedValue) - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteForceSet, key $key: got error setting the value, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) - // Retry failure - after(backoffOnError, scheduler) { - remoteOverwrite(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) - } - } - } - - - // Note: this method may return a failed future, but it will never cache it - // Our main purpose here is to avoid multiple local calls to generate new promises/futures in parallel, - // so we use this Map keep everyone in sync - // This is similar to how spray cache works - private def tryGenerateAndSet(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { - val promise = Promise[TimestampedValue[V]]() - val future = promise.future - tempUpdate.putIfAbsent(key, future) match { - case null => - logger.info(s"tryGenerateAndSet, key $key: got request for generating and none in progress found, calling canonicalValueGenerator") - canonicalValueGenerator(key, genValue, nanoStartTime).onComplete { - case Success(v) if !v.hasExpired(ttl, now, ttlCachedErrors) => - reporter.onGeneratedWithSuccess(key, elapsedTime(nanoStartTime)) - localCache.foreach(_.set(key, v)) - promise.trySuccess(v) - tempUpdate.remove(key, future) - case Success(v) => - // Have we generated/got an expired value!? 
- reporter.onUnexpectedBehaviour(key, elapsedTime(nanoStartTime)) - logger.warn(s"tryGenerateAndSet, key $key: unexpectedly generated/got an expired value: $v") - localCache.foreach(_.set(key, v)) - promise.trySuccess(v) - tempUpdate.remove(key, future) - case Failure(e) => - // We don't save failures to cache - // There is no need to log here, canonicalValueGenerator will log everything already - reporter.onGeneratedWithFailure(key, e, elapsedTime(nanoStartTime)) - promise.tryFailure(e) - tempUpdate.remove(key, future) - } - future - case fTrying => - // If someone call us while a future is running, we return the running future - logger.info(s"tryGenerateAndSet, key $key: got request for generating but an existing one is current in progress") - fTrying - } - } - - // This can be called by multiple instances/hosts simultaneously but in the end - // only the one that wins the race will create the final value that will be set in - // the remote cache and read by the other instances - // Unless of course there is some error getting stuff from remote cache - // in which case the locally generated value may be returned to avoid further delays - protected def canonicalValueGenerator(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler) = { - val fGeneratedValue = Try { genValue().map(timestamp) }.asFutureTry() - val finalValue: Future[TimestampedValue[V]] = fGeneratedValue.flatMap { - case Success(generatedValue) => - // Successfully generated value, try to set it in the remote writable cache - remoteRW match { - // No remote cache available, just return this value to be set on local cache - case None => - Future.successful(generatedValue) - case Some(remote) => - remoteSetOrGet(key, generatedValue, remote, nanoStartTime) - } - case Failure(eLocal) => - // We failed to generate the value ourselves, our hope is if someone else successfully did it in the meantime - reporter.onErrorGeneratingValue(key, eLocal, 
elapsedTime(nanoStartTime)) - remoteRW match { - case None => - // There are no remote RW caches - logger.error(s"canonicalValueGenerator, key $key: failed to generate value and no remote cache configured", eLocal) - eLocal match { - case NonFatal(e) => { - if (cacheErrors) { - // if error was NonFatal Error then saves it to cache - val timestampedValue: TimestampedValue[V] = timestampError(e) - // Saved it only in localCache - localCache.foreach(_.set(key, timestampedValue)) - } - Future.failed(eLocal) - } - case _ => Future.failed(eLocal) - } - case Some(remote) => - remoteGetNonExpiredValue(key, remote, nanoStartTime).asTry().flatMap { - case Success(v) => - logger.warn(s"canonicalValueGenerator, key $key: failed to generate value but got one from remote", eLocal) - Future.successful(v) - case Failure(eRemote) => - // The real error is the eLocal, return it - logger.error(s"canonicalValueGenerator, key $key: failed to generate value and failed to get remote", eLocal) - eLocal match { - case NonFatal(e) => { - if (cacheErrors) { - // if error was NonFatal Error then saves it to cache - val timestampedValue = timestampError(e) - // Saved it only in localCache - localCache.foreach(_.set(key, timestampedValue)) - } - Future.failed(eLocal) - } - case _ => Future.failed(eLocal) - } - } - } - } - finalValue - } - - // Auxiliary method, only makes sense to be used by canonicalValueGenerator - private def remoteGetNonExpiredValue(key: String, - remote: RemoteCacheRW[TimestampedValue[V]], - nanoStartTime: Long, - currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { - remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => - logger.info(s"remoteGetNonExpiredValue, key $key: got a good value") - Future.successful(remoteValue) - case Success(_) => - Future.failed(new Exception("No good value found on remote")) - case Failure(e) => - if (currentRetry >= 
maxErrorsToRetryOnRemote) { - reporter.onRemoteGiveUp(key, elapsedTime(nanoStartTime)) - logger.error(s"remoteGetNonExpiredValue, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors", e) - Future.failed(e) - } else { - reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteGetNonExpiredValue, key $key: got error trying to get value, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) - // Retry - after(backoffOnError, scheduler) { - remoteGetNonExpiredValue(key, remote, nanoStartTime, currentRetry = currentRetry + 1) - } - } - } - } - - // This methods tries to guarantee that everyone that calls it in - // a given moment will be left with the same value in the end - private def remoteSetOrGet(key: String, - calculatedValue: TimestampedValue[V], - remote: RemoteCacheRW[TimestampedValue[V]], - nanoStartTime: Long, - currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { - if (currentRetry > maxErrorsToRetryOnRemote) { - // Use our calculated value as it's the best we can do - reporter.onRemoteGiveUp(key, elapsedTime(nanoStartTime)) - logger.error(s"remoteSetOrGet, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors") - Future.successful(calculatedValue) - } else { - remote.setLock(remoteLockKey(key), remoteLockTTL).asTry().flatMap { - case Success(true) => - logger.info(s"remoteSetOrGet got lock for key $key") - // Lock acquired, get the current value and replace it - remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => - // Current value is good, just return it - reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) - logger.info(s"remoteSetOrGet got lock for $key but found already a good value on remote") - Future.successful(remoteValue) - case Success(_) => - // The remote value is missing or has 
expired. This is what we were expecting - // We have the lock to replace this value. Our calculated value will be the canonical one! - remote.set(key, calculatedValue).asTry().flatMap { - case Success(_) => - // Flawless victory! - reporter.onSuccessfullyRemoteSetValue(key, elapsedTime(nanoStartTime)) - logger.info(s"remoteSetOrGet successfully set key $key while under lock") - Future.successful(calculatedValue) - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error setting the value, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) - // Retry failure - after(backoffOnError, scheduler) { - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) - } - } - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error getting remote value with lock, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) - // Retry failure - after(backoffOnError, scheduler) { - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) - } - } - case Success(false) => - // Someone got the lock, let's take a look at the value - remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => - // Current value is good, just return it - logger.info(s"remoteSetOrGet couldn't lock key $key but found a good on remote afterwards") - reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) - Future.successful(remoteValue) - case Success(_) => - // The value is missing or has expired - // Let's start from scratch because we need to be able to set or get a good value - // Note: do not increment retry because this isn't an error - reporter.onStillTryingToLockOrGet(key, elapsedTime(nanoStartTime)) - logger.info(s"remoteSetOrGet couldn't lock key $key and didn't found good value 
on remote, scheduling retry") - after(backoffOnLockAcquire, scheduler) { - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry) - } - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error getting remote value without lock, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) - // Retry - after(backoffOnError, scheduler) { - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) - } - } - case Failure(e) => - // Retry failure - reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error trying to set lock, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) - after(backoffOnError, scheduler) { - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) - } - } - } - } - - -} \ No newline at end of file diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala deleted file mode 100644 index 6868f0b7..00000000 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ /dev/null @@ -1,89 +0,0 @@ -package ignition.core.http - -import java.io.InputStream -import java.util.concurrent.TimeUnit - -import akka.util.Timeout -import ignition.core.utils.URLUtils -import spray.http._ - -import scala.concurrent.Future -import scala.concurrent.duration._ -import scala.language.postfixOps -import scala.util.Try - -object AsyncHttpClientStreamApi { - - case class Credentials(user: String, password: String) { - def isEmpty = user.isEmpty && password.isEmpty - - def toOption = Some(this).filter(!_.isEmpty) - } - - object Credentials { - val empty = Credentials("", "") - } - - // TODO: return a stream is dangerous because implies into a lock - case class StreamResponse(status: Int, content: InputStream) - - // If any value is 
None, it will fallback to the implementation's default - object RequestConfiguration { - val defaultMaxRedirects: Int = 15 - val defaultMaxConnectionsPerHost: Int = 500 - val defaultPipelining: Boolean = false - val defaultIdleTimeout: FiniteDuration = Duration(30, TimeUnit.SECONDS) - val defaultRequestTimeout: FiniteDuration = Duration(20, TimeUnit.SECONDS) - val defaultConnectingTimeout: FiniteDuration = Duration(10, TimeUnit.SECONDS) - } - - case class RequestConfiguration(maxRedirects: Option[Int] = Option(RequestConfiguration.defaultMaxRedirects), - maxConnectionsPerHost: Option[Int] = Option(RequestConfiguration.defaultMaxConnectionsPerHost), - pipelining: Option[Boolean] = Option(RequestConfiguration.defaultPipelining), - idleTimeout: Option[Duration] = Option(RequestConfiguration.defaultIdleTimeout), - requestTimeout: Option[Duration] = Option(RequestConfiguration.defaultRequestTimeout), - connectingTimeout: Option[Duration] = Option(RequestConfiguration.defaultConnectingTimeout)) - - case class Request(url: String, - params: Map[String, String] = Map.empty, - credentials: Option[Credentials] = None, - method: HttpMethod = HttpMethods.GET, - body: HttpEntity = HttpEntity.Empty, - headers: List[HttpHeader] = List.empty, - requestConfiguration: Option[RequestConfiguration] = None) { - - def uri: Uri = { - if (params.nonEmpty) - URLUtils.parseUri(url).map(_.withQuery(params)).get - else - URLUtils.parseUri(url).get - } - } - - case class RequestException(message: String, response: StreamResponse) extends RuntimeException(message) - - object NoOpReporter extends ReporterCallback { - def onRequest(request: Request): Unit = {} - def onResponse(request: Request, status: Int): Unit = {} - def onFailure(request: Request, status: Int): Unit = {} - def onRetry(request: Request): Unit = {} - def onGiveUp(request: Request): Unit = {} - def onError(request: Request, error: Any): Unit = {} - } - - abstract class ReporterCallback { - def onRequest(request: Request): Unit - 
def onResponse(request: Request, status: Int): Unit - def onFailure(request: Request, status: Int): Unit - def onRetry(request: Request): Unit - def onGiveUp(request: Request): Unit - def onError(request: Request, error: Any): Unit - } -} - -trait AsyncHttpClientStreamApi { - - def makeRequest(request: AsyncHttpClientStreamApi.Request, retryConf: RetryConf = RetryConf(), retryOnHttpStatus: Seq[Int] = List.empty) - (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback = AsyncHttpClientStreamApi.NoOpReporter): Future[AsyncHttpClientStreamApi.StreamResponse] - -} \ No newline at end of file diff --git a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala deleted file mode 100644 index af40c25a..00000000 --- a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala +++ /dev/null @@ -1,297 +0,0 @@ -package ignition.core.http - -import java.util.concurrent.TimeoutException - -import akka.actor._ -import akka.io.IO -import akka.pattern.ask -import akka.util.Timeout -import ignition.core.http.AsyncHttpClientStreamApi.{Request, RequestConfiguration} -import spray.can.Http -import spray.can.Http.HostConnectorSetup -import spray.can.client.{ClientConnectionSettings, HostConnectorSettings} -import spray.http.HttpHeaders.Authorization -import spray.http.StatusCodes.Redirection -import spray.http._ - -import scala.concurrent.{ExecutionContext, Future} -import scala.language.postfixOps -import scala.util.control.NonFatal - - -trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { - - implicit def actorRefFactory: ActorRefFactory - def executionContext: ExecutionContext = actorRefFactory.dispatcher - - override def makeRequest(request: AsyncHttpClientStreamApi.Request, retryConf: RetryConf, retryOnHttpStatus: Seq[Int]) - (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback): Future[AsyncHttpClientStreamApi.StreamResponse] = { - val processor = 
actorRefFactory.actorOf(Props(new RequestProcessorActor(timeout, reporter, retryConf, retryOnHttpStatus))) - (processor ? request).mapTo[AsyncHttpClientStreamApi.StreamResponse] - } - - private class RequestProcessorActor(timeout: Timeout, - reporter: AsyncHttpClientStreamApi.ReporterCallback, - retryConf: RetryConf, - retryOnHttpStatus: Seq[Int]) - extends Actor with ActorLogging { - - - import context.system - - import scala.language.implicitConversions - - def isRedirection(status: StatusCode): Boolean = status match { - case r: Redirection => true - case _ => false - } - - private implicit def toAuthHeader(credentials: AsyncHttpClientStreamApi.Credentials): List[Authorization] = - List(Authorization(credentials = BasicHttpCredentials(username = credentials.user, password = credentials.password))) - - private def toSprayRequest(request: Request): HttpRequest = request match { - case Request(_, params, Some(credentials), method, body, headers, _) => - HttpRequest(method = method, uri = request.uri, headers = credentials ++ headers, entity = body) - - case Request(_, params, None, method, body, headers, _) => - HttpRequest(method = method, uri = request.uri, entity = body, headers = headers) - } - - private def toSprayHostConnectorSetup(uri: Uri, conf: Option[AsyncHttpClientStreamApi.RequestConfiguration]): HostConnectorSetup = { - // Create based on defaults, change some of them - val ccs: ClientConnectionSettings = ClientConnectionSettings(system) - val hcs: HostConnectorSettings = HostConnectorSettings(system) - - val updatedCcs = ccs.copy( - responseChunkAggregationLimit = 0, // makes our client ineffective if non zero - idleTimeout = conf.flatMap(_.idleTimeout).getOrElse(ccs.idleTimeout), - connectingTimeout = conf.flatMap(_.connectingTimeout).getOrElse(ccs.connectingTimeout), - requestTimeout = conf.flatMap(_.requestTimeout).getOrElse(ccs.requestTimeout) - ) - - val maxConnections = conf.flatMap(_.maxConnectionsPerHost).getOrElse { - // Let's avoid someone 
shoot his own foot - if (hcs.maxConnections == 4) // Spray's default is stupidly low - // Use the API's default, which is more reasonable - RequestConfiguration.defaultMaxConnectionsPerHost - else - // If the conf is the non-default value, then someone know what he's doing. use that configured value - hcs.maxConnections - } - - val updatedHcs = hcs.copy( - connectionSettings = updatedCcs, - maxRetries = 0, // We have our own retry mechanism - maxRedirects = 0, // We do our own redirect following - maxConnections = maxConnections, - pipelining = conf.flatMap(_.pipelining).getOrElse(hcs.pipelining) - ) - - val host = uri.authority.host - HostConnectorSetup(host.toString, uri.effectivePort, sslEncryption = uri.scheme == "https", settings = Option(updatedHcs)) - } - - private def executeSprayRequest(request: Request): Unit = { - val message = (toSprayRequest(request), toSprayHostConnectorSetup(request.uri, request.requestConfiguration)) - IO(Http) ! message - } - - def handleErrors(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, remainingRedirects: Int): Receive = { - case ev @ Http.SendFailed(_) => - log.debug("Communication error, cause: {}", ev) - reporter.onError(request, ev) - storage.close() - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - - case ev @ Timedout(_) => - log.debug("Communication error, cause: {}", ev) - reporter.onError(request, ev) - storage.close() - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onTimeout - - case Status.Failure(NonFatal(exception)) => - reporter.onError(request, exception) - storage.close() - exception match { - case ex: Http.RequestTimeoutException => - log.warning("Request {} timeout, details: {}", request, ex.getMessage) - context.become(retrying(commander, request, remainingRedirects)) - self ! 
retry.onTimeout - - case ex: Http.ConnectionException => - log.warning("Connection error on {}, details: {}", request, ex.getMessage) - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - - case unknownException => - log.error(unknownException, "Unknown error on {}", request) - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - } - - case unknownMessage => - log.debug("Unknown message: {}", unknownMessage) - reporter.onError(request, unknownMessage) - storage.close() - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - } - - def receive: Receive = { - case request: Request => - log.debug("Starting request {}", request) - reporter.onRequest(request) - executeSprayRequest(request) - val retry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, conf = retryConf) - val storage = new ByteStorage() - val maxRedirects = - request.requestConfiguration.flatMap(_.maxRedirects).getOrElse(RequestConfiguration.defaultMaxRedirects) - context.become(waitingForResponse(sender, request, retry, storage, maxRedirects) - .orElse(handleErrors(sender, request, retry, storage, maxRedirects))) - } - - def retrying(commander: ActorRef, request: Request, remainingRedirects: Int): Receive = { - case retry: Retry => - if (retry.shouldGiveUp) { - reporter.onGiveUp(request) - log.warning("Error to get {}, no more retries {}, accepting failure", request, retry) - commander ! 
Status.Failure(new TimeoutException(s"Failed to get '${request.url}'")) - context.stop(self) - } else { - reporter.onRetry(request) - log.info("Retrying {}, retry status {}, backing off for {} millis", request, retry, retry.backoff.toMillis) - system.scheduler.scheduleOnce(retry.backoff) { - log.debug("Waking from backoff, retrying request {}", request) - executeSprayRequest(request) - }(executionContext) - val storage = new ByteStorage() - context.become(waitingForResponse(commander, request, retry, storage, remainingRedirects) - .orElse(handleErrors(commander, request, retry, storage, remainingRedirects))) - } - } - - def waitingForResponse(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, remainingRedirects: Int): Receive = { - case response@HttpResponse(status, entity, headers, _) => try { - storage.write(response.entity.data.toByteArray) - if (isRedirection(status)) - handleRedirect(commander, storage, retry, request, status, response, remainingRedirects) - else if (status.isSuccess) { - reporter.onResponse(request, status.intValue) - commander ! Status.Success(AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream())) - context.stop(self) - } else if (retryOnHttpStatus.contains(status.intValue)) { - storage.close() - log.debug("HttpResponse: Status {}, retrying...", status) - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - } else { - val message = s"HTTP response status ${status.intValue}, on request ${request}, ${status.defaultMessage}" - log.debug("HttpResponse: {}", message) - reporter.onFailure(request, status.intValue) - reporter.onGiveUp(request) - commander ! 
Status.Failure(new AsyncHttpClientStreamApi.RequestException(message = message, - response = AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream()))) - context.stop(self) - } - } catch { - case NonFatal(ex) => - storage.close() - log.error(ex, "HttpResponse: Failure on creating HttpResponse") - reporter.onError(request, ex) - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - } - - case chunkStart@ChunkedResponseStart(HttpResponse(status, entity, headers, _)) => try { - storage.write(entity.data.toByteArray) - if (isRedirection(status)) - handleRedirect(commander, storage, retry, request, status, chunkStart, remainingRedirects) - else if (status.isSuccess) { - context.become(accumulateChunks(commander, request, retry, storage, status, remainingRedirects) - .orElse(handleErrors(commander, request, retry, storage, remainingRedirects))) - } else if (retryOnHttpStatus.contains(status.intValue)) { - storage.close() - log.debug("ChunkedResponseStart: Status {}, retrying...", status) - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - } else { - val message = s"HTTP response status ${status.intValue}, on request ${request}, ${status.defaultMessage}" - log.debug("ChunkedResponseStart: {}", message) - reporter.onFailure(request, status.intValue) - reporter.onGiveUp(request) - commander ! Status.Failure(new AsyncHttpClientStreamApi.RequestException(message = message, - response = AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream()))) - context.stop(self) - } - } catch { - case NonFatal(ex) => - log.error(ex, "ChunkedResponseStart: Failure on creating ChunkedHttpResponse") - reporter.onError(request, ex) - context.become(retrying(commander, request, remainingRedirects)) - self ! 
retry.onError - } - } - - def accumulateChunks(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, status: StatusCode, remainingRedirects: Int): Receive = { - case message@MessageChunk(data, _) => try { - storage.write(data.toByteArray) - } catch { - case NonFatal(ex) => - storage.close() - log.error(ex, "MessageChunk: Failure on accumulate chunk data") - reporter.onError(request, ex) - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - } - - case chunkEnd: ChunkedMessageEnd => - log.debug("ChunkedMessageEnd: all data was received for request {}, status {}", request, status) - reporter.onResponse(request, status.intValue) - commander ! Status.Success(AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream())) - context.stop(self) - } - - def handleRedirect(commander: ActorRef, oldStorage: ByteStorage, oldRetry: Retry, oldRequest: Request, status: StatusCode, rawResponse: HttpResponsePart, remainingRedirects: Int): Unit = { - if (remainingRedirects <= 0) { - val message = s"HandleRedirect: exceeded redirection limit on $oldRequest with status $status" - log.warning(message) - reporter.onGiveUp(oldRequest) - commander ! 
Status.Failure(new Exception(message)) - context.stop(self) - } else { - def makeRequest(headers: List[HttpHeader]): Receive = { - oldStorage.close() - val newRemainingRedirects = remainingRedirects - 1 - headers.find(_.is("location")).map(_.value).map { newLocation => - log.debug("Making redirect to {}", newLocation) - val newRequest = oldRequest.copy(url = newLocation) - executeSprayRequest(newRequest) - val newRetry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, conf = retryConf) - val newStorage = new ByteStorage() - waitingForResponse(commander, newRequest, newRetry, newStorage, newRemainingRedirects) - .orElse(handleErrors(commander, newRequest, newRetry, newStorage, newRemainingRedirects)) - }.getOrElse { - log.warning("Received redirect for request {} with headers {} without location, retrying...", oldRequest, headers) - retrying(commander, oldRequest, newRemainingRedirects) - } - } - context.become(rawResponse match { - case response@HttpResponse(status, entity, headers, _) => - makeRequest(headers) - case chunkStart@ChunkedResponseStart(HttpResponse(status, entity, headers, _)) => { - case message@MessageChunk(data, _) => - // do nothing - case chunkEnd: ChunkedMessageEnd => - context.become(makeRequest(headers)) - } - case other => - throw new Exception(s"Bug, called on $other") - }) - } - } - - } - -} diff --git a/src/main/scala/ignition/core/http/ByteStorage.scala b/src/main/scala/ignition/core/http/ByteStorage.scala deleted file mode 100644 index c137a5fe..00000000 --- a/src/main/scala/ignition/core/http/ByteStorage.scala +++ /dev/null @@ -1,114 +0,0 @@ -package ignition.core.http - -import java.io._ -import java.nio.file.{Files, Paths} -import java.util.UUID - -import org.slf4j.LoggerFactory - -import scala.util.control.NonFatal -import scala.util.{Failure, Success, Try} - -class ByteStorage(memoryThreshold: Int = 1024 * 1024 * 5) extends AutoCloseable { - - lazy val log = LoggerFactory.getLogger(getClass) - - lazy val 
tempDirPath = Files.createDirectories(Paths.get(System.getProperty("java.io.tmpdir"), "ByteStorage")) - - lazy val buffer = new ByteArrayOutputStream - - var fileStorage: Option[(File, FileOutputStream)] = None - - def write(bytes: Array[Byte]): Unit = try { - if (fileStorage.isDefined) { - writeOnFile(bytes) - } else if (buffer.size() + bytes.length > memoryThreshold) { - log.debug("Memory threshold {} reach, going to file storage", memoryThreshold) - setupFileStorage() - writeOnFile(buffer.toByteArray) - writeOnFile(bytes) - // on ByteArrayOutputStream close() takes not effect, - // but if we change the buffer impl this is the a good moment to free resources - buffer.close() - } else { - buffer.write(bytes) - } - } catch { - case NonFatal(ex) => - close() - throw ex - } - - override def close(): Unit = fileStorage match { - case Some((file, outputStream)) => try { - log.debug("Cleaning up temp file {}", file.getAbsolutePath) - outputStream.close() - file.delete() - } catch { - case NonFatal(ex) => log.warn(s"Fail to cleanup temp file ${file.getAbsolutePath}", ex) - } - case None => - log.debug("Cleaning up memory buffer") - buffer.close() - } - - private def setupFileStorage(): Unit = if (fileStorage.isEmpty) { - tryCreateTempFile match { - case Success(storage) => fileStorage = Option(storage) - case Failure(ex) => throw ex - } - } else { - throw new IllegalStateException("File storage already setup") - } - - private def tryCreateTempFile: Try[(File, FileOutputStream)] = Try { - val tempFile = File.createTempFile(s"temp_byte_storage_${UUID.randomUUID().toString}", ".temp", tempDirPath.toFile) - tempFile.deleteOnExit() - log.debug("Creating temp file {}", tempFile.getAbsolutePath) - (tempFile, new FileOutputStream(tempFile)) - } - - private def writeOnFile(bytes: Array[Byte]): Unit = fileStorage match { - case Some((_, outputStream)) => outputStream.write(bytes) - case None => throw new IllegalStateException("File storage not initialized") - } - - def 
getInputStream(): InputStream = fileStorage match { - case Some((file, outputStream)) => try { - outputStream.close() - new DeleteOnCloseFileInputStream(file) - } catch { - case NonFatal(ex) => - log.error("Fail to create InputStream", ex) - close() - throw ex - } - case None => new ByteArrayInputStream(buffer.toByteArray) - } - - override def finalize() = try { - fileStorage match { - case Some((file, outputStream)) => - log.debug("Cleaning up temp file {}", file.getAbsolutePath) - outputStream.close() - file.delete() - case None => - } - } finally { - super.finalize() - } - -} - -private class DeleteOnCloseFileInputStream(file: File) extends FileInputStream(file) { - lazy val log = LoggerFactory.getLogger(getClass) - override def close() = try { - log.debug("Cleaning up file {}", file.getAbsolutePath) - file.delete() - } catch { - case NonFatal(ex) => - log.warn(s"Failed to clean up file ${file.getAbsolutePath}", ex) - } finally { - super.close() - } -} \ No newline at end of file diff --git a/src/main/scala/ignition/core/http/Caching.scala b/src/main/scala/ignition/core/http/Caching.scala deleted file mode 100644 index 112791a5..00000000 --- a/src/main/scala/ignition/core/http/Caching.scala +++ /dev/null @@ -1,22 +0,0 @@ -package ignition.core.http - -import org.slf4j.LoggerFactory -import spray.caching.Cache - -import scala.concurrent._ -import scala.util.Failure - -trait Caching[T] { - val log = LoggerFactory.getLogger(classOf[Caching[T]]) - - val cache: Cache[T] - import ExecutionContext.Implicits.global - def fetchCache[K](key: K, f: () => Future[T]): Future[T] = cache(key) { - f.apply andThen { - case Failure(e) => { - cache.remove(key) - log.info(s"Removed $key from cache due to an exception: $e") - } - } - } -} diff --git a/src/main/scala/ignition/core/http/Retry.scala b/src/main/scala/ignition/core/http/Retry.scala deleted file mode 100644 index 1c94828b..00000000 --- a/src/main/scala/ignition/core/http/Retry.scala +++ /dev/null @@ -1,84 +0,0 @@ -package 
ignition.core.http - -import java.util.concurrent.TimeUnit - -import org.joda.time.DateTime - -import scala.concurrent.duration.{Duration, FiniteDuration, _} -import scala.language.postfixOps -import scala.util.Random - -object Retry { - - sealed trait State - case object Timeout extends State - case object Error extends State - - def exponentialBackOff(base: Int, - exponent: Int, - initialBackoff: FiniteDuration, - maxBackoff: FiniteDuration, - maxRandom: FiniteDuration): FiniteDuration = { - val randomMillis = maxRandom.toMillis.toInt - val random = if (randomMillis > 0) - FiniteDuration(Random.nextInt(randomMillis), TimeUnit.MILLISECONDS) - else - FiniteDuration(0, TimeUnit.MILLISECONDS) - - val calculated = scala.math.pow(base, exponent).round * (random + initialBackoff) - calculated.min(maxBackoff) - } - -} - -case class RetryConf(initialTimeoutBackoff: FiniteDuration = 100 milliseconds, - maxErrors: Int = 10, - initialBackoffOnError: FiniteDuration = 100 milliseconds, - timeoutMultiplicationFactor: Int = 2, - errorMultiplicationFactor: Int = 2, - maxBackoff: FiniteDuration = 1 minute, - maxRandom: FiniteDuration = 30 milliseconds) - -case class Retry(conf: RetryConf, - startTime: DateTime, - timeout: FiniteDuration, - state: Retry.State = Retry.Timeout, - timeoutCount: Int = 0, - errorsCount: Int = 0) { - - import Retry._ - - protected def now = DateTime.now - - private def errorBackoff = - exponentialBackOff(conf.errorMultiplicationFactor, Math.max(errorsCount - 1, 0), conf.initialBackoffOnError, conf.maxBackoff, conf.maxRandom) - private def timeoutBackoff = - exponentialBackOff(conf.timeoutMultiplicationFactor, Math.max(timeoutCount - 1, 0), conf.initialTimeoutBackoff, conf.maxBackoff, conf.maxRandom) - - def onError(): Retry = - copy(errorsCount = errorsCount + 1, state = Retry.Error) - - def onTimeout(): Retry = copy(timeoutCount = timeoutCount + 1, state = Retry.Timeout) - - def backoff(): FiniteDuration = state match { - case Timeout => timeoutBackoff 
- case Error => errorBackoff - } - - private def canRetryMore(durations: FiniteDuration*): Boolean = { - val maxTime = startTime.plusMillis(timeout.toMillis.toInt) - val nextEstimatedTime = now.plusMillis(durations.map(_.toMillis.toInt).sum) - nextEstimatedTime.isBefore(maxTime) - } - - // This is an approximation and we are ignoring the time waiting on backoff. - // In this way we are overestimating the average request duration, which is fine because it's better to abort early than wait too much time exceed AskTimeouts - private def averageRequestDuration = - Duration((now.getMillis - startTime.getMillis) / Math.max(timeoutCount + errorsCount, 1), TimeUnit.MILLISECONDS) - - def shouldGiveUp(): Boolean = state match { - case Timeout => !canRetryMore(averageRequestDuration, timeoutBackoff) - case Error => !canRetryMore(averageRequestDuration, errorBackoff) || errorsCount > conf.maxErrors - } - -} \ No newline at end of file diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 1d06505d..e07ba54f 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -3,7 +3,7 @@ package ignition.core.jobs.utils import java.io.InputStream import com.amazonaws.auth.DefaultAWSCredentialsProviderChain -import com.amazonaws.services.s3.AmazonS3Client +import com.amazonaws.services.s3.{AmazonS3, AmazonS3Builder, AmazonS3Client} import com.amazonaws.services.s3.model.{ListObjectsRequest, ObjectListing, S3ObjectSummary} import ignition.core.utils.CollectionUtils._ import ignition.core.utils.DateUtils._ @@ -49,7 +49,8 @@ object SparkContextUtils { def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) } - private lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new DefaultAWSCredentialsProviderChain()) + private lazy val 
amazonS3ClientFromEnvironmentVariables: AmazonS3 = + AmazonS3Client.builder().withCredentials(new DefaultAWSCredentialsProviderChain()).build() private def close(inputStream: InputStream, path: String): Unit = { try { @@ -453,7 +454,7 @@ object SparkContextUtils { } def s3ListCommonPrefixes(path: S3SplittedPath, delimiter: String = "/") - (implicit s3: AmazonS3Client): Stream[S3SplittedPath] = { + (implicit s3: AmazonS3): Stream[S3SplittedPath] = { def inner(current: ObjectListing): Stream[String] = if (current.isTruncated) { logger.trace(s"list common prefixed truncated for ${path.bucket} ${path.key}: ${current.getCommonPrefixes}") @@ -468,7 +469,7 @@ object SparkContextUtils { } def s3ListObjects(path: S3SplittedPath) - (implicit s3: AmazonS3Client): Stream[S3ObjectSummary] = { + (implicit s3: AmazonS3): Stream[S3ObjectSummary] = { def inner(current: ObjectListing): Stream[S3ObjectSummary] = if (current.isTruncated) { logger.trace(s"list objects truncated for ${path.bucket} ${path.key}: $current") @@ -487,7 +488,7 @@ object SparkContextUtils { inclusiveEndDate: Boolean = true, endDate: Option[DateTime] = None, ignoreHours: Boolean = true) - (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): Stream[WithOptDate[S3SplittedPath]] = { + (implicit s3: AmazonS3, pathDateExtractor: PathDateExtractor): Stream[WithOptDate[S3SplittedPath]] = { def isGoodDate(date: DateTime): Boolean = { val startDateToCompare = startDate.map(date => if (ignoreHours) date.withTimeAtStartOfDay() else date) @@ -529,7 +530,7 @@ object SparkContextUtils { inclusiveEndDate: Boolean, endDate: Option[DateTime], exclusionPattern: Option[String]) - (implicit s3: AmazonS3Client, dateExtractor: PathDateExtractor): Stream[WithOptDate[Array[S3ObjectSummary]]] = { + (implicit s3: AmazonS3, dateExtractor: PathDateExtractor): Stream[WithOptDate[Array[S3ObjectSummary]]] = { S3SplittedPath.from(path) match { diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala 
b/src/main/scala/ignition/core/utils/FutureUtils.scala index 684c950b..f12918db 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,7 +1,5 @@ package ignition.core.utils -import akka.actor.ActorSystem - import scala.concurrent.duration.FiniteDuration import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} import scala.util.control.NonFatal @@ -33,9 +31,6 @@ object FutureUtils { future.map(v => Success(v)).recover { case NonFatal(e) => Failure(e) } } - def withTimeout(timeout: => Throwable)(implicit duration: FiniteDuration, system: ActorSystem): Future[V] = { - Future.firstCompletedOf(Seq(future, akka.pattern.after(duration, system.scheduler)(Future.failed(timeout))(system.dispatcher)))(system.dispatcher) - } } implicit class TryFutureImprovements[V](future: Try[Future[V]]) { diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala deleted file mode 100644 index 020ab6f4..00000000 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ /dev/null @@ -1,62 +0,0 @@ -package ignition.core.utils - -import java.util.Properties - -import org.jets3t.service.impl.rest.httpclient.RestS3Service -import org.jets3t.service.model.{S3Object, StorageObject} -import org.jets3t.service.security.AWSCredentials -import org.jets3t.service.{Constants, Jets3tProperties} - - -class S3Client { - - val jets3tProperties = { - val jets3tProperties = Jets3tProperties.getInstance(Constants.JETS3T_PROPERTIES_FILENAME) - val properties = new Properties() -// properties.put("httpclient.max-connections", "2") // The maximum number of simultaneous connections to allow globally -// properties.put("httpclient.retry-max", "10") // How many times to retry connections when they fail with IO errors -// properties.put("httpclient.socket-timeout-ms", "30000") // How many milliseconds to wait before a connection times out. 0 means infinity. 
- - jets3tProperties.loadAndReplaceProperties(properties, "ignition'") - jets3tProperties - } - - val service = new RestS3Service( - new AWSCredentials(System.getenv("AWS_ACCESS_KEY_ID"), System.getenv("AWS_SECRET_ACCESS_KEY")), - null, null, jets3tProperties - ) - - def writeContent(bucket: String, key: String, content: String, contentType: String = "text/plain"): S3Object = { - val obj = new S3Object(key, content) - obj.setContentType(contentType) - service.putObject(bucket, obj) - } - - def readContent(bucket: String, key: String): S3Object = { - service.getObject(bucket, key, null, null, null, null, null, null) - } - - def list(bucket: String, key: String): Array[StorageObject] = { - service.listObjectsChunked(bucket, key, null, 99999L, null, true).getObjects - } - - def copyFile(sourceBucket: String, sourceKey: String, - destBucket: String, destKey: String, - destContentType: Option[String] = None, - destContentEncoding: Option[String] = None): Unit = { - val destFile = new S3Object(destKey) - val replaceMetaData = destContentType.isDefined || destContentEncoding.isDefined - destContentEncoding.foreach(encoding => destFile.setContentEncoding(encoding)) - destContentType.foreach(contentType => destFile.setContentType(contentType)) - service.copyObject(sourceBucket, sourceKey, destBucket, destFile, replaceMetaData) - } - - def fileExists(bucket: String, key: String): Boolean = { - try { - service.getObjectDetails(bucket, key, null, null, null, null) - true - } catch { - case e: org.jets3t.service.S3ServiceException if e.getResponseCode == 404 => false - } - } -} diff --git a/src/main/scala/ignition/core/utils/TelemetryCache.scala b/src/main/scala/ignition/core/utils/TelemetryCache.scala deleted file mode 100644 index d86f98bc..00000000 --- a/src/main/scala/ignition/core/utils/TelemetryCache.scala +++ /dev/null @@ -1,45 +0,0 @@ -package ignition.core.utils - -import ignition.core.utils.TelemetryCache.TelemetryCacheReporter -import spray.caching.Cache - -import 
scala.concurrent.{ExecutionContext, Future} - -object TelemetryCache { - - def apply[V](cacheName: String, wrapped: Cache[V], reporter: TelemetryCacheReporter): Cache[V] = - new TelemetryCache[V](cacheName, wrapped, reporter) - - trait TelemetryCacheReporter { - def onHit(name: String): Unit - def onMiss(name: String): Unit - } - -} - -class TelemetryCache[V](cacheName: String, wrapped: Cache[V], reporter: TelemetryCacheReporter) extends Cache[V] { - - override def apply(key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { - val value = wrapped.get(key) - if (value.isDefined) { - reporter.onHit(cacheName) - value.get - } else { - reporter.onMiss(cacheName) - wrapped.apply(key, genValue) - } - } - - override def get(key: Any): Option[Future[V]] = wrapped.get(key) - - override def clear(): Unit = wrapped.clear() - - override def size: Int = wrapped.size - - override def remove(key: Any): Option[Future[V]] = wrapped.remove(key) - - override def keys: Set[Any] = wrapped.keys - - override def ascendingKeys(limit: Option[Int]): Iterator[Any] = wrapped.ascendingKeys(limit) - -} diff --git a/src/main/scala/ignition/core/utils/URLUtils.scala b/src/main/scala/ignition/core/utils/URLUtils.scala index f66a3f03..4a0ae28c 100644 --- a/src/main/scala/ignition/core/utils/URLUtils.scala +++ b/src/main/scala/ignition/core/utils/URLUtils.scala @@ -1,10 +1,8 @@ package ignition.core.utils -import java.net.{URL, URLDecoder, URLEncoder} +import java.net.{URLDecoder, URLEncoder} import org.apache.http.client.utils.URIBuilder -import spray.http.Uri -import spray.http.Uri.Query import scala.util.Try @@ -18,23 +16,6 @@ object URLUtils { def sanitizePathSegment(segment: String): Try[String] = Try { URLEncoder.encode(URLDecoder.decode(segment, "UTF-8"), "UTF-8").replace("+", "%20") } - def parseUri(urlStr: String): Try[Uri] = { - for { - url <- Try(new URL(urlStr)) - rawSegments = url.getPath.split("/") - saneSegments = rawSegments.map(sanitizePathSegment) - if 
saneSegments.forall(_.isSuccess) - sanePath = saneSegments.map(_.get).mkString("/") - } yield Uri.from( - scheme = url.getProtocol, - userinfo = Option(url.getUserInfo).getOrElse(""), - host = url.getHost, - port = Seq(url.getPort, 0).max, - path = sanePath, - query = Query(Option(url.getQuery)), - fragment = Option(url.getRef)) - } - def addParametersToUrl(url: String, partnerParams: Map[String, String]): String = { val builder = new URIBuilder(url.trim) partnerParams.foreach { case (k, v) => builder.addParameter(k, v) } diff --git a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala deleted file mode 100644 index 9fa476f9..00000000 --- a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala +++ /dev/null @@ -1,134 +0,0 @@ -// Note: -// For ignition.core we added two methods to satisfy ExpiringMultipleLevelCache.LocalCache[V] - -/* - * Copyright © 2011-2013 the spray project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package spray.caching - -import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap -import spray.util.Timestamp - -import scala.annotation.tailrec -import scala.collection.JavaConverters._ -import scala.concurrent.duration.Duration -import scala.concurrent.{ExecutionContext, Future, Promise} -import scala.util.{Failure, Success} - -final class ExpiringLruLocalCache[V](maxCapacity: Long, - initialCapacity: Int = 16, - timeToLive: Duration = Duration.Inf, - timeToIdle: Duration = Duration.Inf) extends Cache[V] with ignition.core.cache.ExpiringMultiLevelCache.LocalCache[V] { - require(!timeToLive.isFinite || !timeToIdle.isFinite || timeToLive > timeToIdle, - s"timeToLive($timeToLive) must be greater than timeToIdle($timeToIdle)") - - private[caching] val store = new ConcurrentLinkedHashMap.Builder[Any, Entry[V]] - .initialCapacity(initialCapacity) - .maximumWeightedCapacity(maxCapacity) - .build() - - @tailrec - def get(key: Any): Option[Future[V]] = store.get(key) match { - case null ⇒ None - case entry if (isAlive(entry)) ⇒ - entry.refresh() - Some(entry.future) - case entry ⇒ - // remove entry, but only if it hasn't been removed and reinserted in the meantime - if (store.remove(key, entry)) None // successfully removed - else get(key) // nope, try again - } - - def apply(key: Any, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] = { - def insert() = { - val newEntry = new Entry(Promise[V]()) - val valueFuture = - store.put(key, newEntry) match { - case null ⇒ genValue() - case entry ⇒ - if (isAlive(entry)) { - // we date back the new entry we just inserted - // in the meantime someone might have already seen the too fresh timestamp we just put in, - // but since the original entry is also still alive this doesn't matter - newEntry.created = entry.created - entry.future - } else genValue() - } - valueFuture.onComplete { value ⇒ - newEntry.promise.tryComplete(value) - // in case of exceptions we remove the cache entry (i.e. 
try again later) - if (value.isFailure) store.remove(key, newEntry) - } - newEntry.promise.future - } - store.get(key) match { - case null ⇒ insert() - case entry if (isAlive(entry)) ⇒ - entry.refresh() - entry.future - case entry ⇒ insert() - } - } - - def remove(key: Any) = store.remove(key) match { - case null ⇒ None - case entry if (isAlive(entry)) ⇒ Some(entry.future) - case entry ⇒ None - } - - def clear(): Unit = { store.clear() } - - def keys: Set[Any] = store.keySet().asScala.toSet - - def ascendingKeys(limit: Option[Int] = None) = - limit.map { lim ⇒ store.ascendingKeySetWithLimit(lim) } - .getOrElse(store.ascendingKeySet()) - .iterator().asScala - - def size = store.size - - private def isAlive(entry: Entry[V]) = - (entry.created + timeToLive).isFuture && - (entry.lastAccessed + timeToIdle).isFuture - - // Method required by ExpiringMultipleLevelCache.LocalCache - override def set(key: Any, value: V): Unit = { - val newEntry = new Entry(Promise[V]()) - newEntry.promise.trySuccess(value) - store.put(key, newEntry) match { - case null => - // Nothing to do - case oldEntry => - // If the old promise is pending, complete it with our future - oldEntry.promise.trySuccess(value) - } - } -} - -private[caching] class ExpiringLruLocalCacheEntry[T](val promise: Promise[T]) { - @volatile var created = Timestamp.now - @volatile var lastAccessed = Timestamp.now - def future = promise.future - def refresh(): Unit = { - // we dont care whether we overwrite a potentially newer value - lastAccessed = Timestamp.now - } - override def toString = future.value match { - case Some(Success(value)) ⇒ value.toString - case Some(Failure(exception)) ⇒ exception.toString - case None ⇒ "pending" - } -} diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala deleted file mode 100644 index 9fd77d78..00000000 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala 
+++ /dev/null @@ -1,213 +0,0 @@ -package ignition.core.cache - -import java.io.FileNotFoundException -import java.util.concurrent.atomic.AtomicInteger - -import akka.actor.ActorSystem -import ignition.core.cache.ExpiringMultiLevelCache.TimestampedValue -import org.scalatest.concurrent.ScalaFutures -import org.scalatest.{FlatSpec, Matchers} -import spray.caching.ExpiringLruLocalCache - -import scala.concurrent.ExecutionContext.Implicits.global -import scala.concurrent.duration._ -import scala.concurrent.{Await, Future} - -class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFutures { - case class Data(s: String) - implicit val scheduler = ActorSystem().scheduler - - "ExpiringMultipleLevelCache" should "calculate a value on cache miss and return it" in { - val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](1.minute, Option(local)) - Await.result(cache("key", () => Future.successful(Data("success"))), 1.minute) shouldBe Data("success") - } - - it should "calculate a value on cache miss and return a failed future of the calculation" in { - val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](1.minute, Option(local)) - - class MyException(s: String) extends Exception(s) - - val eventualCache = cache("key", () => Future.failed(new MyException("some failure"))) - whenReady(eventualCache.failed) { failure => - failure shouldBe a [MyException] - } - } - - it should "calculate a value on cache miss after ttl" in { - val cacheTtl = 3.seconds - val myRequestCount = new AtomicInteger() - - def myRequest(): Future[Data] = { - myRequestCount.incrementAndGet() - Future.successful(Data("success")) - } - - val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](ttl = cacheTtl, localCache = Option(local)) - - whenReady(cache("key", myRequest)) { result => - result shouldBe Data("success") 
- } - - myRequestCount.get() shouldBe 1 - - whenReady(cache("key", myRequest)) { result => - result shouldBe Data("success") - } - - myRequestCount.get() shouldBe 1 - - Thread.sleep(cacheTtl.toMillis + 10) - - whenReady(cache("key", myRequest)) { result => - result shouldBe Data("success") - } - - myRequestCount.get() shouldBe 2 - - whenReady(cache("key", myRequest)) { result => - result shouldBe Data("success") - } - - myRequestCount.get() shouldBe 2 - } - - it should "calculate a value on cache miss just once, the second call should be from cache hit" in { - var myFailedRequestCount: Int = 0 - - class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception - def myFailedRequest(): Future[Nothing] = { - myFailedRequestCount = myFailedRequestCount + 1 - Future.failed(new MyException("some failure")) - } - - val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = true, ttlCachedErrors = 9.seconds) - - val eventualCache = cache("key", myFailedRequest) - whenReady(eventualCache.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - val eventualCache2 = cache("key", myFailedRequest) - whenReady(eventualCache2.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - val eventualCache3 = cache("key", myFailedRequest) - whenReady(eventualCache3.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - val eventualCache4 = cache("key", myFailedRequest) - whenReady(eventualCache4.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - val eventualCache5 = cache("key", myFailedRequest) - whenReady(eventualCache5.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - } - - it should "calculate a value on cache miss on every request" 
in { - var myFailedRequestCount: Int = 0 - - class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception - def myFailedRequest(): Future[Nothing] = { - myFailedRequestCount = myFailedRequestCount + 1 - Future.failed(new MyException("some failure")) - } - - val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = false) - - val eventualCache = cache("key", myFailedRequest) - whenReady(eventualCache.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - val eventualCache2 = cache("key", myFailedRequest) - whenReady(eventualCache2.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 2 - } - - val eventualCache3 = cache("key", myFailedRequest) - whenReady(eventualCache3.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 3 - } - - val eventualCache4 = cache("key", myFailedRequest) - whenReady(eventualCache4.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 4 - } - - val eventualCache5 = cache("key", myFailedRequest) - whenReady(eventualCache5.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 5 - } - - } - - it should "calculate a value on cache miss, then wait ttlCachedError to get a cache miss again" in { - var myFailedRequestCount: Int = 0 - - class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception - def myFailedRequest(): Future[Nothing] = { - myFailedRequestCount = myFailedRequestCount + 1 - Future.failed(new MyException("some failure")) - } - - val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = true, ttlCachedErrors = 4.seconds) - - val eventualCache = cache("key", myFailedRequest) - 
whenReady(eventualCache.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - val eventualCache2 = cache("key", myFailedRequest) - whenReady(eventualCache2.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - Thread.sleep(5000) - - val eventualCache3 = cache("key", myFailedRequest) - whenReady(eventualCache3.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 2 - } - - val eventualCache4 = cache("key", myFailedRequest) - whenReady(eventualCache4.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 2 - } - - Thread.sleep(500) - - val eventualCache5 = cache("key", myFailedRequest) - whenReady(eventualCache5.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 2 - } - - } - -} diff --git a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala deleted file mode 100644 index fb774b6e..00000000 --- a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala +++ /dev/null @@ -1,15 +0,0 @@ -package ignition.core.http - -import ignition.core.http.AsyncHttpClientStreamApi.Request -import org.scalatest.{FunSpec, Matchers} - -import scala.util.Success - -class AsyncHttpClientStreamApiSpec extends FunSpec with Matchers { - - it("should do the best to parse the provided uri") { - val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" - val request = Request(url) - request.uri.toString shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" - } -} diff --git a/src/test/scala/ignition/core/http/RetrySpec.scala b/src/test/scala/ignition/core/http/RetrySpec.scala deleted file mode 100644 index 88528568..00000000 --- a/src/test/scala/ignition/core/http/RetrySpec.scala +++ 
/dev/null @@ -1,39 +0,0 @@ -package ignition.core.http - -import org.joda.time.DateTime -import org.scalatest.{FlatSpec, Matchers} - -import scala.concurrent.duration._ - -class RetrySpec extends FlatSpec with Matchers { - "Retry" should "return the initial backoff" in { - val now = DateTime.now - val timeout = 60.seconds - - val retry = Retry(RetryConf(initialBackoffOnError = 123.milliseconds, initialTimeoutBackoff = 456.milliseconds, maxRandom = 0.seconds), now, timeout) - - retry.onError().backoff() shouldBe 123.millisecond - retry.onTimeout().backoff() shouldBe 456.millisecond - } - - it should "multiply by the factor on second time" in { - - val now = DateTime.now - val timeout = 60.seconds - - val retry = Retry(RetryConf(initialBackoffOnError = 123.milliseconds, initialTimeoutBackoff = 456.milliseconds, maxRandom = 0.seconds, timeoutMultiplicationFactor = 3, errorMultiplicationFactor = 5), now, timeout) - - retry.onError().onError().backoff() shouldBe (123 * 5).millisecond - retry.onTimeout().onTimeout().backoff() shouldBe (456 * 3).millisecond - } - - it should "not explode if called with no errors or timeouts" in { - val now = DateTime.now - val timeout = 60.seconds - - val retry = Retry(RetryConf(maxRandom = 0.seconds), now, timeout) - - retry.backoff() shouldBe 100.milliseconds - } - -} diff --git a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala index 114da15f..a4b4f10d 100644 --- a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala @@ -45,33 +45,4 @@ class URLUtilsSpec extends FlatSpec with Matchers { finalUrl shouldEqual "https://www.petlove.com.br/carrinho?test=true#/add/variant_sku/3105748-1,3107615/quantity/1?t=1" } - it should "percent encode url paths" in { - val tests = Seq( - "http://images1.petlove.com.br/products/170301/small/Ração-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-Cães.jpg", - 
"http://images0.petlove.com.br/products/175408/small/Ração-Nestlé-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sensível.jpg", - "http://images3.petlove.com.br/products/171539/small/Ração-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-Cálculos-Urinários.jpg" - ) - - val expectations = Seq( - "http://images1.petlove.com.br/products/170301/small/Ra%C3%A7%C3%A3o-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-C%C3%A3es.jpg", - "http://images0.petlove.com.br/products/175408/small/Ra%C3%A7%C3%A3o-Nestl%C3%A9-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sens%C3%ADvel.jpg", - "http://images3.petlove.com.br/products/171539/small/Ra%C3%A7%C3%A3o-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-C%C3%A1lculos-Urin%C3%A1rios.jpg" - ) - - tests.zip(expectations).foreach { - case (url, expected) => URLUtils.parseUri(url).map(_.toString) shouldBe Success(expected) - } - } - - it should "not encode percent characters in url path" in { - val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" - val sane = URLUtils.parseUri(url).map(_.toString) - sane shouldBe Success("http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf") - } - - it should "encode space characters with percent in URL path" in { - val url = "http://www.example.com/Pentagrama+Invertido.xml?q=blah+bleh" - val sane = URLUtils.parseUri(url).map(_.toString) - sane shouldBe Success("http://www.example.com/Pentagrama%20Invertido.xml?q=blah+bleh") - } } diff --git a/tools/cluster.py b/tools/cluster.py index 060bce71..2b0e38d4 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,7 +49,7 @@ default_ami = 'ami-611e7976' default_master_ami = '' default_env = 'dev' -default_spark_version = '2.3.2' +default_spark_version = '2.4.0' default_hdfs_version = '2.7.6' default_spark_download_source = 
'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' From e40cf6bf16c2ca3af3eff02e2147cafd7eb06163 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 28 Nov 2018 09:59:30 -0200 Subject: [PATCH 225/268] fix runtime classnotfound issue (#156) --- build.sbt | 10 ++++++---- .../ignition/core/jobs/utils/SparkContextUtils.scala | 5 ++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/build.sbt b/build.sbt index b27321ea..51044b57 100644 --- a/build.sbt +++ b/build.sbt @@ -11,9 +11,13 @@ parallelExecution in Test := false test in assembly := {} -libraryDependencies += ("org.apache.spark" %% "spark-sql" % "2.4.0" % "provided") +libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.0" % "provided" -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") +libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided" + +libraryDependencies += "org.apache.hadoop" % "hadoop-aws" % "2.7.6" % "provided" + +libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.7.4" % "provided" libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.2.27" @@ -25,6 +29,4 @@ libraryDependencies += "org.joda" % "joda-convert" % "1.8.2" libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.25" -libraryDependencies += "com.amazonaws" % "aws-java-sdk-s3" % "1.11.456" - libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.3" diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index e07ba54f..3e4ff961 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -3,8 +3,8 @@ package ignition.core.jobs.utils import java.io.InputStream import com.amazonaws.auth.DefaultAWSCredentialsProviderChain 
-import com.amazonaws.services.s3.{AmazonS3, AmazonS3Builder, AmazonS3Client} import com.amazonaws.services.s3.model.{ListObjectsRequest, ObjectListing, S3ObjectSummary} +import com.amazonaws.services.s3.{AmazonS3, AmazonS3Client} import ignition.core.utils.CollectionUtils._ import ignition.core.utils.DateUtils._ import ignition.core.utils.ExceptionUtils._ @@ -49,8 +49,7 @@ object SparkContextUtils { def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) } - private lazy val amazonS3ClientFromEnvironmentVariables: AmazonS3 = - AmazonS3Client.builder().withCredentials(new DefaultAWSCredentialsProviderChain()).build() + private lazy val amazonS3ClientFromEnvironmentVariables: AmazonS3 = new AmazonS3Client(new DefaultAWSCredentialsProviderChain()) private def close(inputStream: InputStream, path: String): Unit = { try { From ca56cf60fabec8c03012757e7097e6b2b6849bf8 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 28 Nov 2018 23:04:36 -0200 Subject: [PATCH 226/268] python3 compatibility --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 2b0e38d4..a74951bc 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -802,7 +802,7 @@ def killall_jobs(cluster_name, key_file=default_key_file, def check_flintrock_installation(): try: - with file('/dev/null', 'w') as devnull: + with open('/dev/null', 'w') as devnull: call_ec2_script(['--help'], 1 , 1, stdout=devnull) except: setup = os.path.join(ec2_script_base_path(), 'setup.py') From b1d5ae096b74f63fa6bda9e35bd1fb24b5f899a2 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 4 Dec 2018 21:06:32 -0200 Subject: [PATCH 227/268] The quiet flag makes debugging some issues pretty hard --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index a74951bc..add0f633 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -115,7 +115,7 @@ def logged_call(args, tries=1): def ssh_call(user, host, key_file, args=(), allocate_terminal=True, get_output=False): - base = ['ssh', '-q'] + base = ['ssh'] if allocate_terminal: base += ['-tt'] base += ['-i', key_file, From ba3b39f1078de8f2996313312381b2a4a0a4ad10 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 19 Dec 2018 20:31:15 -0200 Subject: [PATCH 228/268] Added RDD-like .values --- src/main/scala/ignition/core/utils/CollectionUtils.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index 01960d3d..c3b87d4c 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -139,6 +139,8 @@ object CollectionUtils { .mapValues(_.map { case (k, v) => v }.reduce(fn)) .toList } + def values: List[V] = + iterable.map { case (k, v) => v }.toList } From 04964c088edd0f6ce26a124c3c8e07c6e7c2360a Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 20 Dec 2018 13:48:29 -0200 Subject: [PATCH 229/268] Fix extra data cluster saving so scripts like job runner can reuse the cluster --- tools/cluster.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 5a08877e..f27d74f9 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -172,18 +172,22 @@ def save_cluster_args(master, key_file, remote_user, all_args): args=["echo '{}' > /tmp/cluster_args.json".format(json.dumps(all_args))]) def load_cluster_args(master, key_file, remote_user): - return json.loads(ssh_call(user=remote_user, host=master, key_file=key_file, + return json.loads(ssh_call(user=remote_user, host=master, key_file=key_file, allocate_terminal=False, args=["cat", "/tmp/cluster_args.json"], get_output=True)) # Util to be used by external scripts def save_extra_data(data_str, cluster_name, region=default_region, key_file=default_key_file, remote_user=default_remote_user, master=None): master = master or get_master(cluster_name, region=region) - ssh_call(user=remote_user, host=master, key_file=key_file, - args=["echo '{}' > /tmp/cluster_extra_data.txt".format(data_str)]) + cmd = ['ssh', '-o', 'StrictHostKeyChecking=no', remote_user + '@' + master , '-i', key_file, '/bin/bash', '-c', 'cat > /tmp/cluster_extra_data.txt'] + p = subprocess.Popen(cmd, stdin=subprocess.PIPE) + p.communicate(data_str) + if p.wait() != 0: + raise Exception('Error saving extra data on master') + def load_extra_data(cluster_name, region=default_region, key_file=default_key_file, remote_user=default_remote_user, master=None): master = master or get_master(cluster_name, region=region) - return ssh_call(user=remote_user, host=master, key_file=key_file, + return ssh_call(user=remote_user, host=master, key_file=key_file, allocate_terminal=False, args=["cat", "/tmp/cluster_extra_data.txt"], get_output=True) From debd6afd4d3cfa4d515cb033c04c6b12d70c0704 Mon Sep 17 00:00:00 2001 From: "Allan 
Douglas R. de Oliveira" Date: Wed, 9 Jan 2019 20:01:41 -0200 Subject: [PATCH 230/268] Added new command and made remove files from collect a parameter --- tools/cluster.py | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index f27d74f9..261b2b85 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -436,7 +436,8 @@ def job_run(cluster_name, job_name, job_mem, kill_on_failure=False, destroy_cluster=False, region=default_region, - driver_heap_size=default_driver_heap_size): + driver_heap_size=default_driver_heap_size, + remove_files=True): utc_job_date_example = '2014-05-04T13:13:10Z' if utc_job_date and len(utc_job_date) != len(utc_job_date_example): @@ -503,7 +504,7 @@ def job_run(cluster_name, job_name, job_mem, region=region, job_timeout_minutes=job_timeout_minutes, remote_user=remote_user, remote_control_dir=remote_control_dir, - collect_results_dir=collect_results_dir) + collect_results_dir=collect_results_dir, remove_files=remove_files) except JobFailure as e: failed = True failed_exception = e @@ -666,16 +667,18 @@ def collect_job_results(cluster_name, job_name, job_tag, region=default_region, master=None, remote_user=default_remote_user, remote_control_dir=default_remote_control_dir, - collect_results_dir=default_collect_results_dir): + collect_results_dir=default_collect_results_dir, + remove_files=False): master = master or get_master(cluster_name, region=region) job_with_tag = get_job_with_tag(job_name, job_tag) job_control_dir = get_job_control_dir(remote_control_dir, job_with_tag) + # Keep the RUNNING file so we can kill the job if needed + args = ['--remove-source-files', '--exclude', 'RUNNING'] if remove_files else [] rsync_call(user=remote_user, host=master, - # Keep the RUNNING file so we can kill the job if needed - args=['--remove-source-files', '--exclude', 'RUNNING'], + args=args, key_file=key_file, 
dest_local=with_leading_slash(collect_results_dir), remote_path=job_control_dir) @@ -683,13 +686,35 @@ def collect_job_results(cluster_name, job_name, job_tag, return os.path.join(collect_results_dir, os.path.basename(job_control_dir)) +@named('collect-all-results') +def collect_all_job_results(cluster_name, + key_file=default_key_file, + region=default_region, + master=None, remote_user=default_remote_user, + remote_control_dir=default_remote_control_dir, + collect_results_dir=default_collect_results_dir, + remove_files=False): + master = master or get_master(cluster_name, region=region) + + # Keep the RUNNING file so we can kill the job if needed + args = ['--remove-source-files', '--exclude', 'RUNNING'] if remove_files else [] + rsync_call(user=remote_user, + host=master, + args=args, + key_file=key_file, + dest_local=with_leading_slash(collect_results_dir), + remote_path=with_leading_slash(remote_control_dir)) + + return collect_results_dir + + @named('wait-for') def wait_for_job(cluster_name, job_name, job_tag, key_file=default_key_file, master=None, remote_user=default_remote_user, region=default_region, remote_control_dir=default_remote_control_dir, collect_results_dir=default_collect_results_dir, - job_timeout_minutes=0, max_failures=5, seconds_to_sleep=60): + job_timeout_minutes=0, max_failures=5, seconds_to_sleep=60, remove_files=True): master = master or get_master(cluster_name, region=region) @@ -714,7 +739,7 @@ def collect(show_tail): key_file=key_file, region=region, master=master, remote_user=remote_user, remote_control_dir=remote_control_dir, - collect_results_dir=collect_results_dir) + collect_results_dir=collect_results_dir, remove_files=remove_files) log.info('Jobs results saved on: {}'.format(dest_log_dir)) if show_tail: output_log = os.path.join(dest_log_dir, 'output.log') @@ -852,7 +877,7 @@ def check_flintrock_installation(): parser = ArghParser() parser.add_commands([launch, destroy, get_master, ssh_master, tag_cluster_instances, 
health_check, exec_shell]) parser.add_commands([job_run, job_local_yarn_run, job_attach, wait_for_job, - kill_job, killall_jobs, collect_job_results], namespace="jobs") + kill_job, killall_jobs, collect_job_results, collect_all_job_results], namespace="jobs") if __name__ == '__main__': check_flintrock_installation() From 1ad9969bc4572768a6b3c10bed1731c43f46c6c0 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 29 Jan 2019 20:33:08 -0200 Subject: [PATCH 231/268] Make extra args really be usable --- remote_hook.sh | 3 ++- .../scala/ignition/core/jobs/CoreJobRunner.scala | 6 +++--- tools/cluster.py | 13 ++++++++----- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 0a5a2cb8..1fd970f6 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -13,6 +13,7 @@ SPARK_MEM_PARAM="${6?Please give the Job Memory Size to use}" USE_YARN="${7?Please tell if we should use YARN (yes/no)}" NOTIFY_ON_ERRORS="${8?Please tell if we will notify on errors (yes/no)}" DRIVER_HEAP_SIZE="${9?Please tell driver heap size to use}" +shift 9 JOB_WITH_TAG=${JOB_NAME}.${JOB_TAG} JOB_CONTROL_DIR="${CONTROL_DIR}/${JOB_WITH_TAG}" @@ -124,7 +125,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc 
-XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index 4e7c27fe..5ee6fbce 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -35,7 +35,7 @@ object CoreJobRunner { user: String = "nouser", master: String = "local[*]", executorMemory: String = "2G", - additionalArgs: Map[String, String] = Map.empty) + extraArgs: Map[String, String] = Map.empty) def runJobSetup(args: Array[String], jobsSetups: Map[String, (CoreJobRunner.RunnerContext => Unit, Map[String, String])], defaultSparkConfMap: Map[String, String]) { val parser = new scopt.OptionParser[RunnerConfig]("Runner") { @@ -60,8 +60,8 @@ object CoreJobRunner { c.copy(executorMemory = x) } - opt[(String, String)]('w', "runner-with-arg") unbounded() action { (x, c) => - c.copy(additionalArgs = c.additionalArgs ++ Map(x)) + opt[(String, String)]('w', "runner-extra") unbounded() action { (x, c) => + c.copy(extraArgs = c.extraArgs ++ Map(x)) } } diff --git a/tools/cluster.py b/tools/cluster.py index 261b2b85..f80db920 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -422,6 +422,7 @@ def get_assembly_path(): @arg('--disable-tmux', help='Do not use tmux. Warning: many features will not work without tmux. 
Use only if the tmux is missing on the master.') @arg('--detached', help='Run job in background, requires tmux') @arg('--destroy-cluster', help='Will destroy cluster after finishing the job') +@arg('--extra', action='append', type=str, help='Additional arguments for the job in the format k=v') @named('run') def job_run(cluster_name, job_name, job_mem, key_file=default_key_file, disable_tmux=False, @@ -437,7 +438,8 @@ def job_run(cluster_name, job_name, job_mem, destroy_cluster=False, region=default_region, driver_heap_size=default_driver_heap_size, - remove_files=True): + remove_files=True, + extra=[]): utc_job_date_example = '2014-05-04T13:13:10Z' if utc_job_date and len(utc_job_date) != len(utc_job_date_example): @@ -456,11 +458,12 @@ def job_run(cluster_name, job_name, job_mem, yarn_param = 'yes' if yarn else 'no' job_date = utc_job_date or datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') job_tag = job_tag or job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') + runner_extra_args = ' '.join('--runner-extra "%s"' % arg for arg in extra) tmux_wait_command = ';(echo Press enter to keep the session open && /bin/bash -c "read -t 5" && sleep 7d)' if not detached else '' - tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} {tmux_wait_command}' >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, tmux_wait_command=tmux_wait_command) - non_tmux_arg = ". /etc/profile; . 
~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size) + tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} {runner_extra_args} {tmux_wait_command}' >& /tmp/commandoutput".format( + aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, runner_extra_args=runner_extra_args, tmux_wait_command=tmux_wait_command) + non_tmux_arg = ". /etc/profile; . ~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} {runner_extra_args} >& /tmp/commandoutput".format( + aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, runner_extra_args=runner_extra_args) if not disable_assembly_build: From 0eb6a1828a44c4fc0ca3a5bf3c13ef36ba6a0f0f Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Mon, 11 Feb 2019 19:20:01 -0200 Subject: [PATCH 232/268] Added singleton to ExecutionRetry --- src/main/scala/ignition/core/jobs/ExecutionRetry.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/scala/ignition/core/jobs/ExecutionRetry.scala b/src/main/scala/ignition/core/jobs/ExecutionRetry.scala index 61daa523..7e5a3953 100644 --- a/src/main/scala/ignition/core/jobs/ExecutionRetry.scala +++ b/src/main/scala/ignition/core/jobs/ExecutionRetry.scala @@ -2,6 +2,8 @@ package ignition.core.jobs import scala.util.Try +object ExecutionRetry extends ExecutionRetry + trait ExecutionRetry { def executeRetrying[T](code: => T, maxExecutions: Int = 3): T = { From e7e7a7666cd235614f2ca5fd73d96f13f4645b83 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 12 Mar 2019 16:01:37 -0300 Subject: [PATCH 233/268] install toree (#161) * install toree --- remote_hook.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 1fd970f6..5078786b 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -78,7 +78,7 @@ install_and_run_zeppelin() { install_and_run_jupyter() { sudo yum -y install python3 python3-pip - sudo pip3 install jupyter pandas boto3 matplotlib numpy sklearn scipy + sudo pip3 install jupyter pandas boto3 matplotlib numpy sklearn scipy toree export SPARK_HOME=$(get_first_present /root/spark /opt/spark ~/spark*/) export HADOOP_HOME=$(get_first_present /root/hadoop /opt/hadoop ~/hadoop*/) export SPARK_CONF_DIR="${SPARK_HOME}/conf" @@ -87,6 +87,7 @@ install_and_run_jupyter() { export PYSPARK_PYTHON=$(which python3) export PYSPARK_DRIVER_PYTHON=$(which jupyter) export PYSPARK_DRIVER_PYTHON_OPTS="notebook --allow-root --ip=${SPARK_MASTER_HOST} --no-browser --port=8888" + sudo $(which jupyter) toree install --spark_home="${SPARK_HOME}" --spark_opts="--master ${JOB_MASTER} --executor-memory ${SPARK_MEM_PARAM} --driver-memory ${DRIVER_HEAP_SIZE}" sudo -E "${SPARK_HOME}/bin/pyspark" 
--master "${JOB_MASTER}" --executor-memory "${SPARK_MEM_PARAM}" --driver-memory "${DRIVER_HEAP_SIZE}" } From 2225f897aa8bb615bd956ac78d544fc448a095ed Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 20 Mar 2019 17:05:28 -0300 Subject: [PATCH 234/268] Added Timestamp comparison --- src/main/scala/ignition/core/utils/DateUtils.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/utils/DateUtils.scala b/src/main/scala/ignition/core/utils/DateUtils.scala index 8ebf3b13..71ec771f 100644 --- a/src/main/scala/ignition/core/utils/DateUtils.scala +++ b/src/main/scala/ignition/core/utils/DateUtils.scala @@ -1,6 +1,8 @@ package ignition.core.utils -import org.joda.time.{Seconds, Period, DateTimeZone, DateTime} +import java.sql.Timestamp + +import org.joda.time.{DateTime, DateTimeZone, Period, Seconds} import org.joda.time.format.ISODateTimeFormat object DateUtils { @@ -9,6 +11,10 @@ object DateUtils { implicit def dateTimeOrdering: Ordering[DateTime] = Ordering.fromLessThan(_ isBefore _) implicit def periodOrdering: Ordering[Period] = Ordering.fromLessThan(_.toStandardSeconds.getSeconds < _.toStandardSeconds.getSeconds) + implicit def timestampOrdering: Ordering[Timestamp] = new Ordering[Timestamp] { + def compare(x: Timestamp, y: Timestamp): Int = x compareTo y + } + implicit class DateTimeImprovements(val dateTime: DateTime) { def toIsoString = isoDateTimeFormatter.print(dateTime) From 71d94ca83218b0a424db9f2cfb85eb61d844826e Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 26 Jun 2019 11:31:26 -0300 Subject: [PATCH 235/268] update to spark=2.4.3, scala=2.12.8 and some compiler fixes (#163) * update to spark=2.4.3, scala=2.12.8 and some compiler fixes * using a 2.12 scala build --- build.sbt | 4 ++-- .../ignition/core/jobs/CoreJobRunner.scala | 1 - .../ignition/core/jobs/utils/RDDUtils.scala | 12 +++--------- .../core/jobs/utils/SparkContextUtils.scala | 19 +++++++++---------- 
.../ignition/core/utils/CollectionUtils.scala | 8 +++----- .../ignition/core/utils/FutureUtils.scala | 5 ++--- .../ignition/core/utils/URLUtilsSpec.scala | 2 -- tools/cluster.py | 4 ++-- 8 files changed, 21 insertions(+), 34 deletions(-) diff --git a/build.sbt b/build.sbt index 51044b57..8bf00c9d 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ name := "Ignition-Core" version := "1.0" -scalaVersion := "2.11.12" +scalaVersion := "2.12.8" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") @@ -11,7 +11,7 @@ parallelExecution in Test := false test in assembly := {} -libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.0" % "provided" +libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.3" % "provided" libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided" diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index 5ee6fbce..eb1c7014 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -6,7 +6,6 @@ import org.joda.time.{DateTime, DateTimeZone} import org.slf4j.{Logger, LoggerFactory} import scala.concurrent.Future -import scala.util.Try object CoreJobRunner { diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index e04dd118..ab08c3c7 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -1,18 +1,12 @@ package ignition.core.jobs.utils +import org.apache.spark.rdd.RDD import org.slf4j.LoggerFactory - -import scala.reflect._ -import org.apache.spark.rdd.{CoGroupedRDD, PairRDDFunctions, RDD} -import org.apache.spark.SparkContext._ -import org.apache.spark.Partitioner -import org.apache.spark -import org.joda.time.DateTime -import 
org.joda.time.format.DateTimeFormat +import scalaz.{Success, Validation} import scala.collection.mutable +import scala.reflect._ import scala.util.Random -import scalaz.{Success, Validation} object RDDUtils { diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 3e4ff961..e5155340 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -19,7 +19,7 @@ import org.apache.spark.{Partitioner, SparkContext} import org.joda.time.DateTime import org.slf4j.LoggerFactory -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.io.{Codec, Source} @@ -82,7 +82,7 @@ object SparkContextUtils { private lazy val logger = LoggerFactory.getLogger(getClass) - lazy val _hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + lazy val _hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().asScala.map { case entry => entry.getKey -> entry.getValue }.toMap) private def getFileSystem(path: Path): FileSystem = { path.getFileSystem(sc.hadoopConfiguration) @@ -181,7 +181,6 @@ object SparkContextUtils { paths.map(p => { val hdfsPath = p.replaceFirst("s3[an]://", hdfsPathPrefix) if (forceSynch || getStatus(hdfsPath, false).isEmpty || getStatus(s"$hdfsPath/*", true).filterNot(_.isDirectory).size != filesToOutput) { - val _hdfsPath = new Path(hdfsPath) actionWhenNeedsSynching(p, hdfsPath) } hdfsPath @@ -457,10 +456,10 @@ object SparkContextUtils { def inner(current: ObjectListing): Stream[String] = if (current.isTruncated) { logger.trace(s"list common prefixed truncated for ${path.bucket} ${path.key}: ${current.getCommonPrefixes}") - current.getCommonPrefixes.toStream ++ inner(s3.listNextBatchOfObjects(current)) + 
current.getCommonPrefixes.asScala.toStream ++ inner(s3.listNextBatchOfObjects(current)) } else { logger.trace(s"list common prefixed finished for ${path.bucket} ${path.key}: ${current.getCommonPrefixes}") - current.getCommonPrefixes.toStream + current.getCommonPrefixes.asScala.toStream } val request = new ListObjectsRequest(path.bucket, path.key, null, delimiter, 1000) @@ -472,10 +471,10 @@ object SparkContextUtils { def inner(current: ObjectListing): Stream[S3ObjectSummary] = if (current.isTruncated) { logger.trace(s"list objects truncated for ${path.bucket} ${path.key}: $current") - current.getObjectSummaries.toStream ++ inner(s3.listNextBatchOfObjects(current)) + current.getObjectSummaries.asScala.toStream ++ inner(s3.listNextBatchOfObjects(current)) } else { logger.trace(s"list objects finished for ${path.bucket} ${path.key}") - current.getObjectSummaries.toStream + current.getObjectSummaries.asScala.toStream } inner(s3.listObjects(path.bucket, path.key)) @@ -674,9 +673,9 @@ object SparkContextUtils { private def doSync(hadoopFiles: List[HadoopFile], synchLocally: String, forceSynch: Boolean, - maxBytesPerPartition: Long = 128 * 1000 * 1000, - minPartitions: Int = 100, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { require(!synchLocally.contains("*"), "Globs are not supported on the sync key") def syncPath(suffix: String) = s"$hdfsPathPrefix/_core_ignition_sync_hdfs_cache/$suffix" diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index c3b87d4c..2405c7ef 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -1,12 +1,10 @@ package ignition.core.utils -import scala.collection.{TraversableLike, IterableLike} -import 
scala.collection.generic.CanBuildFrom -import scala.language.implicitConversions import scalaz.Validation -object CollectionUtils { - +import scala.collection.generic.CanBuildFrom +import scala.collection.{IterableLike, TraversableLike} +object CollectionUtils { implicit class SeqImprovements[A](xs: Seq[A]) { def orElseIfEmpty[B >: A](alternative: => Seq[B]): Seq[B] = { diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index f12918db..4054f750 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,7 +1,6 @@ package ignition.core.utils -import scala.concurrent.duration.FiniteDuration -import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} +import scala.concurrent.{ExecutionContext, Future, Promise, blocking} import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} @@ -46,7 +45,7 @@ object FutureUtils { } implicit class FutureGeneratorImprovements[V](generator: Iterable[() => Future[V]]){ - def toLazyIterable(batchSize: Int = 1)(implicit ec: ExecutionContext): Iterable[Future[V]] = new Iterable[Future[V]] { + def toLazyIterable(batchSize: Int = 1): Iterable[Future[V]] = new Iterable[Future[V]] { override def iterator = new Iterator[Future[V]] { val generatorIterator = generator.toIterator var currentBatch: List[Future[V]] = List.empty diff --git a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala index a4b4f10d..61781903 100644 --- a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala @@ -2,8 +2,6 @@ package ignition.core.utils import org.scalatest.{FlatSpec, Matchers} -import scala.util.Success - class URLUtilsSpec extends FlatSpec with Matchers { "URLUtils" should "add parameters to url with encoded params in base url and not be double encoded" in { diff 
--git a/tools/cluster.py b/tools/cluster.py index f80db920..0e0fd864 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,9 +49,9 @@ default_ami = 'ami-611e7976' default_master_ami = '' default_env = 'dev' -default_spark_version = '2.4.0' +default_spark_version = '2.4.3' default_hdfs_version = '2.7.6' -default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' +default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-without-hadoop-scala-2.12.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' default_installation_user = 'root' From d1c02ee3dccef0f5c72e090da926cf3ae22678f8 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 27 Jun 2019 15:12:13 -0300 Subject: [PATCH 236/268] rollback to scala 2.11 (#164) --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 8bf00c9d..c6d7dbdb 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ name := "Ignition-Core" version := "1.0" -scalaVersion := "2.12.8" +scalaVersion := "2.11.12" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") From c9f09ddc5c4a109595dd3e6412a19e69d046c6e0 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 27 Jun 2019 15:18:35 -0300 Subject: [PATCH 237/268] rollback to spark with hadoop (#165) --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 0e0fd864..7c50ccae 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -51,7 +51,7 @@ default_env = 'dev' default_spark_version = '2.4.3' default_hdfs_version = '2.7.6' -default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-without-hadoop-scala-2.12.tgz' +default_spark_download_source 
= 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' default_installation_user = 'root' From 0d94bdc64dd8337ea2bc91be425f8fc3cba584e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=AA=20Couto=20e=20Silva?= <31329678+csrene@users.noreply.github.com> Date: Tue, 24 Sep 2019 16:22:20 -0300 Subject: [PATCH 238/268] Optional AWS credentials propagation (#166) added disable-propagate-aws-credentials option to run job --- tools/cluster.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 7c50ccae..966dd987 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -423,6 +423,7 @@ def get_assembly_path(): @arg('--detached', help='Run job in background, requires tmux') @arg('--destroy-cluster', help='Will destroy cluster after finishing the job') @arg('--extra', action='append', type=str, help='Additional arguments for the job in the format k=v') +@arg('--disable-propagate-aws-credentials', help='Setting this to true will not propagate your AWS credentials from your environment to the master') @named('run') def job_run(cluster_name, job_name, job_mem, key_file=default_key_file, disable_tmux=False, @@ -439,6 +440,7 @@ def job_run(cluster_name, job_name, job_mem, region=default_region, driver_heap_size=default_driver_heap_size, remove_files=True, + disable_propagate_aws_credentials=False, extra=[]): utc_job_date_example = '2014-05-04T13:13:10Z' @@ -456,14 +458,15 @@ def job_run(cluster_name, job_name, job_mem, remote_hook = '{remote_path}/remote_hook.sh'.format(remote_path=remote_path) notify_param = 'yes' if notify_on_errors else 'no' yarn_param = 'yes' if yarn else 'no' + aws_vars = get_aws_keys_str() if not disable_propagate_aws_credentials else '' job_date = utc_job_date or datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') job_tag = job_tag or 
job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') runner_extra_args = ' '.join('--runner-extra "%s"' % arg for arg in extra) tmux_wait_command = ';(echo Press enter to keep the session open && /bin/bash -c "read -t 5" && sleep 7d)' if not detached else '' tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} {runner_extra_args} {tmux_wait_command}' >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, runner_extra_args=runner_extra_args, tmux_wait_command=tmux_wait_command) + aws_vars=aws_vars, job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, runner_extra_args=runner_extra_args, tmux_wait_command=tmux_wait_command) non_tmux_arg = ". /etc/profile; . 
~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} {runner_extra_args} >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, runner_extra_args=runner_extra_args) + aws_vars=aws_vars, job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, runner_extra_args=runner_extra_args) if not disable_assembly_build: From e2adf968611b31f02902b678fe472b30b413b369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20Zantut=20Nogueira?= Date: Mon, 16 May 2022 15:27:20 -0300 Subject: [PATCH 239/268] Wait termination on destroy unsuccessful cluster --- tools/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 966dd987..8cc12f98 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -331,8 +331,8 @@ def launch(cluster_name, slaves, except Exception as e: log.exception('Got exception on last steps of cluster configuration') log.warn('Destroying unsuccessful cluster') - destroy(cluster_name=cluster_name, region=region) - raise CommandError('Failed to created cluster {} after failures'.format(cluster_name)) + destroy(cluster_name=cluster_name, region=region, wait_termination=True) + raise CommandError('Failed to created cluster {0} after failures'.format(cluster_name)) def destroy(cluster_name, delete_groups=False, region=default_region, wait_termination=False, wait_timeout_minutes=10): From 54c5faefb90d43349136f5b389903456a67bfbcc Mon Sep 17 00:00:00 2001 From: alexopss 
<68519704+alexopss@users.noreply.github.com> Date: Tue, 17 May 2022 10:05:53 -0300 Subject: [PATCH 240/268] Fix/circleci Add config circle --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1 @@ + From 3b5f93fb3226273514aae312111b1d2cd11fa8ae Mon Sep 17 00:00:00 2001 From: Machine User <80485061+chaordic-automation@users.noreply.github.com> Date: Tue, 17 May 2022 10:09:52 -0300 Subject: [PATCH 241/268] Updated config.yml --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8b137891..26423d9c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1 +1,2 @@ - +orbs: + node: circleci/node@5.0.2 \ No newline at end of file From a55ffe79cedc75964281a50e7063a6489a56b51b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20Zantut=20Nogueira?= Date: Mon, 30 May 2022 14:05:38 -0300 Subject: [PATCH 242/268] Add shutting-down state to active instances --- tools/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/utils.py b/tools/utils.py index 5064be61..5cfecb77 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -12,7 +12,7 @@ def get_active_instances(conn): active = [instance for res in conn.get_all_instances() for instance in res.instances if instance.state in set(['pending', 'running', - 'stopping', 'stopped'])] + 'stopping', 'stopped', 'shutting-down'])] return active def parse_nodes(active_instances, cluster_name): From 9223c02f94f6d0e6dd9c53ea555d34d8a2308f1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20Zantut=20Nogueira?= Date: Wed, 1 Jun 2022 17:23:02 -0300 Subject: [PATCH 243/268] Update CI --- .circleci/config.yml | 17 +++++++++++++++-- circle.yml | 3 --- 2 files changed, 15 insertions(+), 5 deletions(-) delete mode 100644 
circle.yml diff --git a/.circleci/config.yml b/.circleci/config.yml index 26423d9c..79ea469e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,2 +1,15 @@ -orbs: - node: circleci/node@5.0.2 \ No newline at end of file +version: 2.1 + +# Define the jobs we want to run for this project +jobs: + build: + docker: + - image: openjdk:8-jdk-oraclelinux7 + steps: + - run: echo "build job is not implemented" + +# Orchestrate our job run sequence +workflows: + build: + jobs: + - build \ No newline at end of file diff --git a/circle.yml b/circle.yml deleted file mode 100644 index abd78de2..00000000 --- a/circle.yml +++ /dev/null @@ -1,3 +0,0 @@ -machine: - java: - version: oraclejdk8 From 2ee8f07e651101a4f16a29daa81d8ab549c4a82b Mon Sep 17 00:00:00 2001 From: AllanRolli Date: Wed, 8 Jun 2022 10:04:45 -0300 Subject: [PATCH 244/268] updated destroy function inside module cluster.py --- tools/cluster.py | 55 ++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 8cc12f98..778ccfaf 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -335,35 +335,36 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {0} after failures'.format(cluster_name)) -def destroy(cluster_name, delete_groups=False, region=default_region, wait_termination=False, wait_timeout_minutes=10): +def destroy(cluster_name, wait_termination=False, wait_timeout_minutes=10, delete_groups=False, region=default_region,script_timeout_total_minutes=55,script_timeout_inactivity_minutes=10): assert not delete_groups, 'Delete groups is deprecated and unsupported' masters, slaves = get_active_nodes(cluster_name, region=region) - - all_instances = masters + slaves - if all_instances: - log.info('The following instances will be terminated:') - for i in all_instances: - log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) - - log.info('Terminating master...') - for i in 
masters: - i.terminate() - - log.info('Terminating slaves...') - for i in slaves: - i.terminate() - - if wait_termination: - log.info('Waiting for instances termination...') - termination_timeout = wait_timeout_minutes*60 - termination_start = time.time() - while wait_termination and all_instances and time.time() < termination_start+termination_timeout: - all_instances = [i for i in all_instances if i.state != 'terminated'] - time.sleep(5) - for i in all_instances: - i.update() - - log.info('Done.') + + try: # First we test if exist the cluster with the function cluster_exists + if cluster_exists(cluster_name,region): + # Here we use the script to destroy the cluster using the name of it + call_ec2_script(['destroy','--assume-yes', cluster_name,'--ec2-vpc-id','vpc-94215df1'],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + all_instances = masters + slaves + # To better view about what the script is doing i choose to let the same code of the destroy i have updated + if all_instances: + log.info('The %s will be terminated:', cluster_name) + for i in all_instances: + log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) + + if wait_termination: + log.info('Waiting for instances termination...') + termination_timeout = wait_timeout_minutes*60 + termination_start = time.time() + + while wait_termination and all_instances and time.time() < termination_start+termination_timeout: + all_instances = [i for i in all_instances if i.state != 'terminated'] + time.sleep(5) + for i in all_instances: + i.update() + # The log says the destruction is Done but is still running, just chill and enjoy the ride + log.info('Done.') + # Here is the exception of the try if we don't find the cluster + except Exception as e: + print('Does not exist the cluster %s', cluster_name) def get_master(cluster_name, region=default_region): From dee4d21919de0b6e89b850e6a761c2d33e496852 Mon Sep 17 00:00:00 2001 From: Allan Rolli 
Date: Wed, 8 Jun 2022 16:02:51 -0300 Subject: [PATCH 245/268] Update cluster.py removed cluster_exists and added vpc inside variable --- tools/cluster.py | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 778ccfaf..60e3a8b4 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -59,6 +59,7 @@ default_collect_results_dir = '/tmp' default_user_data = os.path.join(script_path, 'scripts', 'noop') default_defaults_filename = 'cluster_defaults.json' +default_vpc='vpc-94215df1' master_post_create_commands = [ @@ -335,33 +336,31 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {0} after failures'.format(cluster_name)) -def destroy(cluster_name, wait_termination=False, wait_timeout_minutes=10, delete_groups=False, region=default_region,script_timeout_total_minutes=55,script_timeout_inactivity_minutes=10): +def destroy(cluster_name, wait_termination=False, wait_timeout_minutes=10, delete_groups=False, default_vpc, region=default_region,script_timeout_total_minutes=55,script_timeout_inactivity_minutes=10): assert not delete_groups, 'Delete groups is deprecated and unsupported' masters, slaves = get_active_nodes(cluster_name, region=region) - try: # First we test if exist the cluster with the function cluster_exists - if cluster_exists(cluster_name,region): - # Here we use the script to destroy the cluster using the name of it - call_ec2_script(['destroy','--assume-yes', cluster_name,'--ec2-vpc-id','vpc-94215df1'],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) - all_instances = masters + slaves - # To better view about what the script is doing i choose to let the same code of the destroy i have updated - if all_instances: - log.info('The %s will be terminated:', cluster_name) + try:# Here we use the script to destroy the cluster using the name of it + 
call_ec2_script(['destroy','--assume-yes', cluster_name,'--ec2-vpc-id',default_vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + all_instances = masters + slaves + # To better view about what the script is doing i choose to let the same code of the destroy i have updated + if all_instances: + log.info('The %s will be terminated:', cluster_name) + for i in all_instances: + log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) + + if wait_termination: + log.info('Waiting for instances termination...') + termination_timeout = wait_timeout_minutes*60 + termination_start = time.time() + + while wait_termination and all_instances and time.time() < termination_start+termination_timeout: + all_instances = [i for i in all_instances if i.state != 'terminated'] + time.sleep(5) for i in all_instances: - log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) - - if wait_termination: - log.info('Waiting for instances termination...') - termination_timeout = wait_timeout_minutes*60 - termination_start = time.time() - - while wait_termination and all_instances and time.time() < termination_start+termination_timeout: - all_instances = [i for i in all_instances if i.state != 'terminated'] - time.sleep(5) - for i in all_instances: - i.update() - # The log says the destruction is Done but is still running, just chill and enjoy the ride - log.info('Done.') + i.update() + # The log says the destruction is Done but is still running, just chill and enjoy the ride + log.info('Done.') # Here is the exception of the try if we don't find the cluster except Exception as e: print('Does not exist the cluster %s', cluster_name) From 5cf960a82bb4b1b926b5252e8f38e9ebacab0d7d Mon Sep 17 00:00:00 2001 From: Allan Rolli Date: Fri, 10 Jun 2022 14:03:10 -0300 Subject: [PATCH 246/268] Update utils.py --- tools/utils.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git 
a/tools/utils.py b/tools/utils.py index 5cfecb77..a33faa47 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -123,3 +123,44 @@ def check_call_with_timeout(args, stdin=None, stdout=None, stdall = 'STDOUT:\n{}\nSTDERR:\n{}'.format(stdout, stderr) raise subprocess.CalledProcessError(p.returncode, args, output=stdall) return p.returncode + +def check_call_with_timeout_describe(args, stdin=None, stdout=None, + stderr=None, shell=False, + timeout_total_minutes=0, + timeout_inactivity_minutes=0): + stdout = stdout or sys.stdout + stderr = stderr or sys.stderr + begin_time_total = time.time() + begin_time_inactivity = time.time() + p = subprocess.Popen(args, + stdin=stdin, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=shell, + universal_newlines=False) + while True: + if read_from_to(p.stdout, stdout): + begin_time_inactivity = time.time() + if read_from_to(p.stderr, stderr): + begin_time_inactivity = time.time() + if p.poll() is not None: + break + terminate_by_total_timeout = timeout_total_minutes > 0 and time.time() - begin_time_total > (timeout_total_minutes * 60) + terminate_by_inactivity_timeout = timeout_inactivity_minutes > 0 and time.time() - begin_time_inactivity > (timeout_inactivity_minutes * 60) + if terminate_by_inactivity_timeout or terminate_by_total_timeout: + p.terminate() + for i in range(100): + if p.poll is not None: + break + time.sleep(0.1) + p.kill() + message = 'Terminated by inactivity' if terminate_by_inactivity_timeout else 'Terminated by total timeout' + raise ProcessTimeoutException(message) + time.sleep(0.5) + read_from_to(p.stdout, stdout) + read_from_to(p.stderr, stderr) + if p.returncode != 0: + stdall = 'STDOUT:\n{}\nSTDERR:\n{}'.format(stdout, stderr) + raise subprocess.CalledProcessError(p.returncode, args, output=stdall) + if len(args) > 5: + return args[5] From 5148803d65ed760f9fc5d0a58eb46a3a899358b1 Mon Sep 17 00:00:00 2001 From: Allan Rolli Date: Fri, 10 Jun 2022 14:05:41 -0300 Subject: [PATCH 247/268] Update 
cluster.py --- tools/cluster.py | 56 ++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 60e3a8b4..821a6f9a 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -13,7 +13,7 @@ import subprocess from subprocess import check_output, check_call from utils import tag_instances, get_masters, get_active_nodes -from utils import check_call_with_timeout +from utils import check_call_with_timeout, check_call_with_timeout_describe import os import sys from datetime import datetime @@ -147,6 +147,13 @@ def call_ec2_script(args, timeout_total_minutes, timeout_inactivity_minutes, std stdout=stdout, timeout_total_minutes=timeout_total_minutes, timeout_inactivity_minutes=timeout_inactivity_minutes) +def call_ec2_script_describe(args, timeout_total_minutes, timeout_inactivity_minutes, stdout=None): + ec2_script_path = chdir_to_ec2_script_and_get_path() + return check_call_with_timeout_describe(['/usr/bin/env', 'python3', '-u', + ec2_script_path] + args, + stdout=stdout, + timeout_total_minutes=timeout_total_minutes, + timeout_inactivity_minutes=timeout_inactivity_minutes) def cluster_exists(cluster_name, region): @@ -336,34 +343,37 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {0} after failures'.format(cluster_name)) -def destroy(cluster_name, wait_termination=False, wait_timeout_minutes=10, delete_groups=False, default_vpc, region=default_region,script_timeout_total_minutes=55,script_timeout_inactivity_minutes=10): +def destroy(cluster_name, wait_termination=False, vpc=default_vpc, wait_timeout_minutes=10, delete_groups=False, region=default_region,script_timeout_total_minutes=55,script_timeout_inactivity_minutes=10): assert not delete_groups, 'Delete groups is deprecated and unsupported' masters, slaves = get_active_nodes(cluster_name, region=region) - try:# Here we use the script to destroy the cluster using the name of it - 
call_ec2_script(['destroy','--assume-yes', cluster_name,'--ec2-vpc-id',default_vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) - all_instances = masters + slaves - # To better view about what the script is doing i choose to let the same code of the destroy i have updated - if all_instances: - log.info('The %s will be terminated:', cluster_name) - for i in all_instances: - log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) - - if wait_termination: - log.info('Waiting for instances termination...') - termination_timeout = wait_timeout_minutes*60 - termination_start = time.time() - - while wait_termination and all_instances and time.time() < termination_start+termination_timeout: - all_instances = [i for i in all_instances if i.state != 'terminated'] - time.sleep(5) + try: # First we test if exist the cluster with the function cluster_exists + cluster = call_ec2_script_describe(['describe', cluster_name,'--ec2-vpc-id',vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + if cluster == cluster_name: + call_ec2_script(['destroy','--assume-yes', cluster_name,'--ec2-vpc-id',vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + # Here we use the script to destroy the cluster using the name of it + all_instances = masters + slaves + # To better view about what the script is doing i choose to let the same code of the destroy i have updated + if all_instances: + log.info('The %s will be terminated:', cluster_name) for i in all_instances: - i.update() - # The log says the destruction is Done but is still running, just chill and enjoy the ride - log.info('Done.') + log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) + + if wait_termination: + log.info('Waiting for instances termination...') + termination_timeout = wait_timeout_minutes*60 + 
termination_start = time.time() + + while wait_termination and all_instances and time.time() < termination_start+termination_timeout: + all_instances = [i for i in all_instances if i.state != 'terminated'] + time.sleep(5) + for i in all_instances: + i.update() + # The log says the destruction is Done but is still running, just chill and enjoy the ride + log.info('Done.') # Here is the exception of the try if we don't find the cluster except Exception as e: - print('Does not exist the cluster %s', cluster_name) + log.info('Does not exist %s', cluster_name) def get_master(cluster_name, region=default_region): From 268e6d0fcf153e5370bc38429497fa998a7f0062 Mon Sep 17 00:00:00 2001 From: Emerson Ferreira Date: Wed, 15 Jun 2022 16:34:09 -0300 Subject: [PATCH 248/268] Fixing the destroy function on flintrock. --- tools/cluster.py | 79 +++++++++++++++++++++++++++++++++++------------- tools/utils.py | 6 ++++ 2 files changed, 64 insertions(+), 21 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 821a6f9a..ef42e646 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -7,12 +7,13 @@ """ +from cgitb import reset import argh from argh import ArghParser, CommandError from argh.decorators import named, arg import subprocess from subprocess import check_output, check_call -from utils import tag_instances, get_masters, get_active_nodes +from utils import tag_instances, get_masters, get_active_nodes, get_active_nodes_by_tag from utils import check_call_with_timeout, check_call_with_timeout_describe import os import sys @@ -343,37 +344,73 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {0} after failures'.format(cluster_name)) +def destroy_by_flyntrock(region, cluster_name, vpc=default_vpc, script_timeout_total_minutes=55, script_timeout_inactivity_minutes=10, wait_termination=False, wait_timeout_minutes=10): + # create a variable to store the result + result = False + + try: # create a try catch to manage the possible erros + 
cluster = call_ec2_script_describe(['describe', cluster_name,'--ec2-vpc-id',vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + if cluster == cluster_name: + call_ec2_script(['destroy','--assume-yes', cluster_name,'--ec2-vpc-id',vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + result = True + except Exception as e: + #log.info('Error to destroy cluster {0} by flintrock'.format(cluster_name)) + destroy_by_cluster_name_tag(region, 'spark_cluster_name', cluster_name, wait_termination, wait_timeout_minutes) + pass + + return result + +def destroy_by_cluster_name_tag(region, tag_name, cluster_name, wait_termination, wait_timeout_minutes): + instances = get_active_nodes_by_tag(region, tag_name, cluster_name) + + if instances: + #log.info('Trying to terminate remain instances by id.') + + for instance in instances: + #log.info('Terminate instance {0}'.format(instance.id)) + instance.terminate() + log.info('Instance {0} is terminating.'.format(instance.id)) + + # call this function to wait instances to terminate + wait_for_intances_to_terminate(cluster_name, wait_termination, wait_timeout_minutes, instances) + + return instances + + def destroy(cluster_name, wait_termination=False, vpc=default_vpc, wait_timeout_minutes=10, delete_groups=False, region=default_region,script_timeout_total_minutes=55,script_timeout_inactivity_minutes=10): assert not delete_groups, 'Delete groups is deprecated and unsupported' masters, slaves = get_active_nodes(cluster_name, region=region) try: # First we test if exist the cluster with the function cluster_exists - cluster = call_ec2_script_describe(['describe', cluster_name,'--ec2-vpc-id',vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) - if cluster == cluster_name: - call_ec2_script(['destroy','--assume-yes', 
cluster_name,'--ec2-vpc-id',vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + if(destroy_by_flyntrock(region, cluster_name, vpc, script_timeout_total_minutes, script_timeout_inactivity_minutes, wait_termination, wait_timeout_minutes)): # Here we use the script to destroy the cluster using the name of it all_instances = masters + slaves # To better view about what the script is doing i choose to let the same code of the destroy i have updated - if all_instances: - log.info('The %s will be terminated:', cluster_name) - for i in all_instances: - log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) - - if wait_termination: - log.info('Waiting for instances termination...') - termination_timeout = wait_timeout_minutes*60 - termination_start = time.time() - - while wait_termination and all_instances and time.time() < termination_start+termination_timeout: - all_instances = [i for i in all_instances if i.state != 'terminated'] - time.sleep(5) - for i in all_instances: - i.update() - # The log says the destruction is Done but is still running, just chill and enjoy the ride - log.info('Done.') + wait_for_intances_to_terminate(cluster_name, wait_termination, wait_timeout_minutes, all_instances) # Here is the exception of the try if we don't find the cluster except Exception as e: log.info('Does not exist %s', cluster_name) + pass + +def wait_for_intances_to_terminate(cluster_name, wait_termination=False, wait_timeout_minutes=10, all_instances=[]): + # To better view about what the script is doing i choose to let the same code of the destroy i have updated + if all_instances: + log.info('The %s will be terminated:', cluster_name) + for i in all_instances: + log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) + + if wait_termination: + log.info('Waiting for instances termination...') + termination_timeout = wait_timeout_minutes*60 + termination_start = time.time() + + while 
wait_termination and all_instances and time.time() < termination_start+termination_timeout: + all_instances = [i for i in all_instances if i.state != 'terminated'] + time.sleep(5) + for i in all_instances: + i.update() + # The log says the destruction is Done but is still running, just chill and enjoy the ride + log.info('Done.') def get_master(cluster_name, region=default_region): diff --git a/tools/utils.py b/tools/utils.py index a33faa47..4f8b175e 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -40,6 +40,12 @@ def get_active_nodes(cluster_name, region): return parse_nodes(active, cluster_name) +def get_active_nodes_by_tag(region, tag_name, tag_value): + conn = boto.ec2.connect_to_region(region) + filter = {"tag:{0}".format(tag_name):["{0}".format(tag_value)], "instance-state-name":["running"]} + return conn.get_only_instances(filters=filter) + + def tag_instances(cluster_name, tags, region): conn = boto.ec2.connect_to_region(region) From 5692e45642c6ea4c1fa06553b7b51bf23be19f3e Mon Sep 17 00:00:00 2001 From: Emerson Ferreira Date: Mon, 20 Jun 2022 14:25:58 -0300 Subject: [PATCH 249/268] Creating new methods to manage the destroy cluster before try to create a new cluster. 
--- tools/cluster.py | 10 ++++-- tools/utils.py | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 3 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index ef42e646..e970aa64 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -14,7 +14,7 @@ import subprocess from subprocess import check_output, check_call from utils import tag_instances, get_masters, get_active_nodes, get_active_nodes_by_tag -from utils import check_call_with_timeout, check_call_with_timeout_describe +from utils import check_call_with_timeout, check_call_with_timeout_describe, destroy_by_request_spot_ids import os import sys from datetime import datetime @@ -381,7 +381,11 @@ def destroy(cluster_name, wait_termination=False, vpc=default_vpc, wait_timeout_ assert not delete_groups, 'Delete groups is deprecated and unsupported' masters, slaves = get_active_nodes(cluster_name, region=region) - try: # First we test if exist the cluster with the function cluster_exists + try: # First we test if exist the cluster with the function cluster_exists + # get instances ids by json return and cancel the requests + wait_for_intances_to_terminate(cluster_name, wait_termination, wait_timeout_minutes, destroy_by_request_spot_ids(region, cluster_name)) + + # test if the cluster exists and call destroy by fintorock to destroy it if(destroy_by_flyntrock(region, cluster_name, vpc, script_timeout_total_minutes, script_timeout_inactivity_minutes, wait_termination, wait_timeout_minutes)): # Here we use the script to destroy the cluster using the name of it all_instances = masters + slaves @@ -397,7 +401,7 @@ def wait_for_intances_to_terminate(cluster_name, wait_termination=False, wait_ti if all_instances: log.info('The %s will be terminated:', cluster_name) for i in all_instances: - log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) + log.info('-> %s' % (i.public_dns_name or i.private_dns_name or i.id)) if wait_termination: log.info('Waiting for instances 
termination...') diff --git a/tools/utils.py b/tools/utils.py index 4f8b175e..e579361a 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -5,6 +5,13 @@ import subprocess import select import time +import json +from os.path import exists +from os import makedirs +import os + +# get a folder_log_path from env variable +folder_log_path = os.getenv('LOG_FOLDER') logging.basicConfig(level=logging.INFO) @@ -45,6 +52,92 @@ def get_active_nodes_by_tag(region, tag_name, tag_value): filter = {"tag:{0}".format(tag_name):["{0}".format(tag_value)], "instance-state-name":["running"]} return conn.get_only_instances(filters=filter) +def get_requests_ids_by_cluster_name(cluster_name): + # create a array with the requests ids + requests_ids = [] + folder_full_path = os.path.abspath(os.getcwd()) + + if folder_log_path: + # check if the folder exists and if not create it + folderExist = exists(folder_log_path) + + if folderExist != True: + makedirs(folder_log_path) + + file_name = '{0}/{1}.json'.format(folder_log_path, cluster_name) + else: + file_name = '{0}.json'.format(cluster_name) + + # verify if the file exists + file_exists = exists(file_name) + + if file_exists: + # open a json log file if exists + json_file = open(file_name) + + # deserialize the json file to object + json_content = json.load(json_file) + + # create a array with the requests ids + for request_id in json_content: + requests_ids.append(str(request_id['SpotInstanceRequestId'])) + + return requests_ids + + +def destroy_by_request_spot_ids(region, cluster_name): + conn = boto.ec2.connect_to_region(region) + instances = [] + + try: + # get requets ids from json log file + request_ids = get_requests_ids_by_cluster_name(cluster_name) + logging.info('The amount of requests ids found in json log file: {0}'.format(len(request_ids))) + instances_cancelled = [] + + # test if the request has any id + if len(request_ids) > 0: + spot_requests = conn.get_all_spot_instance_requests() + for request in request_ids: + for 
spot_request in spot_requests: + if request == spot_request.id: + # cancel the requests returned before + conn.cancel_spot_instance_requests(request) + instances_cancelled.append(spot_request) + + # verify if the cancelled list is not empty + if len(instances_cancelled) > 0: + instances_ids = [] + + # create the instance list of machines based on requests ids + for request_cancelled in instances_cancelled: + if request_cancelled.instance_id: + instances_ids.append(request_cancelled.instance_id) + + # test if the instance id is not empty + if len(instances_ids) > 0: + instances_requested = conn.get_only_instances(instances_ids) + + # terminate instances from request spot + for instance in instances_requested: + # checking again if the object is in the list to not terminate wrong machines + if instances_ids.index(instance.id) > -1: + if instance.state == 'running': + logging.info('Terminating instance: {0}'.format(instance.id)) + # add only instances that are running to return list + instances.append(instance) + # terminate the instance + instance.terminate() + elif instance.state == 'shutting-down': + # add the instance to the wait list + instances.append(instance) + + except Exception as e: + logging.error('Error to destroy cluster {0} by request ids.'.format(cluster_name)) + pass + + return instances + def tag_instances(cluster_name, tags, region): conn = boto.ec2.connect_to_region(region) From 16e427be2adbaba8e7497f85d468effb56d7ae21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Tue, 21 Jun 2022 19:06:57 -0300 Subject: [PATCH 250/268] feat: new flintrock option for create cluster --- tools/cluster.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cluster.py b/tools/cluster.py index 8cc12f98..56a844de 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -302,6 +302,7 @@ def launch(cluster_name, slaves, '--ec2-security-group', security_group, '--ec2-user', installation_user, '--ec2-user-data', user_data, + '--launch-template-name', 
cluster_name, cluster_name] + spot_params + auth_params + From ce1f19df5561c84fca8b6c9400ccbe6c4b62b2be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Fri, 8 Jul 2022 10:13:27 -0300 Subject: [PATCH 251/268] feat: removing the zone parameter, due to multi az flintrock --- tools/cluster.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index d20a287b..0cb4e3b9 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -297,7 +297,6 @@ def launch(cluster_name, slaves, '--ec2-key-name', key_id, '--num-slaves', slaves, '--ec2-region', region, - '--ec2-availability-zone', zone, '--ec2-instance-type', instance_type, '--ec2-min-root-ebs-size-gb', min_root_ebs_size_gb, '--assume-yes', From 9280c4ce6df2eb736cd6eb89c6195455dcab71f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20Zantut=20Nogueira?= Date: Wed, 13 Jul 2022 17:17:38 -0300 Subject: [PATCH 252/268] Change installation and launch user from root to ec2-user --- remote_hook.sh | 14 +++++++------- tools/cluster.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 5078786b..25233a56 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -30,10 +30,10 @@ echo $$ > "${RUNNING_FILE}" # Let us read the spark home even when the image doesn't give us the permission -sudo chmod o+rx /root -sudo chmod -R o+rx /root/spark +sudo chmod o+rx /home/ec2-user +sudo chmod -R o+rx /home/ec2-user/spark -sudo mkdir -p /media/tmp/spark-events +mkdir -p /media/tmp/spark-events notify_error_and_exit() { description="${1}" @@ -70,7 +70,7 @@ install_and_run_zeppelin() { export ZEPPELIN_PORT="8081" export SPARK_HOME=$(get_first_present /root/spark /opt/spark ~/spark*/) export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH} --executor-memory ${SPARK_MEM_PARAM}" - sudo -E zeppelin/bin/zeppelin.sh + zeppelin/bin/zeppelin.sh else notify_error_and_exit "Zeppelin installation not found" fi @@ -88,7 +88,7 @@ install_and_run_jupyter() { export 
PYSPARK_DRIVER_PYTHON=$(which jupyter) export PYSPARK_DRIVER_PYTHON_OPTS="notebook --allow-root --ip=${SPARK_MASTER_HOST} --no-browser --port=8888" sudo $(which jupyter) toree install --spark_home="${SPARK_HOME}" --spark_opts="--master ${JOB_MASTER} --executor-memory ${SPARK_MEM_PARAM} --driver-memory ${DRIVER_HEAP_SIZE}" - sudo -E "${SPARK_HOME}/bin/pyspark" --master "${JOB_MASTER}" --executor-memory "${SPARK_MEM_PARAM}" --driver-memory "${DRIVER_HEAP_SIZE}" + ${SPARK_HOME}/bin/pyspark --master "${JOB_MASTER}" --executor-memory "${SPARK_MEM_PARAM}" --driver-memory "${DRIVER_HEAP_SIZE}" } trap "on_trap_exit" EXIT @@ -118,7 +118,7 @@ if [[ "${USE_YARN}" == "yes" ]]; then fi if [[ "${JOB_NAME}" == "shell" ]]; then - sudo -E ${SPARK_HOME}/bin/spark-shell --master "${JOB_MASTER}" --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" + ${SPARK_HOME}/bin/spark-shell --master "${JOB_MASTER}" --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" elif [[ "${JOB_NAME}" == "zeppelin" ]]; then install_and_run_zeppelin elif [[ "${JOB_NAME}" == "jupyter" ]]; then @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" 
--runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" diff --git a/tools/cluster.py b/tools/cluster.py index e970aa64..629dc40c 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -55,7 +55,7 @@ default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' -default_installation_user = 'root' +default_installation_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' default_collect_results_dir = '/tmp' default_user_data = os.path.join(script_path, 'scripts', 'noop') From aa39bb5a670b9101dd4307a8d2448bd2fc54d435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Wed, 13 Jul 2022 17:57:32 -0300 Subject: [PATCH 253/268] feat: add a function to delete Flintrock SG rules in dev environment - it is a module inside core/tools which is called with subprocess of python2 (cluster.py) due to the script (revoke_sg_rules.py) be written in python3. 
--- tools/cluster.py | 8 +++ tools/revoke_sg_rules.py | 107 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 tools/revoke_sg_rules.py diff --git a/tools/cluster.py b/tools/cluster.py index e970aa64..1025926e 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -383,6 +383,14 @@ def destroy(cluster_name, wait_termination=False, vpc=default_vpc, wait_timeout_ try: # First we test if exist the cluster with the function cluster_exists # get instances ids by json return and cancel the requests + + # if in dev environment, will delete the flintrock SG rules of the machine running this script + if os.getenv('ENVIRONMENT') == 'development': + revoke_sg_script = os.path.join(script_path, 'revoke_sg_rules.py') + process = subprocess.Popen(["python3", revoke_sg_script, region, vpc], stdout=subprocess.PIPE) + stdout_str = process.communicate()[0] + log.info(stdout_str) + wait_for_intances_to_terminate(cluster_name, wait_termination, wait_timeout_minutes, destroy_by_request_spot_ids(region, cluster_name)) # test if the cluster exists and call destroy by fintorock to destroy it diff --git a/tools/revoke_sg_rules.py b/tools/revoke_sg_rules.py new file mode 100644 index 00000000..40f5df72 --- /dev/null +++ b/tools/revoke_sg_rules.py @@ -0,0 +1,107 @@ +import urllib.request +import sys + +from botocore.exceptions import ClientError +import boto3 + + +def _get_security_group(region, vpc_id): + ec2 = boto3.client('ec2', region_name=region) + response = ec2.describe_security_groups( + Filters=[ + { + 'Name': 'vpc-id', + 'Values': [ + vpc_id, + ] + }, + ], + ) + return response + + +def _client_cidr(): + flintrock_client_ip = ( + urllib.request.urlopen('http://checkip.amazonaws.com/') + .read().decode('utf-8').strip()) + flintrock__client_cidr = '{ip}/32'.format(ip=flintrock_client_ip) + return flintrock__client_cidr + + +def _delete_rule(cidr_ip, ip_protocol, from_port, to_port, group_id, region): + ec2 = boto3.client('ec2', 
region_name=region) + ec2.revoke_security_group_ingress( + CidrIp=cidr_ip, + GroupId=group_id, + IpProtocol=ip_protocol, + FromPort=from_port, + ToPort=to_port + ) + +def revoke_flintrock_sg_ingress(region, vpc_id): + """Revoke Flintrock Security Group's Rules matched with the IP from + the current machine given the Region and VPC ID + + :param region: The AWS region where the VPC is located + :type region: str + :param vpc_id: The VPC ID where flintrock Security Group was created + :type vpc_id: str + :returns: a string with a message explaining the success or fail + :rtype: str + """ + + response = _get_security_group(region=region, vpc_id=vpc_id) + # variables required to delete rule + cidr_to_revoke_rules = _client_cidr() + group_id = '' + group_name = '' + from_port = '' + to_port = '' + ip_protocol = '' + # variable to store the success of the + # loop and give the right return message + success = False + + security_groups = response["SecurityGroups"] + + if len(security_groups) == 0: + return 'There is no security groups in the vpc {} at region {}'.format(vpc_id, region) + + for sg in security_groups: + group_id = sg['GroupId'] + group_name = sg['GroupName'] + if group_name == 'flintrock': + for ip in sg['IpPermissions']: + if 'FromPort' in ip: + from_port = ip['FromPort'] + ip_protocol = ip['IpProtocol'] + to_port = ip['ToPort'] + for cidr in ip['IpRanges']: + # identifying which rules contain the local IP range + if cidr['CidrIp'] == cidr_to_revoke_rules: + try: + _delete_rule( + cidr_ip=cidr['CidrIp'], + ip_protocol=ip_protocol, + from_port=from_port, + to_port=to_port, + group_id=group_id, + region=region + ) + success = True + except ClientError as error: + raise error + + + if not success: + return 'There is no rule from this client to delete in the vpc id: {}.'.format(vpc_id) + + else: + return 'Successfully deleted the rules from this client in the vpc id: {}.'.format(vpc_id) + + +if __name__ == '__main__': + region = sys.argv[1] + vpc_id = 
sys.argv[2] + result = revoke_flintrock_sg_ingress(region=region, vpc_id=vpc_id) + print(result) \ No newline at end of file From ce109e46eab145d3940206c409e33fad226ecabc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Wed, 13 Jul 2022 18:07:04 -0300 Subject: [PATCH 254/268] refactor: add new blank line at the end --- tools/revoke_sg_rules.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/revoke_sg_rules.py b/tools/revoke_sg_rules.py index 40f5df72..06f42b9e 100644 --- a/tools/revoke_sg_rules.py +++ b/tools/revoke_sg_rules.py @@ -104,4 +104,5 @@ def revoke_flintrock_sg_ingress(region, vpc_id): region = sys.argv[1] vpc_id = sys.argv[2] result = revoke_flintrock_sg_ingress(region=region, vpc_id=vpc_id) - print(result) \ No newline at end of file + print(result) + \ No newline at end of file From c81c362e1a81b226961aa80973f09b99cea5466c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo=20Pinto?= Date: Thu, 14 Jul 2022 15:40:17 -0300 Subject: [PATCH 255/268] Update tools/revoke_sg_rules.py Co-authored-by: Iury Krieger --- tools/revoke_sg_rules.py | 73 +++++++++++++++------------------------- 1 file changed, 28 insertions(+), 45 deletions(-) diff --git a/tools/revoke_sg_rules.py b/tools/revoke_sg_rules.py index 06f42b9e..c5ee51f6 100644 --- a/tools/revoke_sg_rules.py +++ b/tools/revoke_sg_rules.py @@ -51,53 +51,36 @@ def revoke_flintrock_sg_ingress(region, vpc_id): """ response = _get_security_group(region=region, vpc_id=vpc_id) - # variables required to delete rule - cidr_to_revoke_rules = _client_cidr() - group_id = '' - group_name = '' - from_port = '' - to_port = '' - ip_protocol = '' - # variable to store the success of the - # loop and give the right return message - success = False - security_groups = response["SecurityGroups"] + cidr_to_revoke_rules = _client_cidr() + + if not len(security_groups): + raise 'There is no security groups in the vpc {} at region {}'.format(vpc_id, region) - if len(security_groups) == 0: - 
return 'There is no security groups in the vpc {} at region {}'.format(vpc_id, region) - - for sg in security_groups: - group_id = sg['GroupId'] - group_name = sg['GroupName'] - if group_name == 'flintrock': - for ip in sg['IpPermissions']: - if 'FromPort' in ip: - from_port = ip['FromPort'] - ip_protocol = ip['IpProtocol'] - to_port = ip['ToPort'] - for cidr in ip['IpRanges']: - # identifying which rules contain the local IP range - if cidr['CidrIp'] == cidr_to_revoke_rules: - try: - _delete_rule( - cidr_ip=cidr['CidrIp'], - ip_protocol=ip_protocol, - from_port=from_port, - to_port=to_port, - group_id=group_id, - region=region - ) - success = True - except ClientError as error: - raise error - - - if not success: - return 'There is no rule from this client to delete in the vpc id: {}.'.format(vpc_id) - - else: - return 'Successfully deleted the rules from this client in the vpc id: {}.'.format(vpc_id) + for security_group in security_groups: + for ip_permission in security_group['IpPermissions']: + for ip_range in ip_permission['IpRanges']: + group_id = security_group['GroupId'] + group_name = security_group['GroupName'] + from_port = ip_permission['FromPort'] + ip_protocol = ip_permission['IpProtocol'] + to_port = ip_permission['ToPort'] + + if group_name == 'flintrock' and 'FromPort' in ip_permission and ip_range['CidrIp'] == cidr_to_revoke_rules: + try: + _delete_rule( + cidr_ip=ip_range['CidrIp'], + ip_protocol=ip_protocol, + from_port=from_port, + to_port=to_port, + group_id=group_id, + region=region + ) + except ClientError as error: + print('There is no rule from this client to delete in the vpc id: {}.'.format(vpc_id)) + raise error + + print('Successfully deleted the rules from this client in the vpc id: {}.'.format(vpc_id)) if __name__ == '__main__': From 12737d002a50347a462a04d6fb0b41deed4e0bcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Thu, 14 Jul 2022 16:21:04 -0300 Subject: [PATCH 256/268] refactor: fixing identation, style and add 
new function - add a boolean function to return true if a given cidr exists in a given security group id. - extends the function _get_security_group() to return an specific security group through the param `sg_name`. - add a logic to verifie if the rules were deleted before end the script. --- tools/revoke_sg_rules.py | 116 +++++++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 41 deletions(-) diff --git a/tools/revoke_sg_rules.py b/tools/revoke_sg_rules.py index c5ee51f6..1577f80f 100644 --- a/tools/revoke_sg_rules.py +++ b/tools/revoke_sg_rules.py @@ -5,7 +5,7 @@ import boto3 -def _get_security_group(region, vpc_id): +def _get_security_group(region, vpc_id, sg_name): ec2 = boto3.client('ec2', region_name=region) response = ec2.describe_security_groups( Filters=[ @@ -17,7 +17,13 @@ def _get_security_group(region, vpc_id): }, ], ) - return response + desired_sg = None + security_groups = response['SecurityGroups'] + for security_group in security_groups: + if security_group['GroupName'] == sg_name: + desired_sg = security_group + + return desired_sg def _client_cidr(): @@ -28,6 +34,29 @@ def _client_cidr(): return flintrock__client_cidr +def _exists_cidr_in_sg(region, cidr, sg_id): + """Boolean function to return `true` if a given cidr + exists in a given security group id. Otherwise returns + `false`. 
+ """ + ec2 = boto3.client('ec2', region_name=region) + response = ec2.describe_security_group_rules( + Filters=[ + { + 'Name': 'group-id', + 'Values': [ + sg_id, + ] + }, + ] + ) + rules = response['SecurityGroupRules'] + for rule in rules: + if rule['CidrIpv4'] == cidr: + return True + return False + + def _delete_rule(cidr_ip, ip_protocol, from_port, to_port, group_id, region): ec2 = boto3.client('ec2', region_name=region) ec2.revoke_security_group_ingress( @@ -37,55 +66,60 @@ def _delete_rule(cidr_ip, ip_protocol, from_port, to_port, group_id, region): FromPort=from_port, ToPort=to_port ) - + + def revoke_flintrock_sg_ingress(region, vpc_id): """Revoke Flintrock Security Group's Rules matched with the IP from the current machine given the Region and VPC ID - - :param region: The AWS region where the VPC is located - :type region: str - :param vpc_id: The VPC ID where flintrock Security Group was created - :type vpc_id: str - :returns: a string with a message explaining the success or fail - :rtype: str + :param `region`: The AWS region where the VPC is located + :type `region`: str + :param `vpc_id`: The VPC ID where flintrock Security Group was created + :type `vpc_id`: str """ - response = _get_security_group(region=region, vpc_id=vpc_id) - security_groups = response["SecurityGroups"] + flintrock_security_group = _get_security_group(region=region, vpc_id=vpc_id, sg_name='flintrock') cidr_to_revoke_rules = _client_cidr() + flintrock_group_id = flintrock_security_group['GroupId'] + + if flintrock_security_group['GroupName'] != 'flintrock': + print('Flintrock security groups doesn\'t exist in this vpc {} at region {}'.format(vpc_id, region)) + return # we don't want the script to ``raise`` an error, to not mess with the job_runner.py logs + + # check if the local IP is in some rule or not + if not _exists_cidr_in_sg(region=region, cidr=cidr_to_revoke_rules, sg_id=flintrock_group_id): + print('There is no rules with the IP of this client in Flintrock security 
group.') + return + + for ip_permission in flintrock_security_group['IpPermissions']: + for ip_range in ip_permission['IpRanges']: + group_id = flintrock_group_id + from_port = ip_permission['FromPort'] + ip_protocol = ip_permission['IpProtocol'] + to_port = ip_permission['ToPort'] + + if 'FromPort' in ip_permission and ip_range['CidrIp'] == cidr_to_revoke_rules: + try: + _delete_rule( + cidr_ip=ip_range['CidrIp'], + ip_protocol=ip_protocol, + from_port=from_port, + to_port=to_port, + group_id=group_id, + region=region + ) + except ClientError as error: + print(error) - if not len(security_groups): - raise 'There is no security groups in the vpc {} at region {}'.format(vpc_id, region) - - for security_group in security_groups: - for ip_permission in security_group['IpPermissions']: - for ip_range in ip_permission['IpRanges']: - group_id = security_group['GroupId'] - group_name = security_group['GroupName'] - from_port = ip_permission['FromPort'] - ip_protocol = ip_permission['IpProtocol'] - to_port = ip_permission['ToPort'] - - if group_name == 'flintrock' and 'FromPort' in ip_permission and ip_range['CidrIp'] == cidr_to_revoke_rules: - try: - _delete_rule( - cidr_ip=ip_range['CidrIp'], - ip_protocol=ip_protocol, - from_port=from_port, - to_port=to_port, - group_id=group_id, - region=region - ) - except ClientError as error: - print('There is no rule from this client to delete in the vpc id: {}.'.format(vpc_id)) - raise error - - print('Successfully deleted the rules from this client in the vpc id: {}.'.format(vpc_id)) + # check again to confirm if the rules were revoked + status = False + while not status: + status = _exists_cidr_in_sg(region=region, cidr=cidr_to_revoke_rules, sg_id=flintrock_group_id) + + print('Successfully deleted rules of this client from flintrock security group at vpc {}'.format(vpc_id)) if __name__ == '__main__': region = sys.argv[1] vpc_id = sys.argv[2] - result = revoke_flintrock_sg_ingress(region=region, vpc_id=vpc_id) - print(result) + 
revoke_flintrock_sg_ingress(region=region, vpc_id=vpc_id) \ No newline at end of file From bb8c938a1ce74d0895e7a76aff61aa363b0f1788 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Thu, 14 Jul 2022 16:39:18 -0300 Subject: [PATCH 257/268] fix: the while check was taking too long to return - it is due to requests that are made with boto3 --- tools/revoke_sg_rules.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tools/revoke_sg_rules.py b/tools/revoke_sg_rules.py index 1577f80f..d4ab31a0 100644 --- a/tools/revoke_sg_rules.py +++ b/tools/revoke_sg_rules.py @@ -111,11 +111,8 @@ def revoke_flintrock_sg_ingress(region, vpc_id): print(error) # check again to confirm if the rules were revoked - status = False - while not status: - status = _exists_cidr_in_sg(region=region, cidr=cidr_to_revoke_rules, sg_id=flintrock_group_id) - - print('Successfully deleted rules of this client from flintrock security group at vpc {}'.format(vpc_id)) + if not _exists_cidr_in_sg(region=region, cidr=cidr_to_revoke_rules, sg_id=flintrock_group_id): + print('Successfully deleted rules of this client from flintrock security group at vpc {}'.format(vpc_id)) if __name__ == '__main__': From 9e9ba1cc0457231080c09d3dcf50da61fc22acbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Wed, 20 Jul 2022 17:52:48 -0300 Subject: [PATCH 258/268] refactor: rename the function to destroy by request id and change in code style - now the function is to destroy by fleet id --- tools/cluster.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 0cb4e3b9..0ca4618b 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -14,7 +14,7 @@ import subprocess from subprocess import check_output, check_call from utils import tag_instances, get_masters, get_active_nodes, get_active_nodes_by_tag -from utils import check_call_with_timeout, check_call_with_timeout_describe, destroy_by_request_spot_ids +from utils import 
check_call_with_timeout, check_call_with_timeout_describe, destroy_by_fleet_id import os import sys from datetime import datetime @@ -383,10 +383,10 @@ def destroy(cluster_name, wait_termination=False, vpc=default_vpc, wait_timeout_ try: # First we test if exist the cluster with the function cluster_exists # get instances ids by json return and cancel the requests - wait_for_intances_to_terminate(cluster_name, wait_termination, wait_timeout_minutes, destroy_by_request_spot_ids(region, cluster_name)) + wait_for_intances_to_terminate(cluster_name, wait_termination, wait_timeout_minutes, destroy_by_fleet_id(region, cluster_name)) # test if the cluster exists and call destroy by fintorock to destroy it - if(destroy_by_flyntrock(region, cluster_name, vpc, script_timeout_total_minutes, script_timeout_inactivity_minutes, wait_termination, wait_timeout_minutes)): + if destroy_by_flyntrock(region, cluster_name, vpc, script_timeout_total_minutes, script_timeout_inactivity_minutes, wait_termination, wait_timeout_minutes): # Here we use the script to destroy the cluster using the name of it all_instances = masters + slaves # To better view about what the script is doing i choose to let the same code of the destroy i have updated From c6418c7c90304eacb7378bc0f8f67d3b0bcfe6f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Wed, 20 Jul 2022 17:54:36 -0300 Subject: [PATCH 259/268] feat: now destroy the cluster by fleet id --- tools/utils.py | 113 +++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 60 deletions(-) diff --git a/tools/utils.py b/tools/utils.py index e579361a..447e1512 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +import ast import logging +from pprint import pprint import boto.ec2 import sys import subprocess @@ -52,87 +54,78 @@ def get_active_nodes_by_tag(region, tag_name, tag_value): filter = {"tag:{0}".format(tag_name):["{0}".format(tag_value)], "instance-state-name":["running"]} 
return conn.get_only_instances(filters=filter) -def get_requests_ids_by_cluster_name(cluster_name): +def get_fleet_id_by_cluster_name(cluster_name): # create a array with the requests ids - requests_ids = [] - folder_full_path = os.path.abspath(os.getcwd()) + fleet_id = '' + file_name = '{0}.json'.format(cluster_name) if folder_log_path: # check if the folder exists and if not create it - folderExist = exists(folder_log_path) - - if folderExist != True: + if not exists(folder_log_path): makedirs(folder_log_path) file_name = '{0}/{1}.json'.format(folder_log_path, cluster_name) - else: - file_name = '{0}.json'.format(cluster_name) # verify if the file exists - file_exists = exists(file_name) - - if file_exists: + if exists(file_name): # open a json log file if exists - json_file = open(file_name) - - # deserialize the json file to object - json_content = json.load(json_file) + with open(file_name) as json_file:# deserialize the json file to object + json_content = json.load(json_file) + + # create a array with the requests ids + for request in json_content: + fleet_id = str(request['FleetId']) - # create a array with the requests ids - for request_id in json_content: - requests_ids.append(str(request_id['SpotInstanceRequestId'])) + return fleet_id - return requests_ids - -def destroy_by_request_spot_ids(region, cluster_name): +def destroy_by_fleet_id(region, cluster_name): conn = boto.ec2.connect_to_region(region) + fleet_instances_ids = [] instances = [] try: # get requets ids from json log file - request_ids = get_requests_ids_by_cluster_name(cluster_name) - logging.info('The amount of requests ids found in json log file: {0}'.format(len(request_ids))) - instances_cancelled = [] - - # test if the request has any id - if len(request_ids) > 0: - spot_requests = conn.get_all_spot_instance_requests() - for request in request_ids: - for spot_request in spot_requests: - if request == spot_request.id: - # cancel the requests returned before - 
conn.cancel_spot_instance_requests(request) - instances_cancelled.append(spot_request) - - # verify if the cancelled list is not empty - if len(instances_cancelled) > 0: - instances_ids = [] + fleet_id = get_fleet_id_by_cluster_name(cluster_name) + logging.info('The fleet id found in json log file: {0}'.format(fleet_id)) - # create the instance list of machines based on requests ids - for request_cancelled in instances_cancelled: - if request_cancelled.instance_id: - instances_ids.append(request_cancelled.instance_id) - - # test if the instance id is not empty - if len(instances_ids) > 0: - instances_requested = conn.get_only_instances(instances_ids) - - # terminate instances from request spot - for instance in instances_requested: - # checking again if the object is in the list to not terminate wrong machines - if instances_ids.index(instance.id) > -1: - if instance.state == 'running': - logging.info('Terminating instance: {0}'.format(instance.id)) - # add only instances that are running to return list - instances.append(instance) - # terminate the instance - instance.terminate() - elif instance.state == 'shutting-down': - # add the instance to the wait list - instances.append(instance) + # call an external script to delete the fleet and retrieve the list of instances + delete_fleet_script = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'delete_fleet.py') + process = subprocess.Popen(["python3", delete_fleet_script, region, fleet_id], stdout=subprocess.PIPE) + stdout_str = process.communicate()[0] + + # the subprocess return a string with the character '\n' separating the delete message and the list of instances + stdout_str_split = stdout_str.split('\n') + + # message of fleet deletion + deleted_fleet = stdout_str_split[0] + logging.info(deleted_fleet) + + # getting the list of the string containing the list of istances + # e.g."['i-0e90a67a64693dc39', 'i-00889275ebe58bb7b', 'i-0982e3e6728044bef']" + fleet_instances = 
ast.literal_eval(stdout_str_split[1]) + fleet_instances_ids.extend(fleet_instances) + + # test if the instance id is not empty + if len(fleet_instances_ids) > 0: + instances_requested = conn.get_only_instances(fleet_instances_ids) + + # terminate instances from request spot + for instance in instances_requested: + # checking again if the object is in the list to not terminate wrong machines + if fleet_instances_ids.index(instance.id) > -1: + if instance.state == 'running': + logging.info('Terminating instance: {0}'.format(instance.id)) + # add only instances that are running to return list + instances.append(instance) + # terminate the instance + instance.terminate() + elif instance.state == 'shutting-down': + # add the instance to the wait list + instances.append(instance) except Exception as e: + logging.error(e) logging.error('Error to destroy cluster {0} by request ids.'.format(cluster_name)) pass From 0df5898730acb737d12ecde79e8df3a2ebefa481 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Wed, 20 Jul 2022 17:56:33 -0300 Subject: [PATCH 260/268] feat: script in python 3 to handle the fleet delete - it is called inside the util.py (python2) through subprocess.Popen(). - this script returns a message for the fleet deleted and a list containing the instance ids. 
--- tools/delete_fleet.py | 46 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tools/delete_fleet.py diff --git a/tools/delete_fleet.py b/tools/delete_fleet.py new file mode 100644 index 00000000..79aa6bbe --- /dev/null +++ b/tools/delete_fleet.py @@ -0,0 +1,46 @@ +import sys +from time import sleep + +import boto3 +from botocore.exceptions import ClientError + +def describe_fleets(region, fleet_id): + ec2 = boto3.client('ec2', region_name=region) + response = ec2.describe_fleets( + FleetIds=[ + fleet_id + ], + ) + + return response['Fleets'][0]['Instances'][0]['InstanceIds'] + +def delete_fleet(region, fleet_id): + ec2 = boto3.client('ec2', region_name=region) + response = ec2.delete_fleets( + FleetIds=[ + fleet_id, + ], + TerminateInstances=True + ) + + return response['SuccessfulFleetDeletions'][0]['CurrentFleetState'] + + +if __name__ == '__main__': + region = sys.argv[1] + fleet_id = sys.argv[2] + try: + # Delete the fleet + fleet_deleted_states = ["deleted", "deleted_running", "deleted_terminating"] + fleet_state = None + while fleet_state not in fleet_deleted_states: + sleep(5) + fleet_state = delete_fleet(region=region, fleet_id=fleet_id) + print(f"Fleet deleted. 
Fleet state: {fleet_state}") + + # get the instance ids from the fleet + print(describe_fleets(region=region, fleet_id=fleet_id)) + except (ClientError, Exception) as e: + print(e) + + From 26cd862cb26d68c26a5338c9854ba357e8a51ace Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Thu, 28 Jul 2022 16:26:49 -0300 Subject: [PATCH 261/268] hotfix: fix the output when there is no json file with fleet id --- tools/delete_fleet.py | 11 +++++++---- tools/utils.py | 12 ++++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tools/delete_fleet.py b/tools/delete_fleet.py index 79aa6bbe..70f1ce0f 100644 --- a/tools/delete_fleet.py +++ b/tools/delete_fleet.py @@ -11,8 +11,12 @@ def describe_fleets(region, fleet_id): fleet_id ], ) - - return response['Fleets'][0]['Instances'][0]['InstanceIds'] + errors = response['Fleets'][0]['Errors'] + instances = response['Fleets'][0]['Instances'] + # to ensure we are returning an array anyway + if len(errors) > 0 and len(instances) == 0: + return [''] + return instances[0]['InstanceIds'] def delete_fleet(region, fleet_id): ec2 = boto3.client('ec2', region_name=region) @@ -42,5 +46,4 @@ def delete_fleet(region, fleet_id): print(describe_fleets(region=region, fleet_id=fleet_id)) except (ClientError, Exception) as e: print(e) - - + \ No newline at end of file diff --git a/tools/utils.py b/tools/utils.py index 447e1512..8c97d23f 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -85,8 +85,12 @@ def destroy_by_fleet_id(region, cluster_name): instances = [] try: - # get requets ids from json log file + # get fleet id from json log file fleet_id = get_fleet_id_by_cluster_name(cluster_name) + + if fleet_id in [None, '']: + raise Exception('There is no fleet id to delete. 
Keep going.') + logging.info('The fleet id found in json log file: {0}'.format(fleet_id)) # call an external script to delete the fleet and retrieve the list of instances @@ -106,8 +110,8 @@ def destroy_by_fleet_id(region, cluster_name): fleet_instances = ast.literal_eval(stdout_str_split[1]) fleet_instances_ids.extend(fleet_instances) - # test if the instance id is not empty - if len(fleet_instances_ids) > 0: + # test if the instance id is not empty and contains an instance id for sure + if len(fleet_instances_ids) > 0 and fleet_instances_ids[0].startswith('i-'): instances_requested = conn.get_only_instances(fleet_instances_ids) # terminate instances from request spot @@ -126,7 +130,7 @@ def destroy_by_fleet_id(region, cluster_name): except Exception as e: logging.error(e) - logging.error('Error to destroy cluster {0} by request ids.'.format(cluster_name)) + logging.error('Error to destroy cluster {0} by fleet id.'.format(cluster_name)) pass return instances From 421f95980ed8c22cb771527666deaab921dc4637 Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Fri, 29 Jul 2022 16:49:00 -0300 Subject: [PATCH 262/268] added javaagent parameter (spark-submit) to enable metrics by jmx protocol --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 5078786b..bee42f3e 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || 
notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" From ec8a2ef0ae3e71ba018b6115a9a463409a73a867 Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Wed, 3 Aug 2022 12:00:28 -0300 Subject: [PATCH 263/268] added parameter spark.metrics.conf --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index bee42f3e..c01df7e7 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" 
--driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --class "spark.metrics.conf=/tmp/jmx/metrics.properties" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" From 82f7fd83397188723402c89314483c2cd068bd71 Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Wed, 3 Aug 2022 16:15:32 -0300 Subject: [PATCH 264/268] added conf spark.metrics.conf --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index c01df7e7..573c888b 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --class "spark.metrics.conf=/tmp/jmx/metrics.properties" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc 
-XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --conf "spark.metrics.conf=/tmp/jmx/metrics.properties" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" From fbb8128a523793bb9fcc730ea1464094179f8b5d Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Thu, 4 Aug 2022 21:12:13 -0300 Subject: [PATCH 265/268] added --packages parameter in spark-submit command --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 5969e00a..84ceec72 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --conf "spark.metrics.conf=/tmp/jmx/metrics.properties" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf 
"spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --conf "spark.metrics.conf=/tmp/jmx/metrics.properties" --repositories "https://raw.github.com/banzaicloud/spark-metrics/master/maven-repo/releases" --packages "com.banzaicloud:spark-metrics_2.11:2.4-1.0.6,io.prometheus:simpleclient:0.3.0,io.prometheus:simpleclient_dropwizard:0.3.0,io.prometheus:simpleclient_pushgateway:0.3.0,io.dropwizard.metrics:metrics-core:3.1.2" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" From 44bece63a15766f09f9c6526a4834bd5d00c0f0d Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Tue, 9 Aug 2022 09:35:38 -0300 Subject: [PATCH 266/268] removed spark.metrics.conf parameter from spark-submit --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 84ceec72..a8cbe8ee 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --conf "spark.metrics.conf=/tmp/jmx/metrics.properties" --repositories "https://raw.github.com/banzaicloud/spark-metrics/master/maven-repo/releases" --packages 
"com.banzaicloud:spark-metrics_2.11:2.4-1.0.6,io.prometheus:simpleclient:0.3.0,io.prometheus:simpleclient_dropwizard:0.3.0,io.prometheus:simpleclient_pushgateway:0.3.0,io.dropwizard.metrics:metrics-core:3.1.2" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --repositories "https://mvnrepository.com/artifact/com.banzaicloud/spark-metrics" --packages "com.banzaicloud:spark-metrics_2.11:2.4-1.0.6,io.prometheus:simpleclient:0.3.0,io.prometheus:simpleclient_dropwizard:0.3.0,io.prometheus:simpleclient_pushgateway:0.3.0,io.dropwizard.metrics:metrics-core:3.1.2" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" From 98594ff2171b695011612fa07fb118a3fa74fe47 Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Tue, 9 Aug 2022 17:08:45 -0300 Subject: [PATCH 267/268] removed --repositories and --packages parameters from spark-submit --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index a8cbe8ee..ac9b828f 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else 
JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --repositories "https://mvnrepository.com/artifact/com.banzaicloud/spark-metrics" --packages "com.banzaicloud:spark-metrics_2.11:2.4-1.0.6,io.prometheus:simpleclient:0.3.0,io.prometheus:simpleclient_dropwizard:0.3.0,io.prometheus:simpleclient_pushgateway:0.3.0,io.dropwizard.metrics:metrics-core:3.1.2" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" From 90983fbb8c017c920118bf1a1a9bfea7ca05c459 Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Mon, 15 Aug 2022 11:13:15 -0300 Subject: [PATCH 268/268] removed javaagent parameter --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index ac9b828f..903d618e 100755 --- 
a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS"