From f9ffdd5fc5c1a1b539c9e1fc29e1cab1c8829e85 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 7 May 2015 11:43:52 -0300 Subject: [PATCH 001/268] Fixed typo --- src/main/scala/ignition/core/jobs/utils/RDDUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index fc42ded5..7e75d5ec 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -84,7 +84,7 @@ object RDDUtils { rdd.aggregateByKey(List.empty[V])( (lst, v) => if (lst.size >= n) { - logger.warn(s"Ignoring value '$v' due aggregation result of size '${lst.size}' is bigger then n = '$n'") + logger.warn(s"Ignoring value '$v' due aggregation result of size '${lst.size}' is bigger than n=$n") lst } else { v :: lst From b0e168e5d79b2cd46fd9722eca572fb358e3d421 Mon Sep 17 00:00:00 2001 From: ZaGo Date: Fri, 8 May 2015 13:38:26 -0300 Subject: [PATCH 002/268] refactoring to allow changes in ignition.mail --- .../ignition/core/jobs/utils/SparkContextUtils.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 29c32112..a1090d20 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -130,6 +130,13 @@ object SparkContextUtils { } + def getTextFiles(paths: Seq[String], synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { + if (synchLocally) + processTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths) + else + processTextFiles(paths, minimumPaths) + } + def filterAndGetTextFiles(path: String, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, @@ -144,10 +151,7 @@ 
object SparkContextUtils { val paths = getFilteredPaths(Seq(path), requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) if (paths.size < minimumPaths) throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") - else if (synchLocally) - processTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths) - else - processTextFiles(paths, minimumPaths) + getTextFiles(paths, synchLocally, forceSynch, minimumPaths) } private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { From a8e9734b844bf9d5bdbe0572c0a7e247399983de Mon Sep 17 00:00:00 2001 From: Filipe Niero Felisbino Date: Fri, 8 May 2015 15:49:25 -0300 Subject: [PATCH 003/268] Fix ec2 request issue --- tools/spark-ec2/spark_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 5fdf0467..a608f9ce 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -540,7 +540,7 @@ def launch_cluster(conn, opts, cluster_name): (invalid[0].id, invalid[0].status.message)) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves - reservations = conn.get_all_reservations(active_instance_ids) + reservations = conn.get_all_reservations([r.instance_id for r in active_instance_ids]) slave_nodes = [] for r in reservations: slave_nodes += r.instances From 9ae5178549af17b57a19e0ff2fefcb385c5401bf Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 18 May 2015 10:41:19 -0300 Subject: [PATCH 004/268] Minor improvements --- build.sbt | 2 +- src/main/scala/ignition/core/utils/DateUtils.scala | 10 +++++++++- src/main/scala/ignition/core/utils/FutureUtils.scala | 4 +++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index 095c1228..4dfcd1ae 100644 --- a/build.sbt +++ 
b/build.sbt @@ -17,7 +17,7 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "com.github.nscala-time" %% "nscala-time" % "0.8.0" +libraryDependencies += "com.github.nscala-time" %% "nscala-time" % "2.0.0" libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" diff --git a/src/main/scala/ignition/core/utils/DateUtils.scala b/src/main/scala/ignition/core/utils/DateUtils.scala index 231817c7..c3fb5163 100644 --- a/src/main/scala/ignition/core/utils/DateUtils.scala +++ b/src/main/scala/ignition/core/utils/DateUtils.scala @@ -1,6 +1,6 @@ package ignition.core.utils -import org.joda.time.{Period, DateTimeZone, DateTime} +import org.joda.time.{Seconds, Period, DateTimeZone, DateTime} import org.joda.time.format.ISODateTimeFormat object DateUtils { @@ -21,4 +21,12 @@ object DateUtils { def isEqualOrBefore(other: DateTime) = dateTime.isBefore(other) || dateTime.saneEqual(other) } + + implicit class SecondsImprovements(val seconds: Seconds) { + + implicit def toScalaDuration: scala.concurrent.duration.FiniteDuration = { + scala.concurrent.duration.Duration(seconds.getSeconds, scala.concurrent.duration.SECONDS) + } + + } } diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 068d63bc..81b0490e 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,10 +1,12 @@ package ignition.core.utils -import scala.concurrent.{ExecutionContext, Future, Promise} +import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} import scala.util.{Failure, Success} object FutureUtils { + def blockingFuture[T](body: =>T)(implicit ec: ExecutionContext): Future[T] = future { blocking { body } } + implicit class FutureImprovements[V](future: Future[V]) { def 
toOptionOnFailure(errorHandler: (Throwable) => Option[V])(implicit ec: ExecutionContext): Future[Option[V]] = { future.map(Option.apply).recover { case t => errorHandler(t) } From 53cfe885d21307acb4072260f68d6d2f718dc746 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 18 May 2015 17:05:35 -0300 Subject: [PATCH 005/268] remove unused lib --- build.sbt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 4dfcd1ae..046d9503 100644 --- a/build.sbt +++ b/build.sbt @@ -17,8 +17,6 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "com.github.nscala-time" %% "nscala-time" % "2.0.0" - libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" @@ -29,6 +27,10 @@ libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0" libraryDependencies += "net.java.dev.jets3t" % "jets3t" % "0.7.1" +libraryDependencies += "joda-time" % "joda-time" % "2.7" + +libraryDependencies += "org.joda" % "joda-convert" % "1.7" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" From d965fd6ad12bbf2fadf9302837ec7b242661eba8 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 1 Jun 2015 14:10:58 -0300 Subject: [PATCH 006/268] Added utilitary function for better stack traces --- .../scala/ignition/core/utils/BetterTrace.scala | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/main/scala/ignition/core/utils/BetterTrace.scala diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala new file mode 100644 index 00000000..158e261e --- /dev/null +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -0,0 +1,13 @@ 
+package ignition.core.utils + +// Used mainly to augment scalacheck traces in scalatest +trait BetterTrace { + def fail(message: String): Nothing + def withBetterTrace(block: => Unit): Unit = + try { + block + } catch { + case t: Throwable => fail(s"${t.getMessage}: ${t.getStackTraceString}") + } + +} From c89961984bbd4be54c63366d4df5b915a25c89fc Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 8 Jun 2015 18:08:22 -0300 Subject: [PATCH 007/268] Updated scalatest to fix conflicts --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 046d9503..be7e1b12 100644 --- a/build.sbt +++ b/build.sbt @@ -17,7 +17,7 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" +libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.1.3" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" From 82a09c0ff017484bfbada7d1a4b451e7c288a025 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 11 Jun 2015 16:34:05 -0300 Subject: [PATCH 008/268] Improved s3 service --- src/main/scala/ignition/core/utils/S3Client.scala | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala index f02d7acd..a988aa7f 100644 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ b/src/main/scala/ignition/core/utils/S3Client.scala @@ -37,7 +37,14 @@ class S3Client { } def list(bucket: String, key: String): Array[S3Object] = { - service.listObjects(bucket, key, null, 99999L) + service.listObjectsChunked(bucket, key, null, 99999L, null, true).getObjects + } + + def copyFile(sourceBucket: String, sourceKey: String, destBucket: String, destKey: String, destContentType: Option[String] = None): Unit = { + val destFile = new 
S3Object(destKey) + val replaceMetaData = destContentType.isDefined + destContentType.foreach(contentType => destFile.setContentType(contentType)) + service.copyObject(sourceBucket, sourceKey, destBucket, destFile, replaceMetaData) } def fileExists(bucket: String, key: String): Boolean = { From c32cce56fab86bc3372bd17adc1f7745f3d0c797 Mon Sep 17 00:00:00 2001 From: Flavio Sales Truzzi Date: Thu, 11 Jun 2015 19:11:30 -0300 Subject: [PATCH 009/268] Add optinal content type --- src/main/scala/ignition/core/utils/S3Client.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala index a988aa7f..fe509a4b 100644 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ b/src/main/scala/ignition/core/utils/S3Client.scala @@ -26,9 +26,9 @@ class S3Client { null, null, jets3tProperties ) - def writeContent(bucket: String, key: String, content: String): S3Object = { + def writeContent(bucket: String, key: String, content: String, contentType: String = "text/plain"): S3Object = { val obj = new S3Object(key, content) - obj.setContentType("text/plain") + obj.setContentType(contentType) service.putObject(bucket, obj) } From 8f51a86897eb401c9190a8e1fbc2e40359e8a678 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 12 Jun 2015 16:27:35 -0300 Subject: [PATCH 010/268] Added content encoding --- src/main/scala/ignition/core/utils/S3Client.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala index fe509a4b..b806b376 100644 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ b/src/main/scala/ignition/core/utils/S3Client.scala @@ -40,9 +40,13 @@ class S3Client { service.listObjectsChunked(bucket, key, null, 99999L, null, true).getObjects } - def copyFile(sourceBucket: String, sourceKey: String, destBucket: String, destKey: 
String, destContentType: Option[String] = None): Unit = { + def copyFile(sourceBucket: String, sourceKey: String, + destBucket: String, destKey: String, + destContentType: Option[String] = None, + destContentEncoding: Option[String] = None): Unit = { val destFile = new S3Object(destKey) - val replaceMetaData = destContentType.isDefined + val replaceMetaData = destContentType.isDefined || destContentEncoding.isDefined + destContentEncoding.foreach(encoding => destFile.setContentEncoding(encoding)) destContentType.foreach(contentType => destFile.setContentType(contentType)) service.copyObject(sourceBucket, sourceKey, destBucket, destFile, replaceMetaData) } From c752d9379edc91e37c261eed5610dfe09a3a06bf Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 18 Jun 2015 20:01:26 -0300 Subject: [PATCH 011/268] Upgraded scalatest --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index be7e1b12..c4723faf 100644 --- a/build.sbt +++ b/build.sbt @@ -17,7 +17,7 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.1.3" +libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.2.4" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" From 59f818da5aedc7dd919eca2d6e58f21208672316 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 23 Jun 2015 18:01:01 -0300 Subject: [PATCH 012/268] Added removeEmpty to Maps --- src/main/scala/ignition/core/utils/CollectionUtils.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index 27977270..5994b153 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -103,4 +103,10 @@ object 
CollectionUtils { .toList } } + + + implicit class CollectionMap[K, V <: TraversableOnce[Any]](map: Map[K, V]) { + def removeEmpty(): Map[K, V] = + map.filter { case (k, v) => !v.isEmpty } + } } From 842ca9dba49ed76ddedb4990779928b01d46cfc3 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 23 Jun 2015 18:01:47 -0300 Subject: [PATCH 013/268] Added removeEmpty to Maps --- src/main/scala/ignition/core/utils/CollectionUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index 5994b153..52828ca7 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -107,6 +107,6 @@ object CollectionUtils { implicit class CollectionMap[K, V <: TraversableOnce[Any]](map: Map[K, V]) { def removeEmpty(): Map[K, V] = - map.filter { case (k, v) => !v.isEmpty } + map.filter { case (k, v) => v.nonEmpty } } } From d05f836d8967657fd6df96293d65e013f45861e5 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 25 Jun 2015 09:34:14 -0300 Subject: [PATCH 014/268] exclude slf4j-log4j12 backend --- build.sbt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index c4723faf..7eb2bffe 100644 --- a/build.sbt +++ b/build.sbt @@ -13,7 +13,9 @@ ideaExcludeFolders += ".idea_modules" // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided").exclude("org.apache.hadoop", "hadoop-client") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided") + .exclude("org.apache.hadoop", "hadoop-client") + .exclude("org.slf4j", "slf4j-log4j12") libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") From c9abcd53eb2d839d61c017e0b574e00be911be4c Mon Sep 17 00:00:00 2001 
From: sisso Date: Tue, 30 Jun 2015 14:41:15 -0300 Subject: [PATCH 015/268] added method that allow to map future using success/failure --- .../ignition/core/utils/FutureUtils.scala | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 81b0490e..41cf75a3 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,7 +1,7 @@ package ignition.core.utils import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} -import scala.util.{Failure, Success} +import scala.util.{Failure, Success, Try} object FutureUtils { @@ -11,6 +11,23 @@ object FutureUtils { def toOptionOnFailure(errorHandler: (Throwable) => Option[V])(implicit ec: ExecutionContext): Future[Option[V]] = { future.map(Option.apply).recover { case t => errorHandler(t) } } + + /** + * Appear to be redundant. But its the only way to map a future with + * Success and Failure in same algorithm without split it to use map/recover + * or transform. 
+ * + * future.asTry.map { case Success(v) => 1; case Failure(e) => 0 } + * + * instead + * + * future.map(i=>1).recover(case _: Exception => 0) + * future.transform(=> 1, => 0) + * + */ + def asTry()(implicit ec: ExecutionContext) : Future[Try[V]] = { + future.map(v => Success(v)).recover { case e: Exception => Failure(e) } + } } implicit class FutureGeneratorImprovements[V](generator: Iterable[() => Future[V]]){ From 48f4e2c2798c3d5a2262234ae001ae6cb5ad5cc6 Mon Sep 17 00:00:00 2001 From: sisso Date: Tue, 7 Jul 2015 18:09:12 -0300 Subject: [PATCH 016/268] change catch to NonFatal --- src/main/scala/ignition/core/utils/FutureUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 41cf75a3..95b44c2f 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,6 +1,7 @@ package ignition.core.utils import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} +import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} object FutureUtils { @@ -22,11 +23,10 @@ object FutureUtils { * instead * * future.map(i=>1).recover(case _: Exception => 0) - * future.transform(=> 1, => 0) * */ def asTry()(implicit ec: ExecutionContext) : Future[Try[V]] = { - future.map(v => Success(v)).recover { case e: Exception => Failure(e) } + future.map(v => Success(v)).recover { case NonFatal(e) => Failure(e) } } } From bab487acfb4e74c7115a764d5774f5eefaa40630 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 13 Aug 2015 15:54:42 -0300 Subject: [PATCH 017/268] attempt to update spark_ec2.py --- tools/spark-ec2/spark_ec2.py | 708 ++++++++++++++++++++++++----------- 1 file changed, 482 insertions(+), 226 deletions(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index a608f9ce..8cc44d30 100755 --- 
a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -19,9 +19,11 @@ # limitations under the License. # -from __future__ import with_statement +from __future__ import division, print_function, with_statement +import codecs import hashlib +import itertools import logging import os import os.path @@ -36,13 +38,20 @@ import tempfile import textwrap import time -import urllib2 import warnings from datetime import datetime from optparse import OptionParser from sys import stderr -SPARK_EC2_VERSION = "1.3.0" +if sys.version < "3": + from urllib2 import urlopen, Request, HTTPError +else: + from urllib.request import urlopen, Request + from urllib.error import HTTPError + raw_input = input + xrange = range + +SPARK_EC2_VERSION = "1.4.0" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -60,14 +69,84 @@ "1.2.0", "1.2.1", "1.3.0", + "1.3.1", + "1.4.0", ]) +SPARK_TACHYON_MAP = { + "1.0.0": "0.4.1", + "1.0.1": "0.4.1", + "1.0.2": "0.4.1", + "1.1.0": "0.5.0", + "1.1.1": "0.5.0", + "1.2.0": "0.5.0", + "1.2.1": "0.5.0", + "1.3.0": "0.5.0", + "1.3.1": "0.5.0", + "1.4.0": "0.6.4", +} + DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark" # Default location to get the spark-ec2 scripts (and ami-list) from -DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/mesos/spark-ec2" -DEFAULT_SPARK_EC2_BRANCH = "branch-1.3" +DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/amplab/spark-ec2" +DEFAULT_SPARK_EC2_BRANCH = "branch-1.4" + + +def setup_external_libs(libs): + """ + Download external libraries from PyPI to SPARK_EC2_DIR/lib/ and prepend them to our PATH. 
+ """ + PYPI_URL_PREFIX = "https://pypi.python.org/packages/source" + SPARK_EC2_LIB_DIR = os.path.join(SPARK_EC2_DIR, "lib") + + if not os.path.exists(SPARK_EC2_LIB_DIR): + print("Downloading external libraries that spark-ec2 needs from PyPI to {path}...".format( + path=SPARK_EC2_LIB_DIR + )) + print("This should be a one-time operation.") + os.mkdir(SPARK_EC2_LIB_DIR) + + for lib in libs: + versioned_lib_name = "{n}-{v}".format(n=lib["name"], v=lib["version"]) + lib_dir = os.path.join(SPARK_EC2_LIB_DIR, versioned_lib_name) + + if not os.path.isdir(lib_dir): + tgz_file_path = os.path.join(SPARK_EC2_LIB_DIR, versioned_lib_name + ".tar.gz") + print(" - Downloading {lib}...".format(lib=lib["name"])) + download_stream = urlopen( + "{prefix}/{first_letter}/{lib_name}/{lib_name}-{lib_version}.tar.gz".format( + prefix=PYPI_URL_PREFIX, + first_letter=lib["name"][:1], + lib_name=lib["name"], + lib_version=lib["version"] + ) + ) + with open(tgz_file_path, "wb") as tgz_file: + tgz_file.write(download_stream.read()) + with open(tgz_file_path, "rb") as tar: + if hashlib.md5(tar.read()).hexdigest() != lib["md5"]: + print("ERROR: Got wrong md5sum for {lib}.".format(lib=lib["name"]), file=stderr) + sys.exit(1) + tar = tarfile.open(tgz_file_path) + tar.extractall(path=SPARK_EC2_LIB_DIR) + tar.close() + os.remove(tgz_file_path) + print(" - Finished downloading {lib}.".format(lib=lib["name"])) + sys.path.insert(1, lib_dir) + + +# Only PyPI libraries are supported. 
+external_libs = [ + { + "name": "boto", + "version": "2.34.0", + "md5": "5556223d2d0cc4d06dd4829e671dcecd" + } +] + +setup_external_libs(external_libs) import boto from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType @@ -107,7 +186,7 @@ def parse_args(): help="Master instance type (leave empty for same as instance-type)") parser.add_option( "-r", "--region", default="us-east-1", - help="EC2 region zone to launch instances in") + help="EC2 region used to launch instances in, or to find them in (default: %default)") parser.add_option( "-z", "--zone", default="", help="Availability zone to launch instances in, or 'all' to spread " + @@ -133,9 +212,19 @@ def parse_args(): "--spark-ec2-git-branch", default=DEFAULT_SPARK_EC2_BRANCH, help="Github repo branch of spark-ec2 to use (default: %default)") + parser.add_option( + "--deploy-root-dir", + default=None, + help="A directory to copy into / on the first master. " + + "Must be absolute. Note that a trailing slash is handled as per rsync: " + + "If you omit it, the last directory of the --deploy-root-dir path will be created " + + "in / before copying its contents. If you append the trailing slash, " + + "the directory is not created and its contents are copied directly into /. " + + "(default: %default).") parser.add_option( "--hadoop-major-version", default="1", - help="Major version of Hadoop (default: %default)") + help="Major version of Hadoop. Valid options are 1 (Hadoop 1.0.4), 2 (CDH 4.7.1), yarn " + + "(Hadoop 2.4.0) (default: %default)") parser.add_option( "-D", metavar="[ADDRESS:]PORT", dest="proxy_port", help="Use SSH dynamic port forwarding to create a SOCKS proxy at " + @@ -155,7 +244,7 @@ def parse_args(): help="Number of EBS volumes to attach to each node as /vol[x]. " + "The volumes will be deleted when the instances terminate. " + "Only possible on EBS-backed AMIs. " + - "EBS volumes are only attached if --ebs-vol-size > 0." 
+ + "EBS volumes are only attached if --ebs-vol-size > 0. " + "Only support up to 8 EBS volumes.") parser.add_option( "--placement-group", type="string", default=None, @@ -187,14 +276,15 @@ def parse_args(): help="Launch fresh slaves, but use an existing stopped master if possible") parser.add_option( "--worker-instances", type="int", default=1, - help="Number of instances per worker: variable SPARK_WORKER_INSTANCES (default: %default)") + help="Number of instances per worker: variable SPARK_WORKER_INSTANCES. Not used if YARN " + + "is used as Hadoop major version (default: %default)") parser.add_option( "--master-opts", type="string", default="", help="Extra options to give to master through SPARK_MASTER_OPTS variable " + "(e.g -Dspark.worker.timeout=180)") parser.add_option( "--user-data", type="string", default="", - help="Path to a user-data file (most AMI's interpret this as an initialization script)") + help="Path to a user-data file (most AMIs interpret this as an initialization script)") parser.add_option( "--security-group-prefix", type="string", default=None, help="Use this prefix for the security group rather than the cluster name.") @@ -204,6 +294,10 @@ def parse_args(): parser.add_option( "--additional-security-group", type="string", default="", help="Additional security group to place the machines in") + parser.add_option( + "--additional-tags", type="string", default="", + help="Additional tags to set on the machines; tags are comma-separated, while name and " + + "value are colon separated; ex: \"Task:MySparkProject,Env:production\"") parser.add_option( "--copy-aws-credentials", action="store_true", default=False, help="Add AWS credentials to hadoop configuration to allow Spark to access S3") @@ -216,6 +310,17 @@ def parse_args(): parser.add_option( "--spot-timeout", type="int", default=45, help="Maximum amount of time (in minutes) to wait for spot requests to be fulfilled") + parser.add_option( + "--private-ips", action="store_true", default=False, 
+ help="Use private IPs for instances rather than public if VPC/subnet " + + "requires that.") + parser.add_option( + "--instance-initiated-shutdown-behavior", default="terminate", + choices=["stop", "terminate"], + help="Whether instances should terminate when shut down or just stop") + parser.add_option( + "--instance-profile-name", default=None, + help="IAM profile name to launch instances under") (opts, args) = parser.parse_args() if len(args) != 2: @@ -228,14 +333,16 @@ def parse_args(): home_dir = os.getenv('HOME') if home_dir is None or not os.path.isfile(home_dir + '/.boto'): if not os.path.isfile('/etc/boto.cfg'): - if os.getenv('AWS_ACCESS_KEY_ID') is None: - print >> stderr, ("ERROR: The environment variable AWS_ACCESS_KEY_ID " + - "must be set") - sys.exit(1) - if os.getenv('AWS_SECRET_ACCESS_KEY') is None: - print >> stderr, ("ERROR: The environment variable AWS_SECRET_ACCESS_KEY " + - "must be set") - sys.exit(1) + # If there is no boto config, check aws credentials + if not os.path.isfile(home_dir + '/.aws/credentials'): + if os.getenv('AWS_ACCESS_KEY_ID') is None: + print("ERROR: The environment variable AWS_ACCESS_KEY_ID must be set", + file=stderr) + sys.exit(1) + if os.getenv('AWS_SECRET_ACCESS_KEY') is None: + print("ERROR: The environment variable AWS_SECRET_ACCESS_KEY must be set", + file=stderr) + sys.exit(1) return (opts, action, cluster_name) @@ -246,7 +353,7 @@ def get_or_make_group(conn, name, vpc_id): if len(group) > 0: return group[0] else: - print "Creating security group " + name + print("Creating security group " + name) return conn.create_security_group(name, "Spark EC2 group", vpc_id) def check_if_http_resource_exists(resource): @@ -270,12 +377,12 @@ def get_validate_spark_version(version, repo): if check_if_http_resource_exists: return version else: - print >> stderr, "Unable to validate pre-built spark version {version}".format(version=version) + print("Unable to validate pre-built spark version 
{version}".format(version=version), file=stderr) sys.exit(1) elif "." in version: version = version.replace("v", "") if version not in VALID_SPARK_VERSIONS: - print >> stderr, "Don't know about Spark version: {v}".format(v=version) + print("Don't know about Spark version: {v}".format(v=version), file=stderr) sys.exit(1) return version else: @@ -288,84 +395,93 @@ def get_validate_spark_version(version, repo): return version -# Check whether a given EC2 instance object is in a state we consider active, -# i.e. not terminating or terminated. We count both stopping and stopped as -# active since we can restart stopped clusters. -def is_active(instance): - return (instance.state in ['pending', 'running', 'stopping', 'stopped']) - - # Source: http://aws.amazon.com/amazon-linux-ami/instance-type-matrix/ -# Last Updated: 2014-06-20 +# Last Updated: 2015-06-19 # For easy maintainability, please keep this manually-inputted dictionary sorted by key. EC2_INSTANCE_TYPES = { "c1.medium": "pvm", "c1.xlarge": "pvm", + "c3.large": "pvm", + "c3.xlarge": "pvm", "c3.2xlarge": "pvm", "c3.4xlarge": "pvm", "c3.8xlarge": "pvm", - "c3.large": "pvm", - "c3.xlarge": "pvm", + "c4.large": "hvm", + "c4.xlarge": "hvm", + "c4.2xlarge": "hvm", + "c4.4xlarge": "hvm", + "c4.8xlarge": "hvm", "cc1.4xlarge": "hvm", "cc2.8xlarge": "hvm", "cg1.4xlarge": "hvm", "cr1.8xlarge": "hvm", + "d2.xlarge": "hvm", + "d2.2xlarge": "hvm", + "d2.4xlarge": "hvm", + "d2.8xlarge": "hvm", + "g2.2xlarge": "hvm", + "g2.8xlarge": "hvm", "hi1.4xlarge": "pvm", "hs1.8xlarge": "pvm", + "i2.xlarge": "hvm", "i2.2xlarge": "hvm", "i2.4xlarge": "hvm", "i2.8xlarge": "hvm", - "i2.xlarge": "hvm", - "m1.large": "pvm", - "m1.medium": "pvm", "m1.small": "pvm", + "m1.medium": "pvm", + "m1.large": "pvm", "m1.xlarge": "pvm", + "m2.xlarge": "pvm", "m2.2xlarge": "pvm", "m2.4xlarge": "pvm", - "m2.xlarge": "pvm", - "m3.2xlarge": "hvm", - "m3.large": "hvm", "m3.medium": "hvm", + "m3.large": "hvm", "m3.xlarge": "hvm", + "m3.2xlarge": "hvm", + 
"m4.large": "hvm", + "m4.xlarge": "hvm", + "m4.2xlarge": "hvm", + "m4.4xlarge": "hvm", + "m4.10xlarge": "hvm", + "r3.large": "hvm", + "r3.xlarge": "hvm", "r3.2xlarge": "hvm", "r3.4xlarge": "hvm", "r3.8xlarge": "hvm", - "r3.large": "hvm", - "r3.xlarge": "hvm", "t1.micro": "pvm", - "t2.medium": "hvm", "t2.micro": "hvm", "t2.small": "hvm", - "d2.2xlarge": "hvm", - "d2.4xlarge": "hvm", - "d2.8xlarge": "hvm", - "d2.large": "hvm", - "d2.xlarge": "hvm", + "t2.medium": "hvm", + "t2.large": "hvm", } +def get_tachyon_version(spark_version): + return SPARK_TACHYON_MAP.get(spark_version, "") + + # Attempt to resolve an appropriate AMI given the architecture and region of the request. -def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branch): - if instance_type in EC2_INSTANCE_TYPES: - instance_type = EC2_INSTANCE_TYPES[instance_type] +def get_spark_ami(opts): + if opts.instance_type in EC2_INSTANCE_TYPES: + instance_type = EC2_INSTANCE_TYPES[opts.instance_type] else: instance_type = "pvm" - print >> stderr,\ - "Don't recognize %s, assuming type is pvm" % instance_type + print("Don't recognize %s, assuming type is pvm" % opts.instance_type, file=stderr) # URL prefix from which to fetch AMI information ami_prefix = "{r}/{b}/ami-list".format( - r=spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), - b=spark_ec2_git_branch) + r=opts.spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), + b=opts.spark_ec2_git_branch) - ami_path = "%s/%s/%s" % (ami_prefix, region, instance_type) + ami_path = "%s/%s/%s" % (ami_prefix, opts.region, instance_type) + reader = codecs.getreader("ascii") try: - ami = urllib2.urlopen(ami_path).read().strip() - print "Spark AMI for %s: %s" % (instance_type, ami) + ami = reader(urlopen(ami_path)).read().strip() except: - print >> stderr, "Could not resolve AMI at: " + ami_path + print("Could not resolve AMI at: " + ami_path, file=stderr) sys.exit(1) + print("Spark AMI: " + ami) 
return ami @@ -375,11 +491,11 @@ def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branc # Fails if there already instances running in the cluster's groups. def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: - print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." + print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr) sys.exit(1) if opts.key_pair is None: - print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." + print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr) sys.exit(1) user_data_content = None @@ -387,7 +503,7 @@ def launch_cluster(conn, opts, cluster_name): with open(opts.user_data) as user_data_file: user_data_content = user_data_file.read() - print "Setting up security groups..." + print("Setting up security groups...") if opts.security_group_prefix is None: master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id) slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id) @@ -421,6 +537,17 @@ def launch_cluster(conn, opts, cluster_name): master_group.authorize('tcp', 50070, 50070, authorized_address) master_group.authorize('tcp', 60070, 60070, authorized_address) master_group.authorize('tcp', 4040, 4045, authorized_address) + # Rstudio (GUI for R) needs port 8787 for web access + master_group.authorize('tcp', 8787, 8787, authorized_address) + # HDFS NFS gateway requires 111,2049,4242 for tcp & udp + master_group.authorize('tcp', 111, 111, authorized_address) + master_group.authorize('udp', 111, 111, authorized_address) + master_group.authorize('tcp', 2049, 2049, authorized_address) + master_group.authorize('udp', 2049, 2049, authorized_address) + master_group.authorize('tcp', 4242, 4242, authorized_address) + master_group.authorize('udp', 4242, 4242, authorized_address) + # RM in YARN mode uses 8088 + master_group.authorize('tcp', 8088, 8088, 
authorized_address) if opts.ganglia: master_group.authorize('tcp', 5080, 5080, authorized_address) if slave_group.rules == []: # Group was just now created @@ -451,13 +578,13 @@ def launch_cluster(conn, opts, cluster_name): existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): - print >> stderr, ("ERROR: There are already instances running in " + - "group %s or %s" % (master_group.name, slave_group.name)) + print("ERROR: There are already instances running in group %s or %s" % + (master_group.name, slave_group.name), file=stderr) sys.exit(1) # Figure out Spark AMI if opts.ami is None: - opts.ami = get_spark_ami(opts.instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) + opts.ami = get_spark_ami(opts) if opts.master_ami is None: opts.master_ami = get_spark_ami(opts.master_instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) @@ -468,12 +595,12 @@ def launch_cluster(conn, opts, cluster_name): additional_group_ids = [sg.id for sg in conn.get_all_security_groups() if opts.additional_security_group in (sg.name, sg.id)] - print "Launching instances..." 
+ print("Launching instances...") try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: - print >> stderr, "Could not find AMI " + opts.ami + print("Could not find AMI " + opts.ami, file=stderr) sys.exit(1) try: @@ -502,8 +629,8 @@ def launch_cluster(conn, opts, cluster_name): # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price - print ("Requesting %d slaves as spot instances with price $%.3f" % - (opts.slaves, opts.spot_price)) + print("Requesting %d slaves as spot instances with price $%.3f" % + (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 @@ -522,12 +649,13 @@ def launch_cluster(conn, opts, cluster_name): block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, - user_data=user_data_content) + user_data=user_data_content, + instance_profile_name=opts.instance_profile_name) my_req_ids += [req.id for req in slave_reqs] i += 1 start_time = datetime.now() - print "Waiting for spot instances to be granted... Request IDs: %s " % my_req_ids + print("Waiting for spot instances to be granted... 
Request IDs: %s " % my_req_ids) try: while True: time.sleep(10) @@ -539,28 +667,28 @@ def launch_cluster(conn, opts, cluster_name): raise Exception("Invalid state for spot request: %s - status: %s" % (invalid[0].id, invalid[0].status.message)) if len(active_instance_ids) == opts.slaves: - print "All %d slaves granted" % opts.slaves + print("All %d slaves granted" % opts.slaves) reservations = conn.get_all_reservations([r.instance_id for r in active_instance_ids]) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: - print "%d of %d slaves granted, waiting longer" % ( - len(active_instance_ids), opts.slaves) + print("%d of %d slaves granted, waiting longer" % ( + len(active_instance_ids), opts.slaves)) if (datetime.now() - start_time).seconds > opts.spot_timeout * 60: raise Exception("Timed out while waiting for spot instances") except: - print "Error: %s" % sys.exc_info()[1] - print "Canceling spot instance requests" + print("Error: %s" % sys.exc_info()[1]) + print("Canceling spot instance requests") conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: - print >> stderr, ("WARNING: %d instances are still running" % running) + print(("WARNING: %d instances are still running" % running), file=stderr) sys.exit(0) else: # Launch non-spot instances @@ -571,24 +699,30 @@ def launch_cluster(conn, opts, cluster_name): for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: - slave_res = image.run(key_name=opts.key_pair, - security_group_ids=[slave_group.id] + additional_group_ids, - instance_type=opts.instance_type, - placement=zone, - min_count=num_slaves_this_zone, - max_count=num_slaves_this_zone, - block_device_map=block_map, - subnet_id=opts.subnet_id, - 
placement_group=opts.placement_group, - user_data=user_data_content) + slave_res = image.run( + key_name=opts.key_pair, + security_group_ids=[slave_group.id] + additional_group_ids, + instance_type=opts.instance_type, + placement=zone, + min_count=num_slaves_this_zone, + max_count=num_slaves_this_zone, + block_device_map=block_map, + subnet_id=opts.subnet_id, + placement_group=opts.placement_group, + user_data=user_data_content, + instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, + instance_profile_name=opts.instance_profile_name) slave_nodes += slave_res.instances - print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, - zone, slave_res.id) + print("Launched {s} slave{plural_s} in {z}, regid = {r}".format( + s=num_slaves_this_zone, + plural_s=('' if num_slaves_this_zone == 1 else 's'), + z=zone, + r=slave_res.id)) i += 1 # Launch or resume masters if existing_masters: - print "Starting master..." + print("Starting master...") for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() @@ -599,72 +733,92 @@ def launch_cluster(conn, opts, cluster_name): master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name - master_res = master_image.run(key_name=opts.key_pair, - security_group_ids=[master_group.id] + additional_group_ids, - instance_type=master_type, - placement=opts.zone, - min_count=1, - max_count=1, - block_device_map=block_map, - subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content) + master_res = master_image.run( + key_name=opts.key_pair, + security_group_ids=[master_group.id] + additional_group_ids, + instance_type=master_type, + placement=opts.zone, + min_count=1, + max_count=1, + block_device_map=block_map, + subnet_id=opts.subnet_id, + placement_group=opts.placement_group, + user_data=user_data_content, + 
instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, + instance_profile_name=opts.instance_profile_name) master_nodes = master_res.instances - print "Launched master in %s, regid = %s" % (zone, master_res.id) + print("Launched master in %s, regid = %s" % (zone, master_res.id)) # This wait time corresponds to SPARK-4983 - print "Waiting for AWS to propagate instance metadata..." + print("Waiting for AWS to propagate instance metadata...") time.sleep(5) - # Give the instances descriptive names + + # Give the instances descriptive names and set additional tags + additional_tags = {} + if opts.additional_tags.strip(): + additional_tags = dict( + map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',') + ) + for master in master_nodes: - master.add_tag( - key='Name', - value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) + master.add_tags( + dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) + ) + for slave in slave_nodes: - slave.add_tag( - key='Name', - value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) + slave.add_tags( + dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) + ) # Return all the instances return (master_nodes, slave_nodes) -# Get the EC2 instances in an existing cluster if available. -# Returns a tuple of lists of EC2 instance objects for the masters and slaves def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): - print "Searching for existing cluster " + cluster_name + "..." 
- reservations = conn.get_all_reservations() - master_nodes = [] - slave_nodes = [] - for res in reservations: - active = [i for i in res.instances if is_active(i)] - for inst in active: - group_names = [g.name for g in inst.groups] - if (cluster_name + "-master") in group_names: - master_nodes.append(inst) - elif (cluster_name + "-slaves") in group_names: - slave_nodes.append(inst) - if any((master_nodes, slave_nodes)): - print "Found %d master(s), %d slaves" % (len(master_nodes), len(slave_nodes)) - if master_nodes != [] or not die_on_error: - return (master_nodes, slave_nodes) - else: - if master_nodes == [] and slave_nodes != []: - print >> sys.stderr, "ERROR: Could not find master in group " + cluster_name + "-master" - else: - print >> sys.stderr, "ERROR: Could not find any existing cluster" + """ + Get the EC2 instances in an existing cluster if available. + Returns a tuple of lists of EC2 instance objects for the masters and slaves. + """ + print("Searching for existing cluster {c} in region {r}...".format( + c=cluster_name, r=opts.region)) + + def get_instances(group_names): + """ + Get all non-terminated instances that belong to any of the provided security groups. 
+ + EC2 reservation filters and instance states are documented here: + http://docs.aws.amazon.com/cli/latest/reference/ec2/describe-instances.html#options + """ + reservations = conn.get_all_reservations( + filters={"instance.group-name": group_names}) + instances = itertools.chain.from_iterable(r.instances for r in reservations) + return [i for i in instances if i.state not in ["shutting-down", "terminated"]] + + master_instances = get_instances([cluster_name + "-master"]) + slave_instances = get_instances([cluster_name + "-slaves"]) + + if any((master_instances, slave_instances)): + print("Found {m} master{plural_m}, {s} slave{plural_s}.".format( + m=len(master_instances), + plural_m=('' if len(master_instances) == 1 else 's'), + s=len(slave_instances), + plural_s=('' if len(slave_instances) == 1 else 's'))) + + if not master_instances and die_on_error: + print("ERROR: Could not find a master for cluster {c} in region {r}.".format( + c=cluster_name, r=opts.region), file=sys.stderr) sys.exit(1) + return (master_instances, slave_instances) + # Deploy configuration files and run setup scripts on a newly launched # or started EC2 cluster. - - def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): - master = master_nodes[0].public_dns_name + master = get_dns_name(master_nodes[0], opts.private_ips) if deploy_ssh_key: - print "Generating cluster's SSH key on master..." + print("Generating cluster's SSH key on master...") key_setup = """ [ -f ~/.ssh/id_rsa ] || (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa && @@ -672,24 +826,29 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): """ ssh(master, opts, key_setup) dot_ssh_tar = ssh_read(master, opts, ['tar', 'c', '.ssh']) - print "Transferring cluster's SSH key to slaves..." 
+ print("Transferring cluster's SSH key to slaves...") for slave in slave_nodes: - print slave.public_dns_name - ssh_write(slave.public_dns_name, opts, ['tar', 'x'], dot_ssh_tar) + slave_address = get_dns_name(slave, opts.private_ips) + print(slave_address) + ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar) modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', - 'mapreduce', 'spark-standalone', 'tachyon'] + 'mapreduce', 'spark-standalone', 'tachyon', 'rstudio'] if opts.hadoop_major_version == "1": - modules = filter(lambda x: x != "mapreduce", modules) + modules = list(filter(lambda x: x != "mapreduce", modules)) if opts.ganglia: modules.append('ganglia') + # Clear SPARK_WORKER_INSTANCES if running on YARN + if opts.hadoop_major_version == "yarn": + opts.worker_instances = "" + # NOTE: We should clone the repository before running deploy_files to # prevent ec2-variables.sh from being overwritten - print "Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format( - r=opts.spark_ec2_git_repo, b=opts.spark_ec2_git_branch) + print("Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format( + r=opts.spark_ec2_git_repo, b=opts.spark_ec2_git_branch)) ssh( host=master, opts=opts, @@ -699,7 +858,7 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): b=opts.spark_ec2_git_branch) ) - print "Deploying files to master..." + print("Deploying files to master...") deploy_files( conn=conn, root_dir=SPARK_EC2_DIR + "/" + "deploy.generic", @@ -709,18 +868,26 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): modules=modules ) - print "Running setup on master..." + if opts.deploy_root_dir is not None: + print("Deploying {s} to master...".format(s=opts.deploy_root_dir)) + deploy_user_files( + root_dir=opts.deploy_root_dir, + opts=opts, + master_nodes=master_nodes + ) + + print("Running setup on master...") setup_spark_cluster(master, opts) - print "Done!" 
+ print("Done!") def setup_spark_cluster(master, opts): ssh(master, opts, "chmod u+x spark-ec2/setup.sh") ssh(master, opts, "spark-ec2/setup.sh") - print "Spark standalone cluster started at http://%s:8080" % master + print("Spark standalone cluster started at http://%s:8080" % master) if opts.ganglia: - print "Ganglia started at http://%s:5080/ganglia" % master + print("Ganglia started at http://%s:5080/ganglia" % master) def is_ssh_available(host, opts, print_ssh_output=True): @@ -737,7 +904,7 @@ def is_ssh_available(host, opts, print_ssh_output=True): if s.returncode != 0 and print_ssh_output: # extra leading newline is for spacing in wait_for_cluster_state() - print textwrap.dedent("""\n + print(textwrap.dedent("""\n Warning: SSH connection error. (This could be temporary.) Host: {h} SSH return code: {r} @@ -746,7 +913,7 @@ def is_ssh_available(host, opts, print_ssh_output=True): h=host, r=s.returncode, o=cmd_output.strip() - ) + )) return s.returncode == 0 @@ -756,7 +923,8 @@ def is_cluster_ssh_available(cluster_instances, opts): Check if SSH is available on all the instances in a cluster. 
""" for i in cluster_instances: - if not is_ssh_available(host=i.ip_address, opts=opts): + dns_name = get_dns_name(i, opts.private_ips) + if not is_ssh_available(host=dns_name, opts=opts): return False else: return True @@ -786,7 +954,11 @@ def wait_for_cluster_state(conn, opts, cluster_instances, cluster_state): for i in cluster_instances: i.update() - statuses = conn.get_all_instance_status(instance_ids=[i.id for i in cluster_instances]) + max_batch = 100 + statuses = [] + for j in xrange(0, len(cluster_instances), max_batch): + batch = [i.id for i in cluster_instances[j:j + max_batch]] + statuses.extend(conn.get_all_instance_status(instance_ids=batch)) if cluster_state == 'ssh-ready': if all(i.state == 'running' for i in cluster_instances) and \ @@ -806,63 +978,78 @@ def wait_for_cluster_state(conn, opts, cluster_instances, cluster_state): sys.stdout.write("\n") end_time = datetime.now() - print "Cluster is now in '{s}' state. Waited {t} seconds.".format( + print("Cluster is now in '{s}' state. Waited {t} seconds.".format( s=cluster_state, t=(end_time - start_time).seconds - ) + )) # Get number of local disks available for a given EC2 instance type. def get_num_disks(instance_type): # Source: http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html - # Last Updated: 2014-06-20 + # Last Updated: 2015-06-19 # For easy maintainability, please keep this manually-inputted dictionary sorted by key. 
disks_by_instance = { "c1.medium": 1, "c1.xlarge": 4, + "c3.large": 2, + "c3.xlarge": 2, "c3.2xlarge": 2, "c3.4xlarge": 2, "c3.8xlarge": 2, - "c3.large": 2, - "c3.xlarge": 2, + "c4.large": 0, + "c4.xlarge": 0, + "c4.2xlarge": 0, + "c4.4xlarge": 0, + "c4.8xlarge": 0, "cc1.4xlarge": 2, "cc2.8xlarge": 4, "cg1.4xlarge": 2, "cr1.8xlarge": 2, + "d2.xlarge": 3, + "d2.2xlarge": 6, + "d2.4xlarge": 12, + "d2.8xlarge": 24, "g2.2xlarge": 1, + "g2.8xlarge": 2, "hi1.4xlarge": 2, "hs1.8xlarge": 24, + "i2.xlarge": 1, "i2.2xlarge": 2, "i2.4xlarge": 4, "i2.8xlarge": 8, - "i2.xlarge": 1, - "m1.large": 2, - "m1.medium": 1, "m1.small": 1, + "m1.medium": 1, + "m1.large": 2, "m1.xlarge": 4, + "m2.xlarge": 1, "m2.2xlarge": 1, "m2.4xlarge": 2, - "m2.xlarge": 1, - "m3.2xlarge": 2, - "m3.large": 1, "m3.medium": 1, + "m3.large": 1, "m3.xlarge": 2, + "m3.2xlarge": 2, + "m4.large": 0, + "m4.xlarge": 0, + "m4.2xlarge": 0, + "m4.4xlarge": 0, + "m4.10xlarge": 0, + "r3.large": 1, + "r3.xlarge": 1, "r3.2xlarge": 1, "r3.4xlarge": 1, "r3.8xlarge": 2, - "r3.large": 1, - "r3.xlarge": 1, "t1.micro": 0, - 'd2.xlarge': 3, - 'd2.2xlarge': 6, - 'd2.4xlarge': 12, - 'd2.8xlarge': 24, + "t2.micro": 0, + "t2.small": 0, + "t2.medium": 0, + "t2.large": 0, } if instance_type in disks_by_instance: return disks_by_instance[instance_type] else: - print >> stderr, ("WARNING: Don't know number of disks on instance type %s; assuming 1" - % instance_type) + print("WARNING: Don't know number of disks on instance type %s; assuming 1" + % instance_type, file=stderr) return 1 @@ -874,7 +1061,7 @@ def get_num_disks(instance_type): # # root_dir should be an absolute path to the directory with the files we want to deploy. 
def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): - active_master = master_nodes[0].public_dns_name + active_master = get_dns_name(master_nodes[0], opts.private_ips) num_disks = get_num_disks(opts.instance_type) hdfs_data_dirs = "/mnt/ephemeral-hdfs/data" @@ -891,17 +1078,27 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): if opts.spark_version.startswith("http"): # Custom pre-built spark package spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) + tachyon_v = "" + print("Deploying Spark via custom bunlde; Tachyon won't be set up") + modules = filter(lambda x: x != "tachyon", modules) elif "." in opts.spark_version: # Pre-built Spark deploy spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) + tachyon_v = get_tachyon_version(spark_v) else: # Spark-only custom deploy spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version) + tachyon_v = "" + print("Deploying Spark via git hash; Tachyon won't be set up") + modules = filter(lambda x: x != "tachyon", modules) + master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes] + slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes] + worker_instances_str = "%d" % opts.worker_instances if opts.worker_instances else "" template_vars = { - "master_list": '\n'.join([i.public_dns_name for i in master_nodes]), + "master_list": '\n'.join(master_addresses), "active_master": active_master, - "slave_list": '\n'.join([i.public_dns_name for i in slave_nodes]), + "slave_list": '\n'.join(slave_addresses), "cluster_url": cluster_url, "hdfs_data_dirs": hdfs_data_dirs, "mapred_local_dirs": mapred_local_dirs, @@ -909,8 +1106,9 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): "swap": str(opts.swap), "modules": '\n'.join(modules), "spark_version": spark_v, + "tachyon_version": tachyon_v, "hadoop_major_version": opts.hadoop_major_version, - "spark_worker_instances": 
"%d" % opts.worker_instances, + "spark_worker_instances": worker_instances_str, "spark_master_opts": opts.master_opts } @@ -953,6 +1151,23 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): shutil.rmtree(tmp_dir) +# Deploy a given local directory to a cluster, WITHOUT parameter substitution. +# Note that unlike deploy_files, this works for binary files. +# Also, it is up to the user to add (or not) the trailing slash in root_dir. +# Files are only deployed to the first master instance in the cluster. +# +# root_dir should be an absolute path. +def deploy_user_files(root_dir, opts, master_nodes): + active_master = get_dns_name(master_nodes[0], opts.private_ips) + command = [ + 'rsync', '-rv', + '-e', stringify_command(ssh_command(opts)), + "%s" % root_dir, + "%s@%s:/" % (opts.user, active_master) + ] + subprocess.check_call(command) + + def stringify_command(parts): if isinstance(parts, str): return parts @@ -986,13 +1201,13 @@ def ssh(host, opts, command): # If this was an ssh failure, provide the user with hints. if e.returncode == 255: raise UsageError( - "Failed to SSH to remote host {0}.\n" + - "Please check that you have provided the correct --identity-file and " + + "Failed to SSH to remote host {0}.\n" + "Please check that you have provided the correct --identity-file and " "--key-pair parameters and try again.".format(host)) else: raise e - print >> stderr, \ - "Error executing remote command, retrying after 30 seconds: {0}".format(e) + print("Error executing remote command, retrying after 30 seconds: {0}".format(e), + file=stderr) time.sleep(30) tries = tries + 1 @@ -1031,8 +1246,8 @@ def ssh_write(host, opts, command, arguments): elif tries > 5: raise RuntimeError("ssh_write failed with error %s" % proc.returncode) else: - print >> stderr, \ - "Error {0} while executing remote command, retrying after 30 seconds".format(status) + print("Error {0} while executing remote command, retrying after 30 seconds". 
+ format(status), file=stderr) time.sleep(30) tries = tries + 1 @@ -1048,12 +1263,26 @@ def get_zones(conn, opts): # Gets the number of items in a partition def get_partition(total, num_partitions, current_partitions): - num_slaves_this_zone = total / num_partitions + num_slaves_this_zone = total // num_partitions if (total % num_partitions) - current_partitions > 0: num_slaves_this_zone += 1 return num_slaves_this_zone +# Gets the IP address, taking into account the --private-ips flag +def get_ip_address(instance, private_ips=False): + ip = instance.ip_address if not private_ips else \ + instance.private_ip_address + return ip + + +# Gets the DNS name, taking into account the --private-ips flag +def get_dns_name(instance, private_ips=False): + dns = instance.public_dns_name if not private_ips else \ + instance.private_ip_address + return dns + + def real_main(): (opts, action, cluster_name) = parse_args() @@ -1072,28 +1301,28 @@ def real_main(): if opts.identity_file is not None: if not os.path.exists(opts.identity_file): - print >> stderr,\ - "ERROR: The identity file '{f}' doesn't exist.".format(f=opts.identity_file) + print("ERROR: The identity file '{f}' doesn't exist.".format(f=opts.identity_file), + file=stderr) sys.exit(1) file_mode = os.stat(opts.identity_file).st_mode if not (file_mode & S_IRUSR) or not oct(file_mode)[-2:] == '00': - print >> stderr, "ERROR: The identity file must be accessible only by you." 
- print >> stderr, 'You can fix this with: chmod 400 "{f}"'.format(f=opts.identity_file) + print("ERROR: The identity file must be accessible only by you.", file=stderr) + print('You can fix this with: chmod 400 "{f}"'.format(f=opts.identity_file), + file=stderr) sys.exit(1) if opts.instance_type not in EC2_INSTANCE_TYPES: - print >> stderr, "Warning: Unrecognized EC2 instance type for instance-type: {t}".format( - t=opts.instance_type) + print("Warning: Unrecognized EC2 instance type for instance-type: {t}".format( + t=opts.instance_type), file=stderr) if opts.master_instance_type != "": if opts.master_instance_type not in EC2_INSTANCE_TYPES: - print >> stderr, \ - "Warning: Unrecognized EC2 instance type for master-instance-type: {t}".format( - t=opts.master_instance_type) + print("Warning: Unrecognized EC2 instance type for master-instance-type: {t}".format( + t=opts.master_instance_type), file=stderr) if opts.ebs_vol_num > 8: - print >> stderr, "ebs-vol-num cannot be greater than 8" + print("ebs-vol-num cannot be greater than 8", file=stderr) sys.exit(1) # Prevent breaking ami_prefix (/, .git and startswith checks) @@ -1102,15 +1331,22 @@ def real_main(): opts.spark_ec2_git_repo.endswith(".git") or \ not opts.spark_ec2_git_repo.startswith("https://github.com") or \ not opts.spark_ec2_git_repo.endswith("spark-ec2"): - print >> stderr, "spark-ec2-git-repo must be a github repo and it must not have a " \ - "trailing / or .git. " \ - "Furthermore, we currently only support forks named spark-ec2." + print("spark-ec2-git-repo must be a github repo and it must not have a trailing / or .git. 
" + "Furthermore, we currently only support forks named spark-ec2.", file=stderr) + sys.exit(1) + + if not (opts.deploy_root_dir is None or + (os.path.isabs(opts.deploy_root_dir) and + os.path.isdir(opts.deploy_root_dir) and + os.path.exists(opts.deploy_root_dir))): + print("--deploy-root-dir must be an absolute path to a directory that exists " + "on the local file system", file=stderr) sys.exit(1) try: conn = ec2.connect_to_region(opts.region) except Exception as e: - print >> stderr, (e) + print((e), file=stderr) sys.exit(1) # Select an AZ at random if it was not specified. @@ -1119,7 +1355,7 @@ def real_main(): if action == "launch": if opts.slaves <= 0: - print >> sys.stderr, "ERROR: You have to start at least 1 slave" + print("ERROR: You have to start at least 1 slave", file=sys.stderr) sys.exit(1) if opts.resume: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) @@ -1134,26 +1370,27 @@ def real_main(): setup_cluster(conn, master_nodes, slave_nodes, opts, True) elif action == "destroy": - print "Are you sure you want to destroy the cluster %s?" % cluster_name - print "The following instances will be terminated:" (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) - for inst in master_nodes + slave_nodes: - print "> %s" % inst.public_dns_name - msg = "ALL DATA ON ALL NODES WILL BE LOST!!\nDestroy cluster %s (y/N): " % cluster_name + if any(master_nodes + slave_nodes): + print("The following instances will be terminated:") + for inst in master_nodes + slave_nodes: + print("> %s" % get_dns_name(inst, opts.private_ips)) + print("ALL DATA ON ALL NODES WILL BE LOST!!") + + msg = "Are you sure you want to destroy the cluster {c}? (y/N) ".format(c=cluster_name) response = raw_input(msg) if response == "y": - print "Terminating master..." + print("Terminating master...") for inst in master_nodes: inst.terminate() - print "Terminating slaves..." 
+ print("Terminating slaves...") for inst in slave_nodes: inst.terminate() # Delete security groups as well if opts.delete_groups: - print "Deleting security groups (this will take some time)..." group_names = [cluster_name + "-master", cluster_name + "-slaves"] wait_for_cluster_state( conn=conn, @@ -1161,15 +1398,16 @@ def real_main(): cluster_instances=(master_nodes + slave_nodes), cluster_state='terminated' ) + print("Deleting security groups (this will take some time)...") attempt = 1 while attempt <= 3: - print "Attempt %d" % attempt + print("Attempt %d" % attempt) groups = [g for g in conn.get_all_security_groups() if g.name in group_names] success = True # Delete individual rules in all groups before deleting groups to # remove dependencies between them for group in groups: - print "Deleting rules in security group " + group.name + print("Deleting rules in security group " + group.name) for rule in group.rules: for grant in rule.grants: success &= group.revoke(ip_protocol=rule.ip_protocol, @@ -1182,11 +1420,12 @@ def real_main(): time.sleep(30) # Yes, it does have to be this long :-( for group in groups: try: - conn.delete_security_group(group.name) - print "Deleted security group " + group.name + # It is needed to use group_id to make it work with VPC + conn.delete_security_group(group_id=group.id) + print("Deleted security group %s" % group.name) except boto.exception.EC2ResponseError: success = False - print "Failed to delete security group " + group.name + print("Failed to delete security group %s" % group.name) # Unfortunately, group.revoke() returns True even if a rule was not # deleted, so this needs to be rerun if something fails @@ -1196,18 +1435,21 @@ def real_main(): attempt += 1 if not success: - print "Failed to delete all security groups after 3 tries." - print "Try re-running in a few minutes." 
+ print("Failed to delete all security groups after 3 tries.") + print("Try re-running in a few minutes.") elif action == "login": (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - master = master_nodes[0].public_dns_name - print "Logging into master " + master + "..." - proxy_opt = [] - if opts.proxy_port is not None: - proxy_opt = ['-D', opts.proxy_port] - subprocess.check_call( - ssh_command(opts) + proxy_opt + ['-t', '-t', "%s@%s" % (opts.user, master)]) + if not master_nodes[0].public_dns_name and not opts.private_ips: + print("Master has no public DNS name. Maybe you meant to specify --private-ips?") + else: + master = get_dns_name(master_nodes[0], opts.private_ips) + print("Logging into master " + master + "...") + proxy_opt = [] + if opts.proxy_port is not None: + proxy_opt = ['-D', opts.proxy_port] + subprocess.check_call( + ssh_command(opts) + proxy_opt + ['-t', '-t', "%s@%s" % (opts.user, master)]) elif action == "reboot-slaves": response = raw_input( @@ -1217,15 +1459,18 @@ def real_main(): if response == "y": (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) - print "Rebooting slaves..." + print("Rebooting slaves...") for inst in slave_nodes: if inst.state not in ["shutting-down", "terminated"]: - print "Rebooting " + inst.id + print("Rebooting " + inst.id) inst.reboot() elif action == "get-master": (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - print master_nodes[0].public_dns_name + if not master_nodes[0].public_dns_name and not opts.private_ips: + print("Master has no public DNS name. Maybe you meant to specify --private-ips?") + else: + print(get_dns_name(master_nodes[0], opts.private_ips)) elif action == "stop": response = raw_input( @@ -1238,11 +1483,11 @@ def real_main(): if response == "y": (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) - print "Stopping master..." 
+ print("Stopping master...") for inst in master_nodes: if inst.state not in ["shutting-down", "terminated"]: inst.stop() - print "Stopping slaves..." + print("Stopping slaves...") for inst in slave_nodes: if inst.state not in ["shutting-down", "terminated"]: if inst.spot_instance_request_id: @@ -1252,11 +1497,11 @@ def real_main(): elif action == "start": (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - print "Starting slaves..." + print("Starting slaves...") for inst in slave_nodes: if inst.state not in ["shutting-down", "terminated"]: inst.start() - print "Starting master..." + print("Starting master...") for inst in master_nodes: if inst.state not in ["shutting-down", "terminated"]: inst.start() @@ -1266,18 +1511,29 @@ def real_main(): cluster_instances=(master_nodes + slave_nodes), cluster_state='ssh-ready' ) + + # Determine types of running instances + existing_master_type = master_nodes[0].instance_type + existing_slave_type = slave_nodes[0].instance_type + # Setting opts.master_instance_type to the empty string indicates we + # have the same instance type for the master and the slaves + if existing_master_type == existing_slave_type: + existing_master_type = "" + opts.master_instance_type = existing_master_type + opts.instance_type = existing_slave_type + setup_cluster(conn, master_nodes, slave_nodes, opts, False) else: - print >> stderr, "Invalid action: %s" % action + print("Invalid action: %s" % action, file=stderr) sys.exit(1) def main(): try: real_main() - except UsageError, e: - print >> stderr, "\nError:\n", e + except UsageError as e: + print("\nError:\n", e, file=stderr) sys.exit(1) From 83fcbcba62dcec3d50bb768135f8eae888467e49 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 13 Aug 2015 16:16:32 -0300 Subject: [PATCH 018/268] get_spark_ami fix --- tools/spark-ec2/spark_ec2.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tools/spark-ec2/spark_ec2.py 
b/tools/spark-ec2/spark_ec2.py index 8cc44d30..4fbf5bd8 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -459,21 +459,20 @@ def get_validate_spark_version(version, repo): def get_tachyon_version(spark_version): return SPARK_TACHYON_MAP.get(spark_version, "") - # Attempt to resolve an appropriate AMI given the architecture and region of the request. -def get_spark_ami(opts): - if opts.instance_type in EC2_INSTANCE_TYPES: - instance_type = EC2_INSTANCE_TYPES[opts.instance_type] +def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branch): + if instance_type in EC2_INSTANCE_TYPES: + instance_type = EC2_INSTANCE_TYPES[instance_type] else: instance_type = "pvm" - print("Don't recognize %s, assuming type is pvm" % opts.instance_type, file=stderr) + print("Don't recognize %s, assuming type is pvm" % instance_type, file=stderr) # URL prefix from which to fetch AMI information ami_prefix = "{r}/{b}/ami-list".format( - r=opts.spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), - b=opts.spark_ec2_git_branch) + r=spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), + b=spark_ec2_git_branch) - ami_path = "%s/%s/%s" % (ami_prefix, opts.region, instance_type) + ami_path = "%s/%s/%s" % (ami_prefix, region, instance_type) reader = codecs.getreader("ascii") try: ami = reader(urlopen(ami_path)).read().strip() @@ -484,7 +483,6 @@ def get_spark_ami(opts): print("Spark AMI: " + ami) return ami - # Launch a cluster of the given name, by setting up its security groups, # and then starting new instances in them. 
# Returns a tuple of EC2 reservation objects for the master and slaves @@ -584,10 +582,10 @@ def launch_cluster(conn, opts, cluster_name): # Figure out Spark AMI if opts.ami is None: - opts.ami = get_spark_ami(opts) + opts.ami = get_spark_ami(opts.instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) if opts.master_ami is None: - opts.master_ami = get_spark_ami(opts.master_instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) + opts.master_ami = get_spark_ami(opts.master_instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) # we use group ids to work around https://github.com/boto/boto/issues/350 additional_group_ids = [] From 807f0f616973d74a51998caadfc1bc1b17b7a306 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 14 Aug 2015 13:51:39 -0300 Subject: [PATCH 019/268] remove user data, spark-ec2 takes care on formatting disks --- tools/cluster.py | 3 --- tools/scripts/S05mount-disks | 11 ----------- 2 files changed, 14 deletions(-) delete mode 100644 tools/scripts/S05mount-disks diff --git a/tools/cluster.py b/tools/cluster.py index 3cf1828a..6ebd2386 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -51,7 +51,6 @@ default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' default_collect_results_dir = '/tmp' -default_user_data = os.path.join(script_path, 'scripts', 'S05mount-disks') default_defaults_filename = 'cluster_defaults.json' default_spark_ec2_git_repo = 'https://github.com/chaordic/spark-ec2' @@ -202,7 +201,6 @@ def launch(cluster_name, slaves, key_id=default_key_id, region=default_region, zone=default_zone, instance_type=default_instance_type, ondemand=False, spot_price=default_spot_price, - user_data=default_user_data, security_group = None, vpc = None, vpc_subnet = None, @@ -272,7 +270,6 @@ def launch(cluster_name, slaves, '--master-opts', '-Dspark.worker.timeout={0}'.format(worker_timeout), '--spark-git-repo', spark_repo, '-v', 
spark_version, - '--user-data', user_data, 'launch', cluster_name] + spot_params + resume_param + diff --git a/tools/scripts/S05mount-disks b/tools/scripts/S05mount-disks deleted file mode 100644 index 8f129a30..00000000 --- a/tools/scripts/S05mount-disks +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -echo 'Mounting disks' >> /tmp/mount-disks.log -mkdir -p /mnt -mkdir -p /mnt{2,3,4} -chmod -R 777 /mnt* -[ -r /dev/xvdb ] && mkfs.ext4 /dev/xvdb && mount /dev/xvdb /mnt -[ -r /dev/xvdc ] && mkfs.ext4 /dev/xvdc && mount /dev/xvdc /mnt2 -[ -r /dev/xvdd ] && mkfs.ext4 /dev/xvdd && mount /dev/xvdd /mnt3 -[ -r /dev/xvde ] && mkfs.ext4 /dev/xvde && mount /dev/xvde /mnt4 - From 637ab060de8b564d6b7a6021ef493b84152af350 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 14 Aug 2015 14:06:31 -0300 Subject: [PATCH 020/268] fix variable replacement --- .../spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh index 3570891b..4f3e8da8 100644 --- a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh +++ b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh @@ -25,8 +25,10 @@ export MAPRED_LOCAL_DIRS="{{mapred_local_dirs}}" export SPARK_LOCAL_DIRS="{{spark_local_dirs}}" export MODULES="{{modules}}" export SPARK_VERSION="{{spark_version}}" -export SHARK_VERSION="{{shark_version}}" +export TACHYON_VERSION="{{tachyon_version}}" export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" export SWAP_MB="{{swap}}" export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" export SPARK_MASTER_OPTS="{{spark_master_opts}}" +export AWS_ACCESS_KEY_ID="{{aws_access_key_id}}" +export AWS_SECRET_ACCESS_KEY="{{aws_secret_access_key}}" From f6d5d0dd7cebb0bc32a9c13f04959015f3e36427 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 14 Aug 2015 14:07:32 
-0300 Subject: [PATCH 021/268] remove rstudio and some fixes --- tools/spark-ec2/spark_ec2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 4fbf5bd8..f5bbaac1 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -357,15 +357,15 @@ def get_or_make_group(conn, name, vpc_id): return conn.create_security_group(name, "Spark EC2 group", vpc_id) def check_if_http_resource_exists(resource): - request = urllib2.Request(resource) + request = Request(resource) request.get_method = lambda: 'HEAD' try: - response = urllib2.urlopen(request) + response = urlopen(request) if response.getcode() == 200: return True else: raise RuntimeError("Resource {resource} not found. Error: {code}".format(resource, response.getcode())) - except urllib2.HTTPError, e: + except HTTPError, e: print >> stderr, "Unable to check if HTTP resource {url} exists. Error: {code}".format( url=resource, code=e.code) @@ -831,7 +831,7 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar) modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', - 'mapreduce', 'spark-standalone', 'tachyon', 'rstudio'] + 'mapreduce', 'spark-standalone', 'tachyon'] if opts.hadoop_major_version == "1": modules = list(filter(lambda x: x != "mapreduce", modules)) From 7787045de3c1ff132c17f341b3cdecae60ceade0 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 14 Aug 2015 14:49:14 -0300 Subject: [PATCH 022/268] update spark-ec2 version --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 6ebd2386..23b3bed9 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -54,7 +54,7 @@ default_defaults_filename = 'cluster_defaults.json' default_spark_ec2_git_repo = 'https://github.com/chaordic/spark-ec2' -default_spark_ec2_git_branch = 'v4-yarn' 
+default_spark_ec2_git_branch = 'branch-1.4-merge' master_post_create_commands = [ From ccfed3f661b0bae939dae704204767a3ef899ad1 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 17 Aug 2015 11:08:23 -0300 Subject: [PATCH 023/268] pr review, fix removed feature and added noop user-data --- tools/cluster.py | 3 +++ tools/scripts/noop | 1 + 2 files changed, 4 insertions(+) create mode 100644 tools/scripts/noop diff --git a/tools/cluster.py b/tools/cluster.py index 23b3bed9..d6a0263d 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -51,6 +51,7 @@ default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' default_collect_results_dir = '/tmp' +default_user_data = os.path.join(script_path, 'scripts', 'noop') default_defaults_filename = 'cluster_defaults.json' default_spark_ec2_git_repo = 'https://github.com/chaordic/spark-ec2' @@ -201,6 +202,7 @@ def launch(cluster_name, slaves, key_id=default_key_id, region=default_region, zone=default_zone, instance_type=default_instance_type, ondemand=False, spot_price=default_spot_price, + user_data=default_user_data, security_group = None, vpc = None, vpc_subnet = None, @@ -270,6 +272,7 @@ def launch(cluster_name, slaves, '--master-opts', '-Dspark.worker.timeout={0}'.format(worker_timeout), '--spark-git-repo', spark_repo, '-v', spark_version, + '--user-data', user_data, 'launch', cluster_name] + spot_params + resume_param + diff --git a/tools/scripts/noop b/tools/scripts/noop new file mode 100644 index 00000000..cc1f786e --- /dev/null +++ b/tools/scripts/noop @@ -0,0 +1 @@ +#!/bin/bash \ No newline at end of file From 9bbcd181723dcdb1d275a5a6040c0eda2c540569 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 18 Aug 2015 16:43:51 -0300 Subject: [PATCH 024/268] added heap size param for driver --- remote_hook.sh | 3 ++- tools/cluster.py | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 305a0ff6..65e71070 100755 --- 
a/remote_hook.sh +++ b/remote_hook.sh @@ -11,6 +11,7 @@ CONTROL_DIR="${5?Please give the Control Directory}" SPARK_MEM_PARAM="${6?Please give the Job Memory Size to use}" USE_YARN="${7?Please tell if we should use YARN (yes/no)}" NOTIFY_ON_ERRORS="${8?Please tell if we will notify on errors (yes/no)}" +DRIVER_HEAP_SIZE="${9?Please tell driver heap size to use in MB}" JOB_WITH_TAG=${JOB_NAME}.${JOB_TAG} JOB_CONTROL_DIR="${CONTROL_DIR}/${JOB_WITH_TAG}" @@ -80,7 +81,7 @@ if [[ "${JOB_NAME}" == "shell" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory 25000M --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}M" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" diff --git a/tools/cluster.py b/tools/cluster.py index d6a0263d..3ac89be8 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -39,6 +39,7 @@ default_spot_price = '0.10' default_worker_instances = '1' default_master_instance_type = 'm3.xlarge' +default_driver_heap_size = '25000' default_region = 'us-east-1' default_zone = default_region + 'b' default_key_id = 'ignition_key' 
@@ -372,7 +373,9 @@ def job_run(cluster_name, job_name, job_mem, disable_assembly_build=False, run_tests=False, kill_on_failure=False, - destroy_cluster=False, region=default_region): + destroy_cluster=False, + region=default_region, + driver_heap_size=default_driver_heap_size): utc_job_date_example = '2014-05-04T13:13:10Z' if utc_job_date and len(utc_job_date) != len(utc_job_date_example): @@ -394,9 +397,9 @@ def job_run(cluster_name, job_name, job_mem, job_tag = job_tag or job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') tmux_wait_command = ';(echo Press enter to keep the session open && /bin/bash -c "read -t 5" && sleep 7d)' if not detached else '' tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {tmux_wait_command}' >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, tmux_wait_command=tmux_wait_command) + aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, tmux_wait_command=tmux_wait_command) non_tmux_arg = ". /etc/profile; . 
~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param) + aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size) if not disable_assembly_build: From a2d5af977c37bd7e14fa6b304bf17d4ffd25e231 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 18 Aug 2015 17:03:21 -0300 Subject: [PATCH 025/268] parameterized memory unit --- remote_hook.sh | 4 ++-- tools/cluster.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 65e71070..48ba9735 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -11,7 +11,7 @@ CONTROL_DIR="${5?Please give the Control Directory}" SPARK_MEM_PARAM="${6?Please give the Job Memory Size to use}" USE_YARN="${7?Please tell if we should use YARN (yes/no)}" NOTIFY_ON_ERRORS="${8?Please tell if we will notify on errors (yes/no)}" -DRIVER_HEAP_SIZE="${9?Please tell driver heap size to use in MB}" +DRIVER_HEAP_SIZE="${9?Please tell driver heap size to use}" JOB_WITH_TAG=${JOB_NAME}.${JOB_TAG} JOB_CONTROL_DIR="${CONTROL_DIR}/${JOB_WITH_TAG}" @@ -81,7 +81,7 @@ if [[ "${JOB_NAME}" == "shell" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}M" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" 
--runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" diff --git a/tools/cluster.py b/tools/cluster.py index 3ac89be8..81dc9b2d 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -39,7 +39,7 @@ default_spot_price = '0.10' default_worker_instances = '1' default_master_instance_type = 'm3.xlarge' -default_driver_heap_size = '25000' +default_driver_heap_size = '25G' default_region = 'us-east-1' default_zone = default_region + 'b' default_key_id = 'ignition_key' From f165937d608288be7fd673301256f2f6e122bc2e Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 19 Aug 2015 16:12:21 -0300 Subject: [PATCH 026/268] fix default memory size to match default master instance type --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 81dc9b2d..f796f53c 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -39,7 +39,7 @@ default_spot_price = '0.10' default_worker_instances = '1' default_master_instance_type = 'm3.xlarge' -default_driver_heap_size = '25G' +default_driver_heap_size = '12G' default_region = 'us-east-1' default_zone = default_region + 'b' default_key_id = 'ignition_key' From 980a2784ccabcf435d2df575fcf9c650c820349c Mon Sep 17 00:00:00 2001 From: Allan 
Oliveira Date: Wed, 19 Aug 2015 17:39:17 -0300 Subject: [PATCH 027/268] Use the driver heap size param --- tools/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index f796f53c..1f6fdaa5 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -396,9 +396,9 @@ def job_run(cluster_name, job_name, job_mem, job_date = utc_job_date or datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') job_tag = job_tag or job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') tmux_wait_command = ';(echo Press enter to keep the session open && /bin/bash -c "read -t 5" && sleep 7d)' if not detached else '' - tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {tmux_wait_command}' >& /tmp/commandoutput".format( + tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} {tmux_wait_command}' >& /tmp/commandoutput".format( aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, tmux_wait_command=tmux_wait_command) - non_tmux_arg = ". /etc/profile; . ~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} >& /tmp/commandoutput".format( + non_tmux_arg = ". /etc/profile; . 
~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} >& /tmp/commandoutput".format( aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size) From c78c319a5e2c900888ddc512f4166ee3b5f553fc Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 26 Aug 2015 10:56:53 -0300 Subject: [PATCH 028/268] Update spark_ec2.py --- tools/spark-ec2/spark_ec2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index f5bbaac1..c81d794b 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -51,7 +51,7 @@ raw_input = input xrange = range -SPARK_EC2_VERSION = "1.4.0" +SPARK_EC2_VERSION = "1.4.1" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -71,6 +71,7 @@ "1.3.0", "1.3.1", "1.4.0", + "1.4.1, ]) SPARK_TACHYON_MAP = { @@ -84,6 +85,7 @@ "1.3.0": "0.5.0", "1.3.1": "0.5.0", "1.4.0": "0.6.4", + "1.4.1": "0.6.4", } DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION From a5379a0a4a54d2e35b83c2cb4c9b4a467b8091d5 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Wed, 26 Aug 2015 10:57:36 -0300 Subject: [PATCH 029/268] Update spark_ec2.py --- tools/spark-ec2/spark_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index c81d794b..5c6458f9 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -71,7 +71,7 @@ "1.3.0", "1.3.1", "1.4.0", - "1.4.1, + "1.4.1", ]) SPARK_TACHYON_MAP = { From 59ba13280fdc49f95bc1e5f3878c8384a5d3d865 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 27 Aug 2015 10:59:16 -0300 Subject: [PATCH 030/268] Use Spark 1.4.1 --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 7eb2bffe..476dd3bb 100644 --- a/build.sbt +++ b/build.sbt @@ -13,7 +13,7 @@ ideaExcludeFolders += ".idea_modules" // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.4.1" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") From 63e867a9de1a11f48a8a72906b38f693cccee52c Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 28 Aug 2015 16:38:44 -0300 Subject: [PATCH 031/268] Increase group to avoid slowdowns --- src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index a1090d20..baf80bc2 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -52,7 +52,7 @@ object SparkContextUtils { if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - val rdds = 
splittedPaths.grouped(50).map(pathGroup => f(pathGroup.mkString(","))) + val rdds = splittedPaths.grouped(5000).map(pathGroup => f(pathGroup.mkString(","))) new UnionRDD(sc, rdds.toList) } From f12dfdc9d2029941bd293f3aa3ba90c83bbd885a Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 31 Aug 2015 13:52:32 -0300 Subject: [PATCH 032/268] Updated core to ignore spark ec2 boto --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index cfe2c08a..bcf8c0f8 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,6 @@ project/plugins/project/ # Node node_modules + +# Spark-ec2 boto +tools/spark-ec2/lib From cae677fc26ef20ff46f22c098b2cf903db239e5c Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 31 Aug 2015 16:21:58 -0300 Subject: [PATCH 033/268] Make spark 1.4.1 the default --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 1f6fdaa5..e312d842 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -47,7 +47,7 @@ default_ami = None # will be decided based on spark-ec2 list default_master_ami = None default_env = 'dev' -default_spark_version = '1.3.0' +default_spark_version = '1.4.1' default_spark_repo = 'https://github.com/chaordic/spark' default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' From 14324a2f3a7456b6aae993d0b68f02aa9402924a Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 1 Sep 2015 15:44:27 -0300 Subject: [PATCH 034/268] Added IntBag --- .../ignition/core/utils/CollectionUtils.scala | 6 +++ .../scala/ignition/core/utils/IntBag.scala | 42 +++++++++++++++++++ .../core/utils/CollectionUtilsSpec.scala | 2 - .../ignition/core/utils/IntBagSpec.scala | 23 ++++++++++ 4 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 src/main/scala/ignition/core/utils/IntBag.scala create mode 100644 src/test/scala/ignition/core/utils/IntBagSpec.scala diff --git 
a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index 52828ca7..eea4755e 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -22,6 +22,12 @@ object CollectionUtils { } } + + + implicit class TraversableOnceLong(xs: TraversableOnce[Long]) { + def toBag(): IntBag = IntBag.from(xs) + } + implicit class TraversableLikeImprovements[A, Repr](xs: TraversableLike[A, Repr]) { def distinctBy[B, That](f: A => B)(implicit cbf: CanBuildFrom[Repr, A, That]) = { val builder = cbf(xs.repr) diff --git a/src/main/scala/ignition/core/utils/IntBag.scala b/src/main/scala/ignition/core/utils/IntBag.scala new file mode 100644 index 00000000..2a36da6e --- /dev/null +++ b/src/main/scala/ignition/core/utils/IntBag.scala @@ -0,0 +1,42 @@ +package ignition.core.utils + +object IntBag { + def from(numbers: TraversableOnce[Long]): IntBag = { + val histogram = scala.collection.mutable.HashMap.empty[Long, Long] + numbers.foreach(n => histogram += (n -> (histogram.getOrElse(n, 0L) + 1))) + new IntBag(histogram) + } + + val empty = from(Seq.empty) +} + +class IntBag(val histogram: collection.Map[Long, Long]) { + def ++(other: IntBag): IntBag = { + val newHistogram = scala.collection.mutable.HashMap.empty[Long, Long] + (histogram.keySet ++ other.histogram.keySet).foreach(k => newHistogram += (k -> (histogram.getOrElse(k, 0L) + other.histogram.getOrElse(k, 0L)))) + new IntBag(newHistogram) + } + + + def median: Option[Long] = { + if (histogram.nonEmpty) { + val total = histogram.values.sum + val half = total / 2 + val max = histogram.keys.max + + val accumulatedFrequency = (0L to max).scanLeft(0L) { case (sumFreq, k) => sumFreq + histogram.getOrElse(k, 0L) }.zipWithIndex + accumulatedFrequency.collectFirst { case (sum, k) if sum >= half => k } + } else { + None + } + } + + def avg: Option[Long] = { + if (histogram.nonEmpty) { + val sum = 
histogram.map { case (k, f) => k * f }.sum + val count = histogram.values.sum + Option(sum / count) + } else + None + } +} diff --git a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala index c19579ce..f01b8a34 100644 --- a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala @@ -33,6 +33,4 @@ class CollectionUtilsSpec extends FlatSpec with ShouldMatchers { } - - } diff --git a/src/test/scala/ignition/core/utils/IntBagSpec.scala b/src/test/scala/ignition/core/utils/IntBagSpec.scala new file mode 100644 index 00000000..b6694b12 --- /dev/null +++ b/src/test/scala/ignition/core/utils/IntBagSpec.scala @@ -0,0 +1,23 @@ +package ignition.core.utils + +import org.scalatest._ + +import scala.util.Random + +class IntBagSpec extends FlatSpec with ShouldMatchers { + + "IntBag" should "be built from sequence" in { + IntBag.from(Seq(1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4)).histogram shouldBe Map(1 -> 2, 2 -> 3, 3 -> 1, 4 -> 5) + } + + it should "calculate the median and average" in { + val size = 1000 + val numbers = (0 until 1000).map(_ => Random.nextInt(400).toLong).toList + val bag = IntBag.from(numbers) + + bag.avg.get shouldBe numbers.sum / size + + // TODO: the median is only approximate and it could be better, improve it + } + +} From 3cb2ef5d74282391674d1afea9be30f2eb1a5463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matheus=20Weber=20da=20Concei=C3=A7=C3=A3o?= Date: Tue, 1 Sep 2015 17:17:47 -0300 Subject: [PATCH 035/268] Adds an option to launch the cluster master as spot --- tools/cluster.py | 5 +- tools/spark-ec2/spark_ec2.py | 95 +++++++++++++++++++++++++++++------- 2 files changed, 82 insertions(+), 18 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index e312d842..ed348fbb 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -202,7 +202,7 @@ def launch(cluster_name, slaves, tag=[], key_id=default_key_id, 
region=default_region, zone=default_zone, instance_type=default_instance_type, - ondemand=False, spot_price=default_spot_price, + ondemand=False, spot_price=default_spot_price, master_spot=False, user_data=default_user_data, security_group = None, vpc = None, @@ -252,6 +252,8 @@ def launch(cluster_name, slaves, ]) spot_params = ['--spot-price', spot_price] if not ondemand else [] + master_spot_params = ['--master-spot'] if not ondemand and master_spot else [] + ami_params = ['--ami', ami] if ami else [] master_ami_params = ['--master-ami', master_ami] if master_ami else [] @@ -276,6 +278,7 @@ def launch(cluster_name, slaves, '--user-data', user_data, 'launch', cluster_name] + spot_params + + master_spot_params + resume_param + auth_params + ami_params + diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 5c6458f9..3583bf1d 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -260,6 +260,10 @@ def parse_args(): "--spot-price", metavar="PRICE", type="float", help="If specified, launch slaves as spot instances with the given " + "maximum price (in dollars)") + parser.add_option( + "--master-spot", action="store_true", default=False, + help="If specified, launch master as spot instance using the same " + + "bid and instance type of the slave ones") parser.add_option( "--ganglia", action="store_true", default=True, help="Setup Ganglia monitoring on cluster (default: %default). 
NOTE: " + @@ -729,26 +733,83 @@ def launch_cluster(conn, opts, cluster_name): master_nodes = existing_masters else: master_type = opts.master_instance_type - if master_type == "": + if master_type == "" or opts.master_spot: master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name - master_res = master_image.run( - key_name=opts.key_pair, - security_group_ids=[master_group.id] + additional_group_ids, - instance_type=master_type, - placement=opts.zone, - min_count=1, - max_count=1, - block_device_map=block_map, - subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content, - instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, - instance_profile_name=opts.instance_profile_name) - - master_nodes = master_res.instances - print("Launched master in %s, regid = %s" % (zone, master_res.id)) + if opts.master_spot: + # Launch spot master instance with the requested price + # Note: The spot_price*1.5 is present to ensure a higher bid price to + # the master spot instance, so the master instance will be the + # last one to be terminated in a spot market price increase + print("Requesting master as spot instance with price $%.3f" % + (opts.spot_price)) + master_req = conn.request_spot_instances( + price=(opts.spot_price * 1.5), + image_id=opts.master_ami, + placement=opts.zone, + count=1, + key_name=opts.key_pair, + security_group_ids=[master_group.id] + additional_group_ids, + instance_type=master_type, + block_device_map=block_map, + subnet_id=opts.subnet_id, + placement_group=opts.placement_group, + user_data=user_data_content, + instance_profile_name=opts.instance_profile_name) + my_master_req_id = [req.id for req in master_req] + + start_time = datetime.now() + print("Waiting for master spot instance to be granted... 
Request ID: %s " % my_master_req_id) + try: + while True: + time.sleep(10) + reqs = conn.get_all_spot_instance_requests(my_master_req_id) + active_instance_ids = filter(lambda req: req.state == "active", reqs) + invalid_states = ["capacity-not-available", "capacity-oversubscribed", "price-too-low"] + invalid = filter(lambda req: req.status.code in invalid_states, reqs) + if len(invalid) > 0: + raise Exception("Invalid state for spot request: %s - status: %s" % + (invalid[0].id, invalid[0].status.message)) + if len(active_instance_ids) == 1: + print("Master spot instance granted") + master_res = conn.get_all_reservations([r.instance_id for r in active_instance_ids]) + master_nodes = master_res[0].instances + break + else: + print("Master spot instance not granted yet, waiting longer") + + if (datetime.now() - start_time).seconds > opts.spot_timeout * 60: + raise Exception("Timed out while waiting for master spot instance") + except: + print("Error: %s" % sys.exc_info()[1]) + print("Canceling master spot instance requests") + conn.cancel_spot_instance_requests(my_master_req_id) + # Log a warning if any of these requests actually launched instances: + (master_nodes, slave_nodes) = get_existing_cluster( + conn, opts, cluster_name, die_on_error=False) + running = len(master_nodes) + len(slave_nodes) + if running: + print(("WARNING: %d instances are still running" % running), file=stderr) + sys.exit(0) + else: + # Launch ondemand instance + master_res = master_image.run( + key_name=opts.key_pair, + security_group_ids=[master_group.id] + additional_group_ids, + instance_type=master_type, + placement=opts.zone, + min_count=1, + max_count=1, + block_device_map=block_map, + subnet_id=opts.subnet_id, + placement_group=opts.placement_group, + user_data=user_data_content, + instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, + instance_profile_name=opts.instance_profile_name) + + master_nodes = master_res.instances + print("Launched master in %s, 
regid = %s" % (zone, master_res.id)) # This wait time corresponds to SPARK-4983 print("Waiting for AWS to propagate instance metadata...") From 38213b49e13492536ddafe6fe70408552014c52b Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Wed, 2 Sep 2015 16:30:17 -0300 Subject: [PATCH 036/268] Fix serialization --- src/main/scala/ignition/core/utils/IntBag.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/IntBag.scala b/src/main/scala/ignition/core/utils/IntBag.scala index 2a36da6e..a322f6f7 100644 --- a/src/main/scala/ignition/core/utils/IntBag.scala +++ b/src/main/scala/ignition/core/utils/IntBag.scala @@ -4,13 +4,13 @@ object IntBag { def from(numbers: TraversableOnce[Long]): IntBag = { val histogram = scala.collection.mutable.HashMap.empty[Long, Long] numbers.foreach(n => histogram += (n -> (histogram.getOrElse(n, 0L) + 1))) - new IntBag(histogram) + IntBag(histogram) } val empty = from(Seq.empty) } -class IntBag(val histogram: collection.Map[Long, Long]) { +case class IntBag(histogram: collection.Map[Long, Long]) { def ++(other: IntBag): IntBag = { val newHistogram = scala.collection.mutable.HashMap.empty[Long, Long] (histogram.keySet ++ other.histogram.keySet).foreach(k => newHistogram += (k -> (histogram.getOrElse(k, 0L) + other.histogram.getOrElse(k, 0L)))) From d668f40fd93f93a9409d97781e34ce9e1d0d8ea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Guilherme=20Fernandes=20Pereira?= Date: Fri, 4 Sep 2015 15:16:30 -0300 Subject: [PATCH 037/268] Date between helper --- src/main/scala/ignition/core/utils/DateUtils.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/scala/ignition/core/utils/DateUtils.scala b/src/main/scala/ignition/core/utils/DateUtils.scala index c3fb5163..8ebf3b13 100644 --- a/src/main/scala/ignition/core/utils/DateUtils.scala +++ b/src/main/scala/ignition/core/utils/DateUtils.scala @@ -20,6 +20,9 @@ object DateUtils { def isEqualOrBefore(other: DateTime) = 
dateTime.isBefore(other) || dateTime.saneEqual(other) + + def isBetween(start: DateTime, end: DateTime) = + dateTime.isAfter(start) && dateTime.isEqualOrBefore(end) } implicit class SecondsImprovements(val seconds: Seconds) { From b2a602556c699f19da79b4ec6bf442a0a777862a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matheus=20Weber=20da=20Concei=C3=A7=C3=A3o?= Date: Tue, 8 Sep 2015 15:29:16 -0300 Subject: [PATCH 038/268] Adds a TODO! --- tools/spark-ec2/spark_ec2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 3583bf1d..52c21c3f 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -759,6 +759,7 @@ def launch_cluster(conn, opts, cluster_name): instance_profile_name=opts.instance_profile_name) my_master_req_id = [req.id for req in master_req] + # TODO: refactor duplicated spot waiting code start_time = datetime.now() print("Waiting for master spot instance to be granted... Request ID: %s " % my_master_req_id) try: From 08ae1dd35b7540218c2744c259b5d4c9ee6ae9cf Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 9 Sep 2015 16:01:17 -0300 Subject: [PATCH 039/268] some kind of hack to parallel read and list files using spark cluster slaves --- .../core/jobs/utils/SparkContextUtils.scala | 112 +++++++++++++++++- 1 file changed, 107 insertions(+), 5 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index baf80bc2..f421b614 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,21 +1,26 @@ package ignition.core.jobs.utils -import java.util.Date - import ignition.core.utils.ByteUtils +import ignition.core.utils.CollectionUtils._ +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.LongWritable -import org.apache.spark.SparkContext +import 
org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.spark.{Partitioner, SparkContext} import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} import org.apache.spark.rdd.{UnionRDD, RDD} -import org.joda.time.{DateTimeZone, DateTime} +import org.joda.time.DateTime import ignition.core.utils.DateUtils._ +import scala.collection.mutable.ArrayBuffer +import scala.io.Source import scala.reflect.ClassTag import scala.util.Try - object SparkContextUtils { + case class Bucket(var size: Long, paths: ArrayBuffer[String]) + case class S3File(path: String, isDir: Boolean, size: Long) + implicit class SparkContextImprovements(sc: SparkContext) { private def getFileSystem(path: Path): FileSystem = { @@ -194,5 +199,102 @@ object SparkContextUtils { else objectHadoopFile(paths, minimumPaths) } + + def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long): RDD[String] = { + val s3Paths = parallelListFiles(paths) + val buckets = buildBuckets(s3Paths, maxBytesPerPartition) + val files = buckets.flatMap(_.paths) + + val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") + val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") + + val partitionedFiles = sc.parallelize(files).map(file => file -> ()).partitionBy(createPartitioner(buckets)) + + partitionedFiles.mapPartitions { files => + val conf = new Configuration() + conf.set("fs.s3n.awsAccessKeyId", s3Key) + conf.set("fs.s3n.awsSecretAccessKey", s3Secret) + val codecFactory = new CompressionCodecFactory(conf) + files.map { case (path, _) => path } flatMap { s3Path => + val fileSystem = FileSystem.get(new java.net.URI(s3Path), conf) + val path = new Path(s3Path) + val inputStream = Option(codecFactory.getCodec(path)) match { + case Some(compression) => compression.createInputStream(fileSystem.open(path)) + case None => fileSystem.open(path) + } + Source.fromInputStream(inputStream).getLines() + } + } + } + + private def createPartitioner(buckets: Seq[Bucket]): Partitioner = { 
+ val size = buckets.size + val partitions: Map[Any, Int] = buckets.zipWithIndex.flatMap { case (bucket, index) => bucket.paths.map(path => path -> index) }.toMap + new Partitioner { + override def numPartitions: Int = size + override def getPartition(key: Any): Int = partitions(key) + } + } + + private def buildBuckets(files: Seq[S3File], maxBytesPerPartition: Long): Seq[Bucket] = { + val buckets = ArrayBuffer.empty[Bucket] + files.distinctBy(_.path).foreach { file => + val size = file.size + val bucket = buckets.find(bucket => bucket.size + size < maxBytesPerPartition) match { + case Some(bucketFound) => bucketFound + case None => + val newBucket = Bucket(0, ArrayBuffer.empty) + buckets += newBucket + newBucket + } + bucket.size += size + bucket.paths += file.path + } + buckets + } + + def parallelListFiles(paths: Seq[String]): Seq[S3File] = { + val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") + val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") + + val remainingDirectories = new scala.collection.mutable.ArrayBuffer[S3File] + remainingDirectories ++= paths.map(S3File(_, isDir = true, 0)) + val allFiles = new scala.collection.mutable.ArrayBuffer[S3File] + + while (remainingDirectories.nonEmpty) { + val newDirs = sc.parallelize(remainingDirectories.map(_.path)) + val currentBatch = newDirs.flatMap { path => + val conf = new Configuration() + conf.set("fs.s3n.awsAccessKeyId", s3Key) + conf.set("fs.s3n.awsSecretAccessKey", s3Secret) + val fileSystem = FileSystem.get(new java.net.URI(path), conf) + try { + val hadoopPath = new Path(path) + if (fileSystem.isDirectory(hadoopPath)) { + val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } else if (fileSystem.isFile(hadoopPath)) { + val status = fileSystem.getFileStatus(hadoopPath) + Seq(S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } else { 
// Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } + } catch { + case e: java.io.FileNotFoundException => + println(s"File $path not found.") + e.printStackTrace() + Nil + } + }.collect() + val (dirs, files) = currentBatch.partition(_.isDir) + remainingDirectories.clear() + remainingDirectories ++= dirs + allFiles ++= files + } + + allFiles + } + } } From c56c0273b7b875329ff300d93af484f2beaf045f Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 14 Sep 2015 17:31:59 -0300 Subject: [PATCH 040/268] some of pr reivews --- .../core/jobs/utils/SparkContextUtils.scala | 122 +++++++++--------- 1 file changed, 62 insertions(+), 60 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index f421b614..03801857 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -18,7 +18,7 @@ import scala.util.Try object SparkContextUtils { - case class Bucket(var size: Long, paths: ArrayBuffer[String]) + case class S3FilePartition(var size: Long, paths: ArrayBuffer[String]) case class S3File(path: String, isDir: Boolean, size: Long) implicit class SparkContextImprovements(sc: SparkContext) { @@ -201,14 +201,13 @@ object SparkContextUtils { } def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long): RDD[String] = { - val s3Paths = parallelListFiles(paths) - val buckets = buildBuckets(s3Paths, maxBytesPerPartition) - val files = buckets.flatMap(_.paths) + val foundFiles = parallelListFiles(paths) + val files = foundFiles.map(_.path) val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") - val partitionedFiles = 
sc.parallelize(files).map(file => file -> ()).partitionBy(createPartitioner(buckets)) + val partitionedFiles = sc.parallelize(files).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition)) partitionedFiles.mapPartitions { files => val conf = new Configuration() @@ -222,78 +221,81 @@ object SparkContextUtils { case Some(compression) => compression.createInputStream(fileSystem.open(path)) case None => fileSystem.open(path) } - Source.fromInputStream(inputStream).getLines() + try { + Source.fromInputStream(inputStream).getLines().toList + } finally { + Try { inputStream.close() } + } } } } - private def createPartitioner(buckets: Seq[Bucket]): Partitioner = { - val size = buckets.size - val partitions: Map[Any, Int] = buckets.zipWithIndex.flatMap { case (bucket, index) => bucket.paths.map(path => path -> index) }.toMap - new Partitioner { - override def numPartitions: Int = size - override def getPartition(key: Any): Int = partitions(key) - } - } - - private def buildBuckets(files: Seq[S3File], maxBytesPerPartition: Long): Seq[Bucket] = { - val buckets = ArrayBuffer.empty[Bucket] + private def createPartitioner(files: Seq[S3File], maxBytesPerPartition: Long): Partitioner = { + val partitions = ArrayBuffer.empty[S3FilePartition] files.distinctBy(_.path).foreach { file => val size = file.size - val bucket = buckets.find(bucket => bucket.size + size < maxBytesPerPartition) match { - case Some(bucketFound) => bucketFound + val partition = partitions.find(bucket => bucket.size + size < maxBytesPerPartition) match { + case Some(partitionFound) => partitionFound case None => - val newBucket = Bucket(0, ArrayBuffer.empty) - buckets += newBucket - newBucket + val newPartition = S3FilePartition(0, ArrayBuffer.empty) + partitions += newPartition + newPartition } - bucket.size += size - bucket.paths += file.path + partition.size += size + partition.paths += file.path + } + + val indexedPartitions: Map[Any, Int] = partitions.zipWithIndex.flatMap { 
+ case (bucket, index) => bucket.paths.map(path => path -> index) + }.toMap + + new Partitioner { + override def numPartitions: Int = partitions.size + override def getPartition(key: Any): Int = indexedPartitions(key) } - buckets } def parallelListFiles(paths: Seq[String]): Seq[S3File] = { val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") - val remainingDirectories = new scala.collection.mutable.ArrayBuffer[S3File] - remainingDirectories ++= paths.map(S3File(_, isDir = true, 0)) - val allFiles = new scala.collection.mutable.ArrayBuffer[S3File] - - while (remainingDirectories.nonEmpty) { - val newDirs = sc.parallelize(remainingDirectories.map(_.path)) - val currentBatch = newDirs.flatMap { path => - val conf = new Configuration() - conf.set("fs.s3n.awsAccessKeyId", s3Key) - conf.set("fs.s3n.awsSecretAccessKey", s3Secret) - val fileSystem = FileSystem.get(new java.net.URI(path), conf) - try { - val hadoopPath = new Path(path) - if (fileSystem.isDirectory(hadoopPath)) { - val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } else if (fileSystem.isFile(hadoopPath)) { - val status = fileSystem.getFileStatus(hadoopPath) - Seq(S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } else { // Maybe is glob or not found - val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + val directories = paths.map(S3File(_, isDir = true, 0)) + + def innerListFiles(remainingDirectories: Seq[S3File]): Seq[S3File] = { + if (remainingDirectories.isEmpty) { + Nil + } else { + val newDirs = sc.parallelize(remainingDirectories.map(_.path)) + val currentBatch = newDirs.flatMap { path => + val conf = new Configuration() + conf.set("fs.s3n.awsAccessKeyId", 
s3Key) + conf.set("fs.s3n.awsSecretAccessKey", s3Secret) + val fileSystem = FileSystem.get(new java.net.URI(path), conf) + try { + val hadoopPath = new Path(path) + if (fileSystem.isDirectory(hadoopPath)) { + val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } else if (fileSystem.isFile(hadoopPath)) { + val status = fileSystem.getFileStatus(hadoopPath) + Seq(S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } else { + // Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } + } catch { + case e: java.io.FileNotFoundException => + println(s"File $path not found.") + Nil } - } catch { - case e: java.io.FileNotFoundException => - println(s"File $path not found.") - e.printStackTrace() - Nil - } - }.collect() - val (dirs, files) = currentBatch.partition(_.isDir) - remainingDirectories.clear() - remainingDirectories ++= dirs - allFiles ++= files - } + }.collect() - allFiles + val (dirs, files) = currentBatch.partition(_.isDir) + files ++ innerListFiles(dirs) + } + } + innerListFiles(directories) } } From 8ffee27d402a33bffd54fad2b11d4b092709d9f4 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 15 Sep 2015 14:26:20 -0300 Subject: [PATCH 041/268] pr review --- .../core/jobs/utils/SparkContextUtils.scala | 107 ++++++++---------- 1 file changed, 49 insertions(+), 58 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 03801857..3fdfea09 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,16 +1,17 @@ package ignition.core.jobs.utils import 
ignition.core.utils.ByteUtils -import ignition.core.utils.CollectionUtils._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.spark.broadcast.Broadcast import org.apache.spark.{Partitioner, SparkContext} import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} import org.apache.spark.rdd.{UnionRDD, RDD} import org.joda.time.DateTime import ignition.core.utils.DateUtils._ +import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer import scala.io.Source import scala.reflect.ClassTag @@ -18,8 +19,9 @@ import scala.util.Try object SparkContextUtils { - case class S3FilePartition(var size: Long, paths: ArrayBuffer[String]) - case class S3File(path: String, isDir: Boolean, size: Long) + case class HadoopFile(path: String, isDir: Boolean, size: Long) + + private case class HadoopFilePartition(size: Long, paths: Seq[String]) implicit class SparkContextImprovements(sc: SparkContext) { @@ -201,18 +203,13 @@ object SparkContextUtils { } def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long): RDD[String] = { - val foundFiles = parallelListFiles(paths) - val files = foundFiles.map(_.path) - - val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") - val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") + val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) - val partitionedFiles = sc.parallelize(files).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition)) + val foundFiles = parallelListFiles(paths) + val partitionedFiles = sc.parallelize(foundFiles.map(_.path)).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition)) partitionedFiles.mapPartitions { files => - val conf = new Configuration() - conf.set("fs.s3n.awsAccessKeyId", s3Key) - 
conf.set("fs.s3n.awsSecretAccessKey", s3Secret) + val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) files.map { case (path, _) => path } flatMap { s3Path => val fileSystem = FileSystem.get(new java.net.URI(s3Path), conf) @@ -222,7 +219,7 @@ object SparkContextUtils { case None => fileSystem.open(path) } try { - Source.fromInputStream(inputStream).getLines().toList + Source.fromInputStream(inputStream).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } finally { Try { inputStream.close() } } @@ -230,19 +227,15 @@ object SparkContextUtils { } } - private def createPartitioner(files: Seq[S3File], maxBytesPerPartition: Long): Partitioner = { - val partitions = ArrayBuffer.empty[S3FilePartition] - files.distinctBy(_.path).foreach { file => - val size = file.size - val partition = partitions.find(bucket => bucket.size + size < maxBytesPerPartition) match { - case Some(partitionFound) => partitionFound - case None => - val newPartition = S3FilePartition(0, ArrayBuffer.empty) - partitions += newPartition - newPartition - } - partition.size += size - partition.paths += file.path + private def createPartitioner(files: Seq[HadoopFile], maxBytesPerPartition: Long): Partitioner = { + val partitions = files.foldLeft(Seq.empty[HadoopFilePartition]) { + case (acc, file) => + acc.find(bucket => bucket.size + file.size < maxBytesPerPartition) match { + case Some(found) => + val updated = found.copy(size = found.size + file.size, paths = file.path +: found.paths) + acc.updated(acc.indexOf(found), updated) + case None => acc :+ HadoopFilePartition(file.size, Seq(file.path)) + } } val indexedPartitions: Map[Any, Int] = partitions.zipWithIndex.flatMap { @@ -255,43 +248,41 @@ object SparkContextUtils { } } - def parallelListFiles(paths: Seq[String]): Seq[S3File] = { - val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") - val s3Secret = 
sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") + private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): Seq[HadoopFile] = { + paths.flatMap { path => + val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } + val fileSystem = FileSystem.get(new java.net.URI(path), conf) + try { + val hadoopPath = new Path(path) + if (fileSystem.isDirectory(hadoopPath)) { + val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + } else if (fileSystem.isFile(hadoopPath)) { + val status = fileSystem.getFileStatus(hadoopPath) + Seq(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + } else { + // Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + } + } catch { + case e: java.io.FileNotFoundException => + println(s"File $path not found.") + Nil + } + }.collect().toSeq + } - val directories = paths.map(S3File(_, isDir = true, 0)) + def parallelListFiles(paths: Seq[String]): Seq[HadoopFile] = { + val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + val directories = paths.map(HadoopFile(_, isDir = true, 0)) - def innerListFiles(remainingDirectories: Seq[S3File]): Seq[S3File] = { + def innerListFiles(remainingDirectories: Seq[HadoopFile]): Seq[HadoopFile] = { if (remainingDirectories.isEmpty) { Nil } else { - val newDirs = sc.parallelize(remainingDirectories.map(_.path)) - val currentBatch = newDirs.flatMap { path => - val conf = new Configuration() - conf.set("fs.s3n.awsAccessKeyId", s3Key) - conf.set("fs.s3n.awsSecretAccessKey", s3Secret) - val fileSystem = FileSystem.get(new java.net.URI(path), conf) - try { - val 
hadoopPath = new Path(path) - if (fileSystem.isDirectory(hadoopPath)) { - val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } else if (fileSystem.isFile(hadoopPath)) { - val status = fileSystem.getFileStatus(hadoopPath) - Seq(S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } else { - // Maybe is glob or not found - val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } - } catch { - case e: java.io.FileNotFoundException => - println(s"File $path not found.") - Nil - } - }.collect() - - val (dirs, files) = currentBatch.partition(_.isDir) + val pathsRDD = sc.parallelize(remainingDirectories.map(_.path)) + val (dirs, files) = executeListOnWorkers(hadoopConf, pathsRDD).partition(_.isDir) files ++ innerListFiles(dirs) } } From 7234254729377453b7750166765554cb3eb22951 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 16 Sep 2015 10:14:09 -0300 Subject: [PATCH 042/268] logging input stream close failure --- .../core/jobs/utils/SparkContextUtils.scala | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 3fdfea09..2259d622 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -16,6 +16,7 @@ import scala.collection.mutable.ArrayBuffer import scala.io.Source import scala.reflect.ClassTag import scala.util.Try +import scala.util.control.NonFatal object SparkContextUtils { @@ -211,17 +212,23 @@ object SparkContextUtils { partitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, 
v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) - files.map { case (path, _) => path } flatMap { s3Path => - val fileSystem = FileSystem.get(new java.net.URI(s3Path), conf) - val path = new Path(s3Path) - val inputStream = Option(codecFactory.getCodec(path)) match { - case Some(compression) => compression.createInputStream(fileSystem.open(path)) - case None => fileSystem.open(path) + files.map { case (path, _) => path } flatMap { path => + val fileSystem = FileSystem.get(new java.net.URI(path), conf) + val hadoopPath = new Path(path) + val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { + case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) + case None => fileSystem.open(hadoopPath) } try { Source.fromInputStream(inputStream).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } finally { - Try { inputStream.close() } + try { + inputStream.close() + } catch { + case NonFatal(ex) => + println(s"Fail to close resource from '$path'") + ex.printStackTrace() + } } } } From af00eefa0975159ae760bbfec4444638b9862293 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 16 Sep 2015 10:34:33 -0300 Subject: [PATCH 043/268] better exception report --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 2259d622..0d7f0742 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -226,8 +226,7 @@ object SparkContextUtils { inputStream.close() } catch { case NonFatal(ex) => - println(s"Fail to close resource from '$path'") - ex.printStackTrace() + println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") } } } From 
89630eb7f990c36ae749cc8312e6bc199b8b15e7 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 17 Sep 2015 13:53:36 -0300 Subject: [PATCH 044/268] setting UTF-8 codec to read file content (same behavior of hadoop client) --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 0d7f0742..ab20c83b 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -13,7 +13,7 @@ import ignition.core.utils.DateUtils._ import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer -import scala.io.Source +import scala.io.{Codec, Source} import scala.reflect.ClassTag import scala.util.Try import scala.util.control.NonFatal @@ -220,7 +220,7 @@ object SparkContextUtils { case None => fileSystem.open(hadoopPath) } try { - Source.fromInputStream(inputStream).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) + Source.fromInputStream(inputStream)(Codec.UTF8).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } finally { try { inputStream.close() From 06ac774d980a47c05670760bd9c3d8725aabc45f Mon Sep 17 00:00:00 2001 From: Fernando Rodrigues Date: Mon, 21 Sep 2015 19:31:45 -0300 Subject: [PATCH 045/268] will delete SG's after cluster destroy --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index ed348fbb..3f4065e3 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -313,7 +313,7 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {} after failures'.format(cluster_name)) -def destroy(cluster_name, delete_groups=False, region=default_region): +def destroy(cluster_name, delete_groups=True, region=default_region): delete_sg_param = 
['--delete-groups'] if delete_groups else [] ec2_script_path = chdir_to_ec2_script_and_get_path() From 10b086e745f9d304df3e77af1215c34cf0c5b59c Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 15 Oct 2015 14:41:58 -0300 Subject: [PATCH 046/268] spark 1.5.1 update --- build.sbt | 2 +- tools/cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 476dd3bb..acdef9cb 100644 --- a/build.sbt +++ b/build.sbt @@ -13,7 +13,7 @@ ideaExcludeFolders += ".idea_modules" // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.4.1" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") diff --git a/tools/cluster.py b/tools/cluster.py index 3f4065e3..cd972951 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -47,7 +47,7 @@ default_ami = None # will be decided based on spark-ec2 list default_master_ami = None default_env = 'dev' -default_spark_version = '1.4.1' +default_spark_version = '1.5.1' default_spark_repo = 'https://github.com/chaordic/spark' default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' From a59f2eb92416915304edad9fd5f72f5c048e78a5 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 15 Oct 2015 15:34:38 -0300 Subject: [PATCH 047/268] fix spark_ec2.py --- tools/spark-ec2/spark_ec2.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 52c21c3f..89ade820 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -51,7 +51,7 @@ raw_input = input xrange = range -SPARK_EC2_VERSION = "1.4.1" +SPARK_EC2_VERSION = "1.5.1" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -72,6 +72,8 @@ "1.3.1", 
"1.4.0", "1.4.1", + "1.5.0", + "1.5.1", ]) SPARK_TACHYON_MAP = { @@ -86,6 +88,8 @@ "1.3.1": "0.5.0", "1.4.0": "0.6.4", "1.4.1": "0.6.4", + "1.5.0": "0.7.1", + "1.5.1": "0.7.1", } DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION From b176cc51a513ac6c1d8155ce98e6ea345f6d9abd Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 15 Oct 2015 17:03:58 -0300 Subject: [PATCH 048/268] Added executor instances option --- tools/cluster.py | 6 +++++- .../deploy.generic/root/spark-ec2/ec2-variables.sh | 1 + tools/spark-ec2/spark_ec2.py | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index cd972951..0af46ebe 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -38,6 +38,7 @@ default_instance_type = 'r3.xlarge' default_spot_price = '0.10' default_worker_instances = '1' +default_executor_instances = '1' default_master_instance_type = 'm3.xlarge' default_driver_heap_size = '12G' default_region = 'us-east-1' @@ -209,7 +210,9 @@ def launch(cluster_name, slaves, vpc_subnet = None, master_instance_type=default_master_instance_type, wait_time='180', hadoop_major_version='2', - worker_instances=default_worker_instances, retries_on_same_cluster=5, + worker_instances=default_worker_instances, + executor_instances=default_executor_instances, + retries_on_same_cluster=5, max_clusters_to_create=5, minimum_percentage_healthy_slaves=0.9, remote_user=default_remote_user, @@ -272,6 +275,7 @@ def launch(cluster_name, slaves, '--spark-ec2-git-repo', spark_ec2_git_repo, '--spark-ec2-git-branch', spark_ec2_git_branch, '--worker-instances', worker_instances, + '--executor-instances', executor_instances, '--master-opts', '-Dspark.worker.timeout={0}'.format(worker_timeout), '--spark-git-repo', spark_repo, '-v', spark_version, diff --git a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh index 4f3e8da8..bd3b656f 100644 --- 
a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh +++ b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh @@ -29,6 +29,7 @@ export TACHYON_VERSION="{{tachyon_version}}" export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" export SWAP_MB="{{swap}}" export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" +export SPARK_EXECUTOR_INSTANCES="{{spark_executor_instances}}" export SPARK_MASTER_OPTS="{{spark_master_opts}}" export AWS_ACCESS_KEY_ID="{{aws_access_key_id}}" export AWS_SECRET_ACCESS_KEY="{{aws_secret_access_key}}" diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 89ade820..e9442448 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -288,6 +288,10 @@ def parse_args(): "--worker-instances", type="int", default=1, help="Number of instances per worker: variable SPARK_WORKER_INSTANCES. Not used if YARN " + "is used as Hadoop major version (default: %default)") + parser.add_option( + "--executor-instances", type="int", default=1, + help="Number of executor instances per worker: variable SPARK_EXECUTOR_INSTANCES. 
Not used if YARN " + + "is used as Hadoop major version (default: %default)") parser.add_option( "--master-opts", type="string", default="", help="Extra options to give to master through SPARK_MASTER_OPTS variable " + @@ -1161,6 +1165,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes] slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes] worker_instances_str = "%d" % opts.worker_instances if opts.worker_instances else "" + executor_instances_str = "%d" % opts.executor_instances if opts.executor_instances else "" template_vars = { "master_list": '\n'.join(master_addresses), "active_master": active_master, @@ -1175,6 +1180,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): "tachyon_version": tachyon_v, "hadoop_major_version": opts.hadoop_major_version, "spark_worker_instances": worker_instances_str, + "spark_executor_instances": executor_instances_str, "spark_master_opts": opts.master_opts } From 437e2644463157d5df0328df63de8cb8d68bdef7 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 27 Oct 2015 16:50:59 -0200 Subject: [PATCH 049/268] Adding filterAndGetParallelTextFiles --- .../core/jobs/utils/SparkContextUtils.scala | 42 +++++++++++++++++-- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index ab20c83b..00cbb347 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -36,7 +36,7 @@ object SparkContextUtils { for { path <- paths status <- Option(fs.globStatus(path)).getOrElse(Array.empty).toSeq - if status.isDirectory || !removeEmpty || status.getLen > 0 // remove empty files if necessary + if !removeEmpty || status.getLen > 0 || status.isDirectory // remove empty 
files if necessary } yield status } @@ -69,6 +69,14 @@ object SparkContextUtils { processPaths((p) => sc.textFile(p), paths, minimumPaths) } + private def processParallelTextFiles(paths: Seq[String], minimumPaths: Int, maxBytesPerPartition: Long): RDD[String] = { + val splittedPaths = paths.flatMap(ignition.core.utils.HadoopUtils.getPathStrings) + if (splittedPaths.size < minimumPaths) + throw new Exception(s"Not enough paths found for $paths") + + parallelTextFiles(splittedPaths, maxBytesPerPartition) + } + private def filterPaths(paths: Seq[String], requireSuccess: Boolean, inclusiveStartDate: Boolean, @@ -145,6 +153,14 @@ object SparkContextUtils { processTextFiles(paths, minimumPaths) } + def getParallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long, synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { + if (synchLocally) + processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition) + else + processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition) + } + + @deprecated("It may incur in heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") def filterAndGetTextFiles(path: String, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, @@ -162,6 +178,24 @@ object SparkContextUtils { getTextFiles(paths, synchLocally, forceSynch, minimumPaths) } + def filterAndGetParallelTextFiles(path: String, + maxBytesPerPartition: Long = 64 * 1000 * 1000, + requireSuccess: Boolean = false, + inclusiveStartDate: Boolean = true, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = true, + endDate: Option[DateTime] = None, + lastN: Option[Int] = None, + synchLocally: Boolean = false, + forceSynch: Boolean = false, + ignoreMalformedDates: Boolean = false, + minimumPaths: Int = 1)(implicit dateExtractor: PathDateExtractor): RDD[String] = { + val paths = getFilteredPaths(Seq(path), 
requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) + if (paths.size < minimumPaths) + throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") + getParallelTextFiles(paths, maxBytesPerPartition, synchLocally, forceSynch, minimumPaths) + } + private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { processPaths((p) => sc.sequenceFile(p, classOf[LongWritable], classOf[org.apache.hadoop.io.BytesWritable]) .map({ case (k, v) => Try { ByteUtils.toString(v.getBytes, 0, v.getLength, "UTF-8") } }), paths, minimumPaths) @@ -260,11 +294,11 @@ object SparkContextUtils { val fileSystem = FileSystem.get(new java.net.URI(path), conf) try { val hadoopPath = new Path(path) - if (fileSystem.isDirectory(hadoopPath)) { + val status = fileSystem.getFileStatus(hadoopPath) + if (status.isDirectory) { val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) - } else if (fileSystem.isFile(hadoopPath)) { - val status = fileSystem.getFileStatus(hadoopPath) + } else if (status.isFile) { Seq(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) } else { // Maybe is glob or not found From 637b80d9995e646b5260605c049a76487a054d03 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 27 Oct 2015 19:57:00 -0200 Subject: [PATCH 050/268] Many improvements --- .../core/jobs/utils/SparkContextUtils.scala | 45 ++++++++++++------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 00cbb347..490cda2c 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ 
b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -13,6 +13,7 @@ import ignition.core.utils.DateUtils._ import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable import scala.io.{Codec, Source} import scala.reflect.ClassTag import scala.util.Try @@ -69,12 +70,12 @@ object SparkContextUtils { processPaths((p) => sc.textFile(p), paths, minimumPaths) } - private def processParallelTextFiles(paths: Seq[String], minimumPaths: Int, maxBytesPerPartition: Long): RDD[String] = { + private def processParallelTextFiles(paths: Seq[String], minimumPaths: Int, maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { val splittedPaths = paths.flatMap(ignition.core.utils.HadoopUtils.getPathStrings) if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelTextFiles(splittedPaths, maxBytesPerPartition) + parallelTextFiles(splittedPaths, maxBytesPerPartition, minPartitions) } private def filterPaths(paths: Seq[String], @@ -146,6 +147,7 @@ object SparkContextUtils { } + @deprecated("It may incur heavy S3 costs and/or be slow with small files, use getParallelTextFiles instead", "2015-10-27") def getTextFiles(paths: Seq[String], synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { if (synchLocally) processTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths) @@ -153,14 +155,17 @@ object SparkContextUtils { processTextFiles(paths, minimumPaths) } - def getParallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long, synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { + def getParallelTextFiles(paths: Seq[String], + maxBytesPerPartition: Long = 64 * 1000 * 1000, + minPartitions: Int = 500, + synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { if (synchLocally) - 
processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition) + processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, minPartitions) else - processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition) + processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition, minPartitions) } - @deprecated("It may incur in heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") + @deprecated("It may incur heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") def filterAndGetTextFiles(path: String, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, @@ -180,6 +185,7 @@ object SparkContextUtils { def filterAndGetParallelTextFiles(path: String, maxBytesPerPartition: Long = 64 * 1000 * 1000, + minPartitions: Int = 500, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, startDate: Option[DateTime] = None, @@ -193,7 +199,7 @@ object SparkContextUtils { val paths = getFilteredPaths(Seq(path), requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) if (paths.size < minimumPaths) throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") - getParallelTextFiles(paths, maxBytesPerPartition, synchLocally, forceSynch, minimumPaths) + getParallelTextFiles(paths, maxBytesPerPartition, minPartitions, synchLocally, forceSynch, minimumPaths) } private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { @@ -237,17 +243,18 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long): RDD[String] = { + def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long, minPartitions: 
Int): RDD[String] = { + require(paths.nonEmpty, "At least one path is required") val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) val foundFiles = parallelListFiles(paths) - val partitionedFiles = sc.parallelize(foundFiles.map(_.path)).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition)) + val partitionedFiles = sc.parallelize(foundFiles.map(_.path)).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition, minPartitions)) partitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) + val fileSystem = FileSystem.get(new java.net.URI(paths.head), conf) files.map { case (path, _) => path } flatMap { path => - val fileSystem = FileSystem.get(new java.net.URI(path), conf) val hadoopPath = new Path(path) val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) @@ -267,16 +274,22 @@ object SparkContextUtils { } } - private def createPartitioner(files: Seq[HadoopFile], maxBytesPerPartition: Long): Partitioner = { - val partitions = files.foldLeft(Seq.empty[HadoopFilePartition]) { + private def createPartitioner(files: Seq[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long): Partitioner = { + implicit val ordering: Ordering[HadoopFilePartition] = Ordering.by(p => -p.size) // Small partitions come first (highest priority) + + val pq: mutable.PriorityQueue[HadoopFilePartition] = mutable.PriorityQueue.empty + + (0L until minPartitions).foreach(_ => pq += HadoopFilePartition(0, Seq.empty)) + + val partitions = files.foldLeft(pq) { case (acc, file) => - acc.find(bucket => bucket.size + file.size < maxBytesPerPartition) match { + acc.headOption.filter(bucket => bucket.size + file.size < 
maxBytesPerPartition) match { case Some(found) => val updated = found.copy(size = found.size + file.size, paths = file.path +: found.paths) - acc.updated(acc.indexOf(found), updated) - case None => acc :+ HadoopFilePartition(file.size, Seq(file.path)) + acc.tail += updated + case None => acc += HadoopFilePartition(file.size, Seq(file.path)) } - } + }.filter(_.size > 0).toList // Remove empty partitions val indexedPartitions: Map[Any, Int] = partitions.zipWithIndex.flatMap { case (bucket, index) => bucket.paths.map(path => path -> index) From 0563fab5fa7bbf900436a0bb95b303f62a22bf12 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Wed, 28 Oct 2015 15:53:45 -0200 Subject: [PATCH 051/268] Small improvements --- .../ignition/core/jobs/utils/RDDUtils.scala | 4 ++ .../core/jobs/utils/SparkContextUtils.scala | 37 ++++++++++--------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index 7e75d5ec..57069bae 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -80,6 +80,10 @@ object RDDUtils { }, preservesPartitioning = true) } + def collectValues[U: ClassTag](f: PartialFunction[V, U]): RDD[(K, U)] = { + rdd.filter { case (k, v) => f.isDefinedAt(v) }.mapValues(f) + } + def groupByKeyAndTake(n: Int): RDD[(K, List[V])] = rdd.aggregateByKey(List.empty[V])( (lst, v) => diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 490cda2c..8e4ec35c 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -75,7 +75,7 @@ object SparkContextUtils { if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelTextFiles(splittedPaths, maxBytesPerPartition, 
minPartitions) + parallelTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions) } private def filterPaths(paths: Seq[String], @@ -243,7 +243,7 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { + def parallelTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { require(paths.nonEmpty, "At least one path is required") val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) @@ -274,7 +274,7 @@ object SparkContextUtils { } } - private def createPartitioner(files: Seq[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long): Partitioner = { + private def createPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long): Partitioner = { implicit val ordering: Ordering[HadoopFilePartition] = Ordering.by(p => -p.size) // Small partitions come first (highest priority) val pq: mutable.PriorityQueue[HadoopFilePartition] = mutable.PriorityQueue.empty @@ -289,7 +289,7 @@ object SparkContextUtils { acc.tail += updated case None => acc += HadoopFilePartition(file.size, Seq(file.path)) } - }.filter(_.size > 0).toList // Remove empty partitions + }.filter(_.paths.nonEmpty).toList // Remove empty partitions val indexedPartitions: Map[Any, Int] = partitions.zipWithIndex.flatMap { case (bucket, index) => bucket.paths.map(path => path -> index) @@ -301,36 +301,39 @@ object SparkContextUtils { } } - private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): Seq[HadoopFile] = { + private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): List[HadoopFile] = { paths.flatMap { path => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val fileSystem = FileSystem.get(new 
java.net.URI(path), conf) - try { - val hadoopPath = new Path(path) + val hadoopPath = new Path(path) + val tryFind = try { val status = fileSystem.getFileStatus(hadoopPath) if (status.isDirectory) { val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + Option(sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList) } else if (status.isFile) { - Seq(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + Option(List(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen))) } else { - // Maybe is glob or not found - val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + None } } catch { case e: java.io.FileNotFoundException => - println(s"File $path not found.") - Nil + None + } + + tryFind.getOrElse { + // Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList } - }.collect().toSeq + }.collect().toList } - def parallelListFiles(paths: Seq[String]): Seq[HadoopFile] = { + def parallelListFiles(paths: List[String]): List[HadoopFile] = { val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) val directories = paths.map(HadoopFile(_, isDir = true, 0)) - def innerListFiles(remainingDirectories: Seq[HadoopFile]): Seq[HadoopFile] = { + def innerListFiles(remainingDirectories: List[HadoopFile]): List[HadoopFile] = { if (remainingDirectories.isEmpty) { Nil } else { From cc4f716f7485e6ca314318006fb3fb943937e3e1 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Wed, 28 Oct 2015 17:29:44 -0200 Subject: [PATCH 052/268] Fix file 
system issues in corner cases --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 8e4ec35c..6e4d0bc8 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -253,9 +253,9 @@ object SparkContextUtils { partitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) - val fileSystem = FileSystem.get(new java.net.URI(paths.head), conf) files.map { case (path, _) => path } flatMap { path => val hadoopPath = new Path(path) + val fileSystem = hadoopPath.getFileSystem(conf) val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) case None => fileSystem.open(hadoopPath) @@ -304,8 +304,8 @@ object SparkContextUtils { private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): List[HadoopFile] = { paths.flatMap { path => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } - val fileSystem = FileSystem.get(new java.net.URI(path), conf) val hadoopPath = new Path(path) + val fileSystem = hadoopPath.getFileSystem(conf) val tryFind = try { val status = fileSystem.getFileStatus(hadoopPath) if (status.isDirectory) { From 5a4916489e43d7d3e67818e83969a17c25e6aecb Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 3 Nov 2015 09:09:23 -0200 Subject: [PATCH 053/268] Make it faster in some situations --- .../core/jobs/utils/SparkContextUtils.scala | 90 +++++++++++++++---- 1 file changed, 74 insertions(+), 16 deletions(-) diff --git 
a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 6e4d0bc8..842ced37 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -27,6 +27,8 @@ object SparkContextUtils { implicit class SparkContextImprovements(sc: SparkContext) { + lazy val _hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + private def getFileSystem(path: Path): FileSystem = { path.getFileSystem(sc.hadoopConfiguration) } @@ -70,12 +72,16 @@ object SparkContextUtils { processPaths((p) => sc.textFile(p), paths, minimumPaths) } - private def processParallelTextFiles(paths: Seq[String], minimumPaths: Int, maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { + private def processParallelTextFiles(paths: Seq[String], + minimumPaths: Int, + maxBytesPerPartition: Long, + minPartitions: Int, + listOnWorkers: Boolean): RDD[String] = { val splittedPaths = paths.flatMap(ignition.core.utils.HadoopUtils.getPathStrings) if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions) + parallelTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) } private def filterPaths(paths: Seq[String], @@ -158,11 +164,11 @@ object SparkContextUtils { def getParallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long = 64 * 1000 * 1000, minPartitions: Int = 500, - synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { + synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1, listOnWorkers: Boolean = false): RDD[String] = { if (synchLocally) - processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, 
minPartitions) + processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) else - processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition, minPartitions) + processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) } @deprecated("It may incur heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") @@ -195,11 +201,12 @@ object SparkContextUtils { synchLocally: Boolean = false, forceSynch: Boolean = false, ignoreMalformedDates: Boolean = false, - minimumPaths: Int = 1)(implicit dateExtractor: PathDateExtractor): RDD[String] = { + minimumPaths: Int = 1, + listOnWorkers: Boolean = false)(implicit dateExtractor: PathDateExtractor): RDD[String] = { val paths = getFilteredPaths(Seq(path), requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) if (paths.size < minimumPaths) throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") - getParallelTextFiles(paths, maxBytesPerPartition, minPartitions, synchLocally, forceSynch, minimumPaths) + getParallelTextFiles(paths, maxBytesPerPartition, minPartitions, synchLocally, forceSynch, minimumPaths, listOnWorkers) } private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { @@ -243,13 +250,12 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - def parallelTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { - require(paths.nonEmpty, "At least one path is required") - val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + def parallelTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int, listOnWorkers: Boolean): 
RDD[String] = { - val foundFiles = parallelListFiles(paths) - val partitionedFiles = sc.parallelize(foundFiles.map(_.path)).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition, minPartitions)) + val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) + val partitionedFiles = sc.parallelize(foundFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition, minPartitions)) + val hadoopConf = _hadoopConf partitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) @@ -262,6 +268,10 @@ object SparkContextUtils { } try { Source.fromInputStream(inputStream)(Codec.UTF8).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) + } catch { + case NonFatal(ex) => + println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") + throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") } finally { try { inputStream.close() @@ -301,7 +311,9 @@ object SparkContextUtils { } } - private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): List[HadoopFile] = { + + private def executeListOnWorkers(paths: RDD[String]): List[HadoopFile] = { + val hadoopConf = _hadoopConf paths.flatMap { path => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val hadoopPath = new Path(path) @@ -329,16 +341,62 @@ object SparkContextUtils { }.collect().toList } + def parallelListFiles(paths: List[String]): List[HadoopFile] = { - val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + + val directories = paths.map(HadoopFile(_, isDir = true, 0)) + + def innerListFiles(remainingDirectories: 
List[HadoopFile]): List[HadoopFile] = { + if (remainingDirectories.isEmpty) { + Nil + } else { + val remainingPaths = remainingDirectories.map(_.path) + val pathsRDD = sc.parallelize(remainingPaths, remainingPaths.size / 2) + val (dirs, files) = executeListOnWorkers(pathsRDD).partition(_.isDir) + files ++ innerListFiles(dirs) + } + } + innerListFiles(directories) + } + + + private def executeDriverList(paths: Seq[String]): List[HadoopFile] = { + val conf = _hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } + paths.flatMap { path => + val hadoopPath = new Path(path) + val fileSystem = hadoopPath.getFileSystem(conf) + val tryFind = try { + val status = fileSystem.getFileStatus(hadoopPath) + if (status.isDirectory) { + val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) + Option(sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList) + } else if (status.isFile) { + Option(List(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen))) + } else { + None + } + } catch { + case e: java.io.FileNotFoundException => + None + } + + tryFind.getOrElse { + // Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList + } + }.toList + } + + def driverListFiles(paths: List[String]): List[HadoopFile] = { + val directories = paths.map(HadoopFile(_, isDir = true, 0)) def innerListFiles(remainingDirectories: List[HadoopFile]): List[HadoopFile] = { if (remainingDirectories.isEmpty) { Nil } else { - val pathsRDD = sc.parallelize(remainingDirectories.map(_.path)) - val (dirs, files) = executeListOnWorkers(hadoopConf, pathsRDD).partition(_.isDir) + val (dirs, files) = executeDriverList(remainingDirectories.map(_.path)).partition(_.isDir) files ++ innerListFiles(dirs) } } From 
506bd1c72affb05c4ebfc001440cfe178e1d30ba Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 9 Nov 2015 13:47:19 -0200 Subject: [PATCH 054/268] Split gzip files and other improvements --- build.sbt | 2 + .../ignition/core/jobs/CoreJobRunner.scala | 2 + .../core/jobs/utils/SparkContextUtils.scala | 93 +++++++++++++++---- tools/cluster.py | 5 + 4 files changed, 85 insertions(+), 17 deletions(-) diff --git a/build.sbt b/build.sbt index acdef9cb..528d30cf 100644 --- a/build.sbt +++ b/build.sbt @@ -19,6 +19,8 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") +libraryDependencies += "nl.basjes.hadoop" % "splittablegzip" % "1.2" + libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.2.4" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index aa4dcc76..ec5d9039 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -67,6 +67,8 @@ object CoreJobRunner { val sparkConf = new SparkConf() sparkConf.set("spark.executor.memory", config.executorMemory) + sparkConf.set("spark.hadoop.io.compression.codecs", + "org.apache.hadoop.io.compress.DefaultCodec,nl.basjes.hadoop.io.compress.SplittableGzipCodec,org.apache.hadoop.io.compress.BZip2Codec") sparkConf.setMaster(config.master) sparkConf.setAppName(appName) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 842ced37..78b6ec9b 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -2,8 +2,9 @@ package ignition.core.jobs.utils import ignition.core.utils.ByteUtils import 
org.apache.hadoop.conf.Configuration -import org.apache.hadoop.io.LongWritable +import org.apache.hadoop.io.{Text, LongWritable} import org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.broadcast.Broadcast import org.apache.spark.{Partitioner, SparkContext} import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} @@ -21,6 +22,10 @@ import scala.util.control.NonFatal object SparkContextUtils { + case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { + override def getPartition(key: Any): Int = index(key) + } + case class HadoopFile(path: String, isDir: Boolean, size: Long) private case class HadoopFilePartition(size: Long, paths: Seq[String]) @@ -162,8 +167,8 @@ object SparkContextUtils { } def getParallelTextFiles(paths: Seq[String], - maxBytesPerPartition: Long = 64 * 1000 * 1000, - minPartitions: Int = 500, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1, listOnWorkers: Boolean = false): RDD[String] = { if (synchLocally) processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) @@ -190,8 +195,8 @@ object SparkContextUtils { } def filterAndGetParallelTextFiles(path: String, - maxBytesPerPartition: Long = 64 * 1000 * 1000, - minPartitions: Int = 500, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, startDate: Option[DateTime] = None, @@ -250,13 +255,27 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - def parallelTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int, listOnWorkers: Boolean): RDD[String] = { + case class SizeBasedFileHandling(averageEstimatedCompressionRatio: Int = 8, + 
compressedExtensions: Set[String] = Set(".gz")) { + + def isBig(f: HadoopFile, uncompressedBigSize: Long): Boolean = estimatedSize(f) >= uncompressedBigSize + + def estimatedSize(f: HadoopFile) = if (isCompressed(f)) + f.size * averageEstimatedCompressionRatio + else + f.size + + def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) + } - val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) - val partitionedFiles = sc.parallelize(foundFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition, minPartitions)) + def readSmallFiles(smallFiles: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf - partitionedFiles.mapPartitions { files => + smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) files.map { case (path, _) => path } flatMap { path => @@ -284,7 +303,48 @@ object SparkContextUtils { } } - private def createPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long): Partitioner = { + def readBigFiles(bigFiles: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { + def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq("mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) + .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } + + def read(file: HadoopFile, conf: Configuration) = 
sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](conf = conf, fClass = classOf[TextInputFormat], + kClass = classOf[LongWritable], vClass = classOf[Text], path = file.path).map(pair => pair._2.toString) + + val confCompressed = confWith(maxBytesPerPartition / sizeBasedFileHandling.averageEstimatedCompressionRatio) + val confUncompressed = confWith(maxBytesPerPartition) + + val union = new UnionRDD(sc, bigFiles.map { file => + val conf = if (sizeBasedFileHandling.isCompressed(file)) + confCompressed + else + confUncompressed + read(file, conf) + }) + + if (union.partitions.size < minPartitions) + union.coalesce(minPartitions) + else + union + } + + def parallelTextFiles(paths: List[String], + maxBytesPerPartition: Long, + minPartitions: Int, + listOnWorkers: Boolean, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + + val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) + val (bigFiles, smallFiles) = foundFiles.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) + + sc.union( + readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), + readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + } + + private def createPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long, sizeBasedFileHandling: SizeBasedFileHandling): Partitioner = { implicit val ordering: Ordering[HadoopFilePartition] = Ordering.by(p => -p.size) // Small partitions come first (highest priority) val pq: mutable.PriorityQueue[HadoopFilePartition] = mutable.PriorityQueue.empty @@ -293,11 +353,13 @@ object SparkContextUtils { val partitions = files.foldLeft(pq) { case (acc, file) => - acc.headOption.filter(bucket => bucket.size + file.size < maxBytesPerPartition) match { + val fileSize = sizeBasedFileHandling.estimatedSize(file) + + acc.headOption.filter(bucket => bucket.size + fileSize < 
maxBytesPerPartition) match { case Some(found) => - val updated = found.copy(size = found.size + file.size, paths = file.path +: found.paths) + val updated = found.copy(size = found.size + fileSize, paths = file.path +: found.paths) acc.tail += updated - case None => acc += HadoopFilePartition(file.size, Seq(file.path)) + case None => acc += HadoopFilePartition(fileSize, Seq(file.path)) } }.filter(_.paths.nonEmpty).toList // Remove empty partitions @@ -305,10 +367,7 @@ object SparkContextUtils { case (bucket, index) => bucket.paths.map(path => path -> index) }.toMap - new Partitioner { - override def numPartitions: Int = partitions.size - override def getPartition(key: Any): Int = indexedPartitions(key) - } + IndexedPartitioner(partitions.size, indexedPartitions) } diff --git a/tools/cluster.py b/tools/cluster.py index 0af46ebe..7daf9617 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,6 +49,9 @@ default_master_ami = None default_env = 'dev' default_spark_version = '1.5.1' +custom_builds = { + '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' +} default_spark_repo = 'https://github.com/chaordic/spark' default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' @@ -260,6 +263,8 @@ def launch(cluster_name, slaves, ami_params = ['--ami', ami] if ami else [] master_ami_params = ['--master-ami', master_ami] if master_ami else [] + spark_version = custom_builds.get(spark_version, spark_version) + for i in range(retries_on_same_cluster): log.info('Running script, try %d of %d', i + 1, retries_on_same_cluster) try: From dc12d2a696e5cf847360097f86b33588b8b4cf84 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 10 Nov 2015 17:06:14 -0200 Subject: [PATCH 055/268] Use SplittableGzipCodec only for big files --- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 2 -- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git 
a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index ec5d9039..aa4dcc76 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -67,8 +67,6 @@ object CoreJobRunner { val sparkConf = new SparkConf() sparkConf.set("spark.executor.memory", config.executorMemory) - sparkConf.set("spark.hadoop.io.compression.codecs", - "org.apache.hadoop.io.compress.DefaultCodec,nl.basjes.hadoop.io.compress.SplittableGzipCodec,org.apache.hadoop.io.compress.BZip2Codec") sparkConf.setMaster(config.master) sparkConf.setAppName(appName) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 78b6ec9b..d18d5f76 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -307,7 +307,9 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq("mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) + def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq( + "io.compression.codecs" -> "org.apache.hadoop.io.compress.DefaultCodec,nl.basjes.hadoop.io.compress.SplittableGzipCodec,org.apache.hadoop.io.compress.BZip2Codec", + "mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } def read(file: HadoopFile, conf: Configuration) = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](conf = conf, fClass = classOf[TextInputFormat], @@ -317,10 +319,12 @@ object SparkContextUtils { val confUncompressed = confWith(maxBytesPerPartition) val union = new UnionRDD(sc, bigFiles.map { file => + val conf 
= if (sizeBasedFileHandling.isCompressed(file)) confCompressed else confUncompressed + read(file, conf) }) From b52eceea89e95ba3416ba1f8c13d77249129e9d6 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Wed, 11 Nov 2015 16:39:09 -0200 Subject: [PATCH 056/268] Dont use build with updated hadoop client --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 7daf9617..2fe6b245 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -50,7 +50,7 @@ default_env = 'dev' default_spark_version = '1.5.1' custom_builds = { - '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' +# '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' } default_spark_repo = 'https://github.com/chaordic/spark' default_remote_user = 'ec2-user' From f1075e8d5acd6bc9b03d8974c32adf3d146e2d9b Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 17 Nov 2015 17:27:00 -0200 Subject: [PATCH 057/268] s3 list --- build.sbt | 2 + .../core/jobs/utils/SparkContextUtils.scala | 49 +++++++--- .../scala/ignition/core/utils/S3Utils.scala | 91 +++++++++++++++++++ 3 files changed, 131 insertions(+), 11 deletions(-) create mode 100644 src/main/scala/ignition/core/utils/S3Utils.scala diff --git a/build.sbt b/build.sbt index 528d30cf..7231704b 100644 --- a/build.sbt +++ b/build.sbt @@ -35,6 +35,8 @@ libraryDependencies += "joda-time" % "joda-time" % "2.7" libraryDependencies += "org.joda" % "joda-convert" % "1.7" +libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.9.6" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index d18d5f76..96f2341d 100644 --- 
a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,16 +1,18 @@ package ignition.core.jobs.utils +import com.amazonaws.services.s3.AmazonS3Client +import com.amazonaws.services.s3.model.{S3ObjectSummary, S3Object} import ignition.core.utils.ByteUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.{Text, LongWritable} import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.hadoop.mapreduce.lib.input.TextInputFormat -import org.apache.spark.broadcast.Broadcast import org.apache.spark.{Partitioner, SparkContext} import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} import org.apache.spark.rdd.{UnionRDD, RDD} import org.joda.time.DateTime import ignition.core.utils.DateUtils._ +import ignition.core.utils.S3Utils._ import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer @@ -26,6 +28,11 @@ object SparkContextUtils { override def getPartition(key: Any): Int = index(key) } + implicit class S3ObjectSummaryExtensions(s3Object: S3ObjectSummary) { + def toHadoopFile: HadoopFile = + HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) + } + case class HadoopFile(path: String, isDir: Boolean, size: Long) private case class HadoopFilePartition(size: Long, paths: Seq[String]) @@ -86,7 +93,7 @@ object SparkContextUtils { if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) + parallelListEndReadTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) } private def filterPaths(paths: Seq[String], @@ -257,14 +264,14 @@ object SparkContextUtils { case class SizeBasedFileHandling(averageEstimatedCompressionRatio: Int = 8, compressedExtensions: Set[String] = Set(".gz")) { - + def isBig(f: HadoopFile, 
uncompressedBigSize: Long): Boolean = estimatedSize(f) >= uncompressedBigSize - + def estimatedSize(f: HadoopFile) = if (isCompressed(f)) f.size * averageEstimatedCompressionRatio else f.size - + def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) } @@ -334,15 +341,21 @@ object SparkContextUtils { union } - def parallelTextFiles(paths: List[String], - maxBytesPerPartition: Long, - minPartitions: Int, - listOnWorkers: Boolean, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + def parallelListEndReadTextFiles(paths: List[String], + maxBytesPerPartition: Long, + minPartitions: Int, + listOnWorkers: Boolean, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) - val (bigFiles, smallFiles) = foundFiles.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) + parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) + } + def parallelReadTextFiles(files: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + val (bigFiles, smallFiles) = files.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) sc.union( readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) @@ -466,5 +479,19 @@ object SparkContextUtils { innerListFiles(directories) } + def s3FilterAndGetParallelTextFiles(bucket: String, + prefix: String, + startDate: Option[DateTime] = None, + endDate: Option[DateTime] = None, + endsWith: Option[String] = None, + predicate: S3ObjectSummary => Boolean = _ => true, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, + 
sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) + (implicit s3Client: AmazonS3Client, dateExtractor: PathDateExtractor): RDD[String] = { + val foundFiles = s3ListAndFilterFiles(bucket, prefix, startDate, endDate, predicate = predicate)(s3Client, dateExtractor).map(_.toHadoopFile) + parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) + } + } } diff --git a/src/main/scala/ignition/core/utils/S3Utils.scala b/src/main/scala/ignition/core/utils/S3Utils.scala new file mode 100644 index 00000000..28866c4c --- /dev/null +++ b/src/main/scala/ignition/core/utils/S3Utils.scala @@ -0,0 +1,91 @@ +package ignition.core.utils + +import com.amazonaws.auth.EnvironmentVariableCredentialsProvider +import com.amazonaws.services.s3.AmazonS3Client +import com.amazonaws.services.s3.model.{S3ObjectSummary, ObjectListing} +import ignition.core.jobs.utils.PathDateExtractor +import ignition.core.utils.DateUtils._ +import org.joda.time.DateTime + +import scala.collection.JavaConversions._ +import scala.collection.mutable +import scala.util.Try + +object S3Utils { + + implicit lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) + + def s3List(bucket: String, prefix: String, predicate: S3ObjectSummary => Boolean = _ => true) + (implicit s3: AmazonS3Client): List[S3ObjectSummary] = { + def inner(acc: mutable.ArrayBuffer[S3ObjectSummary], listing: ObjectListing): List[S3ObjectSummary] = { + acc ++= listing.getObjectSummaries.toList.filter(predicate) + if (listing.isTruncated) + inner(acc, s3.listNextBatchOfObjects(listing)) + else + acc.toList + } + + inner(new mutable.ArrayBuffer[S3ObjectSummary], s3.listObjects(bucket, prefix)) + } + + def s3ListAndFilterFiles(bucket: String, + prefix: String, + start: Option[DateTime] = None, + end: Option[DateTime] = None, + endsWith: Option[String] = None, + exclusionPattern: Option[String] = Option("_$folder$"), + predicate: 
S3ObjectSummary => Boolean = _ => true) + (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): List[S3ObjectSummary] = { + + def excludePatternValidation(s3Object: S3ObjectSummary, exclusionPatternOption: Option[String]): Option[S3ObjectSummary] = + exclusionPatternOption match { + case Some(pattern) if s3Object.getKey.contains(pattern) => None + case Some(_) | None => Option(s3Object) + } + + def endsWithValidation(s3Object: S3ObjectSummary, endsWithOption: Option[String]): Option[S3ObjectSummary] = + endsWithOption match { + case Some(pattern) if s3Object.getKey.endsWith(pattern) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def extractDateFromKey(s3Object: S3ObjectSummary): Option[DateTime] = + Try(pathDateExtractor.extractFromPath(s"s3://$bucket/${s3Object.getKey}")).toOption + + def startValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, startOption: Option[DateTime]): Option[S3ObjectSummary] = + startOption match { + case Some(startDate) if startDate.isEqualOrBefore(extractedDate) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def endValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, endOption: Option[DateTime]): Option[S3ObjectSummary] = + endOption match { + case Some(endDate) if endDate.isEqualOrAfter(extractedDate) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def applyPredicate(s3Object: S3ObjectSummary): Option[S3ObjectSummary] = + if (predicate(s3Object)) + Option(s3Object) + else + None + + val allValidations: S3ObjectSummary => Boolean = s3Object => { + val validatedS3Object = for { + withValidPattern <- excludePatternValidation(s3Object, exclusionPattern) + withValidEndsWith <- endsWithValidation(withValidPattern, endsWith) + extractedDate <- extractDateFromKey(withValidEndsWith) + withValidStart <- startValidation(withValidEndsWith, extractedDate, start) + withValidEnd <- endValidation(withValidStart, 
extractedDate, end) + valid <- applyPredicate(withValidEnd) + } yield valid + validatedS3Object.isDefined + } + + s3List(bucket, prefix, allValidations)(s3) + } + +} From 909136626ecf4daf300a66893087d8c06609c7e1 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 20 Nov 2015 18:26:42 -0200 Subject: [PATCH 058/268] Split compressed big files --- .../core/jobs/utils/SparkContextUtils.scala | 92 ++++++++++++++----- .../core/utils/AutoCloseableIterator.scala | 67 ++++++++++++++ 2 files changed, 138 insertions(+), 21 deletions(-) create mode 100644 src/main/scala/ignition/core/utils/AutoCloseableIterator.scala diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index d18d5f76..dec5ca13 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,20 +1,21 @@ package ignition.core.jobs.utils -import ignition.core.utils.ByteUtils +import java.io.InputStream + +import ignition.core.utils.DateUtils._ +import ignition.core.utils.{AutoCloseableIterator, ByteUtils} import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.io.{Text, LongWritable} +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapreduce.lib.input.TextInputFormat -import org.apache.spark.broadcast.Broadcast +import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.{Partitioner, SparkContext} -import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} -import org.apache.spark.rdd.{UnionRDD, RDD} import org.joda.time.DateTime -import ignition.core.utils.DateUtils._ import scala.collection.JavaConversions._ -import scala.collection.mutable.ArrayBuffer import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import scala.io.{Codec, Source} 
import scala.reflect.ClassTag import scala.util.Try @@ -22,6 +23,17 @@ import scala.util.control.NonFatal object SparkContextUtils { + def close(inputStream: InputStream, path: String): Unit = { + try { + inputStream.close() + } catch { + case NonFatal(ex) => + println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") + } + } + + case class BigFileSlice(index: Int) + case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { override def getPartition(key: Any): Int = index(key) } @@ -273,7 +285,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -292,13 +304,54 @@ object SparkContextUtils { println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") } finally { - try { - inputStream.close() - } catch { - case NonFatal(ex) => - println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") - } + close(inputStream, path) + } + } + } + } + + def readCompressedBigFile(file: HadoopFile, maxBytesPerPartition: Long, minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling, sampleCount: Int = 100): RDD[String] = { + val estimatedSize = 
sizeBasedFileHandling.estimatedSize(file) + val totalSlices = (estimatedSize / maxBytesPerPartition + 1).toInt + val slices = (0 until totalSlices).map(BigFileSlice.apply) + + val partitioner = { + val indexedPartitions: Map[Any, Int] = slices.map(s => s -> s.index).toMap + IndexedPartitioner(totalSlices, indexedPartitions) + } + val hadoopConf = _hadoopConf + + val partitionedSlices = sc.parallelize(slices.map(s => s -> ()), 2).partitionBy(partitioner) + + partitionedSlices.mapPartitions { slices => + val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } + val codecFactory = new CompressionCodecFactory(conf) + val hadoopPath = new Path(file.path) + val fileSystem = hadoopPath.getFileSystem(conf) + slices.flatMap { case (slice, _) => + val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { + case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) + case None => fileSystem.open(hadoopPath) } + val lines = Source.fromInputStream(inputStream)(Codec.UTF8).getLines() + + val lineSample = lines.take(sampleCount).toList + val linesPerSlice = { + val sampleSize = lineSample.map(_.size).sum + val estimatedAverageLineSize = Math.round(sampleSize / sampleCount.toFloat) + val estimatedTotalLines = Math.round(estimatedSize / estimatedAverageLineSize.toFloat) + estimatedTotalLines / totalSlices + 1 + } + + val linesAfterSeek = (lineSample.toIterator ++ lines).drop(linesPerSlice * slice.index) + + val finalLines = if (slice.index + 1 == totalSlices) // last slice, read until the end + linesAfterSeek + else + linesAfterSeek.take(linesPerSlice) + + AutoCloseableIterator.wrap(finalLines, () => close(inputStream, s"${file.path}, slice $slice")) } } } @@ -315,17 +368,14 @@ object SparkContextUtils { def read(file: HadoopFile, conf: Configuration) = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](conf = conf, fClass = classOf[TextInputFormat], kClass = classOf[LongWritable], vClass = 
classOf[Text], path = file.path).map(pair => pair._2.toString) - val confCompressed = confWith(maxBytesPerPartition / sizeBasedFileHandling.averageEstimatedCompressionRatio) val confUncompressed = confWith(maxBytesPerPartition) val union = new UnionRDD(sc, bigFiles.map { file => - val conf = if (sizeBasedFileHandling.isCompressed(file)) - confCompressed + if (sizeBasedFileHandling.isCompressed(file)) + readCompressedBigFile(file, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) else - confUncompressed - - read(file, conf) + read(file, confUncompressed) }) if (union.partitions.size < minPartitions) @@ -348,7 +398,7 @@ object SparkContextUtils { readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) } - private def createPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long, sizeBasedFileHandling: SizeBasedFileHandling): Partitioner = { + private def createSmallFilesPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long, sizeBasedFileHandling: SizeBasedFileHandling): Partitioner = { implicit val ordering: Ordering[HadoopFilePartition] = Ordering.by(p => -p.size) // Small partitions come first (highest priority) val pq: mutable.PriorityQueue[HadoopFilePartition] = mutable.PriorityQueue.empty diff --git a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala new file mode 100644 index 00000000..b3f054ba --- /dev/null +++ b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala @@ -0,0 +1,67 @@ +package ignition.core.utils + +import scala.util.Try +import scala.util.control.NonFatal + +object AutoCloseableIterator { + case object empty extends AutoCloseableIterator[Nothing] { + override def naiveHasNext() = false + override def naiveNext() = throw new Exception("Empty AutoCloseableIterator") + override def naiveClose() = {} + } + + def wrap[T](iterator: Iterator[T], close: () => Unit = () 
=> ()): AutoCloseableIterator[T] = new AutoCloseableIterator[T] { + override def naiveClose(): Unit = close() + override def naiveHasNext(): Boolean = iterator.hasNext + override def naiveNext(): T = iterator.next() + } +} + +trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { + // Naive functions should be implemented by the user as in a standard Iterator/AutoCloseable + def naiveHasNext(): Boolean + def naiveNext(): T + def naiveClose(): Unit + + var closed = false + + // hasNext closes the iterator and handles the case where it is already closed + override def hasNext(): Boolean = if (closed) + false + else { + val naiveResult = try { + naiveHasNext + } catch { + case NonFatal(e) => + Try { close } + throw e + } + if (naiveResult) + true + else { + close // auto close when exhausted + false + } + } + + // next closes the iterator and handles the case where it is already closed + override def next(): T = if (closed) + throw new RuntimeException("Trying to get next element on a closed iterator") + else if (hasNext()) + try { + naiveNext + } catch { + case NonFatal(e) => + Try { close } + throw e + } + else + throw new RuntimeException("Trying to get next element on an exhausted iterator") + + override def close() = if (!closed) { + closed = true + naiveClose + } + + override def finalize() = Try { close } +} From 368a9986fa019f15d7f303b403cf96821899609f Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 20 Nov 2015 18:28:33 -0200 Subject: [PATCH 059/268] Removed unused dependency --- build.sbt | 2 -- src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala | 1 - 2 files changed, 3 deletions(-) diff --git a/build.sbt b/build.sbt index 528d30cf..acdef9cb 100644 --- a/build.sbt +++ b/build.sbt @@ -19,8 +19,6 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "nl.basjes.hadoop" % 
"splittablegzip" % "1.2" - libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.2.4" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index dec5ca13..06ea71ee 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -361,7 +361,6 @@ object SparkContextUtils { minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq( - "io.compression.codecs" -> "org.apache.hadoop.io.compress.DefaultCodec,nl.basjes.hadoop.io.compress.SplittableGzipCodec,org.apache.hadoop.io.compress.BZip2Codec", "mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } From 016de5b8e9db0e8aa51bf6c00f1de880938de1e3 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 23 Nov 2015 10:48:56 -0200 Subject: [PATCH 060/268] pr review --- .../core/jobs/utils/SparkContextUtils.scala | 110 +++++++++++++++--- .../scala/ignition/core/utils/S3Utils.scala | 91 --------------- 2 files changed, 93 insertions(+), 108 deletions(-) delete mode 100644 src/main/scala/ignition/core/utils/S3Utils.scala diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 96f2341d..de5cdfca 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,22 +1,22 @@ package ignition.core.jobs.utils +import com.amazonaws.auth.EnvironmentVariableCredentialsProvider import com.amazonaws.services.s3.AmazonS3Client -import com.amazonaws.services.s3.model.{S3ObjectSummary, S3Object} +import 
com.amazonaws.services.s3.model.{ObjectListing, S3ObjectSummary} import ignition.core.utils.ByteUtils +import ignition.core.utils.DateUtils._ import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.io.{Text, LongWritable} +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapreduce.lib.input.TextInputFormat +import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.{Partitioner, SparkContext} -import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} -import org.apache.spark.rdd.{UnionRDD, RDD} import org.joda.time.DateTime -import ignition.core.utils.DateUtils._ -import ignition.core.utils.S3Utils._ import scala.collection.JavaConversions._ -import scala.collection.mutable.ArrayBuffer import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import scala.io.{Codec, Source} import scala.reflect.ClassTag import scala.util.Try @@ -24,15 +24,12 @@ import scala.util.control.NonFatal object SparkContextUtils { + implicit lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) + case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { override def getPartition(key: Any): Int = index(key) } - implicit class S3ObjectSummaryExtensions(s3Object: S3ObjectSummary) { - def toHadoopFile: HadoopFile = - HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) - } - case class HadoopFile(path: String, isDir: Boolean, size: Long) private case class HadoopFilePartition(size: Long, paths: Seq[String]) @@ -93,7 +90,7 @@ object SparkContextUtils { if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelListEndReadTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) + 
parallelListAndReadTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) } private def filterPaths(paths: Seq[String], @@ -341,7 +338,7 @@ object SparkContextUtils { union } - def parallelListEndReadTextFiles(paths: List[String], + def parallelListAndReadTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int, listOnWorkers: Boolean, @@ -479,17 +476,96 @@ object SparkContextUtils { innerListFiles(directories) } + private def s3List(bucket: String, prefix: String, predicate: S3ObjectSummary => Boolean = _ => true) + (implicit s3: AmazonS3Client): List[S3ObjectSummary] = { + def inner(acc: mutable.ArrayBuffer[S3ObjectSummary], listing: ObjectListing): List[S3ObjectSummary] = { + acc ++= listing.getObjectSummaries.toList.filter(predicate) + if (listing.isTruncated) + inner(acc, s3.listNextBatchOfObjects(listing)) + else + acc.toList + } + + inner(new mutable.ArrayBuffer[S3ObjectSummary], s3.listObjects(bucket, prefix)) + } + + def s3ListAndFilterFiles(bucket: String, + prefix: String, + start: Option[DateTime] = None, + end: Option[DateTime] = None, + endsWith: Option[String] = None, + exclusionPattern: Option[String] = Option("_$folder$"), + predicate: HadoopFile => Boolean = _ => true) + (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): List[HadoopFile] = { + + def excludePatternValidation(s3Object: S3ObjectSummary, exclusionPatternOption: Option[String]): Option[S3ObjectSummary] = + exclusionPatternOption match { + case Some(pattern) if s3Object.getKey.contains(pattern) => None + case Some(_) | None => Option(s3Object) + } + + def endsWithValidation(s3Object: S3ObjectSummary, endsWithOption: Option[String]): Option[S3ObjectSummary] = + endsWithOption match { + case Some(pattern) if s3Object.getKey.endsWith(pattern) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def extractDateFromKey(s3Object: S3ObjectSummary): Option[DateTime] = + 
Try(pathDateExtractor.extractFromPath(s"s3n://$bucket/${s3Object.getKey}")).toOption + + def startValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, startOption: Option[DateTime]): Option[S3ObjectSummary] = + startOption match { + case Some(startDate) if startDate.isEqualOrBefore(extractedDate) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def endValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, endOption: Option[DateTime]): Option[S3ObjectSummary] = + endOption match { + case Some(endDate) if endDate.isEqualOrAfter(extractedDate) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def applyPredicate(file: HadoopFile): Option[HadoopFile] = + if (predicate(file)) + Option(file) + else + None + + def toHadoopFile(s3Object: S3ObjectSummary): HadoopFile = + HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) + + val allValidations: S3ObjectSummary => Boolean = s3Object => { + val validatedFile = for { + withValidPattern <- excludePatternValidation(s3Object, exclusionPattern) + withValidEndsWith <- endsWithValidation(withValidPattern, endsWith) + extractedDate <- extractDateFromKey(withValidEndsWith) + withValidStart <- startValidation(withValidEndsWith, extractedDate, start) + withValidEnd <- endValidation(withValidStart, extractedDate, end) + hadoopFile = toHadoopFile(withValidEnd) + valid <- applyPredicate(hadoopFile) + } yield valid + validatedFile.isDefined + } + + s3List(bucket, prefix, allValidations)(s3).map(toHadoopFile) + } + + def s3FilterAndGetParallelTextFiles(bucket: String, prefix: String, startDate: Option[DateTime] = None, endDate: Option[DateTime] = None, endsWith: Option[String] = None, - predicate: S3ObjectSummary => Boolean = _ => true, + predicate: HadoopFile => Boolean = _ => true, maxBytesPerPartition: Long = 256 * 1000 * 1000, minPartitions: Int = 100, sizeBasedFileHandling: SizeBasedFileHandling = 
SizeBasedFileHandling()) - (implicit s3Client: AmazonS3Client, dateExtractor: PathDateExtractor): RDD[String] = { - val foundFiles = s3ListAndFilterFiles(bucket, prefix, startDate, endDate, predicate = predicate)(s3Client, dateExtractor).map(_.toHadoopFile) + (implicit s3Client: AmazonS3Client = amazonS3ClientFromEnvironmentVariables, + dateExtractor: PathDateExtractor): RDD[String] = { + val foundFiles = s3ListAndFilterFiles(bucket, prefix, startDate, endDate, predicate = predicate)(s3Client, dateExtractor) parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) } diff --git a/src/main/scala/ignition/core/utils/S3Utils.scala b/src/main/scala/ignition/core/utils/S3Utils.scala deleted file mode 100644 index 28866c4c..00000000 --- a/src/main/scala/ignition/core/utils/S3Utils.scala +++ /dev/null @@ -1,91 +0,0 @@ -package ignition.core.utils - -import com.amazonaws.auth.EnvironmentVariableCredentialsProvider -import com.amazonaws.services.s3.AmazonS3Client -import com.amazonaws.services.s3.model.{S3ObjectSummary, ObjectListing} -import ignition.core.jobs.utils.PathDateExtractor -import ignition.core.utils.DateUtils._ -import org.joda.time.DateTime - -import scala.collection.JavaConversions._ -import scala.collection.mutable -import scala.util.Try - -object S3Utils { - - implicit lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) - - def s3List(bucket: String, prefix: String, predicate: S3ObjectSummary => Boolean = _ => true) - (implicit s3: AmazonS3Client): List[S3ObjectSummary] = { - def inner(acc: mutable.ArrayBuffer[S3ObjectSummary], listing: ObjectListing): List[S3ObjectSummary] = { - acc ++= listing.getObjectSummaries.toList.filter(predicate) - if (listing.isTruncated) - inner(acc, s3.listNextBatchOfObjects(listing)) - else - acc.toList - } - - inner(new mutable.ArrayBuffer[S3ObjectSummary], s3.listObjects(bucket, prefix)) - } - - def s3ListAndFilterFiles(bucket: 
String, - prefix: String, - start: Option[DateTime] = None, - end: Option[DateTime] = None, - endsWith: Option[String] = None, - exclusionPattern: Option[String] = Option("_$folder$"), - predicate: S3ObjectSummary => Boolean = _ => true) - (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): List[S3ObjectSummary] = { - - def excludePatternValidation(s3Object: S3ObjectSummary, exclusionPatternOption: Option[String]): Option[S3ObjectSummary] = - exclusionPatternOption match { - case Some(pattern) if s3Object.getKey.contains(pattern) => None - case Some(_) | None => Option(s3Object) - } - - def endsWithValidation(s3Object: S3ObjectSummary, endsWithOption: Option[String]): Option[S3ObjectSummary] = - endsWithOption match { - case Some(pattern) if s3Object.getKey.endsWith(pattern) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) - } - - def extractDateFromKey(s3Object: S3ObjectSummary): Option[DateTime] = - Try(pathDateExtractor.extractFromPath(s"s3://$bucket/${s3Object.getKey}")).toOption - - def startValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, startOption: Option[DateTime]): Option[S3ObjectSummary] = - startOption match { - case Some(startDate) if startDate.isEqualOrBefore(extractedDate) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) - } - - def endValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, endOption: Option[DateTime]): Option[S3ObjectSummary] = - endOption match { - case Some(endDate) if endDate.isEqualOrAfter(extractedDate) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) - } - - def applyPredicate(s3Object: S3ObjectSummary): Option[S3ObjectSummary] = - if (predicate(s3Object)) - Option(s3Object) - else - None - - val allValidations: S3ObjectSummary => Boolean = s3Object => { - val validatedS3Object = for { - withValidPattern <- excludePatternValidation(s3Object, exclusionPattern) - withValidEndsWith <- 
endsWithValidation(withValidPattern, endsWith) - extractedDate <- extractDateFromKey(withValidEndsWith) - withValidStart <- startValidation(withValidEndsWith, extractedDate, start) - withValidEnd <- endValidation(withValidStart, extractedDate, end) - valid <- applyPredicate(withValidEnd) - } yield valid - validatedS3Object.isDefined - } - - s3List(bucket, prefix, allValidations)(s3) - } - -} From 7c23316dd058acce0a607b6e16ba0ff35460a28a Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 25 Nov 2015 14:10:27 -0200 Subject: [PATCH 061/268] fix lambda ref to close resources --- .../scala/ignition/core/utils/AutoCloseableIterator.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala index b3f054ba..bc294f6f 100644 --- a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala +++ b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala @@ -10,8 +10,8 @@ object AutoCloseableIterator { override def naiveClose() = {} } - def wrap[T](iterator: Iterator[T], close: () => Unit = () => ()): AutoCloseableIterator[T] = new AutoCloseableIterator[T] { - override def naiveClose(): Unit = close() + def wrap[T](iterator: Iterator[T], doClose: () => Unit = () => ()): AutoCloseableIterator[T] = new AutoCloseableIterator[T] { + override def naiveClose(): Unit = doClose() override def naiveHasNext(): Boolean = iterator.hasNext override def naiveNext(): T = iterator.next() } From 358459f9ea32143d2c63ef460986cef0d75345d7 Mon Sep 17 00:00:00 2001 From: Leandro Date: Fri, 4 Dec 2015 19:43:09 -0200 Subject: [PATCH 062/268] Small Xlint fixes --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- .../scala/ignition/core/utils/AutoCloseableIterator.scala | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala 
b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 97aed619..de3bf3ae 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -289,7 +289,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> Unit ), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -326,7 +326,7 @@ object SparkContextUtils { } val hadoopConf = _hadoopConf - val partitionedSlices = sc.parallelize(slices.map(s => s -> ()), 2).partitionBy(partitioner) + val partitionedSlices = sc.parallelize(slices.map(s => s -> Unit ), 2).partitionBy(partitioner) partitionedSlices.mapPartitions { slices => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } diff --git a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala index bc294f6f..4e3db808 100644 --- a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala +++ b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala @@ -26,7 +26,7 @@ trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { var closed = false // hasNext closes the iterator and handles the case where it is already closed - override def hasNext(): Boolean = if (closed) + override def hasNext: Boolean = if 
(closed) false else { val naiveResult = try { @@ -47,7 +47,7 @@ trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { // next closes the iterator and handles the case where it is already closed override def next(): T = if (closed) throw new RuntimeException("Trying to get next element on a closed iterator") - else if (hasNext()) + else if (hasNext) try { naiveNext } catch { From 5f54641cb7d6448148a0570599504125fb976eaa Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Dec 2015 15:40:17 -0200 Subject: [PATCH 063/268] Make it partially compatible with scala 2.11 and Xlint free and minor cleanups --- build.sbt | 8 ++++---- .../ignition/core/jobs/utils/SparkContextUtils.scala | 12 +++++++----- .../ignition/core/utils/AutoCloseableIterator.scala | 4 ++-- src/main/scala/ignition/core/utils/BetterTrace.scala | 3 ++- src/main/scala/ignition/core/utils/FutureUtils.scala | 2 +- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/build.sbt b/build.sbt index ec4d70bf..d0e2b029 100644 --- a/build.sbt +++ b/build.sbt @@ -4,7 +4,7 @@ version := "1.0" scalaVersion := "2.10.4" -scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings") +scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code") ideaExcludeFolders += ".idea" @@ -19,9 +19,7 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.2.4" - -libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" +libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.6" @@ -35,6 +33,8 @@ libraryDependencies += "org.joda" % "joda-convert" % "1.7" libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.9.6" +libraryDependencies += "commons-lang" % 
"commons-lang" % "2.6" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 97aed619..08f4a39d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -23,6 +23,8 @@ import scala.io.{Codec, Source} import scala.reflect.ClassTag import scala.util.Try import scala.util.control.NonFatal +import ignition.core.utils.ExceptionUtils._ + object SparkContextUtils { @@ -31,7 +33,7 @@ object SparkContextUtils { inputStream.close() } catch { case NonFatal(ex) => - println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") + println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") } } @@ -289,7 +291,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> Unit), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -305,8 +307,8 @@ object SparkContextUtils { Source.fromInputStream(inputStream)(Codec.UTF8).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } catch { case NonFatal(ex) => - println(s"Failed to read resource from '$path': 
${ex.getMessage} -- ${ex.getStackTraceString}") - throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") + println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") + throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") } finally { close(inputStream, path) } @@ -326,7 +328,7 @@ object SparkContextUtils { } val hadoopConf = _hadoopConf - val partitionedSlices = sc.parallelize(slices.map(s => s -> ()), 2).partitionBy(partitioner) + val partitionedSlices = sc.parallelize(slices.map(s => s -> Unit), 2).partitionBy(partitioner) partitionedSlices.mapPartitions { slices => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } diff --git a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala index bc294f6f..4e3db808 100644 --- a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala +++ b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala @@ -26,7 +26,7 @@ trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { var closed = false // hasNext closes the iterator and handles the case where it is already closed - override def hasNext(): Boolean = if (closed) + override def hasNext: Boolean = if (closed) false else { val naiveResult = try { @@ -47,7 +47,7 @@ trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { // next closes the iterator and handles the case where it is already closed override def next(): T = if (closed) throw new RuntimeException("Trying to get next element on a closed iterator") - else if (hasNext()) + else if (hasNext) try { naiveNext } catch { diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala index 158e261e..32d5ea5f 100644 --- 
a/src/main/scala/ignition/core/utils/BetterTrace.scala +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -1,5 +1,6 @@ package ignition.core.utils +import ignition.core.utils.ExceptionUtils._ // Used mainly to augment scalacheck traces in scalatest trait BetterTrace { def fail(message: String): Nothing @@ -7,7 +8,7 @@ trait BetterTrace { try { block } catch { - case t: Throwable => fail(s"${t.getMessage}: ${t.getStackTraceString}") + case t: Throwable => fail(s"${t.getMessage}: ${t.getFullStacktraceString}") } } diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 95b44c2f..4523a94f 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -6,7 +6,7 @@ import scala.util.{Failure, Success, Try} object FutureUtils { - def blockingFuture[T](body: =>T)(implicit ec: ExecutionContext): Future[T] = future { blocking { body } } + def blockingFuture[T](body: =>T)(implicit ec: ExecutionContext): Future[T] = Future { blocking { body } } implicit class FutureImprovements[V](future: Future[V]) { def toOptionOnFailure(errorHandler: (Throwable) => Option[V])(implicit ec: ExecutionContext): Future[Option[V]] = { From 0ec37240db44a57e0a2117f53bf8d577b4a71037 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Dec 2015 15:41:30 -0200 Subject: [PATCH 064/268] Make it partially compatible with scala 2.11 and Xlint free and minor cleanups --- src/main/scala/ignition/core/utils/ExceptionUtils.scala | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 src/main/scala/ignition/core/utils/ExceptionUtils.scala diff --git a/src/main/scala/ignition/core/utils/ExceptionUtils.scala b/src/main/scala/ignition/core/utils/ExceptionUtils.scala new file mode 100644 index 00000000..e2626764 --- /dev/null +++ b/src/main/scala/ignition/core/utils/ExceptionUtils.scala @@ -0,0 +1,9 @@ +package ignition.core.utils + +object 
ExceptionUtils { + + implicit class ExceptionImprovements(e: Throwable) { + def getFullStacktraceString(): String = org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(e) + } + +} From b66d05dfa1cd328442c52c80d07b19f7643b67a5 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Dec 2015 15:48:44 -0200 Subject: [PATCH 065/268] Renaming --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 6 +++--- src/main/scala/ignition/core/utils/BetterTrace.scala | 2 +- src/main/scala/ignition/core/utils/ExceptionUtils.scala | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 08f4a39d..6765009d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -33,7 +33,7 @@ object SparkContextUtils { inputStream.close() } catch { case NonFatal(ex) => - println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") + println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getFullStackTraceString}") } } @@ -307,8 +307,8 @@ object SparkContextUtils { Source.fromInputStream(inputStream)(Codec.UTF8).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } catch { case NonFatal(ex) => - println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") - throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") + println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStackTraceString}") + throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStackTraceString}") } finally { close(inputStream, path) } diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala index 
32d5ea5f..387f49f7 100644 --- a/src/main/scala/ignition/core/utils/BetterTrace.scala +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -8,7 +8,7 @@ trait BetterTrace { try { block } catch { - case t: Throwable => fail(s"${t.getMessage}: ${t.getFullStacktraceString}") + case t: Throwable => fail(s"${t.getMessage}: ${t.getFullStackTraceString}") } } diff --git a/src/main/scala/ignition/core/utils/ExceptionUtils.scala b/src/main/scala/ignition/core/utils/ExceptionUtils.scala index e2626764..1ae33568 100644 --- a/src/main/scala/ignition/core/utils/ExceptionUtils.scala +++ b/src/main/scala/ignition/core/utils/ExceptionUtils.scala @@ -3,7 +3,7 @@ package ignition.core.utils object ExceptionUtils { implicit class ExceptionImprovements(e: Throwable) { - def getFullStacktraceString(): String = org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(e) + def getFullStackTraceString(): String = org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(e) } } From 2f6741dfd3d5ff30738cda6c2cc3279cda06fe0e Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Dec 2015 16:11:10 -0200 Subject: [PATCH 066/268] Use null instead of Unit because Unit isnt serialiable --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 6765009d..648da060 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -291,7 +291,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> Unit), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, 
sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> null), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -328,7 +328,7 @@ object SparkContextUtils { } val hadoopConf = _hadoopConf - val partitionedSlices = sc.parallelize(slices.map(s => s -> Unit), 2).partitionBy(partitioner) + val partitionedSlices = sc.parallelize(slices.map(s => s -> null), 2).partitionBy(partitioner) partitionedSlices.mapPartitions { slices => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } From 84e98f490d409f8b9de741eada91d932979b1eff Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 27 Nov 2015 10:54:12 -0200 Subject: [PATCH 067/268] new filter and get text files --- .../core/jobs/utils/SparkContextUtils.scala | 448 ++++++++++-------- 1 file changed, 250 insertions(+), 198 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 648da060..1afbd74f 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -4,9 +4,9 @@ import java.io.InputStream import com.amazonaws.auth.EnvironmentVariableCredentialsProvider import com.amazonaws.services.s3.AmazonS3Client -import com.amazonaws.services.s3.model.{ObjectListing, S3ObjectSummary} -import ignition.core.utils.{AutoCloseableIterator, ByteUtils} +import com.amazonaws.services.s3.model.{ListObjectsRequest, ObjectListing, S3ObjectSummary} import ignition.core.utils.DateUtils._ +import ignition.core.utils.{AutoCloseableIterator, ByteUtils, HadoopUtils} import 
org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.io.compress.CompressionCodecFactory @@ -21,14 +21,24 @@ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.io.{Codec, Source} import scala.reflect.ClassTag -import scala.util.Try +import scala.util.{Failure, Success, Try} import scala.util.control.NonFatal import ignition.core.utils.ExceptionUtils._ object SparkContextUtils { - def close(inputStream: InputStream, path: String): Unit = { + private case class BigFileSlice(index: Int) + + private case class HadoopFilePartition(size: Long, paths: Seq[String]) + + private case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { + override def getPartition(key: Any): Int = index(key) + } + + private lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) + + private def close(inputStream: InputStream, path: String): Unit = { try { inputStream.close() } catch { @@ -37,17 +47,8 @@ object SparkContextUtils { } } - case class BigFileSlice(index: Int) - implicit lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) - - case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { - override def getPartition(key: Any): Int = index(key) - } - case class HadoopFile(path: String, isDir: Boolean, size: Long) - private case class HadoopFilePartition(size: Long, paths: Seq[String]) - implicit class SparkContextImprovements(sc: SparkContext) { lazy val _hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) @@ -95,18 +96,6 @@ object SparkContextUtils { processPaths((p) => sc.textFile(p), paths, minimumPaths) } - private def processParallelTextFiles(paths: Seq[String], - minimumPaths: Int, - maxBytesPerPartition: Long, - 
minPartitions: Int, - listOnWorkers: Boolean): RDD[String] = { - val splittedPaths = paths.flatMap(ignition.core.utils.HadoopUtils.getPathStrings) - if (splittedPaths.size < minimumPaths) - throw new Exception(s"Not enough paths found for $paths") - - parallelListAndReadTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) - } - private def filterPaths(paths: Seq[String], requireSuccess: Boolean, inclusiveStartDate: Boolean, @@ -141,7 +130,6 @@ object SparkContextUtils { } - def getFilteredPaths(paths: Seq[String], requireSuccess: Boolean, inclusiveStartDate: Boolean, @@ -154,7 +142,6 @@ object SparkContextUtils { filterPaths(paths, requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) } - lazy val hdfsPathPrefix = sc.master.replaceFirst("spark://(.*):7077", "hdfs://$1:9000/") def synchToHdfs(paths: Seq[String], pathsToRdd: (Seq[String], Int) => RDD[String], forceSynch: Boolean): Seq[String] = { @@ -184,16 +171,6 @@ object SparkContextUtils { processTextFiles(paths, minimumPaths) } - def getParallelTextFiles(paths: Seq[String], - maxBytesPerPartition: Long = 256 * 1000 * 1000, - minPartitions: Int = 100, - synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1, listOnWorkers: Boolean = false): RDD[String] = { - if (synchLocally) - processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) - else - processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) - } - @deprecated("It may incur heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") def filterAndGetTextFiles(path: String, requireSuccess: Boolean = false, @@ -212,26 +189,6 @@ object SparkContextUtils { getTextFiles(paths, synchLocally, forceSynch, minimumPaths) } - def filterAndGetParallelTextFiles(path: String, - maxBytesPerPartition: 
Long = 256 * 1000 * 1000, - minPartitions: Int = 100, - requireSuccess: Boolean = false, - inclusiveStartDate: Boolean = true, - startDate: Option[DateTime] = None, - inclusiveEndDate: Boolean = true, - endDate: Option[DateTime] = None, - lastN: Option[Int] = None, - synchLocally: Boolean = false, - forceSynch: Boolean = false, - ignoreMalformedDates: Boolean = false, - minimumPaths: Int = 1, - listOnWorkers: Boolean = false)(implicit dateExtractor: PathDateExtractor): RDD[String] = { - val paths = getFilteredPaths(Seq(path), requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) - if (paths.size < minimumPaths) - throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") - getParallelTextFiles(paths, maxBytesPerPartition, minPartitions, synchLocally, forceSynch, minimumPaths, listOnWorkers) - } - private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { processPaths((p) => sc.sequenceFile(p, classOf[LongWritable], classOf[org.apache.hadoop.io.BytesWritable]) .map({ case (k, v) => Try { ByteUtils.toString(v.getBytes, 0, v.getLength, "UTF-8") } }), paths, minimumPaths) @@ -287,11 +244,11 @@ object SparkContextUtils { } - def readSmallFiles(smallFiles: List[HadoopFile], - maxBytesPerPartition: Long, - minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> null), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + private def readSmallFiles(smallFiles: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file ->()), 
2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -316,8 +273,8 @@ object SparkContextUtils { } } - def readCompressedBigFile(file: HadoopFile, maxBytesPerPartition: Long, minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling, sampleCount: Int = 100): RDD[String] = { + private def readCompressedBigFile(file: HadoopFile, maxBytesPerPartition: Long, minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling, sampleCount: Int = 100): RDD[String] = { val estimatedSize = sizeBasedFileHandling.estimatedSize(file) val totalSlices = (estimatedSize / maxBytesPerPartition + 1).toInt val slices = (0 until totalSlices).map(BigFileSlice.apply) @@ -362,10 +319,10 @@ object SparkContextUtils { } } - def readBigFiles(bigFiles: List[HadoopFile], - maxBytesPerPartition: Long, - minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { + private def readBigFiles(bigFiles: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq( "mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -392,21 +349,27 @@ object SparkContextUtils { def parallelListAndReadTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int, - listOnWorkers: Boolean, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { - - val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) - parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, 
sizeBasedFileHandling) + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) + (implicit dateExtractor: PathDateExtractor): RDD[String] = { + val foundFiles = paths.flatMap(smartList(_)).filter(_.size > 0) + parallelReadTextFiles(foundFiles, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, sizeBasedFileHandling = sizeBasedFileHandling) } def parallelReadTextFiles(files: List[HadoopFile], - maxBytesPerPartition: Long, - minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { - val (bigFiles, smallFiles) = files.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) - sc.union( - readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), - readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling(), + synchLocally: Option[String] = None, + forceSynch: Boolean = false): RDD[String] = { + if (synchLocally.isDefined) + doSync(files, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, synchLocally = synchLocally.get, + sizeBasedFileHandling = sizeBasedFileHandling, forceSynch = forceSynch) + else { + val (bigFiles, smallFiles) = files.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) + sc.union( + readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), + readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + } } private def createSmallFilesPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long, sizeBasedFileHandling: SizeBasedFileHandling): Partitioner = { @@ -435,11 +398,9 @@ object SparkContextUtils { IndexedPartitioner(partitions.size, indexedPartitions) } - - private def executeListOnWorkers(paths: RDD[String]): 
List[HadoopFile] = { - val hadoopConf = _hadoopConf + private def executeDriverList(paths: Seq[String]): List[HadoopFile] = { + val conf = _hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } paths.flatMap { path => - val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val hadoopPath = new Path(path) val fileSystem = hadoopPath.getFileSystem(conf) val tryFind = try { @@ -462,162 +423,253 @@ object SparkContextUtils { val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList } - }.collect().toList + }.toList } - - def parallelListFiles(paths: List[String]): List[HadoopFile] = { - - val directories = paths.map(HadoopFile(_, isDir = true, 0)) - + private def driverListFiles(path: String): List[HadoopFile] = { def innerListFiles(remainingDirectories: List[HadoopFile]): List[HadoopFile] = { if (remainingDirectories.isEmpty) { Nil } else { - val remainingPaths = remainingDirectories.map(_.path) - val pathsRDD = sc.parallelize(remainingPaths, remainingPaths.size / 2) - val (dirs, files) = executeListOnWorkers(pathsRDD).partition(_.isDir) + val (dirs, files) = executeDriverList(remainingDirectories.map(_.path)).partition(_.isDir) files ++ innerListFiles(dirs) } } - innerListFiles(directories) + innerListFiles(List(HadoopFile(path, isDir = true, 0))) } + def s3ListCommonPrefixes(bucket: String, prefix: String, delimiter: String = "/") + (implicit s3: AmazonS3Client): Stream[String] = { + def inner(current: ObjectListing): Stream[String] = + if (current.isTruncated) + current.getCommonPrefixes.toStream ++ inner(s3.listNextBatchOfObjects(current)) + else + current.getCommonPrefixes.toStream - private def executeDriverList(paths: Seq[String]): List[HadoopFile] = { - val conf = _hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc 
} - paths.flatMap { path => - val hadoopPath = new Path(path) - val fileSystem = hadoopPath.getFileSystem(conf) - val tryFind = try { - val status = fileSystem.getFileStatus(hadoopPath) - if (status.isDirectory) { - val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) - Option(sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList) - } else if (status.isFile) { - Option(List(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen))) - } else { - None - } - } catch { - case e: java.io.FileNotFoundException => - None - } - - tryFind.getOrElse { - // Maybe is glob or not found - val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList - } - }.toList + val request = new ListObjectsRequest(bucket, prefix, null, delimiter, 1000) + inner(s3.listObjects(request)) } - def driverListFiles(paths: List[String]): List[HadoopFile] = { - - val directories = paths.map(HadoopFile(_, isDir = true, 0)) + def s3ListObjects(bucket: String, prefix: String) + (implicit s3: AmazonS3Client): Stream[S3ObjectSummary] = { + def inner(current: ObjectListing): Stream[S3ObjectSummary] = + if (current.isTruncated) + current.getObjectSummaries.toStream ++ inner(s3.listNextBatchOfObjects(current)) + else + current.getObjectSummaries.toStream + + inner(s3.listObjects(bucket, prefix)) + } + + def s3NarrowPaths(bucket: String, + prefix: String, + delimiter: String = "/", + inclusiveStartDate: Boolean = true, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = true, + endDate: Option[DateTime] = None, + ignoreHours: Boolean = true) + (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): Stream[String] = { + + def isGoodDate(date: DateTime): Boolean = { + val startDateToCompare = startDate.map(date => if (ignoreHours) date.withTimeAtStartOfDay() else date) + val 
endDateToCompare = endDate.map(date => if (ignoreHours) date.withTime(23, 59, 59, 999) else date) + val goodStartDate = startDateToCompare.isEmpty || (inclusiveStartDate && date.saneEqual(startDateToCompare.get) || date.isAfter(startDateToCompare.get)) + val goodEndDate = endDateToCompare.isEmpty || (inclusiveEndDate && date.saneEqual(endDateToCompare.get) || date.isBefore(endDateToCompare.get)) + goodStartDate && goodEndDate + } - def innerListFiles(remainingDirectories: List[HadoopFile]): List[HadoopFile] = { - if (remainingDirectories.isEmpty) { - Nil - } else { - val (dirs, files) = executeDriverList(remainingDirectories.map(_.path)).partition(_.isDir) - files ++ innerListFiles(dirs) + def classifyPath(path: String): Either[String, (String, DateTime)] = + Try(pathDateExtractor.extractFromPath(s"s3n://$bucket/$path")) match { + case Success(date) => Right(path -> date) + case Failure(_) => Left(path) } + + s3ListCommonPrefixes(bucket, prefix, delimiter).map(classifyPath).flatMap { + case Left(prefixWithoutDate) => s3NarrowPaths(bucket, prefixWithoutDate, delimiter, inclusiveStartDate, + startDate, inclusiveEndDate, endDate, ignoreHours) + case Right((prefixWithDate, date)) if isGoodDate(date) => List(s"s3n://$bucket/$prefixWithDate") + case Right(_) => List.empty } - innerListFiles(directories) } - private def s3List(bucket: String, prefix: String, predicate: S3ObjectSummary => Boolean = _ => true) - (implicit s3: AmazonS3Client): List[S3ObjectSummary] = { - def inner(acc: mutable.ArrayBuffer[S3ObjectSummary], listing: ObjectListing): List[S3ObjectSummary] = { - acc ++= listing.getObjectSummaries.toList.filter(predicate) - if (listing.isTruncated) - inner(acc, s3.listNextBatchOfObjects(listing)) - else - acc.toList + private def s3List(path: String, + inclusiveStartDate: Boolean, + startDate: Option[DateTime], + inclusiveEndDate: Boolean, + endDate: Option[DateTime], + exclusionPattern: Option[String]) + (implicit s3: AmazonS3Client, dateExtractor: 
PathDateExtractor): Stream[S3ObjectSummary] = { + + val s3Pattern = "s3n?://([^/]+)(.+)".r + + def extractBucketAndPrefix(path: String): Option[(String, String)] = path match { + case s3Pattern(bucket, prefix) => Option(bucket -> prefix.dropWhile(_ == '/')) + case _ => None } - inner(new mutable.ArrayBuffer[S3ObjectSummary], s3.listObjects(bucket, prefix)) + extractBucketAndPrefix(path) match { + case Some((pathBucket, pathPrefix)) => + s3NarrowPaths(pathBucket, pathPrefix, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, + startDate = startDate, endDate = endDate).flatMap(extractBucketAndPrefix).flatMap { + case (bucket, prefix) => s3ListObjects(bucket, prefix) + } + case _ => Stream.empty + } } - def s3ListAndFilterFiles(bucket: String, - prefix: String, - start: Option[DateTime] = None, - end: Option[DateTime] = None, - endsWith: Option[String] = None, - exclusionPattern: Option[String] = Option("_$folder$"), - predicate: HadoopFile => Boolean = _ => true) - (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): List[HadoopFile] = { - - def excludePatternValidation(s3Object: S3ObjectSummary, exclusionPatternOption: Option[String]): Option[S3ObjectSummary] = - exclusionPatternOption match { - case Some(pattern) if s3Object.getKey.contains(pattern) => None - case Some(_) | None => Option(s3Object) + def listAndFilterFiles(path: String, + requireSuccess: Boolean = false, + inclusiveStartDate: Boolean = true, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = true, + endDate: Option[DateTime] = None, + lastN: Option[Int] = None, + ignoreMalformedDates: Boolean = false, + endsWith: Option[String] = None, + exclusionPattern: Option[String] = Option(".*_temporary.*|.*_\\$folder.*"), + predicate: HadoopFile => Boolean = _ => true) + (implicit dateExtractor: PathDateExtractor): List[HadoopFile] = { + + def isSuccessFile(file: HadoopFile): Boolean = + file.path.endsWith("_SUCCESS") || 
file.path.endsWith("_FINISHED") + + def extractDateFromFile(file: HadoopFile): Option[DateTime] = + Try(dateExtractor.extractFromPath(file.path)).toOption + + def excludePatternValidation(file: HadoopFile): Option[HadoopFile] = + exclusionPattern match { + case Some(pattern) if file.path.matches(pattern) => None + case Some(_) | None => Option(file) } - def endsWithValidation(s3Object: S3ObjectSummary, endsWithOption: Option[String]): Option[S3ObjectSummary] = - endsWithOption match { - case Some(pattern) if s3Object.getKey.endsWith(pattern) => Option(s3Object) + def endsWithValidation(file: HadoopFile): Option[HadoopFile] = + endsWith match { + case Some(pattern) if file.path.endsWith(pattern) => Option(file) + case Some(_) if isSuccessFile(file) => Option(file) case Some(_) => None - case None => Option(s3Object) + case None => Option(file) } - def extractDateFromKey(s3Object: S3ObjectSummary): Option[DateTime] = - Try(pathDateExtractor.extractFromPath(s"s3n://$bucket/${s3Object.getKey}")).toOption + def applyPredicate(file: HadoopFile): Option[HadoopFile] = + if (predicate(file)) Option(file) else None - def startValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, startOption: Option[DateTime]): Option[S3ObjectSummary] = - startOption match { - case Some(startDate) if startDate.isEqualOrBefore(extractedDate) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) + def dateValidation(file: HadoopFile): Option[HadoopFile] = { + val tryDate = extractDateFromFile(file) + if (tryDate.isEmpty && ignoreMalformedDates) + None + else { + val date = tryDate.get + val goodStartDate = startDate.isEmpty || (inclusiveStartDate && date.saneEqual(startDate.get) || date.isAfter(startDate.get)) + val goodEndDate = endDate.isEmpty || (inclusiveEndDate && date.saneEqual(endDate.get) || date.isBefore(endDate.get)) + if (goodStartDate && goodEndDate) Some(file) else None } + } - def endValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, 
endOption: Option[DateTime]): Option[S3ObjectSummary] = - endOption match { - case Some(endDate) if endDate.isEqualOrAfter(extractedDate) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) - } + val preValidations: HadoopFile => Boolean = hadoopFile => { + val validatedFile = for { + _ <- excludePatternValidation(hadoopFile) + _ <- endsWithValidation(hadoopFile) + _ <- dateValidation(hadoopFile) + valid <- applyPredicate(hadoopFile) + } yield valid + validatedFile.isDefined + } - def applyPredicate(file: HadoopFile): Option[HadoopFile] = - if (predicate(file)) - Option(file) + val preFilteredFiles = smartList(path, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, + startDate = startDate, endDate = endDate, exclusionPattern = exclusionPattern).filter(preValidations) + + val filesByDate = preFilteredFiles.groupBy(extractDateFromFile).collect { + case (Some(date), files) => date -> files + } + + val posFilteredFiles = + if (requireSuccess) + filesByDate.filter { case (_, files) => files.exists(isSuccessFile) } else - None + filesByDate + + val allFiles = if (lastN.isDefined) + posFilteredFiles.toList.sortBy(_._1).reverse.take(lastN.get).flatMap(_._2) + else + posFilteredFiles.toList.flatMap(_._2) + + allFiles.sortBy(_.path) + } + + def smartList(path: String, + inclusiveStartDate: Boolean = false, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = false, + endDate: Option[DateTime] = None, + exclusionPattern: Option[String] = None)(implicit pathDateExtractor: PathDateExtractor): Stream[HadoopFile] = { def toHadoopFile(s3Object: S3ObjectSummary): HadoopFile = HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) - val allValidations: S3ObjectSummary => Boolean = s3Object => { - val validatedFile = for { - withValidPattern <- excludePatternValidation(s3Object, exclusionPattern) - withValidEndsWith <- endsWithValidation(withValidPattern, endsWith) - 
extractedDate <- extractDateFromKey(withValidEndsWith) - withValidStart <- startValidation(withValidEndsWith, extractedDate, start) - withValidEnd <- endValidation(withValidStart, extractedDate, end) - hadoopFile = toHadoopFile(withValidEnd) - valid <- applyPredicate(hadoopFile) - } yield valid - validatedFile.isDefined + def listPath(path: String): Stream[HadoopFile] = { + if (path.startsWith("s3")) { + s3List(path, inclusiveStartDate = inclusiveStartDate, startDate = startDate, inclusiveEndDate = inclusiveEndDate, + endDate = endDate, exclusionPattern = exclusionPattern)(amazonS3ClientFromEnvironmentVariables, pathDateExtractor ).map(toHadoopFile) + } else { + driverListFiles(path).toStream + } } - s3List(bucket, prefix, allValidations)(s3).map(toHadoopFile) + HadoopUtils.getPathStrings(path).toStream.flatMap(listPath) } + def filterAndGetParallelTextFiles(path: String, + requireSuccess: Boolean = false, + inclusiveStartDate: Boolean = true, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = true, + endDate: Option[DateTime] = None, + lastN: Option[Int] = None, + ignoreMalformedDates: Boolean = false, + endsWith: Option[String] = None, + predicate: HadoopFile => Boolean = _ => true, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling(), + minimumFiles: Int = 1, + synchLocally: Option[String] = None, + forceSynch: Boolean = false) + (implicit dateExtractor: PathDateExtractor): RDD[String] = { + + val foundFiles = listAndFilterFiles(path, requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, + endDate, lastN, ignoreMalformedDates, endsWith, predicate = predicate) + + if (foundFiles.size < minimumFiles) + throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of files $foundFiles is less than the required") + + parallelReadTextFiles(foundFiles, maxBytesPerPartition = 
maxBytesPerPartition, minPartitions = minPartitions, + sizeBasedFileHandling = sizeBasedFileHandling, synchLocally = synchLocally, forceSynch = forceSynch) + } + + private def doSync(hadoopFiles: List[HadoopFile], + synchLocally: String, + forceSynch: Boolean, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + require(!synchLocally.contains("*"), "Globs are not supported on the sync key") + + def syncPath(suffix: String) = s"$hdfsPathPrefix/_core_ignition_sync_hdfs_cache/$suffix" + + val hashKey = Integer.toHexString(hadoopFiles.toSet.hashCode()) + + lazy val foundLocalPaths = getStatus(syncPath(s"$synchLocally/$hashKey/{_SUCCESS,_FINISHED}"), removeEmpty = false) + + val cacheKey = syncPath(s"$synchLocally/$hashKey") + + if (forceSynch || foundLocalPaths.isEmpty) { + delete(new Path(syncPath(s"$synchLocally/"))) + val data = parallelReadTextFiles(hadoopFiles, maxBytesPerPartition, minPartitions, synchLocally = None) + data.saveAsTextFile(cacheKey) + } - def s3FilterAndGetParallelTextFiles(bucket: String, - prefix: String, - startDate: Option[DateTime] = None, - endDate: Option[DateTime] = None, - endsWith: Option[String] = None, - predicate: HadoopFile => Boolean = _ => true, - maxBytesPerPartition: Long = 256 * 1000 * 1000, - minPartitions: Int = 100, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) - (implicit s3Client: AmazonS3Client = amazonS3ClientFromEnvironmentVariables, - dateExtractor: PathDateExtractor): RDD[String] = { - val foundFiles = s3ListAndFilterFiles(bucket, prefix, startDate, endDate, predicate = predicate)(s3Client, dateExtractor) - parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) + sc.textFile(cacheKey) } } From a1d226a8cdf018f0652d06de85ffa11b632531a7 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 8 Dec 2015 09:55:21 -0200 Subject: 
[PATCH 068/268] merge --- src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 1afbd74f..9a96f78d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -248,7 +248,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file ->()), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> null), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } From f5ad7f29afdd1040d0ca54e94ebf44137dd286f9 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 8 Dec 2015 13:38:16 -0200 Subject: [PATCH 069/268] fix empty file filter --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 9a96f78d..bed7e8f0 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -351,7 +351,7 @@ object SparkContextUtils { minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) (implicit 
dateExtractor: PathDateExtractor): RDD[String] = { - val foundFiles = paths.flatMap(smartList(_)).filter(_.size > 0) + val foundFiles = paths.flatMap(smartList(_)) parallelReadTextFiles(foundFiles, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, sizeBasedFileHandling = sizeBasedFileHandling) } @@ -361,11 +361,12 @@ object SparkContextUtils { sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling(), synchLocally: Option[String] = None, forceSynch: Boolean = false): RDD[String] = { + val filteredFiles = files.filter(_.size > 0) if (synchLocally.isDefined) - doSync(files, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, synchLocally = synchLocally.get, + doSync(filteredFiles, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, synchLocally = synchLocally.get, sizeBasedFileHandling = sizeBasedFileHandling, forceSynch = forceSynch) else { - val (bigFiles, smallFiles) = files.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) + val (bigFiles, smallFiles) = filteredFiles.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) sc.union( readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) From 5587537b7e42136daf6ffcae53a9754c19b55fd2 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 10 Dec 2015 09:51:40 -0200 Subject: [PATCH 070/268] fix narrow paths for paths without common prefixes (like final folders) --- .../core/jobs/utils/SparkContextUtils.scala | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index bed7e8f0..4eab7baf 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ 
b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -486,12 +486,16 @@ object SparkContextUtils { case Failure(_) => Left(path) } - s3ListCommonPrefixes(bucket, prefix, delimiter).map(classifyPath).flatMap { - case Left(prefixWithoutDate) => s3NarrowPaths(bucket, prefixWithoutDate, delimiter, inclusiveStartDate, - startDate, inclusiveEndDate, endDate, ignoreHours) - case Right((prefixWithDate, date)) if isGoodDate(date) => List(s"s3n://$bucket/$prefixWithDate") - case Right(_) => List.empty - } + val commonPrefixes = s3ListCommonPrefixes(bucket, prefix, delimiter).map(classifyPath) + + if (commonPrefixes.isEmpty) + Stream(s"s3n://$bucket/$prefix") + else + commonPrefixes.toStream.flatMap { + case Left(prefixWithoutDate) => s3NarrowPaths(bucket, prefixWithoutDate, delimiter, inclusiveStartDate, startDate, inclusiveEndDate, endDate, ignoreHours) + case Right((prefixWithDate, date)) if isGoodDate(date) => Stream(s"s3n://$bucket/$prefixWithDate") + case Right(_) => Stream.empty + } } private def s3List(path: String, From b253f29f66b2ba6858c46e681237e5f1f6c1cf1c Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 21 Dec 2015 13:34:59 -0200 Subject: [PATCH 071/268] Added some new utils --- .../ignition/core/utils/CollectionUtils.scala | 26 +++++++++++++++++++ .../core/utils/CollectionUtilsSpec.scala | 13 ++++++++++ 2 files changed, 39 insertions(+) diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index eea4755e..f98fb7ec 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -6,7 +6,32 @@ import scalaz.Validation object CollectionUtils { + + + implicit class SeqImprovements[A](xs: Seq[A]) { + def orElseIfEmpty[B >: A](alternative: => Seq[B]): Seq[B] = { + if (xs.nonEmpty) + xs + else + alternative + } + } + implicit class TraversableOnceImprovements[A](xs: TraversableOnce[A]) { + def 
maxOption(implicit cmp: Ordering[A]): Option[A] = { + if (xs.isEmpty) + None + else + Option(xs.max) + } + + def minOption(implicit cmp: Ordering[A]): Option[A] = { + if (xs.isEmpty) + None + else + Option(xs.min) + } + def maxByOption[B](f: A => B)(implicit cmp: Ordering[B]): Option[A] = { if (xs.isEmpty) None @@ -65,6 +90,7 @@ object CollectionUtils { builder.result } + } implicit class ValidatedIterableLike[T, R, Repr <: IterableLike[Validation[R, T], Repr]](seq: IterableLike[Validation[R, T], Repr]) { diff --git a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala index f01b8a34..548b2423 100644 --- a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala @@ -32,5 +32,18 @@ class CollectionUtilsSpec extends FlatSpec with ShouldMatchers { list.compressBy(_.value) shouldBe List(MyObj("p1", "v1"), MyObj("p1", "v2")) } + it should "provide orElseIfEmpty" in { + Seq.empty[String].orElseIfEmpty(Seq("something")) shouldBe Seq("something") + Seq("not empty").orElseIfEmpty(Seq("something")) shouldBe Seq("not empty") + } + + it should "provide maxOption and minOption" in { + Seq.empty[Int].maxOption shouldBe None + Seq(1, 3, 2).maxOption shouldBe Some(3) + + Seq.empty[Int].minOption shouldBe None + Seq(1, 3, 2).minOption shouldBe Some(1) + } + } From 352ee0b4d584c4d38ef8bf3bd1b4d8320f0adf4a Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 22 Dec 2015 11:26:40 -0200 Subject: [PATCH 072/268] Minor change --- build.sbt | 2 +- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/build.sbt b/build.sbt index d0e2b029..5de79888 100644 --- a/build.sbt +++ b/build.sbt @@ -4,7 +4,7 @@ version := "1.0" scalaVersion := "2.10.4" -scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code") +scalacOptions 
++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") ideaExcludeFolders += ".idea" diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index aa4dcc76..8430d4ef 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -13,9 +13,11 @@ object CoreJobRunner { // Used to provide contextual logging def setLoggingContextValues(config: RunnerConfig): Unit = { - org.slf4j.MDC.put("setupName", config.setupName) - org.slf4j.MDC.put("tag", config.tag) - org.slf4j.MDC.put("user", config.user) + Try { // yes, this may fail but we don't want everything to shut down + org.slf4j.MDC.put("setupName", config.setupName) + org.slf4j.MDC.put("tag", config.tag) + org.slf4j.MDC.put("user", config.user) + } } case class RunnerConfig(setupName: String = "nosetup", From d780ea589d90f4d5683de05a8ca3339ce66a1fd1 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 15 Jan 2016 14:03:32 -0200 Subject: [PATCH 073/268] Make try work even if the exception is fatall --- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index 8430d4ef..bbede553 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -13,10 +13,13 @@ object CoreJobRunner { // Used to provide contextual logging def setLoggingContextValues(config: RunnerConfig): Unit = { - Try { // yes, this may fail but we don't want everything to shut down + try { // yes, this may fail but we don't want everything to shut down org.slf4j.MDC.put("setupName", config.setupName) org.slf4j.MDC.put("tag", config.tag) org.slf4j.MDC.put("user", config.user) + } catch { + case e: 
Throwable => + // cry } } From 400b1f0d9cfdfb54183f744e9a5f5cf3f3a03df9 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 1 Feb 2016 10:37:44 -0200 Subject: [PATCH 074/268] zeppelin setup --- remote_hook.sh | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 48ba9735..5d4bbad1 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -49,6 +49,23 @@ on_trap_exit() { rm -f "${RUNNING_FILE}" } +install_and_run_zeppelin() { + if [[ ! -d "zeppelin" ]]; then + wget "http://www.us.apache.org/dist/incubator/zeppelin/0.5.6-incubating/zeppelin-0.5.6-incubating-bin-all.tgz" -O zeppelin.tar.gz + tar xvzf zeppelin.tar.gz > /tmp/zeppelin_install.log + mv `ls -d zeppelin-*` zeppelin + fi + if [[ -f "zeppelin/bin/zeppelin.sh" ]]; then + export MASTER="${JOB_MASTER}" + export ZEPPELIN_PORT="8081" + export SPARK_HOME="/root/spark" + export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH}" + sudo -E zeppelin/bin/zeppelin.sh + else + notify_error_and_exit "Not found zeppelin installation" + fi +} + trap "on_trap_exit" EXIT @@ -74,10 +91,11 @@ if [[ "${USE_YARN}" == "yes" ]]; then export SPARK_WORKER_MEMORY=${SPARK_MEM_PARAM} fi - if [[ "${JOB_NAME}" == "shell" ]]; then export ADD_JARS=${JAR_PATH} sudo -E ${SPARK_HOME}/bin/spark-shell || notify_error_and_exit "Execution failed for shell" +elif [[ "${JOB_NAME}" == "zeppelin" ]]; then + install_and_run_zeppelin else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & From 333127927fe9581228a12f57f2c8d1a29c474908 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 2 Feb 2016 09:36:49 -0200 Subject: [PATCH 075/268] pr review --- remote_hook.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 5d4bbad1..7d8ed36e 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -52,8 +52,8 @@ on_trap_exit() { install_and_run_zeppelin() { if [[ ! 
-d "zeppelin" ]]; then wget "http://www.us.apache.org/dist/incubator/zeppelin/0.5.6-incubating/zeppelin-0.5.6-incubating-bin-all.tgz" -O zeppelin.tar.gz - tar xvzf zeppelin.tar.gz > /tmp/zeppelin_install.log - mv `ls -d zeppelin-*` zeppelin + mkdir zepplin + tar xvzf zeppelin.tar.gz -C zeppelin --strip-components 1 > /tmp/zeppelin_install.log fi if [[ -f "zeppelin/bin/zeppelin.sh" ]]; then export MASTER="${JOB_MASTER}" @@ -62,7 +62,7 @@ install_and_run_zeppelin() { export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH}" sudo -E zeppelin/bin/zeppelin.sh else - notify_error_and_exit "Not found zeppelin installation" + notify_error_and_exit "Zepellin installation not found" fi } From 33aa47e2cde896bfd32feaa2e9726c9cd3475871 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 3 Feb 2016 15:09:53 -0200 Subject: [PATCH 076/268] rdd.filterNot --- src/main/scala/ignition/core/jobs/utils/RDDUtils.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index 57069bae..60bddc9a 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -57,6 +57,8 @@ object RDDUtils { def incrementCounterIf(cond: (V) => Boolean, acc: spark.Accumulator[Int]): RDD[V] = { rdd.map(x => { if (cond(x)) acc += 1; x }) } + + def filterNot(p: V => Boolean): RDD[V] = rdd.filter(!p(_)) } implicit class PairRDDImprovements[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) { From 93964db2d79c6b84f172712b9ce62eaa9fa44687 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 3 Feb 2016 15:45:24 -0200 Subject: [PATCH 077/268] open a browser for zepplin web ui --- tools/cluster.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/cluster.py b/tools/cluster.py index 2fe6b245..4a81eaa9 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -436,6 +436,9 @@ def job_run(cluster_name, job_name, job_mem, 
src_local=remote_hook_local, remote_path=with_leading_slash(remote_path)) + if job_name == "zeppelin": + subprocess.Popen(["xdg-open", "http://{master}:8081".format(master=master)]) + log.info('Will run job in remote host') if disable_tmux: ssh_call(user=remote_user, host=master, key_file=key_file, args=[non_tmux_arg], allocate_terminal=False) From 5137e43546660658dfe17beb0ff54c80877f16b1 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 3 Feb 2016 18:10:59 -0200 Subject: [PATCH 078/268] using webbrowser lib --- tools/cluster.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 4a81eaa9..daf03d91 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -23,6 +23,7 @@ import getpass import json import glob +import webbrowser log = logging.getLogger() @@ -437,7 +438,7 @@ def job_run(cluster_name, job_name, job_mem, remote_path=with_leading_slash(remote_path)) if job_name == "zeppelin": - subprocess.Popen(["xdg-open", "http://{master}:8081".format(master=master)]) + webbrowser.open("http://{master}:8081".format(master=master)) log.info('Will run job in remote host') if disable_tmux: From b0c323c3f283f4514a644b222a8c2a07dbb6c52c Mon Sep 17 00:00:00 2001 From: Leandro Date: Mon, 22 Feb 2016 10:54:23 -0300 Subject: [PATCH 079/268] Do not delete the security group by default --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 2fe6b245..d9e37533 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -322,7 +322,7 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {} after failures'.format(cluster_name)) -def destroy(cluster_name, delete_groups=True, region=default_region): +def destroy(cluster_name, delete_groups=False, region=default_region): delete_sg_param = ['--delete-groups'] if delete_groups else [] ec2_script_path = chdir_to_ec2_script_and_get_path() From 
0d5b6615c1039d52e529d27d5a28f7838b35359c Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 23 Feb 2016 19:24:09 -0300 Subject: [PATCH 080/268] Added most frequent function to sequences --- src/main/scala/ignition/core/utils/CollectionUtils.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index f98fb7ec..01960d3d 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -15,6 +15,10 @@ object CollectionUtils { else alternative } + + def mostFrequentOption: Option[A] = { + xs.groupBy(identity).maxByOption(_._2.size).map(_._1) + } } implicit class TraversableOnceImprovements[A](xs: TraversableOnce[A]) { @@ -45,6 +49,7 @@ object CollectionUtils { else Option(xs.minBy(f)) } + } From ce911f6153d238f1db4c74c056c590ad730d636d Mon Sep 17 00:00:00 2001 From: Leandro Date: Wed, 24 Feb 2016 17:10:44 -0300 Subject: [PATCH 081/268] Fixing typo and adding driver heap param --- remote_hook.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 7d8ed36e..dd76933a 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -52,14 +52,14 @@ on_trap_exit() { install_and_run_zeppelin() { if [[ ! 
-d "zeppelin" ]]; then wget "http://www.us.apache.org/dist/incubator/zeppelin/0.5.6-incubating/zeppelin-0.5.6-incubating-bin-all.tgz" -O zeppelin.tar.gz - mkdir zepplin + mkdir zeppelin tar xvzf zeppelin.tar.gz -C zeppelin --strip-components 1 > /tmp/zeppelin_install.log fi if [[ -f "zeppelin/bin/zeppelin.sh" ]]; then export MASTER="${JOB_MASTER}" export ZEPPELIN_PORT="8081" export SPARK_HOME="/root/spark" - export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH}" + export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH} --runner-executor-memory ${SPARK_MEM_PARAM}" sudo -E zeppelin/bin/zeppelin.sh else notify_error_and_exit "Zepellin installation not found" From 99df346f866c674e2595772b94dcdc75fd64ff42 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 3 Mar 2016 18:52:07 -0300 Subject: [PATCH 082/268] Added new method --- src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala index 548b2423..c800c0f2 100644 --- a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala @@ -45,5 +45,10 @@ class CollectionUtilsSpec extends FlatSpec with ShouldMatchers { Seq(1, 3, 2).minOption shouldBe Some(1) } + it should "provide mostFrequentOption" in { + Seq.empty[String].mostFrequentOption shouldBe None + Seq("a", "b", "b", "c", "a", "b").mostFrequentOption shouldBe Option("b") + } + } From 736e82af6eccd843e173717e747d72f597c66756 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Mar 2016 14:35:21 -0300 Subject: [PATCH 083/268] Added flatten to rdd of sets --- src/main/scala/ignition/core/jobs/utils/RDDUtils.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index 60bddc9a..e70d8476 100644 --- 
a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -29,6 +29,12 @@ object RDDUtils { } } + implicit class SetRDDImprovements[V: ClassTag](rdd: RDD[Set[V]]) { + def flatten: RDD[V] = { + rdd.flatMap(x => x) + } + } + implicit class ValidatedRDDImprovements[A: ClassTag, B: ClassTag](rdd: RDD[Validation[A, B]]) { def mapSuccess(f: B => Validation[A, B]): RDD[Validation[A, B]] = { From 950d577600f997ea64875e61596241d847c8c36d Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 18 Mar 2016 11:38:05 -0300 Subject: [PATCH 084/268] Remove plugin specific configurations --- build.sbt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/build.sbt b/build.sbt index 5de79888..5ae4552b 100644 --- a/build.sbt +++ b/build.sbt @@ -6,10 +6,6 @@ scalaVersion := "2.10.4" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") -ideaExcludeFolders += ".idea" - -ideaExcludeFolders += ".idea_modules" - // Because we can't run two spark contexts on same VM parallelExecution in Test := false From 603dae78b8b4c633bfce46655f19e34249d32810 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 14 Jun 2016 19:52:08 -0300 Subject: [PATCH 085/268] Added percentile do IntBag --- .../scala/ignition/core/utils/IntBag.scala | 24 ++++++++++++++----- .../ignition/core/utils/IntBagSpec.scala | 16 ++++++++++--- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/main/scala/ignition/core/utils/IntBag.scala b/src/main/scala/ignition/core/utils/IntBag.scala index a322f6f7..38cb3836 100644 --- a/src/main/scala/ignition/core/utils/IntBag.scala +++ b/src/main/scala/ignition/core/utils/IntBag.scala @@ -1,5 +1,7 @@ package ignition.core.utils +import ignition.core.utils.CollectionUtils._ + object IntBag { def from(numbers: TraversableOnce[Long]): IntBag = { val histogram = scala.collection.mutable.HashMap.empty[Long, Long] @@ -19,15 +21,17 @@ case class IntBag(histogram: collection.Map[Long, Long]) { def median: Option[Long] = { - if (histogram.nonEmpty) { + percentile(50) + } + + def percentile(n: Double): Option[Long] = { + require(n > 0 && n <= 100) + histogram.keys.maxOption.flatMap { max => val total = histogram.values.sum - val half = total / 2 - val max = histogram.keys.max + val position = total * (n / 100) val accumulatedFrequency = (0L to max).scanLeft(0L) { case (sumFreq, k) => sumFreq + histogram.getOrElse(k, 0L) }.zipWithIndex - accumulatedFrequency.collectFirst { case (sum, k) if sum >= half => k } - } else { - None + accumulatedFrequency.collectFirst { case (sum, k) if sum >= position => k - 1 } } } @@ -39,4 +43,12 @@ case class IntBag(histogram: collection.Map[Long, Long]) { } else None } + + def min: Option[Long] = { + histogram.keys.minOption + } + + def max: Option[Long] = { + histogram.keys.maxOption + } } diff --git a/src/test/scala/ignition/core/utils/IntBagSpec.scala b/src/test/scala/ignition/core/utils/IntBagSpec.scala index b6694b12..76d37a35 100644 --- a/src/test/scala/ignition/core/utils/IntBagSpec.scala +++ b/src/test/scala/ignition/core/utils/IntBagSpec.scala @@ -10,14 +10,24 @@ class IntBagSpec extends 
FlatSpec with ShouldMatchers { IntBag.from(Seq(1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4)).histogram shouldBe Map(1 -> 2, 2 -> 3, 3 -> 1, 4 -> 5) } - it should "calculate the median and average" in { + it should "calculate the average" in { val size = 1000 - val numbers = (0 until 1000).map(_ => Random.nextInt(400).toLong).toList + val numbers = (0 until size).map(_ => Random.nextInt(400).toLong).toList val bag = IntBag.from(numbers) bag.avg.get shouldBe numbers.sum / size + } + + it should "calculate the percentile, min and max" in { + val size = 3 // anything different is hard to guess because of the approximation + val numbers = (0 until size).map(_ => Random.nextInt(400).toLong).toList + val bag = IntBag.from(numbers) - // TODO: the median is only approximate and it could be better, improve it + bag.min.get shouldBe numbers.min + bag.percentile(0.1).get shouldBe numbers.min + bag.median.get shouldBe numbers.sorted.apply(1) + bag.percentile(99.9).get shouldBe numbers.max + bag.max.get shouldBe numbers.max } } From 7f709e4b1ce52981966a64cac336fb530a5e4db6 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 22 Jul 2016 10:06:25 -0300 Subject: [PATCH 086/268] log error on compressed big file read --- .../core/jobs/utils/SparkContextUtils.scala | 47 ++++++++++--------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 4eab7baf..552da25d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -293,28 +293,33 @@ object SparkContextUtils { val hadoopPath = new Path(file.path) val fileSystem = hadoopPath.getFileSystem(conf) slices.flatMap { case (slice, _) => - val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { - case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) - case None => 
fileSystem.open(hadoopPath) - } - val lines = Source.fromInputStream(inputStream)(Codec.UTF8).getLines() - - val lineSample = lines.take(sampleCount).toList - val linesPerSlice = { - val sampleSize = lineSample.map(_.size).sum - val estimatedAverageLineSize = Math.round(sampleSize / sampleCount.toFloat) - val estimatedTotalLines = Math.round(estimatedSize / estimatedAverageLineSize.toFloat) - estimatedTotalLines / totalSlices + 1 + try { + val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { + case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) + case None => fileSystem.open(hadoopPath) + } + val lines = Source.fromInputStream(inputStream)(Codec.UTF8).getLines() + + val lineSample = lines.take(sampleCount).toList + val linesPerSlice = { + val sampleSize = lineSample.map(_.size).sum + val estimatedAverageLineSize = Math.round(sampleSize / sampleCount.toFloat) + val estimatedTotalLines = Math.round(estimatedSize / estimatedAverageLineSize.toFloat) + estimatedTotalLines / totalSlices + 1 + } + + val linesAfterSeek = (lineSample.toIterator ++ lines).drop(linesPerSlice * slice.index) + + val finalLines = if (slice.index + 1 == totalSlices) // last slice, read until the end + linesAfterSeek + else + linesAfterSeek.take(linesPerSlice) + + AutoCloseableIterator.wrap(finalLines, () => close(inputStream, s"${file.path}, slice $slice")) + } catch { + case NonFatal(e) => + throw new Exception(s"Error on read compressed big file, slice=$slice, file=$file", e) } - - val linesAfterSeek = (lineSample.toIterator ++ lines).drop(linesPerSlice * slice.index) - - val finalLines = if (slice.index + 1 == totalSlices) // last slice, read until the end - linesAfterSeek - else - linesAfterSeek.take(linesPerSlice) - - AutoCloseableIterator.wrap(finalLines, () => close(inputStream, s"${file.path}, slice $slice")) } } } From 6e6ceac12b311ede633ed54eabb82f6889820828 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Mon, 25 Jul 2016 18:46:01 -0300 Subject: [PATCH 087/268] Improve spark shell job run --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index dd76933a..309f85f5 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -93,7 +93,7 @@ fi if [[ "${JOB_NAME}" == "shell" ]]; then export ADD_JARS=${JAR_PATH} - sudo -E ${SPARK_HOME}/bin/spark-shell || notify_error_and_exit "Execution failed for shell" + sudo -E ${SPARK_HOME}/bin/spark-shell --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" elif [[ "${JOB_NAME}" == "zeppelin" ]]; then install_and_run_zeppelin else From c73bbc13c2082b7b160518c62dfce60bf1e2bf45 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 16 Aug 2016 15:03:15 -0300 Subject: [PATCH 088/268] Smaller partitions are safer --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 552da25d..645b218e 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -361,7 +361,7 @@ object SparkContextUtils { } def parallelReadTextFiles(files: List[HadoopFile], - maxBytesPerPartition: Long = 256 * 1000 * 1000, + maxBytesPerPartition: Long = 128 * 1000 * 1000, minPartitions: Int = 100, sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling(), synchLocally: Option[String] = None, @@ -639,7 +639,7 @@ object SparkContextUtils { ignoreMalformedDates: Boolean = false, endsWith: Option[String] = None, predicate: HadoopFile => Boolean = _ => true, - maxBytesPerPartition: Long = 256 * 
1000 * 1000, + maxBytesPerPartition: Long = 128 * 1000 * 1000, minPartitions: Int = 100, sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling(), minimumFiles: Int = 1, @@ -660,7 +660,7 @@ object SparkContextUtils { private def doSync(hadoopFiles: List[HadoopFile], synchLocally: String, forceSynch: Boolean, - maxBytesPerPartition: Long = 256 * 1000 * 1000, + maxBytesPerPartition: Long = 128 * 1000 * 1000, minPartitions: Int = 100, sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { require(!synchLocally.contains("*"), "Globs are not supported on the sync key") From 800044312ae657a633488f0d8c1c9f55475b4ab9 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 18 Aug 2016 19:11:34 -0300 Subject: [PATCH 089/268] Use mutable List on groupByKeyAndTake --- .../ignition/core/jobs/utils/RDDUtils.scala | 19 ++++++++++++------- .../core/jobs/utils/RDDUtilsSpec.scala | 17 +++++++++-------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index e70d8476..fd1d74ce 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -3,13 +3,14 @@ package ignition.core.jobs.utils import org.slf4j.LoggerFactory import scala.reflect._ -import org.apache.spark.rdd.{PairRDDFunctions, CoGroupedRDD, RDD} +import org.apache.spark.rdd.{CoGroupedRDD, PairRDDFunctions, RDD} import org.apache.spark.SparkContext._ import org.apache.spark.Partitioner import org.apache.spark import org.joda.time.DateTime import org.joda.time.format.DateTimeFormat +import scala.collection.mutable import scalaz.{Success, Validation} object RDDUtils { @@ -93,13 +94,14 @@ object RDDUtils { } def groupByKeyAndTake(n: Int): RDD[(K, List[V])] = - rdd.aggregateByKey(List.empty[V])( + rdd.aggregateByKey(mutable.ListBuffer.empty[V])( (lst, v) => if (lst.size >= 
n) { logger.warn(s"Ignoring value '$v' due aggregation result of size '${lst.size}' is bigger than n=$n") lst } else { - v :: lst + lst += v + lst }, (lstA, lstB) => if (lstA.size >= n) @@ -109,11 +111,14 @@ object RDDUtils { else { if (lstA.size + lstB.size > n) { logger.warn(s"Merging partition1=${lstA.size} with partition2=${lstB.size} and taking the first n=$n, sample1='${lstA.take(5)}', sample2='${lstB.take(5)}'") - (lstA ++ lstB).take(n) - } else - lstA ++ lstB + lstA ++= lstB + lstA.take(n) + } else { + lstA ++= lstB + lstA + } } - ) + ).mapValues(_.toList) // Note: completely unoptimized. We could use instead for better performance: // 1) sortByKey diff --git a/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala b/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala index a00e5de8..705ba398 100644 --- a/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala +++ b/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala @@ -9,17 +9,18 @@ import scala.util.Random class RDDUtilsSpec extends FlatSpec with ShouldMatchers with SharedSparkContext { "RDDUtils" should "provide groupByKeyAndTake" in { - val take = 5 - val rdd = sc.parallelize((1 to Random.nextInt(40) + 10).map(x => "a" -> Random.nextInt()) ++ (1 to Random.nextInt(40) + 10).map(x => "b" -> Random.nextInt())) - val result = rdd.groupByKeyAndTake(take).collect().toMap - result("a").length shouldBe take - result("b").length shouldBe take + (10 to 60 by 10).foreach { take => + val rdd = sc.parallelize((1 to 400).map(x => "a" -> Random.nextInt()) ++ (1 to 400).map(x => "b" -> Random.nextInt()), 60) + val result = rdd.groupByKeyAndTake(take).collect().toMap + result("a").length shouldBe take + result("b").length shouldBe take + } } it should "provide groupByKeyAndTakeOrdered" in { - val take = 5 - val aList = (1 to Random.nextInt(40) + 10).map(x => "a" -> Random.nextInt()).toList - val bList = (1 to Random.nextInt(40) + 10).map(x => "b" -> Random.nextInt()).toList + val take = 50 + val 
aList = (1 to Random.nextInt(400) + 100).map(x => "a" -> Random.nextInt()).toList + val bList = (1 to Random.nextInt(400) + 100).map(x => "b" -> Random.nextInt()).toList val rdd = sc.parallelize(aList ++ bList) val result = rdd.groupByKeyAndTakeOrdered(take).collect().toMap result("a") shouldBe aList.map(_._2).sorted.take(take) From 64c8cd98cf59efcc6a0b5bcae69f8e5f65d18f85 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Fri, 19 Aug 2016 18:58:43 -0300 Subject: [PATCH 090/268] Preparing por spark 2.0 --- tools/cluster.py | 4 +-- tools/spark-ec2/spark_ec2.py | 53 +++++++++++++++++++++++++----------- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index e9ad90f3..642d2d98 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,7 +49,7 @@ default_ami = None # will be decided based on spark-ec2 list default_master_ami = None default_env = 'dev' -default_spark_version = '1.5.1' +default_spark_version = '2.0.0' custom_builds = { # '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' } @@ -61,7 +61,7 @@ default_defaults_filename = 'cluster_defaults.json' default_spark_ec2_git_repo = 'https://github.com/chaordic/spark-ec2' -default_spark_ec2_git_branch = 'branch-1.4-merge' +default_spark_ec2_git_branch = 'branch-2.0' master_post_create_commands = [ diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index e9442448..50d67b9b 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -51,7 +51,7 @@ raw_input = input xrange = range -SPARK_EC2_VERSION = "1.5.1" +SPARK_EC2_VERSION = "2.0.0" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -74,6 +74,9 @@ "1.4.1", "1.5.0", "1.5.1", + "1.5.2", + "1.6.0", + "2.0.0", ]) SPARK_TACHYON_MAP = { @@ -90,6 +93,9 @@ "1.4.1": "0.6.4", "1.5.0": "0.7.1", "1.5.1": "0.7.1", + "1.5.2": "0.7.1", + "1.6.0": "0.8.2", + "2.0.0": "", } DEFAULT_SPARK_VERSION 
= SPARK_EC2_VERSION @@ -97,7 +103,7 @@ # Default location to get the spark-ec2 scripts (and ami-list) from DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/amplab/spark-ec2" -DEFAULT_SPARK_EC2_BRANCH = "branch-1.4" +DEFAULT_SPARK_EC2_BRANCH = "branch-2.0" def setup_external_libs(libs): @@ -183,6 +189,10 @@ def parse_args(): parser.add_option( "-i", "--identity-file", help="SSH private key file to use for logging into instances") + parser.add_option( + "-p", "--profile", default=None, + help="If you have multiple profiles (AWS or boto config), you can configure " + + "additional, named profiles by using this option (default: %default)") parser.add_option( "-t", "--instance-type", default="m1.large", help="Type of instance to launch (default: %default). " + @@ -329,7 +339,7 @@ def parse_args(): help="Use private IPs for instances rather than public if VPC/subnet " + "requires that.") parser.add_option( - "--instance-initiated-shutdown-behavior", default="terminate", + "--instance-initiated-shutdown-behavior", default="stop", choices=["stop", "terminate"], help="Whether instances should terminate when shut down or just stop") parser.add_option( @@ -415,11 +425,11 @@ def get_validate_spark_version(version, repo): EC2_INSTANCE_TYPES = { "c1.medium": "pvm", "c1.xlarge": "pvm", - "c3.large": "pvm", - "c3.xlarge": "pvm", - "c3.2xlarge": "pvm", - "c3.4xlarge": "pvm", - "c3.8xlarge": "pvm", + "c3.large": "hvm", + "c3.xlarge": "hvm", + "c3.2xlarge": "hvm", + "c3.4xlarge": "hvm", + "c3.8xlarge": "hvm", "c4.large": "hvm", "c4.xlarge": "hvm", "c4.2xlarge": "hvm", @@ -497,6 +507,7 @@ def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branc print("Spark AMI: " + ami) return ami + # Launch a cluster of the given name, by setting up its security groups, # and then starting new instances in them. 
# Returns a tuple of EC2 reservation objects for the master and slaves @@ -632,11 +643,14 @@ def launch_cluster(conn, opts, cluster_name): device.delete_on_termination = True block_map["/dev/sd" + chr(ord('s') + i)] = device - for i in range(get_num_disks(opts.instance_type)): - dev = BlockDeviceType() - dev.ephemeral_name = 'ephemeral%d' % i - name = '/dev/xvd' + string.letters[i + 1] - block_map[name] = dev + # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342). + if opts.instance_type.startswith('m3.'): + for i in range(get_num_disks(opts.instance_type)): + dev = BlockDeviceType() + dev.ephemeral_name = 'ephemeral%d' % i + # The first ephemeral drive is /dev/sdb. + name = '/dev/sd' + string.ascii_letters[i + 1] + block_map[name] = dev # Launch slaves if opts.spot_price is not None: @@ -822,7 +836,7 @@ def launch_cluster(conn, opts, cluster_name): # This wait time corresponds to SPARK-4983 print("Waiting for AWS to propagate instance metadata...") - time.sleep(5) + time.sleep(15) # Give the instances descriptive names and set additional tags additional_tags = {} @@ -903,7 +917,7 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar) modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', - 'mapreduce', 'spark-standalone', 'tachyon'] + 'mapreduce', 'spark-standalone', 'tachyon', 'rstudio'] if opts.hadoop_major_version == "1": modules = list(filter(lambda x: x != "mapreduce", modules)) @@ -1352,6 +1366,10 @@ def get_ip_address(instance, private_ips=False): def get_dns_name(instance, private_ips=False): dns = instance.public_dns_name if not private_ips else \ instance.private_ip_address + if not dns: + raise UsageError("Failed to determine hostname of {0}.\n" + "Please check that you provided --private-ips if " + "necessary".format(instance)) return dns @@ -1416,7 +1434,10 @@ def real_main(): sys.exit(1) try: - conn = ec2.connect_to_region(opts.region) + if 
opts.profile is None: + conn = ec2.connect_to_region(opts.region) + else: + conn = ec2.connect_to_region(opts.region, profile_name=opts.profile) except Exception as e: print((e), file=stderr) sys.exit(1) From ee10100061dd87a382ab71295b04d78f6d8e1d6c Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 23 Aug 2016 17:10:26 -0300 Subject: [PATCH 091/268] Preparing por spark 2.0 --- tools/cluster.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 642d2d98..c4a2f681 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -227,7 +227,8 @@ def launch(cluster_name, slaves, spark_version=default_spark_version, spark_ec2_git_repo=default_spark_ec2_git_repo, spark_ec2_git_branch=default_spark_ec2_git_branch, - ami=default_ami, master_ami=default_master_ami): + ami=default_ami, master_ami=default_master_ami, + instance_profile_name=None): all_args = locals() @@ -264,6 +265,8 @@ def launch(cluster_name, slaves, ami_params = ['--ami', ami] if ami else [] master_ami_params = ['--master-ami', master_ami] if master_ami else [] + iam_params = ['--instance-profile-name', instance_profile_name] if instance_profile_name else [] + spark_version = custom_builds.get(spark_version, spark_version) for i in range(retries_on_same_cluster): @@ -292,7 +295,8 @@ def launch(cluster_name, slaves, resume_param + auth_params + ami_params + - master_ami_params, + master_ami_params + + iam_params, timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) success = True From 4dbbac0370019b69ce9831ed5297553c7bf673d0 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 25 Aug 2016 17:06:26 -0300 Subject: [PATCH 092/268] Merging --- build.sbt | 14 +++++++------- .../scala/ignition/core/jobs/utils/RDDUtils.scala | 15 --------------- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/build.sbt b/build.sbt index 5ae4552b..adb6ae01 100644 --- a/build.sbt +++ b/build.sbt @@ -2,32 +2,32 @@ name := "Ignition-Core" version := "1.0" -scalaVersion := "2.10.4" +scalaVersion := "2.11.8" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.0" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") -libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" +libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.0" -libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.6" +libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.9" libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0" libraryDependencies += "net.java.dev.jets3t" % "jets3t" % "0.7.1" -libraryDependencies += "joda-time" % "joda-time" % "2.7" +libraryDependencies += "joda-time" % "joda-time" % "2.9.4" libraryDependencies += "org.joda" % "joda-convert" % "1.7" -libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.9.6" +libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.7.4" libraryDependencies += "commons-lang" % "commons-lang" % "2.6" diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala 
b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index fd1d74ce..b0b3bc86 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -57,25 +57,10 @@ object RDDUtils { } implicit class RDDImprovements[V: ClassTag](rdd: RDD[V]) { - def incrementCounter(acc: spark.Accumulator[Int]): RDD[V] = { - rdd.map(x => { acc += 1; x }) - } - - def incrementCounterIf(cond: (V) => Boolean, acc: spark.Accumulator[Int]): RDD[V] = { - rdd.map(x => { if (cond(x)) acc += 1; x }) - } - def filterNot(p: V => Boolean): RDD[V] = rdd.filter(!p(_)) } implicit class PairRDDImprovements[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) { - def incrementCounter(acc: spark.Accumulator[Int]): RDD[(K, V)] = { - rdd.mapValues(x => { acc += 1; x }) - } - - def incrementCounterIf(cond: (K, V) => Boolean, acc: spark.Accumulator[Int]): RDD[(K, V)] = { - rdd.mapPreservingPartitions(x => { if(cond(x._1, x._2)) acc += 1; x._2 }) - } def flatMapPreservingPartitions[U: ClassTag](f: ((K, V)) => Seq[U]): RDD[(K, U)] = { rdd.mapPartitions[(K, U)](kvs => { From 11db14ab4c61b3acde269f9d9959682251335c4c Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 25 Aug 2016 18:42:32 -0300 Subject: [PATCH 093/268] Try to fix disks creation --- tools/spark-ec2/spark_ec2.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 50d67b9b..a89dab8f 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -643,14 +643,19 @@ def launch_cluster(conn, opts, cluster_name): device.delete_on_termination = True block_map["/dev/sd" + chr(ord('s') + i)] = device + for i in range(get_num_disks(opts.instance_type)): + dev = BlockDeviceType() + dev.ephemeral_name = 'ephemeral%d' % i + name = '/dev/xvd' + string.letters[i + 1] + block_map[name] = dev # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342). - if opts.instance_type.startswith('m3.'): - for i in range(get_num_disks(opts.instance_type)): - dev = BlockDeviceType() - dev.ephemeral_name = 'ephemeral%d' % i - # The first ephemeral drive is /dev/sdb. - name = '/dev/sd' + string.ascii_letters[i + 1] - block_map[name] = dev + #if opts.instance_type.startswith('m3.'): + # for i in range(get_num_disks(opts.instance_type)): + # dev = BlockDeviceType() + # dev.ephemeral_name = 'ephemeral%d' % i + # # The first ephemeral drive is /dev/sdb. + # name = '/dev/sd' + string.ascii_letters[i + 1] + # block_map[name] = dev # Launch slaves if opts.spot_price is not None: From 8109f79e98105decab9b34283c7d9d891a8d85cf Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 25 Aug 2016 18:44:11 -0300 Subject: [PATCH 094/268] Making stuff ready for spark 2.0 --- build.sbt | 10 ++++++---- project/plugins.sbt | 2 +- remote_hook.sh | 3 +-- src/main/scala/ignition/core/utils/S3Client.scala | 4 ++-- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/build.sbt b/build.sbt index adb6ae01..a3f9fdf6 100644 --- a/build.sbt +++ b/build.sbt @@ -13,22 +13,24 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.0" % "provided .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") + libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") + .exclude("org.apache.htrace", "htrace-core") + .exclude("commons-beanutils", "commons-beanutils") + .exclude("org.slf4j", "slf4j-log4j12") + libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.0" libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.9" libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0" -libraryDependencies += "net.java.dev.jets3t" % "jets3t" % "0.7.1" - libraryDependencies += "joda-time" % "joda-time" % "2.9.4" libraryDependencies += "org.joda" % "joda-convert" % "1.7" -libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.7.4" - libraryDependencies += "commons-lang" % "commons-lang" % "2.6" resolvers += "Akka Repository" at "http://repo.akka.io/releases/" diff --git a/project/plugins.sbt b/project/plugins.sbt index d5f371ab..f6f3b939 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,5 +1,5 @@ addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3") addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") diff --git a/remote_hook.sh b/remote_hook.sh index 309f85f5..10902c46 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -92,8 +92,7 
@@ if [[ "${USE_YARN}" == "yes" ]]; then fi if [[ "${JOB_NAME}" == "shell" ]]; then - export ADD_JARS=${JAR_PATH} - sudo -E ${SPARK_HOME}/bin/spark-shell --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" + sudo -E ${SPARK_HOME}/bin/spark-shell --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" elif [[ "${JOB_NAME}" == "zeppelin" ]]; then install_and_run_zeppelin else diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala index b806b376..020ab6f4 100644 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ b/src/main/scala/ignition/core/utils/S3Client.scala @@ -3,7 +3,7 @@ package ignition.core.utils import java.util.Properties import org.jets3t.service.impl.rest.httpclient.RestS3Service -import org.jets3t.service.model.S3Object +import org.jets3t.service.model.{S3Object, StorageObject} import org.jets3t.service.security.AWSCredentials import org.jets3t.service.{Constants, Jets3tProperties} @@ -36,7 +36,7 @@ class S3Client { service.getObject(bucket, key, null, null, null, null, null, null) } - def list(bucket: String, key: String): Array[S3Object] = { + def list(bucket: String, key: String): Array[StorageObject] = { service.listObjectsChunked(bucket, key, null, 99999L, null, true).getObjects } From 4f910ea68bbbcb2543c9bc21c7b5b3bd8cad3c46 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 25 Aug 2016 18:44:11 -0300 Subject: [PATCH 095/268] Making stuff ready for spark 2.0 --- build.sbt | 16 +++++++++------- project/plugins.sbt | 2 +- remote_hook.sh | 3 +-- .../scala/ignition/core/utils/S3Client.scala | 4 ++-- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/build.sbt b/build.sbt index 5ae4552b..f452fcef 100644 --- a/build.sbt +++ b/build.sbt @@ -13,22 +13,24 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" +libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") + +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") + .exclude("org.apache.htrace", "htrace-core") + .exclude("commons-beanutils", "commons-beanutils") + .exclude("org.slf4j", "slf4j-log4j12") + +libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.0" libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.6" libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0" -libraryDependencies += "net.java.dev.jets3t" % "jets3t" % "0.7.1" - -libraryDependencies += "joda-time" % "joda-time" % "2.7" +libraryDependencies += "joda-time" % "joda-time" % "2.9.4" libraryDependencies += "org.joda" % "joda-convert" % "1.7" -libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.9.6" - libraryDependencies += "commons-lang" % "commons-lang" % "2.6" resolvers += "Akka Repository" at "http://repo.akka.io/releases/" diff --git a/project/plugins.sbt b/project/plugins.sbt index d5f371ab..f6f3b939 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,5 +1,5 @@ addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2") +addSbtPlugin("com.eed3si9n" 
% "sbt-assembly" % "0.14.3") addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") diff --git a/remote_hook.sh b/remote_hook.sh index 309f85f5..10902c46 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -92,8 +92,7 @@ if [[ "${USE_YARN}" == "yes" ]]; then fi if [[ "${JOB_NAME}" == "shell" ]]; then - export ADD_JARS=${JAR_PATH} - sudo -E ${SPARK_HOME}/bin/spark-shell --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" + sudo -E ${SPARK_HOME}/bin/spark-shell --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" elif [[ "${JOB_NAME}" == "zeppelin" ]]; then install_and_run_zeppelin else diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala index b806b376..020ab6f4 100644 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ b/src/main/scala/ignition/core/utils/S3Client.scala @@ -3,7 +3,7 @@ package ignition.core.utils import java.util.Properties import org.jets3t.service.impl.rest.httpclient.RestS3Service -import org.jets3t.service.model.S3Object +import org.jets3t.service.model.{S3Object, StorageObject} import org.jets3t.service.security.AWSCredentials import org.jets3t.service.{Constants, Jets3tProperties} @@ -36,7 +36,7 @@ class S3Client { service.getObject(bucket, key, null, null, null, null, null, null) } - def list(bucket: String, key: String): Array[S3Object] = { + def list(bucket: String, key: String): Array[StorageObject] = { service.listObjectsChunked(bucket, key, null, 99999L, null, true).getObjects } From 61829644b42fa2feaa02d98e677ce4093b9416c5 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Fri, 26 Aug 2016 15:44:43 -0300 Subject: [PATCH 096/268] Permission fix --- remote_hook.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/remote_hook.sh b/remote_hook.sh index 10902c46..c0cd8da0 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -20,6 +20,7 @@ MY_USER=$(whoami) # Avoids problems when another user created our control dir sudo mkdir -p "${JOB_CONTROL_DIR}" sudo chown $MY_USER "${JOB_CONTROL_DIR}" +sudo chown -R o+rx /root RUNNING_FILE="${JOB_CONTROL_DIR}/RUNNING" From 3a44a4e248f0cc5410b1002ad94ff451fe7ec9c1 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Fri, 26 Aug 2016 15:51:16 -0300 Subject: [PATCH 097/268] Permission fix --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index c0cd8da0..688bfbc1 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -20,7 +20,7 @@ MY_USER=$(whoami) # Avoids problems when another user created our control dir sudo mkdir -p "${JOB_CONTROL_DIR}" sudo chown $MY_USER "${JOB_CONTROL_DIR}" -sudo chown -R o+rx /root +sudo chmod -R o+rx /root RUNNING_FILE="${JOB_CONTROL_DIR}/RUNNING" From 0ce1847ad4b7a76edd50240b0af529de4a786245 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Fri, 26 Aug 2016 16:11:50 -0300 Subject: [PATCH 098/268] Make possible to process files without dates --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 645b218e..73f7f332 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -567,12 +567,12 @@ object SparkContextUtils { def dateValidation(file: HadoopFile): Option[HadoopFile] = { val tryDate = extractDateFromFile(file) if (tryDate.isEmpty && ignoreMalformedDates) - None + Option(file) else { val date = tryDate.get val goodStartDate = startDate.isEmpty || (inclusiveStartDate && date.saneEqual(startDate.get) || date.isAfter(startDate.get)) val goodEndDate = endDate.isEmpty || (inclusiveEndDate && date.saneEqual(endDate.get) || date.isBefore(endDate.get)) - if (goodStartDate && goodEndDate) Some(file) else None + if (goodStartDate && goodEndDate) Option(file) else None } } @@ -590,7 +590,7 @@ object SparkContextUtils { startDate = startDate, endDate = endDate, exclusionPattern = exclusionPattern).filter(preValidations) val filesByDate = preFilteredFiles.groupBy(extractDateFromFile).collect { - case (Some(date), files) => date -> files + case (date, files) => date.getOrElse(new DateTime(1970, 1, 1, 1, 1)) -> files } val posFilteredFiles = From 34f6bc2f53031f9f27090af1a61cb134baf1ccfc Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Fri, 26 Aug 2016 16:32:59 -0300 Subject: [PATCH 099/268] Supporting s3a on our utils --- .../core/jobs/utils/SparkContextUtils.scala | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 73f7f332..81e1d355 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -148,7 +148,7 @@ object SparkContextUtils { val filesToOutput = 1500 def mapPaths(actionWhenNeedsSynching: (String, String) => Unit): Seq[String] = { paths.map(p => { - val hdfsPath = p.replace("s3n://", hdfsPathPrefix) + val hdfsPath = p.replaceFirst("s3(a|n)://", hdfsPathPrefix) if (forceSynch || getStatus(hdfsPath, false).isEmpty || getStatus(s"$hdfsPath/*", true).filterNot(_.isDirectory).size != filesToOutput) { val _hdfsPath = new Path(hdfsPath) actionWhenNeedsSynching(p, hdfsPath) @@ -486,7 +486,7 @@ object SparkContextUtils { } def classifyPath(path: String): Either[String, (String, DateTime)] = - Try(pathDateExtractor.extractFromPath(s"s3n://$bucket/$path")) match { + Try(pathDateExtractor.extractFromPath(s"s3a://$bucket/$path")) match { case Success(date) => Right(path -> date) case Failure(_) => Left(path) } @@ -494,11 +494,11 @@ object SparkContextUtils { val commonPrefixes = s3ListCommonPrefixes(bucket, prefix, delimiter).map(classifyPath) if (commonPrefixes.isEmpty) - Stream(s"s3n://$bucket/$prefix") + Stream(s"s3a://$bucket/$prefix") else commonPrefixes.toStream.flatMap { case Left(prefixWithoutDate) => s3NarrowPaths(bucket, prefixWithoutDate, delimiter, inclusiveStartDate, startDate, inclusiveEndDate, endDate, ignoreHours) - case Right((prefixWithDate, date)) if isGoodDate(date) => Stream(s"s3n://$bucket/$prefixWithDate") + case Right((prefixWithDate, date)) if isGoodDate(date) => Stream(s"s3a://$bucket/$prefixWithDate") 
case Right(_) => Stream.empty } } @@ -511,10 +511,10 @@ object SparkContextUtils { exclusionPattern: Option[String]) (implicit s3: AmazonS3Client, dateExtractor: PathDateExtractor): Stream[S3ObjectSummary] = { - val s3Pattern = "s3n?://([^/]+)(.+)".r + val s3Pattern = "s3(a|n)?://([^/]+)(.+)".r def extractBucketAndPrefix(path: String): Option[(String, String)] = path match { - case s3Pattern(bucket, prefix) => Option(bucket -> prefix.dropWhile(_ == '/')) + case s3Pattern(_, bucket, prefix) => Option(bucket -> prefix.dropWhile(_ == '/')) case _ => None } @@ -615,7 +615,7 @@ object SparkContextUtils { exclusionPattern: Option[String] = None)(implicit pathDateExtractor: PathDateExtractor): Stream[HadoopFile] = { def toHadoopFile(s3Object: S3ObjectSummary): HadoopFile = - HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) + HadoopFile(s"s3a://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) def listPath(path: String): Stream[HadoopFile] = { if (path.startsWith("s3")) { From 8393d54ba57273a4aa14d76ad4685566247c2067 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Wed, 31 Aug 2016 15:08:41 -0300 Subject: [PATCH 100/268] Preparing for new Spark/Scala/Hadoop --- build.sbt | 9 ++------- .../core/jobs/utils/SparkContextUtils.scala | 15 ++++++++------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/build.sbt b/build.sbt index 2061fe3a..b8765224 100644 --- a/build.sbt +++ b/build.sbt @@ -23,13 +23,6 @@ libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.0" -libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") - .exclude("org.apache.htrace", "htrace-core") - .exclude("commons-beanutils", "commons-beanutils") - .exclude("org.slf4j", "slf4j-log4j12") - -libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.0" - libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.9" libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0" @@ -40,6 +33,8 @@ libraryDependencies += "org.joda" % "joda-convert" % "1.7" libraryDependencies += "commons-lang" % "commons-lang" % "2.6" +libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.5" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 81e1d355..7588deaa 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -24,10 +24,10 @@ import scala.reflect.ClassTag import scala.util.{Failure, Success, Try} import scala.util.control.NonFatal import ignition.core.utils.ExceptionUtils._ +import org.slf4j.LoggerFactory object SparkContextUtils { - private case class BigFileSlice(index: Int) private case class HadoopFilePartition(size: Long, paths: Seq[String]) @@ -51,6 +51,8 @@ object SparkContextUtils { implicit 
class SparkContextImprovements(sc: SparkContext) { + private lazy val logger = LoggerFactory.getLogger(getClass) + lazy val _hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) private def getFileSystem(path: Path): FileSystem = { @@ -73,7 +75,7 @@ object SparkContextUtils { } // This call is equivalent to a ls -d in shell, but won't fail if part of a path matches nothing, - // For instance, given path = s3n://bucket/{a,b}, it will work fine if a exists but b is missing + // For instance, given path = s3a://bucket/{a,b}, it will work fine if a exists but b is missing def sortedGlobPath(_paths: Seq[String], removeEmpty: Boolean = true): Seq[String] = { val paths = _paths.flatMap(path => ignition.core.utils.HadoopUtils.getPathStrings(path)) paths.flatMap(p => getStatus(p, removeEmpty)).map(_.getPath.toString).distinct.sorted @@ -148,7 +150,7 @@ object SparkContextUtils { val filesToOutput = 1500 def mapPaths(actionWhenNeedsSynching: (String, String) => Unit): Seq[String] = { paths.map(p => { - val hdfsPath = p.replaceFirst("s3(a|n)://", hdfsPathPrefix) + val hdfsPath = p.replaceFirst("s3[an]://", hdfsPathPrefix) if (forceSynch || getStatus(hdfsPath, false).isEmpty || getStatus(s"$hdfsPath/*", true).filterNot(_.isDirectory).size != filesToOutput) { val _hdfsPath = new Path(hdfsPath) actionWhenNeedsSynching(p, hdfsPath) @@ -286,7 +288,6 @@ object SparkContextUtils { val hadoopConf = _hadoopConf val partitionedSlices = sc.parallelize(slices.map(s => s -> null), 2).partitionBy(partitioner) - partitionedSlices.mapPartitions { slices => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) @@ -511,10 +512,10 @@ object SparkContextUtils { exclusionPattern: Option[String]) (implicit s3: AmazonS3Client, dateExtractor: PathDateExtractor): Stream[S3ObjectSummary] = { - val s3Pattern = "s3(a|n)?://([^/]+)(.+)".r 
+ val s3Pattern = "s3[an]?://([^/]+)(.+)".r def extractBucketAndPrefix(path: String): Option[(String, String)] = path match { - case s3Pattern(_, bucket, prefix) => Option(bucket -> prefix.dropWhile(_ == '/')) + case s3Pattern(bucket, prefix) => Option(bucket -> prefix.dropWhile(_ == '/')) case _ => None } @@ -620,7 +621,7 @@ object SparkContextUtils { def listPath(path: String): Stream[HadoopFile] = { if (path.startsWith("s3")) { s3List(path, inclusiveStartDate = inclusiveStartDate, startDate = startDate, inclusiveEndDate = inclusiveEndDate, - endDate = endDate, exclusionPattern = exclusionPattern)(amazonS3ClientFromEnvironmentVariables, pathDateExtractor ).map(toHadoopFile) + endDate = endDate, exclusionPattern = exclusionPattern)(amazonS3ClientFromEnvironmentVariables, pathDateExtractor).map(toHadoopFile) } else { driverListFiles(path).toStream } From b57d2e4cba3298e0c4cf626697a87bfaed503e38 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 31 Aug 2016 16:27:17 -0300 Subject: [PATCH 101/268] Making tests pass --- src/main/scala/ignition/core/utils/BetterTrace.scala | 4 +++- src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala | 2 +- src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala | 2 +- src/test/scala/ignition/core/utils/FutureUtilsSpec.scala | 2 +- src/test/scala/ignition/core/utils/IntBagSpec.scala | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala index 387f49f7..49c74606 100644 --- a/src/main/scala/ignition/core/utils/BetterTrace.scala +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -1,9 +1,11 @@ package ignition.core.utils import ignition.core.utils.ExceptionUtils._ +import org.scalactic.source // Used mainly to augment scalacheck traces in scalatest trait BetterTrace { - def fail(message: String): Nothing + def fail(message: String)(implicit pos: source.Position): Nothing + def 
withBetterTrace(block: => Unit): Unit = try { block diff --git a/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala b/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala index 705ba398..eed298b6 100644 --- a/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala +++ b/src/test/scala/ignition/core/jobs/utils/RDDUtilsSpec.scala @@ -6,7 +6,7 @@ import org.scalatest._ import scala.util.Random -class RDDUtilsSpec extends FlatSpec with ShouldMatchers with SharedSparkContext { +class RDDUtilsSpec extends FlatSpec with Matchers with SharedSparkContext { "RDDUtils" should "provide groupByKeyAndTake" in { (10 to 60 by 10).foreach { take => diff --git a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala index c800c0f2..26757c26 100644 --- a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala @@ -3,7 +3,7 @@ package ignition.core.utils import org.scalatest._ import CollectionUtils._ -class CollectionUtilsSpec extends FlatSpec with ShouldMatchers { +class CollectionUtilsSpec extends FlatSpec with Matchers { case class MyObj(property: String, value: String) "CollectionUtils" should "provide distinctBy" in { diff --git a/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala b/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala index 8c2b3270..4649fcfc 100644 --- a/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala @@ -6,7 +6,7 @@ import scala.concurrent.{Await, Future} import scala.concurrent.duration._ import scala.concurrent.ExecutionContext.Implicits.global -class FutureUtilsSpec extends FlatSpec with ShouldMatchers { +class FutureUtilsSpec extends FlatSpec with Matchers { "FutureUtils" should "provide toLazyIterable" in { val timesCalled = collection.mutable.Map.empty[Int, Int].withDefaultValue(0) diff --git 
a/src/test/scala/ignition/core/utils/IntBagSpec.scala b/src/test/scala/ignition/core/utils/IntBagSpec.scala index 76d37a35..f577237e 100644 --- a/src/test/scala/ignition/core/utils/IntBagSpec.scala +++ b/src/test/scala/ignition/core/utils/IntBagSpec.scala @@ -4,7 +4,7 @@ import org.scalatest._ import scala.util.Random -class IntBagSpec extends FlatSpec with ShouldMatchers { +class IntBagSpec extends FlatSpec with Matchers { "IntBag" should "be built from sequence" in { IntBag.from(Seq(1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4)).histogram shouldBe Map(1 -> 2, 2 -> 3, 3 -> 1, 4 -> 5) From 7029f03122fb25494a64c71ded621fff4d73335b Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 6 Sep 2016 15:21:20 -0300 Subject: [PATCH 102/268] Use older scalatest which is compatible with current scalamock --- build.sbt | 2 +- src/main/scala/ignition/core/utils/BetterTrace.scala | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index b8765224..3dd8d22a 100644 --- a/build.sbt +++ b/build.sbt @@ -21,7 +21,7 @@ libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") .exclude("commons-beanutils", "commons-beanutils") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.0" +libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.9" diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala index 49c74606..09de73aa 100644 --- a/src/main/scala/ignition/core/utils/BetterTrace.scala +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -1,10 +1,9 @@ package ignition.core.utils import ignition.core.utils.ExceptionUtils._ -import org.scalactic.source // Used mainly to augment scalacheck traces in scalatest trait BetterTrace { - def fail(message: String)(implicit pos: source.Position): Nothing + def fail(message: String): Nothing def 
withBetterTrace(block: => Unit): Unit = try { From cfa7017cba8ba2fcb4a4aa1cfc187a6ee2619d0d Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 6 Sep 2016 18:40:05 -0300 Subject: [PATCH 103/268] Avoid temporary files on s3 --- .../ignition/core/jobs/CoreJobRunner.scala | 5 +- .../core/jobs/DirectOutputCommitter.scala | 75 +++++++++++++++++++ .../core/jobs/utils/SparkContextUtils.scala | 1 + 3 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 src/main/scala/ignition/core/jobs/DirectOutputCommitter.scala diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index bbede553..0dec0896 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -75,11 +75,14 @@ object CoreJobRunner { sparkConf.setMaster(config.master) sparkConf.setAppName(appName) - + + sparkConf.set("spark.hadoop.mapred.output.committer.class", classOf[DirectOutputCommitter].getName()) + defaultSparkConfMap.foreach { case (k, v) => sparkConf.set(k, v) } jobConf.foreach { case (k, v) => sparkConf.set(k, v) } + // Add logging context to driver setLoggingContextValues(config) diff --git a/src/main/scala/ignition/core/jobs/DirectOutputCommitter.scala b/src/main/scala/ignition/core/jobs/DirectOutputCommitter.scala new file mode 100644 index 00000000..63611cf4 --- /dev/null +++ b/src/main/scala/ignition/core/jobs/DirectOutputCommitter.scala @@ -0,0 +1,75 @@ +package ignition.core.jobs + +// Code from: https://gist.github.com/aarondav/c513916e72101bbe14ec +// Suggested by: http://tech.grammarly.com/blog/posts/Petabyte-Scale-Text-Processing-with-Spark.html + +/* + * Copyright 2015 Databricks, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
You may obtain + * a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapred._ + +/** + * OutputCommitter suitable for S3 workloads. Unlike the usual FileOutputCommitter, which + * writes files to a _temporary/ directory before renaming them to their final location, this + * simply writes directly to the final location. + * + * The FileOutputCommitter is required for HDFS + speculation, which allows only one writer at + * a time for a file (so two people racing to write the same file would not work). However, S3 + * supports multiple writers outputting to the same file, where visibility is guaranteed to be + * atomic. This is a monotonic operation: all writers should be writing the same data, so which + * one wins is immaterial. + * + * Code adapted from Ian Hummel's code from this PR: + * https://github.com/themodernlife/spark/commit/4359664b1d557d55b0579023df809542386d5b8c + */ +class DirectOutputCommitter extends OutputCommitter { + override def setupJob(jobContext: JobContext): Unit = { } + + override def setupTask(taskContext: TaskAttemptContext): Unit = { } + + override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = { + // We return true here to guard against implementations that do not handle false correctly. + // The meaning of returning false is not entirely clear, so it's possible to be interpreted + // as an error. Returning true just means that commitTask() will be called, which is a no-op. 
+ true + } + + override def commitTask(taskContext: TaskAttemptContext): Unit = { } + + override def abortTask(taskContext: TaskAttemptContext): Unit = { } + + /** + * Creates a _SUCCESS file to indicate the entire job was successful. + * This mimics the behavior of FileOutputCommitter, reusing the same file name and conf option. + */ + override def commitJob(context: JobContext): Unit = { + val conf = context.getJobConf + if (shouldCreateSuccessFile(conf)) { + val outputPath = FileOutputFormat.getOutputPath(conf) + if (outputPath != null) { + val fileSys = outputPath.getFileSystem(conf) + val filePath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME) + fileSys.create(filePath).close() + } + } + } + + /** By default, we do create the _SUCCESS file, but we allow it to be turned off. */ + private def shouldCreateSuccessFile(conf: JobConf): Boolean = { + conf.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true) + } +} \ No newline at end of file diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 7588deaa..dddd51a6 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -330,6 +330,7 @@ object SparkContextUtils { minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq( + "mapreduce.input.fileinputformat.split.minsize" -> maxSplitSize.toString, "mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } From 30ea5440cae326ded9995c00822441ad5241aa3c Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 6 Sep 2016 19:19:13 -0300 Subject: [PATCH 104/268] Fixed doc error --- .../ignition/core/testsupport/spark/LocalSparkContext.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala b/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala index 2edb28e7..a272edaa 100644 --- a/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala +++ b/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala @@ -21,7 +21,6 @@ import _root_.io.netty.util.internal.logging.{InternalLoggerFactory, Slf4JLogger import org.apache.spark.SparkContext import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, Suite} -/** Manages a local `sc` {@link SparkContext} variable, correctly stopping it after each test. */ trait LocalSparkContext extends BeforeAndAfterEach with BeforeAndAfterAll { self: Suite => @transient var sc: SparkContext = _ From 85f920dc5a95af9c03fd851ca29ec55443ca4347 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 20 Sep 2016 15:36:49 -0300 Subject: [PATCH 105/268] Added logging factor to groupByKeyAndTake --- .../scala/ignition/core/jobs/utils/RDDUtils.scala | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index b0b3bc86..e04dd118 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -11,6 +11,7 @@ import org.joda.time.DateTime import org.joda.time.format.DateTimeFormat import scala.collection.mutable +import scala.util.Random import scalaz.{Success, Validation} object RDDUtils { @@ -78,11 +79,14 @@ object RDDUtils { rdd.filter { case (k, v) => f.isDefinedAt(v) }.mapValues(f) } - def groupByKeyAndTake(n: Int): RDD[(K, List[V])] = + // loggingFactor: percentage of the potential logging that will be really printed + // Big jobs will have too much logging and my eat up cluster disk space + def groupByKeyAndTake(n: Int, loggingFactor: Double = 0.5): RDD[(K, List[V])] = rdd.aggregateByKey(mutable.ListBuffer.empty[V])( (lst, v) => if (lst.size >= n) { - logger.warn(s"Ignoring value '$v' due aggregation result of size '${lst.size}' is bigger than n=$n") + if (Random.nextDouble() < loggingFactor) + logger.warn(s"Ignoring value '$v' due aggregation result of size '${lst.size}' is bigger than n=$n") lst } else { lst += v @@ -95,7 +99,8 @@ object RDDUtils { lstB else { if (lstA.size + lstB.size > n) { - logger.warn(s"Merging partition1=${lstA.size} with partition2=${lstB.size} and taking the first n=$n, sample1='${lstA.take(5)}', sample2='${lstB.take(5)}'") + if (Random.nextDouble() < loggingFactor) + logger.warn(s"Merging partition1=${lstA.size} with partition2=${lstB.size} and taking the first n=$n, sample1='${lstA.take(5)}', sample2='${lstB.take(5)}'") lstA ++= lstB lstA.take(n) } else { @@ -115,4 +120,4 @@ object RDDUtils { (lstA, lstB) => 
(lstA ++ lstB).sorted(ord).take(n)) } } -} +} \ No newline at end of file From 2b3231c7dbc0ea808f47ef7c5c05244f5534b507 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 22 Sep 2016 13:56:32 -0300 Subject: [PATCH 106/268] make provided s3 dependencies --- build.sbt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/build.sbt b/build.sbt index 3dd8d22a..5cddf64a 100644 --- a/build.sbt +++ b/build.sbt @@ -13,13 +13,13 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.0" % "provided .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") - libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") -libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") - .exclude("org.apache.htrace", "htrace-core") - .exclude("commons-beanutils", "commons-beanutils") - .exclude("org.slf4j", "slf4j-log4j12") +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2" % "provided") + +libraryDependencies += ("com.amazonaws" % "aws-java-sdk" % "1.7.4" % "provided") + +libraryDependencies += ("net.java.dev.jets3t" % "jets3t" % "0.9.0" % "provided") libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" From ee0ed1ef8e7626023e2ca10960640d39a734c5d3 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 22 Sep 2016 18:22:40 -0300 Subject: [PATCH 107/268] classpath fix revert --- build.sbt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/build.sbt b/build.sbt index 5cddf64a..3dd8d22a 100644 --- a/build.sbt +++ b/build.sbt @@ -13,13 +13,13 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.0" % "provided .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") - -libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2" % "provided") -libraryDependencies += 
("com.amazonaws" % "aws-java-sdk" % "1.7.4" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") -libraryDependencies += ("net.java.dev.jets3t" % "jets3t" % "0.9.0" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") + .exclude("org.apache.htrace", "htrace-core") + .exclude("commons-beanutils", "commons-beanutils") + .exclude("org.slf4j", "slf4j-log4j12") libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" From 98ef7243a53be75255b79b8e8735c04aa5d597fd Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 27 Sep 2016 15:48:53 -0300 Subject: [PATCH 108/268] Added Future.withTimeout --- build.sbt | 2 ++ src/main/scala/ignition/core/utils/FutureUtils.scala | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/build.sbt b/build.sbt index 3dd8d22a..711d3798 100644 --- a/build.sbt +++ b/build.sbt @@ -35,6 +35,8 @@ libraryDependencies += "commons-lang" % "commons-lang" % "2.6" libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.5" +libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.3.4" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 4523a94f..55853826 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,5 +1,8 @@ package ignition.core.utils +import akka.actor.ActorSystem + +import scala.concurrent.duration.FiniteDuration import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} @@ -28,6 +31,10 @@ object FutureUtils { def asTry()(implicit ec: ExecutionContext) : Future[Try[V]] = { future.map(v => Success(v)).recover { case 
NonFatal(e) => Failure(e) } } + + def withTimeout(timeout: => Throwable)(implicit duration: FiniteDuration, system: ActorSystem): Future[V] = { + Future.firstCompletedOf(Seq(future, akka.pattern.after(duration, system.scheduler)(Future.failed(timeout))(system.dispatcher)))(system.dispatcher) + } } implicit class FutureGeneratorImprovements[V](generator: Iterable[() => Future[V]]){ From 34afa426e491ba0566e669cff2ba876ce25206c6 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 29 Sep 2016 19:50:32 -0300 Subject: [PATCH 109/268] Moved async http to core --- build.sbt | 8 + .../core/http/AsyncHttpClientStreamApi.scala | 69 ++++ .../core/http/AsyncSprayHttpClient.scala | 302 ++++++++++++++++++ .../ignition/core/http/ByteStorage.scala | 114 +++++++ .../scala/ignition/core/http/Caching.scala | 22 ++ src/main/scala/ignition/core/http/Retry.scala | 65 ++++ 6 files changed, 580 insertions(+) create mode 100644 src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala create mode 100644 src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala create mode 100644 src/main/scala/ignition/core/http/ByteStorage.scala create mode 100644 src/main/scala/ignition/core/http/Caching.scala create mode 100644 src/main/scala/ignition/core/http/Retry.scala diff --git a/build.sbt b/build.sbt index 711d3798..6ffe0e85 100644 --- a/build.sbt +++ b/build.sbt @@ -37,6 +37,14 @@ libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.5" libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.3.4" +libraryDependencies += "io.spray" %% "spray-json" % "1.3.2" + +libraryDependencies += "io.spray" %% "spray-client" % "1.3.2" + +libraryDependencies += "io.spray" %% "spray-http" % "1.3.2" + +libraryDependencies += "io.spray" %% "spray-caching" % "1.3.2" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" diff --git 
a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala new file mode 100644 index 00000000..52d97810 --- /dev/null +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -0,0 +1,69 @@ +package ignition.core.http + +import java.io.InputStream +import java.util.concurrent.TimeUnit + +import akka.util.Timeout +import spray.http.{HttpEntity, HttpMethod, HttpMethods} + +import scala.concurrent.Future +import scala.concurrent.duration._ +import scala.language.postfixOps + + +object AsyncHttpClientStreamApi { + + case class Credentials(user: String, password: String) { + def isEmpty = user.isEmpty && password.isEmpty + + def toOption = Some(this).filter(!_.isEmpty) + } + + object Credentials { + val empty = Credentials("", "") + } + + // TODO: return a stream is dangerous because implies into a lock + case class StreamResponse(status: Int, content: InputStream) + + case class RequestConfiguration(maxRedirects: Int = 15, + maxConnectionsPerHost: Int = 500, + pipelining: Boolean = false, + idleTimeout: Duration = Duration(30, TimeUnit.SECONDS), + requestTimeout: Duration = Duration(20, TimeUnit.SECONDS), + connectingTimeout: Duration = Duration(10, TimeUnit.SECONDS)) + + case class Request(url: String, + params: Map[String, String] = Map.empty, + credentials: Option[Credentials] = None, + method: HttpMethod = HttpMethods.GET, + body: HttpEntity = HttpEntity.Empty, + requestConfiguration: Option[RequestConfiguration] = Option(RequestConfiguration())) + + case class RequestException(message: String, response: StreamResponse) extends RuntimeException(message) + + object NoOpReporter extends ReporterCallback { + def onRequest(request: Request): Unit = {} + def onResponse(request: Request, status: Int): Unit = {} + def onFailure(request: Request, status: Int): Unit = {} + def onRetry(request: Request): Unit = {} + def onGiveUp(request: Request): Unit = {} + def onError(request: 
Request, error: Any): Unit = {} + } + + abstract class ReporterCallback { + def onRequest(request: Request): Unit + def onResponse(request: Request, status: Int): Unit + def onFailure(request: Request, status: Int): Unit + def onRetry(request: Request): Unit + def onGiveUp(request: Request): Unit + def onError(request: Request, error: Any): Unit + } +} + +trait AsyncHttpClientStreamApi { + + def makeRequest(request: AsyncHttpClientStreamApi.Request, initialBackoff: FiniteDuration = 100 milliseconds, retryOnHttpStatus: Seq[Int] = List.empty) + (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback = AsyncHttpClientStreamApi.NoOpReporter): Future[AsyncHttpClientStreamApi.StreamResponse] + +} \ No newline at end of file diff --git a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala new file mode 100644 index 00000000..54247a04 --- /dev/null +++ b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala @@ -0,0 +1,302 @@ +package ignition.core.http + +import java.net.URL +import java.util.concurrent.TimeoutException + +import akka.actor._ +import akka.io.IO +import akka.pattern.ask +import akka.util.Timeout + +import spray.can.Http +import spray.can.Http.HostConnectorSetup +import spray.can.client.{ClientConnectionSettings, HostConnectorSettings} +import spray.http.HttpHeaders.Authorization +import spray.http.StatusCodes.Redirection +import spray.http._ + + +import scala.concurrent.duration._ +import scala.concurrent.{ExecutionContext, Future} +import scala.language.postfixOps +import scala.util.control.NonFatal + +import ignition.core.http.AsyncHttpClientStreamApi.{Request, RequestConfiguration} + + +trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { + + implicit def actorRefFactory: ActorRefFactory + def executionContext: ExecutionContext = actorRefFactory.dispatcher + + override def makeRequest(request: AsyncHttpClientStreamApi.Request, backoff: 
FiniteDuration, retryOnHttpStatus: Seq[Int]) + (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback): Future[AsyncHttpClientStreamApi.StreamResponse] = { + val processor = actorRefFactory.actorOf(Props(new RequestProcessorActor(timeout, reporter, backoff, retryOnHttpStatus))) + (processor ? request).mapTo[AsyncHttpClientStreamApi.StreamResponse] + } + + private class RequestProcessorActor(timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback, backoff: FiniteDuration, retryOnHttpStatus: Seq[Int]) + extends Actor with ActorLogging { + + + import context.system + + import scala.language.implicitConversions + + def isRedirection(status: StatusCode): Boolean = status match { + case r: Redirection => true + case _ => false + } + + private def toUriString(url: String, params: Map[String, String] = Map.empty) = { + def encode(content: String) = java.net.URLEncoder.encode(content, "UTF-8") + def encodeParams = params.map { case (k, v) => s"${encode(k)}=${encode(v)}" }.mkString("&") + if (params.isEmpty) url else s"$url?${encodeParams}" + } + + private implicit def toAuthHeader(credentials: AsyncHttpClientStreamApi.Credentials): List[Authorization] = + List(Authorization(credentials = BasicHttpCredentials(username = credentials.user, password = credentials.password))) + + private def toSprayRequest(request: Request): HttpRequest = request match { + case Request(uri, params, Some(credentials), method, body, _) if params.isEmpty => + HttpRequest(method = method, uri = request.url, headers = credentials, entity = body) + + case Request(uri, params, Some(credentials), method, body, _) => + HttpRequest(method = method, uri = toUriString(request.url, params), headers = credentials, entity = body) + + case Request(uri, params, None, method, body, _) if params.isEmpty => + HttpRequest(method = method, uri = toUriString(request.url), entity = body) + + case Request(uri, params, None, method, body, _) => + HttpRequest(method = method, uri = 
toUriString(request.url, params), entity = body) + } + + private def toSprayHostConnectorSetup(host: String, configuration: AsyncHttpClientStreamApi.RequestConfiguration): HostConnectorSetup = { + // Create based on defaults, change some of them + val ccs: ClientConnectionSettings = ClientConnectionSettings(system) + val hcs: HostConnectorSettings = HostConnectorSettings(system) + + val updatedCcs = ccs.copy( + responseChunkAggregationLimit = 0, // makes our client ineffective if non zero + idleTimeout = configuration.idleTimeout, + connectingTimeout = configuration.connectingTimeout, + requestTimeout = configuration.requestTimeout + ) + + val updatedHcs = hcs.copy( + connectionSettings = updatedCcs, + maxRetries = 0, // We have our own retry mechanism + maxRedirects = 0, // We do our own redirect following + maxConnections = configuration.maxConnectionsPerHost, + pipelining = configuration.pipelining + ) + HostConnectorSetup(host = host, settings = Option(updatedHcs)) + } + + private def executeSprayRequest(request: Request): Unit = request.requestConfiguration match { + case Some(configuration) => + val url = new URL(request.url) + val message = (toSprayRequest(request), toSprayHostConnectorSetup(url.getHost, configuration)) + IO(Http) ! message + case None => + IO(Http) ! toSprayRequest(request) + } + + def handleErrors(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, remainingRedirects: Int): Receive = { + case ev @ Http.SendFailed(_) => + log.debug("Communication error, cause: {}", ev) + reporter.onError(request, ev) + storage.close() + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + + case ev @ Timedout(_) => + log.debug("Communication error, cause: {}", ev) + reporter.onError(request, ev) + storage.close() + context.become(retrying(commander, request, remainingRedirects)) + self ! 
retry.onTimeout + + case Status.Failure(NonFatal(exception)) => + reporter.onError(request, exception) + storage.close() + exception match { + case ex: Http.RequestTimeoutException => + log.warning("Request {} timeout, details: {}", request, ex.getMessage) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onTimeout + + case ex: Http.ConnectionException => + log.warning("Connection error on {}, details: {}", request, ex.getMessage) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + + case unknownException => + log.error(unknownException, "Unknown error on {}", request) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + } + + case unknownMessage => + log.debug("Unknown message: {}", unknownMessage) + reporter.onError(request, unknownMessage) + storage.close() + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + } + + def receive: Receive = { + case request: Request => + log.debug("Starting request {}", request) + reporter.onRequest(request) + executeSprayRequest(request) + val retry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, timeoutBackoff = backoff) + val storage = new ByteStorage() + val maxRedirects = request.requestConfiguration.getOrElse(RequestConfiguration()).maxRedirects + context.become(waitingForResponse(sender, request, retry, storage, maxRedirects) + .orElse(handleErrors(sender, request, retry, storage, maxRedirects))) + } + + def retrying(commander: ActorRef, request: Request, remainingRedirects: Int): Receive = { + case retry: Retry => + if (retry.shouldGiveUp) { + reporter.onGiveUp(request) + log.warning("Error to get {}, no more retries {}, accepting failure", request, retry) + commander ! 
Status.Failure(new TimeoutException(s"Failed to get '${request.url}'")) + context.stop(self) + } else { + reporter.onRetry(request) + log.info("Retrying {}, retry status {}, backing off for {} millis", request, retry, retry.backoff.toMillis) + system.scheduler.scheduleOnce(retry.backoff) { + log.debug("Waking from backoff, retrying request {}", request) + executeSprayRequest(request) + }(executionContext) + val storage = new ByteStorage() + context.become(waitingForResponse(commander, request, retry, storage, remainingRedirects) + .orElse(handleErrors(commander, request, retry, storage, remainingRedirects))) + } + } + + def waitingForResponse(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, remainingRedirects: Int): Receive = { + case response@HttpResponse(status, entity, headers, _) => try { + storage.write(response.entity.data.toByteArray) + if (isRedirection(status)) + handleRedirect(commander, storage, retry, request, status, response, remainingRedirects) + else if (status.isSuccess) { + reporter.onResponse(request, status.intValue) + commander ! Status.Success(AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream())) + context.stop(self) + } else if (retryOnHttpStatus.contains(status.intValue)) { + storage.close() + log.debug("HttpResponse: Status {}, retrying...", status) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + } else { + val message = s"HTTP response status ${status.intValue}, on request ${request}, ${status.defaultMessage}" + log.debug("HttpResponse: {}", message) + reporter.onFailure(request, status.intValue) + reporter.onGiveUp(request) + commander ! 
Status.Failure(new AsyncHttpClientStreamApi.RequestException(message = message, + response = AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream()))) + context.stop(self) + } + } catch { + case NonFatal(ex) => + storage.close() + log.error(ex, "HttpResponse: Failure on creating HttpResponse") + reporter.onError(request, ex) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + } + + case chunkStart@ChunkedResponseStart(HttpResponse(status, entity, headers, _)) => try { + storage.write(entity.data.toByteArray) + if (isRedirection(status)) + handleRedirect(commander, storage, retry, request, status, chunkStart, remainingRedirects) + else if (status.isSuccess) { + context.become(accumulateChunks(commander, request, retry, storage, status, remainingRedirects) + .orElse(handleErrors(commander, request, retry, storage, remainingRedirects))) + } else if (retryOnHttpStatus.contains(status.intValue)) { + storage.close() + log.debug("ChunkedResponseStart: Status {}, retrying...", status) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + } else { + val message = s"HTTP response status ${status.intValue}, on request ${request}, ${status.defaultMessage}" + log.debug("ChunkedResponseStart: {}", message) + reporter.onFailure(request, status.intValue) + reporter.onGiveUp(request) + commander ! Status.Failure(new AsyncHttpClientStreamApi.RequestException(message = message, + response = AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream()))) + context.stop(self) + } + } catch { + case NonFatal(ex) => + log.error(ex, "ChunkedResponseStart: Failure on creating ChunkedHttpResponse") + reporter.onError(request, ex) + context.become(retrying(commander, request, remainingRedirects)) + self ! 
retry.onError + } + } + + def accumulateChunks(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, status: StatusCode, remainingRedirects: Int): Receive = { + case message@MessageChunk(data, _) => try { + storage.write(data.toByteArray) + } catch { + case NonFatal(ex) => + storage.close() + log.error(ex, "MessageChunk: Failure on accumulate chunk data") + reporter.onError(request, ex) + context.become(retrying(commander, request, remainingRedirects)) + self ! retry.onError + } + + case chunkEnd: ChunkedMessageEnd => + log.debug("ChunkedMessageEnd: all data was received for request {}, status {}", request, status) + reporter.onResponse(request, status.intValue) + commander ! Status.Success(AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream())) + context.stop(self) + } + + def handleRedirect(commander: ActorRef, oldStorage: ByteStorage, oldRetry: Retry, oldRequest: Request, status: StatusCode, rawResponse: HttpResponsePart, remainingRedirects: Int): Unit = { + if (remainingRedirects <= 0) { + val message = s"HandleRedirect: exceeded redirection limit on $oldRequest with status $status" + log.warning(message) + reporter.onGiveUp(oldRequest) + commander ! 
Status.Failure(new Exception(message)) + context.stop(self) + } else { + def makeRequest(headers: List[HttpHeader]): Receive = { + oldStorage.close() + val newRemainingRedirects = remainingRedirects - 1 + headers.find(_.is("location")).map(_.value).map { newLocation => + log.debug("Making redirect to {}", newLocation) + val newRequest = oldRequest.copy(url = newLocation) + executeSprayRequest(newRequest) + val newRetry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, timeoutBackoff = backoff) + val newStorage = new ByteStorage() + waitingForResponse(commander, newRequest, newRetry, newStorage, newRemainingRedirects) + .orElse(handleErrors(commander, newRequest, newRetry, newStorage, newRemainingRedirects)) + }.getOrElse { + log.warning("Received redirect for request {} with headers {} without location, retrying...", oldRequest, headers) + retrying(commander, oldRequest, newRemainingRedirects) + } + } + context.become(rawResponse match { + case response@HttpResponse(status, entity, headers, _) => + makeRequest(headers) + case chunkStart@ChunkedResponseStart(HttpResponse(status, entity, headers, _)) => { + case message@MessageChunk(data, _) => + // do nothing + case chunkEnd: ChunkedMessageEnd => + context.become(makeRequest(headers)) + } + case other => + throw new Exception(s"Bug, called on $other") + }) + } + } + + } + +} diff --git a/src/main/scala/ignition/core/http/ByteStorage.scala b/src/main/scala/ignition/core/http/ByteStorage.scala new file mode 100644 index 00000000..c137a5fe --- /dev/null +++ b/src/main/scala/ignition/core/http/ByteStorage.scala @@ -0,0 +1,114 @@ +package ignition.core.http + +import java.io._ +import java.nio.file.{Files, Paths} +import java.util.UUID + +import org.slf4j.LoggerFactory + +import scala.util.control.NonFatal +import scala.util.{Failure, Success, Try} + +class ByteStorage(memoryThreshold: Int = 1024 * 1024 * 5) extends AutoCloseable { + + lazy val log = LoggerFactory.getLogger(getClass) + + lazy 
val tempDirPath = Files.createDirectories(Paths.get(System.getProperty("java.io.tmpdir"), "ByteStorage")) + + lazy val buffer = new ByteArrayOutputStream + + var fileStorage: Option[(File, FileOutputStream)] = None + + def write(bytes: Array[Byte]): Unit = try { + if (fileStorage.isDefined) { + writeOnFile(bytes) + } else if (buffer.size() + bytes.length > memoryThreshold) { + log.debug("Memory threshold {} reach, going to file storage", memoryThreshold) + setupFileStorage() + writeOnFile(buffer.toByteArray) + writeOnFile(bytes) + // on ByteArrayOutputStream close() takes not effect, + // but if we change the buffer impl this is the a good moment to free resources + buffer.close() + } else { + buffer.write(bytes) + } + } catch { + case NonFatal(ex) => + close() + throw ex + } + + override def close(): Unit = fileStorage match { + case Some((file, outputStream)) => try { + log.debug("Cleaning up temp file {}", file.getAbsolutePath) + outputStream.close() + file.delete() + } catch { + case NonFatal(ex) => log.warn(s"Fail to cleanup temp file ${file.getAbsolutePath}", ex) + } + case None => + log.debug("Cleaning up memory buffer") + buffer.close() + } + + private def setupFileStorage(): Unit = if (fileStorage.isEmpty) { + tryCreateTempFile match { + case Success(storage) => fileStorage = Option(storage) + case Failure(ex) => throw ex + } + } else { + throw new IllegalStateException("File storage already setup") + } + + private def tryCreateTempFile: Try[(File, FileOutputStream)] = Try { + val tempFile = File.createTempFile(s"temp_byte_storage_${UUID.randomUUID().toString}", ".temp", tempDirPath.toFile) + tempFile.deleteOnExit() + log.debug("Creating temp file {}", tempFile.getAbsolutePath) + (tempFile, new FileOutputStream(tempFile)) + } + + private def writeOnFile(bytes: Array[Byte]): Unit = fileStorage match { + case Some((_, outputStream)) => outputStream.write(bytes) + case None => throw new IllegalStateException("File storage not initialized") + } + + def 
getInputStream(): InputStream = fileStorage match { + case Some((file, outputStream)) => try { + outputStream.close() + new DeleteOnCloseFileInputStream(file) + } catch { + case NonFatal(ex) => + log.error("Fail to create InputStream", ex) + close() + throw ex + } + case None => new ByteArrayInputStream(buffer.toByteArray) + } + + override def finalize() = try { + fileStorage match { + case Some((file, outputStream)) => + log.debug("Cleaning up temp file {}", file.getAbsolutePath) + outputStream.close() + file.delete() + case None => + } + } finally { + super.finalize() + } + +} + +private class DeleteOnCloseFileInputStream(file: File) extends FileInputStream(file) { + lazy val log = LoggerFactory.getLogger(getClass) + override def close() = try { + log.debug("Cleaning up file {}", file.getAbsolutePath) + file.delete() + } catch { + case NonFatal(ex) => + log.warn(s"Failed to clean up file ${file.getAbsolutePath}", ex) + } finally { + super.close() + } +} \ No newline at end of file diff --git a/src/main/scala/ignition/core/http/Caching.scala b/src/main/scala/ignition/core/http/Caching.scala new file mode 100644 index 00000000..112791a5 --- /dev/null +++ b/src/main/scala/ignition/core/http/Caching.scala @@ -0,0 +1,22 @@ +package ignition.core.http + +import org.slf4j.LoggerFactory +import spray.caching.Cache + +import scala.concurrent._ +import scala.util.Failure + +trait Caching[T] { + val log = LoggerFactory.getLogger(classOf[Caching[T]]) + + val cache: Cache[T] + import ExecutionContext.Implicits.global + def fetchCache[K](key: K, f: () => Future[T]): Future[T] = cache(key) { + f.apply andThen { + case Failure(e) => { + cache.remove(key) + log.info(s"Removed $key from cache due to an exception: $e") + } + } + } +} diff --git a/src/main/scala/ignition/core/http/Retry.scala b/src/main/scala/ignition/core/http/Retry.scala new file mode 100644 index 00000000..03d86db6 --- /dev/null +++ b/src/main/scala/ignition/core/http/Retry.scala @@ -0,0 +1,65 @@ +package 
ignition.core.http + +import java.util.Random +import java.util.concurrent.TimeUnit + +import org.joda.time.DateTime + +import scala.concurrent.duration.{Duration, FiniteDuration, _} +import scala.language.postfixOps + +object Retry { + + sealed trait State + case object Timeout extends State + case object Error extends State + + val random = new Random + + val _maxWaitForNextRetry = 10 + + def exponentialBackOff(r: Int): FiniteDuration = { + val exponent: Double = scala.math.min(r, _maxWaitForNextRetry) + scala.math.pow(2, exponent).round * (random.nextInt(30) + 100 milliseconds) + } + +} + +case class Retry(startTime: DateTime, + timeout: FiniteDuration, + state: Retry.State = Retry.Timeout, + timeoutCount: Int = 0, + timeoutBackoff: FiniteDuration = 100 milliseconds, + maxErrors: Int = 10, + errorsCount: Int = 0, + backoffOnError: FiniteDuration = 100 milliseconds) { + + import Retry._ + + def onError(): Retry = + copy(errorsCount = errorsCount + 1, backoffOnError = exponentialBackOff(errorsCount + 1), state = Retry.Error) + + def onTimeout(): Retry = copy(timeoutCount = timeoutCount + 1, timeoutBackoff = exponentialBackOff(timeoutCount + 1), state = Retry.Timeout) + + def backoff(): FiniteDuration = state match { + case Timeout => timeoutBackoff + case Error => backoffOnError + } + + private def canRetryMore(durations: FiniteDuration*): Boolean = { + val maxTime = startTime.plusMillis(timeout.toMillis.toInt) + val nextEstimatedTime = DateTime.now.plusMillis(durations.map(_.toMillis.toInt).sum) + nextEstimatedTime.isBefore(maxTime) + } + + // This is an approximation and we are ignoring the time waiting on backoff. 
+ // In this way we are overestimating the average request duration, which is fine because it's better to abort early than wait too much time exceed AskTimeouts + private def averageRequestDuration = + Duration((DateTime.now.getMillis - startTime.getMillis) / Math.max(timeoutCount + errorsCount, 1), TimeUnit.MILLISECONDS) + + def shouldGiveUp(): Boolean = state match { + case Timeout => !canRetryMore(averageRequestDuration, timeoutBackoff) + case Error => !canRetryMore(averageRequestDuration, backoffOnError) || errorsCount > maxErrors + } + +} \ No newline at end of file From 733731b3001695a416a96d35e28818a577589b73 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 29 Sep 2016 21:07:48 -0300 Subject: [PATCH 110/268] Add test script --- src/main/scala/TestHttp.scala | 49 +++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 src/main/scala/TestHttp.scala diff --git a/src/main/scala/TestHttp.scala b/src/main/scala/TestHttp.scala new file mode 100644 index 00000000..355575a0 --- /dev/null +++ b/src/main/scala/TestHttp.scala @@ -0,0 +1,49 @@ + +object TestHttp extends App{ + + def goTest(): Unit = { + import java.util.concurrent.TimeUnit + + import akka.actor.{ActorRefFactory, ActorSystem} + import akka.util.Timeout + import ignition.core.http.AsyncHttpClientStreamApi._ + import ignition.core.http.AsyncSprayHttpClient + import ignition.core.utils.ExceptionUtils._ + import org.joda.time.DateTime + + import scala.concurrent.ExecutionContext.Implicits.global + import scala.concurrent.duration.Duration + import scala.io.Source + import scala.util.{Failure, Success} + def now = DateTime.now() + + val system = ActorSystem("http") + val client = new AsyncSprayHttpClient { + override implicit def actorRefFactory: ActorRefFactory = system + } + val url = "http://httpbin.org/delay/10" // "http://127.0.0.1:8081/" + val conf = RequestConfiguration(requestTimeout = Duration(12, TimeUnit.SECONDS), idleTimeout = Duration(5, 
TimeUnit.SECONDS)) + implicit val reporter = NoOpReporter + implicit val timeout = Timeout(30, TimeUnit.SECONDS) + + println(s"Starting $now") + + // Should complete ok + val request1 = client.makeRequest(Request(url, requestConfiguration = Option(conf))) + request1.onComplete { + case Success(t) => println(s"request1 finished $now with Success: ${Source.fromInputStream(t.content).mkString}") + case Failure(t) => println(s"request1 finished $now with failure: ${t.getFullStackTraceString()}") + } + + //Should time out and keep retrying + val tightConf = conf.copy(requestTimeout = Duration(3, TimeUnit.SECONDS)) + val request2 = client.makeRequest(Request(url, requestConfiguration = Option(tightConf))) + + request2.onComplete { + case Success(t) => println(s"request2 finished $now with Success: ${Source.fromInputStream(t.content).mkString}") + case Failure(t) => println(s"request2 finished $now with failure: ${t.getFullStackTraceString()}") + } + } + + goTest() +} From a1f1c38a4dce2d037fb6726c281f05aad4c7468d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 30 Sep 2016 11:39:39 -0300 Subject: [PATCH 111/268] retry on cluster health check --- tools/cluster.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index c4a2f681..5f59edad 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -24,7 +24,7 @@ import json import glob import webbrowser - +import ssl log = logging.getLogger() log.setLevel(logging.INFO) @@ -506,15 +506,23 @@ def job_attach(cluster_name, key_file=default_key_file, job_name=None, job_tag=N class NotHealthyCluster(Exception): pass @named('health-check') -def health_check(cluster_name, key_file=default_key_file, master=None, remote_user=default_remote_user, region=default_region): - master = master or get_master(cluster_name, region=region) - all_args = load_cluster_args(master, key_file, remote_user) - nslaves = int(all_args['slaves']) - 
minimum_percentage_healthy_slaves = all_args['minimum_percentage_healthy_slaves'] - masters, slaves = get_active_nodes(cluster_name, region=region) - if nslaves == 0 or float(len(slaves)) / nslaves < minimum_percentage_healthy_slaves: - raise NotHealthyCluster('Not enough healthy slaves: {0}/{1}'.format(len(slaves), nslaves)) - +def health_check(cluster_name, key_file=default_key_file, master=None, remote_user=default_remote_user, region=default_region, retries=3): + for i in range(retries): + try: + master = master or get_master(cluster_name, region=region) + all_args = load_cluster_args(master, key_file, remote_user) + nslaves = int(all_args['slaves']) + minimum_percentage_healthy_slaves = all_args['minimum_percentage_healthy_slaves'] + masters, slaves = get_active_nodes(cluster_name, region=region) + if nslaves == 0 or float(len(slaves)) / nslaves < minimum_percentage_healthy_slaves: + raise NotHealthyCluster('Not enough healthy slaves: {0}/{1}'.format(len(slaves), nslaves)) + except NotHealthyCluster, e: + raise e + except Exception, e: + log.warning("Failed to check cluster health, cluster: %s, retries %s" % (cluster_name, i), exc_info=True) + if i >= retries - 1: + log.critical("Failed to check cluster health, cluster: %s, giveup!" % (cluster_name)) + raise e class JobFailure(Exception): pass @@ -645,7 +653,7 @@ def collect(show_tail): failures += 1 last_failure = 'Unexpected response: {}'.format(output) health_check(cluster_name=cluster_name, key_file=key_file, master=master, remote_user=remote_user, region=region) - except subprocess.CalledProcessError as e: + except (subprocess.CalledProcessError, ssl.SSLError) as e: failures += 1 log.exception('Got exception') last_failure = 'Exception: {}'.format(e) From b667ce686a505930eec55a25e3a6eb105d3b260a Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Fri, 30 Sep 2016 20:05:42 -0300 Subject: [PATCH 112/268] Set host with correct port and ssl support --- src/main/scala/TestHttp.scala | 2 +- .../ignition/core/http/AsyncSprayHttpClient.scala | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/main/scala/TestHttp.scala b/src/main/scala/TestHttp.scala index 355575a0..f11f35ca 100644 --- a/src/main/scala/TestHttp.scala +++ b/src/main/scala/TestHttp.scala @@ -21,7 +21,7 @@ object TestHttp extends App{ val client = new AsyncSprayHttpClient { override implicit def actorRefFactory: ActorRefFactory = system } - val url = "http://httpbin.org/delay/10" // "http://127.0.0.1:8081/" + val url = "https://httpbin.org/delay/10" // "http://127.0.0.1:8081/" val conf = RequestConfiguration(requestTimeout = Duration(12, TimeUnit.SECONDS), idleTimeout = Duration(5, TimeUnit.SECONDS)) implicit val reporter = NoOpReporter implicit val timeout = Timeout(30, TimeUnit.SECONDS) diff --git a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala index 54247a04..c6adbbbd 100644 --- a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala +++ b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala @@ -71,7 +71,7 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { HttpRequest(method = method, uri = toUriString(request.url, params), entity = body) } - private def toSprayHostConnectorSetup(host: String, configuration: AsyncHttpClientStreamApi.RequestConfiguration): HostConnectorSetup = { + private def toSprayHostConnectorSetup(uri: Uri, configuration: AsyncHttpClientStreamApi.RequestConfiguration): HostConnectorSetup = { // Create based on defaults, change some of them val ccs: ClientConnectionSettings = ClientConnectionSettings(system) val hcs: HostConnectorSettings = HostConnectorSettings(system) @@ -90,13 +90,15 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { maxConnections = 
configuration.maxConnectionsPerHost, pipelining = configuration.pipelining ) - HostConnectorSetup(host = host, settings = Option(updatedHcs)) + + val host = uri.authority.host + HostConnectorSetup(host.toString, uri.effectivePort, sslEncryption = uri.scheme == "https", settings = Option(updatedHcs)) } private def executeSprayRequest(request: Request): Unit = request.requestConfiguration match { case Some(configuration) => - val url = new URL(request.url) - val message = (toSprayRequest(request), toSprayHostConnectorSetup(url.getHost, configuration)) + val url = Uri(request.url) + val message = (toSprayRequest(request), toSprayHostConnectorSetup(url, configuration)) IO(Http) ! message case None => IO(Http) ! toSprayRequest(request) From e0457bb692801aa477e4f4f3a5d82072340af098 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Mon, 3 Oct 2016 18:42:40 -0300 Subject: [PATCH 113/268] Allow a granular fallback to external configuration --- src/main/scala/TestHttp.scala | 4 +-- .../core/http/AsyncHttpClientStreamApi.scala | 24 +++++++++---- .../core/http/AsyncSprayHttpClient.scala | 36 +++++++++++-------- 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/src/main/scala/TestHttp.scala b/src/main/scala/TestHttp.scala index f11f35ca..901516e0 100644 --- a/src/main/scala/TestHttp.scala +++ b/src/main/scala/TestHttp.scala @@ -22,7 +22,7 @@ object TestHttp extends App{ override implicit def actorRefFactory: ActorRefFactory = system } val url = "https://httpbin.org/delay/10" // "http://127.0.0.1:8081/" - val conf = RequestConfiguration(requestTimeout = Duration(12, TimeUnit.SECONDS), idleTimeout = Duration(5, TimeUnit.SECONDS)) + val conf = RequestConfiguration(requestTimeout = Option(Duration(12, TimeUnit.SECONDS)), idleTimeout = Option(Duration(5, TimeUnit.SECONDS))) implicit val reporter = NoOpReporter implicit val timeout = Timeout(30, TimeUnit.SECONDS) @@ -36,7 +36,7 @@ object TestHttp extends App{ } //Should time out and keep retrying - val 
tightConf = conf.copy(requestTimeout = Duration(3, TimeUnit.SECONDS)) + val tightConf = conf.copy(requestTimeout = Option(Duration(3, TimeUnit.SECONDS))) val request2 = client.makeRequest(Request(url, requestConfiguration = Option(tightConf))) request2.onComplete { diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index 52d97810..5ec528ae 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -26,19 +26,29 @@ object AsyncHttpClientStreamApi { // TODO: return a stream is dangerous because implies into a lock case class StreamResponse(status: Int, content: InputStream) - case class RequestConfiguration(maxRedirects: Int = 15, - maxConnectionsPerHost: Int = 500, - pipelining: Boolean = false, - idleTimeout: Duration = Duration(30, TimeUnit.SECONDS), - requestTimeout: Duration = Duration(20, TimeUnit.SECONDS), - connectingTimeout: Duration = Duration(10, TimeUnit.SECONDS)) + // If any value is None, it will fallback to the implementation's default + object RequestConfiguration { + val defaultMaxRedirects: Int = 15 + val defaultMaxConnectionsPerHost: Int = 500 + val defaultPipelining: Boolean = false + val defaultIdleTimeout: FiniteDuration = Duration(30, TimeUnit.SECONDS) + val defaultRequestTimeout: FiniteDuration = Duration(20, TimeUnit.SECONDS) + val defaultConnectingTimeout: FiniteDuration = Duration(10, TimeUnit.SECONDS) + } + + case class RequestConfiguration(maxRedirects: Option[Int] = Option(RequestConfiguration.defaultMaxRedirects), + maxConnectionsPerHost: Option[Int] = Option(RequestConfiguration.defaultMaxConnectionsPerHost), + pipelining: Option[Boolean] = Option(RequestConfiguration.defaultPipelining), + idleTimeout: Option[Duration] = Option(RequestConfiguration.defaultIdleTimeout), + requestTimeout: Option[Duration] = 
Option(RequestConfiguration.defaultRequestTimeout), + connectingTimeout: Option[Duration] = Option(RequestConfiguration.defaultConnectingTimeout)) case class Request(url: String, params: Map[String, String] = Map.empty, credentials: Option[Credentials] = None, method: HttpMethod = HttpMethods.GET, body: HttpEntity = HttpEntity.Empty, - requestConfiguration: Option[RequestConfiguration] = Option(RequestConfiguration())) + requestConfiguration: Option[RequestConfiguration] = None) case class RequestException(message: String, response: StreamResponse) extends RuntimeException(message) diff --git a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala index c6adbbbd..a2dc312d 100644 --- a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala +++ b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala @@ -71,37 +71,44 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { HttpRequest(method = method, uri = toUriString(request.url, params), entity = body) } - private def toSprayHostConnectorSetup(uri: Uri, configuration: AsyncHttpClientStreamApi.RequestConfiguration): HostConnectorSetup = { + private def toSprayHostConnectorSetup(uri: Uri, conf: Option[AsyncHttpClientStreamApi.RequestConfiguration]): HostConnectorSetup = { // Create based on defaults, change some of them val ccs: ClientConnectionSettings = ClientConnectionSettings(system) val hcs: HostConnectorSettings = HostConnectorSettings(system) val updatedCcs = ccs.copy( responseChunkAggregationLimit = 0, // makes our client ineffective if non zero - idleTimeout = configuration.idleTimeout, - connectingTimeout = configuration.connectingTimeout, - requestTimeout = configuration.requestTimeout + idleTimeout = conf.flatMap(_.idleTimeout).getOrElse(ccs.idleTimeout), + connectingTimeout = conf.flatMap(_.connectingTimeout).getOrElse(ccs.connectingTimeout), + requestTimeout = 
conf.flatMap(_.requestTimeout).getOrElse(ccs.requestTimeout) ) + val maxConnections = conf.flatMap(_.maxConnectionsPerHost).getOrElse { + // Let's avoid someone shooting themselves in the foot + if (hcs.maxConnections == 4) // Spray's default is stupidly low + // Use the API's default, which is more reasonable + RequestConfiguration.defaultMaxConnectionsPerHost + else + // If the conf is a non-default value, then someone knows what they're doing; use that configured value + hcs.maxConnections + } + val updatedHcs = hcs.copy( connectionSettings = updatedCcs, maxRetries = 0, // We have our own retry mechanism maxRedirects = 0, // We do our own redirect following - maxConnections = configuration.maxConnectionsPerHost, - pipelining = configuration.pipelining + maxConnections = maxConnections, + pipelining = conf.flatMap(_.pipelining).getOrElse(hcs.pipelining) ) val host = uri.authority.host HostConnectorSetup(host.toString, uri.effectivePort, sslEncryption = uri.scheme == "https", settings = Option(updatedHcs)) } - private def executeSprayRequest(request: Request): Unit = request.requestConfiguration match { - case Some(configuration) => - val url = Uri(request.url) - val message = (toSprayRequest(request), toSprayHostConnectorSetup(url, configuration)) - IO(Http) ! message - case None => - IO(Http) ! toSprayRequest(request) + private def executeSprayRequest(request: Request): Unit = { + val url = Uri(request.url) + val message = (toSprayRequest(request), toSprayHostConnectorSetup(url, request.requestConfiguration)) + IO(Http) !
message } def handleErrors(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, remainingRedirects: Int): Receive = { @@ -154,7 +161,8 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { executeSprayRequest(request) val retry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, timeoutBackoff = backoff) val storage = new ByteStorage() - val maxRedirects = request.requestConfiguration.getOrElse(RequestConfiguration()).maxRedirects + val maxRedirects = + request.requestConfiguration.flatMap(_.maxRedirects).getOrElse(RequestConfiguration.defaultMaxRedirects) context.become(waitingForResponse(sender, request, retry, storage, maxRedirects) .orElse(handleErrors(sender, request, retry, storage, maxRedirects))) } From 52699436ed6e84a1f771db1574e295cf2e421e1a Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Mon, 3 Oct 2016 19:10:26 -0300 Subject: [PATCH 114/268] Move telemetry cache to core --- .../ignition/core/utils/TelemetryCache.scala | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 src/main/scala/ignition/core/utils/TelemetryCache.scala diff --git a/src/main/scala/ignition/core/utils/TelemetryCache.scala b/src/main/scala/ignition/core/utils/TelemetryCache.scala new file mode 100644 index 00000000..d86f98bc --- /dev/null +++ b/src/main/scala/ignition/core/utils/TelemetryCache.scala @@ -0,0 +1,45 @@ +package ignition.core.utils + +import ignition.core.utils.TelemetryCache.TelemetryCacheReporter +import spray.caching.Cache + +import scala.concurrent.{ExecutionContext, Future} + +object TelemetryCache { + + def apply[V](cacheName: String, wrapped: Cache[V], reporter: TelemetryCacheReporter): Cache[V] = + new TelemetryCache[V](cacheName, wrapped, reporter) + + trait TelemetryCacheReporter { + def onHit(name: String): Unit + def onMiss(name: String): Unit + } + +} + +class TelemetryCache[V](cacheName: String, wrapped: Cache[V], reporter: TelemetryCacheReporter) extends 
Cache[V] { + + override def apply(key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { + val value = wrapped.get(key) + if (value.isDefined) { + reporter.onHit(cacheName) + value.get + } else { + reporter.onMiss(cacheName) + wrapped.apply(key, genValue) + } + } + + override def get(key: Any): Option[Future[V]] = wrapped.get(key) + + override def clear(): Unit = wrapped.clear() + + override def size: Int = wrapped.size + + override def remove(key: Any): Option[Future[V]] = wrapped.remove(key) + + override def keys: Set[Any] = wrapped.keys + + override def ascendingKeys(limit: Option[Int]): Iterator[Any] = wrapped.ascendingKeys(limit) + +} From d89a08e0a07720c704803b5d20542bd8948f3207 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 19 Oct 2016 17:09:12 -0200 Subject: [PATCH 115/268] Make retry configurable --- .../core/http/AsyncHttpClientStreamApi.scala | 4 +- .../core/http/AsyncSprayHttpClient.scala | 13 +++-- src/main/scala/ignition/core/http/Retry.scala | 57 ++++++++++++------- .../scala/ignition/core/http/RetrySpec.scala | 39 +++++++++++++ 4 files changed, 87 insertions(+), 26 deletions(-) create mode 100644 src/test/scala/ignition/core/http/RetrySpec.scala diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index 5ec528ae..4910c98a 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -73,7 +73,7 @@ object AsyncHttpClientStreamApi { trait AsyncHttpClientStreamApi { - def makeRequest(request: AsyncHttpClientStreamApi.Request, initialBackoff: FiniteDuration = 100 milliseconds, retryOnHttpStatus: Seq[Int] = List.empty) - (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback = AsyncHttpClientStreamApi.NoOpReporter): Future[AsyncHttpClientStreamApi.StreamResponse] + def makeRequest(request: 
AsyncHttpClientStreamApi.Request, retryConf: RetryConf = RetryConf(), retryOnHttpStatus: Seq[Int] = List.empty) + (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback = AsyncHttpClientStreamApi.NoOpReporter): Future[AsyncHttpClientStreamApi.StreamResponse] } \ No newline at end of file diff --git a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala index a2dc312d..0565fe2f 100644 --- a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala +++ b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala @@ -29,13 +29,16 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { implicit def actorRefFactory: ActorRefFactory def executionContext: ExecutionContext = actorRefFactory.dispatcher - override def makeRequest(request: AsyncHttpClientStreamApi.Request, backoff: FiniteDuration, retryOnHttpStatus: Seq[Int]) + override def makeRequest(request: AsyncHttpClientStreamApi.Request, retryConf: RetryConf, retryOnHttpStatus: Seq[Int]) (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback): Future[AsyncHttpClientStreamApi.StreamResponse] = { - val processor = actorRefFactory.actorOf(Props(new RequestProcessorActor(timeout, reporter, backoff, retryOnHttpStatus))) + val processor = actorRefFactory.actorOf(Props(new RequestProcessorActor(timeout, reporter, retryConf, retryOnHttpStatus))) (processor ? 
request).mapTo[AsyncHttpClientStreamApi.StreamResponse] } - private class RequestProcessorActor(timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback, backoff: FiniteDuration, retryOnHttpStatus: Seq[Int]) + private class RequestProcessorActor(timeout: Timeout, + reporter: AsyncHttpClientStreamApi.ReporterCallback, + retryConf: RetryConf, + retryOnHttpStatus: Seq[Int]) extends Actor with ActorLogging { @@ -159,7 +162,7 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { log.debug("Starting request {}", request) reporter.onRequest(request) executeSprayRequest(request) - val retry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, timeoutBackoff = backoff) + val retry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, conf = retryConf) val storage = new ByteStorage() val maxRedirects = request.requestConfiguration.flatMap(_.maxRedirects).getOrElse(RequestConfiguration.defaultMaxRedirects) @@ -283,7 +286,7 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { log.debug("Making redirect to {}", newLocation) val newRequest = oldRequest.copy(url = newLocation) executeSprayRequest(newRequest) - val newRetry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, timeoutBackoff = backoff) + val newRetry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, conf = retryConf) val newStorage = new ByteStorage() waitingForResponse(commander, newRequest, newRetry, newStorage, newRemainingRedirects) .orElse(handleErrors(commander, newRequest, newRetry, newStorage, newRemainingRedirects)) diff --git a/src/main/scala/ignition/core/http/Retry.scala b/src/main/scala/ignition/core/http/Retry.scala index 03d86db6..1c94828b 100644 --- a/src/main/scala/ignition/core/http/Retry.scala +++ b/src/main/scala/ignition/core/http/Retry.scala @@ -1,12 +1,12 @@ package ignition.core.http -import java.util.Random import java.util.concurrent.TimeUnit import 
org.joda.time.DateTime import scala.concurrent.duration.{Duration, FiniteDuration, _} import scala.language.postfixOps +import scala.util.Random object Retry { @@ -14,52 +14,71 @@ object Retry { case object Timeout extends State case object Error extends State - val random = new Random - - val _maxWaitForNextRetry = 10 - - def exponentialBackOff(r: Int): FiniteDuration = { - val exponent: Double = scala.math.min(r, _maxWaitForNextRetry) - scala.math.pow(2, exponent).round * (random.nextInt(30) + 100 milliseconds) + def exponentialBackOff(base: Int, + exponent: Int, + initialBackoff: FiniteDuration, + maxBackoff: FiniteDuration, + maxRandom: FiniteDuration): FiniteDuration = { + val randomMillis = maxRandom.toMillis.toInt + val random = if (randomMillis > 0) + FiniteDuration(Random.nextInt(randomMillis), TimeUnit.MILLISECONDS) + else + FiniteDuration(0, TimeUnit.MILLISECONDS) + + val calculated = scala.math.pow(base, exponent).round * (random + initialBackoff) + calculated.min(maxBackoff) } } -case class Retry(startTime: DateTime, +case class RetryConf(initialTimeoutBackoff: FiniteDuration = 100 milliseconds, + maxErrors: Int = 10, + initialBackoffOnError: FiniteDuration = 100 milliseconds, + timeoutMultiplicationFactor: Int = 2, + errorMultiplicationFactor: Int = 2, + maxBackoff: FiniteDuration = 1 minute, + maxRandom: FiniteDuration = 30 milliseconds) + +case class Retry(conf: RetryConf, + startTime: DateTime, timeout: FiniteDuration, state: Retry.State = Retry.Timeout, timeoutCount: Int = 0, - timeoutBackoff: FiniteDuration = 100 milliseconds, - maxErrors: Int = 10, - errorsCount: Int = 0, - backoffOnError: FiniteDuration = 100 milliseconds) { + errorsCount: Int = 0) { import Retry._ + protected def now = DateTime.now + + private def errorBackoff = + exponentialBackOff(conf.errorMultiplicationFactor, Math.max(errorsCount - 1, 0), conf.initialBackoffOnError, conf.maxBackoff, conf.maxRandom) + private def timeoutBackoff = + 
exponentialBackOff(conf.timeoutMultiplicationFactor, Math.max(timeoutCount - 1, 0), conf.initialTimeoutBackoff, conf.maxBackoff, conf.maxRandom) + def onError(): Retry = - copy(errorsCount = errorsCount + 1, backoffOnError = exponentialBackOff(errorsCount + 1), state = Retry.Error) + copy(errorsCount = errorsCount + 1, state = Retry.Error) - def onTimeout(): Retry = copy(timeoutCount = timeoutCount + 1, timeoutBackoff = exponentialBackOff(timeoutCount + 1), state = Retry.Timeout) + def onTimeout(): Retry = copy(timeoutCount = timeoutCount + 1, state = Retry.Timeout) def backoff(): FiniteDuration = state match { case Timeout => timeoutBackoff - case Error => backoffOnError + case Error => errorBackoff } private def canRetryMore(durations: FiniteDuration*): Boolean = { val maxTime = startTime.plusMillis(timeout.toMillis.toInt) - val nextEstimatedTime = DateTime.now.plusMillis(durations.map(_.toMillis.toInt).sum) + val nextEstimatedTime = now.plusMillis(durations.map(_.toMillis.toInt).sum) nextEstimatedTime.isBefore(maxTime) } // This is an approximation and we are ignoring the time waiting on backoff. 
// In this way we are overestimating the average request duration, which is fine because it's better to abort early than wait too much time exceed AskTimeouts private def averageRequestDuration = - Duration((DateTime.now.getMillis - startTime.getMillis) / Math.max(timeoutCount + errorsCount, 1), TimeUnit.MILLISECONDS) + Duration((now.getMillis - startTime.getMillis) / Math.max(timeoutCount + errorsCount, 1), TimeUnit.MILLISECONDS) def shouldGiveUp(): Boolean = state match { case Timeout => !canRetryMore(averageRequestDuration, timeoutBackoff) - case Error => !canRetryMore(averageRequestDuration, backoffOnError) || errorsCount > maxErrors + case Error => !canRetryMore(averageRequestDuration, errorBackoff) || errorsCount > conf.maxErrors } } \ No newline at end of file diff --git a/src/test/scala/ignition/core/http/RetrySpec.scala b/src/test/scala/ignition/core/http/RetrySpec.scala new file mode 100644 index 00000000..88528568 --- /dev/null +++ b/src/test/scala/ignition/core/http/RetrySpec.scala @@ -0,0 +1,39 @@ +package ignition.core.http + +import org.joda.time.DateTime +import org.scalatest.{FlatSpec, Matchers} + +import scala.concurrent.duration._ + +class RetrySpec extends FlatSpec with Matchers { + "Retry" should "return the initial backoff" in { + val now = DateTime.now + val timeout = 60.seconds + + val retry = Retry(RetryConf(initialBackoffOnError = 123.milliseconds, initialTimeoutBackoff = 456.milliseconds, maxRandom = 0.seconds), now, timeout) + + retry.onError().backoff() shouldBe 123.millisecond + retry.onTimeout().backoff() shouldBe 456.millisecond + } + + it should "multiply by the factor on second time" in { + + val now = DateTime.now + val timeout = 60.seconds + + val retry = Retry(RetryConf(initialBackoffOnError = 123.milliseconds, initialTimeoutBackoff = 456.milliseconds, maxRandom = 0.seconds, timeoutMultiplicationFactor = 3, errorMultiplicationFactor = 5), now, timeout) + + retry.onError().onError().backoff() shouldBe (123 * 5).millisecond + 
retry.onTimeout().onTimeout().backoff() shouldBe (456 * 3).millisecond + } + + it should "not explode if called with no errors or timeouts" in { + val now = DateTime.now + val timeout = 60.seconds + + val retry = Retry(RetryConf(maxRandom = 0.seconds), now, timeout) + + retry.backoff() shouldBe 100.milliseconds + } + +} From a7ee8d415127420babb85324e028ca0f215390a5 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Fri, 21 Oct 2016 16:53:46 -0200 Subject: [PATCH 116/268] Multiple level cache: local with remote fallback --- .../core/cache/MultipleLevelCache.scala | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 src/main/scala/ignition/core/cache/MultipleLevelCache.scala diff --git a/src/main/scala/ignition/core/cache/MultipleLevelCache.scala b/src/main/scala/ignition/core/cache/MultipleLevelCache.scala new file mode 100644 index 00000000..83a5c0b8 --- /dev/null +++ b/src/main/scala/ignition/core/cache/MultipleLevelCache.scala @@ -0,0 +1,152 @@ +package ignition.core.cache + +import ignition.core.utils.FutureUtils._ + +import scala.concurrent.{ExecutionContext, Future} +import scala.util.{Failure, Success, Try} + +trait SimpleCache[V] { + def apply(key: Any, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] +} + +trait LocalCache[V] extends SimpleCache[V] { + def get(key: Any): Option[Future[V]] + def set(key: Any, value: Try[V]): Boolean +} + +trait RemoteWritableCache[V] { + def set(key: Any, value: Try[V])(implicit ec: ExecutionContext): Future[Boolean] +} + +trait RemoteReadableCache[V] { + def get(key: Any)(implicit ec: ExecutionContext): Future[Option[V]] +} + +trait RemoteCacheRW[V] extends SimpleCache[V] with RemoteReadableCache[V] with RemoteWritableCache[V] + + +case class LocalAsRemote[V](local: LocalCache[V]) extends RemoteCacheRW[V] { + override def get(key: Any)(implicit ec: ExecutionContext): Future[Option[V]] = + local.get(key).map(_.map(Option.apply)).getOrElse(Future.successful(None)) + 
+ override def set(key: Any, value: Try[V])(implicit ec: ExecutionContext): Future[Boolean] = + Future.successful(local.set(key, value)) + + override def apply(key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = + apply(key, genValue) +} + +case class MultipleLevelCache[V](localCache: LocalCache[V], + remoteRW: List[RemoteCacheRW[V]], + remoteReadOnly: List[RemoteReadableCache[V]]) extends SimpleCache[V] { + val allReadableCaches: Array[RemoteReadableCache[V]] = + (LocalAsRemote(localCache) +: (remoteRW ++ remoteReadOnly)).toArray + + // This can be called by multiple instances simultaneously but in the end + // only the one that wins the race will create the final value that will be set in + // the remote caches and read by the other instances + // Unless of course there is some error getting stuff from remote cache + // in which case the local value may be returned + def canonicalValueGenerator(key: Any, genValue: () => Future[V])() = { + val fLocalValue = genValue() + val finalValue: Future[V] = fLocalValue.asTry().flatMap { + case tLocalValue @ Success(localValue) => + // Successfully generated value, try to set it in the first remote Writable cache + remoteRW match { + // No remote cache available, just return this value to be set on local cache + case Nil => + Future.successful(localValue) + // We have at least one remote RW cache + case first :: others => + first.set(key, tLocalValue).asTry().flatMap { + case Success(true) => + // Successfully inserted on first remote store, propagate value to other remote rw caches + // We do it in a fire and forget approach, we only guarantee the data is in the first cache + others.foreach(_.set(key, tLocalValue)) + // Return this value to be set on the local cache + Future.successful(localValue) + case Success(false) => + // There is already a value there, we lost the race, ours won't be the canonical one, try to get it + first.get(key).asTry().flatMap { + case Success(Some(remoteValue)) => + 
// Just return it + Future.successful(remoteValue) + case Success(None) => + // WTF? the set operation said it was there but now the value disappeared?! + // So return our value which is good and hope for the best + // TODO: generate metric and log here + Future.successful(localValue) + case Failure(_) => + // Oh noes, we failed to get the canonical value + // We are supposing any retries have already been done by the cache implementation + // So return our value which is good and hope for the best + // TODO: generate metric and log here + Future.successful(localValue) + } + case Failure(_) => + // Oh noes, we failed to set the canonical value + // We are supposing any retries have already been done by the cache implementation + // So return our value which is good and hope for the best + // TODO: generate metric and log here + Future.successful(localValue) + } + } + case Failure(eLocal) => + // We failed to generate the value ourselves, our only hope is if someone else successfully did it in the meantime + remoteRW match { + case Nil => + // There are no remote RW caches + // FIXME: perhaps try the read only caches (but we can just be wasting time doing that) + // TODO: generate metric and log here + Future.failed(eLocal) + case first :: others => + first.get(key).asTry().flatMap { + case Success(Some(remoteValue)) => + // Hooray, someone calculated and set the value, return it + Future.successful(remoteValue) + case Success(None) => + // Sadly, there is no value on remote, we failed! 
+ // FIXME: perhaps try other caches (but we can just be wasting time doing that) + // TODO: generate metric and log here + Future.failed(eLocal) + case Failure(eRemote) => + // Oh noes, this failed + // We are supposing any retries have already been done by the cache implementation + // And to make things worse, we don't have a good value + // So return a failure + // FIXME: perhaps try other caches (but we can just be wasting time doing that) + // TODO: generate metric and log here + Future.failed(eLocal) + } + } + } + finalValue + } + + def indexedApply(index: Int, key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { + if (index >= allReadableCaches.size) { // nothing found on our caches, calculate value + // We could generate the value then set on local cache, but calling apply guarantees + // canonicalValueGenerator will be called only once in this instance (supposing LocalCache works like Spray Cache) + localCache(key, canonicalValueGenerator(key, genValue)) + } else { + allReadableCaches(index).get(key).asTry().flatMap { + case Success(None) => + // Try next cache + indexedApply(index + 1, key, genValue) + case Success(Some(value)) => + Future.successful(value) + case Failure(e) => + // Oh noes, this failed + // We are supposing any retries have already been done by the cache implementation + // So try the next one, we don't have many options + // TODO: generate metric and log here + indexedApply(index + 1, key, genValue) + } + } + } + + override def apply(key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = + indexedApply(0, key, genValue) +} + +case class ExpiringRedisWithAsyncUpdate() \ No newline at end of file From e07fe16fe0d78fa91fa0ecc2f4964056379dc8c6 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Wed, 26 Oct 2016 15:56:52 -0200 Subject: [PATCH 117/268] Now it implements async updates --- .../core/cache/MultipleLevelCache.scala | 326 ++++++++++++------ .../ignition/core/utils/FutureUtils.scala | 13 + 2 files changed, 232 insertions(+), 107 deletions(-) diff --git a/src/main/scala/ignition/core/cache/MultipleLevelCache.scala b/src/main/scala/ignition/core/cache/MultipleLevelCache.scala index 83a5c0b8..6747b4cf 100644 --- a/src/main/scala/ignition/core/cache/MultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/MultipleLevelCache.scala @@ -1,120 +1,187 @@ package ignition.core.cache +import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap import ignition.core.utils.FutureUtils._ +import org.joda.time.{DateTime, Period} +import org.slf4j.LoggerFactory -import scala.concurrent.{ExecutionContext, Future} +import scala.concurrent.duration._ +import scala.concurrent.{ExecutionContext, Future, Promise} import scala.util.{Failure, Success, Try} -trait SimpleCache[V] { - def apply(key: Any, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] -} -trait LocalCache[V] extends SimpleCache[V] { - def get(key: Any): Option[Future[V]] - def set(key: Any, value: Try[V]): Boolean -} +object ExpiringMultipleLevelCache { + case class TimestampedValue[V](date: DateTime, value: V) { + def hasExpired(ttl: Period, now: DateTime): Boolean = { + date.plus(ttl).isBefore(now) + } + } -trait RemoteWritableCache[V] { - def set(key: Any, value: Try[V])(implicit ec: ExecutionContext): Future[Boolean] -} + trait GenericCache[V] { + def apply(key: String, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] + } -trait RemoteReadableCache[V] { - def get(key: Any)(implicit ec: ExecutionContext): Future[Option[V]] -} + trait LocalCache[V] extends GenericCache[V] { + def get(key: Any): Option[Future[V]] + def set(key: Any, value: Try[V]): Unit + } -trait RemoteCacheRW[V] extends SimpleCache[V] with RemoteReadableCache[V] 
with RemoteWritableCache[V] + trait RemoteWritableCache[V] { + def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] + def setLock(key: String, ttl: FiniteDuration): Future[Boolean] + } + trait RemoteReadableCache[V] { + def get(key: String)(implicit ec: ExecutionContext): Future[Option[V]] + } -case class LocalAsRemote[V](local: LocalCache[V]) extends RemoteCacheRW[V] { - override def get(key: Any)(implicit ec: ExecutionContext): Future[Option[V]] = - local.get(key).map(_.map(Option.apply)).getOrElse(Future.successful(None)) + trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] - override def set(key: Any, value: Try[V])(implicit ec: ExecutionContext): Future[Boolean] = - Future.successful(local.set(key, value)) + trait ReporterCallback { + def onError(key: String, t: Throwable): Unit + def onRemoteGiveup(key: String): Unit + } + + object NoOpReporter extends ReporterCallback { + def onError(key: String, t: Throwable): Unit = {} + def onRemoteGiveup(key: String): Unit = {} + } - override def apply(key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = - apply(key, genValue) } -case class MultipleLevelCache[V](localCache: LocalCache[V], - remoteRW: List[RemoteCacheRW[V]], - remoteReadOnly: List[RemoteReadableCache[V]]) extends SimpleCache[V] { - val allReadableCaches: Array[RemoteReadableCache[V]] = - (LocalAsRemote(localCache) +: (remoteRW ++ remoteReadOnly)).toArray + +import ExpiringMultipleLevelCache._ + + + +case class ExpiringMultipleLevelCache[V](ttl: Period, + localCache: LocalCache[TimestampedValue[V]], + remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, + reporter: ExpiringMultipleLevelCache.ReporterCallback = ExpiringMultipleLevelCache.NoOpReporter, + maxErrorsToRetryOnRemote: Int = 5) extends GenericCache[V] { + + private val logger = LoggerFactory.getLogger(getClass) + + private val tempUpdate = new ConcurrentLinkedHashMap.Builder[Any, 
Future[TimestampedValue[V]]].build() + + protected def now = DateTime.now + + private def timestamp(v: V) = TimestampedValue(now, v) + + private def remoteLockKey(key: Any) = s"$key-emlc-lock" + + private val remoteLockTTL = 10.seconds + + // This method tries to guarantee that everyone that calls it in + // a given moment will be left with the same value in the end + private def remoteSetOrGet(key: String, + calculatedValue: TimestampedValue[V], + remote: RemoteCacheRW[TimestampedValue[V]], + currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + if (currentRetry > maxErrorsToRetryOnRemote) { + reporter.onRemoteGiveup(key) + // TODO: generate metric and log here + // Use our calculated value as it's the best we can do + Future.successful(calculatedValue) + } else { + remote.setLock(remoteLockKey(key), remoteLockTTL).asTry().flatMap { + case Success(true) => + // Lock acquired, get the current value and replace it + remote.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + // Current value is good, just return it + Future.successful(remoteValue) + case Success(_) => + // The remote value is missing or has expired + // We have the lock to replace this value. Our calculated value will be the canonical one!
+ remote.set(key, calculatedValue).asTry().flatMap { + case Success(_) => + // Flawless victory + Future.successful(calculatedValue) + case Failure(e) => + // TODO: generate metric and log here + // Retry failure + remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + } + case Failure(_) => + // TODO: generate metric and log here + // Retry failure + remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + } + case Success(false) => + // Someone got the lock, let's take a look at the value + remote.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + // Current value is good, just return it + Future.successful(remoteValue) + case Success(_) => + // The value is missing or has expired + // Let's start from scratch because we need to be able to set or get a good value + // Note: do not increment retry because this isn't an error + remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry) + case Failure(e) => + // TODO: generate metric and log here + // Retry + remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + } + case Failure(_) => + // TODO: generate metric and log here + // Retry failure + remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + } + } + } + + private def remoteGetWithRetryOnError(key: String, + remote: RemoteCacheRW[TimestampedValue[V]], + currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + remote.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + Future.successful(remoteValue) + case Success(_) => + Future.failed(new Exception("No good value found on remote")) + case Failure(e) => + if (currentRetry >= maxErrorsToRetryOnRemote) { + // TODO: generate metric and log here + Future.failed(e) + } else { + // Retry + remoteGetWithRetryOnError(key, remote, currentRetry = currentRetry + 1) + } + } + } 
// This can be called by multiple instances simultaneously but in the end // only the one that wins the race will create the final value that will be set in // the remote caches and read by the other instances // Unless of course there is some error getting stuff from remote cache - // in which case the local value may be returned - def canonicalValueGenerator(key: Any, genValue: () => Future[V])() = { - val fLocalValue = genValue() - val finalValue: Future[V] = fLocalValue.asTry().flatMap { - case tLocalValue @ Success(localValue) => - // Successfully generated value, try to set it in the first remote Writable cache + // in which case the locally generated value may be returned + protected def canonicalValueGenerator(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext) = { + val fGeneratedValue = Try { genValue().map(timestamp) }.asFutureTry() + val finalValue: Future[TimestampedValue[V]] = fGeneratedValue.flatMap { + case Success(generatedValue) => + // Successfully generated value, try to set it in the remote writable cache remoteRW match { // No remote cache available, just return this value to be set on local cache - case Nil => - Future.successful(localValue) - // We have at least one remote RW cache - case first :: others => - first.set(key, tLocalValue).asTry().flatMap { - case Success(true) => - // Successfully inserted on first remote store, propagate value to other remote rw caches - // We do it in a fire and forget approach, we only guarantee the data is in the first cache - others.foreach(_.set(key, tLocalValue)) - // Return this value to be set on the local cache - Future.successful(localValue) - case Success(false) => - // There is already a value there, we lost the race, ours won't be the canonical one, try to get it - first.get(key).asTry().flatMap { - case Success(Some(remoteValue)) => - // Just return it - Future.successful(remoteValue) - case Success(None) => - // WTF? 
the set operation said it was there but now the value disappeared?! - // So return our value which is good and hope for the best - // TODO: generate metric and log here - Future.successful(localValue) - case Failure(_) => - // Oh noes, we failed to get the canonical value - // We are supposing any retries have already been done by the cache implementation - // So return our value which is good and hope for the best - // TODO: generate metric and log here - Future.successful(localValue) - } - case Failure(_) => - // Oh noes, we failed to set the canonical value - // We are supposing any retries have already been done by the cache implementation - // So return our value which is good and hope for the best - // TODO: generate metric and log here - Future.successful(localValue) - } + case None => + Future.successful(generatedValue) + case Some(remote) => + remoteSetOrGet(key, generatedValue, remote) } case Failure(eLocal) => - // We failed to generate the value ourselves, our only hope is if someone else successfully did it in the meantime + // We failed to generate the value ourselves, our hope is if someone else successfully did it in the meantime remoteRW match { - case Nil => + case None => // There are no remote RW caches - // FIXME: perhaps try the read only caches (but we can just be wasting time doing that) // TODO: generate metric and log here Future.failed(eLocal) - case first :: others => - first.get(key).asTry().flatMap { - case Success(Some(remoteValue)) => - // Hooray, someone calculated and set the value, return it - Future.successful(remoteValue) - case Success(None) => - // Sadly, there is no value on remote, we failed! 
- // FIXME: perhaps try other caches (but we can just be wasting time doing that) + case Some(remote) => + remoteGetWithRetryOnError(key, remote).asTry().flatMap { + case Success(v) => // TODO: generate metric and log here - Future.failed(eLocal) + Future.successful(v) case Failure(eRemote) => - // Oh noes, this failed - // We are supposing any retries have already been done by the cache implementation - // And to make things worse, we don't have a good value - // So return a failure - // FIXME: perhaps try other caches (but we can just be wasting time doing that) + // The real error is the eLocal, return it // TODO: generate metric and log here Future.failed(eLocal) } @@ -123,30 +190,75 @@ case class MultipleLevelCache[V](localCache: LocalCache[V], finalValue } - def indexedApply(index: Int, key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { - if (index >= allReadableCaches.size) { // nothing found on our caches, calculate value - // We could generate the value then set on local cache, but calling apply guarantees - // canonicalValueGenerator will be called only once in this instance (supposing LocalCache works like Spray Cache) - localCache(key, canonicalValueGenerator(key, genValue)) - } else { - allReadableCaches(index).get(key).asTry().flatMap { - case Success(None) => - // Try next cache - indexedApply(index + 1, key, genValue) - case Success(Some(value)) => - Future.successful(value) - case Failure(e) => - // Oh noes, this failed - // We are supposing any retries have already been done by the cache implementation - // So try the next one, we don't have many options - // TODO: generate metric and log here - indexedApply(index + 1, key, genValue) - } + // Note: this method may return a failed future, but it will never cache it + private def tryGenerateAndSet(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + val promise = Promise[TimestampedValue[V]]() + 
tempUpdate.putIfAbsent(key, promise.future) match { + case null => + canonicalValueGenerator(key, genValue).onComplete { + case Success(v) => + localCache.set(key, Success(v)) + promise.trySuccess(v) + tempUpdate.remove(key) + case Failure(e) => + // Note: we don't save failures to cache + promise.tryFailure(e) + tempUpdate.remove(key) + } + promise.future + case fTrying => fTrying } } - override def apply(key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = - indexedApply(0, key, genValue) + override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = + localCache.get(key).map(_.asTry()) match { + case Some(future) => + future.flatMap { + case Success(localValue) if !localValue.hasExpired(ttl, now) => + // We have locally a good value, just return it + Future.successful(localValue.value) + case Success(expiredLocalValue) if remoteRW.nonEmpty => + // We have locally an expired value, but we can check a remote cache for better value + remoteRW.get.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + // Remote is good, set locally and return it + localCache.set(key, Success(remoteValue)) + Future.successful(remoteValue.value) + case Success(Some(_)) | Success(None) => + // No good remote, return local, async update both + tryGenerateAndSet(key, genValue) + Future.successful(expiredLocalValue.value) + case Failure(e) => + // TODO: log, generate metrics + tryGenerateAndSet(key, genValue) + Future.successful(expiredLocalValue.value) + } + case Success(expiredLocalValue) if remoteRW.isEmpty => + tryGenerateAndSet(key, genValue) + Future.successful(expiredLocalValue.value) + case Failure(e) => + // This is almost impossible to happen because it's local and we don't save failed values + // TODO: log, generate metrics + tryGenerateAndSet(key, genValue).map(_.value) + } + case None if remoteRW.nonEmpty => + // No local, let's try remote + 
remoteRW.get.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + // Remote is good, set locally and return it + localCache.set(key, Success(remoteValue)) + Future.successful(remoteValue.value) + case Success(Some(_)) | Success(None) => + // No good remote, sync generate + tryGenerateAndSet(key, genValue).map(_.value) + case Failure(e) => + // TODO: log, generate metrics + tryGenerateAndSet(key, genValue).map(_.value) + } + case None if remoteRW.isEmpty => + // No local and no remote to look, just generate it + tryGenerateAndSet(key, genValue).map(_.value) + } } case class ExpiringRedisWithAsyncUpdate() \ No newline at end of file diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 55853826..684c950b 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -11,6 +11,7 @@ object FutureUtils { def blockingFuture[T](body: =>T)(implicit ec: ExecutionContext): Future[T] = Future { blocking { body } } + implicit class FutureImprovements[V](future: Future[V]) { def toOptionOnFailure(errorHandler: (Throwable) => Option[V])(implicit ec: ExecutionContext): Future[Option[V]] = { future.map(Option.apply).recover { case t => errorHandler(t) } @@ -37,6 +38,18 @@ object FutureUtils { } } + implicit class TryFutureImprovements[V](future: Try[Future[V]]) { + // Works like asTry(), but will also wrap the outer Try inside the Future + def asFutureTry()(implicit ec: ExecutionContext): Future[Try[V]] = { + future match { + case Success(f) => + f.asTry() + case Failure(e) => + Future.successful(Failure(e)) + } + } + } + implicit class FutureGeneratorImprovements[V](generator: Iterable[() => Future[V]]){ def toLazyIterable(batchSize: Int = 1)(implicit ec: ExecutionContext): Iterable[Future[V]] = new Iterable[Future[V]] { override def iterator = new Iterator[Future[V]] { From 
5018bf9b951128e137897d05ecad00989238990c Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 26 Oct 2016 19:35:35 -0200 Subject: [PATCH 118/268] Added spray cache and minor improvements --- ...scala => ExpiringMultipleLevelCache.scala} | 377 +++++++++++------- .../scala/spray/cache/ExpiringLruCache.scala | 139 +++++++ 2 files changed, 367 insertions(+), 149 deletions(-) rename src/main/scala/ignition/core/cache/{MultipleLevelCache.scala => ExpiringMultipleLevelCache.scala} (54%) create mode 100644 src/main/scala/spray/cache/ExpiringLruCache.scala diff --git a/src/main/scala/ignition/core/cache/MultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala similarity index 54% rename from src/main/scala/ignition/core/cache/MultipleLevelCache.scala rename to src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index 6747b4cf..5469e308 100644 --- a/src/main/scala/ignition/core/cache/MultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -2,9 +2,11 @@ package ignition.core.cache import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap import ignition.core.utils.FutureUtils._ +import ignition.core.utils.DateUtils._ import org.joda.time.{DateTime, Period} import org.slf4j.LoggerFactory + import scala.concurrent.duration._ import scala.concurrent.{ExecutionContext, Future, Promise} import scala.util.{Failure, Success, Try} @@ -23,7 +25,7 @@ object ExpiringMultipleLevelCache { trait LocalCache[V] extends GenericCache[V] { def get(key: Any): Option[Future[V]] - def set(key: Any, value: Try[V]): Unit + def set(key: Any, value: V): Unit } trait RemoteWritableCache[V] { @@ -38,22 +40,42 @@ object ExpiringMultipleLevelCache { trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] trait ReporterCallback { - def onError(key: String, t: Throwable): Unit - def onRemoteGiveup(key: String): Unit + def onCacheMissNothingFound() + 
def onCacheMissButFoundExpiredLocal() + def onCacheMissButFoundExpiredRemote() + def onRemoteCacheHit() + def onLocalCacheHit() + def onUnexpectedBehaviour() + def onStillTryingToLockOrGet() + def onSuccessfullyRemoteSetValue() + def onRemoteCacheHitAfterGenerating() + def onErrorGeneratingValue(key: String, eLocal: Throwable) + def onLocalError(key: String, e: Throwable) + def onRemoteError(key: String, t: Throwable): Unit + def onRemoteGiveUp(key: String): Unit } object NoOpReporter extends ReporterCallback { - def onError(key: String, t: Throwable): Unit = {} - def onRemoteGiveup(key: String): Unit = {} + override def onCacheMissNothingFound(): Unit = {} + override def onUnexpectedBehaviour(): Unit = {} + override def onSuccessfullyRemoteSetValue(): Unit = {} + override def onRemoteError(key: String, t: Throwable): Unit = {} + override def onRemoteGiveUp(key: String): Unit = {} + override def onLocalError(key: String, e: Throwable): Unit = {} + override def onErrorGeneratingValue(key: String, eLocal: Throwable): Unit = {} + override def onRemoteCacheHitAfterGenerating(): Unit = {} + override def onCacheMissButFoundExpiredRemote(): Unit = {} + override def onStillTryingToLockOrGet(): Unit = {} + override def onLocalCacheHit(): Unit = {} + override def onRemoteCacheHit(): Unit = {} + override def onCacheMissButFoundExpiredLocal(): Unit = {} } - } import ExpiringMultipleLevelCache._ - case class ExpiringMultipleLevelCache[V](ttl: Period, localCache: LocalCache[TimestampedValue[V]], remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, @@ -70,7 +92,182 @@ case class ExpiringMultipleLevelCache[V](ttl: Period, private def remoteLockKey(key: Any) = s"$key-emlc-lock" - private val remoteLockTTL = 10.seconds + private val remoteLockTTL = 5.seconds + + + // The idea is simple, have two caches: remote and local + // with values that will eventually expire but still be left on the cache + // while a new value is asynchronously being calculated/retrieved + override 
def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = + // The local cache is always the first try. We'll only look the remote if the local value is missing or has expired + localCache.get(key).map(_.asTry()) match { + case Some(future) => + future.flatMap { + case Success(localValue) if !localValue.hasExpired(ttl, now) => + // We have locally a good value, just return it + reporter.onLocalCacheHit() + Future.successful(localValue.value) + case Success(expiredLocalValue) if remoteRW.nonEmpty => + // We have locally an expired value, but we can check a remote cache for better value + remoteRW.get.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + // Remote is good, set locally and return it + reporter.onRemoteCacheHit() + localCache.set(key, remoteValue) + Future.successful(remoteValue.value) + case Success(Some(expiredRemote)) => + // Expired local and expired remote, return the most recent of them, async update both + reporter.onCacheMissButFoundExpiredRemote() + tryGenerateAndSet(key, genValue) + val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) + Future.successful(mostRecent.value) + case Success(None) => + // No remote found, return local, async update both + reporter.onCacheMissButFoundExpiredLocal() + tryGenerateAndSet(key, genValue) + Future.successful(expiredLocalValue.value) + case Failure(e) => + reporter.onRemoteError(key, e) + logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key expired local value and failed to get remote", e) + tryGenerateAndSet(key, genValue) + Future.successful(expiredLocalValue.value) + } + case Success(expiredLocalValue) if remoteRW.isEmpty => + // There is no remote cache configured, we'are on our own + // Return expired value and try to generate a new one for the future + reporter.onCacheMissButFoundExpiredLocal() + tryGenerateAndSet(key, genValue) + Future.successful(expiredLocalValue.value) + case Failure(e) => + 
// This is almost impossible to happen because it's local and we don't save failed values + reporter.onLocalError(key, e) + logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key got a failed future from cache!? This is almost impossible!", e) + tryGenerateAndSet(key, genValue).map(_.value) + } + case None if remoteRW.nonEmpty => + // No local, let's try remote + remoteRW.get.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + // Remote is good, set locally and return it + reporter.onRemoteCacheHit() + localCache.set(key, remoteValue) + Future.successful(remoteValue.value) + case Success(Some(expiredRemote)) => + // Expired remote, return the it, async update + reporter.onCacheMissButFoundExpiredRemote() + tryGenerateAndSet(key, genValue).map(_.value) + Future.successful(expiredRemote.value) + case Success(None) => + // No good remote, sync generate + reporter.onCacheMissNothingFound() + tryGenerateAndSet(key, genValue).map(_.value) + case Failure(e) => + reporter.onRemoteError(key, e) + logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key expired local value and no remote configured", e) + tryGenerateAndSet(key, genValue).map(_.value) + } + case None if remoteRW.isEmpty => + // No local and no remote to look, just generate it + // The caller will need to wait for the value generation + reporter.onCacheMissNothingFound() + tryGenerateAndSet(key, genValue).map(_.value) + } + + // Note: this method may return a failed future, but it will never cache it + // Our main purpose here is to avoid multiple local calls to generate new promises/futures in parallel, + // so we use this Map keep everyone in sync + // This is similar to how spray cache works + private def tryGenerateAndSet(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + val promise = Promise[TimestampedValue[V]]() + tempUpdate.putIfAbsent(key, promise.future) match { + case null => + 
canonicalValueGenerator(key, genValue).onComplete { + case Success(v) if !v.hasExpired(ttl, now) => + localCache.set(key, v) + promise.trySuccess(v) + tempUpdate.remove(key) + case Success(v) => + // Have we generated/got an expired value!? + reporter.onUnexpectedBehaviour() + logger.warn(s"tryGenerateAndSet, key $key: unexpectedly generated/got an expired value: $v") + localCache.set(key, v) + promise.trySuccess(v) + tempUpdate.remove(key) + case Failure(e) => + // We don't save failures to cache + // There is no need to log here, canonicalValueGenerator will log everything already + promise.tryFailure(e) + tempUpdate.remove(key) + } + promise.future + case fTrying => + // If someone call us while a future is running, we return the running future + fTrying + } + } + + // This can be called by multiple instances/hosts simultaneously but in the end + // only the one that wins the race will create the final value that will be set in + // the remote cache and read by the other instances + // Unless of course there is some error getting stuff from remote cache + // in which case the locally generated value may be returned to avoid further delays + protected def canonicalValueGenerator(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext) = { + val fGeneratedValue = Try { genValue().map(timestamp) }.asFutureTry() + val finalValue: Future[TimestampedValue[V]] = fGeneratedValue.flatMap { + case Success(generatedValue) => + // Successfully generated value, try to set it in the remote writable cache + remoteRW match { + // No remote cache available, just return this value to be set on local cache + case None => + Future.successful(generatedValue) + case Some(remote) => + remoteSetOrGet(key, generatedValue, remote) + } + case Failure(eLocal) => + // We failed to generate the value ourselves, our hope is if someone else successfully did it in the meantime + reporter.onErrorGeneratingValue(key, eLocal) + remoteRW match { + case None => + // There are no remote 
RW caches + logger.error(s"canonicalValueGenerator, key $key: failed to generate value and no remote cache configured", eLocal) + Future.failed(eLocal) + case Some(remote) => + remoteGetNonExpiredValue(key, remote).asTry().flatMap { + case Success(v) => + logger.warn(s"canonicalValueGenerator, key $key: failed to generate value but got one from remote", eLocal) + Future.successful(v) + case Failure(eRemote) => + // The real error is the eLocal, return it + logger.error(s"canonicalValueGenerator, key $key: failed to generate value and failed to get remote", eLocal) + Future.failed(eLocal) + } + } + } + finalValue + } + + // Auxiliary method, only makes sense to be used by canonicalValueGenerator + private def remoteGetNonExpiredValue(key: String, + remote: RemoteCacheRW[TimestampedValue[V]], + currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + remote.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + Future.successful(remoteValue) + case Success(_) => + Future.failed(new Exception("No good value found on remote")) + case Failure(e) => + if (currentRetry >= maxErrorsToRetryOnRemote) { + reporter.onRemoteGiveUp(key) + logger.error(s"remoteGetWithRetryOnError, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors", e) + Future.failed(e) + } else { + reporter.onRemoteError(key, e) + logger.warn(s"remoteGetWithRetryOnError, key $key: got error trying to get value, retry $currentRetry of $maxErrorsToRetryOnRemote", e) + // Retry + remoteGetNonExpiredValue(key, remote, currentRetry = currentRetry + 1) + } + } + } // This methods tries to guarantee that everyone that calls it in // a given moment will be left with the same value in the end @@ -79,32 +276,39 @@ case class ExpiringMultipleLevelCache[V](ttl: Period, remote: RemoteCacheRW[TimestampedValue[V]], currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { if 
(currentRetry > maxErrorsToRetryOnRemote) { - reporter.onRemoteGiveup(key) - // TODO: generate metric and log here // Use our calculated value as it's the best we can do + reporter.onRemoteGiveUp(key) + logger.error(s"remoteSetOrGet, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors") Future.successful(calculatedValue) } else { remote.setLock(remoteLockKey(key), remoteLockTTL).asTry().flatMap { case Success(true) => + logger.info(s"remoteSetOrGet got lock for key $key") // Lock acquired, get the current value and replace it remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it + reporter.onRemoteCacheHitAfterGenerating() + logger.info(s"remoteSetOrGet got lock for $key but found already a good value on remote") Future.successful(remoteValue) case Success(_) => - // The remote value is missing or has expired + // The remote value is missing or has expired. This is what we were expecting // We have the lock to replace this value. Our calculated value will be the canonical one! remote.set(key, calculatedValue).asTry().flatMap { case Success(_) => - // Flawless victory + // Flawless victory! 
+ reporter.onSuccessfullyRemoteSetValue() + logger.info(s"remoteSetOrGet successfully set key $key while under lock") Future.successful(calculatedValue) case Failure(e) => - // TODO: generate metric and log here + reporter.onRemoteError(key, e) + logger.warn(s"remoteSetOrGet, key $key: got error setting the value, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry failure remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) } - case Failure(_) => - // TODO: generate metric and log here + case Failure(e) => + reporter.onRemoteError(key, e) + logger.warn(s"remoteSetOrGet, key $key: got error getting remote value with lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry failure remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) } @@ -113,152 +317,27 @@ case class ExpiringMultipleLevelCache[V](ttl: Period, remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it + reporter.onRemoteCacheHitAfterGenerating() Future.successful(remoteValue) case Success(_) => // The value is missing or has expired // Let's start from scratch because we need to be able to set or get a good value // Note: do not increment retry because this isn't an error + reporter.onStillTryingToLockOrGet() + logger.info(s"remoteSetOrGet couldn't lock key $key and didn't found good value on remote") remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry) case Failure(e) => - // TODO: generate metric and log here + reporter.onRemoteError(key, e) + logger.warn(s"remoteSetOrGet, key $key: got error getting remote value without lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) } - case Failure(_) => - // TODO: generate metric and log here + case Failure(e) => // Retry failure + reporter.onRemoteError(key, e) + 
logger.warn(s"remoteSetOrGet, key $key: got error trying to set lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) } } } - - private def remoteGetWithRetryOnError(key: String, - remote: RemoteCacheRW[TimestampedValue[V]], - currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { - remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => - Future.successful(remoteValue) - case Success(_) => - Future.failed(new Exception("No good value found on remote")) - case Failure(e) => - if (currentRetry >= maxErrorsToRetryOnRemote) { - // TODO: generate metric and log here - Future.failed(e) - } else { - // Retry - remoteGetWithRetryOnError(key, remote, currentRetry = currentRetry + 1) - } - } - } - - // This can be called by multiple instances simultaneously but in the end - // only the one that wins the race will create the final value that will be set in - // the remote caches and read by the other instances - // Unless of course there is some error getting stuff from remote cache - // in which case the locally generated value may be returned - protected def canonicalValueGenerator(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext) = { - val fGeneratedValue = Try { genValue().map(timestamp) }.asFutureTry() - val finalValue: Future[TimestampedValue[V]] = fGeneratedValue.flatMap { - case Success(generatedValue) => - // Successfully generated value, try to set it in the remote writable cache - remoteRW match { - // No remote cache available, just return this value to be set on local cache - case None => - Future.successful(generatedValue) - case Some(remote) => - remoteSetOrGet(key, generatedValue, remote) - } - case Failure(eLocal) => - // We failed to generate the value ourselves, our hope is if someone else successfully did it in the meantime - remoteRW match { - case None => - // There 
are no remote RW caches - // TODO: generate metric and log here - Future.failed(eLocal) - case Some(remote) => - remoteGetWithRetryOnError(key, remote).asTry().flatMap { - case Success(v) => - // TODO: generate metric and log here - Future.successful(v) - case Failure(eRemote) => - // The real error is the eLocal, return it - // TODO: generate metric and log here - Future.failed(eLocal) - } - } - } - finalValue - } - - // Note: this method may return a failed future, but it will never cache it - private def tryGenerateAndSet(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { - val promise = Promise[TimestampedValue[V]]() - tempUpdate.putIfAbsent(key, promise.future) match { - case null => - canonicalValueGenerator(key, genValue).onComplete { - case Success(v) => - localCache.set(key, Success(v)) - promise.trySuccess(v) - tempUpdate.remove(key) - case Failure(e) => - // Note: we don't save failures to cache - promise.tryFailure(e) - tempUpdate.remove(key) - } - promise.future - case fTrying => fTrying - } - } - - override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = - localCache.get(key).map(_.asTry()) match { - case Some(future) => - future.flatMap { - case Success(localValue) if !localValue.hasExpired(ttl, now) => - // We have locally a good value, just return it - Future.successful(localValue.value) - case Success(expiredLocalValue) if remoteRW.nonEmpty => - // We have locally an expired value, but we can check a remote cache for better value - remoteRW.get.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => - // Remote is good, set locally and return it - localCache.set(key, Success(remoteValue)) - Future.successful(remoteValue.value) - case Success(Some(_)) | Success(None) => - // No good remote, return local, async update both - tryGenerateAndSet(key, genValue) - Future.successful(expiredLocalValue.value) - case 
Failure(e) => - // TODO: log, generate metrics - tryGenerateAndSet(key, genValue) - Future.successful(expiredLocalValue.value) - } - case Success(expiredLocalValue) if remoteRW.isEmpty => - tryGenerateAndSet(key, genValue) - Future.successful(expiredLocalValue.value) - case Failure(e) => - // This is almost impossible to happen because it's local and we don't save failed values - // TODO: log, generate metrics - tryGenerateAndSet(key, genValue).map(_.value) - } - case None if remoteRW.nonEmpty => - // No local, let's try remote - remoteRW.get.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => - // Remote is good, set locally and return it - localCache.set(key, Success(remoteValue)) - Future.successful(remoteValue.value) - case Success(Some(_)) | Success(None) => - // No good remote, sync generate - tryGenerateAndSet(key, genValue).map(_.value) - case Failure(e) => - // TODO: log, generate metrics - tryGenerateAndSet(key, genValue).map(_.value) - } - case None if remoteRW.isEmpty => - // No local and no remote to look, just generate it - tryGenerateAndSet(key, genValue).map(_.value) - } -} - -case class ExpiringRedisWithAsyncUpdate() \ No newline at end of file +} \ No newline at end of file diff --git a/src/main/scala/spray/cache/ExpiringLruCache.scala b/src/main/scala/spray/cache/ExpiringLruCache.scala new file mode 100644 index 00000000..b1f461f0 --- /dev/null +++ b/src/main/scala/spray/cache/ExpiringLruCache.scala @@ -0,0 +1,139 @@ +// Note: +// For ignition.core we added two methods to satisfy ExpiringMultipleLevelCache.LocalCache[V] + +/* + * Copyright © 2011-2013 the spray project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package spray.caching + +import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap +import ignition.core.cache.ExpiringMultipleLevelCache + +import scala.annotation.tailrec +import scala.collection.JavaConverters._ +import scala.concurrent.duration.Duration +import scala.concurrent.{ExecutionContext, Future, Promise} +import scala.util.{Failure, Success, Try} +import spray.util.Timestamp + +final class ExpiringLruCache[V](maxCapacity: Long, initialCapacity: Int, + timeToLive: Duration, timeToIdle: Duration) extends Cache[V] with ignition.core.cache.ExpiringMultipleLevelCache.LocalCache[V] { + require(!timeToLive.isFinite || !timeToIdle.isFinite || timeToLive > timeToIdle, + s"timeToLive($timeToLive) must be greater than timeToIdle($timeToIdle)") + + private[caching] val store = new ConcurrentLinkedHashMap.Builder[Any, Entry[V]] + .initialCapacity(initialCapacity) + .maximumWeightedCapacity(maxCapacity) + .build() + + @tailrec + def get(key: Any): Option[Future[V]] = store.get(key) match { + case null ⇒ None + case entry if (isAlive(entry)) ⇒ + entry.refresh() + Some(entry.future) + case entry ⇒ + // remove entry, but only if it hasn't been removed and reinserted in the meantime + if (store.remove(key, entry)) None // successfully removed + else get(key) // nope, try again + } + + def apply(key: Any, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] = { + def insert() = { + val newEntry = new Entry(Promise[V]()) + val valueFuture = + store.put(key, newEntry) match { + case null ⇒ genValue() + case entry ⇒ + if 
(isAlive(entry)) { + // we date back the new entry we just inserted + // in the meantime someone might have already seen the too fresh timestamp we just put in, + // but since the original entry is also still alive this doesn't matter + newEntry.created = entry.created + entry.future + } else genValue() + } + valueFuture.onComplete { value ⇒ + newEntry.promise.tryComplete(value) + // in case of exceptions we remove the cache entry (i.e. try again later) + if (value.isFailure) store.remove(key, newEntry) + } + newEntry.promise.future + } + store.get(key) match { + case null ⇒ insert() + case entry if (isAlive(entry)) ⇒ + entry.refresh() + entry.future + case entry ⇒ insert() + } + } + + def remove(key: Any) = store.remove(key) match { + case null ⇒ None + case entry if (isAlive(entry)) ⇒ Some(entry.future) + case entry ⇒ None + } + + def clear(): Unit = { store.clear() } + + def keys: Set[Any] = store.keySet().asScala.toSet + + def ascendingKeys(limit: Option[Int] = None) = + limit.map { lim ⇒ store.ascendingKeySetWithLimit(lim) } + .getOrElse(store.ascendingKeySet()) + .iterator().asScala + + def size = store.size + + private def isAlive(entry: Entry[V]) = + (entry.created + timeToLive).isFuture && + (entry.lastAccessed + timeToIdle).isFuture + + // Method required by ExpiringMultipleLevelCache.LocalCache + override def set(key: Any, value: V): Unit = { + val newEntry = new Entry(Promise[V]()) + newEntry.promise.trySuccess(value) + store.put(key, newEntry) match { + case null => + // Nothing to do + case oldEntry => + // If the old promise is pending, complete it with our future + oldEntry.promise.trySuccess(value) + } + } + + // Method required by ExpiringMultipleLevelCache.LocalCache + override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { + val sprayCache: Cache[V] = this + sprayCache.apply(key, genValue) + } +} + +private[caching] class Entry[T](val promise: Promise[T]) { + @volatile var created = Timestamp.now 
+ @volatile var lastAccessed = Timestamp.now + def future = promise.future + def refresh(): Unit = { + // we dont care whether we overwrite a potentially newer value + lastAccessed = Timestamp.now + } + override def toString = future.value match { + case Some(Success(value)) ⇒ value.toString + case Some(Failure(exception)) ⇒ exception.toString + case None ⇒ "pending" + } +} From 75182789b0df31142689fd7e8411fa3a83733f2d Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 27 Oct 2016 15:03:21 -0200 Subject: [PATCH 119/268] Make remoteLockTTL a parameter, change ttl to FiniteDuration and explicitly define the reporter return type --- .../cache/ExpiringMultipleLevelCache.scala | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index 5469e308..85e728b9 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -1,12 +1,11 @@ package ignition.core.cache import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap -import ignition.core.utils.FutureUtils._ import ignition.core.utils.DateUtils._ -import org.joda.time.{DateTime, Period} +import ignition.core.utils.FutureUtils._ +import org.joda.time.DateTime import org.slf4j.LoggerFactory - import scala.concurrent.duration._ import scala.concurrent.{ExecutionContext, Future, Promise} import scala.util.{Failure, Success, Try} @@ -14,8 +13,8 @@ import scala.util.{Failure, Success, Try} object ExpiringMultipleLevelCache { case class TimestampedValue[V](date: DateTime, value: V) { - def hasExpired(ttl: Period, now: DateTime): Boolean = { - date.plus(ttl).isBefore(now) + def hasExpired(ttl: FiniteDuration, now: DateTime): Boolean = { + date.plus(ttl.toMillis).isBefore(now) } } @@ -40,17 +39,17 @@ object ExpiringMultipleLevelCache { 
trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] trait ReporterCallback { - def onCacheMissNothingFound() - def onCacheMissButFoundExpiredLocal() - def onCacheMissButFoundExpiredRemote() - def onRemoteCacheHit() - def onLocalCacheHit() - def onUnexpectedBehaviour() - def onStillTryingToLockOrGet() - def onSuccessfullyRemoteSetValue() - def onRemoteCacheHitAfterGenerating() - def onErrorGeneratingValue(key: String, eLocal: Throwable) - def onLocalError(key: String, e: Throwable) + def onCacheMissNothingFound(): Unit + def onCacheMissButFoundExpiredLocal(): Unit + def onCacheMissButFoundExpiredRemote(): Unit + def onRemoteCacheHit(): Unit + def onLocalCacheHit(): Unit + def onUnexpectedBehaviour(): Unit + def onStillTryingToLockOrGet(): Unit + def onSuccessfullyRemoteSetValue(): Unit + def onRemoteCacheHitAfterGenerating(): Unit + def onErrorGeneratingValue(key: String, eLocal: Throwable): Unit + def onLocalError(key: String, e: Throwable): Unit def onRemoteError(key: String, t: Throwable): Unit def onRemoteGiveUp(key: String): Unit } @@ -73,12 +72,13 @@ object ExpiringMultipleLevelCache { } -import ExpiringMultipleLevelCache._ +import ignition.core.cache.ExpiringMultipleLevelCache._ -case class ExpiringMultipleLevelCache[V](ttl: Period, +case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, localCache: LocalCache[TimestampedValue[V]], remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, + remoteLockTTL: FiniteDuration = 5.seconds, reporter: ExpiringMultipleLevelCache.ReporterCallback = ExpiringMultipleLevelCache.NoOpReporter, maxErrorsToRetryOnRemote: Int = 5) extends GenericCache[V] { @@ -92,8 +92,6 @@ case class ExpiringMultipleLevelCache[V](ttl: Period, private def remoteLockKey(key: Any) = s"$key-emlc-lock" - private val remoteLockTTL = 5.seconds - // The idea is simple, have two caches: remote and local // with values that will eventually expire but still be left on the cache From 
14ff519d28fdd9abdc8c7da1810a62b6cf95eef0 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 27 Oct 2016 15:32:04 -0200 Subject: [PATCH 120/268] Add ec to setLock --- .../scala/ignition/core/cache/ExpiringMultipleLevelCache.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index 85e728b9..1ddd92b1 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -29,7 +29,7 @@ object ExpiringMultipleLevelCache { trait RemoteWritableCache[V] { def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] - def setLock(key: String, ttl: FiniteDuration): Future[Boolean] + def setLock(key: String, ttl: FiniteDuration)(implicit ec: ExecutionContext): Future[Boolean] } trait RemoteReadableCache[V] { From 60db77caa522c8860f2edc88d4e5b71dd0a6b254 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 27 Oct 2016 16:44:53 -0200 Subject: [PATCH 121/268] Minor stuff --- .../cache/ExpiringMultipleLevelCache.scala | 87 ++++++++++++------- ...ache.scala => ExpiringLruLocalCache.scala} | 6 +- 2 files changed, 58 insertions(+), 35 deletions(-) rename src/main/scala/spray/cache/{ExpiringLruCache.scala => ExpiringLruLocalCache.scala} (93%) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index 1ddd92b1..f41f7a35 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -5,9 +5,11 @@ import ignition.core.utils.DateUtils._ import ignition.core.utils.FutureUtils._ import org.joda.time.DateTime import org.slf4j.LoggerFactory +import spray.caching.ValueMagnet import scala.concurrent.duration._ import scala.concurrent.{ExecutionContext, Future, Promise} +import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} @@ -18,7 +20,26 @@ object ExpiringMultipleLevelCache { } } - trait GenericCache[V] { + trait GenericCache[V] { cache => + // Keep compatible with Spray Cache + def apply(key: String) = new Keyed(key) + + class Keyed(key: String) { + /** + * Returns either the cached Future for the key or evaluates the given call-by-name argument + * which produces either a value instance of type `V` or a `Future[V]`. + */ + def apply(magnet: ⇒ ValueMagnet[V])(implicit ec: ExecutionContext): Future[V] = + cache.apply(key, () ⇒ try magnet.future catch { case NonFatal(e) ⇒ Future.failed(e) }) + + /** + * Returns either the cached Future for the key or evaluates the given function which + * should lead to eventual completion of the promise. 
+ */ + def apply[U](f: Promise[V] ⇒ U)(implicit ec: ExecutionContext): Future[V] = + cache.apply(key, () ⇒ { val p = Promise[V](); f(p); p.future }) + } + def apply(key: String, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] } @@ -39,15 +60,15 @@ object ExpiringMultipleLevelCache { trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] trait ReporterCallback { - def onCacheMissNothingFound(): Unit - def onCacheMissButFoundExpiredLocal(): Unit - def onCacheMissButFoundExpiredRemote(): Unit - def onRemoteCacheHit(): Unit - def onLocalCacheHit(): Unit - def onUnexpectedBehaviour(): Unit - def onStillTryingToLockOrGet(): Unit - def onSuccessfullyRemoteSetValue(): Unit - def onRemoteCacheHitAfterGenerating(): Unit + def onCacheMissNothingFound(key: String): Unit + def onCacheMissButFoundExpiredLocal(key: String): Unit + def onCacheMissButFoundExpiredRemote(key: String): Unit + def onRemoteCacheHit(key: String): Unit + def onLocalCacheHit(key: String): Unit + def onUnexpectedBehaviour(key: String): Unit + def onStillTryingToLockOrGet(key: String): Unit + def onSuccessfullyRemoteSetValue(key: String): Unit + def onRemoteCacheHitAfterGenerating(key: String): Unit def onErrorGeneratingValue(key: String, eLocal: Throwable): Unit def onLocalError(key: String, e: Throwable): Unit def onRemoteError(key: String, t: Throwable): Unit @@ -55,19 +76,19 @@ object ExpiringMultipleLevelCache { } object NoOpReporter extends ReporterCallback { - override def onCacheMissNothingFound(): Unit = {} - override def onUnexpectedBehaviour(): Unit = {} - override def onSuccessfullyRemoteSetValue(): Unit = {} + override def onCacheMissNothingFound(key: String): Unit = {} + override def onUnexpectedBehaviour(key: String): Unit = {} + override def onSuccessfullyRemoteSetValue(key: String): Unit = {} override def onRemoteError(key: String, t: Throwable): Unit = {} override def onRemoteGiveUp(key: String): Unit = {} override def onLocalError(key: String, 
e: Throwable): Unit = {} override def onErrorGeneratingValue(key: String, eLocal: Throwable): Unit = {} - override def onRemoteCacheHitAfterGenerating(): Unit = {} - override def onCacheMissButFoundExpiredRemote(): Unit = {} - override def onStillTryingToLockOrGet(): Unit = {} - override def onLocalCacheHit(): Unit = {} - override def onRemoteCacheHit(): Unit = {} - override def onCacheMissButFoundExpiredLocal(): Unit = {} + override def onRemoteCacheHitAfterGenerating(key: String): Unit = {} + override def onCacheMissButFoundExpiredRemote(key: String): Unit = {} + override def onStillTryingToLockOrGet(key: String): Unit = {} + override def onLocalCacheHit(key: String): Unit = {} + override def onRemoteCacheHit(key: String): Unit = {} + override def onCacheMissButFoundExpiredLocal(key: String): Unit = {} } } @@ -103,25 +124,25 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, future.flatMap { case Success(localValue) if !localValue.hasExpired(ttl, now) => // We have locally a good value, just return it - reporter.onLocalCacheHit() + reporter.onLocalCacheHit(key) Future.successful(localValue.value) case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Remote is good, set locally and return it - reporter.onRemoteCacheHit() + reporter.onRemoteCacheHit(key) localCache.set(key, remoteValue) Future.successful(remoteValue.value) case Success(Some(expiredRemote)) => // Expired local and expired remote, return the most recent of them, async update both - reporter.onCacheMissButFoundExpiredRemote() + reporter.onCacheMissButFoundExpiredRemote(key) tryGenerateAndSet(key, genValue) val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) Future.successful(mostRecent.value) case Success(None) => // No remote found, return local, async update both - 
reporter.onCacheMissButFoundExpiredLocal() + reporter.onCacheMissButFoundExpiredLocal(key) tryGenerateAndSet(key, genValue) Future.successful(expiredLocalValue.value) case Failure(e) => @@ -133,7 +154,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case Success(expiredLocalValue) if remoteRW.isEmpty => // There is no remote cache configured, we'are on our own // Return expired value and try to generate a new one for the future - reporter.onCacheMissButFoundExpiredLocal() + reporter.onCacheMissButFoundExpiredLocal(key) tryGenerateAndSet(key, genValue) Future.successful(expiredLocalValue.value) case Failure(e) => @@ -147,17 +168,17 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, remoteRW.get.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Remote is good, set locally and return it - reporter.onRemoteCacheHit() + reporter.onRemoteCacheHit(key) localCache.set(key, remoteValue) Future.successful(remoteValue.value) case Success(Some(expiredRemote)) => // Expired remote, return the it, async update - reporter.onCacheMissButFoundExpiredRemote() + reporter.onCacheMissButFoundExpiredRemote(key) tryGenerateAndSet(key, genValue).map(_.value) Future.successful(expiredRemote.value) case Success(None) => // No good remote, sync generate - reporter.onCacheMissNothingFound() + reporter.onCacheMissNothingFound(key) tryGenerateAndSet(key, genValue).map(_.value) case Failure(e) => reporter.onRemoteError(key, e) @@ -167,7 +188,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case None if remoteRW.isEmpty => // No local and no remote to look, just generate it // The caller will need to wait for the value generation - reporter.onCacheMissNothingFound() + reporter.onCacheMissNothingFound(key) tryGenerateAndSet(key, genValue).map(_.value) } @@ -186,7 +207,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, tempUpdate.remove(key) case Success(v) => // Have we 
generated/got an expired value!? - reporter.onUnexpectedBehaviour() + reporter.onUnexpectedBehaviour(key) logger.warn(s"tryGenerateAndSet, key $key: unexpectedly generated/got an expired value: $v") localCache.set(key, v) promise.trySuccess(v) @@ -286,7 +307,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it - reporter.onRemoteCacheHitAfterGenerating() + reporter.onRemoteCacheHitAfterGenerating(key) logger.info(s"remoteSetOrGet got lock for $key but found already a good value on remote") Future.successful(remoteValue) case Success(_) => @@ -295,7 +316,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, remote.set(key, calculatedValue).asTry().flatMap { case Success(_) => // Flawless victory! - reporter.onSuccessfullyRemoteSetValue() + reporter.onSuccessfullyRemoteSetValue(key) logger.info(s"remoteSetOrGet successfully set key $key while under lock") Future.successful(calculatedValue) case Failure(e) => @@ -315,13 +336,13 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it - reporter.onRemoteCacheHitAfterGenerating() + reporter.onRemoteCacheHitAfterGenerating(key) Future.successful(remoteValue) case Success(_) => // The value is missing or has expired // Let's start from scratch because we need to be able to set or get a good value // Note: do not increment retry because this isn't an error - reporter.onStillTryingToLockOrGet() + reporter.onStillTryingToLockOrGet(key) logger.info(s"remoteSetOrGet couldn't lock key $key and didn't found good value on remote") remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry) case Failure(e) => diff --git a/src/main/scala/spray/cache/ExpiringLruCache.scala 
b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala similarity index 93% rename from src/main/scala/spray/cache/ExpiringLruCache.scala rename to src/main/scala/spray/cache/ExpiringLruLocalCache.scala index b1f461f0..8c403be9 100644 --- a/src/main/scala/spray/cache/ExpiringLruCache.scala +++ b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala @@ -29,8 +29,10 @@ import scala.concurrent.{ExecutionContext, Future, Promise} import scala.util.{Failure, Success, Try} import spray.util.Timestamp -final class ExpiringLruCache[V](maxCapacity: Long, initialCapacity: Int, - timeToLive: Duration, timeToIdle: Duration) extends Cache[V] with ignition.core.cache.ExpiringMultipleLevelCache.LocalCache[V] { +final class ExpiringLruLocalCache[V](maxCapacity: Long, + initialCapacity: Int = 16, + timeToLive: Duration = Duration.Inf, + timeToIdle: Duration = Duration.Inf) extends Cache[V] with ignition.core.cache.ExpiringMultipleLevelCache.LocalCache[V] { require(!timeToLive.isFinite || !timeToIdle.isFinite || timeToLive > timeToIdle, s"timeToLive($timeToLive) must be greater than timeToIdle($timeToIdle)") From 5f6ace3b8821a516644de4e5f9adc9bde7930eb4 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 27 Oct 2016 18:06:43 -0200 Subject: [PATCH 122/268] Add sanity test --- .../cache/ExpiringMultipleLevelCache.scala | 4 ++- .../cache/ExpiringMultipleLevelCache.scala | 29 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index f41f7a35..0c5ada3e 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -105,7 +105,9 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, private val logger = LoggerFactory.getLogger(getClass) - private val tempUpdate = new ConcurrentLinkedHashMap.Builder[Any, Future[TimestampedValue[V]]].build() + private val tempUpdate = new ConcurrentLinkedHashMap.Builder[Any, Future[TimestampedValue[V]]] + .maximumWeightedCapacity(Long.MaxValue) + .build() protected def now = DateTime.now diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala new file mode 100644 index 00000000..d602a736 --- /dev/null +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -0,0 +1,29 @@ +package ignition.core.cache + +import ignition.core.cache.ExpiringMultipleLevelCache.TimestampedValue +import org.scalatest.{FlatSpec, Matchers} +import spray.caching.ExpiringLruLocalCache + +import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.duration._ +import scala.concurrent.{Await, Future} + +class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers { + case class Data(s: String) + "ExpiringMultipleLevelCache" should "calculate a value on cache miss and return it" in { + val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) 
+ val cache = ExpiringMultipleLevelCache[Data](1.minute, local) + Await.result(cache("key", () => Future.successful(Data("success"))), 1.minute) shouldBe Data("success") + } + + it should "calculate a value on cache miss and return a failed future of the calculation" in { + val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) + val cache = ExpiringMultipleLevelCache[Data](1.minute, local) + + class MyException(s: String) extends Exception(s) + + intercept[MyException ] { + Await.result(cache("key", () => Future.failed(new MyException("some failure"))), 1.minute) + } + } +} From c8638492347b3d688599200959345f4050c8cc77 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 31 Oct 2016 11:46:18 -0200 Subject: [PATCH 123/268] support for setting headers in http client request --- .../core/http/AsyncHttpClientStreamApi.scala | 3 ++- .../core/http/AsyncSprayHttpClient.scala | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index 4910c98a..30f46c53 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -4,7 +4,7 @@ import java.io.InputStream import java.util.concurrent.TimeUnit import akka.util.Timeout -import spray.http.{HttpEntity, HttpMethod, HttpMethods} +import spray.http.{HttpEntity, HttpHeader, HttpMethod, HttpMethods} import scala.concurrent.Future import scala.concurrent.duration._ @@ -48,6 +48,7 @@ object AsyncHttpClientStreamApi { credentials: Option[Credentials] = None, method: HttpMethod = HttpMethods.GET, body: HttpEntity = HttpEntity.Empty, + headers: List[HttpHeader] = List.empty, requestConfiguration: Option[RequestConfiguration] = None) case class RequestException(message: String, response: StreamResponse) extends RuntimeException(message) diff --git 
a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala index 0565fe2f..405457ea 100644 --- a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala +++ b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala @@ -61,17 +61,17 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { List(Authorization(credentials = BasicHttpCredentials(username = credentials.user, password = credentials.password))) private def toSprayRequest(request: Request): HttpRequest = request match { - case Request(uri, params, Some(credentials), method, body, _) if params.isEmpty => - HttpRequest(method = method, uri = request.url, headers = credentials, entity = body) + case Request(uri, params, Some(credentials), method, body, headers, _) if params.isEmpty => + HttpRequest(method = method, uri = request.url, headers = credentials ++ headers, entity = body) - case Request(uri, params, Some(credentials), method, body, _) => - HttpRequest(method = method, uri = toUriString(request.url, params), headers = credentials, entity = body) + case Request(uri, params, Some(credentials), method, body, headers, _) => + HttpRequest(method = method, uri = toUriString(request.url, params), headers = credentials ++ headers, entity = body) - case Request(uri, params, None, method, body, _) if params.isEmpty => - HttpRequest(method = method, uri = toUriString(request.url), entity = body) + case Request(uri, params, None, method, body, headers, _) if params.isEmpty => + HttpRequest(method = method, uri = toUriString(request.url), entity = body, headers = headers) - case Request(uri, params, None, method, body, _) => - HttpRequest(method = method, uri = toUriString(request.url, params), entity = body) + case Request(uri, params, None, method, body, headers, _) => + HttpRequest(method = method, uri = toUriString(request.url, params), entity = body, headers = headers) } private def toSprayHostConnectorSetup(uri: Uri, conf: 
Option[AsyncHttpClientStreamApi.RequestConfiguration]): HostConnectorSetup = { From 046e4a88c19d6d1db029c2d423a2992b5a25096d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 31 Oct 2016 13:18:51 -0200 Subject: [PATCH 124/268] enable new spark version 2.0.1 --- tools/spark-ec2/spark_ec2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index a89dab8f..b1f4e709 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -77,6 +77,7 @@ "1.5.2", "1.6.0", "2.0.0", + "2.0.1", ]) SPARK_TACHYON_MAP = { From 3bfe4d886d7b31f5c06d227bf6f4d7fa05655a18 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 31 Oct 2016 14:30:46 -0200 Subject: [PATCH 125/268] update to spark-2.0.1 --- build.sbt | 2 +- tools/cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 6ffe0e85..c0f4bf77 100644 --- a/build.sbt +++ b/build.sbt @@ -9,7 +9,7 @@ scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warning // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.0" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.1" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") diff --git a/tools/cluster.py b/tools/cluster.py index 5f59edad..7d77e8c4 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,7 +49,7 @@ default_ami = None # will be decided based on spark-ec2 list default_master_ami = None default_env = 'dev' -default_spark_version = '2.0.0' +default_spark_version = '2.0.1' custom_builds = { # '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' } From 612c6428e4c431d8b36ef858183f8e3234872933 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Mon, 31 Oct 2016 16:20:40 -0200 Subject: [PATCH 126/268] Added elapsed time and some new metrics --- .../cache/ExpiringMultipleLevelCache.scala | 173 ++++++++++-------- 1 file changed, 99 insertions(+), 74 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index 0c5ada3e..6de11fdd 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -1,5 +1,7 @@ package ignition.core.cache +import java.util.concurrent.TimeUnit + import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap import ignition.core.utils.DateUtils._ import ignition.core.utils.FutureUtils._ @@ -12,7 +14,6 @@ import scala.concurrent.{ExecutionContext, Future, Promise} import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} - object ExpiringMultipleLevelCache { case class TimestampedValue[V](date: DateTime, value: V) { def hasExpired(ttl: FiniteDuration, now: DateTime): Boolean = { @@ -60,35 +61,44 @@ object ExpiringMultipleLevelCache { trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] trait ReporterCallback { - def onCacheMissNothingFound(key: String): Unit - def onCacheMissButFoundExpiredLocal(key: String): Unit - def onCacheMissButFoundExpiredRemote(key: String): Unit - def onRemoteCacheHit(key: String): Unit - def onLocalCacheHit(key: String): Unit - def onUnexpectedBehaviour(key: String): Unit - def onStillTryingToLockOrGet(key: String): Unit - def onSuccessfullyRemoteSetValue(key: String): Unit - def onRemoteCacheHitAfterGenerating(key: String): Unit - def onErrorGeneratingValue(key: String, eLocal: Throwable): Unit - def onLocalError(key: String, e: Throwable): Unit - def onRemoteError(key: String, t: Throwable): Unit - def onRemoteGiveUp(key: String): Unit + def onCompletedWithFailure(key: String, e: 
Throwable, elapsedTime: FiniteDuration): Unit + def onGeneratedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit + def onCompletedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit + def onGeneratedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit + def onCacheMissNothingFound(key: String, elapsedTime: FiniteDuration): Unit + def onCacheMissButFoundExpiredLocal(key: String, elapsedTime: FiniteDuration): Unit + def onCacheMissButFoundExpiredRemote(key: String, elapsedTime: FiniteDuration): Unit + def onRemoteCacheHit(key: String, elapsedTime: FiniteDuration): Unit + def onLocalCacheHit(key: String, elapsedTime: FiniteDuration): Unit + def onUnexpectedBehaviour(key: String, elapsedTime: FiniteDuration): Unit + def onStillTryingToLockOrGet(key: String, elapsedTime: FiniteDuration): Unit + def onSuccessfullyRemoteSetValue(key: String, elapsedTime: FiniteDuration): Unit + def onRemoteCacheHitAfterGenerating(key: String, elapsedTime: FiniteDuration): Unit + def onErrorGeneratingValue(key: String, eLocal: Throwable, elapsedTime: FiniteDuration): Unit + def onLocalError(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit + def onRemoteError(key: String, t: Throwable, elapsedTime: FiniteDuration): Unit + def onRemoteGiveUp(key: String, elapsedTime: FiniteDuration): Unit } object NoOpReporter extends ReporterCallback { - override def onCacheMissNothingFound(key: String): Unit = {} - override def onUnexpectedBehaviour(key: String): Unit = {} - override def onSuccessfullyRemoteSetValue(key: String): Unit = {} - override def onRemoteError(key: String, t: Throwable): Unit = {} - override def onRemoteGiveUp(key: String): Unit = {} - override def onLocalError(key: String, e: Throwable): Unit = {} - override def onErrorGeneratingValue(key: String, eLocal: Throwable): Unit = {} - override def onRemoteCacheHitAfterGenerating(key: String): Unit = {} - override def onCacheMissButFoundExpiredRemote(key: String): Unit = {} - override 
def onStillTryingToLockOrGet(key: String): Unit = {} - override def onLocalCacheHit(key: String): Unit = {} - override def onRemoteCacheHit(key: String): Unit = {} - override def onCacheMissButFoundExpiredLocal(key: String): Unit = {} + override def onCacheMissNothingFound(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onUnexpectedBehaviour(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onSuccessfullyRemoteSetValue(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onRemoteError(key: String, t: Throwable, elapsedTime: FiniteDuration): Unit = {} + override def onRemoteGiveUp(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onLocalError(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} + override def onErrorGeneratingValue(key: String, eLocal: Throwable, elapsedTime: FiniteDuration): Unit = {} + override def onRemoteCacheHitAfterGenerating(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onCacheMissButFoundExpiredRemote(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onStillTryingToLockOrGet(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onLocalCacheHit(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onRemoteCacheHit(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onCacheMissButFoundExpiredLocal(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onCompletedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} + override def onCompletedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit = {} + override def onGeneratedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} + override def onGeneratedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit = {} + } } @@ -113,103 +123,115 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, private def timestamp(v: V) = TimestampedValue(now, v) 
+ private def elapsedTime(startNanoTime: Long) = FiniteDuration(System.nanoTime() - startNanoTime, TimeUnit.NANOSECONDS) + private def remoteLockKey(key: Any) = s"$key-emlc-lock" // The idea is simple, have two caches: remote and local // with values that will eventually expire but still be left on the cache // while a new value is asynchronously being calculated/retrieved - override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = + override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { // The local cache is always the first try. We'll only look the remote if the local value is missing or has expired - localCache.get(key).map(_.asTry()) match { + val startTime = System.nanoTime() + val result = localCache.get(key).map(_.asTry()) match { case Some(future) => future.flatMap { case Success(localValue) if !localValue.hasExpired(ttl, now) => // We have locally a good value, just return it - reporter.onLocalCacheHit(key) + reporter.onLocalCacheHit(key, elapsedTime(startTime)) Future.successful(localValue.value) case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Remote is good, set locally and return it - reporter.onRemoteCacheHit(key) + reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.set(key, remoteValue) Future.successful(remoteValue.value) case Success(Some(expiredRemote)) => // Expired local and expired remote, return the most recent of them, async update both - reporter.onCacheMissButFoundExpiredRemote(key) - tryGenerateAndSet(key, genValue) + reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) + tryGenerateAndSet(key, genValue, startTime) val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) 
Future.successful(mostRecent.value) case Success(None) => // No remote found, return local, async update both - reporter.onCacheMissButFoundExpiredLocal(key) - tryGenerateAndSet(key, genValue) + reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) + tryGenerateAndSet(key, genValue, startTime) Future.successful(expiredLocalValue.value) case Failure(e) => - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key expired local value and failed to get remote", e) - tryGenerateAndSet(key, genValue) + tryGenerateAndSet(key, genValue, startTime) Future.successful(expiredLocalValue.value) } case Success(expiredLocalValue) if remoteRW.isEmpty => // There is no remote cache configured, we'are on our own // Return expired value and try to generate a new one for the future - reporter.onCacheMissButFoundExpiredLocal(key) - tryGenerateAndSet(key, genValue) + reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) + tryGenerateAndSet(key, genValue, startTime) Future.successful(expiredLocalValue.value) case Failure(e) => // This is almost impossible to happen because it's local and we don't save failed values - reporter.onLocalError(key, e) + reporter.onLocalError(key, e, elapsedTime(startTime)) logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key got a failed future from cache!? 
This is almost impossible!", e) - tryGenerateAndSet(key, genValue).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.value) } case None if remoteRW.nonEmpty => // No local, let's try remote remoteRW.get.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Remote is good, set locally and return it - reporter.onRemoteCacheHit(key) + reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.set(key, remoteValue) Future.successful(remoteValue.value) case Success(Some(expiredRemote)) => // Expired remote, return the it, async update - reporter.onCacheMissButFoundExpiredRemote(key) - tryGenerateAndSet(key, genValue).map(_.value) + reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) + tryGenerateAndSet(key, genValue, startTime).map(_.value) Future.successful(expiredRemote.value) case Success(None) => // No good remote, sync generate - reporter.onCacheMissNothingFound(key) - tryGenerateAndSet(key, genValue).map(_.value) + reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) + tryGenerateAndSet(key, genValue, startTime).map(_.value) case Failure(e) => - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key expired local value and no remote configured", e) - tryGenerateAndSet(key, genValue).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.value) } case None if remoteRW.isEmpty => // No local and no remote to look, just generate it // The caller will need to wait for the value generation - reporter.onCacheMissNothingFound(key) - tryGenerateAndSet(key, genValue).map(_.value) + reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) + tryGenerateAndSet(key, genValue, startTime).map(_.value) } + result.onComplete { + case Success(_) => + reporter.onCompletedWithSuccess(key, elapsedTime(startTime)) + case Failure(e) => + reporter.onCompletedWithFailure(key, e, 
elapsedTime(startTime)) + } + result + } // Note: this method may return a failed future, but it will never cache it // Our main purpose here is to avoid multiple local calls to generate new promises/futures in parallel, // so we use this Map keep everyone in sync // This is similar to how spray cache works - private def tryGenerateAndSet(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + private def tryGenerateAndSet(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { val promise = Promise[TimestampedValue[V]]() tempUpdate.putIfAbsent(key, promise.future) match { case null => - canonicalValueGenerator(key, genValue).onComplete { + canonicalValueGenerator(key, genValue, nanoStartTime).onComplete { case Success(v) if !v.hasExpired(ttl, now) => + reporter.onGeneratedWithSuccess(key, elapsedTime(nanoStartTime)) localCache.set(key, v) promise.trySuccess(v) tempUpdate.remove(key) case Success(v) => // Have we generated/got an expired value!? 
- reporter.onUnexpectedBehaviour(key) + reporter.onUnexpectedBehaviour(key, elapsedTime(nanoStartTime)) logger.warn(s"tryGenerateAndSet, key $key: unexpectedly generated/got an expired value: $v") localCache.set(key, v) promise.trySuccess(v) @@ -217,6 +239,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case Failure(e) => // We don't save failures to cache // There is no need to log here, canonicalValueGenerator will log everything already + reporter.onGeneratedWithFailure(key, e, elapsedTime(nanoStartTime)) promise.tryFailure(e) tempUpdate.remove(key) } @@ -232,7 +255,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, // the remote cache and read by the other instances // Unless of course there is some error getting stuff from remote cache // in which case the locally generated value may be returned to avoid further delays - protected def canonicalValueGenerator(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext) = { + protected def canonicalValueGenerator(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext) = { val fGeneratedValue = Try { genValue().map(timestamp) }.asFutureTry() val finalValue: Future[TimestampedValue[V]] = fGeneratedValue.flatMap { case Success(generatedValue) => @@ -242,18 +265,18 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case None => Future.successful(generatedValue) case Some(remote) => - remoteSetOrGet(key, generatedValue, remote) + remoteSetOrGet(key, generatedValue, remote, nanoStartTime) } case Failure(eLocal) => // We failed to generate the value ourselves, our hope is if someone else successfully did it in the meantime - reporter.onErrorGeneratingValue(key, eLocal) + reporter.onErrorGeneratingValue(key, eLocal, elapsedTime(nanoStartTime)) remoteRW match { case None => // There are no remote RW caches logger.error(s"canonicalValueGenerator, key $key: failed to generate value and no remote cache configured", eLocal) 
Future.failed(eLocal) case Some(remote) => - remoteGetNonExpiredValue(key, remote).asTry().flatMap { + remoteGetNonExpiredValue(key, remote, nanoStartTime).asTry().flatMap { case Success(v) => logger.warn(s"canonicalValueGenerator, key $key: failed to generate value but got one from remote", eLocal) Future.successful(v) @@ -270,6 +293,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, // Auxiliary method, only makes sense to be used by canonicalValueGenerator private def remoteGetNonExpiredValue(key: String, remote: RemoteCacheRW[TimestampedValue[V]], + nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => @@ -278,14 +302,14 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, Future.failed(new Exception("No good value found on remote")) case Failure(e) => if (currentRetry >= maxErrorsToRetryOnRemote) { - reporter.onRemoteGiveUp(key) + reporter.onRemoteGiveUp(key, elapsedTime(nanoStartTime)) logger.error(s"remoteGetWithRetryOnError, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors", e) Future.failed(e) } else { - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) logger.warn(s"remoteGetWithRetryOnError, key $key: got error trying to get value, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry - remoteGetNonExpiredValue(key, remote, currentRetry = currentRetry + 1) + remoteGetNonExpiredValue(key, remote, nanoStartTime, currentRetry = currentRetry + 1) } } } @@ -295,10 +319,11 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, private def remoteSetOrGet(key: String, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], + nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { if (currentRetry 
> maxErrorsToRetryOnRemote) { // Use our calculated value as it's the best we can do - reporter.onRemoteGiveUp(key) + reporter.onRemoteGiveUp(key, elapsedTime(nanoStartTime)) logger.error(s"remoteSetOrGet, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors") Future.successful(calculatedValue) } else { @@ -309,7 +334,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it - reporter.onRemoteCacheHitAfterGenerating(key) + reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) logger.info(s"remoteSetOrGet got lock for $key but found already a good value on remote") Future.successful(remoteValue) case Success(_) => @@ -318,46 +343,46 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, remote.set(key, calculatedValue).asTry().flatMap { case Success(_) => // Flawless victory! 
- reporter.onSuccessfullyRemoteSetValue(key) + reporter.onSuccessfullyRemoteSetValue(key, elapsedTime(nanoStartTime)) logger.info(s"remoteSetOrGet successfully set key $key while under lock") Future.successful(calculatedValue) case Failure(e) => - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) logger.warn(s"remoteSetOrGet, key $key: got error setting the value, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry failure - remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) } case Failure(e) => - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) logger.warn(s"remoteSetOrGet, key $key: got error getting remote value with lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry failure - remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) } case Success(false) => // Someone got the lock, let's take a look at the value remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it - reporter.onRemoteCacheHitAfterGenerating(key) + reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) Future.successful(remoteValue) case Success(_) => // The value is missing or has expired // Let's start from scratch because we need to be able to set or get a good value // Note: do not increment retry because this isn't an error - reporter.onStillTryingToLockOrGet(key) + reporter.onStillTryingToLockOrGet(key, elapsedTime(nanoStartTime)) logger.info(s"remoteSetOrGet couldn't lock key $key and didn't found good value on remote") - remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry) + remoteSetOrGet(key, calculatedValue, 
remote, nanoStartTime, currentRetry = currentRetry) case Failure(e) => - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) logger.warn(s"remoteSetOrGet, key $key: got error getting remote value without lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry - remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) } case Failure(e) => // Retry failure - reporter.onRemoteError(key, e) + reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) logger.warn(s"remoteSetOrGet, key $key: got error trying to set lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) - remoteSetOrGet(key, calculatedValue, remote, currentRetry = currentRetry + 1) + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) } } } From b47a54b46e018e53bc62d270ae697cb891bb7279 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 1 Nov 2016 14:04:18 -0200 Subject: [PATCH 127/268] Make local cache optional, do a backoff on retries --- .../cache/ExpiringMultipleLevelCache.scala | 66 ++++++++++++------- .../cache/ExpiringMultipleLevelCache.scala | 7 +- 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index 6de11fdd..d4147761 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -2,6 +2,8 @@ package ignition.core.cache import java.util.concurrent.TimeUnit +import akka.actor.Scheduler +import akka.pattern.after import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap import ignition.core.utils.DateUtils._ import ignition.core.utils.FutureUtils._ @@ -107,11 +109,13 @@ import ignition.core.cache.ExpiringMultipleLevelCache._ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, - localCache: LocalCache[TimestampedValue[V]], + localCache: Option[LocalCache[TimestampedValue[V]]], remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, remoteLockTTL: FiniteDuration = 5.seconds, reporter: ExpiringMultipleLevelCache.ReporterCallback = ExpiringMultipleLevelCache.NoOpReporter, - maxErrorsToRetryOnRemote: Int = 5) extends GenericCache[V] { + maxErrorsToRetryOnRemote: Int = 5, + backoffOnLockAcquire: FiniteDuration = 50.milliseconds, + backoffOnError: FiniteDuration = 50.milliseconds)(implicit scheduler: Scheduler) extends GenericCache[V] { private val logger = LoggerFactory.getLogger(getClass) @@ -134,7 +138,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { // The local cache is always the first try. 
We'll only look the remote if the local value is missing or has expired val startTime = System.nanoTime() - val result = localCache.get(key).map(_.asTry()) match { + val result = localCache.flatMap(_.get(key).map(_.asTry())) match { case Some(future) => future.flatMap { case Success(localValue) if !localValue.hasExpired(ttl, now) => @@ -147,7 +151,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) - localCache.set(key, remoteValue) + localCache.foreach(_.set(key, remoteValue)) Future.successful(remoteValue.value) case Success(Some(expiredRemote)) => // Expired local and expired remote, return the most recent of them, async update both @@ -162,7 +166,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, Future.successful(expiredLocalValue.value) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) - logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key expired local value and failed to get remote", e) + logger.warn(s"apply, key: $key expired local value and failed to get remote", e) tryGenerateAndSet(key, genValue, startTime) Future.successful(expiredLocalValue.value) } @@ -175,7 +179,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case Failure(e) => // This is almost impossible to happen because it's local and we don't save failed values reporter.onLocalError(key, e, elapsedTime(startTime)) - logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key got a failed future from cache!? This is almost impossible!", e) + logger.warn(s"apply, key: $key got a failed future from cache!? 
This is almost impossible!", e) tryGenerateAndSet(key, genValue, startTime).map(_.value) } case None if remoteRW.nonEmpty => @@ -184,7 +188,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) - localCache.set(key, remoteValue) + localCache.foreach(_.set(key, remoteValue)) Future.successful(remoteValue.value) case Success(Some(expiredRemote)) => // Expired remote, return the it, async update @@ -197,7 +201,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, tryGenerateAndSet(key, genValue, startTime).map(_.value) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) - logger.warn(s"ExpiringMultipleLevelCache.apply, key: $key expired local value and no remote configured", e) + logger.warn(s"apply, key: $key expired local value and remote error", e) tryGenerateAndSet(key, genValue, startTime).map(_.value) } case None if remoteRW.isEmpty => @@ -223,17 +227,18 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, val promise = Promise[TimestampedValue[V]]() tempUpdate.putIfAbsent(key, promise.future) match { case null => + logger.info(s"tryGenerateAndSet, key $key: got request for generating and none in progress found, calling canonicalValueGenerator") canonicalValueGenerator(key, genValue, nanoStartTime).onComplete { case Success(v) if !v.hasExpired(ttl, now) => reporter.onGeneratedWithSuccess(key, elapsedTime(nanoStartTime)) - localCache.set(key, v) + localCache.foreach(_.set(key, v)) promise.trySuccess(v) tempUpdate.remove(key) case Success(v) => // Have we generated/got an expired value!? 
reporter.onUnexpectedBehaviour(key, elapsedTime(nanoStartTime)) logger.warn(s"tryGenerateAndSet, key $key: unexpectedly generated/got an expired value: $v") - localCache.set(key, v) + localCache.foreach(_.set(key, v)) promise.trySuccess(v) tempUpdate.remove(key) case Failure(e) => @@ -246,6 +251,7 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, promise.future case fTrying => // If someone call us while a future is running, we return the running future + logger.info(s"tryGenerateAndSet, key $key: got request for generating but an existing one is current in progress") fTrying } } @@ -297,19 +303,22 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + logger.info(s"remoteGetNonExpiredValue, key $key: got a good value") Future.successful(remoteValue) case Success(_) => Future.failed(new Exception("No good value found on remote")) case Failure(e) => if (currentRetry >= maxErrorsToRetryOnRemote) { reporter.onRemoteGiveUp(key, elapsedTime(nanoStartTime)) - logger.error(s"remoteGetWithRetryOnError, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors", e) + logger.error(s"remoteGetNonExpiredValue, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors", e) Future.failed(e) } else { reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteGetWithRetryOnError, key $key: got error trying to get value, retry $currentRetry of $maxErrorsToRetryOnRemote", e) + logger.warn(s"remoteGetNonExpiredValue, key $key: got error trying to get value, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry - remoteGetNonExpiredValue(key, remote, nanoStartTime, currentRetry = currentRetry + 1) + after(backoffOnError, scheduler) { + 
remoteGetNonExpiredValue(key, remote, nanoStartTime, currentRetry = currentRetry + 1) + } } } } @@ -348,21 +357,26 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, Future.successful(calculatedValue) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error setting the value, retry $currentRetry of $maxErrorsToRetryOnRemote", e) + logger.warn(s"remoteSetOrGet, key $key: got error setting the value, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry failure - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + after(backoffOnError, scheduler) { + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + } } case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error getting remote value with lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) + logger.warn(s"remoteSetOrGet, key $key: got error getting remote value with lock, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry failure - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + after(backoffOnError, scheduler) { + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + } } case Success(false) => // Someone got the lock, let's take a look at the value remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => // Current value is good, just return it + logger.info(s"remoteSetOrGet couldn't lock key $key but found a good on remote afterwards") reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) Future.successful(remoteValue) case Success(_) => @@ -370,19 +384,25 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, // Let's start from scratch because we need to be 
able to set or get a good value // Note: do not increment retry because this isn't an error reporter.onStillTryingToLockOrGet(key, elapsedTime(nanoStartTime)) - logger.info(s"remoteSetOrGet couldn't lock key $key and didn't found good value on remote") - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry) + logger.info(s"remoteSetOrGet couldn't lock key $key and didn't found good value on remote, scheduling retry") + after(backoffOnLockAcquire, scheduler) { + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry) + } case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error getting remote value without lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) + logger.warn(s"remoteSetOrGet, key $key: got error getting remote value without lock, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) // Retry - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + after(backoffOnError, scheduler) { + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + } } case Failure(e) => // Retry failure reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error trying to set lock, retry $currentRetry of $maxErrorsToRetryOnRemote", e) - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + logger.warn(s"remoteSetOrGet, key $key: got error trying to set lock, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) + after(backoffOnError, scheduler) { + remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + } } } } diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index d602a736..c5b81e8c 100644 --- 
a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -1,5 +1,6 @@ package ignition.core.cache +import akka.actor.ActorSystem import ignition.core.cache.ExpiringMultipleLevelCache.TimestampedValue import org.scalatest.{FlatSpec, Matchers} import spray.caching.ExpiringLruLocalCache @@ -10,15 +11,17 @@ import scala.concurrent.{Await, Future} class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers { case class Data(s: String) + implicit val scheduler = ActorSystem().scheduler + "ExpiringMultipleLevelCache" should "calculate a value on cache miss and return it" in { val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultipleLevelCache[Data](1.minute, local) + val cache = ExpiringMultipleLevelCache[Data](1.minute, Option(local)) Await.result(cache("key", () => Future.successful(Data("success"))), 1.minute) shouldBe Data("success") } it should "calculate a value on cache miss and return a failed future of the calculation" in { val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultipleLevelCache[Data](1.minute, local) + val cache = ExpiringMultipleLevelCache[Data](1.minute, Option(local)) class MyException(s: String) extends Exception(s) From 5b3cfa0528342f3d651504d6421d9f65026313a0 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 1 Nov 2016 18:07:27 -0200 Subject: [PATCH 128/268] Added set method --- .../cache/ExpiringMultipleLevelCache.scala | 59 +++++++++++++++++-- .../spray/cache/ExpiringLruLocalCache.scala | 6 -- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index d4147761..2fbe6a48 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -44,9 +44,10 @@ object ExpiringMultipleLevelCache { } def apply(key: String, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] + def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] } - trait LocalCache[V] extends GenericCache[V] { + trait LocalCache[V] { def get(key: Any): Option[Future[V]] def set(key: Any, value: V): Unit } @@ -219,13 +220,57 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, result } + // This should be used carefully because it will overwrite the remote value without + // any lock, which may cause a desynchronization between the local and remote cache on other instances + // Note that if any tryGenerateAndSet is in progress, this will wait until it's finished before setting local/remote + override def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] = { + logger.info(s"set, key $key: got a call to overwrite local and remote values") + val startTime = System.nanoTime() + val promise = Promise[TimestampedValue[V]]() + val future = promise.future + def doIt() = { + val tValue = timestamp(value) + localCache.foreach(_.set(key, tValue)) + val result = remoteRW.map(remote => remoteOverwrite(key, tValue, remote, startTime)).getOrElse(Future.successful(tValue)) + promise.completeWith(result) + tempUpdate.remove(key, future) + } + tempUpdate.put(key, future) match { + case null => + 
doIt() + future.map(_ => ()) + case fTrying => + fTrying.onComplete { case _ => doIt() } + future.map(_ => ()) + } + } + + // Overwrite remote value without lock, retrying on error + private def remoteOverwrite(key: String, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + remote.set(key, calculatedValue).asTry().flatMap { + case Success(_) => + reporter.onSuccessfullyRemoteSetValue(key, elapsedTime(nanoStartTime)) + logger.info(s"remoteForceSet successfully overwritten key $key") + Future.successful(calculatedValue) + case Failure(e) => + reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) + logger.warn(s"remoteForceSet, key $key: got error setting the value, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) + // Retry failure + after(backoffOnError, scheduler) { + remoteOverwrite(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) + } + } + } + + // Note: this method may return a failed future, but it will never cache it // Our main purpose here is to avoid multiple local calls to generate new promises/futures in parallel, // so we use this Map keep everyone in sync // This is similar to how spray cache works private def tryGenerateAndSet(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { val promise = Promise[TimestampedValue[V]]() - tempUpdate.putIfAbsent(key, promise.future) match { + val future = promise.future + tempUpdate.putIfAbsent(key, future) match { case null => logger.info(s"tryGenerateAndSet, key $key: got request for generating and none in progress found, calling canonicalValueGenerator") canonicalValueGenerator(key, genValue, nanoStartTime).onComplete { @@ -233,22 +278,22 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, reporter.onGeneratedWithSuccess(key, 
elapsedTime(nanoStartTime)) localCache.foreach(_.set(key, v)) promise.trySuccess(v) - tempUpdate.remove(key) + tempUpdate.remove(key, future) case Success(v) => // Have we generated/got an expired value!? reporter.onUnexpectedBehaviour(key, elapsedTime(nanoStartTime)) logger.warn(s"tryGenerateAndSet, key $key: unexpectedly generated/got an expired value: $v") localCache.foreach(_.set(key, v)) promise.trySuccess(v) - tempUpdate.remove(key) + tempUpdate.remove(key, future) case Failure(e) => // We don't save failures to cache // There is no need to log here, canonicalValueGenerator will log everything already reporter.onGeneratedWithFailure(key, e, elapsedTime(nanoStartTime)) promise.tryFailure(e) - tempUpdate.remove(key) + tempUpdate.remove(key, future) } - promise.future + future case fTrying => // If someone call us while a future is running, we return the running future logger.info(s"tryGenerateAndSet, key $key: got request for generating but an existing one is current in progress") @@ -406,4 +451,6 @@ case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, } } } + + } \ No newline at end of file diff --git a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala index 8c403be9..ac7f6e42 100644 --- a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala +++ b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala @@ -117,12 +117,6 @@ final class ExpiringLruLocalCache[V](maxCapacity: Long, oldEntry.promise.trySuccess(value) } } - - // Method required by ExpiringMultipleLevelCache.LocalCache - override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { - val sprayCache: Cache[V] = this - sprayCache.apply(key, genValue) - } } private[caching] class Entry[T](val promise: Promise[T]) { From 6b450cbf3931f26d765119cd8a7c6fe179c65be3 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 1 Nov 2016 19:09:03 -0200 Subject: [PATCH 129/268] Rename ExpiringMultipleLevelCache to ExpiringMultiLevelCache --- ...he.scala => ExpiringMultiLevelCache.scala} | 20 +++++++++---------- .../spray/cache/ExpiringLruLocalCache.scala | 4 ++-- .../cache/ExpiringMultipleLevelCache.scala | 6 +++--- 3 files changed, 15 insertions(+), 15 deletions(-) rename src/main/scala/ignition/core/cache/{ExpiringMultipleLevelCache.scala => ExpiringMultiLevelCache.scala} (96%) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala similarity index 96% rename from src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala rename to src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 2fbe6a48..31ebb015 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -16,7 +16,7 @@ import scala.concurrent.{ExecutionContext, Future, Promise} import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} -object ExpiringMultipleLevelCache { +object ExpiringMultiLevelCache { case class TimestampedValue[V](date: DateTime, value: V) { def hasExpired(ttl: FiniteDuration, now: DateTime): Boolean = { date.plus(ttl.toMillis).isBefore(now) @@ -106,17 +106,17 @@ object ExpiringMultipleLevelCache { } -import ignition.core.cache.ExpiringMultipleLevelCache._ +import ignition.core.cache.ExpiringMultiLevelCache._ -case class ExpiringMultipleLevelCache[V](ttl: FiniteDuration, - localCache: Option[LocalCache[TimestampedValue[V]]], - remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, - remoteLockTTL: FiniteDuration = 5.seconds, - reporter: ExpiringMultipleLevelCache.ReporterCallback = ExpiringMultipleLevelCache.NoOpReporter, - maxErrorsToRetryOnRemote: Int = 5, - backoffOnLockAcquire: FiniteDuration = 50.milliseconds, - backoffOnError: FiniteDuration = 
50.milliseconds)(implicit scheduler: Scheduler) extends GenericCache[V] { +case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, + localCache: Option[LocalCache[TimestampedValue[V]]], + remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, + remoteLockTTL: FiniteDuration = 5.seconds, + reporter: ExpiringMultiLevelCache.ReporterCallback = ExpiringMultiLevelCache.NoOpReporter, + maxErrorsToRetryOnRemote: Int = 5, + backoffOnLockAcquire: FiniteDuration = 50.milliseconds, + backoffOnError: FiniteDuration = 50.milliseconds)(implicit scheduler: Scheduler) extends GenericCache[V] { private val logger = LoggerFactory.getLogger(getClass) diff --git a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala index ac7f6e42..33d2b4d9 100644 --- a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala +++ b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala @@ -20,7 +20,7 @@ package spray.caching import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap -import ignition.core.cache.ExpiringMultipleLevelCache +import ignition.core.cache.ExpiringMultiLevelCache import scala.annotation.tailrec import scala.collection.JavaConverters._ @@ -32,7 +32,7 @@ import spray.util.Timestamp final class ExpiringLruLocalCache[V](maxCapacity: Long, initialCapacity: Int = 16, timeToLive: Duration = Duration.Inf, - timeToIdle: Duration = Duration.Inf) extends Cache[V] with ignition.core.cache.ExpiringMultipleLevelCache.LocalCache[V] { + timeToIdle: Duration = Duration.Inf) extends Cache[V] with ignition.core.cache.ExpiringMultiLevelCache.LocalCache[V] { require(!timeToLive.isFinite || !timeToIdle.isFinite || timeToLive > timeToIdle, s"timeToLive($timeToLive) must be greater than timeToIdle($timeToIdle)") diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index c5b81e8c..c321f794 100644 --- 
a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -1,7 +1,7 @@ package ignition.core.cache import akka.actor.ActorSystem -import ignition.core.cache.ExpiringMultipleLevelCache.TimestampedValue +import ignition.core.cache.ExpiringMultiLevelCache.TimestampedValue import org.scalatest.{FlatSpec, Matchers} import spray.caching.ExpiringLruLocalCache @@ -15,13 +15,13 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers { "ExpiringMultipleLevelCache" should "calculate a value on cache miss and return it" in { val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultipleLevelCache[Data](1.minute, Option(local)) + val cache = ExpiringMultiLevelCache[Data](1.minute, Option(local)) Await.result(cache("key", () => Future.successful(Data("success"))), 1.minute) shouldBe Data("success") } it should "calculate a value on cache miss and return a failed future of the calculation" in { val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultipleLevelCache[Data](1.minute, Option(local)) + val cache = ExpiringMultiLevelCache[Data](1.minute, Option(local)) class MyException(s: String) extends Exception(s) From b421352ccf422a2a577fb0b2509ac3b29662ee78 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 1 Nov 2016 20:19:17 -0200 Subject: [PATCH 130/268] Better put the scheduler on each method than on constructor --- .../core/cache/ExpiringMultiLevelCache.scala | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 31ebb015..be6b6c49 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -32,19 +32,19 @@ object ExpiringMultiLevelCache { * Returns either the cached Future for the key or evaluates the given call-by-name argument * which produces either a value instance of type `V` or a `Future[V]`. */ - def apply(magnet: ⇒ ValueMagnet[V])(implicit ec: ExecutionContext): Future[V] = + def apply(magnet: ⇒ ValueMagnet[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = cache.apply(key, () ⇒ try magnet.future catch { case NonFatal(e) ⇒ Future.failed(e) }) /** * Returns either the cached Future for the key or evaluates the given function which * should lead to eventual completion of the promise. 
*/ - def apply[U](f: Promise[V] ⇒ U)(implicit ec: ExecutionContext): Future[V] = + def apply[U](f: Promise[V] ⇒ U)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = cache.apply(key, () ⇒ { val p = Promise[V](); f(p); p.future }) } - def apply(key: String, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] - def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] + def apply(key: String, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] + def set(key: String, value: V)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Unit] } trait LocalCache[V] { @@ -53,12 +53,12 @@ object ExpiringMultiLevelCache { } trait RemoteWritableCache[V] { - def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] - def setLock(key: String, ttl: FiniteDuration)(implicit ec: ExecutionContext): Future[Boolean] + def set(key: String, value: V)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Unit] + def setLock(key: String, ttl: FiniteDuration)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Boolean] } trait RemoteReadableCache[V] { - def get(key: String)(implicit ec: ExecutionContext): Future[Option[V]] + def get(key: String)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Option[V]] } trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] @@ -116,7 +116,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, reporter: ExpiringMultiLevelCache.ReporterCallback = ExpiringMultiLevelCache.NoOpReporter, maxErrorsToRetryOnRemote: Int = 5, backoffOnLockAcquire: FiniteDuration = 50.milliseconds, - backoffOnError: FiniteDuration = 50.milliseconds)(implicit scheduler: Scheduler) extends GenericCache[V] { + backoffOnError: FiniteDuration = 50.milliseconds) extends GenericCache[V] { private val logger = LoggerFactory.getLogger(getClass) @@ -136,7 +136,7 @@ case class ExpiringMultiLevelCache[V](ttl: 
FiniteDuration, // The idea is simple, have two caches: remote and local // with values that will eventually expire but still be left on the cache // while a new value is asynchronously being calculated/retrieved - override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { + override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { // The local cache is always the first try. We'll only look the remote if the local value is missing or has expired val startTime = System.nanoTime() val result = localCache.flatMap(_.get(key).map(_.asTry())) match { @@ -223,7 +223,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // This should be used carefully because it will overwrite the remote value without // any lock, which may cause a desynchronization between the local and remote cache on other instances // Note that if any tryGenerateAndSet is in progress, this will wait until it's finished before setting local/remote - override def set(key: String, value: V)(implicit ec: ExecutionContext): Future[Unit] = { + override def set(key: String, value: V)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Unit] = { logger.info(s"set, key $key: got a call to overwrite local and remote values") val startTime = System.nanoTime() val promise = Promise[TimestampedValue[V]]() @@ -246,7 +246,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, } // Overwrite remote value without lock, retrying on error - private def remoteOverwrite(key: String, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + private def remoteOverwrite(key: String, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: 
Scheduler): Future[TimestampedValue[V]] = { remote.set(key, calculatedValue).asTry().flatMap { case Success(_) => reporter.onSuccessfullyRemoteSetValue(key, elapsedTime(nanoStartTime)) @@ -267,7 +267,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Our main purpose here is to avoid multiple local calls to generate new promises/futures in parallel, // so we use this Map keep everyone in sync // This is similar to how spray cache works - private def tryGenerateAndSet(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + private def tryGenerateAndSet(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { val promise = Promise[TimestampedValue[V]]() val future = promise.future tempUpdate.putIfAbsent(key, future) match { @@ -306,7 +306,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // the remote cache and read by the other instances // Unless of course there is some error getting stuff from remote cache // in which case the locally generated value may be returned to avoid further delays - protected def canonicalValueGenerator(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext) = { + protected def canonicalValueGenerator(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler) = { val fGeneratedValue = Try { genValue().map(timestamp) }.asFutureTry() val finalValue: Future[TimestampedValue[V]] = fGeneratedValue.flatMap { case Success(generatedValue) => @@ -345,7 +345,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, private def remoteGetNonExpiredValue(key: String, remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, - currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + currentRetry: Int = 0)(implicit ec: 
ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => logger.info(s"remoteGetNonExpiredValue, key $key: got a good value") @@ -374,7 +374,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, - currentRetry: Int = 0)(implicit ec: ExecutionContext): Future[TimestampedValue[V]] = { + currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { if (currentRetry > maxErrorsToRetryOnRemote) { // Use our calculated value as it's the best we can do reporter.onRemoteGiveUp(key, elapsedTime(nanoStartTime)) From 1b1ad65bab256ab579b59e9e31f837c8e43b9312 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 3 Nov 2016 18:21:10 -0200 Subject: [PATCH 131/268] Added sanity check feature --- .../core/cache/ExpiringMultiLevelCache.scala | 51 +++++++++++++++++-- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index be6b6c49..911a0e6a 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -7,7 +7,7 @@ import akka.pattern.after import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap import ignition.core.utils.DateUtils._ import ignition.core.utils.FutureUtils._ -import org.joda.time.DateTime +import org.joda.time.{DateTime, DateTimeZone} import org.slf4j.LoggerFactory import spray.caching.ValueMagnet @@ -81,6 +81,7 @@ object ExpiringMultiLevelCache { def onLocalError(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit def onRemoteError(key: String, t: Throwable, elapsedTime: FiniteDuration): Unit def onRemoteGiveUp(key: 
String, elapsedTime: FiniteDuration): Unit + def onSanityLocalValueCheckFailedResult(key: String, result: String, elapsedTime: FiniteDuration): Unit } object NoOpReporter extends ReporterCallback { @@ -101,7 +102,7 @@ object ExpiringMultiLevelCache { override def onCompletedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit = {} override def onGeneratedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} override def onGeneratedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit = {} - + override def onSanityLocalValueCheckFailedResult(key: String, result: String, elapsedTime: FiniteDuration): Unit = {} } } @@ -116,7 +117,8 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, reporter: ExpiringMultiLevelCache.ReporterCallback = ExpiringMultiLevelCache.NoOpReporter, maxErrorsToRetryOnRemote: Int = 5, backoffOnLockAcquire: FiniteDuration = 50.milliseconds, - backoffOnError: FiniteDuration = 50.milliseconds) extends GenericCache[V] { + backoffOnError: FiniteDuration = 50.milliseconds, + sanityLocalValueCheck: Boolean = false) extends GenericCache[V] { private val logger = LoggerFactory.getLogger(getClass) @@ -145,7 +147,11 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case Success(localValue) if !localValue.hasExpired(ttl, now) => // We have locally a good value, just return it reporter.onLocalCacheHit(key, elapsedTime(startTime)) - Future.successful(localValue.value) + // But if we're paranoid, let's check if the local value is consistent with remote + if (sanityLocalValueCheck) + remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, startTime)).getOrElse(Future.successful(localValue.value)) + else + Future.successful(localValue.value) case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { @@ -245,6 +251,43 @@ case class ExpiringMultiLevelCache[V](ttl: 
FiniteDuration, } } + private def sanityLocalValueCheck(key: String, localValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], startTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { + remote.get(key).asTry().flatMap { + case Success(Some(remoteValue)) if remoteValue == localValue => + // Remote is the same as local, return any of them + Future.successful(remoteValue.value) + case Success(Some(remoteValue)) => + // Something is different, try to figure it out + val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" + val dateResult = if (remoteValue.date.isAfter(localValue.date)) + "remote-is-older-than-local" + else if (localValue.date.isAfter(remoteValue.date)) + "local-is-older-than-remote" + else if (localValue.date.isEqual(localValue.date)) + "same-date" + else if (localValue.date.withZone(DateTimeZone.UTC).isEqual(localValue.date.withZone(DateTimeZone.UTC))) + "same-date-on-utc" + else + "impossible-dates" + val remoteExpired = remoteValue.hasExpired(ttl, now) + val localExpired = localValue.hasExpired(ttl, now) + val finalResult = s"$valuesResult-$dateResult-remote-expired-${remoteExpired}-local-expired-${localExpired}" + logger.warn(s"sanityLocalValueCheck, key $key: got different results for local $localValue and remote $remoteValue ($finalResult)") + reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) + // return remote to keep everyone consistent + Future.successful(remoteValue.value) + case Success(None) => + val localExpired = localValue.hasExpired(ttl, now) + val finalResult = s"missing-remote-local-expired-${localExpired}" + logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") + Future.successful(localValue.value) + case Failure(e) => + reporter.onRemoteError(key, e, elapsedTime(startTime)) + logger.warn(s"sanityLocalValueCheck, key: $key failed to get remote", e) + 
Future.successful(localValue.value) + } + } + // Overwrite remote value without lock, retrying on error private def remoteOverwrite(key: String, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { remote.set(key, calculatedValue).asTry().flatMap { From 1f7dbbf749647849e1e6cbe45debf64ca9a06866 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Fri, 4 Nov 2016 17:37:00 -0200 Subject: [PATCH 132/268] Improve sanity check and use UTC dates on timestamped values --- .../ignition/core/cache/ExpiringMultiLevelCache.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 911a0e6a..d20a3c4f 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -7,7 +7,7 @@ import akka.pattern.after import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap import ignition.core.utils.DateUtils._ import ignition.core.utils.FutureUtils._ -import org.joda.time.{DateTime, DateTimeZone} +import org.joda.time.{DateTime, DateTimeZone, Interval} import org.slf4j.LoggerFactory import spray.caching.ValueMagnet @@ -126,7 +126,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, .maximumWeightedCapacity(Long.MaxValue) .build() - protected def now = DateTime.now + protected def now = DateTime.now.withZone(DateTimeZone.UTC) private def timestamp(v: V) = TimestampedValue(now, v) @@ -257,12 +257,14 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is the same as local, return any of them Future.successful(remoteValue.value) case Success(Some(remoteValue)) => + def datesAreClose(date1: DateTime, date2: DateTime): Boolean = 
Math.abs(new Interval(date1, date2).toDurationMillis) <= 5000 // Something is different, try to figure it out val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" + val closeDatesSuffix = if (datesAreClose(remoteValue.date, localValue.date)) "-but-close-dates" else "" val dateResult = if (remoteValue.date.isAfter(localValue.date)) - "remote-is-older-than-local" + s"remote-is-newer-than-local$closeDatesSuffix" else if (localValue.date.isAfter(remoteValue.date)) - "local-is-older-than-remote" + s"local-is-newer-than-remote$closeDatesSuffix" else if (localValue.date.isEqual(localValue.date)) "same-date" else if (localValue.date.withZone(DateTimeZone.UTC).isEqual(localValue.date.withZone(DateTimeZone.UTC))) From daf6eab6966b5436f4d0fefc156bc614d5d0291e Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Mon, 7 Nov 2016 18:05:19 -0200 Subject: [PATCH 133/268] Fallback on remote not found while in sanity check --- .../ignition/core/cache/ExpiringMultiLevelCache.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index d20a3c4f..472b88b7 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -149,7 +149,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, reporter.onLocalCacheHit(key, elapsedTime(startTime)) // But if we're paranoid, let's check if the local value is consistent with remote if (sanityLocalValueCheck) - remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, startTime)).getOrElse(Future.successful(localValue.value)) + remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(Future.successful(localValue.value)) else Future.successful(localValue.value) case 
Success(expiredLocalValue) if remoteRW.nonEmpty => @@ -251,7 +251,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, } } - private def sanityLocalValueCheck(key: String, localValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], startTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { + private def sanityLocalValueCheck(key: String, localValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], genValue: () => Future[V], startTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if remoteValue == localValue => // Remote is the same as local, return any of them @@ -282,7 +282,9 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, val localExpired = localValue.hasExpired(ttl, now) val finalResult = s"missing-remote-local-expired-${localExpired}" logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") - Future.successful(localValue.value) + reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) + // Try generate it to keep a behaviour equivalent to remote only + tryGenerateAndSet(key, genValue, startTime).map(_.value) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"sanityLocalValueCheck, key: $key failed to get remote", e) From b6fbd2195e99e4891eb00d4830cfa9655d36871d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 7 Nov 2016 18:05:57 -0200 Subject: [PATCH 134/268] suport for x1 instance type --- tools/spark-ec2/spark_ec2.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index b1f4e709..909c284c 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -478,6 +478,8 @@ def get_validate_spark_version(version, repo): "t2.small": "hvm", "t2.medium": "hvm", "t2.large": "hvm", + 
"x1.16xlarge": "hvm", + "x1.32xlarge": "hvm", } @@ -1134,6 +1136,9 @@ def get_num_disks(instance_type): "t2.small": 0, "t2.medium": 0, "t2.large": 0, + "x1.16xlarge": 1, + "x1.32xlarge": 2, + } if instance_type in disks_by_instance: return disks_by_instance[instance_type] From aa55e1140d42c4b534c9f150645a25ca71985ab4 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 8 Nov 2016 18:48:50 -0200 Subject: [PATCH 135/268] rename private Entry class to avoid assembly issues --- src/main/scala/spray/cache/ExpiringLruLocalCache.scala | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala index 33d2b4d9..9fa476f9 100644 --- a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala +++ b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala @@ -20,14 +20,13 @@ package spray.caching import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap -import ignition.core.cache.ExpiringMultiLevelCache +import spray.util.Timestamp import scala.annotation.tailrec import scala.collection.JavaConverters._ import scala.concurrent.duration.Duration import scala.concurrent.{ExecutionContext, Future, Promise} -import scala.util.{Failure, Success, Try} -import spray.util.Timestamp +import scala.util.{Failure, Success} final class ExpiringLruLocalCache[V](maxCapacity: Long, initialCapacity: Int = 16, @@ -119,7 +118,7 @@ final class ExpiringLruLocalCache[V](maxCapacity: Long, } } -private[caching] class Entry[T](val promise: Promise[T]) { +private[caching] class ExpiringLruLocalCacheEntry[T](val promise: Promise[T]) { @volatile var created = Timestamp.now @volatile var lastAccessed = Timestamp.now def future = promise.future From 9b08575fc097ac00625c60d8bccd75363422f5be Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Fri, 11 Nov 2016 21:58:27 -0200 Subject: [PATCH 136/268] Remove close dates report because it is too much trouble for nothing --- .../scala/ignition/core/cache/ExpiringMultiLevelCache.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 472b88b7..138d6cbd 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -257,14 +257,12 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is the same as local, return any of them Future.successful(remoteValue.value) case Success(Some(remoteValue)) => - def datesAreClose(date1: DateTime, date2: DateTime): Boolean = Math.abs(new Interval(date1, date2).toDurationMillis) <= 5000 // Something is different, try to figure it out val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" - val closeDatesSuffix = if (datesAreClose(remoteValue.date, localValue.date)) "-but-close-dates" else "" val dateResult = if (remoteValue.date.isAfter(localValue.date)) - s"remote-is-newer-than-local$closeDatesSuffix" + s"remote-is-newer-than-local" else if (localValue.date.isAfter(remoteValue.date)) - s"local-is-newer-than-remote$closeDatesSuffix" + s"local-is-newer-than-remote" else if (localValue.date.isEqual(localValue.date)) "same-date" else if (localValue.date.withZone(DateTimeZone.UTC).isEqual(localValue.date.withZone(DateTimeZone.UTC))) From 64c65a6af8756b8a0fbdf3b354f2b1cc0e8364e5 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 17 Nov 2016 23:43:18 -0200 Subject: [PATCH 137/268] Smart and lazy s3 list --- .../core/jobs/utils/SparkContextUtils.scala | 213 +++++++++--------- 1 file changed, 112 insertions(+), 101 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index dddd51a6..b1994e29 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -24,6 +24,7 @@ import scala.reflect.ClassTag import scala.util.{Failure, Success, Try} import scala.util.control.NonFatal import ignition.core.utils.ExceptionUtils._ +import ignition.core.utils.CollectionUtils._ import org.slf4j.LoggerFactory @@ -47,8 +48,24 @@ object SparkContextUtils { } } + object S3SplittedPath { + val s3Pattern = "s3[an]?://([^/]+)(.+)".r + + def from(fullPath: String): Option[S3SplittedPath] = + fullPath match { + case s3Pattern(bucket, prefix) => Option(S3SplittedPath(bucket, prefix.dropWhile(_ == '/'))) + case _ => None + } + } + + case class S3SplittedPath(bucket: String, key: String) { + def join: String = s"s3a://$bucket/$key" + } + case class HadoopFile(path: String, isDir: Boolean, size: Long) + case class WithOptDate[E](date: Option[DateTime], value: E) + implicit class SparkContextImprovements(sc: SparkContext) { private lazy val logger = LoggerFactory.getLogger(getClass) @@ -353,15 +370,6 @@ object SparkContextUtils { union } - def parallelListAndReadTextFiles(paths: List[String], - maxBytesPerPartition: Long, - minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) - (implicit dateExtractor: PathDateExtractor): RDD[String] = { - val foundFiles = paths.flatMap(smartList(_)) - parallelReadTextFiles(foundFiles, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, sizeBasedFileHandling = sizeBasedFileHandling) - } - def 
parallelReadTextFiles(files: List[HadoopFile], maxBytesPerPartition: Long = 128 * 1000 * 1000, minPartitions: Int = 100, @@ -446,38 +454,42 @@ object SparkContextUtils { innerListFiles(List(HadoopFile(path, isDir = true, 0))) } - def s3ListCommonPrefixes(bucket: String, prefix: String, delimiter: String = "/") - (implicit s3: AmazonS3Client): Stream[String] = { + def s3ListCommonPrefixes(path: S3SplittedPath, delimiter: String = "/") + (implicit s3: AmazonS3Client): Stream[S3SplittedPath] = { def inner(current: ObjectListing): Stream[String] = - if (current.isTruncated) + if (current.isTruncated) { + logger.trace(s"list common prefixed truncated for ${path.bucket} ${path.key}: ${current.getCommonPrefixes}") current.getCommonPrefixes.toStream ++ inner(s3.listNextBatchOfObjects(current)) - else + } else { + logger.trace(s"list common prefixed finished for ${path.bucket} ${path.key}: ${current.getCommonPrefixes}") current.getCommonPrefixes.toStream + } - val request = new ListObjectsRequest(bucket, prefix, null, delimiter, 1000) - inner(s3.listObjects(request)) + val request = new ListObjectsRequest(path.bucket, path.key, null, delimiter, 1000) + inner(s3.listObjects(request)).map(prefix => path.copy(key = prefix)) } - def s3ListObjects(bucket: String, prefix: String) + def s3ListObjects(path: S3SplittedPath) (implicit s3: AmazonS3Client): Stream[S3ObjectSummary] = { def inner(current: ObjectListing): Stream[S3ObjectSummary] = - if (current.isTruncated) + if (current.isTruncated) { + logger.trace(s"list objects truncated for ${path.bucket} ${path.key}: $current") current.getObjectSummaries.toStream ++ inner(s3.listNextBatchOfObjects(current)) - else + } else { + logger.trace(s"list objects finished for ${path.bucket} ${path.key}") current.getObjectSummaries.toStream + } - inner(s3.listObjects(bucket, prefix)) + inner(s3.listObjects(path.bucket, path.key)) } - def s3NarrowPaths(bucket: String, - prefix: String, - delimiter: String = "/", + def 
s3NarrowPaths(splittedPath: S3SplittedPath, inclusiveStartDate: Boolean = true, startDate: Option[DateTime] = None, inclusiveEndDate: Boolean = true, endDate: Option[DateTime] = None, ignoreHours: Boolean = true) - (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): Stream[String] = { + (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): Stream[WithOptDate[S3SplittedPath]] = { def isGoodDate(date: DateTime): Boolean = { val startDateToCompare = startDate.map(date => if (ignoreHours) date.withTimeAtStartOfDay() else date) @@ -487,49 +499,54 @@ object SparkContextUtils { goodStartDate && goodEndDate } - def classifyPath(path: String): Either[String, (String, DateTime)] = - Try(pathDateExtractor.extractFromPath(s"s3a://$bucket/$path")) match { + def classifyPath(path: S3SplittedPath): Either[S3SplittedPath, (S3SplittedPath, DateTime)] = + Try(pathDateExtractor.extractFromPath(path.join)) match { case Success(date) => Right(path -> date) case Failure(_) => Left(path) } - val commonPrefixes = s3ListCommonPrefixes(bucket, prefix, delimiter).map(classifyPath) + val commonPrefixes = s3ListCommonPrefixes(splittedPath).map(classifyPath) + logger.trace(s"s3NarrowPaths for $splittedPath, common prefixes: $commonPrefixes") if (commonPrefixes.isEmpty) - Stream(s"s3a://$bucket/$prefix") + Stream(WithOptDate(None, splittedPath)) else commonPrefixes.toStream.flatMap { - case Left(prefixWithoutDate) => s3NarrowPaths(bucket, prefixWithoutDate, delimiter, inclusiveStartDate, startDate, inclusiveEndDate, endDate, ignoreHours) - case Right((prefixWithDate, date)) if isGoodDate(date) => Stream(s"s3a://$bucket/$prefixWithDate") + case Left(prefixWithoutDate) => + logger.trace(s"s3NarrowPaths prefixWithoutDate: $prefixWithoutDate") + s3NarrowPaths(prefixWithoutDate, inclusiveStartDate, startDate, inclusiveEndDate, endDate, ignoreHours) + case Right((prefixWithDate, date)) if isGoodDate(date) => Stream(WithOptDate(Option(date), prefixWithDate)) case 
Right(_) => Stream.empty } } - private def s3List(path: String, - inclusiveStartDate: Boolean, - startDate: Option[DateTime], - inclusiveEndDate: Boolean, - endDate: Option[DateTime], - exclusionPattern: Option[String]) - (implicit s3: AmazonS3Client, dateExtractor: PathDateExtractor): Stream[S3ObjectSummary] = { + // Sorted from most recent to least recent path + private def sortPaths[P](paths: Stream[WithOptDate[P]]): Stream[WithOptDate[P]] = { + paths.sortBy { p => p.date.getOrElse(new DateTime(1970, 1, 1, 1, 1)) }(Ordering[DateTime].reverse) + } - val s3Pattern = "s3[an]?://([^/]+)(.+)".r + private def sortedS3List(path: String, + inclusiveStartDate: Boolean, + startDate: Option[DateTime], + inclusiveEndDate: Boolean, + endDate: Option[DateTime], + exclusionPattern: Option[String]) + (implicit s3: AmazonS3Client, dateExtractor: PathDateExtractor): Stream[WithOptDate[Array[S3ObjectSummary]]] = { - def extractBucketAndPrefix(path: String): Option[(String, String)] = path match { - case s3Pattern(bucket, prefix) => Option(bucket -> prefix.dropWhile(_ == '/')) - case _ => None - } - extractBucketAndPrefix(path) match { - case Some((pathBucket, pathPrefix)) => - s3NarrowPaths(pathBucket, pathPrefix, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, - startDate = startDate, endDate = endDate).flatMap(extractBucketAndPrefix).flatMap { - case (bucket, prefix) => s3ListObjects(bucket, prefix) - } + S3SplittedPath.from(path) match { + case Some(splittedPath) => + val prefixes: Stream[WithOptDate[S3SplittedPath]] = + s3NarrowPaths(splittedPath, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, + startDate = startDate, endDate = endDate) + + sortPaths(prefixes) + .map { case WithOptDate(date, path) => WithOptDate(date, s3ListObjects(path).toArray) } // Will list the most recent path first and only if needed the others case _ => Stream.empty } } + def listAndFilterFiles(path: String, requireSuccess: Boolean = false, 
inclusiveStartDate: Boolean = true, @@ -546,85 +563,79 @@ object SparkContextUtils { def isSuccessFile(file: HadoopFile): Boolean = file.path.endsWith("_SUCCESS") || file.path.endsWith("_FINISHED") - def extractDateFromFile(file: HadoopFile): Option[DateTime] = - Try(dateExtractor.extractFromPath(file.path)).toOption + def excludePatternValidation(file: HadoopFile): Boolean = + exclusionPattern.map(pattern => !file.path.matches(pattern)).getOrElse(true) - def excludePatternValidation(file: HadoopFile): Option[HadoopFile] = - exclusionPattern match { - case Some(pattern) if file.path.matches(pattern) => None - case Some(_) | None => Option(file) - } + def endsWithValidation(file: HadoopFile): Boolean = + endsWith.map { pattern => + file.path.endsWith(pattern) || isSuccessFile(file) + }.getOrElse(true) - def endsWithValidation(file: HadoopFile): Option[HadoopFile] = - endsWith match { - case Some(pattern) if file.path.endsWith(pattern) => Option(file) - case Some(_) if isSuccessFile(file) => Option(file) - case Some(_) => None - case None => Option(file) - } - - def applyPredicate(file: HadoopFile): Option[HadoopFile] = - if (predicate(file)) Option(file) else None - - def dateValidation(file: HadoopFile): Option[HadoopFile] = { - val tryDate = extractDateFromFile(file) + def dateValidation(tryDate: Option[DateTime]): Boolean = { if (tryDate.isEmpty && ignoreMalformedDates) - Option(file) + true else { val date = tryDate.get val goodStartDate = startDate.isEmpty || (inclusiveStartDate && date.saneEqual(startDate.get) || date.isAfter(startDate.get)) - val goodEndDate = endDate.isEmpty || (inclusiveEndDate && date.saneEqual(endDate.get) || date.isBefore(endDate.get)) - if (goodStartDate && goodEndDate) Option(file) else None + def goodEndDate = endDate.isEmpty || (inclusiveEndDate && date.saneEqual(endDate.get) || date.isBefore(endDate.get)) + goodStartDate && goodEndDate } } - val preValidations: HadoopFile => Boolean = hadoopFile => { - val validatedFile = for { - _ 
<- excludePatternValidation(hadoopFile) - _ <- endsWithValidation(hadoopFile) - _ <- dateValidation(hadoopFile) - valid <- applyPredicate(hadoopFile) - } yield valid - validatedFile.isDefined + def successFileValidation(files: WithOptDate[Array[HadoopFile]]): Boolean = { + if (requireSuccess) + files.value.exists(isSuccessFile) + else + true } - val preFilteredFiles = smartList(path, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, - startDate = startDate, endDate = endDate, exclusionPattern = exclusionPattern).filter(preValidations) - - val filesByDate = preFilteredFiles.groupBy(extractDateFromFile).collect { - case (date, files) => date.getOrElse(new DateTime(1970, 1, 1, 1, 1)) -> files + def preValidations(files: WithOptDate[Array[HadoopFile]]): Option[WithOptDate[Array[HadoopFile]]] = { + if (!dateValidation(files.date) || !successFileValidation(files)) + None + else { + val filtered = files.copy(value = files.value + .filter(excludePatternValidation).filter(endsWithValidation).filter(predicate)) + if (filtered.value.isEmpty) + None + else + Option(filtered) + } } - val posFilteredFiles = - if (requireSuccess) - filesByDate.filter { case (_, files) => files.exists(isSuccessFile) } - else - filesByDate + val groupedAndSortedByDateFiles = sortedSmartList(path, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, + startDate = startDate, endDate = endDate, exclusionPattern = exclusionPattern).flatMap(preValidations) val allFiles = if (lastN.isDefined) - posFilteredFiles.toList.sortBy(_._1).reverse.take(lastN.get).flatMap(_._2) + groupedAndSortedByDateFiles.take(lastN.get).flatMap(_.value) else - posFilteredFiles.toList.flatMap(_._2) + groupedAndSortedByDateFiles.flatMap(_.value) - allFiles.sortBy(_.path) + allFiles.sortBy(_.path).toList } - def smartList(path: String, - inclusiveStartDate: Boolean = false, - startDate: Option[DateTime] = None, - inclusiveEndDate: Boolean = false, - endDate: Option[DateTime] = 
None, - exclusionPattern: Option[String] = None)(implicit pathDateExtractor: PathDateExtractor): Stream[HadoopFile] = { + def sortedSmartList(path: String, + inclusiveStartDate: Boolean = false, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = false, + endDate: Option[DateTime] = None, + exclusionPattern: Option[String] = None)(implicit pathDateExtractor: PathDateExtractor): Stream[WithOptDate[Array[HadoopFile]]] = { def toHadoopFile(s3Object: S3ObjectSummary): HadoopFile = HadoopFile(s"s3a://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) - def listPath(path: String): Stream[HadoopFile] = { + def listPath(path: String): Stream[WithOptDate[Array[HadoopFile]]] = { if (path.startsWith("s3")) { - s3List(path, inclusiveStartDate = inclusiveStartDate, startDate = startDate, inclusiveEndDate = inclusiveEndDate, - endDate = endDate, exclusionPattern = exclusionPattern)(amazonS3ClientFromEnvironmentVariables, pathDateExtractor).map(toHadoopFile) + sortedS3List(path, inclusiveStartDate = inclusiveStartDate, startDate = startDate, inclusiveEndDate = inclusiveEndDate, + endDate = endDate, exclusionPattern = exclusionPattern)(amazonS3ClientFromEnvironmentVariables, pathDateExtractor).map { + case WithOptDate(date, paths) => WithOptDate(date, paths.map(toHadoopFile).toArray) + } } else { - driverListFiles(path).toStream + val pathsWithDate: Stream[WithOptDate[Iterable[HadoopFile]]] = driverListFiles(path) + .map(p => (Try { pathDateExtractor.extractFromPath(p.path) }.toOption, p)) + .groupByKey() + .map { case (date, path) => WithOptDate(date, path) } + .toStream + sortPaths(pathsWithDate).map { case WithOptDate(date, paths) => WithOptDate(date, paths.toArray) } } } From 5881afe87e7d01f7576000b34b79025e4c5c40dc Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Sat, 19 Nov 2016 19:59:15 -0200 Subject: [PATCH 138/268] Removed custom boto because having 2 boto versions is confusing and unnecessary --- tools/spark-ec2/spark_ec2.py | 54 ------------------------------------ 1 file changed, 54 deletions(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 909c284c..79e81484 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -107,60 +107,6 @@ DEFAULT_SPARK_EC2_BRANCH = "branch-2.0" -def setup_external_libs(libs): - """ - Download external libraries from PyPI to SPARK_EC2_DIR/lib/ and prepend them to our PATH. - """ - PYPI_URL_PREFIX = "https://pypi.python.org/packages/source" - SPARK_EC2_LIB_DIR = os.path.join(SPARK_EC2_DIR, "lib") - - if not os.path.exists(SPARK_EC2_LIB_DIR): - print("Downloading external libraries that spark-ec2 needs from PyPI to {path}...".format( - path=SPARK_EC2_LIB_DIR - )) - print("This should be a one-time operation.") - os.mkdir(SPARK_EC2_LIB_DIR) - - for lib in libs: - versioned_lib_name = "{n}-{v}".format(n=lib["name"], v=lib["version"]) - lib_dir = os.path.join(SPARK_EC2_LIB_DIR, versioned_lib_name) - - if not os.path.isdir(lib_dir): - tgz_file_path = os.path.join(SPARK_EC2_LIB_DIR, versioned_lib_name + ".tar.gz") - print(" - Downloading {lib}...".format(lib=lib["name"])) - download_stream = urlopen( - "{prefix}/{first_letter}/{lib_name}/{lib_name}-{lib_version}.tar.gz".format( - prefix=PYPI_URL_PREFIX, - first_letter=lib["name"][:1], - lib_name=lib["name"], - lib_version=lib["version"] - ) - ) - with open(tgz_file_path, "wb") as tgz_file: - tgz_file.write(download_stream.read()) - with open(tgz_file_path, "rb") as tar: - if hashlib.md5(tar.read()).hexdigest() != lib["md5"]: - print("ERROR: Got wrong md5sum for {lib}.".format(lib=lib["name"]), file=stderr) - sys.exit(1) - tar = tarfile.open(tgz_file_path) - tar.extractall(path=SPARK_EC2_LIB_DIR) - tar.close() - os.remove(tgz_file_path) - print(" - Finished downloading 
{lib}.".format(lib=lib["name"])) - sys.path.insert(1, lib_dir) - - -# Only PyPI libraries are supported. -external_libs = [ - { - "name": "boto", - "version": "2.34.0", - "md5": "5556223d2d0cc4d06dd4829e671dcecd" - } -] - -setup_external_libs(external_libs) - import boto from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType from boto import ec2 From 002c38f9fd0ef013c3d8ba7f4fdd981029133645 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Sun, 20 Nov 2016 13:21:14 -0200 Subject: [PATCH 139/268] fix date filter --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index b1994e29..8fec24dd 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -574,12 +574,13 @@ object SparkContextUtils { def dateValidation(tryDate: Option[DateTime]): Boolean = { if (tryDate.isEmpty && ignoreMalformedDates) true - else { + else if (tryDate.isDefined) { val date = tryDate.get val goodStartDate = startDate.isEmpty || (inclusiveStartDate && date.saneEqual(startDate.get) || date.isAfter(startDate.get)) def goodEndDate = endDate.isEmpty || (inclusiveEndDate && date.saneEqual(endDate.get) || date.isBefore(endDate.get)) goodStartDate && goodEndDate - } + } else + false } def successFileValidation(files: WithOptDate[Array[HadoopFile]]): Boolean = { From 31ace49364059b7b2ec6e6d400de35841a06aef2 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Mon, 21 Nov 2016 17:08:20 -0200 Subject: [PATCH 140/268] Fix date validation for paths without files --- .../core/jobs/utils/SparkContextUtils.scala | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 8fec24dd..d301a0a8 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -509,7 +509,7 @@ object SparkContextUtils { logger.trace(s"s3NarrowPaths for $splittedPath, common prefixes: $commonPrefixes") if (commonPrefixes.isEmpty) - Stream(WithOptDate(None, splittedPath)) + Stream(WithOptDate(Try(pathDateExtractor.extractFromPath(splittedPath.join)).toOption, splittedPath)) else commonPrefixes.toStream.flatMap { case Left(prefixWithoutDate) => @@ -571,16 +571,18 @@ object SparkContextUtils { file.path.endsWith(pattern) || isSuccessFile(file) }.getOrElse(true) - def dateValidation(tryDate: Option[DateTime]): Boolean = { + def dateValidation(files: WithOptDate[Array[HadoopFile]]): Boolean = { + val tryDate = files.date if (tryDate.isEmpty && ignoreMalformedDates) true - else if (tryDate.isDefined) { + else if (tryDate.isEmpty) + throw new Exception(s"Not date found for path $path, expanded files: ${files.value.toList}, consider using ignoreMalformedDates=true if not date is expected on this path") + else { val date = tryDate.get val goodStartDate = startDate.isEmpty || (inclusiveStartDate && date.saneEqual(startDate.get) || date.isAfter(startDate.get)) def goodEndDate = endDate.isEmpty || (inclusiveEndDate && date.saneEqual(endDate.get) || date.isBefore(endDate.get)) goodStartDate && goodEndDate - } else - false + } } def successFileValidation(files: WithOptDate[Array[HadoopFile]]): Boolean = { @@ -591,7 +593,7 @@ object SparkContextUtils { } def preValidations(files: 
WithOptDate[Array[HadoopFile]]): Option[WithOptDate[Array[HadoopFile]]] = { - if (!dateValidation(files.date) || !successFileValidation(files)) + if (!dateValidation(files) || !successFileValidation(files)) None else { val filtered = files.copy(value = files.value From 4ecd942b1150d303dc42eac2ae13dff563a58591 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 22 Nov 2016 17:08:50 -0200 Subject: [PATCH 141/268] Performs date validation only if there are files to be validated --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index d301a0a8..1de12dd6 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -593,12 +593,12 @@ object SparkContextUtils { } def preValidations(files: WithOptDate[Array[HadoopFile]]): Option[WithOptDate[Array[HadoopFile]]] = { - if (!dateValidation(files) || !successFileValidation(files)) + if (!successFileValidation(files)) None else { val filtered = files.copy(value = files.value .filter(excludePatternValidation).filter(endsWithValidation).filter(predicate)) - if (filtered.value.isEmpty) + if (filtered.value.isEmpty || !dateValidation(filtered)) None else Option(filtered) From 6e1cb5e2c4fd07ab20890636853d95b6b8ff0189 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 24 Nov 2016 11:30:39 -0200 Subject: [PATCH 142/268] to string for IntBag --- src/main/scala/ignition/core/utils/IntBag.scala | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/utils/IntBag.scala b/src/main/scala/ignition/core/utils/IntBag.scala index 38cb3836..a53d2d8f 100644 --- a/src/main/scala/ignition/core/utils/IntBag.scala +++ b/src/main/scala/ignition/core/utils/IntBag.scala 
@@ -35,12 +35,14 @@ case class IntBag(histogram: collection.Map[Long, Long]) { } } + def count: Long = histogram.values.sum + + def sum: Long = histogram.map { case (k, f) => k * f }.sum + def avg: Option[Long] = { - if (histogram.nonEmpty) { - val sum = histogram.map { case (k, f) => k * f }.sum - val count = histogram.values.sum + if (histogram.nonEmpty) Option(sum / count) - } else + else None } @@ -51,4 +53,7 @@ case class IntBag(histogram: collection.Map[Long, Long]) { def max: Option[Long] = { histogram.keys.maxOption } + + override def toString: String = s"IntBag(median=$median, count=$count, sum=$sum, avg=$avg, min=$min, max=$max)" + } From 135a753751524d62050d13f03b2e464400e69311 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 29 Nov 2016 17:11:50 -0200 Subject: [PATCH 143/268] Avoid checking too early the job --- remote_hook.sh | 8 ++++++-- tools/cluster.py | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 688bfbc1..3635951e 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -20,12 +20,16 @@ MY_USER=$(whoami) # Avoids problems when another user created our control dir sudo mkdir -p "${JOB_CONTROL_DIR}" sudo chown $MY_USER "${JOB_CONTROL_DIR}" -sudo chmod -R o+rx /root - RUNNING_FILE="${JOB_CONTROL_DIR}/RUNNING" +# This should be the first thing in the script to avoid the wait remote job thinking we died echo $$ > "${RUNNING_FILE}" + + +# Let us read the spark home even when the image doesn't give us the permission +sudo chmod -R o+rx /root + notify_error_and_exit() { description="${1}" echo "Exiting because: ${description}" diff --git a/tools/cluster.py b/tools/cluster.py index 7d77e8c4..080dcedc 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -451,6 +451,7 @@ def job_run(cluster_name, job_name, job_mem, ssh_call(user=remote_user, host=master, key_file=key_file, args=[tmux_arg], allocate_terminal=True) if wait_completion: + time.sleep(5) # wait job to set up before 
checking it failed = False failed_exception = None try: From 8b91ee353716ba4db6efba0612d5c2f2206d881c Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 7 Dec 2016 16:09:46 -0200 Subject: [PATCH 144/268] Updated spark --- build.sbt | 2 +- tools/cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index c0f4bf77..ad80612f 100644 --- a/build.sbt +++ b/build.sbt @@ -9,7 +9,7 @@ scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warning // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.1" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.2" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") diff --git a/tools/cluster.py b/tools/cluster.py index 080dcedc..c2762f3a 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,7 +49,7 @@ default_ami = None # will be decided based on spark-ec2 list default_master_ami = None default_env = 'dev' -default_spark_version = '2.0.1' +default_spark_version = '2.0.2' custom_builds = { # '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' } From f712f0650c5e19533f75c1acb5282b717baee455 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 7 Dec 2016 16:28:28 -0200 Subject: [PATCH 145/268] Added spark 2.0.2 --- tools/spark-ec2/spark_ec2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 79e81484..1b405d47 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -78,6 +78,7 @@ "1.6.0", "2.0.0", "2.0.1", + "2.0.2", ]) SPARK_TACHYON_MAP = { From 5fb406739caf7b198214d2402e855fa4aae4ab0d Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 29 Dec 2016 19:45:00 -0200 Subject: [PATCH 146/268] Create an uri along the request to guarantee the request is valid --- .../core/http/AsyncHttpClientStreamApi.scala | 12 ++++++-- .../core/http/AsyncSprayHttpClient.scala | 30 ++++--------------- 2 files changed, 16 insertions(+), 26 deletions(-) diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index 30f46c53..131d2a05 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -4,7 +4,7 @@ import java.io.InputStream import java.util.concurrent.TimeUnit import akka.util.Timeout -import spray.http.{HttpEntity, HttpHeader, HttpMethod, HttpMethods} +import spray.http._ import scala.concurrent.Future import scala.concurrent.duration._ @@ -49,7 +49,15 @@ object AsyncHttpClientStreamApi { method: HttpMethod = HttpMethods.GET, body: HttpEntity = HttpEntity.Empty, headers: List[HttpHeader] = List.empty, - requestConfiguration: Option[RequestConfiguration] = None) + requestConfiguration: Option[RequestConfiguration] = None) { + val uri: Uri = { + // Note: This will guarantee we create a valid request (one with a valid uri). 
Will throw an exception if invalid + if (params.nonEmpty) + Uri(url).withQuery(params) + else + Uri(url) + } + } case class RequestException(message: String, response: StreamResponse) extends RuntimeException(message) diff --git a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala index 405457ea..af40c25a 100644 --- a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala +++ b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala @@ -1,13 +1,12 @@ package ignition.core.http -import java.net.URL import java.util.concurrent.TimeoutException import akka.actor._ import akka.io.IO import akka.pattern.ask import akka.util.Timeout - +import ignition.core.http.AsyncHttpClientStreamApi.{Request, RequestConfiguration} import spray.can.Http import spray.can.Http.HostConnectorSetup import spray.can.client.{ClientConnectionSettings, HostConnectorSettings} @@ -15,14 +14,10 @@ import spray.http.HttpHeaders.Authorization import spray.http.StatusCodes.Redirection import spray.http._ - -import scala.concurrent.duration._ import scala.concurrent.{ExecutionContext, Future} import scala.language.postfixOps import scala.util.control.NonFatal -import ignition.core.http.AsyncHttpClientStreamApi.{Request, RequestConfiguration} - trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { @@ -51,27 +46,15 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { case _ => false } - private def toUriString(url: String, params: Map[String, String] = Map.empty) = { - def encode(content: String) = java.net.URLEncoder.encode(content, "UTF-8") - def encodeParams = params.map { case (k, v) => s"${encode(k)}=${encode(v)}" }.mkString("&") - if (params.isEmpty) url else s"$url?${encodeParams}" - } - private implicit def toAuthHeader(credentials: AsyncHttpClientStreamApi.Credentials): List[Authorization] = List(Authorization(credentials = BasicHttpCredentials(username = credentials.user, password = 
credentials.password))) private def toSprayRequest(request: Request): HttpRequest = request match { - case Request(uri, params, Some(credentials), method, body, headers, _) if params.isEmpty => - HttpRequest(method = method, uri = request.url, headers = credentials ++ headers, entity = body) - - case Request(uri, params, Some(credentials), method, body, headers, _) => - HttpRequest(method = method, uri = toUriString(request.url, params), headers = credentials ++ headers, entity = body) - - case Request(uri, params, None, method, body, headers, _) if params.isEmpty => - HttpRequest(method = method, uri = toUriString(request.url), entity = body, headers = headers) + case Request(_, params, Some(credentials), method, body, headers, _) => + HttpRequest(method = method, uri = request.uri, headers = credentials ++ headers, entity = body) - case Request(uri, params, None, method, body, headers, _) => - HttpRequest(method = method, uri = toUriString(request.url, params), entity = body, headers = headers) + case Request(_, params, None, method, body, headers, _) => + HttpRequest(method = method, uri = request.uri, entity = body, headers = headers) } private def toSprayHostConnectorSetup(uri: Uri, conf: Option[AsyncHttpClientStreamApi.RequestConfiguration]): HostConnectorSetup = { @@ -109,8 +92,7 @@ trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { } private def executeSprayRequest(request: Request): Unit = { - val url = Uri(request.url) - val message = (toSprayRequest(request), toSprayHostConnectorSetup(url, request.requestConfiguration)) + val message = (toSprayRequest(request), toSprayHostConnectorSetup(request.uri, request.requestConfiguration)) IO(Http) ! 
message } From b2437133518175e1c8c33f654c727a2e22c7690f Mon Sep 17 00:00:00 2001 From: Fernando Date: Thu, 5 Jan 2017 17:20:41 -0200 Subject: [PATCH 147/268] request uri sanitization effort --- .../core/http/AsyncHttpClientStreamApi.scala | 32 +++++++++++-- .../http/AsyncHttpClientStreamApiSpec.scala | 47 +++++++++++++++++++ 2 files changed, 75 insertions(+), 4 deletions(-) create mode 100644 src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index 131d2a05..9760e100 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -1,17 +1,40 @@ package ignition.core.http import java.io.InputStream +import java.net.{URL, URLDecoder, URLEncoder} import java.util.concurrent.TimeUnit import akka.util.Timeout +import spray.http.Uri.Query import spray.http._ import scala.concurrent.Future import scala.concurrent.duration._ import scala.language.postfixOps - object AsyncHttpClientStreamApi { + + // Due to ancient standards, Java will encode space as + instead of using percent. 
+ // + // See: + // http://stackoverflow.com/questions/1634271/url-encoding-the-space-character-or-20 + // https://docs.oracle.com/javase/7/docs/api/java/net/URLEncoder.html#encode(java.lang.String,%20java.lang.String) + private def sanitizePathSegment(segment: String) = + URLEncoder.encode(URLDecoder.decode(segment, "UTF-8"), "UTF-8").replace("+", "%20") + + def sanitizeUrl(strUrl: String) = { + val url = new URL(strUrl) + val sanePath = url.getPath.split("/").map(sanitizePathSegment).mkString("/") + + Uri.from( + scheme = url.getProtocol, + userinfo = Option(url.getUserInfo).getOrElse(""), + host = url.getHost, + port = Seq(url.getPort, 0).max, + path = sanePath, + query = Query(Option(url.getQuery)), + fragment = Option(url.getRef)) + } case class Credentials(user: String, password: String) { def isEmpty = user.isEmpty && password.isEmpty @@ -50,12 +73,13 @@ object AsyncHttpClientStreamApi { body: HttpEntity = HttpEntity.Empty, headers: List[HttpHeader] = List.empty, requestConfiguration: Option[RequestConfiguration] = None) { - val uri: Uri = { + + def uri: Uri = { // Note: This will guarantee we create a valid request (one with a valid uri). 
Will throw an exception if invalid if (params.nonEmpty) - Uri(url).withQuery(params) + sanitizeUrl(url).withQuery(params) else - Uri(url) + sanitizeUrl(url) } } diff --git a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala new file mode 100644 index 00000000..ebb5dade --- /dev/null +++ b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala @@ -0,0 +1,47 @@ +package ignition.core.http + +import ignition.core.http.AsyncHttpClientStreamApi.Request +import org.scalatest.{FunSpec, Matchers} + +class AsyncHttpClientStreamApiSpec extends FunSpec with Matchers { + + describe(".sanitizeUrl") { + it("should percent encode url paths") { + val tests = Seq( + "http://images1.petlove.com.br/products/170301/small/Ração-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-Cães.jpg", + "http://images0.petlove.com.br/products/175408/small/Ração-Nestlé-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sensível.jpg", + "http://images3.petlove.com.br/products/171539/small/Ração-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-Cálculos-Urinários.jpg" + ) + + val expectations = Seq( + "http://images1.petlove.com.br/products/170301/small/Ra%C3%A7%C3%A3o-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-C%C3%A3es.jpg", + "http://images0.petlove.com.br/products/175408/small/Ra%C3%A7%C3%A3o-Nestl%C3%A9-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sens%C3%ADvel.jpg", + "http://images3.petlove.com.br/products/171539/small/Ra%C3%A7%C3%A3o-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-C%C3%A1lculos-Urin%C3%A1rios.jpg" + ) + + tests.zip(expectations).foreach { + case (url, expected) => AsyncHttpClientStreamApi.sanitizeUrl(url).toString shouldBe expected + } + } + + it("should not encode percent characters in url path") { + val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São 
Paulo/Qualquer%20Arquivo%20Encodado.pdf" + val sane = AsyncHttpClientStreamApi.sanitizeUrl(url).toString + sane shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" + } + + it("should encode space characters with percent in URL path") { + val url = "http://www.example.com/Pentagrama+Invertido.xml?q=blah+bleh" + val sane = AsyncHttpClientStreamApi.sanitizeUrl(url).toString + sane shouldBe "http://www.example.com/Pentagrama%20Invertido.xml?q=blah+bleh" + } + } + + describe("Request") { + it("should do the best to parse the provided uri") { + val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" + val request = Request(url) + request.uri.toString shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" + } + } +} From 2a8ff5ff167ec97f49c994b056b5f4924973026a Mon Sep 17 00:00:00 2001 From: Fernando Date: Tue, 17 Jan 2017 14:18:47 -0200 Subject: [PATCH 148/268] URLUtils --- .../core/http/AsyncHttpClientStreamApi.scala | 29 +------ .../scala/ignition/core/utils/URLUtils.scala | 38 ++++++++++ .../http/AsyncHttpClientStreamApiSpec.scala | 42 +---------- .../ignition/core/utils/URLUtilsSpec.scala | 75 +++++++++++++++++++ 4 files changed, 120 insertions(+), 64 deletions(-) create mode 100644 src/main/scala/ignition/core/utils/URLUtils.scala create mode 100644 src/test/scala/ignition/core/utils/URLUtilsSpec.scala diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index 9760e100..e95e4811 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -1,11 +1,10 @@ package ignition.core.http import java.io.InputStream -import java.net.{URL, URLDecoder, URLEncoder} import java.util.concurrent.TimeUnit 
import akka.util.Timeout -import spray.http.Uri.Query +import ignition.core.utils.URLUtils import spray.http._ import scala.concurrent.Future @@ -13,28 +12,6 @@ import scala.concurrent.duration._ import scala.language.postfixOps object AsyncHttpClientStreamApi { - - // Due to ancient standards, Java will encode space as + instead of using percent. - // - // See: - // http://stackoverflow.com/questions/1634271/url-encoding-the-space-character-or-20 - // https://docs.oracle.com/javase/7/docs/api/java/net/URLEncoder.html#encode(java.lang.String,%20java.lang.String) - private def sanitizePathSegment(segment: String) = - URLEncoder.encode(URLDecoder.decode(segment, "UTF-8"), "UTF-8").replace("+", "%20") - - def sanitizeUrl(strUrl: String) = { - val url = new URL(strUrl) - val sanePath = url.getPath.split("/").map(sanitizePathSegment).mkString("/") - - Uri.from( - scheme = url.getProtocol, - userinfo = Option(url.getUserInfo).getOrElse(""), - host = url.getHost, - port = Seq(url.getPort, 0).max, - path = sanePath, - query = Query(Option(url.getQuery)), - fragment = Option(url.getRef)) - } case class Credentials(user: String, password: String) { def isEmpty = user.isEmpty && password.isEmpty @@ -77,9 +54,9 @@ object AsyncHttpClientStreamApi { def uri: Uri = { // Note: This will guarantee we create a valid request (one with a valid uri). 
Will throw an exception if invalid if (params.nonEmpty) - sanitizeUrl(url).withQuery(params) + URLUtils.parseUri(url).withQuery(params) else - sanitizeUrl(url) + URLUtils.parseUri(url) } } diff --git a/src/main/scala/ignition/core/utils/URLUtils.scala b/src/main/scala/ignition/core/utils/URLUtils.scala new file mode 100644 index 00000000..800a3a1a --- /dev/null +++ b/src/main/scala/ignition/core/utils/URLUtils.scala @@ -0,0 +1,38 @@ +package ignition.core.utils + +import java.net.{URL, URLDecoder, URLEncoder} + +import org.apache.http.client.utils.URIBuilder +import spray.http.Uri +import spray.http.Uri.Query + +object URLUtils { + + // Due to ancient standards, Java will encode space as + instead of using percent. + // + // See: + // http://stackoverflow.com/questions/1634271/url-encoding-the-space-character-or-20 + // https://docs.oracle.com/javase/7/docs/api/java/net/URLEncoder.html#encode(java.lang.String,%20java.lang.String) + def sanitizePathSegment(segment: String) = + URLEncoder.encode(URLDecoder.decode(segment, "UTF-8"), "UTF-8").replace("+", "%20") + + def parseUri(urlStr: String): Uri = { + val url = new URL(urlStr) + val sanePath = url.getPath.split("/").map(sanitizePathSegment).mkString("/") + + Uri.from( + scheme = url.getProtocol, + userinfo = Option(url.getUserInfo).getOrElse(""), + host = url.getHost, + port = Seq(url.getPort, 0).max, + path = sanePath, + query = Query(Option(url.getQuery)), + fragment = Option(url.getRef)) + } + + def addParametersToUrl(url: String, partnerParams: Map[String, String]): String = { + val builder = new URIBuilder(url.trim) + partnerParams.foreach { case (k, v) => builder.addParameter(k, v) } + builder.build().toString + } +} diff --git a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala index ebb5dade..37accf5b 100644 --- a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala +++ 
b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala @@ -5,43 +5,9 @@ import org.scalatest.{FunSpec, Matchers} class AsyncHttpClientStreamApiSpec extends FunSpec with Matchers { - describe(".sanitizeUrl") { - it("should percent encode url paths") { - val tests = Seq( - "http://images1.petlove.com.br/products/170301/small/Ração-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-Cães.jpg", - "http://images0.petlove.com.br/products/175408/small/Ração-Nestlé-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sensível.jpg", - "http://images3.petlove.com.br/products/171539/small/Ração-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-Cálculos-Urinários.jpg" - ) - - val expectations = Seq( - "http://images1.petlove.com.br/products/170301/small/Ra%C3%A7%C3%A3o-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-C%C3%A3es.jpg", - "http://images0.petlove.com.br/products/175408/small/Ra%C3%A7%C3%A3o-Nestl%C3%A9-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sens%C3%ADvel.jpg", - "http://images3.petlove.com.br/products/171539/small/Ra%C3%A7%C3%A3o-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-C%C3%A1lculos-Urin%C3%A1rios.jpg" - ) - - tests.zip(expectations).foreach { - case (url, expected) => AsyncHttpClientStreamApi.sanitizeUrl(url).toString shouldBe expected - } - } - - it("should not encode percent characters in url path") { - val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" - val sane = AsyncHttpClientStreamApi.sanitizeUrl(url).toString - sane shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" - } - - it("should encode space characters with percent in URL path") { - val url = "http://www.example.com/Pentagrama+Invertido.xml?q=blah+bleh" - val sane = AsyncHttpClientStreamApi.sanitizeUrl(url).toString - sane shouldBe 
"http://www.example.com/Pentagrama%20Invertido.xml?q=blah+bleh" - } - } - - describe("Request") { - it("should do the best to parse the provided uri") { - val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" - val request = Request(url) - request.uri.toString shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" - } + it("should do the best to parse the provided uri") { + val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" + val request = Request(url) + request.uri.toString shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" } } diff --git a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala new file mode 100644 index 00000000..6665e3ec --- /dev/null +++ b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala @@ -0,0 +1,75 @@ +package ignition.core.utils + +import org.scalatest.{FlatSpec, Matchers} + +class URLUtilsSpec extends FlatSpec with Matchers { + + "URLUtils" should "add parameters to url with encoded params in base url and not be double encoded" in { + val baseUrl: String = "https://tracker.client.com/product=1?email=user%40mail.com" + val params = Map("cc" -> "second@mail.com") + + val result: String = URLUtils.addParametersToUrl(baseUrl, params) + result shouldEqual "https://tracker.client.com/product=1?email=user%40mail.com&cc=second%40mail.com" + } + + it should "add multiples params with the same name" in { + val baseUrl: String = "https://tracker.client.com/product=1?email=user%40mail.com&cc=second%40mail.com" + val params = Map("cc" -> "third@mail.com") + + val result: String = URLUtils.addParametersToUrl(baseUrl, params) + result shouldEqual 
"https://tracker.client.com/product=1?email=user%40mail.com&cc=second%40mail.com&cc=third%40mail.com" + } + + it should "works with Fragment in original URL" in { + + val baseUrl = "https://www.petlove.com.br/carrinho?utm_campanha=internalmkt#/add/variant_sku/310178,31012214/quantity/1?t=1" + val params: Map[String, String] = Map( + "utm_campaign" -> "abandonodecarrinho", + "utm_source" -> "chaordic-mail", + "utm_medium" -> "emailmkt", + "cc" -> "second@mail.com" + ) + + val result = URLUtils.addParametersToUrl(baseUrl, params) + + val expected = "https://www.petlove.com.br/carrinho?utm_campanha=internalmkt&utm_campaign=abandonodecarrinho&utm_source=chaordic-mail&utm_medium=emailmkt&cc=second%40mail.com#/add/variant_sku/310178,31012214/quantity/1?t=1" + + result shouldEqual expected + } + + it should "handle urls with new line character at the edges" in { + val url = "\n\t\n\thttps://www.petlove.com.br/carrinho#/add/variant_sku/3105748-1,3107615/quantity/1?t=1\n\t" + val finalUrl = URLUtils.addParametersToUrl(url, Map("test" -> "true")) + finalUrl shouldEqual "https://www.petlove.com.br/carrinho?test=true#/add/variant_sku/3105748-1,3107615/quantity/1?t=1" + } + + it should "percent encode url paths" in { + val tests = Seq( + "http://images1.petlove.com.br/products/170301/small/Ração-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-Cães.jpg", + "http://images0.petlove.com.br/products/175408/small/Ração-Nestlé-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sensível.jpg", + "http://images3.petlove.com.br/products/171539/small/Ração-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-Cálculos-Urinários.jpg" + ) + + val expectations = Seq( + "http://images1.petlove.com.br/products/170301/small/Ra%C3%A7%C3%A3o-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-C%C3%A3es.jpg", + 
"http://images0.petlove.com.br/products/175408/small/Ra%C3%A7%C3%A3o-Nestl%C3%A9-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sens%C3%ADvel.jpg", + "http://images3.petlove.com.br/products/171539/small/Ra%C3%A7%C3%A3o-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-C%C3%A1lculos-Urin%C3%A1rios.jpg" + ) + + tests.zip(expectations).foreach { + case (url, expected) => URLUtils.parseUri(url).toString shouldBe expected + } + } + + it should "not encode percent characters in url path" in { + val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" + val sane = URLUtils.parseUri(url).toString + sane shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" + } + + it should "encode space characters with percent in URL path" in { + val url = "http://www.example.com/Pentagrama+Invertido.xml?q=blah+bleh" + val sane = URLUtils.parseUri(url).toString + sane shouldBe "http://www.example.com/Pentagrama%20Invertido.xml?q=blah+bleh" + } +} From 51119b1daad3969d6116cf8d2bff05e0276754aa Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 2 Feb 2017 15:17:17 -0200 Subject: [PATCH 149/268] Replace spark-ec2 by flintrock as cluster launcher --- .gitmodules | 4 + remote_hook.sh | 8 +- tools/cluster.py | 188 +- tools/flintrock | 1 + tools/spark-ec2/README | 4 - .../root/spark-ec2/ec2-variables.sh | 35 - tools/spark-ec2/spark-ec2 | 22 - tools/spark-ec2/spark_ec2.py | 1593 ----------------- tools/utils.py | 6 +- 9 files changed, 120 insertions(+), 1741 deletions(-) create mode 100644 .gitmodules create mode 160000 tools/flintrock delete mode 100644 tools/spark-ec2/README delete mode 100644 tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh delete mode 100755 tools/spark-ec2/spark-ec2 delete mode 100755 tools/spark-ec2/spark_ec2.py diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..35ab3b28 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "tools/flintrock"] + path = tools/flintrock + url = git@github.com:chaordic/flintrock.git + branch = ignition_v1 diff --git a/remote_hook.sh b/remote_hook.sh index 3635951e..86f1f56b 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -86,7 +86,9 @@ JAR_PATH="${JOB_CONTROL_DIR}/Ignition.jar" cp ${JAR_PATH_SRC} ${JAR_PATH} -export JOB_MASTER=${MASTER} +# If no $MASTER, then build a url using $SPARK_MASTER_HOST +export JOB_MASTER=${MASTER:-spark://${SPARK_MASTER_HOST}:7077} + if [[ "${USE_YARN}" == "yes" ]]; then export YARN_MODE=true @@ -97,13 +99,13 @@ if [[ "${USE_YARN}" == "yes" ]]; then fi if [[ "${JOB_NAME}" == "shell" ]]; then - sudo -E ${SPARK_HOME}/bin/spark-shell --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" + sudo -E ${SPARK_HOME}/bin/spark-shell --master "${JOB_MASTER}" --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc 
-XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" elif [[ "${JOB_NAME}" == "zeppelin" ]]; then install_and_run_zeppelin else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" diff --git a/tools/cluster.py b/tools/cluster.py index c2762f3a..4a99a214 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -12,9 +12,8 @@ from argh.decorators import named, arg import subprocess from subprocess import check_output, check_call -from itertools import chain from utils import tag_instances, get_masters, get_active_nodes -from utils import check_call_with_timeout, ProcessTimeoutException +from utils import check_call_with_timeout import os import sys from datetime import datetime @@ -40,32 +39,29 @@ default_spot_price = '0.10' default_worker_instances = '1' default_executor_instances = '1' -default_master_instance_type = 'm3.xlarge' 
+default_master_instance_type = '' default_driver_heap_size = '12G' default_region = 'us-east-1' default_zone = default_region + 'b' default_key_id = 'ignition_key' default_key_file = os.path.expanduser('~/.ssh/ignition_key.pem') -default_ami = None # will be decided based on spark-ec2 list -default_master_ami = None +default_ami = 'ami-611e7976' +default_master_ami = '' default_env = 'dev' default_spark_version = '2.0.2' -custom_builds = { -# '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' -} -default_spark_repo = 'https://github.com/chaordic/spark' +default_hdfs_version = '2.7.2' +default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' +default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' +default_installation_user = 'root' default_remote_control_dir = '/tmp/Ignition' default_collect_results_dir = '/tmp' default_user_data = os.path.join(script_path, 'scripts', 'noop') default_defaults_filename = 'cluster_defaults.json' -default_spark_ec2_git_repo = 'https://github.com/chaordic/spark-ec2' -default_spark_ec2_git_branch = 'branch-2.0' - master_post_create_commands = [ - 'sudo', 'yum', '-y', 'install', 'tmux' + ['sudo', 'yum', '-y', 'install', 'tmux'], ] @@ -130,17 +126,19 @@ def ssh_call(user, host, key_file, args=(), allocate_terminal=True, get_output=F else: return logged_call(base) +def ec2_script_base_path(): + return os.path.join(script_path, 'flintrock') def chdir_to_ec2_script_and_get_path(): - ec2_script_base = os.path.join(script_path, 'spark-ec2') + ec2_script_base = ec2_script_base_path() os.chdir(ec2_script_base) - ec2_script_path = os.path.join(ec2_script_base, 'spark_ec2.py') + ec2_script_path = os.path.join(ec2_script_base, 'standalone.py') return ec2_script_path def call_ec2_script(args, timeout_total_minutes, timeout_inactivity_minutes): ec2_script_path = 
chdir_to_ec2_script_and_get_path() - return check_call_with_timeout(['/usr/bin/env', 'python', '-u', + return check_call_with_timeout(['/usr/bin/env', 'python3', '-u', ec2_script_path] + args, timeout_total_minutes=timeout_total_minutes, timeout_inactivity_minutes=timeout_inactivity_minutes) @@ -207,102 +205,101 @@ def launch(cluster_name, slaves, tag=[], key_id=default_key_id, region=default_region, zone=default_zone, instance_type=default_instance_type, - ondemand=False, spot_price=default_spot_price, master_spot=False, + # TODO: implement it in flintrock + ondemand=False, + spot_price=default_spot_price, + # TODO: implement it in flintrock + master_spot=False, user_data=default_user_data, - security_group = None, - vpc = None, - vpc_subnet = None, + security_group=None, + vpc=None, + vpc_subnet=None, + # TODO: consider implementing in flintrock master_instance_type=default_master_instance_type, - wait_time='180', hadoop_major_version='2', - worker_instances=default_worker_instances, executor_instances=default_executor_instances, retries_on_same_cluster=5, max_clusters_to_create=5, minimum_percentage_healthy_slaves=0.9, remote_user=default_remote_user, + installation_user=default_installation_user, script_timeout_total_minutes=55, script_timeout_inactivity_minutes=10, - resume=False, just_ignore_existing=False, worker_timeout=240, - spark_repo=default_spark_repo, + just_ignore_existing=False, + spark_download_source=default_spark_download_source, spark_version=default_spark_version, - spark_ec2_git_repo=default_spark_ec2_git_repo, - spark_ec2_git_branch=default_spark_ec2_git_branch, - ami=default_ami, master_ami=default_master_ami, + hdfs_download_source=default_hdfs_download_source, + hdfs_version=default_hdfs_version, + ami=default_ami, + # TODO: consider implementing in flintrock + master_ami=default_master_ami, instance_profile_name=None): + assert not master_instance_type or master_instance_type == instance_type, 'Different master instance type is currently 
unsupported' + assert not master_ami or master_ami == ami, 'Different master ami is currently unsupported' + assert not ondemand, 'On demand is unsupported' + assert master_spot, 'On demand master is currently unsupported' + all_args = locals() - if cluster_exists(cluster_name, region=region) and not resume: + if cluster_exists(cluster_name, region=region): if just_ignore_existing: log.info('Cluster exists but that is ok') return '' else: - raise CommandError('Cluster already exists, pick another name or resume the setup using --resume') + raise CommandError('Cluster already exists, pick another name') for j in range(max_clusters_to_create): log.info('Creating new cluster {0}, try {1}'.format(cluster_name, j+1)) success = False - resume_param = ['--resume'] if resume else [] auth_params = [] - if security_group: - auth_params.extend([ - '--authorized-address', '127.0.0.1/32', - '--additional-security-group', security_group - ]) # '--vpc-id', default_vpc, # '--subnet-id', default_vpc_subnet, if vpc and vpc_subnet: auth_params.extend([ - '--vpc-id', vpc, - '--subnet-id', vpc_subnet, + '--ec2-vpc-id', vpc, + '--ec2-subnet-id', vpc_subnet, ]) - spot_params = ['--spot-price', spot_price] if not ondemand else [] - master_spot_params = ['--master-spot'] if not ondemand and master_spot else [] - - ami_params = ['--ami', ami] if ami else [] - master_ami_params = ['--master-ami', master_ami] if master_ami else [] + spot_params = ['--ec2-spot-price', spot_price] if not ondemand else [] + #master_spot_params = ['--master-spot'] if not ondemand and master_spot else [] - iam_params = ['--instance-profile-name', instance_profile_name] if instance_profile_name else [] + ami_params = ['--ec2-ami', ami] if ami else [] + #master_ami_params = ['--master-ami', master_ami] if master_ami else [] - spark_version = custom_builds.get(spark_version, spark_version) + iam_params = ['--ec2-instance-profile-name', instance_profile_name] if instance_profile_name else [] for i in 
range(retries_on_same_cluster): log.info('Running script, try %d of %d', i + 1, retries_on_same_cluster) try: - call_ec2_script(['--identity-file', key_file, - '--key-pair', key_id, - '--slaves', slaves, - '--region', region, - '--zone', zone, - '--instance-type', instance_type, - '--master-instance-type', master_instance_type, - '--wait', wait_time, - '--hadoop-major-version', hadoop_major_version, - '--spark-ec2-git-repo', spark_ec2_git_repo, - '--spark-ec2-git-branch', spark_ec2_git_branch, - '--worker-instances', worker_instances, - '--executor-instances', executor_instances, - '--master-opts', '-Dspark.worker.timeout={0}'.format(worker_timeout), - '--spark-git-repo', spark_repo, - '-v', spark_version, - '--user-data', user_data, - 'launch', cluster_name] + + call_ec2_script(['--debug', + 'launch', + '--ec2-identity-file', key_file, + '--ec2-key-name', key_id, + '--num-slaves', slaves, + '--ec2-region', region, + '--ec2-availability-zone', zone, + '--ec2-instance-type', instance_type, + '--assume-yes', + '--install-spark', + '--install-hdfs', + '--spark-version', spark_version, + '--hdfs-version', hdfs_version, + '--spark-download-source', spark_download_source, + '--hdfs-download-source', hdfs_download_source, + '--spark-executor-instances', executor_instances, + '--ec2-security-group', security_group, + '--ec2-user', installation_user, + '--ec2-user-data', user_data, + cluster_name] + spot_params + - master_spot_params + - resume_param + auth_params + ami_params + - master_ami_params + iam_params, timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) success = True - except subprocess.CalledProcessError as e: - resume_param = ['--resume'] - log.warn('Failed with: %s', e) except Exception as e: # Probably a timeout log.exception('Fatal error calling EC2 script') @@ -318,7 +315,8 @@ def launch(cluster_name, slaves, master = get_master(cluster_name, region=region) save_cluster_args(master, key_file, 
remote_user, all_args) health_check(cluster_name=cluster_name, key_file=key_file, master=master, remote_user=remote_user, region=region) - ssh_call(user=remote_user, host=master, key_file=key_file, args=master_post_create_commands) + for command in master_post_create_commands: + ssh_call(user=remote_user, host=master, key_file=key_file, args=command) return master except Exception as e: log.exception('Got exception on last steps of cluster configuration') @@ -328,16 +326,22 @@ def launch(cluster_name, slaves, def destroy(cluster_name, delete_groups=False, region=default_region): - delete_sg_param = ['--delete-groups'] if delete_groups else [] + assert not delete_groups, 'Delete groups is deprecated and unsupported' + masters, slaves = get_active_nodes(cluster_name, region=region) - ec2_script_path = chdir_to_ec2_script_and_get_path() - p = subprocess.Popen(['/usr/bin/env', 'python', '-u', - ec2_script_path, - 'destroy', cluster_name, - '--region', region] + delete_sg_param, - stdin=subprocess.PIPE, - stdout=sys.stdout, universal_newlines=True) - p.communicate('y') + all_instances = masters + slaves + if all_instances: + log.info('The following instances will be terminated:') + for i in all_instances: + log.info('-> %s' % i.public_dns_name) + + log.info('Terminating master...') + for i in masters: + i.terminate() + log.info('Terminating slaves...') + for i in slaves: + i.terminate() + log.info('Done.') def get_master(cluster_name, region=default_region): @@ -388,7 +392,6 @@ def job_run(cluster_name, job_name, job_mem, remote_control_dir = default_remote_control_dir, remote_path=None, master=None, disable_assembly_build=False, - run_tests=False, kill_on_failure=False, destroy_cluster=False, region=default_region, @@ -403,7 +406,6 @@ def job_run(cluster_name, job_name, job_mem, project_path = get_project_path() project_name = os.path.basename(project_path) - module_name = os.path.basename(get_module_path()) # Use job user on remote path to avoid too many conflicts for 
different local users remote_path = remote_path or '/home/%s/%s.%s' % (default_remote_user, job_user, project_name) remote_hook_local = '{module_path}/remote_hook.sh'.format(module_path=get_module_path()) @@ -517,6 +519,8 @@ def health_check(cluster_name, key_file=default_key_file, master=None, remote_us masters, slaves = get_active_nodes(cluster_name, region=region) if nslaves == 0 or float(len(slaves)) / nslaves < minimum_percentage_healthy_slaves: raise NotHealthyCluster('Not enough healthy slaves: {0}/{1}'.format(len(slaves), nslaves)) + if not masters: + raise NotHealthyCluster('No master found') except NotHealthyCluster, e: raise e except Exception, e: @@ -703,7 +707,28 @@ def killall_jobs(cluster_name, key_file=default_key_file, done >& /dev/null || true'''.format(remote_control_dir=remote_control_dir) ]) - +def check_flintrock_installation(): + try: + call_ec2_script(['--help'], 1 , 1) + except: + setup = os.path.join(ec2_script_base_path(), 'setup.py') + if not os.path.exists(setup): + log.error(''' +Flintrock is missing (or the wrong version is being used). +Check if you have checked out the submodule. Try: + git submode update --init --recursive +Or checkout ignition with: + git clone --recursive .... +''') + else: + log.error(''' +Some dependencies are missing. 
For an Ubuntu system, try the following: +sudo apt-get install python3-yaml libyaml-dev +sudo python3 -m pip install -U pip packaging setuptools +cd {flintrock} +sudo pip3 -r requirements/user.pip + '''.format(flintrock=ec2_script_base_path())) + sys.exit(1) parser = ArghParser() @@ -712,4 +737,5 @@ def killall_jobs(cluster_name, key_file=default_key_file, kill_job, killall_jobs, collect_job_results], namespace="jobs") if __name__ == '__main__': + parser.dispatch() diff --git a/tools/flintrock b/tools/flintrock new file mode 160000 index 00000000..541697fb --- /dev/null +++ b/tools/flintrock @@ -0,0 +1 @@ +Subproject commit 541697fb11912df6298d588b845809966e94d280 diff --git a/tools/spark-ec2/README b/tools/spark-ec2/README deleted file mode 100644 index 72434f24..00000000 --- a/tools/spark-ec2/README +++ /dev/null @@ -1,4 +0,0 @@ -This folder contains a script, spark-ec2, for launching Spark clusters on -Amazon EC2. Usage instructions are available online at: - -http://spark.apache.org/docs/latest/ec2-scripts.html diff --git a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh deleted file mode 100644 index bd3b656f..00000000 --- a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# These variables are automatically filled in by the spark-ec2 script. -export MASTERS="{{master_list}}" -export SLAVES="{{slave_list}}" -export HDFS_DATA_DIRS="{{hdfs_data_dirs}}" -export MAPRED_LOCAL_DIRS="{{mapred_local_dirs}}" -export SPARK_LOCAL_DIRS="{{spark_local_dirs}}" -export MODULES="{{modules}}" -export SPARK_VERSION="{{spark_version}}" -export TACHYON_VERSION="{{tachyon_version}}" -export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" -export SWAP_MB="{{swap}}" -export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" -export SPARK_EXECUTOR_INSTANCES="{{spark_executor_instances}}" -export SPARK_MASTER_OPTS="{{spark_master_opts}}" -export AWS_ACCESS_KEY_ID="{{aws_access_key_id}}" -export AWS_SECRET_ACCESS_KEY="{{aws_secret_access_key}}" diff --git a/tools/spark-ec2/spark-ec2 b/tools/spark-ec2/spark-ec2 deleted file mode 100755 index 31f97712..00000000 --- a/tools/spark-ec2/spark-ec2 +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/sh - -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -cd "`dirname $0`" -PYTHONPATH="./third_party/boto-2.4.1.zip/boto-2.4.1:$PYTHONPATH" python ./spark_ec2.py "$@" diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py deleted file mode 100755 index 1b405d47..00000000 --- a/tools/spark-ec2/spark_ec2.py +++ /dev/null @@ -1,1593 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from __future__ import division, print_function, with_statement - -import codecs -import hashlib -import itertools -import logging -import os -import os.path -import pipes -import random -import shutil -import string -from stat import S_IRUSR -import subprocess -import sys -import tarfile -import tempfile -import textwrap -import time -import warnings -from datetime import datetime -from optparse import OptionParser -from sys import stderr - -if sys.version < "3": - from urllib2 import urlopen, Request, HTTPError -else: - from urllib.request import urlopen, Request - from urllib.error import HTTPError - raw_input = input - xrange = range - -SPARK_EC2_VERSION = "2.0.0" -SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) - -VALID_SPARK_VERSIONS = set([ - "0.7.3", - "0.8.0", - "0.8.1", - "0.9.0", - "0.9.1", - "0.9.2", - "1.0.0", - "1.0.1", - "1.0.2", - "1.1.0", - "1.1.1", - "1.2.0", - "1.2.1", - "1.3.0", - "1.3.1", - "1.4.0", - "1.4.1", - "1.5.0", - "1.5.1", - "1.5.2", - "1.6.0", - "2.0.0", - "2.0.1", - "2.0.2", -]) - -SPARK_TACHYON_MAP = { - "1.0.0": "0.4.1", - "1.0.1": "0.4.1", - "1.0.2": "0.4.1", - "1.1.0": "0.5.0", - "1.1.1": "0.5.0", - "1.2.0": "0.5.0", - "1.2.1": "0.5.0", - "1.3.0": "0.5.0", - "1.3.1": "0.5.0", - "1.4.0": "0.6.4", - "1.4.1": "0.6.4", - "1.5.0": "0.7.1", - "1.5.1": "0.7.1", - "1.5.2": "0.7.1", - "1.6.0": "0.8.2", - "2.0.0": "", -} - -DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION -DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark" - -# Default location to get the spark-ec2 scripts (and ami-list) from -DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/amplab/spark-ec2" -DEFAULT_SPARK_EC2_BRANCH = "branch-2.0" - - -import boto -from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType -from boto import ec2 - - -class UsageError(Exception): - pass - - -# Configure and parse our command-line arguments -def parse_args(): - parser = OptionParser( - prog="spark-ec2", - version="%prog 
{v}".format(v=SPARK_EC2_VERSION), - usage="%prog [options] \n\n" - + " can be: launch, destroy, login, stop, start, get-master, reboot-slaves") - - parser.add_option( - "-s", "--slaves", type="int", default=1, - help="Number of slaves to launch (default: %default)") - parser.add_option( - "-w", "--wait", type="int", - help="DEPRECATED (no longer necessary) - Seconds to wait for nodes to start") - parser.add_option( - "-k", "--key-pair", - help="Key pair to use on instances") - parser.add_option( - "-i", "--identity-file", - help="SSH private key file to use for logging into instances") - parser.add_option( - "-p", "--profile", default=None, - help="If you have multiple profiles (AWS or boto config), you can configure " + - "additional, named profiles by using this option (default: %default)") - parser.add_option( - "-t", "--instance-type", default="m1.large", - help="Type of instance to launch (default: %default). " + - "WARNING: must be 64-bit; small instances won't work") - parser.add_option( - "-m", "--master-instance-type", default="", - help="Master instance type (leave empty for same as instance-type)") - parser.add_option( - "-r", "--region", default="us-east-1", - help="EC2 region used to launch instances in, or to find them in (default: %default)") - parser.add_option( - "-z", "--zone", default="", - help="Availability zone to launch instances in, or 'all' to spread " + - "slaves across multiple (an additional $0.01/Gb for bandwidth" + - "between zones applies) (default: a single zone chosen at random)") - parser.add_option( - "-a", "--ami", - help="Amazon Machine Image ID to use") - parser.add_option("--master-ami", - help="Amazon Machine Image ID to use for the Master") - parser.add_option( - "-v", "--spark-version", default=DEFAULT_SPARK_VERSION, - help="Version of Spark to use: 'X.Y.Z' or a specific git hash (default: %default)") - parser.add_option( - "--spark-git-repo", - default=DEFAULT_SPARK_GITHUB_REPO, - help="Github repo from which to checkout 
supplied commit hash (default: %default)") - parser.add_option( - "--spark-ec2-git-repo", - default=DEFAULT_SPARK_EC2_GITHUB_REPO, - help="Github repo from which to checkout spark-ec2 (default: %default)") - parser.add_option( - "--spark-ec2-git-branch", - default=DEFAULT_SPARK_EC2_BRANCH, - help="Github repo branch of spark-ec2 to use (default: %default)") - parser.add_option( - "--deploy-root-dir", - default=None, - help="A directory to copy into / on the first master. " + - "Must be absolute. Note that a trailing slash is handled as per rsync: " + - "If you omit it, the last directory of the --deploy-root-dir path will be created " + - "in / before copying its contents. If you append the trailing slash, " + - "the directory is not created and its contents are copied directly into /. " + - "(default: %default).") - parser.add_option( - "--hadoop-major-version", default="1", - help="Major version of Hadoop. Valid options are 1 (Hadoop 1.0.4), 2 (CDH 4.7.1), yarn " + - "(Hadoop 2.4.0) (default: %default)") - parser.add_option( - "-D", metavar="[ADDRESS:]PORT", dest="proxy_port", - help="Use SSH dynamic port forwarding to create a SOCKS proxy at " + - "the given local address (for use with login)") - parser.add_option( - "--resume", action="store_true", default=False, - help="Resume installation on a previously launched cluster " + - "(for debugging)") - parser.add_option( - "--ebs-vol-size", metavar="SIZE", type="int", default=0, - help="Size (in GB) of each EBS volume.") - parser.add_option( - "--ebs-vol-type", default="standard", - help="EBS volume type (e.g. 'gp2', 'standard').") - parser.add_option( - "--ebs-vol-num", type="int", default=1, - help="Number of EBS volumes to attach to each node as /vol[x]. " + - "The volumes will be deleted when the instances terminate. " + - "Only possible on EBS-backed AMIs. " + - "EBS volumes are only attached if --ebs-vol-size > 0. 
" + - "Only support up to 8 EBS volumes.") - parser.add_option( - "--placement-group", type="string", default=None, - help="Which placement group to try and launch " + - "instances into. Assumes placement group is already " + - "created.") - parser.add_option( - "--swap", metavar="SWAP", type="int", default=1024, - help="Swap space to set up per node, in MB (default: %default)") - parser.add_option( - "--spot-price", metavar="PRICE", type="float", - help="If specified, launch slaves as spot instances with the given " + - "maximum price (in dollars)") - parser.add_option( - "--master-spot", action="store_true", default=False, - help="If specified, launch master as spot instance using the same " + - "bid and instance type of the slave ones") - parser.add_option( - "--ganglia", action="store_true", default=True, - help="Setup Ganglia monitoring on cluster (default: %default). NOTE: " + - "the Ganglia page will be publicly accessible") - parser.add_option( - "--no-ganglia", action="store_false", dest="ganglia", - help="Disable Ganglia monitoring for the cluster") - parser.add_option( - "-u", "--user", default="root", - help="The SSH user you want to connect as (default: %default)") - parser.add_option( - "--delete-groups", action="store_true", default=False, - help="When destroying a cluster, delete the security groups that were created") - parser.add_option( - "--use-existing-master", action="store_true", default=False, - help="Launch fresh slaves, but use an existing stopped master if possible") - parser.add_option( - "--worker-instances", type="int", default=1, - help="Number of instances per worker: variable SPARK_WORKER_INSTANCES. Not used if YARN " + - "is used as Hadoop major version (default: %default)") - parser.add_option( - "--executor-instances", type="int", default=1, - help="Number of executor instances per worker: variable SPARK_EXECUTOR_INSTANCES. 
Not used if YARN " + - "is used as Hadoop major version (default: %default)") - parser.add_option( - "--master-opts", type="string", default="", - help="Extra options to give to master through SPARK_MASTER_OPTS variable " + - "(e.g -Dspark.worker.timeout=180)") - parser.add_option( - "--user-data", type="string", default="", - help="Path to a user-data file (most AMIs interpret this as an initialization script)") - parser.add_option( - "--security-group-prefix", type="string", default=None, - help="Use this prefix for the security group rather than the cluster name.") - parser.add_option( - "--authorized-address", type="string", default="0.0.0.0/0", - help="Address to authorize on created security groups (default: %default)") - parser.add_option( - "--additional-security-group", type="string", default="", - help="Additional security group to place the machines in") - parser.add_option( - "--additional-tags", type="string", default="", - help="Additional tags to set on the machines; tags are comma-separated, while name and " + - "value are colon separated; ex: \"Task:MySparkProject,Env:production\"") - parser.add_option( - "--copy-aws-credentials", action="store_true", default=False, - help="Add AWS credentials to hadoop configuration to allow Spark to access S3") - parser.add_option( - "--subnet-id", default=None, - help="VPC subnet to launch instances in") - parser.add_option( - "--vpc-id", default=None, - help="VPC to launch instances in") - parser.add_option( - "--spot-timeout", type="int", default=45, - help="Maximum amount of time (in minutes) to wait for spot requests to be fulfilled") - parser.add_option( - "--private-ips", action="store_true", default=False, - help="Use private IPs for instances rather than public if VPC/subnet " + - "requires that.") - parser.add_option( - "--instance-initiated-shutdown-behavior", default="stop", - choices=["stop", "terminate"], - help="Whether instances should terminate when shut down or just stop") - parser.add_option( - 
"--instance-profile-name", default=None, - help="IAM profile name to launch instances under") - - (opts, args) = parser.parse_args() - if len(args) != 2: - parser.print_help() - sys.exit(1) - (action, cluster_name) = args - - # Boto config check - # http://boto.cloudhackers.com/en/latest/boto_config_tut.html - home_dir = os.getenv('HOME') - if home_dir is None or not os.path.isfile(home_dir + '/.boto'): - if not os.path.isfile('/etc/boto.cfg'): - # If there is no boto config, check aws credentials - if not os.path.isfile(home_dir + '/.aws/credentials'): - if os.getenv('AWS_ACCESS_KEY_ID') is None: - print("ERROR: The environment variable AWS_ACCESS_KEY_ID must be set", - file=stderr) - sys.exit(1) - if os.getenv('AWS_SECRET_ACCESS_KEY') is None: - print("ERROR: The environment variable AWS_SECRET_ACCESS_KEY must be set", - file=stderr) - sys.exit(1) - return (opts, action, cluster_name) - - -# Get the EC2 security group of the given name, creating it if it doesn't exist -def get_or_make_group(conn, name, vpc_id): - groups = conn.get_all_security_groups() - group = [g for g in groups if g.name == name] - if len(group) > 0: - return group[0] - else: - print("Creating security group " + name) - return conn.create_security_group(name, "Spark EC2 group", vpc_id) - -def check_if_http_resource_exists(resource): - request = Request(resource) - request.get_method = lambda: 'HEAD' - try: - response = urlopen(request) - if response.getcode() == 200: - return True - else: - raise RuntimeError("Resource {resource} not found. Error: {code}".format(resource, response.getcode())) - except HTTPError, e: - print >> stderr, "Unable to check if HTTP resource {url} exists. 
Error: {code}".format( - url=resource, - code=e.code) - return False - -def get_validate_spark_version(version, repo): - if version.startswith("http"): - #check if custom package URL exists - if check_if_http_resource_exists: - return version - else: - print("Unable to validate pre-built spark version {version}".format(version=version), file=stderr) - sys.exit(1) - elif "." in version: - version = version.replace("v", "") - if version not in VALID_SPARK_VERSIONS: - print("Don't know about Spark version: {v}".format(v=version), file=stderr) - sys.exit(1) - return version - else: - github_commit_url = "{repo}/commit/{commit_hash}".format(repo=repo, commit_hash=version) - if not check_if_http_resource_exists(github_commit_url): - print >> stderr, "Couldn't validate Spark commit: {repo} / {commit}".format( - repo=repo, commit=version) - sys.exit(1) - else: - return version - - -# Source: http://aws.amazon.com/amazon-linux-ami/instance-type-matrix/ -# Last Updated: 2015-06-19 -# For easy maintainability, please keep this manually-inputted dictionary sorted by key. 
-EC2_INSTANCE_TYPES = { - "c1.medium": "pvm", - "c1.xlarge": "pvm", - "c3.large": "hvm", - "c3.xlarge": "hvm", - "c3.2xlarge": "hvm", - "c3.4xlarge": "hvm", - "c3.8xlarge": "hvm", - "c4.large": "hvm", - "c4.xlarge": "hvm", - "c4.2xlarge": "hvm", - "c4.4xlarge": "hvm", - "c4.8xlarge": "hvm", - "cc1.4xlarge": "hvm", - "cc2.8xlarge": "hvm", - "cg1.4xlarge": "hvm", - "cr1.8xlarge": "hvm", - "d2.xlarge": "hvm", - "d2.2xlarge": "hvm", - "d2.4xlarge": "hvm", - "d2.8xlarge": "hvm", - "g2.2xlarge": "hvm", - "g2.8xlarge": "hvm", - "hi1.4xlarge": "pvm", - "hs1.8xlarge": "pvm", - "i2.xlarge": "hvm", - "i2.2xlarge": "hvm", - "i2.4xlarge": "hvm", - "i2.8xlarge": "hvm", - "m1.small": "pvm", - "m1.medium": "pvm", - "m1.large": "pvm", - "m1.xlarge": "pvm", - "m2.xlarge": "pvm", - "m2.2xlarge": "pvm", - "m2.4xlarge": "pvm", - "m3.medium": "hvm", - "m3.large": "hvm", - "m3.xlarge": "hvm", - "m3.2xlarge": "hvm", - "m4.large": "hvm", - "m4.xlarge": "hvm", - "m4.2xlarge": "hvm", - "m4.4xlarge": "hvm", - "m4.10xlarge": "hvm", - "r3.large": "hvm", - "r3.xlarge": "hvm", - "r3.2xlarge": "hvm", - "r3.4xlarge": "hvm", - "r3.8xlarge": "hvm", - "t1.micro": "pvm", - "t2.micro": "hvm", - "t2.small": "hvm", - "t2.medium": "hvm", - "t2.large": "hvm", - "x1.16xlarge": "hvm", - "x1.32xlarge": "hvm", -} - - -def get_tachyon_version(spark_version): - return SPARK_TACHYON_MAP.get(spark_version, "") - -# Attempt to resolve an appropriate AMI given the architecture and region of the request. 
-def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branch): - if instance_type in EC2_INSTANCE_TYPES: - instance_type = EC2_INSTANCE_TYPES[instance_type] - else: - instance_type = "pvm" - print("Don't recognize %s, assuming type is pvm" % instance_type, file=stderr) - - # URL prefix from which to fetch AMI information - ami_prefix = "{r}/{b}/ami-list".format( - r=spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), - b=spark_ec2_git_branch) - - ami_path = "%s/%s/%s" % (ami_prefix, region, instance_type) - reader = codecs.getreader("ascii") - try: - ami = reader(urlopen(ami_path)).read().strip() - except: - print("Could not resolve AMI at: " + ami_path, file=stderr) - sys.exit(1) - - print("Spark AMI: " + ami) - return ami - - -# Launch a cluster of the given name, by setting up its security groups, -# and then starting new instances in them. -# Returns a tuple of EC2 reservation objects for the master and slaves -# Fails if there already instances running in the cluster's groups. 
-def launch_cluster(conn, opts, cluster_name): - if opts.identity_file is None: - print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr) - sys.exit(1) - - if opts.key_pair is None: - print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr) - sys.exit(1) - - user_data_content = None - if opts.user_data: - with open(opts.user_data) as user_data_file: - user_data_content = user_data_file.read() - - print("Setting up security groups...") - if opts.security_group_prefix is None: - master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id) - slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id) - else: - master_group = get_or_make_group(conn, opts.security_group_prefix + "-master", opts.vpc_id) - slave_group = get_or_make_group(conn, opts.security_group_prefix + "-slaves", opts.vpc_id) - - authorized_address = opts.authorized_address - if master_group.rules == []: # Group was just now created - if opts.vpc_id is None: - master_group.authorize(src_group=master_group) - master_group.authorize(src_group=slave_group) - else: - master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, - src_group=master_group) - master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, - src_group=master_group) - master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, - src_group=master_group) - master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, - src_group=slave_group) - master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, - src_group=slave_group) - master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, - src_group=slave_group) - master_group.authorize('tcp', 22, 22, authorized_address) - master_group.authorize('tcp', 8080, 8081, authorized_address) - master_group.authorize('tcp', 18080, 18080, authorized_address) - master_group.authorize('tcp', 19999, 19999, authorized_address) - 
master_group.authorize('tcp', 50030, 50030, authorized_address) - master_group.authorize('tcp', 50070, 50070, authorized_address) - master_group.authorize('tcp', 60070, 60070, authorized_address) - master_group.authorize('tcp', 4040, 4045, authorized_address) - # Rstudio (GUI for R) needs port 8787 for web access - master_group.authorize('tcp', 8787, 8787, authorized_address) - # HDFS NFS gateway requires 111,2049,4242 for tcp & udp - master_group.authorize('tcp', 111, 111, authorized_address) - master_group.authorize('udp', 111, 111, authorized_address) - master_group.authorize('tcp', 2049, 2049, authorized_address) - master_group.authorize('udp', 2049, 2049, authorized_address) - master_group.authorize('tcp', 4242, 4242, authorized_address) - master_group.authorize('udp', 4242, 4242, authorized_address) - # RM in YARN mode uses 8088 - master_group.authorize('tcp', 8088, 8088, authorized_address) - if opts.ganglia: - master_group.authorize('tcp', 5080, 5080, authorized_address) - if slave_group.rules == []: # Group was just now created - if opts.vpc_id is None: - slave_group.authorize(src_group=master_group) - slave_group.authorize(src_group=slave_group) - else: - slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, - src_group=master_group) - slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, - src_group=master_group) - slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, - src_group=master_group) - slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, - src_group=slave_group) - slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, - src_group=slave_group) - slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, - src_group=slave_group) - slave_group.authorize('tcp', 22, 22, authorized_address) - slave_group.authorize('tcp', 8080, 8081, authorized_address) - slave_group.authorize('tcp', 50060, 50060, authorized_address) - slave_group.authorize('tcp', 50075, 50075, 
authorized_address) - slave_group.authorize('tcp', 60060, 60060, authorized_address) - slave_group.authorize('tcp', 60075, 60075, authorized_address) - - # Check if instances are already running in our groups - existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, - die_on_error=False) - if existing_slaves or (existing_masters and not opts.use_existing_master): - print("ERROR: There are already instances running in group %s or %s" % - (master_group.name, slave_group.name), file=stderr) - sys.exit(1) - - # Figure out Spark AMI - if opts.ami is None: - opts.ami = get_spark_ami(opts.instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) - - if opts.master_ami is None: - opts.master_ami = get_spark_ami(opts.master_instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) - - # we use group ids to work around https://github.com/boto/boto/issues/350 - additional_group_ids = [] - if opts.additional_security_group: - additional_group_ids = [sg.id - for sg in conn.get_all_security_groups() - if opts.additional_security_group in (sg.name, sg.id)] - print("Launching instances...") - - try: - image = conn.get_all_images(image_ids=[opts.ami])[0] - except: - print("Could not find AMI " + opts.ami, file=stderr) - sys.exit(1) - - try: - master_image = conn.get_all_images(image_ids=[opts.master_ami])[0] - except: - print >> stderr, "Could not find AMI " + opts.master_ami - sys.exit(1) - - # Create block device mapping so that we can add EBS volumes if asked to. - # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... 
/dev/sdz - block_map = BlockDeviceMapping() - if opts.ebs_vol_size > 0: - for i in range(opts.ebs_vol_num): - device = EBSBlockDeviceType() - device.size = opts.ebs_vol_size - device.volume_type = opts.ebs_vol_type - device.delete_on_termination = True - block_map["/dev/sd" + chr(ord('s') + i)] = device - - for i in range(get_num_disks(opts.instance_type)): - dev = BlockDeviceType() - dev.ephemeral_name = 'ephemeral%d' % i - name = '/dev/xvd' + string.letters[i + 1] - block_map[name] = dev - # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342). - #if opts.instance_type.startswith('m3.'): - # for i in range(get_num_disks(opts.instance_type)): - # dev = BlockDeviceType() - # dev.ephemeral_name = 'ephemeral%d' % i - # # The first ephemeral drive is /dev/sdb. - # name = '/dev/sd' + string.ascii_letters[i + 1] - # block_map[name] = dev - - # Launch slaves - if opts.spot_price is not None: - # Launch spot instances with the requested price - print("Requesting %d slaves as spot instances with price $%.3f" % - (opts.slaves, opts.spot_price)) - zones = get_zones(conn, opts) - num_zones = len(zones) - i = 0 - my_req_ids = [] - for zone in zones: - num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) - slave_reqs = conn.request_spot_instances( - price=opts.spot_price, - image_id=opts.ami, - launch_group="launch-group-%s" % cluster_name, - placement=zone, - count=num_slaves_this_zone, - key_name=opts.key_pair, - security_group_ids=[slave_group.id] + additional_group_ids, - instance_type=opts.instance_type, - block_device_map=block_map, - subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content, - instance_profile_name=opts.instance_profile_name) - my_req_ids += [req.id for req in slave_reqs] - i += 1 - - start_time = datetime.now() - print("Waiting for spot instances to be granted... 
Request IDs: %s " % my_req_ids) - try: - while True: - time.sleep(10) - reqs = conn.get_all_spot_instance_requests(my_req_ids) - active_instance_ids = filter(lambda req: req.state == "active", reqs) - invalid_states = ["capacity-not-available", "capacity-oversubscribed", "price-too-low"] - invalid = filter(lambda req: req.status.code in invalid_states, reqs) - if len(invalid) > 0: - raise Exception("Invalid state for spot request: %s - status: %s" % - (invalid[0].id, invalid[0].status.message)) - if len(active_instance_ids) == opts.slaves: - print("All %d slaves granted" % opts.slaves) - reservations = conn.get_all_reservations([r.instance_id for r in active_instance_ids]) - slave_nodes = [] - for r in reservations: - slave_nodes += r.instances - break - else: - print("%d of %d slaves granted, waiting longer" % ( - len(active_instance_ids), opts.slaves)) - - if (datetime.now() - start_time).seconds > opts.spot_timeout * 60: - raise Exception("Timed out while waiting for spot instances") - except: - print("Error: %s" % sys.exc_info()[1]) - print("Canceling spot instance requests") - conn.cancel_spot_instance_requests(my_req_ids) - # Log a warning if any of these requests actually launched instances: - (master_nodes, slave_nodes) = get_existing_cluster( - conn, opts, cluster_name, die_on_error=False) - running = len(master_nodes) + len(slave_nodes) - if running: - print(("WARNING: %d instances are still running" % running), file=stderr) - sys.exit(0) - else: - # Launch non-spot instances - zones = get_zones(conn, opts) - num_zones = len(zones) - i = 0 - slave_nodes = [] - for zone in zones: - num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) - if num_slaves_this_zone > 0: - slave_res = image.run( - key_name=opts.key_pair, - security_group_ids=[slave_group.id] + additional_group_ids, - instance_type=opts.instance_type, - placement=zone, - min_count=num_slaves_this_zone, - max_count=num_slaves_this_zone, - block_device_map=block_map, - 
subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content, - instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, - instance_profile_name=opts.instance_profile_name) - slave_nodes += slave_res.instances - print("Launched {s} slave{plural_s} in {z}, regid = {r}".format( - s=num_slaves_this_zone, - plural_s=('' if num_slaves_this_zone == 1 else 's'), - z=zone, - r=slave_res.id)) - i += 1 - - # Launch or resume masters - if existing_masters: - print("Starting master...") - for inst in existing_masters: - if inst.state not in ["shutting-down", "terminated"]: - inst.start() - master_nodes = existing_masters - else: - master_type = opts.master_instance_type - if master_type == "" or opts.master_spot: - master_type = opts.instance_type - if opts.zone == 'all': - opts.zone = random.choice(conn.get_all_zones()).name - if opts.master_spot: - # Launch spot master instance with the requested price - # Note: The spot_price*1.5 is present to ensure a higher bid price to - # the master spot instance, so the master instance will be the - # last one to be terminated in a spot market price increase - print("Requesting master as spot instance with price $%.3f" % - (opts.spot_price)) - master_req = conn.request_spot_instances( - price=(opts.spot_price * 1.5), - image_id=opts.master_ami, - placement=opts.zone, - count=1, - key_name=opts.key_pair, - security_group_ids=[master_group.id] + additional_group_ids, - instance_type=master_type, - block_device_map=block_map, - subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content, - instance_profile_name=opts.instance_profile_name) - my_master_req_id = [req.id for req in master_req] - - # TODO: refactor duplicated spot waiting code - start_time = datetime.now() - print("Waiting for master spot instance to be granted... 
Request ID: %s " % my_master_req_id) - try: - while True: - time.sleep(10) - reqs = conn.get_all_spot_instance_requests(my_master_req_id) - active_instance_ids = filter(lambda req: req.state == "active", reqs) - invalid_states = ["capacity-not-available", "capacity-oversubscribed", "price-too-low"] - invalid = filter(lambda req: req.status.code in invalid_states, reqs) - if len(invalid) > 0: - raise Exception("Invalid state for spot request: %s - status: %s" % - (invalid[0].id, invalid[0].status.message)) - if len(active_instance_ids) == 1: - print("Master spot instance granted") - master_res = conn.get_all_reservations([r.instance_id for r in active_instance_ids]) - master_nodes = master_res[0].instances - break - else: - print("Master spot instance not granted yet, waiting longer") - - if (datetime.now() - start_time).seconds > opts.spot_timeout * 60: - raise Exception("Timed out while waiting for master spot instance") - except: - print("Error: %s" % sys.exc_info()[1]) - print("Canceling master spot instance requests") - conn.cancel_spot_instance_requests(my_master_req_id) - # Log a warning if any of these requests actually launched instances: - (master_nodes, slave_nodes) = get_existing_cluster( - conn, opts, cluster_name, die_on_error=False) - running = len(master_nodes) + len(slave_nodes) - if running: - print(("WARNING: %d instances are still running" % running), file=stderr) - sys.exit(0) - else: - # Launch ondemand instance - master_res = master_image.run( - key_name=opts.key_pair, - security_group_ids=[master_group.id] + additional_group_ids, - instance_type=master_type, - placement=opts.zone, - min_count=1, - max_count=1, - block_device_map=block_map, - subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content, - instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, - instance_profile_name=opts.instance_profile_name) - - master_nodes = master_res.instances - print("Launched master in %s, 
regid = %s" % (zone, master_res.id)) - - # This wait time corresponds to SPARK-4983 - print("Waiting for AWS to propagate instance metadata...") - time.sleep(15) - - # Give the instances descriptive names and set additional tags - additional_tags = {} - if opts.additional_tags.strip(): - additional_tags = dict( - map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',') - ) - - for master in master_nodes: - master.add_tags( - dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) - ) - - for slave in slave_nodes: - slave.add_tags( - dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) - ) - - # Return all the instances - return (master_nodes, slave_nodes) - - -def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): - """ - Get the EC2 instances in an existing cluster if available. - Returns a tuple of lists of EC2 instance objects for the masters and slaves. - """ - print("Searching for existing cluster {c} in region {r}...".format( - c=cluster_name, r=opts.region)) - - def get_instances(group_names): - """ - Get all non-terminated instances that belong to any of the provided security groups. 
- - EC2 reservation filters and instance states are documented here: - http://docs.aws.amazon.com/cli/latest/reference/ec2/describe-instances.html#options - """ - reservations = conn.get_all_reservations( - filters={"instance.group-name": group_names}) - instances = itertools.chain.from_iterable(r.instances for r in reservations) - return [i for i in instances if i.state not in ["shutting-down", "terminated"]] - - master_instances = get_instances([cluster_name + "-master"]) - slave_instances = get_instances([cluster_name + "-slaves"]) - - if any((master_instances, slave_instances)): - print("Found {m} master{plural_m}, {s} slave{plural_s}.".format( - m=len(master_instances), - plural_m=('' if len(master_instances) == 1 else 's'), - s=len(slave_instances), - plural_s=('' if len(slave_instances) == 1 else 's'))) - - if not master_instances and die_on_error: - print("ERROR: Could not find a master for cluster {c} in region {r}.".format( - c=cluster_name, r=opts.region), file=sys.stderr) - sys.exit(1) - - return (master_instances, slave_instances) - - -# Deploy configuration files and run setup scripts on a newly launched -# or started EC2 cluster. 
-def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): - master = get_dns_name(master_nodes[0], opts.private_ips) - if deploy_ssh_key: - print("Generating cluster's SSH key on master...") - key_setup = """ - [ -f ~/.ssh/id_rsa ] || - (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa && - cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys) - """ - ssh(master, opts, key_setup) - dot_ssh_tar = ssh_read(master, opts, ['tar', 'c', '.ssh']) - print("Transferring cluster's SSH key to slaves...") - for slave in slave_nodes: - slave_address = get_dns_name(slave, opts.private_ips) - print(slave_address) - ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar) - - modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', - 'mapreduce', 'spark-standalone', 'tachyon', 'rstudio'] - - if opts.hadoop_major_version == "1": - modules = list(filter(lambda x: x != "mapreduce", modules)) - - if opts.ganglia: - modules.append('ganglia') - - # Clear SPARK_WORKER_INSTANCES if running on YARN - if opts.hadoop_major_version == "yarn": - opts.worker_instances = "" - - # NOTE: We should clone the repository before running deploy_files to - # prevent ec2-variables.sh from being overwritten - print("Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format( - r=opts.spark_ec2_git_repo, b=opts.spark_ec2_git_branch)) - ssh( - host=master, - opts=opts, - command="rm -rf spark-ec2" - + " && " - + "git clone {r} -b {b} spark-ec2".format(r=opts.spark_ec2_git_repo, - b=opts.spark_ec2_git_branch) - ) - - print("Deploying files to master...") - deploy_files( - conn=conn, - root_dir=SPARK_EC2_DIR + "/" + "deploy.generic", - opts=opts, - master_nodes=master_nodes, - slave_nodes=slave_nodes, - modules=modules - ) - - if opts.deploy_root_dir is not None: - print("Deploying {s} to master...".format(s=opts.deploy_root_dir)) - deploy_user_files( - root_dir=opts.deploy_root_dir, - opts=opts, - master_nodes=master_nodes - ) - - print("Running setup on master...") - 
setup_spark_cluster(master, opts) - print("Done!") - - -def setup_spark_cluster(master, opts): - ssh(master, opts, "chmod u+x spark-ec2/setup.sh") - ssh(master, opts, "spark-ec2/setup.sh") - print("Spark standalone cluster started at http://%s:8080" % master) - - if opts.ganglia: - print("Ganglia started at http://%s:5080/ganglia" % master) - - -def is_ssh_available(host, opts, print_ssh_output=True): - """ - Check if SSH is available on a host. - """ - s = subprocess.Popen( - ssh_command(opts) + ['-t', '-t', '-o', 'ConnectTimeout=3', - '%s@%s' % (opts.user, host), stringify_command('true')], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT # we pipe stderr through stdout to preserve output order - ) - cmd_output = s.communicate()[0] # [1] is stderr, which we redirected to stdout - - if s.returncode != 0 and print_ssh_output: - # extra leading newline is for spacing in wait_for_cluster_state() - print(textwrap.dedent("""\n - Warning: SSH connection error. (This could be temporary.) - Host: {h} - SSH return code: {r} - SSH output: {o} - """).format( - h=host, - r=s.returncode, - o=cmd_output.strip() - )) - - return s.returncode == 0 - - -def is_cluster_ssh_available(cluster_instances, opts): - """ - Check if SSH is available on all the instances in a cluster. - """ - for i in cluster_instances: - dns_name = get_dns_name(i, opts.private_ips) - if not is_ssh_available(host=dns_name, opts=opts): - return False - else: - return True - - -def wait_for_cluster_state(conn, opts, cluster_instances, cluster_state): - """ - Wait for all the instances in the cluster to reach a designated state. - - cluster_instances: a list of boto.ec2.instance.Instance - cluster_state: a string representing the desired state of all the instances in the cluster - value can be 'ssh-ready' or a valid value from boto.ec2.instance.InstanceState such as - 'running', 'terminated', etc. 
- (would be nice to replace this with a proper enum: http://stackoverflow.com/a/1695250) - """ - sys.stdout.write( - "Waiting for cluster to enter '{s}' state.".format(s=cluster_state) - ) - sys.stdout.flush() - - start_time = datetime.now() - num_attempts = 0 - - while True: - time.sleep(5 * num_attempts) # seconds - - for i in cluster_instances: - i.update() - - max_batch = 100 - statuses = [] - for j in xrange(0, len(cluster_instances), max_batch): - batch = [i.id for i in cluster_instances[j:j + max_batch]] - statuses.extend(conn.get_all_instance_status(instance_ids=batch)) - - if cluster_state == 'ssh-ready': - if all(i.state == 'running' for i in cluster_instances) and \ - all(s.system_status.status == 'ok' for s in statuses) and \ - all(s.instance_status.status == 'ok' for s in statuses) and \ - is_cluster_ssh_available(cluster_instances, opts): - break - else: - if all(i.state == cluster_state for i in cluster_instances): - break - - num_attempts += 1 - - sys.stdout.write(".") - sys.stdout.flush() - - sys.stdout.write("\n") - - end_time = datetime.now() - print("Cluster is now in '{s}' state. Waited {t} seconds.".format( - s=cluster_state, - t=(end_time - start_time).seconds - )) - - -# Get number of local disks available for a given EC2 instance type. -def get_num_disks(instance_type): - # Source: http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html - # Last Updated: 2015-06-19 - # For easy maintainability, please keep this manually-inputted dictionary sorted by key. 
- disks_by_instance = { - "c1.medium": 1, - "c1.xlarge": 4, - "c3.large": 2, - "c3.xlarge": 2, - "c3.2xlarge": 2, - "c3.4xlarge": 2, - "c3.8xlarge": 2, - "c4.large": 0, - "c4.xlarge": 0, - "c4.2xlarge": 0, - "c4.4xlarge": 0, - "c4.8xlarge": 0, - "cc1.4xlarge": 2, - "cc2.8xlarge": 4, - "cg1.4xlarge": 2, - "cr1.8xlarge": 2, - "d2.xlarge": 3, - "d2.2xlarge": 6, - "d2.4xlarge": 12, - "d2.8xlarge": 24, - "g2.2xlarge": 1, - "g2.8xlarge": 2, - "hi1.4xlarge": 2, - "hs1.8xlarge": 24, - "i2.xlarge": 1, - "i2.2xlarge": 2, - "i2.4xlarge": 4, - "i2.8xlarge": 8, - "m1.small": 1, - "m1.medium": 1, - "m1.large": 2, - "m1.xlarge": 4, - "m2.xlarge": 1, - "m2.2xlarge": 1, - "m2.4xlarge": 2, - "m3.medium": 1, - "m3.large": 1, - "m3.xlarge": 2, - "m3.2xlarge": 2, - "m4.large": 0, - "m4.xlarge": 0, - "m4.2xlarge": 0, - "m4.4xlarge": 0, - "m4.10xlarge": 0, - "r3.large": 1, - "r3.xlarge": 1, - "r3.2xlarge": 1, - "r3.4xlarge": 1, - "r3.8xlarge": 2, - "t1.micro": 0, - "t2.micro": 0, - "t2.small": 0, - "t2.medium": 0, - "t2.large": 0, - "x1.16xlarge": 1, - "x1.32xlarge": 2, - - } - if instance_type in disks_by_instance: - return disks_by_instance[instance_type] - else: - print("WARNING: Don't know number of disks on instance type %s; assuming 1" - % instance_type, file=stderr) - return 1 - - -# Deploy the configuration file templates in a given local directory to -# a cluster, filling in any template parameters with information about the -# cluster (e.g. lists of masters and slaves). Files are only deployed to -# the first master instance in the cluster, and we expect the setup -# script to be run on that instance to copy them to other nodes. -# -# root_dir should be an absolute path to the directory with the files we want to deploy. 
-def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): - active_master = get_dns_name(master_nodes[0], opts.private_ips) - - num_disks = get_num_disks(opts.instance_type) - hdfs_data_dirs = "/mnt/ephemeral-hdfs/data" - mapred_local_dirs = "/mnt/hadoop/mrlocal" - spark_local_dirs = "/mnt/spark" - if num_disks > 1: - for i in range(2, num_disks + 1): - hdfs_data_dirs += ",/mnt%d/ephemeral-hdfs/data" % i - mapred_local_dirs += ",/mnt%d/hadoop/mrlocal" % i - spark_local_dirs += ",/mnt%d/spark" % i - - cluster_url = "%s:7077" % active_master - - if opts.spark_version.startswith("http"): - # Custom pre-built spark package - spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) - tachyon_v = "" - print("Deploying Spark via custom bunlde; Tachyon won't be set up") - modules = filter(lambda x: x != "tachyon", modules) - elif "." in opts.spark_version: - # Pre-built Spark deploy - spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) - tachyon_v = get_tachyon_version(spark_v) - else: - # Spark-only custom deploy - spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version) - tachyon_v = "" - print("Deploying Spark via git hash; Tachyon won't be set up") - modules = filter(lambda x: x != "tachyon", modules) - - master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes] - slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes] - worker_instances_str = "%d" % opts.worker_instances if opts.worker_instances else "" - executor_instances_str = "%d" % opts.executor_instances if opts.executor_instances else "" - template_vars = { - "master_list": '\n'.join(master_addresses), - "active_master": active_master, - "slave_list": '\n'.join(slave_addresses), - "cluster_url": cluster_url, - "hdfs_data_dirs": hdfs_data_dirs, - "mapred_local_dirs": mapred_local_dirs, - "spark_local_dirs": spark_local_dirs, - "swap": str(opts.swap), - "modules": '\n'.join(modules), - "spark_version": 
spark_v, - "tachyon_version": tachyon_v, - "hadoop_major_version": opts.hadoop_major_version, - "spark_worker_instances": worker_instances_str, - "spark_executor_instances": executor_instances_str, - "spark_master_opts": opts.master_opts - } - - if opts.copy_aws_credentials: - template_vars["aws_access_key_id"] = conn.aws_access_key_id - template_vars["aws_secret_access_key"] = conn.aws_secret_access_key - else: - template_vars["aws_access_key_id"] = "" - template_vars["aws_secret_access_key"] = "" - - # Create a temp directory in which we will place all the files to be - # deployed after we substitue template parameters in them - tmp_dir = tempfile.mkdtemp() - for path, dirs, files in os.walk(root_dir): - if path.find(".svn") == -1: - dest_dir = os.path.join('/', path[len(root_dir):]) - local_dir = tmp_dir + dest_dir - if not os.path.exists(local_dir): - os.makedirs(local_dir) - for filename in files: - if filename[0] not in '#.~' and filename[-1] != '~': - dest_file = os.path.join(dest_dir, filename) - local_file = tmp_dir + dest_file - with open(os.path.join(path, filename)) as src: - with open(local_file, "w") as dest: - text = src.read() - for key in template_vars: - text = text.replace("{{" + key + "}}", template_vars[key]) - dest.write(text) - dest.close() - # rsync the whole directory over to the master machine - command = [ - 'rsync', '-rv', - '-e', stringify_command(ssh_command(opts)), - "%s/" % tmp_dir, - "%s@%s:/" % (opts.user, active_master) - ] - subprocess.check_call(command) - # Remove the temp directory we created above - shutil.rmtree(tmp_dir) - - -# Deploy a given local directory to a cluster, WITHOUT parameter substitution. -# Note that unlike deploy_files, this works for binary files. -# Also, it is up to the user to add (or not) the trailing slash in root_dir. -# Files are only deployed to the first master instance in the cluster. -# -# root_dir should be an absolute path. 
-def deploy_user_files(root_dir, opts, master_nodes): - active_master = get_dns_name(master_nodes[0], opts.private_ips) - command = [ - 'rsync', '-rv', - '-e', stringify_command(ssh_command(opts)), - "%s" % root_dir, - "%s@%s:/" % (opts.user, active_master) - ] - subprocess.check_call(command) - - -def stringify_command(parts): - if isinstance(parts, str): - return parts - else: - return ' '.join(map(pipes.quote, parts)) - - -def ssh_args(opts): - parts = ['-o', 'StrictHostKeyChecking=no'] - parts += ['-o', 'UserKnownHostsFile=/dev/null'] - if opts.identity_file is not None: - parts += ['-i', opts.identity_file] - return parts - - -def ssh_command(opts): - return ['ssh'] + ssh_args(opts) - - -# Run a command on a host through ssh, retrying up to five times -# and then throwing an exception if ssh continues to fail. -def ssh(host, opts, command): - tries = 0 - while True: - try: - return subprocess.check_call( - ssh_command(opts) + ['-t', '-t', '%s@%s' % (opts.user, host), - stringify_command(command)]) - except subprocess.CalledProcessError as e: - if tries > 5: - # If this was an ssh failure, provide the user with hints. 
- if e.returncode == 255: - raise UsageError( - "Failed to SSH to remote host {0}.\n" - "Please check that you have provided the correct --identity-file and " - "--key-pair parameters and try again.".format(host)) - else: - raise e - print("Error executing remote command, retrying after 30 seconds: {0}".format(e), - file=stderr) - time.sleep(30) - tries = tries + 1 - - -# Backported from Python 2.7 for compatiblity with 2.6 (See SPARK-1990) -def _check_output(*popenargs, **kwargs): - if 'stdout' in kwargs: - raise ValueError('stdout argument not allowed, it will be overridden.') - process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) - output, unused_err = process.communicate() - retcode = process.poll() - if retcode: - cmd = kwargs.get("args") - if cmd is None: - cmd = popenargs[0] - raise subprocess.CalledProcessError(retcode, cmd, output=output) - return output - - -def ssh_read(host, opts, command): - return _check_output( - ssh_command(opts) + ['%s@%s' % (opts.user, host), stringify_command(command)]) - - -def ssh_write(host, opts, command, arguments): - tries = 0 - while True: - proc = subprocess.Popen( - ssh_command(opts) + ['%s@%s' % (opts.user, host), stringify_command(command)], - stdin=subprocess.PIPE) - proc.stdin.write(arguments) - proc.stdin.close() - status = proc.wait() - if status == 0: - break - elif tries > 5: - raise RuntimeError("ssh_write failed with error %s" % proc.returncode) - else: - print("Error {0} while executing remote command, retrying after 30 seconds". 
- format(status), file=stderr) - time.sleep(30) - tries = tries + 1 - - -# Gets a list of zones to launch instances in -def get_zones(conn, opts): - if opts.zone == 'all': - zones = [z.name for z in conn.get_all_zones()] - else: - zones = [opts.zone] - return zones - - -# Gets the number of items in a partition -def get_partition(total, num_partitions, current_partitions): - num_slaves_this_zone = total // num_partitions - if (total % num_partitions) - current_partitions > 0: - num_slaves_this_zone += 1 - return num_slaves_this_zone - - -# Gets the IP address, taking into account the --private-ips flag -def get_ip_address(instance, private_ips=False): - ip = instance.ip_address if not private_ips else \ - instance.private_ip_address - return ip - - -# Gets the DNS name, taking into account the --private-ips flag -def get_dns_name(instance, private_ips=False): - dns = instance.public_dns_name if not private_ips else \ - instance.private_ip_address - if not dns: - raise UsageError("Failed to determine hostname of {0}.\n" - "Please check that you provided --private-ips if " - "necessary".format(instance)) - return dns - - -def real_main(): - (opts, action, cluster_name) = parse_args() - - # Input parameter validation - get_validate_spark_version(opts.spark_version, opts.spark_git_repo) - - if opts.wait is not None: - # NOTE: DeprecationWarnings are silent in 2.7+ by default. - # To show them, run Python with the -Wdefault switch. - # See: https://docs.python.org/3.5/whatsnew/2.7.html - warnings.warn( - "This option is deprecated and has no effect. 
" - "spark-ec2 automatically waits as long as necessary for clusters to start up.", - DeprecationWarning - ) - - if opts.identity_file is not None: - if not os.path.exists(opts.identity_file): - print("ERROR: The identity file '{f}' doesn't exist.".format(f=opts.identity_file), - file=stderr) - sys.exit(1) - - file_mode = os.stat(opts.identity_file).st_mode - if not (file_mode & S_IRUSR) or not oct(file_mode)[-2:] == '00': - print("ERROR: The identity file must be accessible only by you.", file=stderr) - print('You can fix this with: chmod 400 "{f}"'.format(f=opts.identity_file), - file=stderr) - sys.exit(1) - - if opts.instance_type not in EC2_INSTANCE_TYPES: - print("Warning: Unrecognized EC2 instance type for instance-type: {t}".format( - t=opts.instance_type), file=stderr) - - if opts.master_instance_type != "": - if opts.master_instance_type not in EC2_INSTANCE_TYPES: - print("Warning: Unrecognized EC2 instance type for master-instance-type: {t}".format( - t=opts.master_instance_type), file=stderr) - - if opts.ebs_vol_num > 8: - print("ebs-vol-num cannot be greater than 8", file=stderr) - sys.exit(1) - - # Prevent breaking ami_prefix (/, .git and startswith checks) - # Prevent forks with non spark-ec2 names for now. - if opts.spark_ec2_git_repo.endswith("/") or \ - opts.spark_ec2_git_repo.endswith(".git") or \ - not opts.spark_ec2_git_repo.startswith("https://github.com") or \ - not opts.spark_ec2_git_repo.endswith("spark-ec2"): - print("spark-ec2-git-repo must be a github repo and it must not have a trailing / or .git. 
" - "Furthermore, we currently only support forks named spark-ec2.", file=stderr) - sys.exit(1) - - if not (opts.deploy_root_dir is None or - (os.path.isabs(opts.deploy_root_dir) and - os.path.isdir(opts.deploy_root_dir) and - os.path.exists(opts.deploy_root_dir))): - print("--deploy-root-dir must be an absolute path to a directory that exists " - "on the local file system", file=stderr) - sys.exit(1) - - try: - if opts.profile is None: - conn = ec2.connect_to_region(opts.region) - else: - conn = ec2.connect_to_region(opts.region, profile_name=opts.profile) - except Exception as e: - print((e), file=stderr) - sys.exit(1) - - # Select an AZ at random if it was not specified. - if opts.zone == "": - opts.zone = random.choice(conn.get_all_zones()).name - - if action == "launch": - if opts.slaves <= 0: - print("ERROR: You have to start at least 1 slave", file=sys.stderr) - sys.exit(1) - if opts.resume: - (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - else: - (master_nodes, slave_nodes) = launch_cluster(conn, opts, cluster_name) - wait_for_cluster_state( - conn=conn, - opts=opts, - cluster_instances=(master_nodes + slave_nodes), - cluster_state='ssh-ready' - ) - setup_cluster(conn, master_nodes, slave_nodes, opts, True) - - elif action == "destroy": - (master_nodes, slave_nodes) = get_existing_cluster( - conn, opts, cluster_name, die_on_error=False) - - if any(master_nodes + slave_nodes): - print("The following instances will be terminated:") - for inst in master_nodes + slave_nodes: - print("> %s" % get_dns_name(inst, opts.private_ips)) - print("ALL DATA ON ALL NODES WILL BE LOST!!") - - msg = "Are you sure you want to destroy the cluster {c}? 
(y/N) ".format(c=cluster_name) - response = raw_input(msg) - if response == "y": - print("Terminating master...") - for inst in master_nodes: - inst.terminate() - print("Terminating slaves...") - for inst in slave_nodes: - inst.terminate() - - # Delete security groups as well - if opts.delete_groups: - group_names = [cluster_name + "-master", cluster_name + "-slaves"] - wait_for_cluster_state( - conn=conn, - opts=opts, - cluster_instances=(master_nodes + slave_nodes), - cluster_state='terminated' - ) - print("Deleting security groups (this will take some time)...") - attempt = 1 - while attempt <= 3: - print("Attempt %d" % attempt) - groups = [g for g in conn.get_all_security_groups() if g.name in group_names] - success = True - # Delete individual rules in all groups before deleting groups to - # remove dependencies between them - for group in groups: - print("Deleting rules in security group " + group.name) - for rule in group.rules: - for grant in rule.grants: - success &= group.revoke(ip_protocol=rule.ip_protocol, - from_port=rule.from_port, - to_port=rule.to_port, - src_group=grant) - - # Sleep for AWS eventual-consistency to catch up, and for instances - # to terminate - time.sleep(30) # Yes, it does have to be this long :-( - for group in groups: - try: - # It is needed to use group_id to make it work with VPC - conn.delete_security_group(group_id=group.id) - print("Deleted security group %s" % group.name) - except boto.exception.EC2ResponseError: - success = False - print("Failed to delete security group %s" % group.name) - - # Unfortunately, group.revoke() returns True even if a rule was not - # deleted, so this needs to be rerun if something fails - if success: - break - - attempt += 1 - - if not success: - print("Failed to delete all security groups after 3 tries.") - print("Try re-running in a few minutes.") - - elif action == "login": - (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - if not master_nodes[0].public_dns_name 
and not opts.private_ips: - print("Master has no public DNS name. Maybe you meant to specify --private-ips?") - else: - master = get_dns_name(master_nodes[0], opts.private_ips) - print("Logging into master " + master + "...") - proxy_opt = [] - if opts.proxy_port is not None: - proxy_opt = ['-D', opts.proxy_port] - subprocess.check_call( - ssh_command(opts) + proxy_opt + ['-t', '-t', "%s@%s" % (opts.user, master)]) - - elif action == "reboot-slaves": - response = raw_input( - "Are you sure you want to reboot the cluster " + - cluster_name + " slaves?\n" + - "Reboot cluster slaves " + cluster_name + " (y/N): ") - if response == "y": - (master_nodes, slave_nodes) = get_existing_cluster( - conn, opts, cluster_name, die_on_error=False) - print("Rebooting slaves...") - for inst in slave_nodes: - if inst.state not in ["shutting-down", "terminated"]: - print("Rebooting " + inst.id) - inst.reboot() - - elif action == "get-master": - (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - if not master_nodes[0].public_dns_name and not opts.private_ips: - print("Master has no public DNS name. 
Maybe you meant to specify --private-ips?") - else: - print(get_dns_name(master_nodes[0], opts.private_ips)) - - elif action == "stop": - response = raw_input( - "Are you sure you want to stop the cluster " + - cluster_name + "?\nDATA ON EPHEMERAL DISKS WILL BE LOST, " + - "BUT THE CLUSTER WILL KEEP USING SPACE ON\n" + - "AMAZON EBS IF IT IS EBS-BACKED!!\n" + - "All data on spot-instance slaves will be lost.\n" + - "Stop cluster " + cluster_name + " (y/N): ") - if response == "y": - (master_nodes, slave_nodes) = get_existing_cluster( - conn, opts, cluster_name, die_on_error=False) - print("Stopping master...") - for inst in master_nodes: - if inst.state not in ["shutting-down", "terminated"]: - inst.stop() - print("Stopping slaves...") - for inst in slave_nodes: - if inst.state not in ["shutting-down", "terminated"]: - if inst.spot_instance_request_id: - inst.terminate() - else: - inst.stop() - - elif action == "start": - (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - print("Starting slaves...") - for inst in slave_nodes: - if inst.state not in ["shutting-down", "terminated"]: - inst.start() - print("Starting master...") - for inst in master_nodes: - if inst.state not in ["shutting-down", "terminated"]: - inst.start() - wait_for_cluster_state( - conn=conn, - opts=opts, - cluster_instances=(master_nodes + slave_nodes), - cluster_state='ssh-ready' - ) - - # Determine types of running instances - existing_master_type = master_nodes[0].instance_type - existing_slave_type = slave_nodes[0].instance_type - # Setting opts.master_instance_type to the empty string indicates we - # have the same instance type for the master and the slaves - if existing_master_type == existing_slave_type: - existing_master_type = "" - opts.master_instance_type = existing_master_type - opts.instance_type = existing_slave_type - - setup_cluster(conn, master_nodes, slave_nodes, opts, False) - - else: - print("Invalid action: %s" % action, file=stderr) - sys.exit(1) 
- - -def main(): - try: - real_main() - except UsageError as e: - print("\nError:\n", e, file=stderr) - sys.exit(1) - - -if __name__ == "__main__": - logging.basicConfig() - main() diff --git a/tools/utils.py b/tools/utils.py index bac56029..39d6129f 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -20,9 +20,10 @@ def parse_nodes(active_instances, cluster_name): slave_nodes = [] for instance in active_instances: group_names = [g.name for g in instance.groups] - if (cluster_name + "-master") in group_names: + # This can handle both spark-ec2 and flintrock clusters + if (cluster_name + "-master") in group_names or (("flintrock-" + cluster_name) in group_names and instance.tags.get('flintrock-role') == 'master'): master_nodes.append(instance) - elif (cluster_name + "-slaves") in group_names: + elif (cluster_name + "-slaves") in group_names or (("flintrock-" + cluster_name) in group_names and instance.tags.get('flintrock-role') in ('slave', None)): slave_nodes.append(instance) return (master_nodes, slave_nodes) @@ -121,4 +122,3 @@ def check_call_with_timeout(args, stdin=None, stdout=None, if p.returncode != 0: raise subprocess.CalledProcessError(p.returncode, args) return p.returncode - From 2b3c7a39cc58b028579582404750cd06aa4aeeff Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Mon, 6 Feb 2017 18:17:29 -0200 Subject: [PATCH 150/268] Avoid changing permissions of .ssh directory --- remote_hook.sh | 3 ++- tools/flintrock | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 86f1f56b..081ca880 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -28,7 +28,8 @@ echo $$ > "${RUNNING_FILE}" # Let us read the spark home even when the image doesn't give us the permission -sudo chmod -R o+rx /root +sudo chmod o+rx /root +sudo chmod -R o+rx /root/spark notify_error_and_exit() { description="${1}" diff --git a/tools/flintrock b/tools/flintrock index 541697fb..325d3eb1 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 541697fb11912df6298d588b845809966e94d280 +Subproject commit 325d3eb12a2c7a732a7ebd7d1a5d806803216d03 From 58610661d80c9c79edc3a768ee951a0a2e78628e Mon Sep 17 00:00:00 2001 From: Fernando Rodrigues da Silva Date: Tue, 14 Feb 2017 16:54:58 -0200 Subject: [PATCH 151/268] make parseUri return a Try (#116) --- .../core/http/AsyncHttpClientStreamApi.scala | 6 +++--- .../scala/ignition/core/utils/URLUtils.scala | 19 ++++++++++++------- .../http/AsyncHttpClientStreamApiSpec.scala | 2 ++ .../ignition/core/utils/URLUtilsSpec.scala | 12 +++++++----- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala index e95e4811..6868f0b7 100644 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala @@ -10,6 +10,7 @@ import spray.http._ import scala.concurrent.Future import scala.concurrent.duration._ import scala.language.postfixOps +import scala.util.Try object AsyncHttpClientStreamApi { @@ -52,11 +53,10 @@ object AsyncHttpClientStreamApi { requestConfiguration: Option[RequestConfiguration] = None) { def uri: Uri = { - // Note: 
This will guarantee we create a valid request (one with a valid uri). Will throw an exception if invalid if (params.nonEmpty) - URLUtils.parseUri(url).withQuery(params) + URLUtils.parseUri(url).map(_.withQuery(params)).get else - URLUtils.parseUri(url) + URLUtils.parseUri(url).get } } diff --git a/src/main/scala/ignition/core/utils/URLUtils.scala b/src/main/scala/ignition/core/utils/URLUtils.scala index 800a3a1a..f66a3f03 100644 --- a/src/main/scala/ignition/core/utils/URLUtils.scala +++ b/src/main/scala/ignition/core/utils/URLUtils.scala @@ -6,6 +6,8 @@ import org.apache.http.client.utils.URIBuilder import spray.http.Uri import spray.http.Uri.Query +import scala.util.Try + object URLUtils { // Due to ancient standards, Java will encode space as + instead of using percent. @@ -13,14 +15,17 @@ object URLUtils { // See: // http://stackoverflow.com/questions/1634271/url-encoding-the-space-character-or-20 // https://docs.oracle.com/javase/7/docs/api/java/net/URLEncoder.html#encode(java.lang.String,%20java.lang.String) - def sanitizePathSegment(segment: String) = - URLEncoder.encode(URLDecoder.decode(segment, "UTF-8"), "UTF-8").replace("+", "%20") - - def parseUri(urlStr: String): Uri = { - val url = new URL(urlStr) - val sanePath = url.getPath.split("/").map(sanitizePathSegment).mkString("/") + def sanitizePathSegment(segment: String): Try[String] = + Try { URLEncoder.encode(URLDecoder.decode(segment, "UTF-8"), "UTF-8").replace("+", "%20") } - Uri.from( + def parseUri(urlStr: String): Try[Uri] = { + for { + url <- Try(new URL(urlStr)) + rawSegments = url.getPath.split("/") + saneSegments = rawSegments.map(sanitizePathSegment) + if saneSegments.forall(_.isSuccess) + sanePath = saneSegments.map(_.get).mkString("/") + } yield Uri.from( scheme = url.getProtocol, userinfo = Option(url.getUserInfo).getOrElse(""), host = url.getHost, diff --git a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala 
b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala index 37accf5b..fb774b6e 100644 --- a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala +++ b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala @@ -3,6 +3,8 @@ package ignition.core.http import ignition.core.http.AsyncHttpClientStreamApi.Request import org.scalatest.{FunSpec, Matchers} +import scala.util.Success + class AsyncHttpClientStreamApiSpec extends FunSpec with Matchers { it("should do the best to parse the provided uri") { diff --git a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala index 6665e3ec..114da15f 100644 --- a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala @@ -2,6 +2,8 @@ package ignition.core.utils import org.scalatest.{FlatSpec, Matchers} +import scala.util.Success + class URLUtilsSpec extends FlatSpec with Matchers { "URLUtils" should "add parameters to url with encoded params in base url and not be double encoded" in { @@ -57,19 +59,19 @@ class URLUtilsSpec extends FlatSpec with Matchers { ) tests.zip(expectations).foreach { - case (url, expected) => URLUtils.parseUri(url).toString shouldBe expected + case (url, expected) => URLUtils.parseUri(url).map(_.toString) shouldBe Success(expected) } } it should "not encode percent characters in url path" in { val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" - val sane = URLUtils.parseUri(url).toString - sane shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" + val sane = URLUtils.parseUri(url).map(_.toString) + sane shouldBe Success("http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf") } it should "encode space characters with percent in URL path" in { val url = 
"http://www.example.com/Pentagrama+Invertido.xml?q=blah+bleh" - val sane = URLUtils.parseUri(url).toString - sane shouldBe "http://www.example.com/Pentagrama%20Invertido.xml?q=blah+bleh" + val sane = URLUtils.parseUri(url).map(_.toString) + sane shouldBe Success("http://www.example.com/Pentagrama%20Invertido.xml?q=blah+bleh") } } From 21f9136cba067b5f2b6b53dff4e548e552e229a6 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 1 Mar 2017 18:09:39 -0300 Subject: [PATCH 152/268] Make it possible to save job execution --- remote_hook.sh | 2 ++ src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 2 ++ 2 files changed, 4 insertions(+) diff --git a/remote_hook.sh b/remote_hook.sh index 081ca880..6648ccd8 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -31,6 +31,8 @@ echo $$ > "${RUNNING_FILE}" sudo chmod o+rx /root sudo chmod -R o+rx /root/spark +sudo mkdir -p /media/tmp/spark-events + notify_error_and_exit() { description="${1}" echo "Exiting because: ${description}" diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index 0dec0896..ab47ee12 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -73,6 +73,8 @@ object CoreJobRunner { val sparkConf = new SparkConf() sparkConf.set("spark.executor.memory", config.executorMemory) + sparkConf.set("spark.eventLog.dir", "file:///media/tmp/spark-events") + sparkConf.setMaster(config.master) sparkConf.setAppName(appName) From efbf31b1ca8d19def8de0b8c4fc895f595241a57 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 14 Mar 2017 17:49:11 -0300 Subject: [PATCH 153/268] Make EBS root size be configurable --- tools/cluster.py | 3 +++ tools/flintrock | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 4a99a214..83e2b81d 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -41,6 +41,7 @@ default_executor_instances = '1' default_master_instance_type = '' default_driver_heap_size = '12G' +default_min_root_ebs_size_gb = '30' default_region = 'us-east-1' default_zone = default_region + 'b' default_key_id = 'ignition_key' @@ -217,6 +218,7 @@ def launch(cluster_name, slaves, # TODO: consider implementing in flintrock master_instance_type=default_master_instance_type, executor_instances=default_executor_instances, + min_root_ebs_size_gb=default_min_root_ebs_size_gb, retries_on_same_cluster=5, max_clusters_to_create=5, minimum_percentage_healthy_slaves=0.9, @@ -281,6 +283,7 @@ def launch(cluster_name, slaves, '--ec2-region', region, '--ec2-availability-zone', zone, '--ec2-instance-type', instance_type, + '--ec2-min-root-ebs-size-gb', min_root_ebs_size_gb, '--assume-yes', '--install-spark', '--install-hdfs', diff --git a/tools/flintrock b/tools/flintrock index 325d3eb1..ecee499a 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 325d3eb12a2c7a732a7ebd7d1a5d806803216d03 +Subproject commit ecee499a762aa0dc5e5a875f096f8f606f0e79ea From 2f57ac4caa8b3042734998ed5cee648b78142ceb Mon Sep 17 00:00:00 2001 From: Fernando Rodrigues da Silva Date: Mon, 3 Apr 2017 14:45:27 -0300 Subject: [PATCH 154/268] updating flintrock version (#118) --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index ecee499a..f9091b3c 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit ecee499a762aa0dc5e5a875f096f8f606f0e79ea +Subproject commit f9091b3ce508c814fd97ab3936ae77335feafff8 From 
d15fca4a505ed8fee35d4c2837d04f7e18cffa7d Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Mon, 3 Apr 2017 20:54:05 -0300 Subject: [PATCH 155/268] Improve flintrock sanity check and minor stuff --- src/main/scala/ignition/core/utils/IntBag.scala | 4 ++++ tools/cluster.py | 16 +++++++++------- tools/flintrock | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/main/scala/ignition/core/utils/IntBag.scala b/src/main/scala/ignition/core/utils/IntBag.scala index a53d2d8f..1dfce82a 100644 --- a/src/main/scala/ignition/core/utils/IntBag.scala +++ b/src/main/scala/ignition/core/utils/IntBag.scala @@ -13,6 +13,10 @@ object IntBag { } case class IntBag(histogram: collection.Map[Long, Long]) { + + def +(n: Long) = + this ++ IntBag.from(n :: Nil) + def ++(other: IntBag): IntBag = { val newHistogram = scala.collection.mutable.HashMap.empty[Long, Long] (histogram.keySet ++ other.histogram.keySet).foreach(k => newHistogram += (k -> (histogram.getOrElse(k, 0L) + other.histogram.getOrElse(k, 0L)))) diff --git a/tools/cluster.py b/tools/cluster.py index 83e2b81d..29d75ac2 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -137,12 +137,13 @@ def chdir_to_ec2_script_and_get_path(): return ec2_script_path -def call_ec2_script(args, timeout_total_minutes, timeout_inactivity_minutes): +def call_ec2_script(args, timeout_total_minutes, timeout_inactivity_minutes, stdout=None): ec2_script_path = chdir_to_ec2_script_and_get_path() return check_call_with_timeout(['/usr/bin/env', 'python3', '-u', ec2_script_path] + args, - timeout_total_minutes=timeout_total_minutes, - timeout_inactivity_minutes=timeout_inactivity_minutes) + stdout=stdout, + timeout_total_minutes=timeout_total_minutes, + timeout_inactivity_minutes=timeout_inactivity_minutes) def cluster_exists(cluster_name, region): @@ -712,7 +713,8 @@ def killall_jobs(cluster_name, key_file=default_key_file, def check_flintrock_installation(): try: - call_ec2_script(['--help'], 1 , 1) + with 
file('/dev/null', 'w') as devnull: + call_ec2_script(['--help'], 1 , 1, stdout=devnull) except: setup = os.path.join(ec2_script_base_path(), 'setup.py') if not os.path.exists(setup): @@ -726,10 +728,10 @@ def check_flintrock_installation(): else: log.error(''' Some dependencies are missing. For an Ubuntu system, try the following: -sudo apt-get install python3-yaml libyaml-dev +sudo apt-get install python3-yaml libyaml-dev python3-pip sudo python3 -m pip install -U pip packaging setuptools cd {flintrock} -sudo pip3 -r requirements/user.pip +sudo pip3 install -r requirements/user.pip '''.format(flintrock=ec2_script_base_path())) sys.exit(1) @@ -740,5 +742,5 @@ def check_flintrock_installation(): kill_job, killall_jobs, collect_job_results], namespace="jobs") if __name__ == '__main__': - + check_flintrock_installation() parser.dispatch() diff --git a/tools/flintrock b/tools/flintrock index f9091b3c..dd7354ac 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit f9091b3ce508c814fd97ab3936ae77335feafff8 +Subproject commit dd7354ac8319ecbc6240ef5542ecfeeb4c0f55a6 From a8d8a226f80f03a561a3565cbb192bc024d7d3f3 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 4 Apr 2017 15:27:54 -0300 Subject: [PATCH 156/268] Use master flintrock --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index dd7354ac..4629fe4b 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit dd7354ac8319ecbc6240ef5542ecfeeb4c0f55a6 +Subproject commit 4629fe4bc1f333dd149a44dcc5d9b8775186b324 From 66b16c51a95279d2ebcd52b72632f85c9058f32f Mon Sep 17 00:00:00 2001 From: Rafael Zimmermann Date: Thu, 15 Jun 2017 16:40:52 +0200 Subject: [PATCH 157/268] Create README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8f30027e..8b395319 100644 --- a/README.md +++ b/README.md @@ -8,4 +8,4 @@ It also provides many utilities for Spark jobs and Scala programs in general. It should be used inside a project as a submodule. See https://github.com/chaordic/ignition-template for an example. # Getting started -See http://monkeys.chaordic.com.br/start-using-spark-with-ignition/ for quick-start tutorial +See [Start using Spark with Ignition!](http://monkeys.chaordic.com.br/2015/03/22/start-using-spark-with-ignition.html) for quick-start tutorial From 4861be7fc664dcc3abb4963d1d3dfd84f392dc8c Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 7 Jul 2017 10:01:12 -0300 Subject: [PATCH 158/268] disable verbose spark logging for tests --- .../ignition/core/testsupport/spark/SharedSparkContext.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/ignition/core/testsupport/spark/SharedSparkContext.scala b/src/main/scala/ignition/core/testsupport/spark/SharedSparkContext.scala index 314d5442..4fa5756b 100644 --- a/src/main/scala/ignition/core/testsupport/spark/SharedSparkContext.scala +++ b/src/main/scala/ignition/core/testsupport/spark/SharedSparkContext.scala @@ -33,6 +33,7 @@ trait SharedSparkContext extends BeforeAndAfterAll { self: Suite => 
//Logger.getRootLogger().removeAllAppenders(); //Logger.getRootLogger().addAppender(new NullAppender()); _sc = new SparkContext("local", "test", conf) + _sc.setLogLevel("OFF") super.beforeAll() } From c628ed529e796cafc2ca981fd8e7183d3646e3b1 Mon Sep 17 00:00:00 2001 From: Henrique Goulart Date: Wed, 19 Jul 2017 16:09:54 -0300 Subject: [PATCH 159/268] Fix weird await on test --- .../ignition/core/cache/ExpiringMultipleLevelCache.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala index c321f794..7f2101c3 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala @@ -2,6 +2,7 @@ package ignition.core.cache import akka.actor.ActorSystem import ignition.core.cache.ExpiringMultiLevelCache.TimestampedValue +import org.scalatest.concurrent.ScalaFutures import org.scalatest.{FlatSpec, Matchers} import spray.caching.ExpiringLruLocalCache @@ -9,7 +10,7 @@ import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent.duration._ import scala.concurrent.{Await, Future} -class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers { +class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFutures { case class Data(s: String) implicit val scheduler = ActorSystem().scheduler @@ -25,8 +26,9 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers { class MyException(s: String) extends Exception(s) - intercept[MyException ] { - Await.result(cache("key", () => Future.failed(new MyException("some failure"))), 1.minute) + val eventualCache = cache("key", () => Future.failed(new MyException("some failure"))) + whenReady(eventualCache.failed) { failure => + failure shouldBe a [MyException] } } } From c8fcc3ed88488bf48c97bb6c1dbaf709bb9e1125 Mon Sep 17 00:00:00 2001 
From: Henrique Goulart Date: Wed, 19 Jul 2017 16:10:01 -0300 Subject: [PATCH 160/268] Add log4j configuration to avoid annoying test log --- src/test/resources/log4j.properties | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 src/test/resources/log4j.properties diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties new file mode 100644 index 00000000..8455c4cf --- /dev/null +++ b/src/test/resources/log4j.properties @@ -0,0 +1,21 @@ +log4j.rootCategory=ERROR, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Ignition! +log4j.logger.ignition=ERROR + +# Disable annoying logger that is always logging an error message on ExpiringMultipleLevelCacheSpec test +log4j.logger.ignition.core.cache.ExpiringMultiLevelCache=OFF + +# Spark, Hadoop, etc +log4j.logger.org.apache=ERROR + +# Akka +log4j.logger.Remoting=ERROR + +# Jetty +log4j.logger.org.eclipse.jetty=ERROR +log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR +org.eclipse.jetty.LEVEL=ERROR From ce4cfb50f87dbcb5375aa265472b12d25cc12a70 Mon Sep 17 00:00:00 2001 From: Henrique Goulart Date: Wed, 19 Jul 2017 16:20:13 -0300 Subject: [PATCH 161/268] Fix wrong file name --- ...tipleLevelCache.scala => ExpiringMultipleLevelCacheSpec.scala} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/test/scala/ignition/core/cache/{ExpiringMultipleLevelCache.scala => ExpiringMultipleLevelCacheSpec.scala} (100%) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala similarity index 100% rename from src/test/scala/ignition/core/cache/ExpiringMultipleLevelCache.scala rename to src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala From 
34a13f84a2d030ff161eb710472471050cf6c4e3 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 25 Jul 2017 10:18:30 -0300 Subject: [PATCH 162/268] update dependencies --- build.sbt | 27 +++++++++---------- .../testsupport/spark/LocalSparkContext.scala | 2 +- .../ignition/core/utils/BetterTrace.scala | 2 +- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/build.sbt b/build.sbt index ad80612f..63b5c2e2 100644 --- a/build.sbt +++ b/build.sbt @@ -2,18 +2,17 @@ name := "Ignition-Core" version := "1.0" -scalaVersion := "2.11.8" +scalaVersion := "2.11.11" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.0.2" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.2.0" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") - libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") @@ -21,29 +20,29 @@ libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") .exclude("commons-beanutils", "commons-beanutils") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" +libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.3" -libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.9" +libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.2.14" -libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0" +libraryDependencies += "com.github.scopt" %% "scopt" % "3.6.0" -libraryDependencies += "joda-time" % "joda-time" % "2.9.4" +libraryDependencies += "joda-time" % "joda-time" % "2.9.9" -libraryDependencies += "org.joda" % "joda-convert" % "1.7" +libraryDependencies += 
"org.joda" % "joda-convert" % "1.8.2" libraryDependencies += "commons-lang" % "commons-lang" % "2.6" -libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.5" +libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.25" -libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.3.4" +libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.5.3" -libraryDependencies += "io.spray" %% "spray-json" % "1.3.2" +libraryDependencies += "io.spray" %% "spray-json" % "1.3.3" -libraryDependencies += "io.spray" %% "spray-client" % "1.3.2" +libraryDependencies += "io.spray" %% "spray-client" % "1.3.4" -libraryDependencies += "io.spray" %% "spray-http" % "1.3.2" +libraryDependencies += "io.spray" %% "spray-http" % "1.3.4" -libraryDependencies += "io.spray" %% "spray-caching" % "1.3.2" +libraryDependencies += "io.spray" %% "spray-caching" % "1.3.4" resolvers += "Akka Repository" at "http://repo.akka.io/releases/" diff --git a/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala b/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala index a272edaa..814f565d 100644 --- a/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala +++ b/src/main/scala/ignition/core/testsupport/spark/LocalSparkContext.scala @@ -26,7 +26,7 @@ trait LocalSparkContext extends BeforeAndAfterEach with BeforeAndAfterAll { self @transient var sc: SparkContext = _ override def beforeAll() { - InternalLoggerFactory.setDefaultFactory(new Slf4JLoggerFactory()) + InternalLoggerFactory.setDefaultFactory(Slf4JLoggerFactory.INSTANCE) super.beforeAll() } diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala index 09de73aa..9c91ca05 100644 --- a/src/main/scala/ignition/core/utils/BetterTrace.scala +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -3,7 +3,7 @@ package ignition.core.utils import ignition.core.utils.ExceptionUtils._ // Used mainly to augment scalacheck traces in 
scalatest trait BetterTrace { - def fail(message: String): Nothing + def fail(message: String): Nothing = throw new NotImplementedError(message) def withBetterTrace(block: => Unit): Unit = try { From 2fd48757063e1146c0e85b4bc39e49454049df2d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 27 Jul 2017 16:16:01 -0300 Subject: [PATCH 163/268] update spark and flintrock --- tools/cluster.py | 2 +- tools/flintrock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 29d75ac2..5efefeb5 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,7 +49,7 @@ default_ami = 'ami-611e7976' default_master_ami = '' default_env = 'dev' -default_spark_version = '2.0.2' +default_spark_version = '2.2.0' default_hdfs_version = '2.7.2' default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' diff --git a/tools/flintrock b/tools/flintrock index 4629fe4b..2cc5ddaf 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 4629fe4bc1f333dd149a44dcc5d9b8775186b324 +Subproject commit 2cc5ddaf12a5850a710c168c9b52def6cfdcadd0 From 766ab9cec3aae0f2a13e9df5bf2ffcfc39f7993d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 1 Aug 2017 09:58:02 -0300 Subject: [PATCH 164/268] update flintrock --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index 2cc5ddaf..eba6ab1d 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 2cc5ddaf12a5850a710c168c9b52def6cfdcadd0 +Subproject commit eba6ab1dceb942937bdc9610736e70d72e2a6579 From 5141aa7724284f324c583eee720fd3b4c9bc4390 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 1 Aug 2017 11:24:33 -0300 Subject: [PATCH 165/268] fix versions --- build.sbt | 12 ++++++------ 1 file 
changed, 6 insertions(+), 6 deletions(-) diff --git a/build.sbt b/build.sbt index 63b5c2e2..f4fa51d0 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ name := "Ignition-Core" version := "1.0" -scalaVersion := "2.11.11" +scalaVersion := "2.11.8" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") @@ -34,15 +34,15 @@ libraryDependencies += "commons-lang" % "commons-lang" % "2.6" libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.25" -libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.5.3" +libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.3.4" -libraryDependencies += "io.spray" %% "spray-json" % "1.3.3" +libraryDependencies += "io.spray" %% "spray-json" % "1.3.2" -libraryDependencies += "io.spray" %% "spray-client" % "1.3.4" +libraryDependencies += "io.spray" %% "spray-client" % "1.3.2" -libraryDependencies += "io.spray" %% "spray-http" % "1.3.4" +libraryDependencies += "io.spray" %% "spray-http" % "1.3.2" -libraryDependencies += "io.spray" %% "spray-caching" % "1.3.4" +libraryDependencies += "io.spray" %% "spray-caching" % "1.3.2" resolvers += "Akka Repository" at "http://repo.akka.io/releases/" From 0c0d986a05e5e79259b5dd4ae607f8fe5b8a7663 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Fri, 18 Aug 2017 18:15:17 -0400 Subject: [PATCH 166/268] ExpiringMultiLevelCache: caching 404 --- .../core/cache/ExpiringMultiLevelCache.scala | 102 ++++++++++++++---- 1 file changed, 79 insertions(+), 23 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 138d6cbd..8f953550 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -17,10 +17,22 @@ import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} object 
ExpiringMultiLevelCache { - case class TimestampedValue[V](date: DateTime, value: V) { + case class TimestampedValue[V](date: DateTime, + value: Option[V] = None, + status4XX: Boolean = false, + status5XX: Boolean = false, + error: Option[Throwable] = None) { def hasExpired(ttl: FiniteDuration, now: DateTime): Boolean = { + // TODO: Cached Error should have little ttl date.plus(ttl.toMillis).isBefore(now) } + def getValue: V = { + this.value match { + case Some(x) => x + // We should never try to get a value that was saved as None, probably from an error + case None => throw new Exception("Trying to get None value") + } + } } trait GenericCache[V] { cache => @@ -109,7 +121,6 @@ object ExpiringMultiLevelCache { import ignition.core.cache.ExpiringMultiLevelCache._ - case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, localCache: Option[LocalCache[TimestampedValue[V]]], remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, @@ -128,12 +139,34 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, protected def now = DateTime.now.withZone(DateTimeZone.UTC) - private def timestamp(v: V) = TimestampedValue(now, v) + private def timestamp(v: V) = TimestampedValue(date = now, value = Some(v)) + + private def timestamp(status4XX: Boolean, status5XX: Boolean, error: Throwable): TimestampedValue[V] = { + TimestampedValue(value = None, date = now, status4XX = status4XX, status5XX = status5XX, error = Some(error)) + } private def elapsedTime(startNanoTime: Long) = FiniteDuration(System.nanoTime() - startNanoTime, TimeUnit.NANOSECONDS) private def remoteLockKey(key: Any) = s"$key-emlc-lock" + case class SavedErrorCache(ttl: FiniteDuration = 1.minutes, + status4XX: Boolean = false, + status5XX: Boolean = false, + error: Throwable) + + case class CustomException(private val message: String = "", private val cause: Throwable = None.orNull) extends Exception(message, cause) + + private def checkSavedErrorCache(key: String, genValue: () => Future[V], startTime: 
Long, v: TimestampedValue[V]): Future[V] = { + val promise = Promise[V]() + val future = promise.future + if (v.status4XX || v.status5XX) { + promise.tryFailure(v.error.getOrElse(None.orNull)) + } + else { + promise.trySuccess(v.getValue) + } + future + } // The idea is simple, have two caches: remote and local // with values that will eventually expire but still be left on the cache @@ -141,7 +174,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { // The local cache is always the first try. We'll only look the remote if the local value is missing or has expired val startTime = System.nanoTime() - val result = localCache.flatMap(_.get(key).map(_.asTry())) match { + val result: Future[V] = localCache.flatMap(_.get(key).map(_.asTry())) match { case Some(future) => future.flatMap { case Success(localValue) if !localValue.hasExpired(ttl, now) => @@ -149,9 +182,10 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, reporter.onLocalCacheHit(key, elapsedTime(startTime)) // But if we're paranoid, let's check if the local value is consistent with remote if (sanityLocalValueCheck) - remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(Future.successful(localValue.value)) + remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(checkSavedErrorCache(key, genValue, startTime, localValue)) else - Future.successful(localValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, localValue) case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { @@ -159,35 +193,40 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is good, set locally and return it 
reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) - Future.successful(remoteValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, remoteValue) case Success(Some(expiredRemote)) => // Expired local and expired remote, return the most recent of them, async update both reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) - Future.successful(mostRecent.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, mostRecent) case Success(None) => // No remote found, return local, async update both reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) - Future.successful(expiredLocalValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, expiredLocalValue) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key expired local value and failed to get remote", e) tryGenerateAndSet(key, genValue, startTime) - Future.successful(expiredLocalValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, expiredLocalValue) } case Success(expiredLocalValue) if remoteRW.isEmpty => // There is no remote cache configured, we'are on our own // Return expired value and try to generate a new one for the future reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) - Future.successful(expiredLocalValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, expiredLocalValue) case Failure(e) => // This is almost impossible to happen because it's local and we don't save failed values reporter.onLocalError(key, e, elapsedTime(startTime)) 
logger.warn(s"apply, key: $key got a failed future from cache!? This is almost impossible!", e) - tryGenerateAndSet(key, genValue, startTime).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.getValue) } case None if remoteRW.nonEmpty => // No local, let's try remote @@ -196,26 +235,28 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) - Future.successful(remoteValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, remoteValue) case Success(Some(expiredRemote)) => // Expired remote, return the it, async update reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.value) - Future.successful(expiredRemote.value) + tryGenerateAndSet(key, genValue, startTime).map(_.getValue) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, expiredRemote) case Success(None) => // No good remote, sync generate reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.getValue) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key expired local value and remote error", e) - tryGenerateAndSet(key, genValue, startTime).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.getValue) } case None if remoteRW.isEmpty => // No local and no remote to look, just generate it // The caller will need to wait for the value generation reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.getValue) } result.onComplete { case Success(_) => @@ -255,7 +296,8 @@ case class 
ExpiringMultiLevelCache[V](ttl: FiniteDuration, remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if remoteValue == localValue => // Remote is the same as local, return any of them - Future.successful(remoteValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, remoteValue) case Success(Some(remoteValue)) => // Something is different, try to figure it out val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" @@ -275,18 +317,20 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, logger.warn(s"sanityLocalValueCheck, key $key: got different results for local $localValue and remote $remoteValue ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) // return remote to keep everyone consistent - Future.successful(remoteValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, remoteValue) case Success(None) => val localExpired = localValue.hasExpired(ttl, now) val finalResult = s"missing-remote-local-expired-${localExpired}" logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) // Try generate it to keep a behaviour equivalent to remote only - tryGenerateAndSet(key, genValue, startTime).map(_.value) + tryGenerateAndSet(key, genValue, startTime).map(_.getValue) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"sanityLocalValueCheck, key: $key failed to get remote", e) - Future.successful(localValue.value) + // We can even get a SavedErrorCache + checkSavedErrorCache(key, genValue, startTime, localValue) } } @@ -379,7 +423,19 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case Failure(eRemote) => // The real error is the eLocal, return it logger.error(s"canonicalValueGenerator, key 
$key: failed to generate value and failed to get remote", eLocal) - Future.failed(eLocal) + eLocal match { + case NonFatal(e) => { + // if error was nonFatal (404) then saves it to cache + // TODO: check if it is actually a 4XX error, or something else + // TODO: handle 5XX errors as well? + val timestampedValue = timestamp(status4XX = true, status5XX = false, error = e) + remoteSetOrGet(key, timestampedValue, remote, nanoStartTime) + Future.failed(eLocal) + } + case _ => { + Future.failed(eLocal) + } + } } } } From 36c1448b4e270adf5da4f03d0f07ba1899dda611 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Tue, 22 Aug 2017 14:48:43 -0400 Subject: [PATCH 167/268] ExpiringMultiLevelCache: caching errors when only LocalCache --- .../core/cache/ExpiringMultiLevelCache.scala | 17 ++++++++--- .../ExpiringMultipleLevelCacheSpec.scala | 29 +++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 8f953550..8674c174 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -414,7 +414,18 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case None => // There are no remote RW caches logger.error(s"canonicalValueGenerator, key $key: failed to generate value and no remote cache configured", eLocal) - Future.failed(eLocal) + eLocal match { + case NonFatal(e) => { + // if error was nonFatal (404) then saves it to cache + // TODO: check if it is actually a 4XX error, or something else + // TODO: handle 5XX errors as well? 
+ val timestampedValue = timestamp(status4XX = true, status5XX = false, error = e) + // Saved it only in localCache + localCache.foreach(_.set(key, timestampedValue)) + Future.failed(eLocal) + } + case _ => Future.failed(eLocal) + } case Some(remote) => remoteGetNonExpiredValue(key, remote, nanoStartTime).asTry().flatMap { case Success(v) => @@ -432,9 +443,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, remoteSetOrGet(key, timestampedValue, remote, nanoStartTime) Future.failed(eLocal) } - case _ => { - Future.failed(eLocal) - } + case _ => Future.failed(eLocal) } } } diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index 7f2101c3..bd932868 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -31,4 +31,33 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu failure shouldBe a [MyException] } } + + it should "calculate a value on cache miss just once, the second call should be from cache hit" in { + var myFailedRequestCount: Int = 0 + + // TODO: Throw a 404 error + class MyException(s: String) extends ArithmeticException(s) // Some NonFatal Exception + def myFailedRequest(): Future[Nothing] = { + println("calling myFailedRequest()") + myFailedRequestCount = myFailedRequestCount + 1 + Future.failed(new MyException("some failure")) + } + + val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) + val cache = ExpiringMultiLevelCache[Data](1.minute, Option(local)) + + val eventualCache = cache("key", myFailedRequest) + whenReady(eventualCache.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + + val eventualCache2 = cache("key", myFailedRequest) + whenReady(eventualCache2.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount 
shouldBe 1 + } + + } + } From 131f6baf94b40ba4246fe0a26d88f49434670253 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Thu, 24 Aug 2017 16:25:02 -0400 Subject: [PATCH 168/268] ExpiringMultiLevelCache: some improvement in tests --- .../ExpiringMultipleLevelCacheSpec.scala | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index bd932868..3a02d903 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -1,5 +1,7 @@ package ignition.core.cache +import java.io.FileNotFoundException + import akka.actor.ActorSystem import ignition.core.cache.ExpiringMultiLevelCache.TimestampedValue import org.scalatest.concurrent.ScalaFutures @@ -35,8 +37,7 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu it should "calculate a value on cache miss just once, the second call should be from cache hit" in { var myFailedRequestCount: Int = 0 - // TODO: Throw a 404 error - class MyException(s: String) extends ArithmeticException(s) // Some NonFatal Exception + class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception def myFailedRequest(): Future[Nothing] = { println("calling myFailedRequest()") myFailedRequestCount = myFailedRequestCount + 1 @@ -58,6 +59,24 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu myFailedRequestCount shouldBe 1 } + val eventualCache3 = cache("key", myFailedRequest) + whenReady(eventualCache3.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + + val eventualCache4 = cache("key", myFailedRequest) + whenReady(eventualCache4.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + + val 
eventualCache5 = cache("key", myFailedRequest) + whenReady(eventualCache5.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + } } From bf04dc6593d0d5b70fa09b51536d922edfb97c33 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Thu, 24 Aug 2017 17:17:14 -0400 Subject: [PATCH 169/268] ExpiringMultiLevelCache: optional support to cache error with differrent ttl --- .../core/cache/ExpiringMultiLevelCache.scala | 75 +++++++-------- .../ExpiringMultipleLevelCacheSpec.scala | 96 ++++++++++++++++++- 2 files changed, 130 insertions(+), 41 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index 8674c174..b5c10667 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -19,12 +19,14 @@ import scala.util.{Failure, Success, Try} object ExpiringMultiLevelCache { case class TimestampedValue[V](date: DateTime, value: Option[V] = None, - status4XX: Boolean = false, - status5XX: Boolean = false, - error: Option[Throwable] = None) { - def hasExpired(ttl: FiniteDuration, now: DateTime): Boolean = { - // TODO: Cached Error should have little ttl - date.plus(ttl.toMillis).isBefore(now) + hasError: Boolean = false, + throwable: Option[Throwable] = None) { + def hasExpired(ttl: FiniteDuration, now: DateTime, ttlCachedErrors: FiniteDuration = 1.minute): Boolean = { + if (!hasError) { + date.plus(ttl.toMillis).isBefore(now) + } else { + date.plus(ttlCachedErrors.toMillis).isBefore(now) + } } def getValue: V = { this.value match { @@ -129,7 +131,9 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, maxErrorsToRetryOnRemote: Int = 5, backoffOnLockAcquire: FiniteDuration = 50.milliseconds, backoffOnError: FiniteDuration = 50.milliseconds, - sanityLocalValueCheck: Boolean = false) extends GenericCache[V] { + sanityLocalValueCheck: Boolean 
= false, + cacheErrors: Boolean = false, + ttlCachedErrors: FiniteDuration = 1.minute) extends GenericCache[V] { private val logger = LoggerFactory.getLogger(getClass) @@ -141,26 +145,19 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, private def timestamp(v: V) = TimestampedValue(date = now, value = Some(v)) - private def timestamp(status4XX: Boolean, status5XX: Boolean, error: Throwable): TimestampedValue[V] = { - TimestampedValue(value = None, date = now, status4XX = status4XX, status5XX = status5XX, error = Some(error)) + private def timestamp(hasError: Boolean, throwable: Throwable): TimestampedValue[V] = { + TimestampedValue(value = None, date = now, hasError = hasError, throwable = Some(throwable)) } private def elapsedTime(startNanoTime: Long) = FiniteDuration(System.nanoTime() - startNanoTime, TimeUnit.NANOSECONDS) private def remoteLockKey(key: Any) = s"$key-emlc-lock" - case class SavedErrorCache(ttl: FiniteDuration = 1.minutes, - status4XX: Boolean = false, - status5XX: Boolean = false, - error: Throwable) - - case class CustomException(private val message: String = "", private val cause: Throwable = None.orNull) extends Exception(message, cause) - private def checkSavedErrorCache(key: String, genValue: () => Future[V], startTime: Long, v: TimestampedValue[V]): Future[V] = { val promise = Promise[V]() val future = promise.future - if (v.status4XX || v.status5XX) { - promise.tryFailure(v.error.getOrElse(None.orNull)) + if (v.hasError) { + promise.tryFailure(v.throwable.getOrElse(None.orNull)) } else { promise.trySuccess(v.getValue) @@ -177,7 +174,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, val result: Future[V] = localCache.flatMap(_.get(key).map(_.asTry())) match { case Some(future) => future.flatMap { - case Success(localValue) if !localValue.hasExpired(ttl, now) => + case Success(localValue) if !localValue.hasExpired(ttl, now, ttlCachedErrors) => // We have locally a good value, just return it 
reporter.onLocalCacheHit(key, elapsedTime(startTime)) // But if we're paranoid, let's check if the local value is consistent with remote @@ -189,7 +186,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) @@ -231,7 +228,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case None if remoteRW.nonEmpty => // No local, let's try remote remoteRW.get.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) @@ -311,8 +308,8 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, "same-date-on-utc" else "impossible-dates" - val remoteExpired = remoteValue.hasExpired(ttl, now) - val localExpired = localValue.hasExpired(ttl, now) + val remoteExpired = remoteValue.hasExpired(ttl, now, ttlCachedErrors) + val localExpired = localValue.hasExpired(ttl, now, ttlCachedErrors) val finalResult = s"$valuesResult-$dateResult-remote-expired-${remoteExpired}-local-expired-${localExpired}" logger.warn(s"sanityLocalValueCheck, key $key: got different results for local $localValue and remote $remoteValue ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) @@ -320,7 +317,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // We can even get a 
SavedErrorCache checkSavedErrorCache(key, genValue, startTime, remoteValue) case Success(None) => - val localExpired = localValue.hasExpired(ttl, now) + val localExpired = localValue.hasExpired(ttl, now, ttlCachedErrors) val finalResult = s"missing-remote-local-expired-${localExpired}" logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) @@ -363,7 +360,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case null => logger.info(s"tryGenerateAndSet, key $key: got request for generating and none in progress found, calling canonicalValueGenerator") canonicalValueGenerator(key, genValue, nanoStartTime).onComplete { - case Success(v) if !v.hasExpired(ttl, now) => + case Success(v) if !v.hasExpired(ttl, now, ttlCachedErrors) => reporter.onGeneratedWithSuccess(key, elapsedTime(nanoStartTime)) localCache.foreach(_.set(key, v)) promise.trySuccess(v) @@ -416,12 +413,12 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, logger.error(s"canonicalValueGenerator, key $key: failed to generate value and no remote cache configured", eLocal) eLocal match { case NonFatal(e) => { - // if error was nonFatal (404) then saves it to cache - // TODO: check if it is actually a 4XX error, or something else - // TODO: handle 5XX errors as well? 
- val timestampedValue = timestamp(status4XX = true, status5XX = false, error = e) - // Saved it only in localCache - localCache.foreach(_.set(key, timestampedValue)) + if (cacheErrors) { + // if error was NonFatal Error then saves it to cache + val timestampedValue = timestamp(hasError = true, throwable = e) + // Saved it only in localCache + localCache.foreach(_.set(key, timestampedValue)) + } Future.failed(eLocal) } case _ => Future.failed(eLocal) @@ -436,11 +433,11 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, logger.error(s"canonicalValueGenerator, key $key: failed to generate value and failed to get remote", eLocal) eLocal match { case NonFatal(e) => { - // if error was nonFatal (404) then saves it to cache - // TODO: check if it is actually a 4XX error, or something else - // TODO: handle 5XX errors as well? - val timestampedValue = timestamp(status4XX = true, status5XX = false, error = e) - remoteSetOrGet(key, timestampedValue, remote, nanoStartTime) + if (cacheErrors) { + // if error was NonFatal Error then saves it to cache + val timestampedValue = timestamp(hasError = true, throwable = e) + remoteSetOrGet(key, timestampedValue, remote, nanoStartTime) + } Future.failed(eLocal) } case _ => Future.failed(eLocal) @@ -457,7 +454,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => logger.info(s"remoteGetNonExpiredValue, key $key: got a good value") Future.successful(remoteValue) case Success(_) => @@ -496,7 +493,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, logger.info(s"remoteSetOrGet got lock for key $key") // Lock acquired, get the current value and replace it 
remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => // Current value is good, just return it reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) logger.info(s"remoteSetOrGet got lock for $key but found already a good value on remote") @@ -529,7 +526,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case Success(false) => // Someone got the lock, let's take a look at the value remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now) => + case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => // Current value is good, just return it logger.info(s"remoteSetOrGet couldn't lock key $key but found a good on remote afterwards") reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index 3a02d903..ecf99a0d 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -39,13 +39,12 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception def myFailedRequest(): Future[Nothing] = { - println("calling myFailedRequest()") myFailedRequestCount = myFailedRequestCount + 1 Future.failed(new MyException("some failure")) } val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](1.minute, Option(local)) + val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = true, ttlCachedErrors = 9.seconds) val eventualCache = cache("key", 
myFailedRequest) whenReady(eventualCache.failed) { failure => @@ -79,4 +78,97 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } + it should "calculate a value on cache miss on every request" in { + var myFailedRequestCount: Int = 0 + + class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception + def myFailedRequest(): Future[Nothing] = { + myFailedRequestCount = myFailedRequestCount + 1 + Future.failed(new MyException("some failure")) + } + + val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) + val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = false) + + val eventualCache = cache("key", myFailedRequest) + whenReady(eventualCache.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + + val eventualCache2 = cache("key", myFailedRequest) + whenReady(eventualCache2.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 2 + } + + val eventualCache3 = cache("key", myFailedRequest) + whenReady(eventualCache3.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 3 + } + + val eventualCache4 = cache("key", myFailedRequest) + whenReady(eventualCache4.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 4 + } + + val eventualCache5 = cache("key", myFailedRequest) + whenReady(eventualCache5.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 5 + } + + } + + it should "calculate a value on cache miss, then wait ttlCachedError to get a cache miss again" in { + var myFailedRequestCount: Int = 0 + + class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception + def myFailedRequest(): Future[Nothing] = { + myFailedRequestCount = myFailedRequestCount + 1 + Future.failed(new MyException("some failure")) + } + + val local = new 
ExpiringLruLocalCache[TimestampedValue[Data]](100) + val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = true, ttlCachedErrors = 9.seconds) + + val eventualCache = cache("key", myFailedRequest) + whenReady(eventualCache.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + + val eventualCache2 = cache("key", myFailedRequest) + whenReady(eventualCache2.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 1 + } + + Thread.sleep(10000) + + val eventualCache3 = cache("key", myFailedRequest) + whenReady(eventualCache3.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 2 + } + + val eventualCache4 = cache("key", myFailedRequest) + whenReady(eventualCache4.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 2 + } + + Thread.sleep(1000) + + val eventualCache5 = cache("key", myFailedRequest) + whenReady(eventualCache5.failed) { failure => + failure shouldBe a [MyException] + myFailedRequestCount shouldBe 2 + } + + } + + } From 44651b4d8aeccd7e42dd324d9af386a1f699d180 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Thu, 24 Aug 2017 17:30:29 -0400 Subject: [PATCH 170/268] ExpiringMultiLevelCache: testing ttl --- .../ExpiringMultipleLevelCacheSpec.scala | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index ecf99a0d..0a21ff6c 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -34,6 +34,30 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } } + it should "calculate a value on cache miss after ttl" in { + var myRequestCount: Int = 0 + + def myRequest(): 
Future[Data] = { + myRequestCount = myRequestCount + 1 + Future.successful(Data("success")) + } + + val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) + val cache = ExpiringMultiLevelCache[Data](ttl = 9.seconds, localCache = Option(local)) + + Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") + myRequestCount shouldBe 1 + Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") + myRequestCount shouldBe 1 + + Thread.sleep(10000) + + Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") + myRequestCount shouldBe 2 + Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") + myRequestCount shouldBe 2 + } + it should "calculate a value on cache miss just once, the second call should be from cache hit" in { var myFailedRequestCount: Int = 0 @@ -170,5 +194,4 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } - } From 14a5ca29c54795f8cebfe476d87ca199e83263d1 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Mon, 28 Aug 2017 13:18:05 -0400 Subject: [PATCH 171/268] ExpiringMultiLevelCache: refactoring after code review --- .../core/cache/ExpiringMultiLevelCache.scala | 91 ++++++------------- 1 file changed, 29 insertions(+), 62 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index b5c10667..bfe392e2 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -17,22 +17,11 @@ import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} object ExpiringMultiLevelCache { - case class TimestampedValue[V](date: DateTime, - value: Option[V] = None, - hasError: Boolean = false, - throwable: Option[Throwable] = None) { + case class TimestampedValue[V](date: DateTime, value: Try[V]) { def hasExpired(ttl: FiniteDuration, now: DateTime, 
ttlCachedErrors: FiniteDuration = 1.minute): Boolean = { - if (!hasError) { - date.plus(ttl.toMillis).isBefore(now) - } else { - date.plus(ttlCachedErrors.toMillis).isBefore(now) - } - } - def getValue: V = { - this.value match { - case Some(x) => x - // We should never try to get a value that was saved as None, probably from an error - case None => throw new Exception("Trying to get None value") + value match { + case Success(_) => date.plus(ttl.toMillis).isBefore(now) + case Failure(_) => date.plus(ttlCachedErrors.toMillis).isBefore(now) } } } @@ -123,6 +112,7 @@ object ExpiringMultiLevelCache { import ignition.core.cache.ExpiringMultiLevelCache._ + case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, localCache: Option[LocalCache[TimestampedValue[V]]], remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, @@ -143,27 +133,14 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, protected def now = DateTime.now.withZone(DateTimeZone.UTC) - private def timestamp(v: V) = TimestampedValue(date = now, value = Some(v)) + private def timestamp(v: V): TimestampedValue[V] = TimestampedValue(now, Try(v)) - private def timestamp(hasError: Boolean, throwable: Throwable): TimestampedValue[V] = { - TimestampedValue(value = None, date = now, hasError = hasError, throwable = Some(throwable)) - } + private def timestampError(e: Throwable): TimestampedValue[V] = TimestampedValue(now, Failure(e)) private def elapsedTime(startNanoTime: Long) = FiniteDuration(System.nanoTime() - startNanoTime, TimeUnit.NANOSECONDS) private def remoteLockKey(key: Any) = s"$key-emlc-lock" - private def checkSavedErrorCache(key: String, genValue: () => Future[V], startTime: Long, v: TimestampedValue[V]): Future[V] = { - val promise = Promise[V]() - val future = promise.future - if (v.hasError) { - promise.tryFailure(v.throwable.getOrElse(None.orNull)) - } - else { - promise.trySuccess(v.getValue) - } - future - } // The idea is simple, have two caches: remote and local // with 
values that will eventually expire but still be left on the cache @@ -179,10 +156,9 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, reporter.onLocalCacheHit(key, elapsedTime(startTime)) // But if we're paranoid, let's check if the local value is consistent with remote if (sanityLocalValueCheck) - remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(checkSavedErrorCache(key, genValue, startTime, localValue)) + remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(Future.successful(localValue.value.get)) else - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, localValue) + Future.successful(localValue.value.get) case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { @@ -190,40 +166,35 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, remoteValue) + Future.successful(remoteValue.value.get) case Success(Some(expiredRemote)) => // Expired local and expired remote, return the most recent of them, async update both reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, mostRecent) + Future.successful(mostRecent.value.get) case Success(None) => // No remote found, return local, async update both reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) - // We can even get a SavedErrorCache - 
checkSavedErrorCache(key, genValue, startTime, expiredLocalValue) + Future.successful(expiredLocalValue.value.get) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key expired local value and failed to get remote", e) tryGenerateAndSet(key, genValue, startTime) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, expiredLocalValue) + Future.successful(expiredLocalValue.value.get) } case Success(expiredLocalValue) if remoteRW.isEmpty => // There is no remote cache configured, we'are on our own // Return expired value and try to generate a new one for the future reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, expiredLocalValue) + Future.successful(expiredLocalValue.value.get) case Failure(e) => // This is almost impossible to happen because it's local and we don't save failed values reporter.onLocalError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key got a failed future from cache!? 
This is almost impossible!", e) - tryGenerateAndSet(key, genValue, startTime).map(_.getValue) + tryGenerateAndSet(key, genValue, startTime).map(_.value.get) } case None if remoteRW.nonEmpty => // No local, let's try remote @@ -232,28 +203,26 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, remoteValue) + Future.successful(remoteValue.value.get) case Success(Some(expiredRemote)) => // Expired remote, return the it, async update reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.getValue) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, expiredRemote) + tryGenerateAndSet(key, genValue, startTime).map(_.value) + Future.successful(expiredRemote.value.get) case Success(None) => // No good remote, sync generate reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.getValue) + tryGenerateAndSet(key, genValue, startTime).map(_.value.get) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key expired local value and remote error", e) - tryGenerateAndSet(key, genValue, startTime).map(_.getValue) + tryGenerateAndSet(key, genValue, startTime).map(_.value.get) } case None if remoteRW.isEmpty => // No local and no remote to look, just generate it // The caller will need to wait for the value generation reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.getValue) + tryGenerateAndSet(key, genValue, startTime).map(_.value.get) } result.onComplete { case Success(_) => @@ -293,8 +262,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, 
remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if remoteValue == localValue => // Remote is the same as local, return any of them - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, remoteValue) + Future.successful(remoteValue.value.get) case Success(Some(remoteValue)) => // Something is different, try to figure it out val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" @@ -314,20 +282,18 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, logger.warn(s"sanityLocalValueCheck, key $key: got different results for local $localValue and remote $remoteValue ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) // return remote to keep everyone consistent - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, remoteValue) + Future.successful(remoteValue.value.get) case Success(None) => val localExpired = localValue.hasExpired(ttl, now, ttlCachedErrors) val finalResult = s"missing-remote-local-expired-${localExpired}" logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) // Try generate it to keep a behaviour equivalent to remote only - tryGenerateAndSet(key, genValue, startTime).map(_.getValue) + tryGenerateAndSet(key, genValue, startTime).map(_.value.get) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"sanityLocalValueCheck, key: $key failed to get remote", e) - // We can even get a SavedErrorCache - checkSavedErrorCache(key, genValue, startTime, localValue) + Future.successful(localValue.value.get) } } @@ -415,7 +381,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case NonFatal(e) => { if (cacheErrors) { // if error was NonFatal Error then saves it to cache - val timestampedValue = 
timestamp(hasError = true, throwable = e) + val timestampedValue: TimestampedValue[V] = timestampError(e) // Saved it only in localCache localCache.foreach(_.set(key, timestampedValue)) } @@ -435,8 +401,9 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, case NonFatal(e) => { if (cacheErrors) { // if error was NonFatal Error then saves it to cache - val timestampedValue = timestamp(hasError = true, throwable = e) - remoteSetOrGet(key, timestampedValue, remote, nanoStartTime) + val timestampedValue = timestampError(e) + // Saved it only in localCache + localCache.foreach(_.set(key, timestampedValue)) } Future.failed(eLocal) } From 4952a61bae51b24ca62bfe3bf875fabf3b096115 Mon Sep 17 00:00:00 2001 From: Leonardo Santos Date: Thu, 31 Aug 2017 17:24:07 -0400 Subject: [PATCH 172/268] ExpiringMultiLevelCache: Using Try properly with Future.fromTry --- .../core/cache/ExpiringMultiLevelCache.scala | 53 ++++++++++++------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala index bfe392e2..6ac0f626 100644 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala @@ -156,9 +156,9 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, reporter.onLocalCacheHit(key, elapsedTime(startTime)) // But if we're paranoid, let's check if the local value is consistent with remote if (sanityLocalValueCheck) - remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(Future.successful(localValue.value.get)) + remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(Future.fromTry(localValue.value)) else - Future.successful(localValue.value.get) + Future.fromTry(localValue.value) case Success(expiredLocalValue) if remoteRW.nonEmpty => // We have locally an 
expired value, but we can check a remote cache for better value remoteRW.get.get(key).asTry().flatMap { @@ -166,35 +166,39 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) - Future.successful(remoteValue.value.get) + Future.fromTry(remoteValue.value) case Success(Some(expiredRemote)) => // Expired local and expired remote, return the most recent of them, async update both reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) - Future.successful(mostRecent.value.get) + Future.fromTry(mostRecent.value) case Success(None) => // No remote found, return local, async update both reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) - Future.successful(expiredLocalValue.value.get) + Future.fromTry(expiredLocalValue.value) case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key expired local value and failed to get remote", e) tryGenerateAndSet(key, genValue, startTime) - Future.successful(expiredLocalValue.value.get) + Future.fromTry(expiredLocalValue.value) } case Success(expiredLocalValue) if remoteRW.isEmpty => // There is no remote cache configured, we'are on our own // Return expired value and try to generate a new one for the future reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) tryGenerateAndSet(key, genValue, startTime) - Future.successful(expiredLocalValue.value.get) + Future.fromTry(expiredLocalValue.value) case Failure(e) => // This is almost impossible to happen because it's local and we don't save failed values + // Failed values are stored into property "value", not as the value itself reporter.onLocalError(key, e, elapsedTime(startTime)) 
logger.warn(s"apply, key: $key got a failed future from cache!? This is almost impossible!", e) - tryGenerateAndSet(key, genValue, startTime).map(_.value.get) + for { + tsv <- tryGenerateAndSet(key, genValue, startTime) + value <- Future.fromTry(tsv.value) + } yield value } case None if remoteRW.nonEmpty => // No local, let's try remote @@ -203,26 +207,35 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, // Remote is good, set locally and return it reporter.onRemoteCacheHit(key, elapsedTime(startTime)) localCache.foreach(_.set(key, remoteValue)) - Future.successful(remoteValue.value.get) + Future.fromTry(remoteValue.value) case Success(Some(expiredRemote)) => // Expired remote, return the it, async update reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.value) - Future.successful(expiredRemote.value.get) + tryGenerateAndSet(key, genValue, startTime) + Future.fromTry(expiredRemote.value) case Success(None) => // No good remote, sync generate reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.value.get) + for { + tsv <- tryGenerateAndSet(key, genValue, startTime) + value <- Future.fromTry(tsv.value) + } yield value case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"apply, key: $key expired local value and remote error", e) - tryGenerateAndSet(key, genValue, startTime).map(_.value.get) + for { + tsv <- tryGenerateAndSet(key, genValue, startTime) + value <- Future.fromTry(tsv.value) + } yield value } case None if remoteRW.isEmpty => // No local and no remote to look, just generate it // The caller will need to wait for the value generation reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime).map(_.value.get) + for { + tsv <- tryGenerateAndSet(key, genValue, startTime) + value <- Future.fromTry(tsv.value) + } yield value } 
result.onComplete { case Success(_) => @@ -262,7 +275,7 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, remote.get(key).asTry().flatMap { case Success(Some(remoteValue)) if remoteValue == localValue => // Remote is the same as local, return any of them - Future.successful(remoteValue.value.get) + Future.fromTry(remoteValue.value) case Success(Some(remoteValue)) => // Something is different, try to figure it out val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" @@ -282,18 +295,22 @@ case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, logger.warn(s"sanityLocalValueCheck, key $key: got different results for local $localValue and remote $remoteValue ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) // return remote to keep everyone consistent - Future.successful(remoteValue.value.get) + Future.fromTry(remoteValue.value) case Success(None) => val localExpired = localValue.hasExpired(ttl, now, ttlCachedErrors) val finalResult = s"missing-remote-local-expired-${localExpired}" logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) // Try generate it to keep a behaviour equivalent to remote only - tryGenerateAndSet(key, genValue, startTime).map(_.value.get) + for { + tsv <- tryGenerateAndSet(key, genValue, startTime) + value <- Future.fromTry(tsv.value) + } yield value + case Failure(e) => reporter.onRemoteError(key, e, elapsedTime(startTime)) logger.warn(s"sanityLocalValueCheck, key: $key failed to get remote", e) - Future.successful(localValue.value.get) + Future.fromTry(localValue.value) } } From 4cff4323b4cd305d2231538d17696151164e5211 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 17 Oct 2017 17:28:27 -0200 Subject: [PATCH 173/268] I hope now we fixed those tests random failures --- 
.../ExpiringMultipleLevelCacheSpec.scala | 35 +++++++++++++------ .../ignition/core/utils/FutureUtilsSpec.scala | 35 +++++++++++++------ 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index 0a21ff6c..dec108b4 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -1,6 +1,7 @@ package ignition.core.cache import java.io.FileNotFoundException +import java.util.concurrent.atomic.AtomicInteger import akka.actor.ActorSystem import ignition.core.cache.ExpiringMultiLevelCache.TimestampedValue @@ -35,27 +36,41 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } it should "calculate a value on cache miss after ttl" in { - var myRequestCount: Int = 0 + val myRequestCount = new AtomicInteger() def myRequest(): Future[Data] = { - myRequestCount = myRequestCount + 1 + myRequestCount.incrementAndGet() Future.successful(Data("success")) } val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) val cache = ExpiringMultiLevelCache[Data](ttl = 9.seconds, localCache = Option(local)) - Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") - myRequestCount shouldBe 1 - Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") - myRequestCount shouldBe 1 + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + myRequestCount.get() shouldBe 1 + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + myRequestCount.get() shouldBe 1 Thread.sleep(10000) - Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") - myRequestCount shouldBe 2 - Await.result(cache("key", myRequest), 1.minute) shouldBe Data("success") - 
myRequestCount shouldBe 2 + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + myRequestCount.get() shouldBe 2 + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + myRequestCount.get() shouldBe 2 } it should "calculate a value on cache miss just once, the second call should be from cache hit" in { diff --git a/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala b/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala index 4649fcfc..bb47e196 100644 --- a/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala @@ -1,43 +1,56 @@ package ignition.core.utils -import FutureUtils._ +import ignition.core.utils.FutureUtils._ import org.scalatest._ +import org.scalatest.concurrent.ScalaFutures -import scala.concurrent.{Await, Future} -import scala.concurrent.duration._ import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.Future +import scala.concurrent.duration._ -class FutureUtilsSpec extends FlatSpec with Matchers { +class FutureUtilsSpec extends FlatSpec with Matchers with ScalaFutures { "FutureUtils" should "provide toLazyIterable" in { val timesCalled = collection.mutable.Map.empty[Int, Int].withDefaultValue(0) val generators = (0 until 20).map { i => () => Future { timesCalled(i) += 1 ; i } } val iterable = generators.toLazyIterable() val iterator = iterable.toIterator - timesCalled.forall { case (key, count) => count == 0 } shouldBe true + timesCalled.forall { case (_, count) => count == 0 } shouldBe true - Await.result(iterator.next(), 2.seconds) + whenReady(iterator.next(), timeout(2.seconds)) { _ => () } timesCalled(0) shouldBe 1 (1 until 20).foreach { i => timesCalled(i) shouldBe 0 } - Await.result(Future.sequence(iterator), 5.seconds).toList shouldBe (1 until 20).toList + whenReady(Future.sequence(iterator), timeout(5.seconds)) { result => + 
result.toList shouldBe (1 until 20).toList + } (0 until 20).foreach { i => timesCalled(i) shouldBe 1 } } it should "provide collectAndTake" in { val timesCalled = collection.mutable.Map.empty[Int, Int].withDefaultValue(0) - val iterable = (0 until 30).map { i => () => Future { timesCalled(i) += 1 ; i } }.toLazyIterable() + val iterable = (0 until 30).map { i => + () => + Future { + synchronized { + timesCalled(i) += 1 + } + i + } + }.toLazyIterable() val expectedRange = Range(5, 15) - val result = Await.result(iterable.collectAndTake({ case i if expectedRange.contains(i) => i }, n = expectedRange.size), 5.seconds) - result shouldBe expectedRange.toList + val f: Future[List[Int]] = iterable.collectAndTake({ case i if expectedRange.contains(i) => i }, n = expectedRange.size) + + whenReady(f, timeout(5.seconds)) { result => + result shouldBe expectedRange.toList + } (0 until 20).foreach { i => timesCalled(i) shouldBe 1 } // 2 batches of size 10 (20 until 30).foreach { i => timesCalled(i) shouldBe 0 } // last batch won't be ran - } } From ce0e6f26e469eb9fdb4e2b626cf83d4784c7c70d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 17 Oct 2017 18:03:50 -0200 Subject: [PATCH 174/268] and here we go... 
--- .../ExpiringMultipleLevelCacheSpec.scala | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index dec108b4..121f34a8 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -70,6 +70,34 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu result shouldBe Data("success") } + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + + whenReady(cache("key", myRequest), timeout(1.minute)) { result => + result shouldBe Data("success") + } + myRequestCount.get() shouldBe 2 } From 3f885eb342a78ae43d032088d219b84f1acf1139 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 18 Oct 2017 09:21:45 -0200 Subject: [PATCH 175/268] random programming... 
--- .../ExpiringMultipleLevelCacheSpec.scala | 59 ++++++------------- 1 file changed, 18 insertions(+), 41 deletions(-) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index 121f34a8..d202e4e7 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -36,6 +36,7 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } it should "calculate a value on cache miss after ttl" in { + val cacheTtl = 3.seconds val myRequestCount = new AtomicInteger() def myRequest(): Future[Data] = { @@ -44,61 +45,37 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](ttl = 9.seconds, localCache = Option(local)) + val cache = ExpiringMultiLevelCache[Data](ttl = cacheTtl, localCache = Option(local)) - whenReady(cache("key", myRequest), timeout(1.minute)) { result => + whenReady(cache("key", myRequest)) { result => result shouldBe Data("success") } myRequestCount.get() shouldBe 1 - whenReady(cache("key", myRequest), timeout(1.minute)) { result => + whenReady(cache("key", myRequest)) { result => result shouldBe Data("success") } myRequestCount.get() shouldBe 1 - Thread.sleep(10000) - - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } - - myRequestCount.get() shouldBe 2 - - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } - - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } - - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") + val f = Future { + Thread.sleep(cacheTtl.toMillis + 10) } - 
whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } - - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } + whenReady(f, timeout(cacheTtl + 20.milli)) { _ => + whenReady(cache("key", myRequest)) { result => + result shouldBe Data("success") + } - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } + myRequestCount.get() shouldBe 2 - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") - } + whenReady(cache("key", myRequest)) { result => + result shouldBe Data("success") + } - whenReady(cache("key", myRequest), timeout(1.minute)) { result => - result shouldBe Data("success") + myRequestCount.get() shouldBe 2 } - - myRequestCount.get() shouldBe 2 } it should "calculate a value on cache miss just once, the second call should be from cache hit" in { @@ -199,7 +176,7 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu } val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = true, ttlCachedErrors = 9.seconds) + val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = true, ttlCachedErrors = 4.seconds) val eventualCache = cache("key", myFailedRequest) whenReady(eventualCache.failed) { failure => @@ -213,7 +190,7 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu myFailedRequestCount shouldBe 1 } - Thread.sleep(10000) + Thread.sleep(5000) val eventualCache3 = cache("key", myFailedRequest) whenReady(eventualCache3.failed) { failure => @@ -227,7 +204,7 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu myFailedRequestCount shouldBe 2 } - Thread.sleep(1000) + Thread.sleep(500) val eventualCache5 = cache("key", 
myFailedRequest) whenReady(eventualCache5.failed) { failure => From e9d4703576d5256ed4677e0fb397351ebca64241 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 18 Oct 2017 10:25:41 -0200 Subject: [PATCH 176/268] simple version... --- .../ExpiringMultipleLevelCacheSpec.scala | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala index d202e4e7..9fd77d78 100644 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala +++ b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala @@ -59,23 +59,19 @@ class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFu myRequestCount.get() shouldBe 1 - val f = Future { - Thread.sleep(cacheTtl.toMillis + 10) - } - - whenReady(f, timeout(cacheTtl + 20.milli)) { _ => - whenReady(cache("key", myRequest)) { result => - result shouldBe Data("success") - } + Thread.sleep(cacheTtl.toMillis + 10) - myRequestCount.get() shouldBe 2 + whenReady(cache("key", myRequest)) { result => + result shouldBe Data("success") + } - whenReady(cache("key", myRequest)) { result => - result shouldBe Data("success") - } + myRequestCount.get() shouldBe 2 - myRequestCount.get() shouldBe 2 + whenReady(cache("key", myRequest)) { result => + result shouldBe Data("success") } + + myRequestCount.get() shouldBe 2 } it should "calculate a value on cache miss just once, the second call should be from cache hit" in { From 6a52e1980d438f5e25a63ff401f39c488c08adf8 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 18 Oct 2017 13:48:37 -0200 Subject: [PATCH 177/268] sync this too --- src/test/scala/ignition/core/utils/FutureUtilsSpec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala 
b/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala index bb47e196..c10b50d5 100644 --- a/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/FutureUtilsSpec.scala @@ -11,7 +11,7 @@ class FutureUtilsSpec extends FlatSpec with Matchers with ScalaFutures { "FutureUtils" should "provide toLazyIterable" in { val timesCalled = collection.mutable.Map.empty[Int, Int].withDefaultValue(0) - val generators = (0 until 20).map { i => () => Future { timesCalled(i) += 1 ; i } } + val generators = (0 until 20).map { i => () => Future { synchronized { timesCalled(i) += 1 } ; i } } val iterable = generators.toLazyIterable() val iterator = iterable.toIterator timesCalled.forall { case (_, count) => count == 0 } shouldBe true From 979a8995e8d74e6b1709c42c1934f4d3fb350db1 Mon Sep 17 00:00:00 2001 From: Fernando Rodrigues da Silva Date: Mon, 6 Nov 2017 20:05:33 -0200 Subject: [PATCH 178/268] fix too many open files (#126) --- tools/scripts/noop | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/scripts/noop b/tools/scripts/noop index cc1f786e..ed32eb97 100644 --- a/tools/scripts/noop +++ b/tools/scripts/noop @@ -1 +1,3 @@ -#!/bin/bash \ No newline at end of file +#!/bin/bash + +echo '* - nofile 256000' >> /etc/security/limits.conf \ No newline at end of file From 6a9f7a84737b4317d4b87f96567ad46ee0f2dae7 Mon Sep 17 00:00:00 2001 From: Fernando Rodrigues da Silva Date: Wed, 8 Nov 2017 11:33:54 -0200 Subject: [PATCH 179/268] changing ulimit -n to an usual value (#127) --- tools/scripts/noop | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/scripts/noop b/tools/scripts/noop index ed32eb97..eb34279f 100644 --- a/tools/scripts/noop +++ b/tools/scripts/noop @@ -1,3 +1,3 @@ #!/bin/bash -echo '* - nofile 256000' >> /etc/security/limits.conf \ No newline at end of file +echo '* - nofile 65535' >> /etc/security/limits.conf \ No newline at end of file From 7c5ebfd64ca74698e7d6d58e52f1e93235941ed6 Mon 
Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 23 Nov 2017 11:58:26 -0200 Subject: [PATCH 180/268] moving this class to a better scope --- .../core/jobs/utils/SparkContextUtils.scala | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 1de12dd6..cd362de0 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -37,6 +37,18 @@ object SparkContextUtils { override def getPartition(key: Any): Int = index(key) } + case class SizeBasedFileHandling(averageEstimatedCompressionRatio: Int = 8, compressedExtensions: Set[String] = Set(".gz")) { + + def isBig(f: HadoopFile, uncompressedBigSize: Long): Boolean = estimatedSize(f) >= uncompressedBigSize + + def estimatedSize(f: HadoopFile): Long = if (isCompressed(f)) + f.size * averageEstimatedCompressionRatio + else + f.size + + def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) + } + private lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) private def close(inputStream: InputStream, path: String): Unit = { @@ -249,20 +261,6 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - case class SizeBasedFileHandling(averageEstimatedCompressionRatio: Int = 8, - compressedExtensions: Set[String] = Set(".gz")) { - - def isBig(f: HadoopFile, uncompressedBigSize: Long): Boolean = estimatedSize(f) >= uncompressedBigSize - - def estimatedSize(f: HadoopFile) = if (isCompressed(f)) - f.size * averageEstimatedCompressionRatio - else - f.size - - def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) - } - - private def readSmallFiles(smallFiles: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Int, From 
ce9d356932ceb845dcb231acf2c13296bda1c811 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 23 Nov 2017 15:22:11 -0200 Subject: [PATCH 181/268] using java8 --- circle.yml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 circle.yml diff --git a/circle.yml b/circle.yml new file mode 100644 index 00000000..abd78de2 --- /dev/null +++ b/circle.yml @@ -0,0 +1,3 @@ +machine: + java: + version: oraclejdk8 From cc08dcefdfbf59307b9d1819e92b6d138d6e0b25 Mon Sep 17 00:00:00 2001 From: Henrique Goulart Date: Mon, 19 Feb 2018 09:24:25 -0300 Subject: [PATCH 182/268] Update zeppelin to latest version and spark memory parameter --- remote_hook.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 6648ccd8..cb43904f 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -59,7 +59,7 @@ on_trap_exit() { install_and_run_zeppelin() { if [[ ! -d "zeppelin" ]]; then - wget "http://www.us.apache.org/dist/incubator/zeppelin/0.5.6-incubating/zeppelin-0.5.6-incubating-bin-all.tgz" -O zeppelin.tar.gz + wget "http://www-us.apache.org/dist/zeppelin/zeppelin-0.7.3/zeppelin-0.7.3-bin-all.tgz" -O zeppelin.tar.gz mkdir zeppelin tar xvzf zeppelin.tar.gz -C zeppelin --strip-components 1 > /tmp/zeppelin_install.log fi @@ -67,7 +67,7 @@ install_and_run_zeppelin() { export MASTER="${JOB_MASTER}" export ZEPPELIN_PORT="8081" export SPARK_HOME="/root/spark" - export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH} --runner-executor-memory ${SPARK_MEM_PARAM}" + export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH} --executor-memory ${SPARK_MEM_PARAM}" sudo -E zeppelin/bin/zeppelin.sh else notify_error_and_exit "Zepellin installation not found" From 92599ce2defaac6685eb3d3ed590059f8616627b Mon Sep 17 00:00:00 2001 From: Henrique Goulart Date: Thu, 8 Mar 2018 10:09:59 -0300 Subject: [PATCH 183/268] Workaround to use private vpc --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py 
b/tools/cluster.py index 5efefeb5..7b5863ef 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -352,7 +352,7 @@ def get_master(cluster_name, region=default_region): masters = get_masters(cluster_name, region=region) if not masters: raise CommandError("No master on {}".format(cluster_name)) - return masters[0].public_dns_name + return masters[0].private_dns_name def ssh_master(cluster_name, key_file=default_key_file, user=default_remote_user, region=default_region, *args): From 4c118a107d36651133c35ac39934632ca82af593 Mon Sep 17 00:00:00 2001 From: Henrique Goulart Date: Thu, 8 Mar 2018 10:17:20 -0300 Subject: [PATCH 184/268] Fix DNS names for launching on private subnet --- tools/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 7b5863ef..c56a803a 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -337,7 +337,7 @@ def destroy(cluster_name, delete_groups=False, region=default_region): if all_instances: log.info('The following instances will be terminated:') for i in all_instances: - log.info('-> %s' % i.public_dns_name) + log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) log.info('Terminating master...') for i in masters: @@ -352,7 +352,7 @@ def get_master(cluster_name, region=default_region): masters = get_masters(cluster_name, region=region) if not masters: raise CommandError("No master on {}".format(cluster_name)) - return masters[0].private_dns_name + return masters[0].public_dns_name or masters[0].private_dns_name def ssh_master(cluster_name, key_file=default_key_file, user=default_remote_user, region=default_region, *args): From 67ac87a62c2fd4e56b65365fe64abf9c41ae7ebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Roeck?= Date: Wed, 14 Mar 2018 13:49:08 -0300 Subject: [PATCH 185/268] Improve stderr and cluster destroy (#132) * Return stdout and stderr as subprocess output * Add wait_termination param to destroy method * Add log and timetout to cluster 
termination * Rename wait_timeout param --- tools/cluster.py | 14 +++++++++++++- tools/utils.py | 3 ++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 5efefeb5..6e475300 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -329,7 +329,7 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {} after failures'.format(cluster_name)) -def destroy(cluster_name, delete_groups=False, region=default_region): +def destroy(cluster_name, delete_groups=False, region=default_region, wait_termination=False, wait_timeout_minutes=10): assert not delete_groups, 'Delete groups is deprecated and unsupported' masters, slaves = get_active_nodes(cluster_name, region=region) @@ -342,9 +342,21 @@ def destroy(cluster_name, delete_groups=False, region=default_region): log.info('Terminating master...') for i in masters: i.terminate() + log.info('Terminating slaves...') for i in slaves: i.terminate() + + if wait_termination: + log.info('Waiting for instances termination...') + termination_timeout = wait_timeout_minutes*60 + termination_start = time.time() + while wait_termination and all_instances and time.time() < termination_start+termination_timeout: + all_instances = [i for i in all_instances if i.state != 'terminated'] + time.sleep(5) + for i in all_instances: + i.update() + log.info('Done.') diff --git a/tools/utils.py b/tools/utils.py index 39d6129f..88a236cd 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -120,5 +120,6 @@ def check_call_with_timeout(args, stdin=None, stdout=None, read_from_to(p.stdout, stdout) read_from_to(p.stderr, stderr) if p.returncode != 0: - raise subprocess.CalledProcessError(p.returncode, args) + stdall = 'STDOUT:\n{}\nSTDERR:\n{}'.format(stdout, stderr) + raise subprocess.CalledProcessError(p.returncode, args, output=stdall) return p.returncode From d30f140dbcc87e1cd3c87b5ef28898bab9dd5a0c Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 3 Apr 2018 
14:46:24 -0300 Subject: [PATCH 186/268] flintrok with private vpc support --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index eba6ab1d..b4bd82cc 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit eba6ab1dceb942937bdc9610736e70d72e2a6579 +Subproject commit b4bd82cc3cb5e72c2fd301510db7570326ce3086 From 306cd39b50e98e3080a8c9fbbfcd6efb0eaad17f Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 10 Apr 2018 10:42:57 -0300 Subject: [PATCH 187/268] update flintrock --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index 67721e89..239fec7e 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 67721e8997b1bf85a7ec1283714039959e9f6c63 +Subproject commit 239fec7eb5c81ad428c1ce7aafd66998bc887a10 From 6ced247c1071f1a43e5e4636aa79361f4cc62e5c Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 24 Apr 2018 11:14:19 -0300 Subject: [PATCH 188/268] update spark and hadoop --- build.sbt | 6 +++--- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 4 ++-- tools/cluster.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/build.sbt b/build.sbt index f4fa51d0..39bbd8b6 100644 --- a/build.sbt +++ b/build.sbt @@ -9,13 +9,13 @@ scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warning // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.2.0" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.0" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.2" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") -libraryDependencies 
+= ("org.apache.hadoop" % "hadoop-aws" % "2.7.2") +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.6") .exclude("org.apache.htrace", "htrace-core") .exclude("commons-beanutils", "commons-beanutils") .exclude("org.slf4j", "slf4j-log4j12") diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index ab47ee12..0fa12c9c 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -92,8 +92,8 @@ object CoreJobRunner { // Also try to propagate logging context to workers // TODO: find a more efficient and bullet-proof way - val configBroadCast = sc.broadcast(config) - sc.parallelize(Range(1, 2000), numSlices = 2000).foreachPartition(_ => setLoggingContextValues(configBroadCast.value)) +// val configBroadCast = sc.broadcast(config) +// sc.parallelize(Range(1, 2000), numSlices = 2000).foreachPartition(_ => setLoggingContextValues(configBroadCast.value)) val context = RunnerContext(sc, config) diff --git a/tools/cluster.py b/tools/cluster.py index 83a289ec..5bdcb4da 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -46,11 +46,11 @@ default_zone = default_region + 'b' default_key_id = 'ignition_key' default_key_file = os.path.expanduser('~/.ssh/ignition_key.pem') -default_ami = 'ami-611e7976' +default_ami = 'ami-5679a229' default_master_ami = '' default_env = 'dev' -default_spark_version = '2.2.0' -default_hdfs_version = '2.7.2' +default_spark_version = '2.3.0' +default_hdfs_version = '2.7.6' default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' From 3591fdb4f19d6800b1d83cdee7f47cae69e1cb1f Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 25 Apr 2018 15:06:03 -0300 Subject: [PATCH 189/268] script to create ami 
--- tools/create_image.sh | 47 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 tools/create_image.sh diff --git a/tools/create_image.sh b/tools/create_image.sh new file mode 100644 index 00000000..5f807365 --- /dev/null +++ b/tools/create_image.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Creates an AMI for the Spark EC2 scripts starting with a stock Amazon Linux AMI. + +# This script was adapted from: +# https://github.com/amplab/spark-ec2/blob/branch-1.6/create_image.sh + +set -e + +if [ "$(id -u)" != "0" ]; then + echo "This script must be run as root" 1>&2 + exit 1 +fi + +# Dev tools +sudo yum install -y java-1.8.0-openjdk-devel +# Perf tools +sudo yum install -y dstat iotop strace sysstat htop perf +sudo debuginfo-install -q -y glibc +sudo debuginfo-install -q -y kernel +sudo yum --enablerepo='*-debug*' install -q -y java-1.8.0-openjdk-debuginfo.x86_64 + +# Root ssh config +sudo sed -i 's/PermitRootLogin.*/PermitRootLogin without-password/g' \ + /etc/ssh/sshd_config +sudo sed -i 's/disable_root.*/disable_root: 0/g' /etc/cloud/cloud.cfg + +# Edit bash profile +echo "export PS1=\"\\u@\\h \\W]\\$ \"" >> ~/.bash_profile +echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> ~/.bash_profile + +source ~/.bash_profile + +# Global JAVA_HOME env +echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> /etc/environment + +# Install Snappy lib (for Hadoop) +yum install -y snappy + +# Install netlib-java native dependencies +yum install -y blas atlas lapack + +# Create /usr/bin/realpath which is used by R to find Java installations +# NOTE: /usr/bin/realpath is missing in CentOS AMIs. 
See +# http://superuser.com/questions/771104/usr-bin-realpath-not-found-in-centos-6-5 +echo '#!/bin/bash' > /usr/bin/realpath +echo 'readlink -e "$@"' >> /usr/bin/realpath +chmod a+x /usr/bin/realpath \ No newline at end of file From 040b7e6df351523c2a1ba72eebf26910a892fa6e Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 25 Apr 2018 15:07:13 -0300 Subject: [PATCH 190/268] update to a new ami --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 5bdcb4da..bd114e68 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -46,7 +46,7 @@ default_zone = default_region + 'b' default_key_id = 'ignition_key' default_key_file = os.path.expanduser('~/.ssh/ignition_key.pem') -default_ami = 'ami-5679a229' +default_ami = 'ami-60b6001f' default_master_ami = '' default_env = 'dev' default_spark_version = '2.3.0' From 144f2403c6498b02c1de6d3ee184676724a591b4 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 25 Apr 2018 15:12:11 -0300 Subject: [PATCH 191/268] update flintrock --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index 239fec7e..787faa4d 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 239fec7eb5c81ad428c1ce7aafd66998bc887a10 +Subproject commit 787faa4d1b7708e0a387c7243723eddd2b1a33cb From fedf0eb3e4c99d2adb09c7e7afa60c189f84e387 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 25 Apr 2018 15:16:27 -0300 Subject: [PATCH 192/268] reverting commented lines by mistake --- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index 0fa12c9c..ab47ee12 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ 
b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -92,8 +92,8 @@ object CoreJobRunner { // Also try to propagate logging context to workers // TODO: find a more efficient and bullet-proof way -// val configBroadCast = sc.broadcast(config) -// sc.parallelize(Range(1, 2000), numSlices = 2000).foreachPartition(_ => setLoggingContextValues(configBroadCast.value)) + val configBroadCast = sc.broadcast(config) + sc.parallelize(Range(1, 2000), numSlices = 2000).foreachPartition(_ => setLoggingContextValues(configBroadCast.value)) val context = RunnerContext(sc, config) From 78a5ce3b20a944e7ccc08de92920c592818b3b54 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 2 May 2018 15:58:32 -0300 Subject: [PATCH 193/268] flintrock updates --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index 787faa4d..f9304910 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 787faa4d1b7708e0a387c7243723eddd2b1a33cb +Subproject commit f9304910e69bbf858f95abeb6be1204a38c169d5 From ea1fc349de5c869b01c565163f06874c64b0be91 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 3 May 2018 15:56:30 -0300 Subject: [PATCH 194/268] flintrock update --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index f9304910..39d2c249 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit f9304910e69bbf858f95abeb6be1204a38c169d5 +Subproject commit 39d2c249b08fbd1c3869e7d435d66b7a97de8cc3 From 37735f7f22c928f9dd4279a3ba155f12a6f591d1 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 7 Jun 2018 08:16:14 -0300 Subject: [PATCH 195/268] passing forward to sync the file size estimator --- src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index cd362de0..2d6bf6b5 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -689,7 +689,7 @@ object SparkContextUtils { if (forceSynch || foundLocalPaths.isEmpty) { delete(new Path(syncPath(s"$synchLocally/"))) - val data = parallelReadTextFiles(hadoopFiles, maxBytesPerPartition, minPartitions, synchLocally = None) + val data = parallelReadTextFiles(hadoopFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling = sizeBasedFileHandling, synchLocally = None) data.saveAsTextFile(cacheKey) } From db7125d4870e6a25eaff6c66779dc7b50541f765 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 15 Jun 2018 17:03:46 -0300 Subject: [PATCH 196/268] update spark 2.3.1, hadoop 2.8.4, flintrock with support for spark without hadoop build --- build.sbt | 6 +++--- tools/cluster.py | 6 +++--- tools/flintrock | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/build.sbt b/build.sbt index 39bbd8b6..7271b924 100644 --- a/build.sbt +++ b/build.sbt @@ -9,13 +9,13 @@ scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warning // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.0" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.1" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.8.4" % "provided") -libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.6") +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.8.4") .exclude("org.apache.htrace", 
"htrace-core") .exclude("commons-beanutils", "commons-beanutils") .exclude("org.slf4j", "slf4j-log4j12") diff --git a/tools/cluster.py b/tools/cluster.py index bd114e68..99a63b28 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,9 +49,9 @@ default_ami = 'ami-60b6001f' default_master_ami = '' default_env = 'dev' -default_spark_version = '2.3.0' -default_hdfs_version = '2.7.6' -default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' +default_spark_version = '2.3.1' +default_hdfs_version = '2.8.4' +default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-without-hadoop.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' default_installation_user = 'root' diff --git a/tools/flintrock b/tools/flintrock index 39d2c249..9b560c7a 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 39d2c249b08fbd1c3869e7d435d66b7a97de8cc3 +Subproject commit 9b560c7a54f898bd3924a55410c1ed2509c97152 From 74c3cb32c6147870b832a42d51f4f59a4fc0df71 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 19 Jun 2018 10:55:48 -0300 Subject: [PATCH 197/268] update flintrock --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index 9b560c7a..0e540aef 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 9b560c7a54f898bd3924a55410c1ed2509c97152 +Subproject commit 0e540aef41632c43db7db6387b8e22dd07a791d9 From 6350c67a3cd280dfb5160ef1cd907a863b7af01d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 22 Jun 2018 16:46:41 -0300 Subject: [PATCH 198/268] increase ulimit open files, to help in big shuffles --- tools/scripts/noop | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/scripts/noop b/tools/scripts/noop index eb34279f..0e872836 100644 --- 
a/tools/scripts/noop +++ b/tools/scripts/noop @@ -1,3 +1,3 @@ #!/bin/bash -echo '* - nofile 65535' >> /etc/security/limits.conf \ No newline at end of file +echo '* - nofile 1000000' >> /etc/security/limits.conf \ No newline at end of file From 8b58235d043314fd9982583d27cc374b0ac98a5d Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 22 Jun 2018 18:23:28 -0300 Subject: [PATCH 199/268] update flintrok with nvme support --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index 39d2c249..d2318e99 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit 39d2c249b08fbd1c3869e7d435d66b7a97de8cc3 +Subproject commit d2318e99dd972765673a0b8d716d3409d337e2da From 0b6c0c5f9fd46605bc5f7f050088ccc75f588d11 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 26 Jun 2018 09:21:08 -0300 Subject: [PATCH 200/268] rollback hadoop to 2.7.6, because of issues with spark sql --- build.sbt | 4 ++-- tools/cluster.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/build.sbt b/build.sbt index 7271b924..81f585b6 100644 --- a/build.sbt +++ b/build.sbt @@ -13,9 +13,9 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.1" % "provided .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.8.4" % "provided") +libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") -libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.8.4") +libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.6") .exclude("org.apache.htrace", "htrace-core") .exclude("commons-beanutils", "commons-beanutils") .exclude("org.slf4j", "slf4j-log4j12") diff --git a/tools/cluster.py b/tools/cluster.py index 99a63b28..34272f4f 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -50,8 +50,8 @@ default_master_ami = '' 
default_env = 'dev' default_spark_version = '2.3.1' -default_hdfs_version = '2.8.4' -default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-without-hadoop.tgz' +default_hdfs_version = '2.7.6' +default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' default_installation_user = 'root' From 83ed71d6894bf95bbd9d904fe78ecea4e668ef31 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 2 Jul 2018 11:49:23 -0300 Subject: [PATCH 201/268] update zeppeling 0.8.0 --- remote_hook.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index cb43904f..3821d92e 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -59,7 +59,7 @@ on_trap_exit() { install_and_run_zeppelin() { if [[ ! -d "zeppelin" ]]; then - wget "http://www-us.apache.org/dist/zeppelin/zeppelin-0.7.3/zeppelin-0.7.3-bin-all.tgz" -O zeppelin.tar.gz + wget "http://www-us.apache.org/dist/zeppelin/zeppelin-0.8.0/zeppelin-0.8.0-bin-all.tgz" -O zeppelin.tar.gz mkdir zeppelin tar xvzf zeppelin.tar.gz -C zeppelin --strip-components 1 > /tmp/zeppelin_install.log fi @@ -70,7 +70,7 @@ install_and_run_zeppelin() { export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH} --executor-memory ${SPARK_MEM_PARAM}" sudo -E zeppelin/bin/zeppelin.sh else - notify_error_and_exit "Zepellin installation not found" + notify_error_and_exit "Zeppelin installation not found" fi } From 6f9f219888aa6f7fa5f191ec725b9c55584f00e4 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 4 Jul 2018 09:42:20 -0300 Subject: [PATCH 202/268] update flintrock --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index d2318e99..c9f58f54 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ 
-Subproject commit d2318e99dd972765673a0b8d716d3409d337e2da +Subproject commit c9f58f547adaa57401e910df78c5986e76b8a155 From cb3f518cf78389d98397877473a81fa01a99fe02 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 19 Sep 2018 18:27:03 -0300 Subject: [PATCH 203/268] Allow to IAM and other credentials to work on s3 listing --- .../ignition/core/jobs/utils/SparkContextUtils.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 2d6bf6b5..1d06505d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -2,10 +2,12 @@ package ignition.core.jobs.utils import java.io.InputStream -import com.amazonaws.auth.EnvironmentVariableCredentialsProvider +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain import com.amazonaws.services.s3.AmazonS3Client import com.amazonaws.services.s3.model.{ListObjectsRequest, ObjectListing, S3ObjectSummary} +import ignition.core.utils.CollectionUtils._ import ignition.core.utils.DateUtils._ +import ignition.core.utils.ExceptionUtils._ import ignition.core.utils.{AutoCloseableIterator, ByteUtils, HadoopUtils} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} @@ -15,17 +17,15 @@ import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.{Partitioner, SparkContext} import org.joda.time.DateTime +import org.slf4j.LoggerFactory import scala.collection.JavaConversions._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.io.{Codec, Source} import scala.reflect.ClassTag -import scala.util.{Failure, Success, Try} import scala.util.control.NonFatal -import ignition.core.utils.ExceptionUtils._ -import 
ignition.core.utils.CollectionUtils._ -import org.slf4j.LoggerFactory +import scala.util.{Failure, Success, Try} object SparkContextUtils { @@ -49,7 +49,7 @@ object SparkContextUtils { def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) } - private lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) + private lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new DefaultAWSCredentialsProviderChain()) private def close(inputStream: InputStream, path: String): Unit = { try { From 8aaf8602018f3fe7c61ece74d8bc7ba8b5591f65 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 20 Sep 2018 16:45:05 -0300 Subject: [PATCH 204/268] yarn support --- tools/cluster.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 34272f4f..cb5d1798 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -510,6 +510,73 @@ def job_run(cluster_name, job_name, job_mem, return (job_name, job_tag) +@arg('job-mem', help='The amount of memory to use for this job (like: 80G)') +@named('local-yarn-run') +def job_local_yarn_run(job_name, job_mem, queue, + job_user=getpass.getuser(), + utc_job_date=None, job_tag=None, + disable_assembly_build=False, + spark_submit='spark-submit', + deploy_mode='cluster', + yarn_memory_overhead=0.3, + driver_heap_size=default_driver_heap_size): + + def parse_memory(s): + import re + match = re.match(r'([0-9]+)([a-zA-Z]+)', s) + if match is None or len(match.groups()) != 2: + raise Exception('Invalid memory size: ' + s) + return match.groups() + + def calculate_overhead(s): + from math import ceil + (n, unit) = parse_memory(s) + return str(int(ceil(float(n) * (1 + yarn_memory_overhead)))) + unit + + driver_overhead = calculate_overhead(driver_heap_size) + executor_overhead = calculate_overhead(job_mem) + + utc_job_date_example = 
'2014-05-04T13:13:10Z' + if utc_job_date and len(utc_job_date) != len(utc_job_date_example): + raise CommandError('UTC Job Date should be given as in the following example: {}'.format(utc_job_date_example)) + + project_path = get_project_path() + project_name = os.path.basename(project_path) + # Use job user on remote path to avoid too many conflicts for different local users + job_date = utc_job_date or datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') + job_tag = job_tag or job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') + + if not disable_assembly_build: + build_assembly() + + assembly_path = get_assembly_path() + if assembly_path is None: + raise Exception('Something is wrong: no assembly found') + + + log.info('Will run job using local installation of yarn') + + check_call([ + spark_submit, + '--class', 'ignition.jobs.Runner', + '--master', 'yarn', + '--deploy-mode', deploy_mode, + '--queue', queue, + '--driver-memory', driver_heap_size, + '--conf', 'spark.yarn.am.memory', driver_heap_size, + '--executor-memory', job_mem, + '--conf', 'spark.yarn.am.memoryOverhead', driver_overhead, + '--conf', 'spark.driver.memoryOverhead', driver_overhead, + '--conf', 'spark.executor.memoryOverhead', executor_overhead, + assembly_path, + job_name, + '--runner-master', 'yarn', + '--runner-executor-memory', job_mem + # add job tag, date, etc + + ]) + + @named('attach') def job_attach(cluster_name, key_file=default_key_file, job_name=None, job_tag=None, master=None, remote_user=default_remote_user, region=default_region): @@ -750,7 +817,7 @@ def check_flintrock_installation(): parser = ArghParser() parser.add_commands([launch, destroy, get_master, ssh_master, tag_cluster_instances, health_check]) -parser.add_commands([job_run, job_attach, wait_for_job, +parser.add_commands([job_run, job_local_yarn_run, job_attach, wait_for_job, kill_job, killall_jobs, collect_job_results], namespace="jobs") if __name__ == '__main__': From 465286559d12dc8d15f6f48581e2073dc0d88a93 
Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 20 Sep 2018 21:40:21 -0300 Subject: [PATCH 205/268] Added remaining options --- tools/cluster.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index cb5d1798..3f0edcbf 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -518,8 +518,9 @@ def job_local_yarn_run(job_name, job_mem, queue, disable_assembly_build=False, spark_submit='spark-submit', deploy_mode='cluster', - yarn_memory_overhead=0.3, - driver_heap_size=default_driver_heap_size): + yarn_memory_overhead=0.2, + driver_heap_size=default_driver_heap_size, + driver_java_options='-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps'): def parse_memory(s): import re @@ -540,9 +541,6 @@ def calculate_overhead(s): if utc_job_date and len(utc_job_date) != len(utc_job_date_example): raise CommandError('UTC Job Date should be given as in the following example: {}'.format(utc_job_date_example)) - project_path = get_project_path() - project_name = os.path.basename(project_path) - # Use job user on remote path to avoid too many conflicts for different local users job_date = utc_job_date or datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') job_tag = job_tag or job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') @@ -555,25 +553,26 @@ def calculate_overhead(s): log.info('Will run job using local installation of yarn') - check_call([ spark_submit, '--class', 'ignition.jobs.Runner', '--master', 'yarn', + '--driver-java-options', driver_java_options, '--deploy-mode', deploy_mode, '--queue', queue, '--driver-memory', driver_heap_size, - '--conf', 'spark.yarn.am.memory', driver_heap_size, + '--conf', 'spark.yarn.am.memory=' + driver_heap_size, '--executor-memory', job_mem, - '--conf', 'spark.yarn.am.memoryOverhead', driver_overhead, - '--conf', 'spark.driver.memoryOverhead', driver_overhead, - '--conf', 'spark.executor.memoryOverhead', executor_overhead, + 
'--conf', 'spark.yarn.am.memoryOverhead=' + driver_overhead, + '--conf', 'spark.driver.memoryOverhead=' + driver_overhead, + '--conf', 'spark.executor.memoryOverhead=' + executor_overhead, assembly_path, job_name, '--runner-master', 'yarn', - '--runner-executor-memory', job_mem - # add job tag, date, etc - + '--runner-executor-memory', job_mem, + '--runner-user', job_user, + '--runner-tag', job_tag, + '--runner-date', job_date ]) From 77fb7a7b77ce509e368ed482d26f6a98566664c9 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Fri, 21 Sep 2018 14:04:42 -0300 Subject: [PATCH 206/268] Make YARN jobs finish successfully --- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index ab47ee12..7200d747 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -1,8 +1,9 @@ package ignition.core.jobs import org.apache.spark.{SparkConf, SparkContext} -import org.joda.time.{DateTimeZone, DateTime} +import org.joda.time.{DateTime, DateTimeZone} +import scala.concurrent.Future import scala.util.Try object CoreJobRunner { @@ -105,7 +106,13 @@ object CoreJobRunner { System.exit(1) // force exit of all threads } Try { sc.stop() } - System.exit(0) // force exit of all threads + import scala.concurrent.ExecutionContext.Implicits.global + Future { + // If everything is fine, the system will shut down without the help of this thread and YARN will report success + // But sometimes it gets stuck, then it's necessary to use the force, but this may finish the job as failed on YARN + Thread.sleep(30 * 1000) + System.exit(0) // force exit of all threads + } } } } From fa6d32f40afbca2111e0a051649edf12dc61f354 Mon Sep 17 00:00:00 2001 From: Henrique Goulart Date: Mon, 24 Sep 2018 20:07:08 -0300 Subject: [PATCH 207/268] 
Update AMI I deleted the other AMI. Now we will use the one that have been used in datalake production for a long time (platform AMI) --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 34272f4f..37b327c0 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -46,7 +46,7 @@ default_zone = default_region + 'b' default_key_id = 'ignition_key' default_key_file = os.path.expanduser('~/.ssh/ignition_key.pem') -default_ami = 'ami-60b6001f' +default_ami = 'ami-611e7976' default_master_ami = '' default_env = 'dev' default_spark_version = '2.3.1' From 709d508ca69dcc864ef6957515580c8ff1d4ab79 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 26 Sep 2018 10:25:17 -0300 Subject: [PATCH 208/268] support for jupyter with pyspark --- remote_hook.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 3821d92e..eb9b3616 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -66,7 +66,7 @@ install_and_run_zeppelin() { if [[ -f "zeppelin/bin/zeppelin.sh" ]]; then export MASTER="${JOB_MASTER}" export ZEPPELIN_PORT="8081" - export SPARK_HOME="/root/spark" + export SPARK_HOME=$(get_first_present /root/spark /opt/spark ~/spark*/) export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH} --executor-memory ${SPARK_MEM_PARAM}" sudo -E zeppelin/bin/zeppelin.sh else @@ -74,6 +74,19 @@ install_and_run_zeppelin() { fi } +install_and_run_jupyter() { + sudo yum -y install python3 python3-pip + sudo pip3 install jupyter pandas boto3 matplotlib numpy sklearn scipy + export SPARK_HOME=$(get_first_present /root/spark /opt/spark ~/spark*/) + export HADOOP_HOME=$(get_first_present /root/hadoop /opt/hadoop ~/hadoop*/) + export SPARK_CONF_DIR="${SPARK_HOME}/conf" + export HADOOP_CONF_DIR="${HADOOP_HOME}/conf" + export JOB_MASTER=${MASTER:-spark://${SPARK_MASTER_HOST}:7077} + export PYSPARK_PYTHON=$(which python3) + export PYSPARK_DRIVER_PYTHON=$(which 
jupyter) + export PYSPARK_DRIVER_PYTHON_OPTS="notebook --allow-root --ip=${SPARK_MASTER_HOST} --no-browser --port=8888" + sudo -E "${SPARK_HOME}/bin/pyspark" --master "${JOB_MASTER}" --executor-memory "${SPARK_MEM_PARAM}" --driver-memory "${DRIVER_HEAP_SIZE}" +} trap "on_trap_exit" EXIT @@ -105,6 +118,8 @@ if [[ "${JOB_NAME}" == "shell" ]]; then sudo -E ${SPARK_HOME}/bin/spark-shell --master "${JOB_MASTER}" --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" elif [[ "${JOB_NAME}" == "zeppelin" ]]; then install_and_run_zeppelin +elif [[ "${JOB_NAME}" == "jupyter" ]]; then + install_and_run_jupyter else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & From 9ba1e71da5ee34ea333db6a95874621605d730b3 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 26 Sep 2018 14:37:17 -0300 Subject: [PATCH 209/268] Avoid explicit spark context stopping --- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index 7200d747..c1d0541f 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -105,7 +105,7 @@ object CoreJobRunner { t.printStackTrace() System.exit(1) // force exit of all threads } - Try { sc.stop() } + import scala.concurrent.ExecutionContext.Implicits.global Future { // If everything is fine, the system will shut down without the help of this thread and YARN will report success From cc71992d5c849e4ec9106f93e42432f1bdcca670 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Wed, 26 Sep 2018 15:38:57 -0300 Subject: [PATCH 210/268] fixed memory calculation and added executor cores --- tools/cluster.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 3f0edcbf..682dd97e 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -516,6 +516,7 @@ def job_local_yarn_run(job_name, job_mem, queue, job_user=getpass.getuser(), utc_job_date=None, job_tag=None, disable_assembly_build=False, + executor_cores=5, spark_submit='spark-submit', deploy_mode='cluster', yarn_memory_overhead=0.2, @@ -532,7 +533,7 @@ def parse_memory(s): def calculate_overhead(s): from math import ceil (n, unit) = parse_memory(s) - return str(int(ceil(float(n) * (1 + yarn_memory_overhead)))) + unit + return str(int(ceil(float(n) * yarn_memory_overhead))) + unit driver_overhead = calculate_overhead(driver_heap_size) executor_overhead = calculate_overhead(job_mem) @@ -560,6 +561,7 @@ def calculate_overhead(s): '--driver-java-options', driver_java_options, '--deploy-mode', deploy_mode, '--queue', queue, + '--conf', 'spark.executor.cores=' + str(executor_cores), '--driver-memory', driver_heap_size, '--conf', 'spark.yarn.am.memory=' + driver_heap_size, '--executor-memory', job_mem, From bfb364ab1e4ac016717bebdab635198ae57e4138 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 27 Sep 2018 16:51:35 -0300 Subject: [PATCH 211/268] tail it :) --- remote_hook.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 3821d92e..68412ca9 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -x + # We suppose we are in a subdirectory of the root project DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" @@ -84,7 +86,7 @@ MAIN_CLASS="ignition.jobs.Runner" cd "${DIR}" || notify_error_and_exit "Internal script error for job ${JOB_WITH_TAG}" -JAR_PATH_SRC=$(echo "${DIR}"/*assembly*.jar) +JAR_PATH_SRC=$(ls 
"${DIR}"/*assembly*.jar | tail -1) JAR_PATH="${JOB_CONTROL_DIR}/Ignition.jar" cp ${JAR_PATH_SRC} ${JAR_PATH} From 438297459b28959e14046fd8738dcc3ad5e904e4 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 28 Sep 2018 10:23:10 -0300 Subject: [PATCH 212/268] exec shell in cluster --- tools/cluster.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index aeb5eb6a..b3780ba3 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -372,6 +372,14 @@ def ssh_master(cluster_name, key_file=default_key_file, user=default_remote_user ssh_call(user=user, host=master, key_file=key_file, args=args) +def exec_shell(cluster_name, command, key_file=default_key_file, user=default_remote_user, region=default_region): + masters, slaves = get_active_nodes(cluster_name, region=region) + for node in masters + slaves: + host = node.public_dns_name or node.private_dns_name + output = ssh_call(user=user, host=host, key_file=key_file, args=[command], allocate_terminal=True, get_output=True) + log.info("exec output of host %s:\n%s", host, output) + + def rsync_call(user, host, key_file, args=[], src_local='', dest_local='', remote_path='', tries=3): rsync_args = ['rsync', '--timeout', '60', '-azvP'] rsync_args += ['-e', 'ssh -i {} -o StrictHostKeyChecking=no'.format(key_file)] @@ -817,7 +825,7 @@ def check_flintrock_installation(): parser = ArghParser() -parser.add_commands([launch, destroy, get_master, ssh_master, tag_cluster_instances, health_check]) +parser.add_commands([launch, destroy, get_master, ssh_master, tag_cluster_instances, health_check, exec_shell]) parser.add_commands([job_run, job_local_yarn_run, job_attach, wait_for_job, kill_job, killall_jobs, collect_job_results], namespace="jobs") From 4e900d913af7c5ea3e593d15f762a1ea87c916ee Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 28 Sep 2018 10:56:22 -0300 Subject: [PATCH 213/268] by default, install python3 and pip also tmux --- 
tools/create_image.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/create_image.sh b/tools/create_image.sh index 5f807365..5b2af793 100644 --- a/tools/create_image.sh +++ b/tools/create_image.sh @@ -37,7 +37,13 @@ echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> /etc/environment yum install -y snappy # Install netlib-java native dependencies -yum install -y blas atlas lapack +yum install -y blas atlas lapack + +# Install python3 and pip3 +yum install -y python3 python3-pip + +# Install python3 and pip3 +yum install -y tmux # Create /usr/bin/realpath which is used by R to find Java installations # NOTE: /usr/bin/realpath is missing in CentOS AMIs. See From 9bfa59b3736cfba61262066f2ebf94d6b0784124 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 28 Sep 2018 13:27:15 -0300 Subject: [PATCH 214/268] fix commentary description --- tools/create_image.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/create_image.sh b/tools/create_image.sh index 5b2af793..852b861c 100644 --- a/tools/create_image.sh +++ b/tools/create_image.sh @@ -42,7 +42,7 @@ yum install -y blas atlas lapack # Install python3 and pip3 yum install -y python3 python3-pip -# Install python3 and pip3 +# Install tmux yum install -y tmux # Create /usr/bin/realpath which is used by R to find Java installations From 881892b2a726e1737aeea475cba5d4877c47b866 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Mon, 1 Oct 2018 18:15:43 -0300 Subject: [PATCH 215/268] Get latest asssembly (lexicographically) --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index aeb5eb6a..9f4eb18b 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -387,7 +387,7 @@ def build_assembly(): def get_assembly_path(): paths = glob.glob(get_project_path() + '/target/scala-*/*assembly*.jar') if paths: - return paths[0] + return paths[-1] else: return None From 2d5c63b9249d88c94a18da35e36b0adb42f31aa4 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 2 Oct 2018 15:04:17 -0300 Subject: [PATCH 216/268] Enable hive support --- build.sbt | 2 + .../ignition/core/jobs/CoreJobRunner.scala | 38 ++++++++++++------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/build.sbt b/build.sbt index 81f585b6..b85671d8 100644 --- a/build.sbt +++ b/build.sbt @@ -13,6 +13,8 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.1" % "provided .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") +libraryDependencies += ("org.apache.spark" %% "spark-sql" % "2.3.1" % "provided") + libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.6") diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index c1d0541f..4e7c27fe 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -1,14 +1,19 @@ package ignition.core.jobs -import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.SparkContext +import org.apache.spark.sql.SparkSession import org.joda.time.{DateTime, DateTimeZone} +import org.slf4j.{Logger, LoggerFactory} import scala.concurrent.Future import scala.util.Try object CoreJobRunner { + val 
logger: Logger = LoggerFactory.getLogger(getClass) + case class RunnerContext(sparkContext: SparkContext, + sparkSession: SparkSession, config: RunnerConfig) @@ -71,32 +76,39 @@ object CoreJobRunner { val appName = s"${config.setupName}.${config.tag}" - val sparkConf = new SparkConf() - sparkConf.set("spark.executor.memory", config.executorMemory) - - sparkConf.set("spark.eventLog.dir", "file:///media/tmp/spark-events") + val builder = SparkSession.builder + builder.config("spark.executor.memory", config.executorMemory) - sparkConf.setMaster(config.master) - sparkConf.setAppName(appName) + builder.config("spark.eventLog.dir", "file:///media/tmp/spark-events") - sparkConf.set("spark.hadoop.mapred.output.committer.class", classOf[DirectOutputCommitter].getName()) + builder.master(config.master) + builder.appName(appName) - defaultSparkConfMap.foreach { case (k, v) => sparkConf.set(k, v) } + builder.config("spark.hadoop.mapred.output.committer.class", classOf[DirectOutputCommitter].getName()) - jobConf.foreach { case (k, v) => sparkConf.set(k, v) } + defaultSparkConfMap.foreach { case (k, v) => builder.config(k, v) } + jobConf.foreach { case (k, v) => builder.config(k, v) } // Add logging context to driver setLoggingContextValues(config) - - val sc = new SparkContext(sparkConf) + try { + builder.enableHiveSupport() + } catch { + case t: Throwable => logger.warn("Failed to enable HIVE support", t) + } + + val session = builder.getOrCreate() + + val sc = session.sparkContext // Also try to propagate logging context to workers // TODO: find a more efficient and bullet-proof way val configBroadCast = sc.broadcast(config) + sc.parallelize(Range(1, 2000), numSlices = 2000).foreachPartition(_ => setLoggingContextValues(configBroadCast.value)) - val context = RunnerContext(sc, config) + val context = RunnerContext(sc, session, config) try { jobSetup.apply(context) From d67b7f478fd1e9d18141ab02b0832c9e552186e0 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 4 Oct 2018 16:34:07 -0300 Subject: [PATCH 217/268] Get latest assembly by time --- remote_hook.sh | 2 +- tools/cluster.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index ddfe80a2..f30879e3 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -99,7 +99,7 @@ MAIN_CLASS="ignition.jobs.Runner" cd "${DIR}" || notify_error_and_exit "Internal script error for job ${JOB_WITH_TAG}" -JAR_PATH_SRC=$(ls "${DIR}"/*assembly*.jar | tail -1) +JAR_PATH_SRC=$(ls -t "${DIR}"/*assembly*.jar | head -1) # most recent jar JAR_PATH="${JOB_CONTROL_DIR}/Ignition.jar" cp ${JAR_PATH_SRC} ${JAR_PATH} diff --git a/tools/cluster.py b/tools/cluster.py index 9f4eb18b..f22107d5 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -387,6 +387,7 @@ def build_assembly(): def get_assembly_path(): paths = glob.glob(get_project_path() + '/target/scala-*/*assembly*.jar') if paths: + paths.sort(key=os.path.getmtime) return paths[-1] else: return None From d7e635a61a664f6378cfb5e59d1012e95217eea2 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 9 Oct 2018 15:14:49 -0300 Subject: [PATCH 218/268] Added options to conf yarn --- remote_hook.sh | 1 - tools/cluster.py | 40 ++++++++++++++++++++++++---------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index f30879e3..0a5a2cb8 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -1,6 +1,5 @@ #!/bin/bash -set -x # We suppose we are in a subdirectory of the root project DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" diff --git a/tools/cluster.py b/tools/cluster.py index f22107d5..eac73844 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -510,7 +510,7 @@ def job_run(cluster_name, job_name, job_mem, raise failed_exception or Exception('Failed!?') return (job_name, job_tag) - +@argh.arg('-c', '--conf', action='append', type=str) @arg('job-mem', help='The amount of memory to use for this job (like: 80G)') @named('local-yarn-run') def job_local_yarn_run(job_name, job_mem, queue, @@ -522,7 +522,8 @@ def job_local_yarn_run(job_name, job_mem, queue, deploy_mode='cluster', yarn_memory_overhead=0.2, driver_heap_size=default_driver_heap_size, - driver_java_options='-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps'): + driver_java_options='-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps', + conf=[]): def parse_memory(s): import re @@ -555,20 +556,27 @@ def calculate_overhead(s): log.info('Will run job using local installation of yarn') - check_call([ - spark_submit, - '--class', 'ignition.jobs.Runner', - '--master', 'yarn', - '--driver-java-options', driver_java_options, - '--deploy-mode', deploy_mode, - '--queue', queue, - '--conf', 'spark.executor.cores=' + str(executor_cores), - '--driver-memory', driver_heap_size, - '--conf', 'spark.yarn.am.memory=' + driver_heap_size, - '--executor-memory', job_mem, - '--conf', 'spark.yarn.am.memoryOverhead=' + driver_overhead, - '--conf', 'spark.driver.memoryOverhead=' + driver_overhead, - '--conf', 
'spark.executor.memoryOverhead=' + executor_overhead, + confs = [ + spark_submit, + '--class', 'ignition.jobs.Runner', + '--master', 'yarn', + '--driver-java-options', driver_java_options, + '--deploy-mode', deploy_mode, + '--queue', queue, + '--conf', 'spark.executor.cores=' + str(executor_cores), + '--driver-memory', driver_heap_size, + '--conf', 'spark.yarn.am.memory=' + driver_heap_size, + '--executor-memory', job_mem, + '--conf', 'spark.yarn.am.memoryOverhead=' + driver_overhead, + '--conf', 'spark.driver.memoryOverhead=' + driver_overhead, + '--conf', 'spark.executor.memoryOverhead=' + executor_overhead + ] + + for c in conf: + confs.extend(['--conf', c]) + + check_call( + confs + [ assembly_path, job_name, '--runner-master', 'yarn', From f7468e1551342da13194296cda6332c0dd9a6850 Mon Sep 17 00:00:00 2001 From: Felipe Mafatti Date: Wed, 10 Oct 2018 11:54:20 -0300 Subject: [PATCH 219/268] Update submodule - Change ebs to delete on termination (#152) --- tools/flintrock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flintrock b/tools/flintrock index c9f58f54..e5b3b9b2 160000 --- a/tools/flintrock +++ b/tools/flintrock @@ -1 +1 @@ -Subproject commit c9f58f547adaa57401e910df78c5986e76b8a155 +Subproject commit e5b3b9b2a6ac66536ba6e105cd42f988f9d8bb7e From 1c3644f689f37b4d01c563b7564921ddeebc3266 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 16 Oct 2018 23:35:53 -0300 Subject: [PATCH 220/268] Support sudo, avoid loop break on failures --- tools/cluster.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index fddabd65..7cb07a83 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -114,8 +114,10 @@ def logged_call(args, tries=1): return logged_call_base(check_call, args, tries) -def ssh_call(user, host, key_file, args=(), allocate_terminal=True, get_output=False): - base = ['ssh', '-q'] +def ssh_call(user, host, key_file, args=(), allocate_terminal=True, get_output=False, quiet=True): + base = ['ssh'] + if quiet: + base += ['-q'] if allocate_terminal: base += ['-tt'] base += ['-i', key_file, @@ -372,12 +374,22 @@ def ssh_master(cluster_name, key_file=default_key_file, user=default_remote_user ssh_call(user=user, host=master, key_file=key_file, args=args) -def exec_shell(cluster_name, command, key_file=default_key_file, user=default_remote_user, region=default_region): +def exec_shell(cluster_name, command, key_file=default_key_file, user=default_remote_user, region=default_region, sudo=False): + import subprocess masters, slaves = get_active_nodes(cluster_name, region=region) + if not masters: + log.warn('No master found') for node in masters + slaves: host = node.public_dns_name or node.private_dns_name - output = ssh_call(user=user, host=host, key_file=key_file, args=[command], allocate_terminal=True, get_output=True) - log.info("exec output of host %s:\n%s", host, output) + log.info("exec output of host %s\n", host) + cmd = ['ssh', '-t', '-o', 'StrictHostKeyChecking=no', user + '@' + host ,'-i', key_file] + if sudo: + cmd += ['sudo'] + cmd += ['bash'] + p = subprocess.Popen(cmd, stdin=subprocess.PIPE) + p.communicate(command) + if p.wait() != 0: + log.warn('\nError executing command on host: %s', host) def rsync_call(user, host, key_file, args=[], src_local='', dest_local='', remote_path='', 
tries=3): From bb5e6adf85cbb35d1bb12726776b7eb2a7cfdb29 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 26 Oct 2018 10:19:34 -0300 Subject: [PATCH 221/268] update to spark 2.3.2 --- build.sbt | 4 ++-- tools/cluster.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index b85671d8..03f135d5 100644 --- a/build.sbt +++ b/build.sbt @@ -9,11 +9,11 @@ scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warning // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.1" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.2" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") -libraryDependencies += ("org.apache.spark" %% "spark-sql" % "2.3.1" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-sql" % "2.3.2" % "provided") libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") diff --git a/tools/cluster.py b/tools/cluster.py index eac73844..4d7e7dd3 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,7 +49,7 @@ default_ami = 'ami-611e7976' default_master_ami = '' default_env = 'dev' -default_spark_version = '2.3.1' +default_spark_version = '2.3.2' default_hdfs_version = '2.7.6' default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' From f489fd2d50b65c1f43b43f7afb0b2459145485fb Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Wed, 21 Nov 2018 19:16:28 -0200 Subject: [PATCH 222/268] Make cluster.py compatible with python3 --- tools/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 4d7e7dd3..38b7a19f 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -614,9 +614,9 @@ def health_check(cluster_name, key_file=default_key_file, master=None, remote_us raise NotHealthyCluster('Not enough healthy slaves: {0}/{1}'.format(len(slaves), nslaves)) if not masters: raise NotHealthyCluster('No master found') - except NotHealthyCluster, e: + except NotHealthyCluster as e: raise e - except Exception, e: + except Exception as e: log.warning("Failed to check cluster health, cluster: %s, retries %s" % (cluster_name, i), exc_info=True) if i >= retries - 1: log.critical("Failed to check cluster health, cluster: %s, giveup!" % (cluster_name)) From cc510d89ad22f0824d98a618095aa738d0ec7643 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Thu, 22 Nov 2018 16:34:36 -0200 Subject: [PATCH 223/268] making compatible --- tools/cluster.py | 2 +- tools/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 38b7a19f..060bce71 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -123,7 +123,7 @@ def ssh_call(user, host, key_file, args=(), allocate_terminal=True, get_output=F '{0}@{1}'.format(user, host)] base += args if get_output: - return logged_call_output(base) + return logged_call_output(base).decode("utf-8") else: return logged_call(base) diff --git a/tools/utils.py b/tools/utils.py index 88a236cd..5064be61 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -79,7 +79,7 @@ def read_non_blocking(f): while select.select([f], [], [], 0)[0]: c = f.read(1) if c: - result.append(c) + result.append(c.decode('utf-8')) else: break return ''.join(result) if result else None From e8ec4d6c58ecb352865755e2a98692b922d4ec60 Mon Sep 17 00:00:00 2001 
From: Fernando Luiz Parisotto Date: Tue, 27 Nov 2018 17:41:22 -0200 Subject: [PATCH 224/268] Lightweight core (#155) * remove a lot of stuff * spark 2.4 --- build.sbt | 37 +- project/build.properties | 2 +- project/plugins.sbt | 5 - src/main/scala/TestHttp.scala | 49 -- .../core/cache/ExpiringMultiLevelCache.scala | 547 ------------------ .../core/http/AsyncHttpClientStreamApi.scala | 89 --- .../core/http/AsyncSprayHttpClient.scala | 297 ---------- .../ignition/core/http/ByteStorage.scala | 114 ---- .../scala/ignition/core/http/Caching.scala | 22 - src/main/scala/ignition/core/http/Retry.scala | 84 --- .../core/jobs/utils/SparkContextUtils.scala | 13 +- .../ignition/core/utils/FutureUtils.scala | 5 - .../scala/ignition/core/utils/S3Client.scala | 62 -- .../ignition/core/utils/TelemetryCache.scala | 45 -- .../scala/ignition/core/utils/URLUtils.scala | 21 +- .../spray/cache/ExpiringLruLocalCache.scala | 134 ----- .../ExpiringMultipleLevelCacheSpec.scala | 213 ------- .../http/AsyncHttpClientStreamApiSpec.scala | 15 - .../scala/ignition/core/http/RetrySpec.scala | 39 -- .../ignition/core/utils/URLUtilsSpec.scala | 29 - tools/cluster.py | 2 +- 21 files changed, 16 insertions(+), 1808 deletions(-) delete mode 100644 project/plugins.sbt delete mode 100644 src/main/scala/TestHttp.scala delete mode 100644 src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala delete mode 100644 src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala delete mode 100644 src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala delete mode 100644 src/main/scala/ignition/core/http/ByteStorage.scala delete mode 100644 src/main/scala/ignition/core/http/Caching.scala delete mode 100644 src/main/scala/ignition/core/http/Retry.scala delete mode 100644 src/main/scala/ignition/core/utils/S3Client.scala delete mode 100644 src/main/scala/ignition/core/utils/TelemetryCache.scala delete mode 100644 src/main/scala/spray/cache/ExpiringLruLocalCache.scala delete mode 100644 
src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala delete mode 100644 src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala delete mode 100644 src/test/scala/ignition/core/http/RetrySpec.scala diff --git a/build.sbt b/build.sbt index 03f135d5..b27321ea 100644 --- a/build.sbt +++ b/build.sbt @@ -2,29 +2,20 @@ name := "Ignition-Core" version := "1.0" -scalaVersion := "2.11.8" +scalaVersion := "2.11.12" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "2.3.2" % "provided") - .exclude("org.apache.hadoop", "hadoop-client") - .exclude("org.slf4j", "slf4j-log4j12") +test in assembly := {} -libraryDependencies += ("org.apache.spark" %% "spark-sql" % "2.3.2" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-sql" % "2.4.0" % "provided") libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") -libraryDependencies += ("org.apache.hadoop" % "hadoop-aws" % "2.7.6") - .exclude("org.apache.htrace", "htrace-core") - .exclude("commons-beanutils", "commons-beanutils") - .exclude("org.slf4j", "slf4j-log4j12") - -libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.3" - -libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.2.14" +libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.2.27" libraryDependencies += "com.github.scopt" %% "scopt" % "3.6.0" @@ -32,24 +23,8 @@ libraryDependencies += "joda-time" % "joda-time" % "2.9.9" libraryDependencies += "org.joda" % "joda-convert" % "1.8.2" -libraryDependencies += "commons-lang" % "commons-lang" % "2.6" - libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.25" -libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.3.4" - -libraryDependencies += "io.spray" %% "spray-json" % "1.3.2" - 
-libraryDependencies += "io.spray" %% "spray-client" % "1.3.2" +libraryDependencies += "com.amazonaws" % "aws-java-sdk-s3" % "1.11.456" -libraryDependencies += "io.spray" %% "spray-http" % "1.3.2" - -libraryDependencies += "io.spray" %% "spray-caching" % "1.3.2" - -resolvers += "Akka Repository" at "http://repo.akka.io/releases/" - -resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" - -resolvers += "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/" - -resolvers += Resolver.sonatypeRepo("public") +libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.3" diff --git a/project/build.properties b/project/build.properties index be6c454f..7c58a83a 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=0.13.5 +sbt.version=1.2.6 diff --git a/project/plugins.sbt b/project/plugins.sbt deleted file mode 100644 index f6f3b939..00000000 --- a/project/plugins.sbt +++ /dev/null @@ -1,5 +0,0 @@ -addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") - -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3") - -addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") diff --git a/src/main/scala/TestHttp.scala b/src/main/scala/TestHttp.scala deleted file mode 100644 index 901516e0..00000000 --- a/src/main/scala/TestHttp.scala +++ /dev/null @@ -1,49 +0,0 @@ - -object TestHttp extends App{ - - def goTest(): Unit = { - import java.util.concurrent.TimeUnit - - import akka.actor.{ActorRefFactory, ActorSystem} - import akka.util.Timeout - import ignition.core.http.AsyncHttpClientStreamApi._ - import ignition.core.http.AsyncSprayHttpClient - import ignition.core.utils.ExceptionUtils._ - import org.joda.time.DateTime - - import scala.concurrent.ExecutionContext.Implicits.global - import scala.concurrent.duration.Duration - import scala.io.Source - import scala.util.{Failure, Success} - def now = DateTime.now() - - val system = 
ActorSystem("http") - val client = new AsyncSprayHttpClient { - override implicit def actorRefFactory: ActorRefFactory = system - } - val url = "https://httpbin.org/delay/10" // "http://127.0.0.1:8081/" - val conf = RequestConfiguration(requestTimeout = Option(Duration(12, TimeUnit.SECONDS)), idleTimeout = Option(Duration(5, TimeUnit.SECONDS))) - implicit val reporter = NoOpReporter - implicit val timeout = Timeout(30, TimeUnit.SECONDS) - - println(s"Starting $now") - - // Should complete ok - val request1 = client.makeRequest(Request(url, requestConfiguration = Option(conf))) - request1.onComplete { - case Success(t) => println(s"request1 finished $now with Success: ${Source.fromInputStream(t.content).mkString}") - case Failure(t) => println(s"request1 finished $now with failure: ${t.getFullStackTraceString()}") - } - - //Should time out and keep retrying - val tightConf = conf.copy(requestTimeout = Option(Duration(3, TimeUnit.SECONDS))) - val request2 = client.makeRequest(Request(url, requestConfiguration = Option(tightConf))) - - request2.onComplete { - case Success(t) => println(s"request2 finished $now with Success: ${Source.fromInputStream(t.content).mkString}") - case Failure(t) => println(s"request2 finished $now with failure: ${t.getFullStackTraceString()}") - } - } - - goTest() -} diff --git a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala b/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala deleted file mode 100644 index 6ac0f626..00000000 --- a/src/main/scala/ignition/core/cache/ExpiringMultiLevelCache.scala +++ /dev/null @@ -1,547 +0,0 @@ -package ignition.core.cache - -import java.util.concurrent.TimeUnit - -import akka.actor.Scheduler -import akka.pattern.after -import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap -import ignition.core.utils.DateUtils._ -import ignition.core.utils.FutureUtils._ -import org.joda.time.{DateTime, DateTimeZone, Interval} -import org.slf4j.LoggerFactory -import 
spray.caching.ValueMagnet - -import scala.concurrent.duration._ -import scala.concurrent.{ExecutionContext, Future, Promise} -import scala.util.control.NonFatal -import scala.util.{Failure, Success, Try} - -object ExpiringMultiLevelCache { - case class TimestampedValue[V](date: DateTime, value: Try[V]) { - def hasExpired(ttl: FiniteDuration, now: DateTime, ttlCachedErrors: FiniteDuration = 1.minute): Boolean = { - value match { - case Success(_) => date.plus(ttl.toMillis).isBefore(now) - case Failure(_) => date.plus(ttlCachedErrors.toMillis).isBefore(now) - } - } - } - - trait GenericCache[V] { cache => - // Keep compatible with Spray Cache - def apply(key: String) = new Keyed(key) - - class Keyed(key: String) { - /** - * Returns either the cached Future for the key or evaluates the given call-by-name argument - * which produces either a value instance of type `V` or a `Future[V]`. - */ - def apply(magnet: ⇒ ValueMagnet[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = - cache.apply(key, () ⇒ try magnet.future catch { case NonFatal(e) ⇒ Future.failed(e) }) - - /** - * Returns either the cached Future for the key or evaluates the given function which - * should lead to eventual completion of the promise. 
- */ - def apply[U](f: Promise[V] ⇒ U)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = - cache.apply(key, () ⇒ { val p = Promise[V](); f(p); p.future }) - } - - def apply(key: String, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] - def set(key: String, value: V)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Unit] - } - - trait LocalCache[V] { - def get(key: Any): Option[Future[V]] - def set(key: Any, value: V): Unit - } - - trait RemoteWritableCache[V] { - def set(key: String, value: V)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Unit] - def setLock(key: String, ttl: FiniteDuration)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Boolean] - } - - trait RemoteReadableCache[V] { - def get(key: String)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Option[V]] - } - - trait RemoteCacheRW[V] extends RemoteReadableCache[V] with RemoteWritableCache[V] - - trait ReporterCallback { - def onCompletedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit - def onGeneratedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit - def onCompletedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit - def onGeneratedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit - def onCacheMissNothingFound(key: String, elapsedTime: FiniteDuration): Unit - def onCacheMissButFoundExpiredLocal(key: String, elapsedTime: FiniteDuration): Unit - def onCacheMissButFoundExpiredRemote(key: String, elapsedTime: FiniteDuration): Unit - def onRemoteCacheHit(key: String, elapsedTime: FiniteDuration): Unit - def onLocalCacheHit(key: String, elapsedTime: FiniteDuration): Unit - def onUnexpectedBehaviour(key: String, elapsedTime: FiniteDuration): Unit - def onStillTryingToLockOrGet(key: String, elapsedTime: FiniteDuration): Unit - def onSuccessfullyRemoteSetValue(key: String, elapsedTime: FiniteDuration): Unit - def 
onRemoteCacheHitAfterGenerating(key: String, elapsedTime: FiniteDuration): Unit - def onErrorGeneratingValue(key: String, eLocal: Throwable, elapsedTime: FiniteDuration): Unit - def onLocalError(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit - def onRemoteError(key: String, t: Throwable, elapsedTime: FiniteDuration): Unit - def onRemoteGiveUp(key: String, elapsedTime: FiniteDuration): Unit - def onSanityLocalValueCheckFailedResult(key: String, result: String, elapsedTime: FiniteDuration): Unit - } - - object NoOpReporter extends ReporterCallback { - override def onCacheMissNothingFound(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onUnexpectedBehaviour(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onSuccessfullyRemoteSetValue(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onRemoteError(key: String, t: Throwable, elapsedTime: FiniteDuration): Unit = {} - override def onRemoteGiveUp(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onLocalError(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} - override def onErrorGeneratingValue(key: String, eLocal: Throwable, elapsedTime: FiniteDuration): Unit = {} - override def onRemoteCacheHitAfterGenerating(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onCacheMissButFoundExpiredRemote(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onStillTryingToLockOrGet(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onLocalCacheHit(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onRemoteCacheHit(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onCacheMissButFoundExpiredLocal(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onCompletedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} - override def onCompletedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit = {} - 
override def onGeneratedWithFailure(key: String, e: Throwable, elapsedTime: FiniteDuration): Unit = {} - override def onGeneratedWithSuccess(key: String, elapsedTime: FiniteDuration): Unit = {} - override def onSanityLocalValueCheckFailedResult(key: String, result: String, elapsedTime: FiniteDuration): Unit = {} - } -} - - -import ignition.core.cache.ExpiringMultiLevelCache._ - - -case class ExpiringMultiLevelCache[V](ttl: FiniteDuration, - localCache: Option[LocalCache[TimestampedValue[V]]], - remoteRW: Option[RemoteCacheRW[TimestampedValue[V]]] = None, - remoteLockTTL: FiniteDuration = 5.seconds, - reporter: ExpiringMultiLevelCache.ReporterCallback = ExpiringMultiLevelCache.NoOpReporter, - maxErrorsToRetryOnRemote: Int = 5, - backoffOnLockAcquire: FiniteDuration = 50.milliseconds, - backoffOnError: FiniteDuration = 50.milliseconds, - sanityLocalValueCheck: Boolean = false, - cacheErrors: Boolean = false, - ttlCachedErrors: FiniteDuration = 1.minute) extends GenericCache[V] { - - private val logger = LoggerFactory.getLogger(getClass) - - private val tempUpdate = new ConcurrentLinkedHashMap.Builder[Any, Future[TimestampedValue[V]]] - .maximumWeightedCapacity(Long.MaxValue) - .build() - - protected def now = DateTime.now.withZone(DateTimeZone.UTC) - - private def timestamp(v: V): TimestampedValue[V] = TimestampedValue(now, Try(v)) - - private def timestampError(e: Throwable): TimestampedValue[V] = TimestampedValue(now, Failure(e)) - - private def elapsedTime(startNanoTime: Long) = FiniteDuration(System.nanoTime() - startNanoTime, TimeUnit.NANOSECONDS) - - private def remoteLockKey(key: Any) = s"$key-emlc-lock" - - - // The idea is simple, have two caches: remote and local - // with values that will eventually expire but still be left on the cache - // while a new value is asynchronously being calculated/retrieved - override def apply(key: String, genValue: () => Future[V])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { - // The local cache is 
always the first try. We'll only look the remote if the local value is missing or has expired - val startTime = System.nanoTime() - val result: Future[V] = localCache.flatMap(_.get(key).map(_.asTry())) match { - case Some(future) => - future.flatMap { - case Success(localValue) if !localValue.hasExpired(ttl, now, ttlCachedErrors) => - // We have locally a good value, just return it - reporter.onLocalCacheHit(key, elapsedTime(startTime)) - // But if we're paranoid, let's check if the local value is consistent with remote - if (sanityLocalValueCheck) - remoteRW.map(remote => sanityLocalValueCheck(key, localValue, remote, genValue, startTime)).getOrElse(Future.fromTry(localValue.value)) - else - Future.fromTry(localValue.value) - case Success(expiredLocalValue) if remoteRW.nonEmpty => - // We have locally an expired value, but we can check a remote cache for better value - remoteRW.get.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => - // Remote is good, set locally and return it - reporter.onRemoteCacheHit(key, elapsedTime(startTime)) - localCache.foreach(_.set(key, remoteValue)) - Future.fromTry(remoteValue.value) - case Success(Some(expiredRemote)) => - // Expired local and expired remote, return the most recent of them, async update both - reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime) - val mostRecent = Set(expiredLocalValue, expiredRemote).maxBy(_.date) - Future.fromTry(mostRecent.value) - case Success(None) => - // No remote found, return local, async update both - reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime) - Future.fromTry(expiredLocalValue.value) - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(startTime)) - logger.warn(s"apply, key: $key expired local value and failed to get remote", e) - tryGenerateAndSet(key, genValue, startTime) - 
Future.fromTry(expiredLocalValue.value) - } - case Success(expiredLocalValue) if remoteRW.isEmpty => - // There is no remote cache configured, we'are on our own - // Return expired value and try to generate a new one for the future - reporter.onCacheMissButFoundExpiredLocal(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime) - Future.fromTry(expiredLocalValue.value) - case Failure(e) => - // This is almost impossible to happen because it's local and we don't save failed values - // Failed values are stored into property "value", not as the value itself - reporter.onLocalError(key, e, elapsedTime(startTime)) - logger.warn(s"apply, key: $key got a failed future from cache!? This is almost impossible!", e) - for { - tsv <- tryGenerateAndSet(key, genValue, startTime) - value <- Future.fromTry(tsv.value) - } yield value - } - case None if remoteRW.nonEmpty => - // No local, let's try remote - remoteRW.get.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => - // Remote is good, set locally and return it - reporter.onRemoteCacheHit(key, elapsedTime(startTime)) - localCache.foreach(_.set(key, remoteValue)) - Future.fromTry(remoteValue.value) - case Success(Some(expiredRemote)) => - // Expired remote, return the it, async update - reporter.onCacheMissButFoundExpiredRemote(key, elapsedTime(startTime)) - tryGenerateAndSet(key, genValue, startTime) - Future.fromTry(expiredRemote.value) - case Success(None) => - // No good remote, sync generate - reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - for { - tsv <- tryGenerateAndSet(key, genValue, startTime) - value <- Future.fromTry(tsv.value) - } yield value - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(startTime)) - logger.warn(s"apply, key: $key expired local value and remote error", e) - for { - tsv <- tryGenerateAndSet(key, genValue, startTime) - value <- Future.fromTry(tsv.value) - } yield value - } - 
case None if remoteRW.isEmpty => - // No local and no remote to look, just generate it - // The caller will need to wait for the value generation - reporter.onCacheMissNothingFound(key, elapsedTime(startTime)) - for { - tsv <- tryGenerateAndSet(key, genValue, startTime) - value <- Future.fromTry(tsv.value) - } yield value - } - result.onComplete { - case Success(_) => - reporter.onCompletedWithSuccess(key, elapsedTime(startTime)) - case Failure(e) => - reporter.onCompletedWithFailure(key, e, elapsedTime(startTime)) - } - result - } - - // This should be used carefully because it will overwrite the remote value without - // any lock, which may cause a desynchronization between the local and remote cache on other instances - // Note that if any tryGenerateAndSet is in progress, this will wait until it's finished before setting local/remote - override def set(key: String, value: V)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[Unit] = { - logger.info(s"set, key $key: got a call to overwrite local and remote values") - val startTime = System.nanoTime() - val promise = Promise[TimestampedValue[V]]() - val future = promise.future - def doIt() = { - val tValue = timestamp(value) - localCache.foreach(_.set(key, tValue)) - val result = remoteRW.map(remote => remoteOverwrite(key, tValue, remote, startTime)).getOrElse(Future.successful(tValue)) - promise.completeWith(result) - tempUpdate.remove(key, future) - } - tempUpdate.put(key, future) match { - case null => - doIt() - future.map(_ => ()) - case fTrying => - fTrying.onComplete { case _ => doIt() } - future.map(_ => ()) - } - } - - private def sanityLocalValueCheck(key: String, localValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], genValue: () => Future[V], startTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[V] = { - remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if remoteValue == localValue => - // Remote is the same as local, return any 
of them - Future.fromTry(remoteValue.value) - case Success(Some(remoteValue)) => - // Something is different, try to figure it out - val valuesResult = if (remoteValue.value == localValue.value) "same-value" else "different-values" - val dateResult = if (remoteValue.date.isAfter(localValue.date)) - s"remote-is-newer-than-local" - else if (localValue.date.isAfter(remoteValue.date)) - s"local-is-newer-than-remote" - else if (localValue.date.isEqual(localValue.date)) - "same-date" - else if (localValue.date.withZone(DateTimeZone.UTC).isEqual(localValue.date.withZone(DateTimeZone.UTC))) - "same-date-on-utc" - else - "impossible-dates" - val remoteExpired = remoteValue.hasExpired(ttl, now, ttlCachedErrors) - val localExpired = localValue.hasExpired(ttl, now, ttlCachedErrors) - val finalResult = s"$valuesResult-$dateResult-remote-expired-${remoteExpired}-local-expired-${localExpired}" - logger.warn(s"sanityLocalValueCheck, key $key: got different results for local $localValue and remote $remoteValue ($finalResult)") - reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) - // return remote to keep everyone consistent - Future.fromTry(remoteValue.value) - case Success(None) => - val localExpired = localValue.hasExpired(ttl, now, ttlCachedErrors) - val finalResult = s"missing-remote-local-expired-${localExpired}" - logger.warn(s"sanityLocalValueCheck, key $key: got local $localValue but no remote ($finalResult)") - reporter.onSanityLocalValueCheckFailedResult(key, finalResult, elapsedTime(startTime)) - // Try generate it to keep a behaviour equivalent to remote only - for { - tsv <- tryGenerateAndSet(key, genValue, startTime) - value <- Future.fromTry(tsv.value) - } yield value - - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(startTime)) - logger.warn(s"sanityLocalValueCheck, key: $key failed to get remote", e) - Future.fromTry(localValue.value) - } - } - - // Overwrite remote value without lock, retrying on error - 
private def remoteOverwrite(key: String, calculatedValue: TimestampedValue[V], remote: RemoteCacheRW[TimestampedValue[V]], nanoStartTime: Long, currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { - remote.set(key, calculatedValue).asTry().flatMap { - case Success(_) => - reporter.onSuccessfullyRemoteSetValue(key, elapsedTime(nanoStartTime)) - logger.info(s"remoteForceSet successfully overwritten key $key") - Future.successful(calculatedValue) - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteForceSet, key $key: got error setting the value, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) - // Retry failure - after(backoffOnError, scheduler) { - remoteOverwrite(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) - } - } - } - - - // Note: this method may return a failed future, but it will never cache it - // Our main purpose here is to avoid multiple local calls to generate new promises/futures in parallel, - // so we use this Map keep everyone in sync - // This is similar to how spray cache works - private def tryGenerateAndSet(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { - val promise = Promise[TimestampedValue[V]]() - val future = promise.future - tempUpdate.putIfAbsent(key, future) match { - case null => - logger.info(s"tryGenerateAndSet, key $key: got request for generating and none in progress found, calling canonicalValueGenerator") - canonicalValueGenerator(key, genValue, nanoStartTime).onComplete { - case Success(v) if !v.hasExpired(ttl, now, ttlCachedErrors) => - reporter.onGeneratedWithSuccess(key, elapsedTime(nanoStartTime)) - localCache.foreach(_.set(key, v)) - promise.trySuccess(v) - tempUpdate.remove(key, future) - case Success(v) => - // Have we generated/got an expired value!? 
- reporter.onUnexpectedBehaviour(key, elapsedTime(nanoStartTime)) - logger.warn(s"tryGenerateAndSet, key $key: unexpectedly generated/got an expired value: $v") - localCache.foreach(_.set(key, v)) - promise.trySuccess(v) - tempUpdate.remove(key, future) - case Failure(e) => - // We don't save failures to cache - // There is no need to log here, canonicalValueGenerator will log everything already - reporter.onGeneratedWithFailure(key, e, elapsedTime(nanoStartTime)) - promise.tryFailure(e) - tempUpdate.remove(key, future) - } - future - case fTrying => - // If someone call us while a future is running, we return the running future - logger.info(s"tryGenerateAndSet, key $key: got request for generating but an existing one is current in progress") - fTrying - } - } - - // This can be called by multiple instances/hosts simultaneously but in the end - // only the one that wins the race will create the final value that will be set in - // the remote cache and read by the other instances - // Unless of course there is some error getting stuff from remote cache - // in which case the locally generated value may be returned to avoid further delays - protected def canonicalValueGenerator(key: String, genValue: () => Future[V], nanoStartTime: Long)(implicit ec: ExecutionContext, scheduler: Scheduler) = { - val fGeneratedValue = Try { genValue().map(timestamp) }.asFutureTry() - val finalValue: Future[TimestampedValue[V]] = fGeneratedValue.flatMap { - case Success(generatedValue) => - // Successfully generated value, try to set it in the remote writable cache - remoteRW match { - // No remote cache available, just return this value to be set on local cache - case None => - Future.successful(generatedValue) - case Some(remote) => - remoteSetOrGet(key, generatedValue, remote, nanoStartTime) - } - case Failure(eLocal) => - // We failed to generate the value ourselves, our hope is if someone else successfully did it in the meantime - reporter.onErrorGeneratingValue(key, eLocal, 
elapsedTime(nanoStartTime)) - remoteRW match { - case None => - // There are no remote RW caches - logger.error(s"canonicalValueGenerator, key $key: failed to generate value and no remote cache configured", eLocal) - eLocal match { - case NonFatal(e) => { - if (cacheErrors) { - // if error was NonFatal Error then saves it to cache - val timestampedValue: TimestampedValue[V] = timestampError(e) - // Saved it only in localCache - localCache.foreach(_.set(key, timestampedValue)) - } - Future.failed(eLocal) - } - case _ => Future.failed(eLocal) - } - case Some(remote) => - remoteGetNonExpiredValue(key, remote, nanoStartTime).asTry().flatMap { - case Success(v) => - logger.warn(s"canonicalValueGenerator, key $key: failed to generate value but got one from remote", eLocal) - Future.successful(v) - case Failure(eRemote) => - // The real error is the eLocal, return it - logger.error(s"canonicalValueGenerator, key $key: failed to generate value and failed to get remote", eLocal) - eLocal match { - case NonFatal(e) => { - if (cacheErrors) { - // if error was NonFatal Error then saves it to cache - val timestampedValue = timestampError(e) - // Saved it only in localCache - localCache.foreach(_.set(key, timestampedValue)) - } - Future.failed(eLocal) - } - case _ => Future.failed(eLocal) - } - } - } - } - finalValue - } - - // Auxiliary method, only makes sense to be used by canonicalValueGenerator - private def remoteGetNonExpiredValue(key: String, - remote: RemoteCacheRW[TimestampedValue[V]], - nanoStartTime: Long, - currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { - remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => - logger.info(s"remoteGetNonExpiredValue, key $key: got a good value") - Future.successful(remoteValue) - case Success(_) => - Future.failed(new Exception("No good value found on remote")) - case Failure(e) => - if (currentRetry >= 
maxErrorsToRetryOnRemote) { - reporter.onRemoteGiveUp(key, elapsedTime(nanoStartTime)) - logger.error(s"remoteGetNonExpiredValue, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors", e) - Future.failed(e) - } else { - reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteGetNonExpiredValue, key $key: got error trying to get value, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) - // Retry - after(backoffOnError, scheduler) { - remoteGetNonExpiredValue(key, remote, nanoStartTime, currentRetry = currentRetry + 1) - } - } - } - } - - // This methods tries to guarantee that everyone that calls it in - // a given moment will be left with the same value in the end - private def remoteSetOrGet(key: String, - calculatedValue: TimestampedValue[V], - remote: RemoteCacheRW[TimestampedValue[V]], - nanoStartTime: Long, - currentRetry: Int = 0)(implicit ec: ExecutionContext, scheduler: Scheduler): Future[TimestampedValue[V]] = { - if (currentRetry > maxErrorsToRetryOnRemote) { - // Use our calculated value as it's the best we can do - reporter.onRemoteGiveUp(key, elapsedTime(nanoStartTime)) - logger.error(s"remoteSetOrGet, key $key: returning calculated value because we got more than $maxErrorsToRetryOnRemote errors") - Future.successful(calculatedValue) - } else { - remote.setLock(remoteLockKey(key), remoteLockTTL).asTry().flatMap { - case Success(true) => - logger.info(s"remoteSetOrGet got lock for key $key") - // Lock acquired, get the current value and replace it - remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => - // Current value is good, just return it - reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) - logger.info(s"remoteSetOrGet got lock for $key but found already a good value on remote") - Future.successful(remoteValue) - case Success(_) => - // The remote value is missing or has 
expired. This is what we were expecting - // We have the lock to replace this value. Our calculated value will be the canonical one! - remote.set(key, calculatedValue).asTry().flatMap { - case Success(_) => - // Flawless victory! - reporter.onSuccessfullyRemoteSetValue(key, elapsedTime(nanoStartTime)) - logger.info(s"remoteSetOrGet successfully set key $key while under lock") - Future.successful(calculatedValue) - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error setting the value, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) - // Retry failure - after(backoffOnError, scheduler) { - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) - } - } - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error getting remote value with lock, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) - // Retry failure - after(backoffOnError, scheduler) { - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) - } - } - case Success(false) => - // Someone got the lock, let's take a look at the value - remote.get(key).asTry().flatMap { - case Success(Some(remoteValue)) if !remoteValue.hasExpired(ttl, now, ttlCachedErrors) => - // Current value is good, just return it - logger.info(s"remoteSetOrGet couldn't lock key $key but found a good on remote afterwards") - reporter.onRemoteCacheHitAfterGenerating(key, elapsedTime(nanoStartTime)) - Future.successful(remoteValue) - case Success(_) => - // The value is missing or has expired - // Let's start from scratch because we need to be able to set or get a good value - // Note: do not increment retry because this isn't an error - reporter.onStillTryingToLockOrGet(key, elapsedTime(nanoStartTime)) - logger.info(s"remoteSetOrGet couldn't lock key $key and didn't found good value 
on remote, scheduling retry") - after(backoffOnLockAcquire, scheduler) { - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry) - } - case Failure(e) => - reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error getting remote value without lock, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) - // Retry - after(backoffOnError, scheduler) { - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) - } - } - case Failure(e) => - // Retry failure - reporter.onRemoteError(key, e, elapsedTime(nanoStartTime)) - logger.warn(s"remoteSetOrGet, key $key: got error trying to set lock, scheduling retry $currentRetry of $maxErrorsToRetryOnRemote", e) - after(backoffOnError, scheduler) { - remoteSetOrGet(key, calculatedValue, remote, nanoStartTime, currentRetry = currentRetry + 1) - } - } - } - } - - -} \ No newline at end of file diff --git a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala b/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala deleted file mode 100644 index 6868f0b7..00000000 --- a/src/main/scala/ignition/core/http/AsyncHttpClientStreamApi.scala +++ /dev/null @@ -1,89 +0,0 @@ -package ignition.core.http - -import java.io.InputStream -import java.util.concurrent.TimeUnit - -import akka.util.Timeout -import ignition.core.utils.URLUtils -import spray.http._ - -import scala.concurrent.Future -import scala.concurrent.duration._ -import scala.language.postfixOps -import scala.util.Try - -object AsyncHttpClientStreamApi { - - case class Credentials(user: String, password: String) { - def isEmpty = user.isEmpty && password.isEmpty - - def toOption = Some(this).filter(!_.isEmpty) - } - - object Credentials { - val empty = Credentials("", "") - } - - // TODO: return a stream is dangerous because implies into a lock - case class StreamResponse(status: Int, content: InputStream) - - // If any value is 
None, it will fallback to the implementation's default - object RequestConfiguration { - val defaultMaxRedirects: Int = 15 - val defaultMaxConnectionsPerHost: Int = 500 - val defaultPipelining: Boolean = false - val defaultIdleTimeout: FiniteDuration = Duration(30, TimeUnit.SECONDS) - val defaultRequestTimeout: FiniteDuration = Duration(20, TimeUnit.SECONDS) - val defaultConnectingTimeout: FiniteDuration = Duration(10, TimeUnit.SECONDS) - } - - case class RequestConfiguration(maxRedirects: Option[Int] = Option(RequestConfiguration.defaultMaxRedirects), - maxConnectionsPerHost: Option[Int] = Option(RequestConfiguration.defaultMaxConnectionsPerHost), - pipelining: Option[Boolean] = Option(RequestConfiguration.defaultPipelining), - idleTimeout: Option[Duration] = Option(RequestConfiguration.defaultIdleTimeout), - requestTimeout: Option[Duration] = Option(RequestConfiguration.defaultRequestTimeout), - connectingTimeout: Option[Duration] = Option(RequestConfiguration.defaultConnectingTimeout)) - - case class Request(url: String, - params: Map[String, String] = Map.empty, - credentials: Option[Credentials] = None, - method: HttpMethod = HttpMethods.GET, - body: HttpEntity = HttpEntity.Empty, - headers: List[HttpHeader] = List.empty, - requestConfiguration: Option[RequestConfiguration] = None) { - - def uri: Uri = { - if (params.nonEmpty) - URLUtils.parseUri(url).map(_.withQuery(params)).get - else - URLUtils.parseUri(url).get - } - } - - case class RequestException(message: String, response: StreamResponse) extends RuntimeException(message) - - object NoOpReporter extends ReporterCallback { - def onRequest(request: Request): Unit = {} - def onResponse(request: Request, status: Int): Unit = {} - def onFailure(request: Request, status: Int): Unit = {} - def onRetry(request: Request): Unit = {} - def onGiveUp(request: Request): Unit = {} - def onError(request: Request, error: Any): Unit = {} - } - - abstract class ReporterCallback { - def onRequest(request: Request): Unit - 
def onResponse(request: Request, status: Int): Unit - def onFailure(request: Request, status: Int): Unit - def onRetry(request: Request): Unit - def onGiveUp(request: Request): Unit - def onError(request: Request, error: Any): Unit - } -} - -trait AsyncHttpClientStreamApi { - - def makeRequest(request: AsyncHttpClientStreamApi.Request, retryConf: RetryConf = RetryConf(), retryOnHttpStatus: Seq[Int] = List.empty) - (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback = AsyncHttpClientStreamApi.NoOpReporter): Future[AsyncHttpClientStreamApi.StreamResponse] - -} \ No newline at end of file diff --git a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala b/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala deleted file mode 100644 index af40c25a..00000000 --- a/src/main/scala/ignition/core/http/AsyncSprayHttpClient.scala +++ /dev/null @@ -1,297 +0,0 @@ -package ignition.core.http - -import java.util.concurrent.TimeoutException - -import akka.actor._ -import akka.io.IO -import akka.pattern.ask -import akka.util.Timeout -import ignition.core.http.AsyncHttpClientStreamApi.{Request, RequestConfiguration} -import spray.can.Http -import spray.can.Http.HostConnectorSetup -import spray.can.client.{ClientConnectionSettings, HostConnectorSettings} -import spray.http.HttpHeaders.Authorization -import spray.http.StatusCodes.Redirection -import spray.http._ - -import scala.concurrent.{ExecutionContext, Future} -import scala.language.postfixOps -import scala.util.control.NonFatal - - -trait AsyncSprayHttpClient extends AsyncHttpClientStreamApi { - - implicit def actorRefFactory: ActorRefFactory - def executionContext: ExecutionContext = actorRefFactory.dispatcher - - override def makeRequest(request: AsyncHttpClientStreamApi.Request, retryConf: RetryConf, retryOnHttpStatus: Seq[Int]) - (implicit timeout: Timeout, reporter: AsyncHttpClientStreamApi.ReporterCallback): Future[AsyncHttpClientStreamApi.StreamResponse] = { - val processor = 
actorRefFactory.actorOf(Props(new RequestProcessorActor(timeout, reporter, retryConf, retryOnHttpStatus))) - (processor ? request).mapTo[AsyncHttpClientStreamApi.StreamResponse] - } - - private class RequestProcessorActor(timeout: Timeout, - reporter: AsyncHttpClientStreamApi.ReporterCallback, - retryConf: RetryConf, - retryOnHttpStatus: Seq[Int]) - extends Actor with ActorLogging { - - - import context.system - - import scala.language.implicitConversions - - def isRedirection(status: StatusCode): Boolean = status match { - case r: Redirection => true - case _ => false - } - - private implicit def toAuthHeader(credentials: AsyncHttpClientStreamApi.Credentials): List[Authorization] = - List(Authorization(credentials = BasicHttpCredentials(username = credentials.user, password = credentials.password))) - - private def toSprayRequest(request: Request): HttpRequest = request match { - case Request(_, params, Some(credentials), method, body, headers, _) => - HttpRequest(method = method, uri = request.uri, headers = credentials ++ headers, entity = body) - - case Request(_, params, None, method, body, headers, _) => - HttpRequest(method = method, uri = request.uri, entity = body, headers = headers) - } - - private def toSprayHostConnectorSetup(uri: Uri, conf: Option[AsyncHttpClientStreamApi.RequestConfiguration]): HostConnectorSetup = { - // Create based on defaults, change some of them - val ccs: ClientConnectionSettings = ClientConnectionSettings(system) - val hcs: HostConnectorSettings = HostConnectorSettings(system) - - val updatedCcs = ccs.copy( - responseChunkAggregationLimit = 0, // makes our client ineffective if non zero - idleTimeout = conf.flatMap(_.idleTimeout).getOrElse(ccs.idleTimeout), - connectingTimeout = conf.flatMap(_.connectingTimeout).getOrElse(ccs.connectingTimeout), - requestTimeout = conf.flatMap(_.requestTimeout).getOrElse(ccs.requestTimeout) - ) - - val maxConnections = conf.flatMap(_.maxConnectionsPerHost).getOrElse { - // Let's avoid someone 
shoot his own foot - if (hcs.maxConnections == 4) // Spray's default is stupidly low - // Use the API's default, which is more reasonable - RequestConfiguration.defaultMaxConnectionsPerHost - else - // If the conf is the non-default value, then someone know what he's doing. use that configured value - hcs.maxConnections - } - - val updatedHcs = hcs.copy( - connectionSettings = updatedCcs, - maxRetries = 0, // We have our own retry mechanism - maxRedirects = 0, // We do our own redirect following - maxConnections = maxConnections, - pipelining = conf.flatMap(_.pipelining).getOrElse(hcs.pipelining) - ) - - val host = uri.authority.host - HostConnectorSetup(host.toString, uri.effectivePort, sslEncryption = uri.scheme == "https", settings = Option(updatedHcs)) - } - - private def executeSprayRequest(request: Request): Unit = { - val message = (toSprayRequest(request), toSprayHostConnectorSetup(request.uri, request.requestConfiguration)) - IO(Http) ! message - } - - def handleErrors(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, remainingRedirects: Int): Receive = { - case ev @ Http.SendFailed(_) => - log.debug("Communication error, cause: {}", ev) - reporter.onError(request, ev) - storage.close() - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - - case ev @ Timedout(_) => - log.debug("Communication error, cause: {}", ev) - reporter.onError(request, ev) - storage.close() - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onTimeout - - case Status.Failure(NonFatal(exception)) => - reporter.onError(request, exception) - storage.close() - exception match { - case ex: Http.RequestTimeoutException => - log.warning("Request {} timeout, details: {}", request, ex.getMessage) - context.become(retrying(commander, request, remainingRedirects)) - self ! 
retry.onTimeout - - case ex: Http.ConnectionException => - log.warning("Connection error on {}, details: {}", request, ex.getMessage) - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - - case unknownException => - log.error(unknownException, "Unknown error on {}", request) - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - } - - case unknownMessage => - log.debug("Unknown message: {}", unknownMessage) - reporter.onError(request, unknownMessage) - storage.close() - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - } - - def receive: Receive = { - case request: Request => - log.debug("Starting request {}", request) - reporter.onRequest(request) - executeSprayRequest(request) - val retry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, conf = retryConf) - val storage = new ByteStorage() - val maxRedirects = - request.requestConfiguration.flatMap(_.maxRedirects).getOrElse(RequestConfiguration.defaultMaxRedirects) - context.become(waitingForResponse(sender, request, retry, storage, maxRedirects) - .orElse(handleErrors(sender, request, retry, storage, maxRedirects))) - } - - def retrying(commander: ActorRef, request: Request, remainingRedirects: Int): Receive = { - case retry: Retry => - if (retry.shouldGiveUp) { - reporter.onGiveUp(request) - log.warning("Error to get {}, no more retries {}, accepting failure", request, retry) - commander ! 
Status.Failure(new TimeoutException(s"Failed to get '${request.url}'")) - context.stop(self) - } else { - reporter.onRetry(request) - log.info("Retrying {}, retry status {}, backing off for {} millis", request, retry, retry.backoff.toMillis) - system.scheduler.scheduleOnce(retry.backoff) { - log.debug("Waking from backoff, retrying request {}", request) - executeSprayRequest(request) - }(executionContext) - val storage = new ByteStorage() - context.become(waitingForResponse(commander, request, retry, storage, remainingRedirects) - .orElse(handleErrors(commander, request, retry, storage, remainingRedirects))) - } - } - - def waitingForResponse(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, remainingRedirects: Int): Receive = { - case response@HttpResponse(status, entity, headers, _) => try { - storage.write(response.entity.data.toByteArray) - if (isRedirection(status)) - handleRedirect(commander, storage, retry, request, status, response, remainingRedirects) - else if (status.isSuccess) { - reporter.onResponse(request, status.intValue) - commander ! Status.Success(AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream())) - context.stop(self) - } else if (retryOnHttpStatus.contains(status.intValue)) { - storage.close() - log.debug("HttpResponse: Status {}, retrying...", status) - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - } else { - val message = s"HTTP response status ${status.intValue}, on request ${request}, ${status.defaultMessage}" - log.debug("HttpResponse: {}", message) - reporter.onFailure(request, status.intValue) - reporter.onGiveUp(request) - commander ! 
Status.Failure(new AsyncHttpClientStreamApi.RequestException(message = message, - response = AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream()))) - context.stop(self) - } - } catch { - case NonFatal(ex) => - storage.close() - log.error(ex, "HttpResponse: Failure on creating HttpResponse") - reporter.onError(request, ex) - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - } - - case chunkStart@ChunkedResponseStart(HttpResponse(status, entity, headers, _)) => try { - storage.write(entity.data.toByteArray) - if (isRedirection(status)) - handleRedirect(commander, storage, retry, request, status, chunkStart, remainingRedirects) - else if (status.isSuccess) { - context.become(accumulateChunks(commander, request, retry, storage, status, remainingRedirects) - .orElse(handleErrors(commander, request, retry, storage, remainingRedirects))) - } else if (retryOnHttpStatus.contains(status.intValue)) { - storage.close() - log.debug("ChunkedResponseStart: Status {}, retrying...", status) - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - } else { - val message = s"HTTP response status ${status.intValue}, on request ${request}, ${status.defaultMessage}" - log.debug("ChunkedResponseStart: {}", message) - reporter.onFailure(request, status.intValue) - reporter.onGiveUp(request) - commander ! Status.Failure(new AsyncHttpClientStreamApi.RequestException(message = message, - response = AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream()))) - context.stop(self) - } - } catch { - case NonFatal(ex) => - log.error(ex, "ChunkedResponseStart: Failure on creating ChunkedHttpResponse") - reporter.onError(request, ex) - context.become(retrying(commander, request, remainingRedirects)) - self ! 
retry.onError - } - } - - def accumulateChunks(commander: ActorRef, request: Request, retry: Retry, storage: ByteStorage, status: StatusCode, remainingRedirects: Int): Receive = { - case message@MessageChunk(data, _) => try { - storage.write(data.toByteArray) - } catch { - case NonFatal(ex) => - storage.close() - log.error(ex, "MessageChunk: Failure on accumulate chunk data") - reporter.onError(request, ex) - context.become(retrying(commander, request, remainingRedirects)) - self ! retry.onError - } - - case chunkEnd: ChunkedMessageEnd => - log.debug("ChunkedMessageEnd: all data was received for request {}, status {}", request, status) - reporter.onResponse(request, status.intValue) - commander ! Status.Success(AsyncHttpClientStreamApi.StreamResponse(status.intValue, storage.getInputStream())) - context.stop(self) - } - - def handleRedirect(commander: ActorRef, oldStorage: ByteStorage, oldRetry: Retry, oldRequest: Request, status: StatusCode, rawResponse: HttpResponsePart, remainingRedirects: Int): Unit = { - if (remainingRedirects <= 0) { - val message = s"HandleRedirect: exceeded redirection limit on $oldRequest with status $status" - log.warning(message) - reporter.onGiveUp(oldRequest) - commander ! 
Status.Failure(new Exception(message)) - context.stop(self) - } else { - def makeRequest(headers: List[HttpHeader]): Receive = { - oldStorage.close() - val newRemainingRedirects = remainingRedirects - 1 - headers.find(_.is("location")).map(_.value).map { newLocation => - log.debug("Making redirect to {}", newLocation) - val newRequest = oldRequest.copy(url = newLocation) - executeSprayRequest(newRequest) - val newRetry = Retry(startTime = org.joda.time.DateTime.now, timeout = timeout.duration, conf = retryConf) - val newStorage = new ByteStorage() - waitingForResponse(commander, newRequest, newRetry, newStorage, newRemainingRedirects) - .orElse(handleErrors(commander, newRequest, newRetry, newStorage, newRemainingRedirects)) - }.getOrElse { - log.warning("Received redirect for request {} with headers {} without location, retrying...", oldRequest, headers) - retrying(commander, oldRequest, newRemainingRedirects) - } - } - context.become(rawResponse match { - case response@HttpResponse(status, entity, headers, _) => - makeRequest(headers) - case chunkStart@ChunkedResponseStart(HttpResponse(status, entity, headers, _)) => { - case message@MessageChunk(data, _) => - // do nothing - case chunkEnd: ChunkedMessageEnd => - context.become(makeRequest(headers)) - } - case other => - throw new Exception(s"Bug, called on $other") - }) - } - } - - } - -} diff --git a/src/main/scala/ignition/core/http/ByteStorage.scala b/src/main/scala/ignition/core/http/ByteStorage.scala deleted file mode 100644 index c137a5fe..00000000 --- a/src/main/scala/ignition/core/http/ByteStorage.scala +++ /dev/null @@ -1,114 +0,0 @@ -package ignition.core.http - -import java.io._ -import java.nio.file.{Files, Paths} -import java.util.UUID - -import org.slf4j.LoggerFactory - -import scala.util.control.NonFatal -import scala.util.{Failure, Success, Try} - -class ByteStorage(memoryThreshold: Int = 1024 * 1024 * 5) extends AutoCloseable { - - lazy val log = LoggerFactory.getLogger(getClass) - - lazy val 
tempDirPath = Files.createDirectories(Paths.get(System.getProperty("java.io.tmpdir"), "ByteStorage")) - - lazy val buffer = new ByteArrayOutputStream - - var fileStorage: Option[(File, FileOutputStream)] = None - - def write(bytes: Array[Byte]): Unit = try { - if (fileStorage.isDefined) { - writeOnFile(bytes) - } else if (buffer.size() + bytes.length > memoryThreshold) { - log.debug("Memory threshold {} reach, going to file storage", memoryThreshold) - setupFileStorage() - writeOnFile(buffer.toByteArray) - writeOnFile(bytes) - // on ByteArrayOutputStream close() takes not effect, - // but if we change the buffer impl this is the a good moment to free resources - buffer.close() - } else { - buffer.write(bytes) - } - } catch { - case NonFatal(ex) => - close() - throw ex - } - - override def close(): Unit = fileStorage match { - case Some((file, outputStream)) => try { - log.debug("Cleaning up temp file {}", file.getAbsolutePath) - outputStream.close() - file.delete() - } catch { - case NonFatal(ex) => log.warn(s"Fail to cleanup temp file ${file.getAbsolutePath}", ex) - } - case None => - log.debug("Cleaning up memory buffer") - buffer.close() - } - - private def setupFileStorage(): Unit = if (fileStorage.isEmpty) { - tryCreateTempFile match { - case Success(storage) => fileStorage = Option(storage) - case Failure(ex) => throw ex - } - } else { - throw new IllegalStateException("File storage already setup") - } - - private def tryCreateTempFile: Try[(File, FileOutputStream)] = Try { - val tempFile = File.createTempFile(s"temp_byte_storage_${UUID.randomUUID().toString}", ".temp", tempDirPath.toFile) - tempFile.deleteOnExit() - log.debug("Creating temp file {}", tempFile.getAbsolutePath) - (tempFile, new FileOutputStream(tempFile)) - } - - private def writeOnFile(bytes: Array[Byte]): Unit = fileStorage match { - case Some((_, outputStream)) => outputStream.write(bytes) - case None => throw new IllegalStateException("File storage not initialized") - } - - def 
getInputStream(): InputStream = fileStorage match { - case Some((file, outputStream)) => try { - outputStream.close() - new DeleteOnCloseFileInputStream(file) - } catch { - case NonFatal(ex) => - log.error("Fail to create InputStream", ex) - close() - throw ex - } - case None => new ByteArrayInputStream(buffer.toByteArray) - } - - override def finalize() = try { - fileStorage match { - case Some((file, outputStream)) => - log.debug("Cleaning up temp file {}", file.getAbsolutePath) - outputStream.close() - file.delete() - case None => - } - } finally { - super.finalize() - } - -} - -private class DeleteOnCloseFileInputStream(file: File) extends FileInputStream(file) { - lazy val log = LoggerFactory.getLogger(getClass) - override def close() = try { - log.debug("Cleaning up file {}", file.getAbsolutePath) - file.delete() - } catch { - case NonFatal(ex) => - log.warn(s"Failed to clean up file ${file.getAbsolutePath}", ex) - } finally { - super.close() - } -} \ No newline at end of file diff --git a/src/main/scala/ignition/core/http/Caching.scala b/src/main/scala/ignition/core/http/Caching.scala deleted file mode 100644 index 112791a5..00000000 --- a/src/main/scala/ignition/core/http/Caching.scala +++ /dev/null @@ -1,22 +0,0 @@ -package ignition.core.http - -import org.slf4j.LoggerFactory -import spray.caching.Cache - -import scala.concurrent._ -import scala.util.Failure - -trait Caching[T] { - val log = LoggerFactory.getLogger(classOf[Caching[T]]) - - val cache: Cache[T] - import ExecutionContext.Implicits.global - def fetchCache[K](key: K, f: () => Future[T]): Future[T] = cache(key) { - f.apply andThen { - case Failure(e) => { - cache.remove(key) - log.info(s"Removed $key from cache due to an exception: $e") - } - } - } -} diff --git a/src/main/scala/ignition/core/http/Retry.scala b/src/main/scala/ignition/core/http/Retry.scala deleted file mode 100644 index 1c94828b..00000000 --- a/src/main/scala/ignition/core/http/Retry.scala +++ /dev/null @@ -1,84 +0,0 @@ -package 
ignition.core.http - -import java.util.concurrent.TimeUnit - -import org.joda.time.DateTime - -import scala.concurrent.duration.{Duration, FiniteDuration, _} -import scala.language.postfixOps -import scala.util.Random - -object Retry { - - sealed trait State - case object Timeout extends State - case object Error extends State - - def exponentialBackOff(base: Int, - exponent: Int, - initialBackoff: FiniteDuration, - maxBackoff: FiniteDuration, - maxRandom: FiniteDuration): FiniteDuration = { - val randomMillis = maxRandom.toMillis.toInt - val random = if (randomMillis > 0) - FiniteDuration(Random.nextInt(randomMillis), TimeUnit.MILLISECONDS) - else - FiniteDuration(0, TimeUnit.MILLISECONDS) - - val calculated = scala.math.pow(base, exponent).round * (random + initialBackoff) - calculated.min(maxBackoff) - } - -} - -case class RetryConf(initialTimeoutBackoff: FiniteDuration = 100 milliseconds, - maxErrors: Int = 10, - initialBackoffOnError: FiniteDuration = 100 milliseconds, - timeoutMultiplicationFactor: Int = 2, - errorMultiplicationFactor: Int = 2, - maxBackoff: FiniteDuration = 1 minute, - maxRandom: FiniteDuration = 30 milliseconds) - -case class Retry(conf: RetryConf, - startTime: DateTime, - timeout: FiniteDuration, - state: Retry.State = Retry.Timeout, - timeoutCount: Int = 0, - errorsCount: Int = 0) { - - import Retry._ - - protected def now = DateTime.now - - private def errorBackoff = - exponentialBackOff(conf.errorMultiplicationFactor, Math.max(errorsCount - 1, 0), conf.initialBackoffOnError, conf.maxBackoff, conf.maxRandom) - private def timeoutBackoff = - exponentialBackOff(conf.timeoutMultiplicationFactor, Math.max(timeoutCount - 1, 0), conf.initialTimeoutBackoff, conf.maxBackoff, conf.maxRandom) - - def onError(): Retry = - copy(errorsCount = errorsCount + 1, state = Retry.Error) - - def onTimeout(): Retry = copy(timeoutCount = timeoutCount + 1, state = Retry.Timeout) - - def backoff(): FiniteDuration = state match { - case Timeout => timeoutBackoff 
- case Error => errorBackoff - } - - private def canRetryMore(durations: FiniteDuration*): Boolean = { - val maxTime = startTime.plusMillis(timeout.toMillis.toInt) - val nextEstimatedTime = now.plusMillis(durations.map(_.toMillis.toInt).sum) - nextEstimatedTime.isBefore(maxTime) - } - - // This is an approximation and we are ignoring the time waiting on backoff. - // In this way we are overestimating the average request duration, which is fine because it's better to abort early than wait too much time exceed AskTimeouts - private def averageRequestDuration = - Duration((now.getMillis - startTime.getMillis) / Math.max(timeoutCount + errorsCount, 1), TimeUnit.MILLISECONDS) - - def shouldGiveUp(): Boolean = state match { - case Timeout => !canRetryMore(averageRequestDuration, timeoutBackoff) - case Error => !canRetryMore(averageRequestDuration, errorBackoff) || errorsCount > conf.maxErrors - } - -} \ No newline at end of file diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 1d06505d..e07ba54f 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -3,7 +3,7 @@ package ignition.core.jobs.utils import java.io.InputStream import com.amazonaws.auth.DefaultAWSCredentialsProviderChain -import com.amazonaws.services.s3.AmazonS3Client +import com.amazonaws.services.s3.{AmazonS3, AmazonS3Builder, AmazonS3Client} import com.amazonaws.services.s3.model.{ListObjectsRequest, ObjectListing, S3ObjectSummary} import ignition.core.utils.CollectionUtils._ import ignition.core.utils.DateUtils._ @@ -49,7 +49,8 @@ object SparkContextUtils { def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) } - private lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new DefaultAWSCredentialsProviderChain()) + private lazy val 
amazonS3ClientFromEnvironmentVariables: AmazonS3 = + AmazonS3Client.builder().withCredentials(new DefaultAWSCredentialsProviderChain()).build() private def close(inputStream: InputStream, path: String): Unit = { try { @@ -453,7 +454,7 @@ object SparkContextUtils { } def s3ListCommonPrefixes(path: S3SplittedPath, delimiter: String = "/") - (implicit s3: AmazonS3Client): Stream[S3SplittedPath] = { + (implicit s3: AmazonS3): Stream[S3SplittedPath] = { def inner(current: ObjectListing): Stream[String] = if (current.isTruncated) { logger.trace(s"list common prefixed truncated for ${path.bucket} ${path.key}: ${current.getCommonPrefixes}") @@ -468,7 +469,7 @@ object SparkContextUtils { } def s3ListObjects(path: S3SplittedPath) - (implicit s3: AmazonS3Client): Stream[S3ObjectSummary] = { + (implicit s3: AmazonS3): Stream[S3ObjectSummary] = { def inner(current: ObjectListing): Stream[S3ObjectSummary] = if (current.isTruncated) { logger.trace(s"list objects truncated for ${path.bucket} ${path.key}: $current") @@ -487,7 +488,7 @@ object SparkContextUtils { inclusiveEndDate: Boolean = true, endDate: Option[DateTime] = None, ignoreHours: Boolean = true) - (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): Stream[WithOptDate[S3SplittedPath]] = { + (implicit s3: AmazonS3, pathDateExtractor: PathDateExtractor): Stream[WithOptDate[S3SplittedPath]] = { def isGoodDate(date: DateTime): Boolean = { val startDateToCompare = startDate.map(date => if (ignoreHours) date.withTimeAtStartOfDay() else date) @@ -529,7 +530,7 @@ object SparkContextUtils { inclusiveEndDate: Boolean, endDate: Option[DateTime], exclusionPattern: Option[String]) - (implicit s3: AmazonS3Client, dateExtractor: PathDateExtractor): Stream[WithOptDate[Array[S3ObjectSummary]]] = { + (implicit s3: AmazonS3, dateExtractor: PathDateExtractor): Stream[WithOptDate[Array[S3ObjectSummary]]] = { S3SplittedPath.from(path) match { diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala 
b/src/main/scala/ignition/core/utils/FutureUtils.scala index 684c950b..f12918db 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,7 +1,5 @@ package ignition.core.utils -import akka.actor.ActorSystem - import scala.concurrent.duration.FiniteDuration import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} import scala.util.control.NonFatal @@ -33,9 +31,6 @@ object FutureUtils { future.map(v => Success(v)).recover { case NonFatal(e) => Failure(e) } } - def withTimeout(timeout: => Throwable)(implicit duration: FiniteDuration, system: ActorSystem): Future[V] = { - Future.firstCompletedOf(Seq(future, akka.pattern.after(duration, system.scheduler)(Future.failed(timeout))(system.dispatcher)))(system.dispatcher) - } } implicit class TryFutureImprovements[V](future: Try[Future[V]]) { diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala deleted file mode 100644 index 020ab6f4..00000000 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ /dev/null @@ -1,62 +0,0 @@ -package ignition.core.utils - -import java.util.Properties - -import org.jets3t.service.impl.rest.httpclient.RestS3Service -import org.jets3t.service.model.{S3Object, StorageObject} -import org.jets3t.service.security.AWSCredentials -import org.jets3t.service.{Constants, Jets3tProperties} - - -class S3Client { - - val jets3tProperties = { - val jets3tProperties = Jets3tProperties.getInstance(Constants.JETS3T_PROPERTIES_FILENAME) - val properties = new Properties() -// properties.put("httpclient.max-connections", "2") // The maximum number of simultaneous connections to allow globally -// properties.put("httpclient.retry-max", "10") // How many times to retry connections when they fail with IO errors -// properties.put("httpclient.socket-timeout-ms", "30000") // How many milliseconds to wait before a connection times out. 0 means infinity. 
- - jets3tProperties.loadAndReplaceProperties(properties, "ignition'") - jets3tProperties - } - - val service = new RestS3Service( - new AWSCredentials(System.getenv("AWS_ACCESS_KEY_ID"), System.getenv("AWS_SECRET_ACCESS_KEY")), - null, null, jets3tProperties - ) - - def writeContent(bucket: String, key: String, content: String, contentType: String = "text/plain"): S3Object = { - val obj = new S3Object(key, content) - obj.setContentType(contentType) - service.putObject(bucket, obj) - } - - def readContent(bucket: String, key: String): S3Object = { - service.getObject(bucket, key, null, null, null, null, null, null) - } - - def list(bucket: String, key: String): Array[StorageObject] = { - service.listObjectsChunked(bucket, key, null, 99999L, null, true).getObjects - } - - def copyFile(sourceBucket: String, sourceKey: String, - destBucket: String, destKey: String, - destContentType: Option[String] = None, - destContentEncoding: Option[String] = None): Unit = { - val destFile = new S3Object(destKey) - val replaceMetaData = destContentType.isDefined || destContentEncoding.isDefined - destContentEncoding.foreach(encoding => destFile.setContentEncoding(encoding)) - destContentType.foreach(contentType => destFile.setContentType(contentType)) - service.copyObject(sourceBucket, sourceKey, destBucket, destFile, replaceMetaData) - } - - def fileExists(bucket: String, key: String): Boolean = { - try { - service.getObjectDetails(bucket, key, null, null, null, null) - true - } catch { - case e: org.jets3t.service.S3ServiceException if e.getResponseCode == 404 => false - } - } -} diff --git a/src/main/scala/ignition/core/utils/TelemetryCache.scala b/src/main/scala/ignition/core/utils/TelemetryCache.scala deleted file mode 100644 index d86f98bc..00000000 --- a/src/main/scala/ignition/core/utils/TelemetryCache.scala +++ /dev/null @@ -1,45 +0,0 @@ -package ignition.core.utils - -import ignition.core.utils.TelemetryCache.TelemetryCacheReporter -import spray.caching.Cache - -import 
scala.concurrent.{ExecutionContext, Future} - -object TelemetryCache { - - def apply[V](cacheName: String, wrapped: Cache[V], reporter: TelemetryCacheReporter): Cache[V] = - new TelemetryCache[V](cacheName, wrapped, reporter) - - trait TelemetryCacheReporter { - def onHit(name: String): Unit - def onMiss(name: String): Unit - } - -} - -class TelemetryCache[V](cacheName: String, wrapped: Cache[V], reporter: TelemetryCacheReporter) extends Cache[V] { - - override def apply(key: Any, genValue: () => Future[V])(implicit ec: ExecutionContext): Future[V] = { - val value = wrapped.get(key) - if (value.isDefined) { - reporter.onHit(cacheName) - value.get - } else { - reporter.onMiss(cacheName) - wrapped.apply(key, genValue) - } - } - - override def get(key: Any): Option[Future[V]] = wrapped.get(key) - - override def clear(): Unit = wrapped.clear() - - override def size: Int = wrapped.size - - override def remove(key: Any): Option[Future[V]] = wrapped.remove(key) - - override def keys: Set[Any] = wrapped.keys - - override def ascendingKeys(limit: Option[Int]): Iterator[Any] = wrapped.ascendingKeys(limit) - -} diff --git a/src/main/scala/ignition/core/utils/URLUtils.scala b/src/main/scala/ignition/core/utils/URLUtils.scala index f66a3f03..4a0ae28c 100644 --- a/src/main/scala/ignition/core/utils/URLUtils.scala +++ b/src/main/scala/ignition/core/utils/URLUtils.scala @@ -1,10 +1,8 @@ package ignition.core.utils -import java.net.{URL, URLDecoder, URLEncoder} +import java.net.{URLDecoder, URLEncoder} import org.apache.http.client.utils.URIBuilder -import spray.http.Uri -import spray.http.Uri.Query import scala.util.Try @@ -18,23 +16,6 @@ object URLUtils { def sanitizePathSegment(segment: String): Try[String] = Try { URLEncoder.encode(URLDecoder.decode(segment, "UTF-8"), "UTF-8").replace("+", "%20") } - def parseUri(urlStr: String): Try[Uri] = { - for { - url <- Try(new URL(urlStr)) - rawSegments = url.getPath.split("/") - saneSegments = rawSegments.map(sanitizePathSegment) - if 
saneSegments.forall(_.isSuccess) - sanePath = saneSegments.map(_.get).mkString("/") - } yield Uri.from( - scheme = url.getProtocol, - userinfo = Option(url.getUserInfo).getOrElse(""), - host = url.getHost, - port = Seq(url.getPort, 0).max, - path = sanePath, - query = Query(Option(url.getQuery)), - fragment = Option(url.getRef)) - } - def addParametersToUrl(url: String, partnerParams: Map[String, String]): String = { val builder = new URIBuilder(url.trim) partnerParams.foreach { case (k, v) => builder.addParameter(k, v) } diff --git a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala b/src/main/scala/spray/cache/ExpiringLruLocalCache.scala deleted file mode 100644 index 9fa476f9..00000000 --- a/src/main/scala/spray/cache/ExpiringLruLocalCache.scala +++ /dev/null @@ -1,134 +0,0 @@ -// Note: -// For ignition.core we added two methods to satisfy ExpiringMultipleLevelCache.LocalCache[V] - -/* - * Copyright © 2011-2013 the spray project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package spray.caching - -import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap -import spray.util.Timestamp - -import scala.annotation.tailrec -import scala.collection.JavaConverters._ -import scala.concurrent.duration.Duration -import scala.concurrent.{ExecutionContext, Future, Promise} -import scala.util.{Failure, Success} - -final class ExpiringLruLocalCache[V](maxCapacity: Long, - initialCapacity: Int = 16, - timeToLive: Duration = Duration.Inf, - timeToIdle: Duration = Duration.Inf) extends Cache[V] with ignition.core.cache.ExpiringMultiLevelCache.LocalCache[V] { - require(!timeToLive.isFinite || !timeToIdle.isFinite || timeToLive > timeToIdle, - s"timeToLive($timeToLive) must be greater than timeToIdle($timeToIdle)") - - private[caching] val store = new ConcurrentLinkedHashMap.Builder[Any, Entry[V]] - .initialCapacity(initialCapacity) - .maximumWeightedCapacity(maxCapacity) - .build() - - @tailrec - def get(key: Any): Option[Future[V]] = store.get(key) match { - case null ⇒ None - case entry if (isAlive(entry)) ⇒ - entry.refresh() - Some(entry.future) - case entry ⇒ - // remove entry, but only if it hasn't been removed and reinserted in the meantime - if (store.remove(key, entry)) None // successfully removed - else get(key) // nope, try again - } - - def apply(key: Any, genValue: () ⇒ Future[V])(implicit ec: ExecutionContext): Future[V] = { - def insert() = { - val newEntry = new Entry(Promise[V]()) - val valueFuture = - store.put(key, newEntry) match { - case null ⇒ genValue() - case entry ⇒ - if (isAlive(entry)) { - // we date back the new entry we just inserted - // in the meantime someone might have already seen the too fresh timestamp we just put in, - // but since the original entry is also still alive this doesn't matter - newEntry.created = entry.created - entry.future - } else genValue() - } - valueFuture.onComplete { value ⇒ - newEntry.promise.tryComplete(value) - // in case of exceptions we remove the cache entry (i.e. 
try again later) - if (value.isFailure) store.remove(key, newEntry) - } - newEntry.promise.future - } - store.get(key) match { - case null ⇒ insert() - case entry if (isAlive(entry)) ⇒ - entry.refresh() - entry.future - case entry ⇒ insert() - } - } - - def remove(key: Any) = store.remove(key) match { - case null ⇒ None - case entry if (isAlive(entry)) ⇒ Some(entry.future) - case entry ⇒ None - } - - def clear(): Unit = { store.clear() } - - def keys: Set[Any] = store.keySet().asScala.toSet - - def ascendingKeys(limit: Option[Int] = None) = - limit.map { lim ⇒ store.ascendingKeySetWithLimit(lim) } - .getOrElse(store.ascendingKeySet()) - .iterator().asScala - - def size = store.size - - private def isAlive(entry: Entry[V]) = - (entry.created + timeToLive).isFuture && - (entry.lastAccessed + timeToIdle).isFuture - - // Method required by ExpiringMultipleLevelCache.LocalCache - override def set(key: Any, value: V): Unit = { - val newEntry = new Entry(Promise[V]()) - newEntry.promise.trySuccess(value) - store.put(key, newEntry) match { - case null => - // Nothing to do - case oldEntry => - // If the old promise is pending, complete it with our future - oldEntry.promise.trySuccess(value) - } - } -} - -private[caching] class ExpiringLruLocalCacheEntry[T](val promise: Promise[T]) { - @volatile var created = Timestamp.now - @volatile var lastAccessed = Timestamp.now - def future = promise.future - def refresh(): Unit = { - // we dont care whether we overwrite a potentially newer value - lastAccessed = Timestamp.now - } - override def toString = future.value match { - case Some(Success(value)) ⇒ value.toString - case Some(Failure(exception)) ⇒ exception.toString - case None ⇒ "pending" - } -} diff --git a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala b/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala deleted file mode 100644 index 9fd77d78..00000000 --- a/src/test/scala/ignition/core/cache/ExpiringMultipleLevelCacheSpec.scala 
+++ /dev/null @@ -1,213 +0,0 @@ -package ignition.core.cache - -import java.io.FileNotFoundException -import java.util.concurrent.atomic.AtomicInteger - -import akka.actor.ActorSystem -import ignition.core.cache.ExpiringMultiLevelCache.TimestampedValue -import org.scalatest.concurrent.ScalaFutures -import org.scalatest.{FlatSpec, Matchers} -import spray.caching.ExpiringLruLocalCache - -import scala.concurrent.ExecutionContext.Implicits.global -import scala.concurrent.duration._ -import scala.concurrent.{Await, Future} - -class ExpiringMultipleLevelCacheSpec extends FlatSpec with Matchers with ScalaFutures { - case class Data(s: String) - implicit val scheduler = ActorSystem().scheduler - - "ExpiringMultipleLevelCache" should "calculate a value on cache miss and return it" in { - val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](1.minute, Option(local)) - Await.result(cache("key", () => Future.successful(Data("success"))), 1.minute) shouldBe Data("success") - } - - it should "calculate a value on cache miss and return a failed future of the calculation" in { - val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](1.minute, Option(local)) - - class MyException(s: String) extends Exception(s) - - val eventualCache = cache("key", () => Future.failed(new MyException("some failure"))) - whenReady(eventualCache.failed) { failure => - failure shouldBe a [MyException] - } - } - - it should "calculate a value on cache miss after ttl" in { - val cacheTtl = 3.seconds - val myRequestCount = new AtomicInteger() - - def myRequest(): Future[Data] = { - myRequestCount.incrementAndGet() - Future.successful(Data("success")) - } - - val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](ttl = cacheTtl, localCache = Option(local)) - - whenReady(cache("key", myRequest)) { result => - result shouldBe Data("success") 
- } - - myRequestCount.get() shouldBe 1 - - whenReady(cache("key", myRequest)) { result => - result shouldBe Data("success") - } - - myRequestCount.get() shouldBe 1 - - Thread.sleep(cacheTtl.toMillis + 10) - - whenReady(cache("key", myRequest)) { result => - result shouldBe Data("success") - } - - myRequestCount.get() shouldBe 2 - - whenReady(cache("key", myRequest)) { result => - result shouldBe Data("success") - } - - myRequestCount.get() shouldBe 2 - } - - it should "calculate a value on cache miss just once, the second call should be from cache hit" in { - var myFailedRequestCount: Int = 0 - - class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception - def myFailedRequest(): Future[Nothing] = { - myFailedRequestCount = myFailedRequestCount + 1 - Future.failed(new MyException("some failure")) - } - - val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = true, ttlCachedErrors = 9.seconds) - - val eventualCache = cache("key", myFailedRequest) - whenReady(eventualCache.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - val eventualCache2 = cache("key", myFailedRequest) - whenReady(eventualCache2.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - val eventualCache3 = cache("key", myFailedRequest) - whenReady(eventualCache3.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - val eventualCache4 = cache("key", myFailedRequest) - whenReady(eventualCache4.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - val eventualCache5 = cache("key", myFailedRequest) - whenReady(eventualCache5.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - } - - it should "calculate a value on cache miss on every request" 
in { - var myFailedRequestCount: Int = 0 - - class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception - def myFailedRequest(): Future[Nothing] = { - myFailedRequestCount = myFailedRequestCount + 1 - Future.failed(new MyException("some failure")) - } - - val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = false) - - val eventualCache = cache("key", myFailedRequest) - whenReady(eventualCache.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - val eventualCache2 = cache("key", myFailedRequest) - whenReady(eventualCache2.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 2 - } - - val eventualCache3 = cache("key", myFailedRequest) - whenReady(eventualCache3.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 3 - } - - val eventualCache4 = cache("key", myFailedRequest) - whenReady(eventualCache4.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 4 - } - - val eventualCache5 = cache("key", myFailedRequest) - whenReady(eventualCache5.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 5 - } - - } - - it should "calculate a value on cache miss, then wait ttlCachedError to get a cache miss again" in { - var myFailedRequestCount: Int = 0 - - class MyException(s: String) extends FileNotFoundException(s) // Some NonFatal Exception - def myFailedRequest(): Future[Nothing] = { - myFailedRequestCount = myFailedRequestCount + 1 - Future.failed(new MyException("some failure")) - } - - val local = new ExpiringLruLocalCache[TimestampedValue[Data]](100) - val cache = ExpiringMultiLevelCache[Data](ttl = 1.minute, localCache = Option(local), cacheErrors = true, ttlCachedErrors = 4.seconds) - - val eventualCache = cache("key", myFailedRequest) - 
whenReady(eventualCache.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - val eventualCache2 = cache("key", myFailedRequest) - whenReady(eventualCache2.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 1 - } - - Thread.sleep(5000) - - val eventualCache3 = cache("key", myFailedRequest) - whenReady(eventualCache3.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 2 - } - - val eventualCache4 = cache("key", myFailedRequest) - whenReady(eventualCache4.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 2 - } - - Thread.sleep(500) - - val eventualCache5 = cache("key", myFailedRequest) - whenReady(eventualCache5.failed) { failure => - failure shouldBe a [MyException] - myFailedRequestCount shouldBe 2 - } - - } - -} diff --git a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala b/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala deleted file mode 100644 index fb774b6e..00000000 --- a/src/test/scala/ignition/core/http/AsyncHttpClientStreamApiSpec.scala +++ /dev/null @@ -1,15 +0,0 @@ -package ignition.core.http - -import ignition.core.http.AsyncHttpClientStreamApi.Request -import org.scalatest.{FunSpec, Matchers} - -import scala.util.Success - -class AsyncHttpClientStreamApiSpec extends FunSpec with Matchers { - - it("should do the best to parse the provided uri") { - val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" - val request = Request(url) - request.uri.toString shouldBe "http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf" - } -} diff --git a/src/test/scala/ignition/core/http/RetrySpec.scala b/src/test/scala/ignition/core/http/RetrySpec.scala deleted file mode 100644 index 88528568..00000000 --- a/src/test/scala/ignition/core/http/RetrySpec.scala +++ 
/dev/null @@ -1,39 +0,0 @@ -package ignition.core.http - -import org.joda.time.DateTime -import org.scalatest.{FlatSpec, Matchers} - -import scala.concurrent.duration._ - -class RetrySpec extends FlatSpec with Matchers { - "Retry" should "return the initial backoff" in { - val now = DateTime.now - val timeout = 60.seconds - - val retry = Retry(RetryConf(initialBackoffOnError = 123.milliseconds, initialTimeoutBackoff = 456.milliseconds, maxRandom = 0.seconds), now, timeout) - - retry.onError().backoff() shouldBe 123.millisecond - retry.onTimeout().backoff() shouldBe 456.millisecond - } - - it should "multiply by the factor on second time" in { - - val now = DateTime.now - val timeout = 60.seconds - - val retry = Retry(RetryConf(initialBackoffOnError = 123.milliseconds, initialTimeoutBackoff = 456.milliseconds, maxRandom = 0.seconds, timeoutMultiplicationFactor = 3, errorMultiplicationFactor = 5), now, timeout) - - retry.onError().onError().backoff() shouldBe (123 * 5).millisecond - retry.onTimeout().onTimeout().backoff() shouldBe (456 * 3).millisecond - } - - it should "not explode if called with no errors or timeouts" in { - val now = DateTime.now - val timeout = 60.seconds - - val retry = Retry(RetryConf(maxRandom = 0.seconds), now, timeout) - - retry.backoff() shouldBe 100.milliseconds - } - -} diff --git a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala index 114da15f..a4b4f10d 100644 --- a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala @@ -45,33 +45,4 @@ class URLUtilsSpec extends FlatSpec with Matchers { finalUrl shouldEqual "https://www.petlove.com.br/carrinho?test=true#/add/variant_sku/3105748-1,3107615/quantity/1?t=1" } - it should "percent encode url paths" in { - val tests = Seq( - "http://images1.petlove.com.br/products/170301/small/Ração-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-Cães.jpg", - 
"http://images0.petlove.com.br/products/175408/small/Ração-Nestlé-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sensível.jpg", - "http://images3.petlove.com.br/products/171539/small/Ração-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-Cálculos-Urinários.jpg" - ) - - val expectations = Seq( - "http://images1.petlove.com.br/products/170301/small/Ra%C3%A7%C3%A3o-Special-Dog-Premium-Vegetais-Cenoura-e-Espinafre-para-C%C3%A3es.jpg", - "http://images0.petlove.com.br/products/175408/small/Ra%C3%A7%C3%A3o-Nestl%C3%A9-Purina-Pro-Plan-Cat-Sensitive-para-Gatos-Adultos-com-Pele-Sens%C3%ADvel.jpg", - "http://images3.petlove.com.br/products/171539/small/Ra%C3%A7%C3%A3o-Royal-Canin-Feline-Veterinary-Diet-Urinary-SO-High-Dilution-para-Gatos-com-C%C3%A1lculos-Urin%C3%A1rios.jpg" - ) - - tests.zip(expectations).foreach { - case (url, expected) => URLUtils.parseUri(url).map(_.toString) shouldBe Success(expected) - } - } - - it should "not encode percent characters in url path" in { - val url = "http://www.example.com/Pentagrama%C2%AE Acessórios em São Paulo/Qualquer%20Arquivo%20Encodado.pdf" - val sane = URLUtils.parseUri(url).map(_.toString) - sane shouldBe Success("http://www.example.com/Pentagrama%C2%AE%20Acess%C3%B3rios%20em%20S%C3%A3o%20Paulo/Qualquer%20Arquivo%20Encodado.pdf") - } - - it should "encode space characters with percent in URL path" in { - val url = "http://www.example.com/Pentagrama+Invertido.xml?q=blah+bleh" - val sane = URLUtils.parseUri(url).map(_.toString) - sane shouldBe Success("http://www.example.com/Pentagrama%20Invertido.xml?q=blah+bleh") - } } diff --git a/tools/cluster.py b/tools/cluster.py index 060bce71..2b0e38d4 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,7 +49,7 @@ default_ami = 'ami-611e7976' default_master_ami = '' default_env = 'dev' -default_spark_version = '2.3.2' +default_spark_version = '2.4.0' default_hdfs_version = '2.7.6' default_spark_download_source = 
'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' From e40cf6bf16c2ca3af3eff02e2147cafd7eb06163 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 28 Nov 2018 09:59:30 -0200 Subject: [PATCH 225/268] fix runtime classnotfound issue (#156) --- build.sbt | 10 ++++++---- .../ignition/core/jobs/utils/SparkContextUtils.scala | 5 ++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/build.sbt b/build.sbt index b27321ea..51044b57 100644 --- a/build.sbt +++ b/build.sbt @@ -11,9 +11,13 @@ parallelExecution in Test := false test in assembly := {} -libraryDependencies += ("org.apache.spark" %% "spark-sql" % "2.4.0" % "provided") +libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.0" % "provided" -libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided") +libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided" + +libraryDependencies += "org.apache.hadoop" % "hadoop-aws" % "2.7.6" % "provided" + +libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.7.4" % "provided" libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.2.27" @@ -25,6 +29,4 @@ libraryDependencies += "org.joda" % "joda-convert" % "1.8.2" libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.25" -libraryDependencies += "com.amazonaws" % "aws-java-sdk-s3" % "1.11.456" - libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.3" diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index e07ba54f..3e4ff961 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -3,8 +3,8 @@ package ignition.core.jobs.utils import java.io.InputStream import com.amazonaws.auth.DefaultAWSCredentialsProviderChain 
-import com.amazonaws.services.s3.{AmazonS3, AmazonS3Builder, AmazonS3Client} import com.amazonaws.services.s3.model.{ListObjectsRequest, ObjectListing, S3ObjectSummary} +import com.amazonaws.services.s3.{AmazonS3, AmazonS3Client} import ignition.core.utils.CollectionUtils._ import ignition.core.utils.DateUtils._ import ignition.core.utils.ExceptionUtils._ @@ -49,8 +49,7 @@ object SparkContextUtils { def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) } - private lazy val amazonS3ClientFromEnvironmentVariables: AmazonS3 = - AmazonS3Client.builder().withCredentials(new DefaultAWSCredentialsProviderChain()).build() + private lazy val amazonS3ClientFromEnvironmentVariables: AmazonS3 = new AmazonS3Client(new DefaultAWSCredentialsProviderChain()) private def close(inputStream: InputStream, path: String): Unit = { try { From ca56cf60fabec8c03012757e7097e6b2b6849bf8 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 28 Nov 2018 23:04:36 -0200 Subject: [PATCH 226/268] python3 compatibility --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 2b0e38d4..a74951bc 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -802,7 +802,7 @@ def killall_jobs(cluster_name, key_file=default_key_file, def check_flintrock_installation(): try: - with file('/dev/null', 'w') as devnull: + with open('/dev/null', 'w') as devnull: call_ec2_script(['--help'], 1 , 1, stdout=devnull) except: setup = os.path.join(ec2_script_base_path(), 'setup.py') From b1d5ae096b74f63fa6bda9e35bd1fb24b5f899a2 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Tue, 4 Dec 2018 21:06:32 -0200 Subject: [PATCH 227/268] The quiet flag makes debugging some issues pretty hard --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index a74951bc..add0f633 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -115,7 +115,7 @@ def logged_call(args, tries=1): def ssh_call(user, host, key_file, args=(), allocate_terminal=True, get_output=False): - base = ['ssh', '-q'] + base = ['ssh'] if allocate_terminal: base += ['-tt'] base += ['-i', key_file, From ba3b39f1078de8f2996313312381b2a4a0a4ad10 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 19 Dec 2018 20:31:15 -0200 Subject: [PATCH 228/268] Added RDD-like .values --- src/main/scala/ignition/core/utils/CollectionUtils.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index 01960d3d..c3b87d4c 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -139,6 +139,8 @@ object CollectionUtils { .mapValues(_.map { case (k, v) => v }.reduce(fn)) .toList } + def values: List[V] = + iterable.map { case (k, v) => v }.toList } From 04964c088edd0f6ce26a124c3c8e07c6e7c2360a Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Thu, 20 Dec 2018 13:48:29 -0200 Subject: [PATCH 229/268] Fix extra data cluster saving so scripts like job runner can reuse the cluster --- tools/cluster.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 5a08877e..f27d74f9 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -172,18 +172,22 @@ def save_cluster_args(master, key_file, remote_user, all_args): args=["echo '{}' > /tmp/cluster_args.json".format(json.dumps(all_args))]) def load_cluster_args(master, key_file, remote_user): - return json.loads(ssh_call(user=remote_user, host=master, key_file=key_file, + return json.loads(ssh_call(user=remote_user, host=master, key_file=key_file, allocate_terminal=False, args=["cat", "/tmp/cluster_args.json"], get_output=True)) # Util to be used by external scripts def save_extra_data(data_str, cluster_name, region=default_region, key_file=default_key_file, remote_user=default_remote_user, master=None): master = master or get_master(cluster_name, region=region) - ssh_call(user=remote_user, host=master, key_file=key_file, - args=["echo '{}' > /tmp/cluster_extra_data.txt".format(data_str)]) + cmd = ['ssh', '-o', 'StrictHostKeyChecking=no', remote_user + '@' + master , '-i', key_file, '/bin/bash', '-c', 'cat > /tmp/cluster_extra_data.txt'] + p = subprocess.Popen(cmd, stdin=subprocess.PIPE) + p.communicate(data_str) + if p.wait() != 0: + raise Exception('Error saving extra data on master') + def load_extra_data(cluster_name, region=default_region, key_file=default_key_file, remote_user=default_remote_user, master=None): master = master or get_master(cluster_name, region=region) - return ssh_call(user=remote_user, host=master, key_file=key_file, + return ssh_call(user=remote_user, host=master, key_file=key_file, allocate_terminal=False, args=["cat", "/tmp/cluster_extra_data.txt"], get_output=True) From debd6afd4d3cfa4d515cb033c04c6b12d70c0704 Mon Sep 17 00:00:00 2001 From: "Allan 
Douglas R. de Oliveira" Date: Wed, 9 Jan 2019 20:01:41 -0200 Subject: [PATCH 230/268] Added new command and made remove files from collect a parameter --- tools/cluster.py | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index f27d74f9..261b2b85 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -436,7 +436,8 @@ def job_run(cluster_name, job_name, job_mem, kill_on_failure=False, destroy_cluster=False, region=default_region, - driver_heap_size=default_driver_heap_size): + driver_heap_size=default_driver_heap_size, + remove_files=True): utc_job_date_example = '2014-05-04T13:13:10Z' if utc_job_date and len(utc_job_date) != len(utc_job_date_example): @@ -503,7 +504,7 @@ def job_run(cluster_name, job_name, job_mem, region=region, job_timeout_minutes=job_timeout_minutes, remote_user=remote_user, remote_control_dir=remote_control_dir, - collect_results_dir=collect_results_dir) + collect_results_dir=collect_results_dir, remove_files=remove_files) except JobFailure as e: failed = True failed_exception = e @@ -666,16 +667,18 @@ def collect_job_results(cluster_name, job_name, job_tag, region=default_region, master=None, remote_user=default_remote_user, remote_control_dir=default_remote_control_dir, - collect_results_dir=default_collect_results_dir): + collect_results_dir=default_collect_results_dir, + remove_files=False): master = master or get_master(cluster_name, region=region) job_with_tag = get_job_with_tag(job_name, job_tag) job_control_dir = get_job_control_dir(remote_control_dir, job_with_tag) + # Keep the RUNNING file so we can kill the job if needed + args = ['--remove-source-files', '--exclude', 'RUNNING'] if remove_files else [] rsync_call(user=remote_user, host=master, - # Keep the RUNNING file so we can kill the job if needed - args=['--remove-source-files', '--exclude', 'RUNNING'], + args=args, key_file=key_file, 
dest_local=with_leading_slash(collect_results_dir), remote_path=job_control_dir) @@ -683,13 +686,35 @@ def collect_job_results(cluster_name, job_name, job_tag, return os.path.join(collect_results_dir, os.path.basename(job_control_dir)) +@named('collect-all-results') +def collect_all_job_results(cluster_name, + key_file=default_key_file, + region=default_region, + master=None, remote_user=default_remote_user, + remote_control_dir=default_remote_control_dir, + collect_results_dir=default_collect_results_dir, + remove_files=False): + master = master or get_master(cluster_name, region=region) + + # Keep the RUNNING file so we can kill the job if needed + args = ['--remove-source-files', '--exclude', 'RUNNING'] if remove_files else [] + rsync_call(user=remote_user, + host=master, + args=args, + key_file=key_file, + dest_local=with_leading_slash(collect_results_dir), + remote_path=with_leading_slash(remote_control_dir)) + + return collect_results_dir + + @named('wait-for') def wait_for_job(cluster_name, job_name, job_tag, key_file=default_key_file, master=None, remote_user=default_remote_user, region=default_region, remote_control_dir=default_remote_control_dir, collect_results_dir=default_collect_results_dir, - job_timeout_minutes=0, max_failures=5, seconds_to_sleep=60): + job_timeout_minutes=0, max_failures=5, seconds_to_sleep=60, remove_files=True): master = master or get_master(cluster_name, region=region) @@ -714,7 +739,7 @@ def collect(show_tail): key_file=key_file, region=region, master=master, remote_user=remote_user, remote_control_dir=remote_control_dir, - collect_results_dir=collect_results_dir) + collect_results_dir=collect_results_dir, remove_files=remove_files) log.info('Jobs results saved on: {}'.format(dest_log_dir)) if show_tail: output_log = os.path.join(dest_log_dir, 'output.log') @@ -852,7 +877,7 @@ def check_flintrock_installation(): parser = ArghParser() parser.add_commands([launch, destroy, get_master, ssh_master, tag_cluster_instances, 
health_check, exec_shell]) parser.add_commands([job_run, job_local_yarn_run, job_attach, wait_for_job, - kill_job, killall_jobs, collect_job_results], namespace="jobs") + kill_job, killall_jobs, collect_job_results, collect_all_job_results], namespace="jobs") if __name__ == '__main__': check_flintrock_installation() From 1ad9969bc4572768a6b3c10bed1731c43f46c6c0 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Tue, 29 Jan 2019 20:33:08 -0200 Subject: [PATCH 231/268] Make extra args really be usable --- remote_hook.sh | 3 ++- .../scala/ignition/core/jobs/CoreJobRunner.scala | 6 +++--- tools/cluster.py | 13 ++++++++----- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 0a5a2cb8..1fd970f6 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -13,6 +13,7 @@ SPARK_MEM_PARAM="${6?Please give the Job Memory Size to use}" USE_YARN="${7?Please tell if we should use YARN (yes/no)}" NOTIFY_ON_ERRORS="${8?Please tell if we will notify on errors (yes/no)}" DRIVER_HEAP_SIZE="${9?Please tell driver heap size to use}" +shift 9 JOB_WITH_TAG=${JOB_NAME}.${JOB_TAG} JOB_CONTROL_DIR="${CONTROL_DIR}/${JOB_WITH_TAG}" @@ -124,7 +125,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc 
-XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index 4e7c27fe..5ee6fbce 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -35,7 +35,7 @@ object CoreJobRunner { user: String = "nouser", master: String = "local[*]", executorMemory: String = "2G", - additionalArgs: Map[String, String] = Map.empty) + extraArgs: Map[String, String] = Map.empty) def runJobSetup(args: Array[String], jobsSetups: Map[String, (CoreJobRunner.RunnerContext => Unit, Map[String, String])], defaultSparkConfMap: Map[String, String]) { val parser = new scopt.OptionParser[RunnerConfig]("Runner") { @@ -60,8 +60,8 @@ object CoreJobRunner { c.copy(executorMemory = x) } - opt[(String, String)]('w', "runner-with-arg") unbounded() action { (x, c) => - c.copy(additionalArgs = c.additionalArgs ++ Map(x)) + opt[(String, String)]('w', "runner-extra") unbounded() action { (x, c) => + c.copy(extraArgs = c.extraArgs ++ Map(x)) } } diff --git a/tools/cluster.py b/tools/cluster.py index 261b2b85..f80db920 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -422,6 +422,7 @@ def get_assembly_path(): @arg('--disable-tmux', help='Do not use tmux. Warning: many features will not work without tmux. 
Use only if the tmux is missing on the master.') @arg('--detached', help='Run job in background, requires tmux') @arg('--destroy-cluster', help='Will destroy cluster after finishing the job') +@arg('--extra', action='append', type=str, help='Additional arguments for the job in the format k=v') @named('run') def job_run(cluster_name, job_name, job_mem, key_file=default_key_file, disable_tmux=False, @@ -437,7 +438,8 @@ def job_run(cluster_name, job_name, job_mem, destroy_cluster=False, region=default_region, driver_heap_size=default_driver_heap_size, - remove_files=True): + remove_files=True, + extra=[]): utc_job_date_example = '2014-05-04T13:13:10Z' if utc_job_date and len(utc_job_date) != len(utc_job_date_example): @@ -456,11 +458,12 @@ def job_run(cluster_name, job_name, job_mem, yarn_param = 'yes' if yarn else 'no' job_date = utc_job_date or datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') job_tag = job_tag or job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') + runner_extra_args = ' '.join('--runner-extra "%s"' % arg for arg in extra) tmux_wait_command = ';(echo Press enter to keep the session open && /bin/bash -c "read -t 5" && sleep 7d)' if not detached else '' - tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} {tmux_wait_command}' >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, tmux_wait_command=tmux_wait_command) - non_tmux_arg = ". /etc/profile; . 
~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size) + tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} {runner_extra_args} {tmux_wait_command}' >& /tmp/commandoutput".format( + aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, runner_extra_args=runner_extra_args, tmux_wait_command=tmux_wait_command) + non_tmux_arg = ". /etc/profile; . ~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} {runner_extra_args} >& /tmp/commandoutput".format( + aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, runner_extra_args=runner_extra_args) if not disable_assembly_build: From 0eb6a1828a44c4fc0ca3a5bf3c13ef36ba6a0f0f Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. 
de Oliveira" Date: Mon, 11 Feb 2019 19:20:01 -0200 Subject: [PATCH 232/268] Added singleton to ExecutionRetry --- src/main/scala/ignition/core/jobs/ExecutionRetry.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/scala/ignition/core/jobs/ExecutionRetry.scala b/src/main/scala/ignition/core/jobs/ExecutionRetry.scala index 61daa523..7e5a3953 100644 --- a/src/main/scala/ignition/core/jobs/ExecutionRetry.scala +++ b/src/main/scala/ignition/core/jobs/ExecutionRetry.scala @@ -2,6 +2,8 @@ package ignition.core.jobs import scala.util.Try +object ExecutionRetry extends ExecutionRetry + trait ExecutionRetry { def executeRetrying[T](code: => T, maxExecutions: Int = 3): T = { From e7e7a7666cd235614f2ca5fd73d96f13f4645b83 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 12 Mar 2019 16:01:37 -0300 Subject: [PATCH 233/268] install toree (#161) * install toree --- remote_hook.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 1fd970f6..5078786b 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -78,7 +78,7 @@ install_and_run_zeppelin() { install_and_run_jupyter() { sudo yum -y install python3 python3-pip - sudo pip3 install jupyter pandas boto3 matplotlib numpy sklearn scipy + sudo pip3 install jupyter pandas boto3 matplotlib numpy sklearn scipy toree export SPARK_HOME=$(get_first_present /root/spark /opt/spark ~/spark*/) export HADOOP_HOME=$(get_first_present /root/hadoop /opt/hadoop ~/hadoop*/) export SPARK_CONF_DIR="${SPARK_HOME}/conf" @@ -87,6 +87,7 @@ install_and_run_jupyter() { export PYSPARK_PYTHON=$(which python3) export PYSPARK_DRIVER_PYTHON=$(which jupyter) export PYSPARK_DRIVER_PYTHON_OPTS="notebook --allow-root --ip=${SPARK_MASTER_HOST} --no-browser --port=8888" + sudo $(which jupyter) toree install --spark_home="${SPARK_HOME}" --spark_opts="--master ${JOB_MASTER} --executor-memory ${SPARK_MEM_PARAM} --driver-memory ${DRIVER_HEAP_SIZE}" sudo -E "${SPARK_HOME}/bin/pyspark" 
--master "${JOB_MASTER}" --executor-memory "${SPARK_MEM_PARAM}" --driver-memory "${DRIVER_HEAP_SIZE}" } From 2225f897aa8bb615bd956ac78d544fc448a095ed Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 20 Mar 2019 17:05:28 -0300 Subject: [PATCH 234/268] Added Timestamp comparison --- src/main/scala/ignition/core/utils/DateUtils.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/utils/DateUtils.scala b/src/main/scala/ignition/core/utils/DateUtils.scala index 8ebf3b13..71ec771f 100644 --- a/src/main/scala/ignition/core/utils/DateUtils.scala +++ b/src/main/scala/ignition/core/utils/DateUtils.scala @@ -1,6 +1,8 @@ package ignition.core.utils -import org.joda.time.{Seconds, Period, DateTimeZone, DateTime} +import java.sql.Timestamp + +import org.joda.time.{DateTime, DateTimeZone, Period, Seconds} import org.joda.time.format.ISODateTimeFormat object DateUtils { @@ -9,6 +11,10 @@ object DateUtils { implicit def dateTimeOrdering: Ordering[DateTime] = Ordering.fromLessThan(_ isBefore _) implicit def periodOrdering: Ordering[Period] = Ordering.fromLessThan(_.toStandardSeconds.getSeconds < _.toStandardSeconds.getSeconds) + implicit def timestampOrdering: Ordering[Timestamp] = new Ordering[Timestamp] { + def compare(x: Timestamp, y: Timestamp): Int = x compareTo y + } + implicit class DateTimeImprovements(val dateTime: DateTime) { def toIsoString = isoDateTimeFormatter.print(dateTime) From 71d94ca83218b0a424db9f2cfb85eb61d844826e Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 26 Jun 2019 11:31:26 -0300 Subject: [PATCH 235/268] update to spark=2.4.3, scala=2.12.8 and some compiler fixes (#163) * update to spark=2.4.3, scala=2.12.8 and some compiler fixes * using a 2.12 scala build --- build.sbt | 4 ++-- .../ignition/core/jobs/CoreJobRunner.scala | 1 - .../ignition/core/jobs/utils/RDDUtils.scala | 12 +++--------- .../core/jobs/utils/SparkContextUtils.scala | 19 +++++++++---------- 
.../ignition/core/utils/CollectionUtils.scala | 8 +++----- .../ignition/core/utils/FutureUtils.scala | 5 ++--- .../ignition/core/utils/URLUtilsSpec.scala | 2 -- tools/cluster.py | 4 ++-- 8 files changed, 21 insertions(+), 34 deletions(-) diff --git a/build.sbt b/build.sbt index 51044b57..8bf00c9d 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ name := "Ignition-Core" version := "1.0" -scalaVersion := "2.11.12" +scalaVersion := "2.12.8" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") @@ -11,7 +11,7 @@ parallelExecution in Test := false test in assembly := {} -libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.0" % "provided" +libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.3" % "provided" libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.7.6" % "provided" diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index 5ee6fbce..eb1c7014 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -6,7 +6,6 @@ import org.joda.time.{DateTime, DateTimeZone} import org.slf4j.{Logger, LoggerFactory} import scala.concurrent.Future -import scala.util.Try object CoreJobRunner { diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index e04dd118..ab08c3c7 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -1,18 +1,12 @@ package ignition.core.jobs.utils +import org.apache.spark.rdd.RDD import org.slf4j.LoggerFactory - -import scala.reflect._ -import org.apache.spark.rdd.{CoGroupedRDD, PairRDDFunctions, RDD} -import org.apache.spark.SparkContext._ -import org.apache.spark.Partitioner -import org.apache.spark -import org.joda.time.DateTime -import 
org.joda.time.format.DateTimeFormat +import scalaz.{Success, Validation} import scala.collection.mutable +import scala.reflect._ import scala.util.Random -import scalaz.{Success, Validation} object RDDUtils { diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 3e4ff961..e5155340 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -19,7 +19,7 @@ import org.apache.spark.{Partitioner, SparkContext} import org.joda.time.DateTime import org.slf4j.LoggerFactory -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.io.{Codec, Source} @@ -82,7 +82,7 @@ object SparkContextUtils { private lazy val logger = LoggerFactory.getLogger(getClass) - lazy val _hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + lazy val _hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().asScala.map { case entry => entry.getKey -> entry.getValue }.toMap) private def getFileSystem(path: Path): FileSystem = { path.getFileSystem(sc.hadoopConfiguration) @@ -181,7 +181,6 @@ object SparkContextUtils { paths.map(p => { val hdfsPath = p.replaceFirst("s3[an]://", hdfsPathPrefix) if (forceSynch || getStatus(hdfsPath, false).isEmpty || getStatus(s"$hdfsPath/*", true).filterNot(_.isDirectory).size != filesToOutput) { - val _hdfsPath = new Path(hdfsPath) actionWhenNeedsSynching(p, hdfsPath) } hdfsPath @@ -457,10 +456,10 @@ object SparkContextUtils { def inner(current: ObjectListing): Stream[String] = if (current.isTruncated) { logger.trace(s"list common prefixed truncated for ${path.bucket} ${path.key}: ${current.getCommonPrefixes}") - current.getCommonPrefixes.toStream ++ inner(s3.listNextBatchOfObjects(current)) + 
current.getCommonPrefixes.asScala.toStream ++ inner(s3.listNextBatchOfObjects(current)) } else { logger.trace(s"list common prefixed finished for ${path.bucket} ${path.key}: ${current.getCommonPrefixes}") - current.getCommonPrefixes.toStream + current.getCommonPrefixes.asScala.toStream } val request = new ListObjectsRequest(path.bucket, path.key, null, delimiter, 1000) @@ -472,10 +471,10 @@ object SparkContextUtils { def inner(current: ObjectListing): Stream[S3ObjectSummary] = if (current.isTruncated) { logger.trace(s"list objects truncated for ${path.bucket} ${path.key}: $current") - current.getObjectSummaries.toStream ++ inner(s3.listNextBatchOfObjects(current)) + current.getObjectSummaries.asScala.toStream ++ inner(s3.listNextBatchOfObjects(current)) } else { logger.trace(s"list objects finished for ${path.bucket} ${path.key}") - current.getObjectSummaries.toStream + current.getObjectSummaries.asScala.toStream } inner(s3.listObjects(path.bucket, path.key)) @@ -674,9 +673,9 @@ object SparkContextUtils { private def doSync(hadoopFiles: List[HadoopFile], synchLocally: String, forceSynch: Boolean, - maxBytesPerPartition: Long = 128 * 1000 * 1000, - minPartitions: Int = 100, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { require(!synchLocally.contains("*"), "Globs are not supported on the sync key") def syncPath(suffix: String) = s"$hdfsPathPrefix/_core_ignition_sync_hdfs_cache/$suffix" diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index c3b87d4c..2405c7ef 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -1,12 +1,10 @@ package ignition.core.utils -import scala.collection.{TraversableLike, IterableLike} -import 
scala.collection.generic.CanBuildFrom -import scala.language.implicitConversions import scalaz.Validation -object CollectionUtils { - +import scala.collection.generic.CanBuildFrom +import scala.collection.{IterableLike, TraversableLike} +object CollectionUtils { implicit class SeqImprovements[A](xs: Seq[A]) { def orElseIfEmpty[B >: A](alternative: => Seq[B]): Seq[B] = { diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index f12918db..4054f750 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,7 +1,6 @@ package ignition.core.utils -import scala.concurrent.duration.FiniteDuration -import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} +import scala.concurrent.{ExecutionContext, Future, Promise, blocking} import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} @@ -46,7 +45,7 @@ object FutureUtils { } implicit class FutureGeneratorImprovements[V](generator: Iterable[() => Future[V]]){ - def toLazyIterable(batchSize: Int = 1)(implicit ec: ExecutionContext): Iterable[Future[V]] = new Iterable[Future[V]] { + def toLazyIterable(batchSize: Int = 1): Iterable[Future[V]] = new Iterable[Future[V]] { override def iterator = new Iterator[Future[V]] { val generatorIterator = generator.toIterator var currentBatch: List[Future[V]] = List.empty diff --git a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala index a4b4f10d..61781903 100644 --- a/src/test/scala/ignition/core/utils/URLUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/URLUtilsSpec.scala @@ -2,8 +2,6 @@ package ignition.core.utils import org.scalatest.{FlatSpec, Matchers} -import scala.util.Success - class URLUtilsSpec extends FlatSpec with Matchers { "URLUtils" should "add parameters to url with encoded params in base url and not be double encoded" in { diff 
--git a/tools/cluster.py b/tools/cluster.py index f80db920..0e0fd864 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,9 +49,9 @@ default_ami = 'ami-611e7976' default_master_ami = '' default_env = 'dev' -default_spark_version = '2.4.0' +default_spark_version = '2.4.3' default_hdfs_version = '2.7.6' -default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' +default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-without-hadoop-scala-2.12.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' default_installation_user = 'root' From d1c02ee3dccef0f5c72e090da926cf3ae22678f8 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 27 Jun 2019 15:12:13 -0300 Subject: [PATCH 236/268] rollback to scala 2.11 (#164) --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 8bf00c9d..c6d7dbdb 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ name := "Ignition-Core" version := "1.0" -scalaVersion := "2.12.8" +scalaVersion := "2.11.12" scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") From c9f09ddc5c4a109595dd3e6412a19e69d046c6e0 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 27 Jun 2019 15:18:35 -0300 Subject: [PATCH 237/268] rollback to spark with hadoop (#165) --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 0e0fd864..7c50ccae 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -51,7 +51,7 @@ default_env = 'dev' default_spark_version = '2.4.3' default_hdfs_version = '2.7.6' -default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-without-hadoop-scala-2.12.tgz' +default_spark_download_source 
= 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' default_installation_user = 'root' From 0d94bdc64dd8337ea2bc91be425f8fc3cba584e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=AA=20Couto=20e=20Silva?= <31329678+csrene@users.noreply.github.com> Date: Tue, 24 Sep 2019 16:22:20 -0300 Subject: [PATCH 238/268] Optional AWS credentials propagation (#166) added disable-propagate-aws-credentials option to run job --- tools/cluster.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 7c50ccae..966dd987 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -423,6 +423,7 @@ def get_assembly_path(): @arg('--detached', help='Run job in background, requires tmux') @arg('--destroy-cluster', help='Will destroy cluster after finishing the job') @arg('--extra', action='append', type=str, help='Additional arguments for the job in the format k=v') +@arg('--disable-propagate-aws-credentials', help='Setting this to true will not propagate your AWS credentials from your environment to the master') @named('run') def job_run(cluster_name, job_name, job_mem, key_file=default_key_file, disable_tmux=False, @@ -439,6 +440,7 @@ def job_run(cluster_name, job_name, job_mem, region=default_region, driver_heap_size=default_driver_heap_size, remove_files=True, + disable_propagate_aws_credentials=False, extra=[]): utc_job_date_example = '2014-05-04T13:13:10Z' @@ -456,14 +458,15 @@ def job_run(cluster_name, job_name, job_mem, remote_hook = '{remote_path}/remote_hook.sh'.format(remote_path=remote_path) notify_param = 'yes' if notify_on_errors else 'no' yarn_param = 'yes' if yarn else 'no' + aws_vars = get_aws_keys_str() if not disable_propagate_aws_credentials else '' job_date = utc_job_date or datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') job_tag = job_tag or 
job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') runner_extra_args = ' '.join('--runner-extra "%s"' % arg for arg in extra) tmux_wait_command = ';(echo Press enter to keep the session open && /bin/bash -c "read -t 5" && sleep 7d)' if not detached else '' tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} {runner_extra_args} {tmux_wait_command}' >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, runner_extra_args=runner_extra_args, tmux_wait_command=tmux_wait_command) + aws_vars=aws_vars, job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, runner_extra_args=runner_extra_args, tmux_wait_command=tmux_wait_command) non_tmux_arg = ". /etc/profile; . 
~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} {runner_extra_args} >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, runner_extra_args=runner_extra_args) + aws_vars=aws_vars, job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, runner_extra_args=runner_extra_args) if not disable_assembly_build: From e2adf968611b31f02902b678fe472b30b413b369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20Zantut=20Nogueira?= Date: Mon, 16 May 2022 15:27:20 -0300 Subject: [PATCH 239/268] Wait termination on destroy unsuccessful cluster --- tools/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 966dd987..8cc12f98 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -331,8 +331,8 @@ def launch(cluster_name, slaves, except Exception as e: log.exception('Got exception on last steps of cluster configuration') log.warn('Destroying unsuccessful cluster') - destroy(cluster_name=cluster_name, region=region) - raise CommandError('Failed to created cluster {} after failures'.format(cluster_name)) + destroy(cluster_name=cluster_name, region=region, wait_termination=True) + raise CommandError('Failed to created cluster {0} after failures'.format(cluster_name)) def destroy(cluster_name, delete_groups=False, region=default_region, wait_termination=False, wait_timeout_minutes=10): From 54c5faefb90d43349136f5b389903456a67bfbcc Mon Sep 17 00:00:00 2001 From: alexopss 
<68519704+alexopss@users.noreply.github.com> Date: Tue, 17 May 2022 10:05:53 -0300 Subject: [PATCH 240/268] Fix/circleci Add config circle --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1 @@ + From 3b5f93fb3226273514aae312111b1d2cd11fa8ae Mon Sep 17 00:00:00 2001 From: Machine User <80485061+chaordic-automation@users.noreply.github.com> Date: Tue, 17 May 2022 10:09:52 -0300 Subject: [PATCH 241/268] Updated config.yml --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8b137891..26423d9c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1 +1,2 @@ - +orbs: + node: circleci/node@5.0.2 \ No newline at end of file From a55ffe79cedc75964281a50e7063a6489a56b51b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20Zantut=20Nogueira?= Date: Mon, 30 May 2022 14:05:38 -0300 Subject: [PATCH 242/268] Add shutting-down state to active instances --- tools/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/utils.py b/tools/utils.py index 5064be61..5cfecb77 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -12,7 +12,7 @@ def get_active_instances(conn): active = [instance for res in conn.get_all_instances() for instance in res.instances if instance.state in set(['pending', 'running', - 'stopping', 'stopped'])] + 'stopping', 'stopped', 'shutting-down'])] return active def parse_nodes(active_instances, cluster_name): From 9223c02f94f6d0e6dd9c53ea555d34d8a2308f1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20Zantut=20Nogueira?= Date: Wed, 1 Jun 2022 17:23:02 -0300 Subject: [PATCH 243/268] Update CI --- .circleci/config.yml | 17 +++++++++++++++-- circle.yml | 3 --- 2 files changed, 15 insertions(+), 5 deletions(-) delete mode 100644 
circle.yml diff --git a/.circleci/config.yml b/.circleci/config.yml index 26423d9c..79ea469e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,2 +1,15 @@ -orbs: - node: circleci/node@5.0.2 \ No newline at end of file +version: 2.1 + +# Define the jobs we want to run for this project +jobs: + build: + docker: + - image: openjdk:8-jdk-oraclelinux7 + steps: + - run: echo "build job is not implemented" + +# Orchestrate our job run sequence +workflows: + build: + jobs: + - build \ No newline at end of file diff --git a/circle.yml b/circle.yml deleted file mode 100644 index abd78de2..00000000 --- a/circle.yml +++ /dev/null @@ -1,3 +0,0 @@ -machine: - java: - version: oraclejdk8 From 2ee8f07e651101a4f16a29daa81d8ab549c4a82b Mon Sep 17 00:00:00 2001 From: AllanRolli Date: Wed, 8 Jun 2022 10:04:45 -0300 Subject: [PATCH 244/268] updated destroy function inside module cluster.py --- tools/cluster.py | 55 ++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 8cc12f98..778ccfaf 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -335,35 +335,36 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {0} after failures'.format(cluster_name)) -def destroy(cluster_name, delete_groups=False, region=default_region, wait_termination=False, wait_timeout_minutes=10): +def destroy(cluster_name, wait_termination=False, wait_timeout_minutes=10, delete_groups=False, region=default_region,script_timeout_total_minutes=55,script_timeout_inactivity_minutes=10): assert not delete_groups, 'Delete groups is deprecated and unsupported' masters, slaves = get_active_nodes(cluster_name, region=region) - - all_instances = masters + slaves - if all_instances: - log.info('The following instances will be terminated:') - for i in all_instances: - log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) - - log.info('Terminating master...') - for i in 
masters: - i.terminate() - - log.info('Terminating slaves...') - for i in slaves: - i.terminate() - - if wait_termination: - log.info('Waiting for instances termination...') - termination_timeout = wait_timeout_minutes*60 - termination_start = time.time() - while wait_termination and all_instances and time.time() < termination_start+termination_timeout: - all_instances = [i for i in all_instances if i.state != 'terminated'] - time.sleep(5) - for i in all_instances: - i.update() - - log.info('Done.') + + try: # First we test if exist the cluster with the function cluster_exists + if cluster_exists(cluster_name,region): + # Here we use the script to destroy the cluster using the name of it + call_ec2_script(['destroy','--assume-yes', cluster_name,'--ec2-vpc-id','vpc-94215df1'],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + all_instances = masters + slaves + # To better view about what the script is doing i choose to let the same code of the destroy i have updated + if all_instances: + log.info('The %s will be terminated:', cluster_name) + for i in all_instances: + log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) + + if wait_termination: + log.info('Waiting for instances termination...') + termination_timeout = wait_timeout_minutes*60 + termination_start = time.time() + + while wait_termination and all_instances and time.time() < termination_start+termination_timeout: + all_instances = [i for i in all_instances if i.state != 'terminated'] + time.sleep(5) + for i in all_instances: + i.update() + # The log says the destruction is Done but is still running, just chill and enjoy the ride + log.info('Done.') + # Here is the exception of the try if we don't find the cluster + except Exception as e: + print('Does not exist the cluster %s', cluster_name) def get_master(cluster_name, region=default_region): From dee4d21919de0b6e89b850e6a761c2d33e496852 Mon Sep 17 00:00:00 2001 From: Allan Rolli 
Date: Wed, 8 Jun 2022 16:02:51 -0300 Subject: [PATCH 245/268] Update cluster.py removed cluster_exists and added vpc inside variable --- tools/cluster.py | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 778ccfaf..60e3a8b4 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -59,6 +59,7 @@ default_collect_results_dir = '/tmp' default_user_data = os.path.join(script_path, 'scripts', 'noop') default_defaults_filename = 'cluster_defaults.json' +default_vpc='vpc-94215df1' master_post_create_commands = [ @@ -335,33 +336,31 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {0} after failures'.format(cluster_name)) -def destroy(cluster_name, wait_termination=False, wait_timeout_minutes=10, delete_groups=False, region=default_region,script_timeout_total_minutes=55,script_timeout_inactivity_minutes=10): +def destroy(cluster_name, wait_termination=False, wait_timeout_minutes=10, delete_groups=False, default_vpc, region=default_region,script_timeout_total_minutes=55,script_timeout_inactivity_minutes=10): assert not delete_groups, 'Delete groups is deprecated and unsupported' masters, slaves = get_active_nodes(cluster_name, region=region) - try: # First we test if exist the cluster with the function cluster_exists - if cluster_exists(cluster_name,region): - # Here we use the script to destroy the cluster using the name of it - call_ec2_script(['destroy','--assume-yes', cluster_name,'--ec2-vpc-id','vpc-94215df1'],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) - all_instances = masters + slaves - # To better view about what the script is doing i choose to let the same code of the destroy i have updated - if all_instances: - log.info('The %s will be terminated:', cluster_name) + try:# Here we use the script to destroy the cluster using the name of it + 
call_ec2_script(['destroy','--assume-yes', cluster_name,'--ec2-vpc-id',default_vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + all_instances = masters + slaves + # To better view about what the script is doing i choose to let the same code of the destroy i have updated + if all_instances: + log.info('The %s will be terminated:', cluster_name) + for i in all_instances: + log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) + + if wait_termination: + log.info('Waiting for instances termination...') + termination_timeout = wait_timeout_minutes*60 + termination_start = time.time() + + while wait_termination and all_instances and time.time() < termination_start+termination_timeout: + all_instances = [i for i in all_instances if i.state != 'terminated'] + time.sleep(5) for i in all_instances: - log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) - - if wait_termination: - log.info('Waiting for instances termination...') - termination_timeout = wait_timeout_minutes*60 - termination_start = time.time() - - while wait_termination and all_instances and time.time() < termination_start+termination_timeout: - all_instances = [i for i in all_instances if i.state != 'terminated'] - time.sleep(5) - for i in all_instances: - i.update() - # The log says the destruction is Done but is still running, just chill and enjoy the ride - log.info('Done.') + i.update() + # The log says the destruction is Done but is still running, just chill and enjoy the ride + log.info('Done.') # Here is the exception of the try if we don't find the cluster except Exception as e: print('Does not exist the cluster %s', cluster_name) From 5cf960a82bb4b1b926b5252e8f38e9ebacab0d7d Mon Sep 17 00:00:00 2001 From: Allan Rolli Date: Fri, 10 Jun 2022 14:03:10 -0300 Subject: [PATCH 246/268] Update utils.py --- tools/utils.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git 
a/tools/utils.py b/tools/utils.py index 5cfecb77..a33faa47 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -123,3 +123,44 @@ def check_call_with_timeout(args, stdin=None, stdout=None, stdall = 'STDOUT:\n{}\nSTDERR:\n{}'.format(stdout, stderr) raise subprocess.CalledProcessError(p.returncode, args, output=stdall) return p.returncode + +def check_call_with_timeout_describe(args, stdin=None, stdout=None, + stderr=None, shell=False, + timeout_total_minutes=0, + timeout_inactivity_minutes=0): + stdout = stdout or sys.stdout + stderr = stderr or sys.stderr + begin_time_total = time.time() + begin_time_inactivity = time.time() + p = subprocess.Popen(args, + stdin=stdin, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=shell, + universal_newlines=False) + while True: + if read_from_to(p.stdout, stdout): + begin_time_inactivity = time.time() + if read_from_to(p.stderr, stderr): + begin_time_inactivity = time.time() + if p.poll() is not None: + break + terminate_by_total_timeout = timeout_total_minutes > 0 and time.time() - begin_time_total > (timeout_total_minutes * 60) + terminate_by_inactivity_timeout = timeout_inactivity_minutes > 0 and time.time() - begin_time_inactivity > (timeout_inactivity_minutes * 60) + if terminate_by_inactivity_timeout or terminate_by_total_timeout: + p.terminate() + for i in range(100): + if p.poll is not None: + break + time.sleep(0.1) + p.kill() + message = 'Terminated by inactivity' if terminate_by_inactivity_timeout else 'Terminated by total timeout' + raise ProcessTimeoutException(message) + time.sleep(0.5) + read_from_to(p.stdout, stdout) + read_from_to(p.stderr, stderr) + if p.returncode != 0: + stdall = 'STDOUT:\n{}\nSTDERR:\n{}'.format(stdout, stderr) + raise subprocess.CalledProcessError(p.returncode, args, output=stdall) + if len(args) > 5: + return args[5] From 5148803d65ed760f9fc5d0a58eb46a3a899358b1 Mon Sep 17 00:00:00 2001 From: Allan Rolli Date: Fri, 10 Jun 2022 14:05:41 -0300 Subject: [PATCH 247/268] Update 
cluster.py --- tools/cluster.py | 56 ++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 60e3a8b4..821a6f9a 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -13,7 +13,7 @@ import subprocess from subprocess import check_output, check_call from utils import tag_instances, get_masters, get_active_nodes -from utils import check_call_with_timeout +from utils import check_call_with_timeout, check_call_with_timeout_describe import os import sys from datetime import datetime @@ -147,6 +147,13 @@ def call_ec2_script(args, timeout_total_minutes, timeout_inactivity_minutes, std stdout=stdout, timeout_total_minutes=timeout_total_minutes, timeout_inactivity_minutes=timeout_inactivity_minutes) +def call_ec2_script_describe(args, timeout_total_minutes, timeout_inactivity_minutes, stdout=None): + ec2_script_path = chdir_to_ec2_script_and_get_path() + return check_call_with_timeout_describe(['/usr/bin/env', 'python3', '-u', + ec2_script_path] + args, + stdout=stdout, + timeout_total_minutes=timeout_total_minutes, + timeout_inactivity_minutes=timeout_inactivity_minutes) def cluster_exists(cluster_name, region): @@ -336,34 +343,37 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {0} after failures'.format(cluster_name)) -def destroy(cluster_name, wait_termination=False, wait_timeout_minutes=10, delete_groups=False, default_vpc, region=default_region,script_timeout_total_minutes=55,script_timeout_inactivity_minutes=10): +def destroy(cluster_name, wait_termination=False, vpc=default_vpc, wait_timeout_minutes=10, delete_groups=False, region=default_region,script_timeout_total_minutes=55,script_timeout_inactivity_minutes=10): assert not delete_groups, 'Delete groups is deprecated and unsupported' masters, slaves = get_active_nodes(cluster_name, region=region) - try:# Here we use the script to destroy the cluster using the name of it - 
call_ec2_script(['destroy','--assume-yes', cluster_name,'--ec2-vpc-id',default_vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) - all_instances = masters + slaves - # To better view about what the script is doing i choose to let the same code of the destroy i have updated - if all_instances: - log.info('The %s will be terminated:', cluster_name) - for i in all_instances: - log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) - - if wait_termination: - log.info('Waiting for instances termination...') - termination_timeout = wait_timeout_minutes*60 - termination_start = time.time() - - while wait_termination and all_instances and time.time() < termination_start+termination_timeout: - all_instances = [i for i in all_instances if i.state != 'terminated'] - time.sleep(5) + try: # First we test if exist the cluster with the function cluster_exists + cluster = call_ec2_script_describe(['describe', cluster_name,'--ec2-vpc-id',vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + if cluster == cluster_name: + call_ec2_script(['destroy','--assume-yes', cluster_name,'--ec2-vpc-id',vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + # Here we use the script to destroy the cluster using the name of it + all_instances = masters + slaves + # To better view about what the script is doing i choose to let the same code of the destroy i have updated + if all_instances: + log.info('The %s will be terminated:', cluster_name) for i in all_instances: - i.update() - # The log says the destruction is Done but is still running, just chill and enjoy the ride - log.info('Done.') + log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) + + if wait_termination: + log.info('Waiting for instances termination...') + termination_timeout = wait_timeout_minutes*60 + 
termination_start = time.time() + + while wait_termination and all_instances and time.time() < termination_start+termination_timeout: + all_instances = [i for i in all_instances if i.state != 'terminated'] + time.sleep(5) + for i in all_instances: + i.update() + # The log says the destruction is Done but is still running, just chill and enjoy the ride + log.info('Done.') # Here is the exception of the try if we don't find the cluster except Exception as e: - print('Does not exist the cluster %s', cluster_name) + log.info('Does not exist %s', cluster_name) def get_master(cluster_name, region=default_region): From 268e6d0fcf153e5370bc38429497fa998a7f0062 Mon Sep 17 00:00:00 2001 From: Emerson Ferreira Date: Wed, 15 Jun 2022 16:34:09 -0300 Subject: [PATCH 248/268] Fixing the destroy function on flintrock. --- tools/cluster.py | 79 +++++++++++++++++++++++++++++++++++------------- tools/utils.py | 6 ++++ 2 files changed, 64 insertions(+), 21 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 821a6f9a..ef42e646 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -7,12 +7,13 @@ """ +from cgitb import reset import argh from argh import ArghParser, CommandError from argh.decorators import named, arg import subprocess from subprocess import check_output, check_call -from utils import tag_instances, get_masters, get_active_nodes +from utils import tag_instances, get_masters, get_active_nodes, get_active_nodes_by_tag from utils import check_call_with_timeout, check_call_with_timeout_describe import os import sys @@ -343,37 +344,73 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {0} after failures'.format(cluster_name)) +def destroy_by_flyntrock(region, cluster_name, vpc=default_vpc, script_timeout_total_minutes=55, script_timeout_inactivity_minutes=10, wait_termination=False, wait_timeout_minutes=10): + # create a variable to store the result + result = False + + try: # create a try catch to manage the possible erros + 
cluster = call_ec2_script_describe(['describe', cluster_name,'--ec2-vpc-id',vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + if cluster == cluster_name: + call_ec2_script(['destroy','--assume-yes', cluster_name,'--ec2-vpc-id',vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + result = True + except Exception as e: + #log.info('Error to destroy cluster {0} by flintrock'.format(cluster_name)) + destroy_by_cluster_name_tag(region, 'spark_cluster_name', cluster_name, wait_termination, wait_timeout_minutes) + pass + + return result + +def destroy_by_cluster_name_tag(region, tag_name, cluster_name, wait_termination, wait_timeout_minutes): + instances = get_active_nodes_by_tag(region, tag_name, cluster_name) + + if instances: + #log.info('Trying to terminate remain instances by id.') + + for instance in instances: + #log.info('Terminate instance {0}'.format(instance.id)) + instance.terminate() + log.info('Instance {0} is terminating.'.format(instance.id)) + + # call this function to wait instances to terminate + wait_for_intances_to_terminate(cluster_name, wait_termination, wait_timeout_minutes, instances) + + return instances + + def destroy(cluster_name, wait_termination=False, vpc=default_vpc, wait_timeout_minutes=10, delete_groups=False, region=default_region,script_timeout_total_minutes=55,script_timeout_inactivity_minutes=10): assert not delete_groups, 'Delete groups is deprecated and unsupported' masters, slaves = get_active_nodes(cluster_name, region=region) try: # First we test if exist the cluster with the function cluster_exists - cluster = call_ec2_script_describe(['describe', cluster_name,'--ec2-vpc-id',vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) - if cluster == cluster_name: - call_ec2_script(['destroy','--assume-yes', 
cluster_name,'--ec2-vpc-id',vpc],timeout_total_minutes=script_timeout_total_minutes, timeout_inactivity_minutes=script_timeout_inactivity_minutes) + if(destroy_by_flyntrock(region, cluster_name, vpc, script_timeout_total_minutes, script_timeout_inactivity_minutes, wait_termination, wait_timeout_minutes)): # Here we use the script to destroy the cluster using the name of it all_instances = masters + slaves # To better view about what the script is doing i choose to let the same code of the destroy i have updated - if all_instances: - log.info('The %s will be terminated:', cluster_name) - for i in all_instances: - log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) - - if wait_termination: - log.info('Waiting for instances termination...') - termination_timeout = wait_timeout_minutes*60 - termination_start = time.time() - - while wait_termination and all_instances and time.time() < termination_start+termination_timeout: - all_instances = [i for i in all_instances if i.state != 'terminated'] - time.sleep(5) - for i in all_instances: - i.update() - # The log says the destruction is Done but is still running, just chill and enjoy the ride - log.info('Done.') + wait_for_intances_to_terminate(cluster_name, wait_termination, wait_timeout_minutes, all_instances) # Here is the exception of the try if we don't find the cluster except Exception as e: log.info('Does not exist %s', cluster_name) + pass + +def wait_for_intances_to_terminate(cluster_name, wait_termination=False, wait_timeout_minutes=10, all_instances=[]): + # To better view about what the script is doing i choose to let the same code of the destroy i have updated + if all_instances: + log.info('The %s will be terminated:', cluster_name) + for i in all_instances: + log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) + + if wait_termination: + log.info('Waiting for instances termination...') + termination_timeout = wait_timeout_minutes*60 + termination_start = time.time() + + while 
wait_termination and all_instances and time.time() < termination_start+termination_timeout: + all_instances = [i for i in all_instances if i.state != 'terminated'] + time.sleep(5) + for i in all_instances: + i.update() + # The log says the destruction is Done but is still running, just chill and enjoy the ride + log.info('Done.') def get_master(cluster_name, region=default_region): diff --git a/tools/utils.py b/tools/utils.py index a33faa47..4f8b175e 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -40,6 +40,12 @@ def get_active_nodes(cluster_name, region): return parse_nodes(active, cluster_name) +def get_active_nodes_by_tag(region, tag_name, tag_value): + conn = boto.ec2.connect_to_region(region) + filter = {"tag:{0}".format(tag_name):["{0}".format(tag_value)], "instance-state-name":["running"]} + return conn.get_only_instances(filters=filter) + + def tag_instances(cluster_name, tags, region): conn = boto.ec2.connect_to_region(region) From 5692e45642c6ea4c1fa06553b7b51bf23be19f3e Mon Sep 17 00:00:00 2001 From: Emerson Ferreira Date: Mon, 20 Jun 2022 14:25:58 -0300 Subject: [PATCH 249/268] Creating new methods to manage the destroy cluster before try to create a new cluster. 
--- tools/cluster.py | 10 ++++-- tools/utils.py | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 3 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index ef42e646..e970aa64 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -14,7 +14,7 @@ import subprocess from subprocess import check_output, check_call from utils import tag_instances, get_masters, get_active_nodes, get_active_nodes_by_tag -from utils import check_call_with_timeout, check_call_with_timeout_describe +from utils import check_call_with_timeout, check_call_with_timeout_describe, destroy_by_request_spot_ids import os import sys from datetime import datetime @@ -381,7 +381,11 @@ def destroy(cluster_name, wait_termination=False, vpc=default_vpc, wait_timeout_ assert not delete_groups, 'Delete groups is deprecated and unsupported' masters, slaves = get_active_nodes(cluster_name, region=region) - try: # First we test if exist the cluster with the function cluster_exists + try: # First we test if exist the cluster with the function cluster_exists + # get instances ids by json return and cancel the requests + wait_for_intances_to_terminate(cluster_name, wait_termination, wait_timeout_minutes, destroy_by_request_spot_ids(region, cluster_name)) + + # test if the cluster exists and call destroy by fintorock to destroy it if(destroy_by_flyntrock(region, cluster_name, vpc, script_timeout_total_minutes, script_timeout_inactivity_minutes, wait_termination, wait_timeout_minutes)): # Here we use the script to destroy the cluster using the name of it all_instances = masters + slaves @@ -397,7 +401,7 @@ def wait_for_intances_to_terminate(cluster_name, wait_termination=False, wait_ti if all_instances: log.info('The %s will be terminated:', cluster_name) for i in all_instances: - log.info('-> %s' % (i.public_dns_name or i.private_dns_name)) + log.info('-> %s' % (i.public_dns_name or i.private_dns_name or i.id)) if wait_termination: log.info('Waiting for instances 
termination...') diff --git a/tools/utils.py b/tools/utils.py index 4f8b175e..e579361a 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -5,6 +5,13 @@ import subprocess import select import time +import json +from os.path import exists +from os import makedirs +import os + +# get a folder_log_path from env variable +folder_log_path = os.getenv('LOG_FOLDER') logging.basicConfig(level=logging.INFO) @@ -45,6 +52,92 @@ def get_active_nodes_by_tag(region, tag_name, tag_value): filter = {"tag:{0}".format(tag_name):["{0}".format(tag_value)], "instance-state-name":["running"]} return conn.get_only_instances(filters=filter) +def get_requests_ids_by_cluster_name(cluster_name): + # create a array with the requests ids + requests_ids = [] + folder_full_path = os.path.abspath(os.getcwd()) + + if folder_log_path: + # check if the folder exists and if not create it + folderExist = exists(folder_log_path) + + if folderExist != True: + makedirs(folder_log_path) + + file_name = '{0}/{1}.json'.format(folder_log_path, cluster_name) + else: + file_name = '{0}.json'.format(cluster_name) + + # verify if the file exists + file_exists = exists(file_name) + + if file_exists: + # open a json log file if exists + json_file = open(file_name) + + # deserialize the json file to object + json_content = json.load(json_file) + + # create a array with the requests ids + for request_id in json_content: + requests_ids.append(str(request_id['SpotInstanceRequestId'])) + + return requests_ids + + +def destroy_by_request_spot_ids(region, cluster_name): + conn = boto.ec2.connect_to_region(region) + instances = [] + + try: + # get requets ids from json log file + request_ids = get_requests_ids_by_cluster_name(cluster_name) + logging.info('The amount of requests ids found in json log file: {0}'.format(len(request_ids))) + instances_cancelled = [] + + # test if the request has any id + if len(request_ids) > 0: + spot_requests = conn.get_all_spot_instance_requests() + for request in request_ids: + for 
spot_request in spot_requests: + if request == spot_request.id: + # cancel the requests returned before + conn.cancel_spot_instance_requests(request) + instances_cancelled.append(spot_request) + + # verify if the cancelled list is not empty + if len(instances_cancelled) > 0: + instances_ids = [] + + # create the instance list of machines based on requests ids + for request_cancelled in instances_cancelled: + if request_cancelled.instance_id: + instances_ids.append(request_cancelled.instance_id) + + # test if the instance id is not empty + if len(instances_ids) > 0: + instances_requested = conn.get_only_instances(instances_ids) + + # terminate instances from request spot + for instance in instances_requested: + # checking again if the object is in the list to not terminate wrong machines + if instances_ids.index(instance.id) > -1: + if instance.state == 'running': + logging.info('Terminating instance: {0}'.format(instance.id)) + # add only instances that are running to return list + instances.append(instance) + # terminate the instance + instance.terminate() + elif instance.state == 'shutting-down': + # add the instance to the wait list + instances.append(instance) + + except Exception as e: + logging.error('Error to destroy cluster {0} by request ids.'.format(cluster_name)) + pass + + return instances + def tag_instances(cluster_name, tags, region): conn = boto.ec2.connect_to_region(region) From 16e427be2adbaba8e7497f85d468effb56d7ae21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Tue, 21 Jun 2022 19:06:57 -0300 Subject: [PATCH 250/268] feat: new flintrock option for create cluster --- tools/cluster.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cluster.py b/tools/cluster.py index 8cc12f98..56a844de 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -302,6 +302,7 @@ def launch(cluster_name, slaves, '--ec2-security-group', security_group, '--ec2-user', installation_user, '--ec2-user-data', user_data, + '--launch-template-name', 
cluster_name, cluster_name] + spot_params + auth_params + From ce1f19df5561c84fca8b6c9400ccbe6c4b62b2be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Fri, 8 Jul 2022 10:13:27 -0300 Subject: [PATCH 251/268] feat: removing the zone parameter, due to multi az flintrock --- tools/cluster.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index d20a287b..0cb4e3b9 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -297,7 +297,6 @@ def launch(cluster_name, slaves, '--ec2-key-name', key_id, '--num-slaves', slaves, '--ec2-region', region, - '--ec2-availability-zone', zone, '--ec2-instance-type', instance_type, '--ec2-min-root-ebs-size-gb', min_root_ebs_size_gb, '--assume-yes', From 9280c4ce6df2eb736cd6eb89c6195455dcab71f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20Zantut=20Nogueira?= Date: Wed, 13 Jul 2022 17:17:38 -0300 Subject: [PATCH 252/268] Change installation and launch user from root to ec2-user --- remote_hook.sh | 14 +++++++------- tools/cluster.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 5078786b..25233a56 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -30,10 +30,10 @@ echo $$ > "${RUNNING_FILE}" # Let us read the spark home even when the image doesn't give us the permission -sudo chmod o+rx /root -sudo chmod -R o+rx /root/spark +sudo chmod o+rx /home/ec2-user +sudo chmod -R o+rx /home/ec2-user/spark -sudo mkdir -p /media/tmp/spark-events +mkdir -p /media/tmp/spark-events notify_error_and_exit() { description="${1}" @@ -70,7 +70,7 @@ install_and_run_zeppelin() { export ZEPPELIN_PORT="8081" export SPARK_HOME=$(get_first_present /root/spark /opt/spark ~/spark*/) export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH} --executor-memory ${SPARK_MEM_PARAM}" - sudo -E zeppelin/bin/zeppelin.sh + zeppelin/bin/zeppelin.sh else notify_error_and_exit "Zeppelin installation not found" fi @@ -88,7 +88,7 @@ install_and_run_jupyter() { export 
PYSPARK_DRIVER_PYTHON=$(which jupyter) export PYSPARK_DRIVER_PYTHON_OPTS="notebook --allow-root --ip=${SPARK_MASTER_HOST} --no-browser --port=8888" sudo $(which jupyter) toree install --spark_home="${SPARK_HOME}" --spark_opts="--master ${JOB_MASTER} --executor-memory ${SPARK_MEM_PARAM} --driver-memory ${DRIVER_HEAP_SIZE}" - sudo -E "${SPARK_HOME}/bin/pyspark" --master "${JOB_MASTER}" --executor-memory "${SPARK_MEM_PARAM}" --driver-memory "${DRIVER_HEAP_SIZE}" + ${SPARK_HOME}/bin/pyspark --master "${JOB_MASTER}" --executor-memory "${SPARK_MEM_PARAM}" --driver-memory "${DRIVER_HEAP_SIZE}" } trap "on_trap_exit" EXIT @@ -118,7 +118,7 @@ if [[ "${USE_YARN}" == "yes" ]]; then fi if [[ "${JOB_NAME}" == "shell" ]]; then - sudo -E ${SPARK_HOME}/bin/spark-shell --master "${JOB_MASTER}" --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" + ${SPARK_HOME}/bin/spark-shell --master "${JOB_MASTER}" --jars ${JAR_PATH} --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --executor-memory "${SPARK_MEM_PARAM}" || notify_error_and_exit "Execution failed for shell" elif [[ "${JOB_NAME}" == "zeppelin" ]]; then install_and_run_zeppelin elif [[ "${JOB_NAME}" == "jupyter" ]]; then @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" 
--runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" diff --git a/tools/cluster.py b/tools/cluster.py index e970aa64..629dc40c 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -55,7 +55,7 @@ default_spark_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/spark-{v}-bin-hadoop2.7.tgz' default_hdfs_download_source = 'https://s3.amazonaws.com/chaordic-ignition-public/hadoop-{v}.tar.gz' default_remote_user = 'ec2-user' -default_installation_user = 'root' +default_installation_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' default_collect_results_dir = '/tmp' default_user_data = os.path.join(script_path, 'scripts', 'noop') From aa39bb5a670b9101dd4307a8d2448bd2fc54d435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Wed, 13 Jul 2022 17:57:32 -0300 Subject: [PATCH 253/268] feat: add a function to delete Flintrock SG rules in dev environment - it is a module inside core/tools which is called with subprocess of python2 (cluster.py) due to the script (revoke_sg_rules.py) be written in python3. 
--- tools/cluster.py | 8 +++ tools/revoke_sg_rules.py | 107 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 tools/revoke_sg_rules.py diff --git a/tools/cluster.py b/tools/cluster.py index e970aa64..1025926e 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -383,6 +383,14 @@ def destroy(cluster_name, wait_termination=False, vpc=default_vpc, wait_timeout_ try: # First we test if exist the cluster with the function cluster_exists # get instances ids by json return and cancel the requests + + # if in dev environment, will delete the flintrock SG rules of the machine running this script + if os.getenv('ENVIRONMENT') == 'development': + revoke_sg_script = os.path.join(script_path, 'revoke_sg_rules.py') + process = subprocess.Popen(["python3", revoke_sg_script, region, vpc], stdout=subprocess.PIPE) + stdout_str = process.communicate()[0] + log.info(stdout_str) + wait_for_intances_to_terminate(cluster_name, wait_termination, wait_timeout_minutes, destroy_by_request_spot_ids(region, cluster_name)) # test if the cluster exists and call destroy by fintorock to destroy it diff --git a/tools/revoke_sg_rules.py b/tools/revoke_sg_rules.py new file mode 100644 index 00000000..40f5df72 --- /dev/null +++ b/tools/revoke_sg_rules.py @@ -0,0 +1,107 @@ +import urllib.request +import sys + +from botocore.exceptions import ClientError +import boto3 + + +def _get_security_group(region, vpc_id): + ec2 = boto3.client('ec2', region_name=region) + response = ec2.describe_security_groups( + Filters=[ + { + 'Name': 'vpc-id', + 'Values': [ + vpc_id, + ] + }, + ], + ) + return response + + +def _client_cidr(): + flintrock_client_ip = ( + urllib.request.urlopen('http://checkip.amazonaws.com/') + .read().decode('utf-8').strip()) + flintrock__client_cidr = '{ip}/32'.format(ip=flintrock_client_ip) + return flintrock__client_cidr + + +def _delete_rule(cidr_ip, ip_protocol, from_port, to_port, group_id, region): + ec2 = boto3.client('ec2', 
region_name=region) + ec2.revoke_security_group_ingress( + CidrIp=cidr_ip, + GroupId=group_id, + IpProtocol=ip_protocol, + FromPort=from_port, + ToPort=to_port + ) + +def revoke_flintrock_sg_ingress(region, vpc_id): + """Revoke Flintrock Security Group's Rules matched with the IP from + the current machine given the Region and VPC ID + + :param region: The AWS region where the VPC is located + :type region: str + :param vpc_id: The VPC ID where flintrock Security Group was created + :type vpc_id: str + :returns: a string with a message explaining the success or fail + :rtype: str + """ + + response = _get_security_group(region=region, vpc_id=vpc_id) + # variables required to delete rule + cidr_to_revoke_rules = _client_cidr() + group_id = '' + group_name = '' + from_port = '' + to_port = '' + ip_protocol = '' + # variable to store the success of the + # loop and give the right return message + success = False + + security_groups = response["SecurityGroups"] + + if len(security_groups) == 0: + return 'There is no security groups in the vpc {} at region {}'.format(vpc_id, region) + + for sg in security_groups: + group_id = sg['GroupId'] + group_name = sg['GroupName'] + if group_name == 'flintrock': + for ip in sg['IpPermissions']: + if 'FromPort' in ip: + from_port = ip['FromPort'] + ip_protocol = ip['IpProtocol'] + to_port = ip['ToPort'] + for cidr in ip['IpRanges']: + # identifying which rules contain the local IP range + if cidr['CidrIp'] == cidr_to_revoke_rules: + try: + _delete_rule( + cidr_ip=cidr['CidrIp'], + ip_protocol=ip_protocol, + from_port=from_port, + to_port=to_port, + group_id=group_id, + region=region + ) + success = True + except ClientError as error: + raise error + + + if not success: + return 'There is no rule from this client to delete in the vpc id: {}.'.format(vpc_id) + + else: + return 'Successfully deleted the rules from this client in the vpc id: {}.'.format(vpc_id) + + +if __name__ == '__main__': + region = sys.argv[1] + vpc_id = 
sys.argv[2] + result = revoke_flintrock_sg_ingress(region=region, vpc_id=vpc_id) + print(result) \ No newline at end of file From ce109e46eab145d3940206c409e33fad226ecabc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Wed, 13 Jul 2022 18:07:04 -0300 Subject: [PATCH 254/268] refactor: add new blank line at the end --- tools/revoke_sg_rules.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/revoke_sg_rules.py b/tools/revoke_sg_rules.py index 40f5df72..06f42b9e 100644 --- a/tools/revoke_sg_rules.py +++ b/tools/revoke_sg_rules.py @@ -104,4 +104,5 @@ def revoke_flintrock_sg_ingress(region, vpc_id): region = sys.argv[1] vpc_id = sys.argv[2] result = revoke_flintrock_sg_ingress(region=region, vpc_id=vpc_id) - print(result) \ No newline at end of file + print(result) + \ No newline at end of file From c81c362e1a81b226961aa80973f09b99cea5466c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo=20Pinto?= Date: Thu, 14 Jul 2022 15:40:17 -0300 Subject: [PATCH 255/268] Update tools/revoke_sg_rules.py Co-authored-by: Iury Krieger --- tools/revoke_sg_rules.py | 73 +++++++++++++++------------------------- 1 file changed, 28 insertions(+), 45 deletions(-) diff --git a/tools/revoke_sg_rules.py b/tools/revoke_sg_rules.py index 06f42b9e..c5ee51f6 100644 --- a/tools/revoke_sg_rules.py +++ b/tools/revoke_sg_rules.py @@ -51,53 +51,36 @@ def revoke_flintrock_sg_ingress(region, vpc_id): """ response = _get_security_group(region=region, vpc_id=vpc_id) - # variables required to delete rule - cidr_to_revoke_rules = _client_cidr() - group_id = '' - group_name = '' - from_port = '' - to_port = '' - ip_protocol = '' - # variable to store the success of the - # loop and give the right return message - success = False - security_groups = response["SecurityGroups"] + cidr_to_revoke_rules = _client_cidr() + + if not len(security_groups): + raise 'There is no security groups in the vpc {} at region {}'.format(vpc_id, region) - if len(security_groups) == 0: - 
return 'There is no security groups in the vpc {} at region {}'.format(vpc_id, region) - - for sg in security_groups: - group_id = sg['GroupId'] - group_name = sg['GroupName'] - if group_name == 'flintrock': - for ip in sg['IpPermissions']: - if 'FromPort' in ip: - from_port = ip['FromPort'] - ip_protocol = ip['IpProtocol'] - to_port = ip['ToPort'] - for cidr in ip['IpRanges']: - # identifying which rules contain the local IP range - if cidr['CidrIp'] == cidr_to_revoke_rules: - try: - _delete_rule( - cidr_ip=cidr['CidrIp'], - ip_protocol=ip_protocol, - from_port=from_port, - to_port=to_port, - group_id=group_id, - region=region - ) - success = True - except ClientError as error: - raise error - - - if not success: - return 'There is no rule from this client to delete in the vpc id: {}.'.format(vpc_id) - - else: - return 'Successfully deleted the rules from this client in the vpc id: {}.'.format(vpc_id) + for security_group in security_groups: + for ip_permission in security_group['IpPermissions']: + for ip_range in ip_permission['IpRanges']: + group_id = security_group['GroupId'] + group_name = security_group['GroupName'] + from_port = ip_permission['FromPort'] + ip_protocol = ip_permission['IpProtocol'] + to_port = ip_permission['ToPort'] + + if group_name == 'flintrock' and 'FromPort' in ip_permission and ip_range['CidrIp'] == cidr_to_revoke_rules: + try: + _delete_rule( + cidr_ip=ip_range['CidrIp'], + ip_protocol=ip_protocol, + from_port=from_port, + to_port=to_port, + group_id=group_id, + region=region + ) + except ClientError as error: + print('There is no rule from this client to delete in the vpc id: {}.'.format(vpc_id)) + raise error + + print('Successfully deleted the rules from this client in the vpc id: {}.'.format(vpc_id)) if __name__ == '__main__': From 12737d002a50347a462a04d6fb0b41deed4e0bcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Thu, 14 Jul 2022 16:21:04 -0300 Subject: [PATCH 256/268] refactor: fixing identation, style and add 
new function - add a boolean function to return true if a given cidr exists in a given security group id. - extends the function _get_security_group() to return an specific security group through the param `sg_name`. - add a logic to verifie if the rules were deleted before end the script. --- tools/revoke_sg_rules.py | 116 +++++++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 41 deletions(-) diff --git a/tools/revoke_sg_rules.py b/tools/revoke_sg_rules.py index c5ee51f6..1577f80f 100644 --- a/tools/revoke_sg_rules.py +++ b/tools/revoke_sg_rules.py @@ -5,7 +5,7 @@ import boto3 -def _get_security_group(region, vpc_id): +def _get_security_group(region, vpc_id, sg_name): ec2 = boto3.client('ec2', region_name=region) response = ec2.describe_security_groups( Filters=[ @@ -17,7 +17,13 @@ def _get_security_group(region, vpc_id): }, ], ) - return response + desired_sg = None + security_groups = response['SecurityGroups'] + for security_group in security_groups: + if security_group['GroupName'] == sg_name: + desired_sg = security_group + + return desired_sg def _client_cidr(): @@ -28,6 +34,29 @@ def _client_cidr(): return flintrock__client_cidr +def _exists_cidr_in_sg(region, cidr, sg_id): + """Boolean function to return `true` if a given cidr + exists in a given security group id. Otherwise returns + `false`. 
+ """ + ec2 = boto3.client('ec2', region_name=region) + response = ec2.describe_security_group_rules( + Filters=[ + { + 'Name': 'group-id', + 'Values': [ + sg_id, + ] + }, + ] + ) + rules = response['SecurityGroupRules'] + for rule in rules: + if rule['CidrIpv4'] == cidr: + return True + return False + + def _delete_rule(cidr_ip, ip_protocol, from_port, to_port, group_id, region): ec2 = boto3.client('ec2', region_name=region) ec2.revoke_security_group_ingress( @@ -37,55 +66,60 @@ def _delete_rule(cidr_ip, ip_protocol, from_port, to_port, group_id, region): FromPort=from_port, ToPort=to_port ) - + + def revoke_flintrock_sg_ingress(region, vpc_id): """Revoke Flintrock Security Group's Rules matched with the IP from the current machine given the Region and VPC ID - - :param region: The AWS region where the VPC is located - :type region: str - :param vpc_id: The VPC ID where flintrock Security Group was created - :type vpc_id: str - :returns: a string with a message explaining the success or fail - :rtype: str + :param `region`: The AWS region where the VPC is located + :type `region`: str + :param `vpc_id`: The VPC ID where flintrock Security Group was created + :type `vpc_id`: str """ - response = _get_security_group(region=region, vpc_id=vpc_id) - security_groups = response["SecurityGroups"] + flintrock_security_group = _get_security_group(region=region, vpc_id=vpc_id, sg_name='flintrock') cidr_to_revoke_rules = _client_cidr() + flintrock_group_id = flintrock_security_group['GroupId'] + + if flintrock_security_group['GroupName'] != 'flintrock': + print('Flintrock security groups doesn\'t exist in this vpc {} at region {}'.format(vpc_id, region)) + return # we don't want the script to ``raise`` an error, to not mess with the job_runner.py logs + + # check if the local IP is in some rule or not + if not _exists_cidr_in_sg(region=region, cidr=cidr_to_revoke_rules, sg_id=flintrock_group_id): + print('There is no rules with the IP of this client in Flintrock security 
group.') + return + + for ip_permission in flintrock_security_group['IpPermissions']: + for ip_range in ip_permission['IpRanges']: + group_id = flintrock_group_id + from_port = ip_permission['FromPort'] + ip_protocol = ip_permission['IpProtocol'] + to_port = ip_permission['ToPort'] + + if 'FromPort' in ip_permission and ip_range['CidrIp'] == cidr_to_revoke_rules: + try: + _delete_rule( + cidr_ip=ip_range['CidrIp'], + ip_protocol=ip_protocol, + from_port=from_port, + to_port=to_port, + group_id=group_id, + region=region + ) + except ClientError as error: + print(error) - if not len(security_groups): - raise 'There is no security groups in the vpc {} at region {}'.format(vpc_id, region) - - for security_group in security_groups: - for ip_permission in security_group['IpPermissions']: - for ip_range in ip_permission['IpRanges']: - group_id = security_group['GroupId'] - group_name = security_group['GroupName'] - from_port = ip_permission['FromPort'] - ip_protocol = ip_permission['IpProtocol'] - to_port = ip_permission['ToPort'] - - if group_name == 'flintrock' and 'FromPort' in ip_permission and ip_range['CidrIp'] == cidr_to_revoke_rules: - try: - _delete_rule( - cidr_ip=ip_range['CidrIp'], - ip_protocol=ip_protocol, - from_port=from_port, - to_port=to_port, - group_id=group_id, - region=region - ) - except ClientError as error: - print('There is no rule from this client to delete in the vpc id: {}.'.format(vpc_id)) - raise error - - print('Successfully deleted the rules from this client in the vpc id: {}.'.format(vpc_id)) + # check again to confirm if the rules were revoked + status = False + while not status: + status = _exists_cidr_in_sg(region=region, cidr=cidr_to_revoke_rules, sg_id=flintrock_group_id) + + print('Successfully deleted rules of this client from flintrock security group at vpc {}'.format(vpc_id)) if __name__ == '__main__': region = sys.argv[1] vpc_id = sys.argv[2] - result = revoke_flintrock_sg_ingress(region=region, vpc_id=vpc_id) - print(result) + 
revoke_flintrock_sg_ingress(region=region, vpc_id=vpc_id) \ No newline at end of file From bb8c938a1ce74d0895e7a76aff61aa363b0f1788 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Thu, 14 Jul 2022 16:39:18 -0300 Subject: [PATCH 257/268] fix: the while check was taking too long to return - it is due to requests that are made with boto3 --- tools/revoke_sg_rules.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tools/revoke_sg_rules.py b/tools/revoke_sg_rules.py index 1577f80f..d4ab31a0 100644 --- a/tools/revoke_sg_rules.py +++ b/tools/revoke_sg_rules.py @@ -111,11 +111,8 @@ def revoke_flintrock_sg_ingress(region, vpc_id): print(error) # check again to confirm if the rules were revoked - status = False - while not status: - status = _exists_cidr_in_sg(region=region, cidr=cidr_to_revoke_rules, sg_id=flintrock_group_id) - - print('Successfully deleted rules of this client from flintrock security group at vpc {}'.format(vpc_id)) + if not _exists_cidr_in_sg(region=region, cidr=cidr_to_revoke_rules, sg_id=flintrock_group_id): + print('Successfully deleted rules of this client from flintrock security group at vpc {}'.format(vpc_id)) if __name__ == '__main__': From 9e9ba1cc0457231080c09d3dcf50da61fc22acbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Wed, 20 Jul 2022 17:52:48 -0300 Subject: [PATCH 258/268] refactor: rename the function to destroy by request id and change in code style - now the function is to destroy by fleet id --- tools/cluster.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index 0cb4e3b9..0ca4618b 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -14,7 +14,7 @@ import subprocess from subprocess import check_output, check_call from utils import tag_instances, get_masters, get_active_nodes, get_active_nodes_by_tag -from utils import check_call_with_timeout, check_call_with_timeout_describe, destroy_by_request_spot_ids +from utils import 
check_call_with_timeout, check_call_with_timeout_describe, destroy_by_fleet_id import os import sys from datetime import datetime @@ -383,10 +383,10 @@ def destroy(cluster_name, wait_termination=False, vpc=default_vpc, wait_timeout_ try: # First we test if exist the cluster with the function cluster_exists # get instances ids by json return and cancel the requests - wait_for_intances_to_terminate(cluster_name, wait_termination, wait_timeout_minutes, destroy_by_request_spot_ids(region, cluster_name)) + wait_for_intances_to_terminate(cluster_name, wait_termination, wait_timeout_minutes, destroy_by_fleet_id(region, cluster_name)) # test if the cluster exists and call destroy by fintorock to destroy it - if(destroy_by_flyntrock(region, cluster_name, vpc, script_timeout_total_minutes, script_timeout_inactivity_minutes, wait_termination, wait_timeout_minutes)): + if destroy_by_flyntrock(region, cluster_name, vpc, script_timeout_total_minutes, script_timeout_inactivity_minutes, wait_termination, wait_timeout_minutes): # Here we use the script to destroy the cluster using the name of it all_instances = masters + slaves # To better view about what the script is doing i choose to let the same code of the destroy i have updated From c6418c7c90304eacb7378bc0f8f67d3b0bcfe6f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Wed, 20 Jul 2022 17:54:36 -0300 Subject: [PATCH 259/268] feat: now destroy the cluster by fleet id --- tools/utils.py | 113 +++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 60 deletions(-) diff --git a/tools/utils.py b/tools/utils.py index e579361a..447e1512 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +import ast import logging +from pprint import pprint import boto.ec2 import sys import subprocess @@ -52,87 +54,78 @@ def get_active_nodes_by_tag(region, tag_name, tag_value): filter = {"tag:{0}".format(tag_name):["{0}".format(tag_value)], "instance-state-name":["running"]} 
return conn.get_only_instances(filters=filter) -def get_requests_ids_by_cluster_name(cluster_name): +def get_fleet_id_by_cluster_name(cluster_name): # create a array with the requests ids - requests_ids = [] - folder_full_path = os.path.abspath(os.getcwd()) + fleet_id = '' + file_name = '{0}.json'.format(cluster_name) if folder_log_path: # check if the folder exists and if not create it - folderExist = exists(folder_log_path) - - if folderExist != True: + if not exists(folder_log_path): makedirs(folder_log_path) file_name = '{0}/{1}.json'.format(folder_log_path, cluster_name) - else: - file_name = '{0}.json'.format(cluster_name) # verify if the file exists - file_exists = exists(file_name) - - if file_exists: + if exists(file_name): # open a json log file if exists - json_file = open(file_name) - - # deserialize the json file to object - json_content = json.load(json_file) + with open(file_name) as json_file:# deserialize the json file to object + json_content = json.load(json_file) + + # create a array with the requests ids + for request in json_content: + fleet_id = str(request['FleetId']) - # create a array with the requests ids - for request_id in json_content: - requests_ids.append(str(request_id['SpotInstanceRequestId'])) + return fleet_id - return requests_ids - -def destroy_by_request_spot_ids(region, cluster_name): +def destroy_by_fleet_id(region, cluster_name): conn = boto.ec2.connect_to_region(region) + fleet_instances_ids = [] instances = [] try: # get requets ids from json log file - request_ids = get_requests_ids_by_cluster_name(cluster_name) - logging.info('The amount of requests ids found in json log file: {0}'.format(len(request_ids))) - instances_cancelled = [] - - # test if the request has any id - if len(request_ids) > 0: - spot_requests = conn.get_all_spot_instance_requests() - for request in request_ids: - for spot_request in spot_requests: - if request == spot_request.id: - # cancel the requests returned before - 
conn.cancel_spot_instance_requests(request) - instances_cancelled.append(spot_request) - - # verify if the cancelled list is not empty - if len(instances_cancelled) > 0: - instances_ids = [] + fleet_id = get_fleet_id_by_cluster_name(cluster_name) + logging.info('The fleet id found in json log file: {0}'.format(fleet_id)) - # create the instance list of machines based on requests ids - for request_cancelled in instances_cancelled: - if request_cancelled.instance_id: - instances_ids.append(request_cancelled.instance_id) - - # test if the instance id is not empty - if len(instances_ids) > 0: - instances_requested = conn.get_only_instances(instances_ids) - - # terminate instances from request spot - for instance in instances_requested: - # checking again if the object is in the list to not terminate wrong machines - if instances_ids.index(instance.id) > -1: - if instance.state == 'running': - logging.info('Terminating instance: {0}'.format(instance.id)) - # add only instances that are running to return list - instances.append(instance) - # terminate the instance - instance.terminate() - elif instance.state == 'shutting-down': - # add the instance to the wait list - instances.append(instance) + # call an external script to delete the fleet and retrieve the list of instances + delete_fleet_script = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'delete_fleet.py') + process = subprocess.Popen(["python3", delete_fleet_script, region, fleet_id], stdout=subprocess.PIPE) + stdout_str = process.communicate()[0] + + # the subprocess return a string with the character '\n' separating the delete message and the list of instances + stdout_str_split = stdout_str.split('\n') + + # message of fleet deletion + deleted_fleet = stdout_str_split[0] + logging.info(deleted_fleet) + + # getting the list of the string containing the list of istances + # e.g."['i-0e90a67a64693dc39', 'i-00889275ebe58bb7b', 'i-0982e3e6728044bef']" + fleet_instances = 
ast.literal_eval(stdout_str_split[1]) + fleet_instances_ids.extend(fleet_instances) + + # test if the instance id is not empty + if len(fleet_instances_ids) > 0: + instances_requested = conn.get_only_instances(fleet_instances_ids) + + # terminate instances from request spot + for instance in instances_requested: + # checking again if the object is in the list to not terminate wrong machines + if fleet_instances_ids.index(instance.id) > -1: + if instance.state == 'running': + logging.info('Terminating instance: {0}'.format(instance.id)) + # add only instances that are running to return list + instances.append(instance) + # terminate the instance + instance.terminate() + elif instance.state == 'shutting-down': + # add the instance to the wait list + instances.append(instance) except Exception as e: + logging.error(e) logging.error('Error to destroy cluster {0} by request ids.'.format(cluster_name)) pass From 0df5898730acb737d12ecde79e8df3a2ebefa481 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Wed, 20 Jul 2022 17:56:33 -0300 Subject: [PATCH 260/268] feat: script in python 3 to handle the fleet delete - it is called inside the util.py (python2) through subprocess.Popen(). - this script returns a message for the fleet deleted and a list containing the instance ids. 
--- tools/delete_fleet.py | 46 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tools/delete_fleet.py diff --git a/tools/delete_fleet.py b/tools/delete_fleet.py new file mode 100644 index 00000000..79aa6bbe --- /dev/null +++ b/tools/delete_fleet.py @@ -0,0 +1,46 @@ +import sys +from time import sleep + +import boto3 +from botocore.exceptions import ClientError + +def describe_fleets(region, fleet_id): + ec2 = boto3.client('ec2', region_name=region) + response = ec2.describe_fleets( + FleetIds=[ + fleet_id + ], + ) + + return response['Fleets'][0]['Instances'][0]['InstanceIds'] + +def delete_fleet(region, fleet_id): + ec2 = boto3.client('ec2', region_name=region) + response = ec2.delete_fleets( + FleetIds=[ + fleet_id, + ], + TerminateInstances=True + ) + + return response['SuccessfulFleetDeletions'][0]['CurrentFleetState'] + + +if __name__ == '__main__': + region = sys.argv[1] + fleet_id = sys.argv[2] + try: + # Delete the fleet + fleet_deleted_states = ["deleted", "deleted_running", "deleted_terminating"] + fleet_state = None + while fleet_state not in fleet_deleted_states: + sleep(5) + fleet_state = delete_fleet(region=region, fleet_id=fleet_id) + print(f"Fleet deleted. 
Fleet state: {fleet_state}") + + # get the instance ids from the fleet + print(describe_fleets(region=region, fleet_id=fleet_id)) + except (ClientError, Exception) as e: + print(e) + + From 26cd862cb26d68c26a5338c9854ba357e8a51ace Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dtalo?= Date: Thu, 28 Jul 2022 16:26:49 -0300 Subject: [PATCH 261/268] hotfix: fix the output when there is no json file with fleet id --- tools/delete_fleet.py | 11 +++++++---- tools/utils.py | 12 ++++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tools/delete_fleet.py b/tools/delete_fleet.py index 79aa6bbe..70f1ce0f 100644 --- a/tools/delete_fleet.py +++ b/tools/delete_fleet.py @@ -11,8 +11,12 @@ def describe_fleets(region, fleet_id): fleet_id ], ) - - return response['Fleets'][0]['Instances'][0]['InstanceIds'] + errors = response['Fleets'][0]['Errors'] + instances = response['Fleets'][0]['Instances'] + # to ensure we are returning an array anyway + if len(errors) > 0 and len(instances) == 0: + return [''] + return instances[0]['InstanceIds'] def delete_fleet(region, fleet_id): ec2 = boto3.client('ec2', region_name=region) @@ -42,5 +46,4 @@ def delete_fleet(region, fleet_id): print(describe_fleets(region=region, fleet_id=fleet_id)) except (ClientError, Exception) as e: print(e) - - + \ No newline at end of file diff --git a/tools/utils.py b/tools/utils.py index 447e1512..8c97d23f 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -85,8 +85,12 @@ def destroy_by_fleet_id(region, cluster_name): instances = [] try: - # get requets ids from json log file + # get fleet id from json log file fleet_id = get_fleet_id_by_cluster_name(cluster_name) + + if fleet_id in [None, '']: + raise Exception('There is no fleet id to delete. 
Keep going.') + logging.info('The fleet id found in json log file: {0}'.format(fleet_id)) # call an external script to delete the fleet and retrieve the list of instances @@ -106,8 +110,8 @@ def destroy_by_fleet_id(region, cluster_name): fleet_instances = ast.literal_eval(stdout_str_split[1]) fleet_instances_ids.extend(fleet_instances) - # test if the instance id is not empty - if len(fleet_instances_ids) > 0: + # test if the instance id is not empty and contains an instance id for sure + if len(fleet_instances_ids) > 0 and fleet_instances_ids[0].startswith('i-'): instances_requested = conn.get_only_instances(fleet_instances_ids) # terminate instances from request spot @@ -126,7 +130,7 @@ def destroy_by_fleet_id(region, cluster_name): except Exception as e: logging.error(e) - logging.error('Error to destroy cluster {0} by request ids.'.format(cluster_name)) + logging.error('Error to destroy cluster {0} by fleet id.'.format(cluster_name)) pass return instances From 421f95980ed8c22cb771527666deaab921dc4637 Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Fri, 29 Jul 2022 16:49:00 -0300 Subject: [PATCH 262/268] added javaagent parameter (spark-submit) to enable metrics by jmx protocol --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 5078786b..bee42f3e 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || 
notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" From ec8a2ef0ae3e71ba018b6115a9a463409a73a867 Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Wed, 3 Aug 2022 12:00:28 -0300 Subject: [PATCH 263/268] added parameter spark.metrics.conf --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index bee42f3e..c01df7e7 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" 
--driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --class "spark.metrics.conf=/tmp/jmx/metrics.properties" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" From 82f7fd83397188723402c89314483c2cd068bd71 Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Wed, 3 Aug 2022 16:15:32 -0300 Subject: [PATCH 264/268] added conf spark.metrics.conf --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index c01df7e7..573c888b 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --class "spark.metrics.conf=/tmp/jmx/metrics.properties" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc 
-XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --conf "spark.metrics.conf=/tmp/jmx/metrics.properties" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" From fbb8128a523793bb9fcc730ea1464094179f8b5d Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Thu, 4 Aug 2022 21:12:13 -0300 Subject: [PATCH 265/268] added --packages parameter in spark-submit command --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 5969e00a..84ceec72 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --conf "spark.metrics.conf=/tmp/jmx/metrics.properties" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf 
"spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --conf "spark.metrics.conf=/tmp/jmx/metrics.properties" --repositories "https://raw.github.com/banzaicloud/spark-metrics/master/maven-repo/releases" --packages "com.banzaicloud:spark-metrics_2.11:2.4-1.0.6,io.prometheus:simpleclient:0.3.0,io.prometheus:simpleclient_dropwizard:0.3.0,io.prometheus:simpleclient_pushgateway:0.3.0,io.dropwizard.metrics:metrics-core:3.1.2" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" From 44bece63a15766f09f9c6526a4834bd5d00c0f0d Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Tue, 9 Aug 2022 09:35:38 -0300 Subject: [PATCH 266/268] removed spark.metrics.conf parameter from spark-submit --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 84ceec72..a8cbe8ee 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --conf "spark.metrics.conf=/tmp/jmx/metrics.properties" --repositories "https://raw.github.com/banzaicloud/spark-metrics/master/maven-repo/releases" --packages 
"com.banzaicloud:spark-metrics_2.11:2.4-1.0.6,io.prometheus:simpleclient:0.3.0,io.prometheus:simpleclient_dropwizard:0.3.0,io.prometheus:simpleclient_pushgateway:0.3.0,io.dropwizard.metrics:metrics-core:3.1.2" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --repositories "https://mvnrepository.com/artifact/com.banzaicloud/spark-metrics" --packages "com.banzaicloud:spark-metrics_2.11:2.4-1.0.6,io.prometheus:simpleclient:0.3.0,io.prometheus:simpleclient_dropwizard:0.3.0,io.prometheus:simpleclient_pushgateway:0.3.0,io.dropwizard.metrics:metrics-core:3.1.2" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" From 98594ff2171b695011612fa07fb118a3fa74fe47 Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Tue, 9 Aug 2022 17:08:45 -0300 Subject: [PATCH 267/268] removed --repositories and --packages parameters from spark-submit --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index a8cbe8ee..ac9b828f 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else 
JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --repositories "https://mvnrepository.com/artifact/com.banzaicloud/spark-metrics" --packages "com.banzaicloud:spark-metrics_2.11:2.4-1.0.6,io.prometheus:simpleclient:0.3.0,io.prometheus:simpleclient_dropwizard:0.3.0,io.prometheus:simpleclient_pushgateway:0.3.0,io.dropwizard.metrics:metrics-core:3.1.2" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" From 90983fbb8c017c920118bf1a1a9bfea7ca05c459 Mon Sep 17 00:00:00 2001 From: Daniel Dantas Date: Mon, 15 Aug 2022 11:13:15 -0300 Subject: [PATCH 268/268] removed javaagent parameter --- remote_hook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index ac9b828f..903d618e 100755 --- 
a/remote_hook.sh +++ b/remote_hook.sh @@ -126,7 +126,7 @@ elif [[ "${JOB_NAME}" == "jupyter" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --conf "spark.driver.extraJavaOptions=-javaagent:/tmp/jmx/jmx_prometheus_javaagent-0.17.0.jar=9095:/tmp/jmx/spark.yml" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + ${SPARK_HOME}/bin/spark-submit --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/media/tmp -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" "$@" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS"