From f9ffdd5fc5c1a1b539c9e1fc29e1cab1c8829e85 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 7 May 2015 11:43:52 -0300 Subject: [PATCH 01/80] Fixed typo --- src/main/scala/ignition/core/jobs/utils/RDDUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index fc42ded5..7e75d5ec 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -84,7 +84,7 @@ object RDDUtils { rdd.aggregateByKey(List.empty[V])( (lst, v) => if (lst.size >= n) { - logger.warn(s"Ignoring value '$v' due aggregation result of size '${lst.size}' is bigger then n = '$n'") + logger.warn(s"Ignoring value '$v' due aggregation result of size '${lst.size}' is bigger than n=$n") lst } else { v :: lst From b0e168e5d79b2cd46fd9722eca572fb358e3d421 Mon Sep 17 00:00:00 2001 From: ZaGo Date: Fri, 8 May 2015 13:38:26 -0300 Subject: [PATCH 02/80] refactoring to allow changes in ignition.mail --- .../ignition/core/jobs/utils/SparkContextUtils.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 29c32112..a1090d20 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -130,6 +130,13 @@ object SparkContextUtils { } + def getTextFiles(paths: Seq[String], synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { + if (synchLocally) + processTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths) + else + processTextFiles(paths, minimumPaths) + } + def filterAndGetTextFiles(path: String, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, @@ -144,10 +151,7 @@ object SparkContextUtils { val paths = getFilteredPaths(Seq(path), requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) if (paths.size < minimumPaths) throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") - else if (synchLocally) - processTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths) - else - processTextFiles(paths, minimumPaths) + getTextFiles(paths, synchLocally, forceSynch, minimumPaths) } private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { From a8e9734b844bf9d5bdbe0572c0a7e247399983de Mon Sep 17 00:00:00 2001 From: Filipe Niero Felisbino Date: Fri, 8 May 2015 15:49:25 -0300 Subject: [PATCH 03/80] Fix ec2 request issue --- tools/spark-ec2/spark_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 5fdf0467..a608f9ce 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -540,7 +540,7 @@ def launch_cluster(conn, opts, cluster_name): (invalid[0].id, invalid[0].status.message)) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves - reservations = conn.get_all_reservations(active_instance_ids) + reservations = conn.get_all_reservations([r.instance_id for r in active_instance_ids]) slave_nodes = [] for r in reservations: slave_nodes += r.instances From 9ae5178549af17b57a19e0ff2fefcb385c5401bf Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 18 May 2015 10:41:19 -0300 Subject: [PATCH 04/80] Minor improvements --- build.sbt | 2 +- src/main/scala/ignition/core/utils/DateUtils.scala | 10 +++++++++- src/main/scala/ignition/core/utils/FutureUtils.scala | 4 +++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index 095c1228..4dfcd1ae 100644 --- a/build.sbt +++ b/build.sbt @@ -17,7 +17,7 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "com.github.nscala-time" %% "nscala-time" % "0.8.0" +libraryDependencies += "com.github.nscala-time" %% "nscala-time" % "2.0.0" libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" diff --git a/src/main/scala/ignition/core/utils/DateUtils.scala b/src/main/scala/ignition/core/utils/DateUtils.scala index 231817c7..c3fb5163 100644 --- a/src/main/scala/ignition/core/utils/DateUtils.scala +++ b/src/main/scala/ignition/core/utils/DateUtils.scala @@ -1,6 +1,6 @@ package ignition.core.utils -import org.joda.time.{Period, DateTimeZone, DateTime} +import org.joda.time.{Seconds, Period, DateTimeZone, DateTime} import org.joda.time.format.ISODateTimeFormat object DateUtils { @@ -21,4 +21,12 @@ object DateUtils { def isEqualOrBefore(other: DateTime) = dateTime.isBefore(other) || dateTime.saneEqual(other) } + + implicit class SecondsImprovements(val seconds: Seconds) { + + implicit def toScalaDuration: scala.concurrent.duration.FiniteDuration = { + scala.concurrent.duration.Duration(seconds.getSeconds, scala.concurrent.duration.SECONDS) + } + + } } diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 068d63bc..81b0490e 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,10 +1,12 @@ package ignition.core.utils -import scala.concurrent.{ExecutionContext, Future, Promise} +import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} import scala.util.{Failure, Success} object FutureUtils { + def blockingFuture[T](body: =>T)(implicit ec: ExecutionContext): Future[T] = future { blocking { body } } + implicit class FutureImprovements[V](future: Future[V]) { def toOptionOnFailure(errorHandler: (Throwable) => Option[V])(implicit ec: ExecutionContext): Future[Option[V]] = { future.map(Option.apply).recover { case t => errorHandler(t) } From 53cfe885d21307acb4072260f68d6d2f718dc746 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 18 May 2015 17:05:35 -0300 Subject: [PATCH 05/80] remove unused lib --- build.sbt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 4dfcd1ae..046d9503 100644 --- a/build.sbt +++ b/build.sbt @@ -17,8 +17,6 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "com.github.nscala-time" %% "nscala-time" % "2.0.0" - libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" @@ -29,6 +27,10 @@ libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0" libraryDependencies += "net.java.dev.jets3t" % "jets3t" % "0.7.1" +libraryDependencies += "joda-time" % "joda-time" % "2.7" + +libraryDependencies += "org.joda" % "joda-convert" % "1.7" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" From d965fd6ad12bbf2fadf9302837ec7b242661eba8 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 1 Jun 2015 14:10:58 -0300 Subject: [PATCH 06/80] Added utilitary function for better stack traces --- .../scala/ignition/core/utils/BetterTrace.scala | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/main/scala/ignition/core/utils/BetterTrace.scala diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala new file mode 100644 index 00000000..158e261e --- /dev/null +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -0,0 +1,13 @@ +package ignition.core.utils + +// Used mainly to augment scalacheck traces in scalatest +trait BetterTrace { + def fail(message: String): Nothing + def withBetterTrace(block: => Unit): Unit = + try { + block + } catch { + case t: Throwable => fail(s"${t.getMessage}: ${t.getStackTraceString}") + } + +} From c89961984bbd4be54c63366d4df5b915a25c89fc Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 8 Jun 2015 18:08:22 -0300 Subject: [PATCH 07/80] Updated scalatest to fix conflicts --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 046d9503..be7e1b12 100644 --- a/build.sbt +++ b/build.sbt @@ -17,7 +17,7 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" +libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.1.3" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" From 82a09c0ff017484bfbada7d1a4b451e7c288a025 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 11 Jun 2015 16:34:05 -0300 Subject: [PATCH 08/80] Improved s3 service --- src/main/scala/ignition/core/utils/S3Client.scala | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala index f02d7acd..a988aa7f 100644 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ b/src/main/scala/ignition/core/utils/S3Client.scala @@ -37,7 +37,14 @@ class S3Client { } def list(bucket: String, key: String): Array[S3Object] = { - service.listObjects(bucket, key, null, 99999L) + service.listObjectsChunked(bucket, key, null, 99999L, null, true).getObjects + } + + def copyFile(sourceBucket: String, sourceKey: String, destBucket: String, destKey: String, destContentType: Option[String] = None): Unit = { + val destFile = new S3Object(destKey) + val replaceMetaData = destContentType.isDefined + destContentType.foreach(contentType => destFile.setContentType(contentType)) + service.copyObject(sourceBucket, sourceKey, destBucket, destFile, replaceMetaData) } def fileExists(bucket: String, key: String): Boolean = { From c32cce56fab86bc3372bd17adc1f7745f3d0c797 Mon Sep 17 00:00:00 2001 From: Flavio Sales Truzzi Date: Thu, 11 Jun 2015 19:11:30 -0300 Subject: [PATCH 09/80] Add optinal content type --- src/main/scala/ignition/core/utils/S3Client.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala index a988aa7f..fe509a4b 100644 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ b/src/main/scala/ignition/core/utils/S3Client.scala @@ -26,9 +26,9 @@ class S3Client { null, null, jets3tProperties ) - def writeContent(bucket: String, key: String, content: String): S3Object = { + def writeContent(bucket: String, key: String, content: String, contentType: String = "text/plain"): S3Object = { val obj = new S3Object(key, content) - obj.setContentType("text/plain") + obj.setContentType(contentType) service.putObject(bucket, obj) } From 8f51a86897eb401c9190a8e1fbc2e40359e8a678 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 12 Jun 2015 16:27:35 -0300 Subject: [PATCH 10/80] Added content encoding --- src/main/scala/ignition/core/utils/S3Client.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/S3Client.scala b/src/main/scala/ignition/core/utils/S3Client.scala index fe509a4b..b806b376 100644 --- a/src/main/scala/ignition/core/utils/S3Client.scala +++ b/src/main/scala/ignition/core/utils/S3Client.scala @@ -40,9 +40,13 @@ class S3Client { service.listObjectsChunked(bucket, key, null, 99999L, null, true).getObjects } - def copyFile(sourceBucket: String, sourceKey: String, destBucket: String, destKey: String, destContentType: Option[String] = None): Unit = { + def copyFile(sourceBucket: String, sourceKey: String, + destBucket: String, destKey: String, + destContentType: Option[String] = None, + destContentEncoding: Option[String] = None): Unit = { val destFile = new S3Object(destKey) - val replaceMetaData = destContentType.isDefined + val replaceMetaData = destContentType.isDefined || destContentEncoding.isDefined + destContentEncoding.foreach(encoding => destFile.setContentEncoding(encoding)) destContentType.foreach(contentType => destFile.setContentType(contentType)) service.copyObject(sourceBucket, sourceKey, destBucket, destFile, replaceMetaData) } From c752d9379edc91e37c261eed5610dfe09a3a06bf Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 18 Jun 2015 20:01:26 -0300 Subject: [PATCH 11/80] Upgraded scalatest --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index be7e1b12..c4723faf 100644 --- a/build.sbt +++ b/build.sbt @@ -17,7 +17,7 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.1.3" +libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.2.4" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" From 59f818da5aedc7dd919eca2d6e58f21208672316 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 23 Jun 2015 18:01:01 -0300 Subject: [PATCH 12/80] Added removeEmpty to Maps --- src/main/scala/ignition/core/utils/CollectionUtils.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index 27977270..5994b153 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -103,4 +103,10 @@ object CollectionUtils { .toList } } + + + implicit class CollectionMap[K, V <: TraversableOnce[Any]](map: Map[K, V]) { + def removeEmpty(): Map[K, V] = + map.filter { case (k, v) => !v.isEmpty } + } } From 842ca9dba49ed76ddedb4990779928b01d46cfc3 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 23 Jun 2015 18:01:47 -0300 Subject: [PATCH 13/80] Added removeEmpty to Maps --- src/main/scala/ignition/core/utils/CollectionUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index 5994b153..52828ca7 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -107,6 +107,6 @@ object CollectionUtils { implicit class CollectionMap[K, V <: TraversableOnce[Any]](map: Map[K, V]) { def removeEmpty(): Map[K, V] = - map.filter { case (k, v) => !v.isEmpty } + map.filter { case (k, v) => v.nonEmpty } } } From d05f836d8967657fd6df96293d65e013f45861e5 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 25 Jun 2015 09:34:14 -0300 Subject: [PATCH 14/80] exclude slf4j-log4j12 backend --- build.sbt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index c4723faf..7eb2bffe 100644 --- a/build.sbt +++ b/build.sbt @@ -13,7 +13,9 @@ ideaExcludeFolders += ".idea_modules" // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided").exclude("org.apache.hadoop", "hadoop-client") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided") + .exclude("org.apache.hadoop", "hadoop-client") + .exclude("org.slf4j", "slf4j-log4j12") libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") From c9abcd53eb2d839d61c017e0b574e00be911be4c Mon Sep 17 00:00:00 2001 From: sisso Date: Tue, 30 Jun 2015 14:41:15 -0300 Subject: [PATCH 15/80] added method that allow to map future using success/failure --- .../ignition/core/utils/FutureUtils.scala | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 81b0490e..41cf75a3 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,7 +1,7 @@ package ignition.core.utils import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} -import scala.util.{Failure, Success} +import scala.util.{Failure, Success, Try} object FutureUtils { @@ -11,6 +11,23 @@ object FutureUtils { def toOptionOnFailure(errorHandler: (Throwable) => Option[V])(implicit ec: ExecutionContext): Future[Option[V]] = { future.map(Option.apply).recover { case t => errorHandler(t) } } + + /** + * Appear to be redundant. But its the only way to map a future with + * Success and Failure in same algorithm without split it to use map/recover + * or transform. + * + * future.asTry.map { case Success(v) => 1; case Failure(e) => 0 } + * + * instead + * + * future.map(i=>1).recover(case _: Exception => 0) + * future.transform(=> 1, => 0) + * + */ + def asTry()(implicit ec: ExecutionContext) : Future[Try[V]] = { + future.map(v => Success(v)).recover { case e: Exception => Failure(e) } + } } implicit class FutureGeneratorImprovements[V](generator: Iterable[() => Future[V]]){ From 48f4e2c2798c3d5a2262234ae001ae6cb5ad5cc6 Mon Sep 17 00:00:00 2001 From: sisso Date: Tue, 7 Jul 2015 18:09:12 -0300 Subject: [PATCH 16/80] change catch to NonFatal --- src/main/scala/ignition/core/utils/FutureUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 41cf75a3..95b44c2f 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -1,6 +1,7 @@ package ignition.core.utils import scala.concurrent.{ExecutionContext, Future, Promise, blocking, future} +import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} object FutureUtils { @@ -22,11 +23,10 @@ object FutureUtils { * instead * * future.map(i=>1).recover(case _: Exception => 0) - * future.transform(=> 1, => 0) * */ def asTry()(implicit ec: ExecutionContext) : Future[Try[V]] = { - future.map(v => Success(v)).recover { case e: Exception => Failure(e) } + future.map(v => Success(v)).recover { case NonFatal(e) => Failure(e) } } } From bab487acfb4e74c7115a764d5774f5eefaa40630 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 13 Aug 2015 15:54:42 -0300 Subject: [PATCH 17/80] attempt to update spark_ec2.py --- tools/spark-ec2/spark_ec2.py | 708 ++++++++++++++++++++++++----------- 1 file changed, 482 insertions(+), 226 deletions(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index a608f9ce..8cc44d30 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -19,9 +19,11 @@ # limitations under the License. # -from __future__ import with_statement +from __future__ import division, print_function, with_statement +import codecs import hashlib +import itertools import logging import os import os.path @@ -36,13 +38,20 @@ import tempfile import textwrap import time -import urllib2 import warnings from datetime import datetime from optparse import OptionParser from sys import stderr -SPARK_EC2_VERSION = "1.3.0" +if sys.version < "3": + from urllib2 import urlopen, Request, HTTPError +else: + from urllib.request import urlopen, Request + from urllib.error import HTTPError + raw_input = input + xrange = range + +SPARK_EC2_VERSION = "1.4.0" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -60,14 +69,84 @@ "1.2.0", "1.2.1", "1.3.0", + "1.3.1", + "1.4.0", ]) +SPARK_TACHYON_MAP = { + "1.0.0": "0.4.1", + "1.0.1": "0.4.1", + "1.0.2": "0.4.1", + "1.1.0": "0.5.0", + "1.1.1": "0.5.0", + "1.2.0": "0.5.0", + "1.2.1": "0.5.0", + "1.3.0": "0.5.0", + "1.3.1": "0.5.0", + "1.4.0": "0.6.4", +} + DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark" # Default location to get the spark-ec2 scripts (and ami-list) from -DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/mesos/spark-ec2" -DEFAULT_SPARK_EC2_BRANCH = "branch-1.3" +DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/amplab/spark-ec2" +DEFAULT_SPARK_EC2_BRANCH = "branch-1.4" + + +def setup_external_libs(libs): + """ + Download external libraries from PyPI to SPARK_EC2_DIR/lib/ and prepend them to our PATH. + """ + PYPI_URL_PREFIX = "https://pypi.python.org/packages/source" + SPARK_EC2_LIB_DIR = os.path.join(SPARK_EC2_DIR, "lib") + + if not os.path.exists(SPARK_EC2_LIB_DIR): + print("Downloading external libraries that spark-ec2 needs from PyPI to {path}...".format( + path=SPARK_EC2_LIB_DIR + )) + print("This should be a one-time operation.") + os.mkdir(SPARK_EC2_LIB_DIR) + + for lib in libs: + versioned_lib_name = "{n}-{v}".format(n=lib["name"], v=lib["version"]) + lib_dir = os.path.join(SPARK_EC2_LIB_DIR, versioned_lib_name) + + if not os.path.isdir(lib_dir): + tgz_file_path = os.path.join(SPARK_EC2_LIB_DIR, versioned_lib_name + ".tar.gz") + print(" - Downloading {lib}...".format(lib=lib["name"])) + download_stream = urlopen( + "{prefix}/{first_letter}/{lib_name}/{lib_name}-{lib_version}.tar.gz".format( + prefix=PYPI_URL_PREFIX, + first_letter=lib["name"][:1], + lib_name=lib["name"], + lib_version=lib["version"] + ) + ) + with open(tgz_file_path, "wb") as tgz_file: + tgz_file.write(download_stream.read()) + with open(tgz_file_path, "rb") as tar: + if hashlib.md5(tar.read()).hexdigest() != lib["md5"]: + print("ERROR: Got wrong md5sum for {lib}.".format(lib=lib["name"]), file=stderr) + sys.exit(1) + tar = tarfile.open(tgz_file_path) + tar.extractall(path=SPARK_EC2_LIB_DIR) + tar.close() + os.remove(tgz_file_path) + print(" - Finished downloading {lib}.".format(lib=lib["name"])) + sys.path.insert(1, lib_dir) + + +# Only PyPI libraries are supported. +external_libs = [ + { + "name": "boto", + "version": "2.34.0", + "md5": "5556223d2d0cc4d06dd4829e671dcecd" + } +] + +setup_external_libs(external_libs) import boto from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType @@ -107,7 +186,7 @@ def parse_args(): help="Master instance type (leave empty for same as instance-type)") parser.add_option( "-r", "--region", default="us-east-1", - help="EC2 region zone to launch instances in") + help="EC2 region used to launch instances in, or to find them in (default: %default)") parser.add_option( "-z", "--zone", default="", help="Availability zone to launch instances in, or 'all' to spread " + @@ -133,9 +212,19 @@ def parse_args(): "--spark-ec2-git-branch", default=DEFAULT_SPARK_EC2_BRANCH, help="Github repo branch of spark-ec2 to use (default: %default)") + parser.add_option( + "--deploy-root-dir", + default=None, + help="A directory to copy into / on the first master. " + + "Must be absolute. Note that a trailing slash is handled as per rsync: " + + "If you omit it, the last directory of the --deploy-root-dir path will be created " + + "in / before copying its contents. If you append the trailing slash, " + + "the directory is not created and its contents are copied directly into /. " + + "(default: %default).") parser.add_option( "--hadoop-major-version", default="1", - help="Major version of Hadoop (default: %default)") + help="Major version of Hadoop. Valid options are 1 (Hadoop 1.0.4), 2 (CDH 4.7.1), yarn " + + "(Hadoop 2.4.0) (default: %default)") parser.add_option( "-D", metavar="[ADDRESS:]PORT", dest="proxy_port", help="Use SSH dynamic port forwarding to create a SOCKS proxy at " + @@ -155,7 +244,7 @@ def parse_args(): help="Number of EBS volumes to attach to each node as /vol[x]. " + "The volumes will be deleted when the instances terminate. " + "Only possible on EBS-backed AMIs. " + - "EBS volumes are only attached if --ebs-vol-size > 0." + + "EBS volumes are only attached if --ebs-vol-size > 0. " + "Only support up to 8 EBS volumes.") parser.add_option( "--placement-group", type="string", default=None, @@ -187,14 +276,15 @@ def parse_args(): help="Launch fresh slaves, but use an existing stopped master if possible") parser.add_option( "--worker-instances", type="int", default=1, - help="Number of instances per worker: variable SPARK_WORKER_INSTANCES (default: %default)") + help="Number of instances per worker: variable SPARK_WORKER_INSTANCES. Not used if YARN " + + "is used as Hadoop major version (default: %default)") parser.add_option( "--master-opts", type="string", default="", help="Extra options to give to master through SPARK_MASTER_OPTS variable " + "(e.g -Dspark.worker.timeout=180)") parser.add_option( "--user-data", type="string", default="", - help="Path to a user-data file (most AMI's interpret this as an initialization script)") + help="Path to a user-data file (most AMIs interpret this as an initialization script)") parser.add_option( "--security-group-prefix", type="string", default=None, help="Use this prefix for the security group rather than the cluster name.") @@ -204,6 +294,10 @@ def parse_args(): parser.add_option( "--additional-security-group", type="string", default="", help="Additional security group to place the machines in") + parser.add_option( + "--additional-tags", type="string", default="", + help="Additional tags to set on the machines; tags are comma-separated, while name and " + + "value are colon separated; ex: \"Task:MySparkProject,Env:production\"") parser.add_option( "--copy-aws-credentials", action="store_true", default=False, help="Add AWS credentials to hadoop configuration to allow Spark to access S3") @@ -216,6 +310,17 @@ def parse_args(): parser.add_option( "--spot-timeout", type="int", default=45, help="Maximum amount of time (in minutes) to wait for spot requests to be fulfilled") + parser.add_option( + "--private-ips", action="store_true", default=False, + help="Use private IPs for instances rather than public if VPC/subnet " + + "requires that.") + parser.add_option( + "--instance-initiated-shutdown-behavior", default="terminate", + choices=["stop", "terminate"], + help="Whether instances should terminate when shut down or just stop") + parser.add_option( + "--instance-profile-name", default=None, + help="IAM profile name to launch instances under") (opts, args) = parser.parse_args() if len(args) != 2: @@ -228,14 +333,16 @@ def parse_args(): home_dir = os.getenv('HOME') if home_dir is None or not os.path.isfile(home_dir + '/.boto'): if not os.path.isfile('/etc/boto.cfg'): - if os.getenv('AWS_ACCESS_KEY_ID') is None: - print >> stderr, ("ERROR: The environment variable AWS_ACCESS_KEY_ID " + - "must be set") - sys.exit(1) - if os.getenv('AWS_SECRET_ACCESS_KEY') is None: - print >> stderr, ("ERROR: The environment variable AWS_SECRET_ACCESS_KEY " + - "must be set") - sys.exit(1) + # If there is no boto config, check aws credentials + if not os.path.isfile(home_dir + '/.aws/credentials'): + if os.getenv('AWS_ACCESS_KEY_ID') is None: + print("ERROR: The environment variable AWS_ACCESS_KEY_ID must be set", + file=stderr) + sys.exit(1) + if os.getenv('AWS_SECRET_ACCESS_KEY') is None: + print("ERROR: The environment variable AWS_SECRET_ACCESS_KEY must be set", + file=stderr) + sys.exit(1) return (opts, action, cluster_name) @@ -246,7 +353,7 @@ def get_or_make_group(conn, name, vpc_id): if len(group) > 0: return group[0] else: - print "Creating security group " + name + print("Creating security group " + name) return conn.create_security_group(name, "Spark EC2 group", vpc_id) def check_if_http_resource_exists(resource): @@ -270,12 +377,12 @@ def get_validate_spark_version(version, repo): if check_if_http_resource_exists: return version else: - print >> stderr, "Unable to validate pre-built spark version {version}".format(version=version) + print("Unable to validate pre-built spark version {version}".format(version=version), file=stderr) sys.exit(1) elif "." in version: version = version.replace("v", "") if version not in VALID_SPARK_VERSIONS: - print >> stderr, "Don't know about Spark version: {v}".format(v=version) + print("Don't know about Spark version: {v}".format(v=version), file=stderr) sys.exit(1) return version else: @@ -288,84 +395,93 @@ def get_validate_spark_version(version, repo): return version -# Check whether a given EC2 instance object is in a state we consider active, -# i.e. not terminating or terminated. We count both stopping and stopped as -# active since we can restart stopped clusters. -def is_active(instance): - return (instance.state in ['pending', 'running', 'stopping', 'stopped']) - - # Source: http://aws.amazon.com/amazon-linux-ami/instance-type-matrix/ -# Last Updated: 2014-06-20 +# Last Updated: 2015-06-19 # For easy maintainability, please keep this manually-inputted dictionary sorted by key. EC2_INSTANCE_TYPES = { "c1.medium": "pvm", "c1.xlarge": "pvm", + "c3.large": "pvm", + "c3.xlarge": "pvm", "c3.2xlarge": "pvm", "c3.4xlarge": "pvm", "c3.8xlarge": "pvm", - "c3.large": "pvm", - "c3.xlarge": "pvm", + "c4.large": "hvm", + "c4.xlarge": "hvm", + "c4.2xlarge": "hvm", + "c4.4xlarge": "hvm", + "c4.8xlarge": "hvm", "cc1.4xlarge": "hvm", "cc2.8xlarge": "hvm", "cg1.4xlarge": "hvm", "cr1.8xlarge": "hvm", + "d2.xlarge": "hvm", + "d2.2xlarge": "hvm", + "d2.4xlarge": "hvm", + "d2.8xlarge": "hvm", + "g2.2xlarge": "hvm", + "g2.8xlarge": "hvm", "hi1.4xlarge": "pvm", "hs1.8xlarge": "pvm", + "i2.xlarge": "hvm", "i2.2xlarge": "hvm", "i2.4xlarge": "hvm", "i2.8xlarge": "hvm", - "i2.xlarge": "hvm", - "m1.large": "pvm", - "m1.medium": "pvm", "m1.small": "pvm", + "m1.medium": "pvm", + "m1.large": "pvm", "m1.xlarge": "pvm", + "m2.xlarge": "pvm", "m2.2xlarge": "pvm", "m2.4xlarge": "pvm", - "m2.xlarge": "pvm", - "m3.2xlarge": "hvm", - "m3.large": "hvm", "m3.medium": "hvm", + "m3.large": "hvm", "m3.xlarge": "hvm", + "m3.2xlarge": "hvm", + "m4.large": "hvm", + "m4.xlarge": "hvm", + "m4.2xlarge": "hvm", + "m4.4xlarge": "hvm", + "m4.10xlarge": "hvm", + "r3.large": "hvm", + "r3.xlarge": "hvm", "r3.2xlarge": "hvm", "r3.4xlarge": "hvm", "r3.8xlarge": "hvm", - "r3.large": "hvm", - "r3.xlarge": "hvm", "t1.micro": "pvm", - "t2.medium": "hvm", "t2.micro": "hvm", "t2.small": "hvm", - "d2.2xlarge": "hvm", - "d2.4xlarge": "hvm", - "d2.8xlarge": "hvm", - "d2.large": "hvm", - "d2.xlarge": "hvm", + "t2.medium": "hvm", + "t2.large": "hvm", } +def get_tachyon_version(spark_version): + return SPARK_TACHYON_MAP.get(spark_version, "") + + # Attempt to resolve an appropriate AMI given the architecture and region of the request. -def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branch): - if instance_type in EC2_INSTANCE_TYPES: - instance_type = EC2_INSTANCE_TYPES[instance_type] +def get_spark_ami(opts): + if opts.instance_type in EC2_INSTANCE_TYPES: + instance_type = EC2_INSTANCE_TYPES[opts.instance_type] else: instance_type = "pvm" - print >> stderr,\ - "Don't recognize %s, assuming type is pvm" % instance_type + print("Don't recognize %s, assuming type is pvm" % opts.instance_type, file=stderr) # URL prefix from which to fetch AMI information ami_prefix = "{r}/{b}/ami-list".format( - r=spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), - b=spark_ec2_git_branch) + r=opts.spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), + b=opts.spark_ec2_git_branch) - ami_path = "%s/%s/%s" % (ami_prefix, region, instance_type) + ami_path = "%s/%s/%s" % (ami_prefix, opts.region, instance_type) + reader = codecs.getreader("ascii") try: - ami = urllib2.urlopen(ami_path).read().strip() - print "Spark AMI for %s: %s" % (instance_type, ami) + ami = reader(urlopen(ami_path)).read().strip() except: - print >> stderr, "Could not resolve AMI at: " + ami_path + print("Could not resolve AMI at: " + ami_path, file=stderr) sys.exit(1) + print("Spark AMI: " + ami) return ami @@ -375,11 +491,11 @@ def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branc # Fails if there already instances running in the cluster's groups. def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: - print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." + print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr) sys.exit(1) if opts.key_pair is None: - print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." + print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr) sys.exit(1) user_data_content = None @@ -387,7 +503,7 @@ def launch_cluster(conn, opts, cluster_name): with open(opts.user_data) as user_data_file: user_data_content = user_data_file.read() - print "Setting up security groups..." + print("Setting up security groups...") if opts.security_group_prefix is None: master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id) slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id) @@ -421,6 +537,17 @@ def launch_cluster(conn, opts, cluster_name): master_group.authorize('tcp', 50070, 50070, authorized_address) master_group.authorize('tcp', 60070, 60070, authorized_address) master_group.authorize('tcp', 4040, 4045, authorized_address) + # Rstudio (GUI for R) needs port 8787 for web access + master_group.authorize('tcp', 8787, 8787, authorized_address) + # HDFS NFS gateway requires 111,2049,4242 for tcp & udp + master_group.authorize('tcp', 111, 111, authorized_address) + master_group.authorize('udp', 111, 111, authorized_address) + master_group.authorize('tcp', 2049, 2049, authorized_address) + master_group.authorize('udp', 2049, 2049, authorized_address) + master_group.authorize('tcp', 4242, 4242, authorized_address) + master_group.authorize('udp', 4242, 4242, authorized_address) + # RM in YARN mode uses 8088 + master_group.authorize('tcp', 8088, 8088, authorized_address) if opts.ganglia: master_group.authorize('tcp', 5080, 5080, authorized_address) if slave_group.rules == []: # Group was just now created @@ -451,13 +578,13 @@ def launch_cluster(conn, opts, cluster_name): existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): - print >> stderr, ("ERROR: There are already instances running in " + - "group %s or %s" % (master_group.name, slave_group.name)) + print("ERROR: There are already instances running in group %s or %s" % + (master_group.name, slave_group.name), file=stderr) sys.exit(1) # Figure out Spark AMI if opts.ami is None: - opts.ami = get_spark_ami(opts.instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) + opts.ami = get_spark_ami(opts) if opts.master_ami is None: opts.master_ami = get_spark_ami(opts.master_instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) @@ -468,12 +595,12 @@ def launch_cluster(conn, opts, cluster_name): additional_group_ids = [sg.id for sg in conn.get_all_security_groups() if opts.additional_security_group in (sg.name, sg.id)] - print "Launching instances..." + print("Launching instances...") try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: - print >> stderr, "Could not find AMI " + opts.ami + print("Could not find AMI " + opts.ami, file=stderr) sys.exit(1) try: @@ -502,8 +629,8 @@ def launch_cluster(conn, opts, cluster_name): # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price - print ("Requesting %d slaves as spot instances with price $%.3f" % - (opts.slaves, opts.spot_price)) + print("Requesting %d slaves as spot instances with price $%.3f" % + (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 @@ -522,12 +649,13 @@ def launch_cluster(conn, opts, cluster_name): block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, - user_data=user_data_content) + user_data=user_data_content, + instance_profile_name=opts.instance_profile_name) my_req_ids += [req.id for req in slave_reqs] i += 1 start_time = datetime.now() - print "Waiting for spot instances to be granted... Request IDs: %s " % my_req_ids + print("Waiting for spot instances to be granted... Request IDs: %s " % my_req_ids) try: while True: time.sleep(10) @@ -539,28 +667,28 @@ def launch_cluster(conn, opts, cluster_name): raise Exception("Invalid state for spot request: %s - status: %s" % (invalid[0].id, invalid[0].status.message)) if len(active_instance_ids) == opts.slaves: - print "All %d slaves granted" % opts.slaves + print("All %d slaves granted" % opts.slaves) reservations = conn.get_all_reservations([r.instance_id for r in active_instance_ids]) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: - print "%d of %d slaves granted, waiting longer" % ( - len(active_instance_ids), opts.slaves) + print("%d of %d slaves granted, waiting longer" % ( + len(active_instance_ids), opts.slaves)) if (datetime.now() - start_time).seconds > opts.spot_timeout * 60: raise Exception("Timed out while waiting for spot instances") except: - print "Error: %s" % sys.exc_info()[1] - print "Canceling spot instance requests" + print("Error: %s" % sys.exc_info()[1]) + print("Canceling spot instance requests") conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: - print >> stderr, ("WARNING: %d instances are still running" % running) + print(("WARNING: %d instances are still running" % running), file=stderr) sys.exit(0) else: # Launch non-spot instances @@ -571,24 +699,30 @@ def launch_cluster(conn, opts, cluster_name): for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: - slave_res = image.run(key_name=opts.key_pair, - security_group_ids=[slave_group.id] + additional_group_ids, - instance_type=opts.instance_type, - placement=zone, - min_count=num_slaves_this_zone, - max_count=num_slaves_this_zone, - block_device_map=block_map, - subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content) + slave_res = image.run( + key_name=opts.key_pair, + security_group_ids=[slave_group.id] + additional_group_ids, + instance_type=opts.instance_type, + placement=zone, + min_count=num_slaves_this_zone, + max_count=num_slaves_this_zone, + block_device_map=block_map, + subnet_id=opts.subnet_id, + placement_group=opts.placement_group, + user_data=user_data_content, + instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, + instance_profile_name=opts.instance_profile_name) slave_nodes += slave_res.instances - print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, - zone, slave_res.id) + print("Launched {s} slave{plural_s} in {z}, regid = {r}".format( + s=num_slaves_this_zone, + plural_s=('' if num_slaves_this_zone == 1 else 's'), + z=zone, + r=slave_res.id)) i += 1 # Launch or resume masters if existing_masters: - print "Starting master..." + print("Starting master...") for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() @@ -599,72 +733,92 @@ def launch_cluster(conn, opts, cluster_name): master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name - master_res = master_image.run(key_name=opts.key_pair, - security_group_ids=[master_group.id] + additional_group_ids, - instance_type=master_type, - placement=opts.zone, - min_count=1, - max_count=1, - block_device_map=block_map, - subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content) + master_res = master_image.run( + key_name=opts.key_pair, + security_group_ids=[master_group.id] + additional_group_ids, + instance_type=master_type, + placement=opts.zone, + min_count=1, + max_count=1, + block_device_map=block_map, + subnet_id=opts.subnet_id, + placement_group=opts.placement_group, + user_data=user_data_content, + instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, + instance_profile_name=opts.instance_profile_name) master_nodes = master_res.instances - print "Launched master in %s, regid = %s" % (zone, master_res.id) + print("Launched master in %s, regid = %s" % (zone, master_res.id)) # This wait time corresponds to SPARK-4983 - print "Waiting for AWS to propagate instance metadata..." + print("Waiting for AWS to propagate instance metadata...") time.sleep(5) - # Give the instances descriptive names + + # Give the instances descriptive names and set additional tags + additional_tags = {} + if opts.additional_tags.strip(): + additional_tags = dict( + map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',') + ) + for master in master_nodes: - master.add_tag( - key='Name', - value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) + master.add_tags( + dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) + ) + for slave in slave_nodes: - slave.add_tag( - key='Name', - value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) + slave.add_tags( + dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) + ) # Return all the instances return (master_nodes, slave_nodes) -# Get the EC2 instances in an existing cluster if available. -# Returns a tuple of lists of EC2 instance objects for the masters and slaves def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): - print "Searching for existing cluster " + cluster_name + "..." - reservations = conn.get_all_reservations() - master_nodes = [] - slave_nodes = [] - for res in reservations: - active = [i for i in res.instances if is_active(i)] - for inst in active: - group_names = [g.name for g in inst.groups] - if (cluster_name + "-master") in group_names: - master_nodes.append(inst) - elif (cluster_name + "-slaves") in group_names: - slave_nodes.append(inst) - if any((master_nodes, slave_nodes)): - print "Found %d master(s), %d slaves" % (len(master_nodes), len(slave_nodes)) - if master_nodes != [] or not die_on_error: - return (master_nodes, slave_nodes) - else: - if master_nodes == [] and slave_nodes != []: - print >> sys.stderr, "ERROR: Could not find master in group " + cluster_name + "-master" - else: - print >> sys.stderr, "ERROR: Could not find any existing cluster" + """ + Get the EC2 instances in an existing cluster if available. + Returns a tuple of lists of EC2 instance objects for the masters and slaves. + """ + print("Searching for existing cluster {c} in region {r}...".format( + c=cluster_name, r=opts.region)) + + def get_instances(group_names): + """ + Get all non-terminated instances that belong to any of the provided security groups. + + EC2 reservation filters and instance states are documented here: + http://docs.aws.amazon.com/cli/latest/reference/ec2/describe-instances.html#options + """ + reservations = conn.get_all_reservations( + filters={"instance.group-name": group_names}) + instances = itertools.chain.from_iterable(r.instances for r in reservations) + return [i for i in instances if i.state not in ["shutting-down", "terminated"]] + + master_instances = get_instances([cluster_name + "-master"]) + slave_instances = get_instances([cluster_name + "-slaves"]) + + if any((master_instances, slave_instances)): + print("Found {m} master{plural_m}, {s} slave{plural_s}.".format( + m=len(master_instances), + plural_m=('' if len(master_instances) == 1 else 's'), + s=len(slave_instances), + plural_s=('' if len(slave_instances) == 1 else 's'))) + + if not master_instances and die_on_error: + print("ERROR: Could not find a master for cluster {c} in region {r}.".format( + c=cluster_name, r=opts.region), file=sys.stderr) sys.exit(1) + return (master_instances, slave_instances) + # Deploy configuration files and run setup scripts on a newly launched # or started EC2 cluster. - - def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): - master = master_nodes[0].public_dns_name + master = get_dns_name(master_nodes[0], opts.private_ips) if deploy_ssh_key: - print "Generating cluster's SSH key on master..." + print("Generating cluster's SSH key on master...") key_setup = """ [ -f ~/.ssh/id_rsa ] || (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa && @@ -672,24 +826,29 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): """ ssh(master, opts, key_setup) dot_ssh_tar = ssh_read(master, opts, ['tar', 'c', '.ssh']) - print "Transferring cluster's SSH key to slaves..." + print("Transferring cluster's SSH key to slaves...") for slave in slave_nodes: - print slave.public_dns_name - ssh_write(slave.public_dns_name, opts, ['tar', 'x'], dot_ssh_tar) + slave_address = get_dns_name(slave, opts.private_ips) + print(slave_address) + ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar) modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', - 'mapreduce', 'spark-standalone', 'tachyon'] + 'mapreduce', 'spark-standalone', 'tachyon', 'rstudio'] if opts.hadoop_major_version == "1": - modules = filter(lambda x: x != "mapreduce", modules) + modules = list(filter(lambda x: x != "mapreduce", modules)) if opts.ganglia: modules.append('ganglia') + # Clear SPARK_WORKER_INSTANCES if running on YARN + if opts.hadoop_major_version == "yarn": + opts.worker_instances = "" + # NOTE: We should clone the repository before running deploy_files to # prevent ec2-variables.sh from being overwritten - print "Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format( - r=opts.spark_ec2_git_repo, b=opts.spark_ec2_git_branch) + print("Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format( + r=opts.spark_ec2_git_repo, b=opts.spark_ec2_git_branch)) ssh( host=master, opts=opts, @@ -699,7 +858,7 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): b=opts.spark_ec2_git_branch) ) - print "Deploying files to master..." + print("Deploying files to master...") deploy_files( conn=conn, root_dir=SPARK_EC2_DIR + "/" + "deploy.generic", @@ -709,18 +868,26 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): modules=modules ) - print "Running setup on master..." + if opts.deploy_root_dir is not None: + print("Deploying {s} to master...".format(s=opts.deploy_root_dir)) + deploy_user_files( + root_dir=opts.deploy_root_dir, + opts=opts, + master_nodes=master_nodes + ) + + print("Running setup on master...") setup_spark_cluster(master, opts) - print "Done!" + print("Done!") def setup_spark_cluster(master, opts): ssh(master, opts, "chmod u+x spark-ec2/setup.sh") ssh(master, opts, "spark-ec2/setup.sh") - print "Spark standalone cluster started at http://%s:8080" % master + print("Spark standalone cluster started at http://%s:8080" % master) if opts.ganglia: - print "Ganglia started at http://%s:5080/ganglia" % master + print("Ganglia started at http://%s:5080/ganglia" % master) def is_ssh_available(host, opts, print_ssh_output=True): @@ -737,7 +904,7 @@ def is_ssh_available(host, opts, print_ssh_output=True): if s.returncode != 0 and print_ssh_output: # extra leading newline is for spacing in wait_for_cluster_state() - print textwrap.dedent("""\n + print(textwrap.dedent("""\n Warning: SSH connection error. (This could be temporary.) Host: {h} SSH return code: {r} @@ -746,7 +913,7 @@ def is_ssh_available(host, opts, print_ssh_output=True): h=host, r=s.returncode, o=cmd_output.strip() - ) + )) return s.returncode == 0 @@ -756,7 +923,8 @@ def is_cluster_ssh_available(cluster_instances, opts): Check if SSH is available on all the instances in a cluster. """ for i in cluster_instances: - if not is_ssh_available(host=i.ip_address, opts=opts): + dns_name = get_dns_name(i, opts.private_ips) + if not is_ssh_available(host=dns_name, opts=opts): return False else: return True @@ -786,7 +954,11 @@ def wait_for_cluster_state(conn, opts, cluster_instances, cluster_state): for i in cluster_instances: i.update() - statuses = conn.get_all_instance_status(instance_ids=[i.id for i in cluster_instances]) + max_batch = 100 + statuses = [] + for j in xrange(0, len(cluster_instances), max_batch): + batch = [i.id for i in cluster_instances[j:j + max_batch]] + statuses.extend(conn.get_all_instance_status(instance_ids=batch)) if cluster_state == 'ssh-ready': if all(i.state == 'running' for i in cluster_instances) and \ @@ -806,63 +978,78 @@ def wait_for_cluster_state(conn, opts, cluster_instances, cluster_state): sys.stdout.write("\n") end_time = datetime.now() - print "Cluster is now in '{s}' state. Waited {t} seconds.".format( + print("Cluster is now in '{s}' state. Waited {t} seconds.".format( s=cluster_state, t=(end_time - start_time).seconds - ) + )) # Get number of local disks available for a given EC2 instance type. def get_num_disks(instance_type): # Source: http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html - # Last Updated: 2014-06-20 + # Last Updated: 2015-06-19 # For easy maintainability, please keep this manually-inputted dictionary sorted by key. disks_by_instance = { "c1.medium": 1, "c1.xlarge": 4, + "c3.large": 2, + "c3.xlarge": 2, "c3.2xlarge": 2, "c3.4xlarge": 2, "c3.8xlarge": 2, - "c3.large": 2, - "c3.xlarge": 2, + "c4.large": 0, + "c4.xlarge": 0, + "c4.2xlarge": 0, + "c4.4xlarge": 0, + "c4.8xlarge": 0, "cc1.4xlarge": 2, "cc2.8xlarge": 4, "cg1.4xlarge": 2, "cr1.8xlarge": 2, + "d2.xlarge": 3, + "d2.2xlarge": 6, + "d2.4xlarge": 12, + "d2.8xlarge": 24, "g2.2xlarge": 1, + "g2.8xlarge": 2, "hi1.4xlarge": 2, "hs1.8xlarge": 24, + "i2.xlarge": 1, "i2.2xlarge": 2, "i2.4xlarge": 4, "i2.8xlarge": 8, - "i2.xlarge": 1, - "m1.large": 2, - "m1.medium": 1, "m1.small": 1, + "m1.medium": 1, + "m1.large": 2, "m1.xlarge": 4, + "m2.xlarge": 1, "m2.2xlarge": 1, "m2.4xlarge": 2, - "m2.xlarge": 1, - "m3.2xlarge": 2, - "m3.large": 1, "m3.medium": 1, + "m3.large": 1, "m3.xlarge": 2, + "m3.2xlarge": 2, + "m4.large": 0, + "m4.xlarge": 0, + "m4.2xlarge": 0, + "m4.4xlarge": 0, + "m4.10xlarge": 0, + "r3.large": 1, + "r3.xlarge": 1, "r3.2xlarge": 1, "r3.4xlarge": 1, "r3.8xlarge": 2, - "r3.large": 1, - "r3.xlarge": 1, "t1.micro": 0, - 'd2.xlarge': 3, - 'd2.2xlarge': 6, - 'd2.4xlarge': 12, - 'd2.8xlarge': 24, + "t2.micro": 0, + "t2.small": 0, + "t2.medium": 0, + "t2.large": 0, } if instance_type in disks_by_instance: return disks_by_instance[instance_type] else: - print >> stderr, ("WARNING: Don't know number of disks on instance type %s; assuming 1" - % instance_type) + print("WARNING: Don't know number of disks on instance type %s; assuming 1" + % instance_type, file=stderr) return 1 @@ -874,7 +1061,7 @@ def get_num_disks(instance_type): # # root_dir should be an absolute path to the directory with the files we want to deploy. def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): - active_master = master_nodes[0].public_dns_name + active_master = get_dns_name(master_nodes[0], opts.private_ips) num_disks = get_num_disks(opts.instance_type) hdfs_data_dirs = "/mnt/ephemeral-hdfs/data" @@ -891,17 +1078,27 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): if opts.spark_version.startswith("http"): # Custom pre-built spark package spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) + tachyon_v = "" + print("Deploying Spark via custom bunlde; Tachyon won't be set up") + modules = filter(lambda x: x != "tachyon", modules) elif "." in opts.spark_version: # Pre-built Spark deploy spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) + tachyon_v = get_tachyon_version(spark_v) else: # Spark-only custom deploy spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version) + tachyon_v = "" + print("Deploying Spark via git hash; Tachyon won't be set up") + modules = filter(lambda x: x != "tachyon", modules) + master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes] + slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes] + worker_instances_str = "%d" % opts.worker_instances if opts.worker_instances else "" template_vars = { - "master_list": '\n'.join([i.public_dns_name for i in master_nodes]), + "master_list": '\n'.join(master_addresses), "active_master": active_master, - "slave_list": '\n'.join([i.public_dns_name for i in slave_nodes]), + "slave_list": '\n'.join(slave_addresses), "cluster_url": cluster_url, "hdfs_data_dirs": hdfs_data_dirs, "mapred_local_dirs": mapred_local_dirs, @@ -909,8 +1106,9 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): "swap": str(opts.swap), "modules": '\n'.join(modules), "spark_version": spark_v, + "tachyon_version": tachyon_v, "hadoop_major_version": opts.hadoop_major_version, - "spark_worker_instances": "%d" % opts.worker_instances, + "spark_worker_instances": worker_instances_str, "spark_master_opts": opts.master_opts } @@ -953,6 +1151,23 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): shutil.rmtree(tmp_dir) +# Deploy a given local directory to a cluster, WITHOUT parameter substitution. +# Note that unlike deploy_files, this works for binary files. +# Also, it is up to the user to add (or not) the trailing slash in root_dir. +# Files are only deployed to the first master instance in the cluster. +# +# root_dir should be an absolute path. +def deploy_user_files(root_dir, opts, master_nodes): + active_master = get_dns_name(master_nodes[0], opts.private_ips) + command = [ + 'rsync', '-rv', + '-e', stringify_command(ssh_command(opts)), + "%s" % root_dir, + "%s@%s:/" % (opts.user, active_master) + ] + subprocess.check_call(command) + + def stringify_command(parts): if isinstance(parts, str): return parts @@ -986,13 +1201,13 @@ def ssh(host, opts, command): # If this was an ssh failure, provide the user with hints. if e.returncode == 255: raise UsageError( - "Failed to SSH to remote host {0}.\n" + - "Please check that you have provided the correct --identity-file and " + + "Failed to SSH to remote host {0}.\n" + "Please check that you have provided the correct --identity-file and " "--key-pair parameters and try again.".format(host)) else: raise e - print >> stderr, \ - "Error executing remote command, retrying after 30 seconds: {0}".format(e) + print("Error executing remote command, retrying after 30 seconds: {0}".format(e), + file=stderr) time.sleep(30) tries = tries + 1 @@ -1031,8 +1246,8 @@ def ssh_write(host, opts, command, arguments): elif tries > 5: raise RuntimeError("ssh_write failed with error %s" % proc.returncode) else: - print >> stderr, \ - "Error {0} while executing remote command, retrying after 30 seconds".format(status) + print("Error {0} while executing remote command, retrying after 30 seconds". + format(status), file=stderr) time.sleep(30) tries = tries + 1 @@ -1048,12 +1263,26 @@ def get_zones(conn, opts): # Gets the number of items in a partition def get_partition(total, num_partitions, current_partitions): - num_slaves_this_zone = total / num_partitions + num_slaves_this_zone = total // num_partitions if (total % num_partitions) - current_partitions > 0: num_slaves_this_zone += 1 return num_slaves_this_zone +# Gets the IP address, taking into account the --private-ips flag +def get_ip_address(instance, private_ips=False): + ip = instance.ip_address if not private_ips else \ + instance.private_ip_address + return ip + + +# Gets the DNS name, taking into account the --private-ips flag +def get_dns_name(instance, private_ips=False): + dns = instance.public_dns_name if not private_ips else \ + instance.private_ip_address + return dns + + def real_main(): (opts, action, cluster_name) = parse_args() @@ -1072,28 +1301,28 @@ def real_main(): if opts.identity_file is not None: if not os.path.exists(opts.identity_file): - print >> stderr,\ - "ERROR: The identity file '{f}' doesn't exist.".format(f=opts.identity_file) + print("ERROR: The identity file '{f}' doesn't exist.".format(f=opts.identity_file), + file=stderr) sys.exit(1) file_mode = os.stat(opts.identity_file).st_mode if not (file_mode & S_IRUSR) or not oct(file_mode)[-2:] == '00': - print >> stderr, "ERROR: The identity file must be accessible only by you." - print >> stderr, 'You can fix this with: chmod 400 "{f}"'.format(f=opts.identity_file) + print("ERROR: The identity file must be accessible only by you.", file=stderr) + print('You can fix this with: chmod 400 "{f}"'.format(f=opts.identity_file), + file=stderr) sys.exit(1) if opts.instance_type not in EC2_INSTANCE_TYPES: - print >> stderr, "Warning: Unrecognized EC2 instance type for instance-type: {t}".format( - t=opts.instance_type) + print("Warning: Unrecognized EC2 instance type for instance-type: {t}".format( + t=opts.instance_type), file=stderr) if opts.master_instance_type != "": if opts.master_instance_type not in EC2_INSTANCE_TYPES: - print >> stderr, \ - "Warning: Unrecognized EC2 instance type for master-instance-type: {t}".format( - t=opts.master_instance_type) + print("Warning: Unrecognized EC2 instance type for master-instance-type: {t}".format( + t=opts.master_instance_type), file=stderr) if opts.ebs_vol_num > 8: - print >> stderr, "ebs-vol-num cannot be greater than 8" + print("ebs-vol-num cannot be greater than 8", file=stderr) sys.exit(1) # Prevent breaking ami_prefix (/, .git and startswith checks) @@ -1102,15 +1331,22 @@ def real_main(): opts.spark_ec2_git_repo.endswith(".git") or \ not opts.spark_ec2_git_repo.startswith("https://github.com") or \ not opts.spark_ec2_git_repo.endswith("spark-ec2"): - print >> stderr, "spark-ec2-git-repo must be a github repo and it must not have a " \ - "trailing / or .git. " \ - "Furthermore, we currently only support forks named spark-ec2." + print("spark-ec2-git-repo must be a github repo and it must not have a trailing / or .git. " + "Furthermore, we currently only support forks named spark-ec2.", file=stderr) + sys.exit(1) + + if not (opts.deploy_root_dir is None or + (os.path.isabs(opts.deploy_root_dir) and + os.path.isdir(opts.deploy_root_dir) and + os.path.exists(opts.deploy_root_dir))): + print("--deploy-root-dir must be an absolute path to a directory that exists " + "on the local file system", file=stderr) sys.exit(1) try: conn = ec2.connect_to_region(opts.region) except Exception as e: - print >> stderr, (e) + print((e), file=stderr) sys.exit(1) # Select an AZ at random if it was not specified. @@ -1119,7 +1355,7 @@ def real_main(): if action == "launch": if opts.slaves <= 0: - print >> sys.stderr, "ERROR: You have to start at least 1 slave" + print("ERROR: You have to start at least 1 slave", file=sys.stderr) sys.exit(1) if opts.resume: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) @@ -1134,26 +1370,27 @@ def real_main(): setup_cluster(conn, master_nodes, slave_nodes, opts, True) elif action == "destroy": - print "Are you sure you want to destroy the cluster %s?" % cluster_name - print "The following instances will be terminated:" (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) - for inst in master_nodes + slave_nodes: - print "> %s" % inst.public_dns_name - msg = "ALL DATA ON ALL NODES WILL BE LOST!!\nDestroy cluster %s (y/N): " % cluster_name + if any(master_nodes + slave_nodes): + print("The following instances will be terminated:") + for inst in master_nodes + slave_nodes: + print("> %s" % get_dns_name(inst, opts.private_ips)) + print("ALL DATA ON ALL NODES WILL BE LOST!!") + + msg = "Are you sure you want to destroy the cluster {c}? (y/N) ".format(c=cluster_name) response = raw_input(msg) if response == "y": - print "Terminating master..." + print("Terminating master...") for inst in master_nodes: inst.terminate() - print "Terminating slaves..." + print("Terminating slaves...") for inst in slave_nodes: inst.terminate() # Delete security groups as well if opts.delete_groups: - print "Deleting security groups (this will take some time)..." group_names = [cluster_name + "-master", cluster_name + "-slaves"] wait_for_cluster_state( conn=conn, @@ -1161,15 +1398,16 @@ def real_main(): cluster_instances=(master_nodes + slave_nodes), cluster_state='terminated' ) + print("Deleting security groups (this will take some time)...") attempt = 1 while attempt <= 3: - print "Attempt %d" % attempt + print("Attempt %d" % attempt) groups = [g for g in conn.get_all_security_groups() if g.name in group_names] success = True # Delete individual rules in all groups before deleting groups to # remove dependencies between them for group in groups: - print "Deleting rules in security group " + group.name + print("Deleting rules in security group " + group.name) for rule in group.rules: for grant in rule.grants: success &= group.revoke(ip_protocol=rule.ip_protocol, @@ -1182,11 +1420,12 @@ def real_main(): time.sleep(30) # Yes, it does have to be this long :-( for group in groups: try: - conn.delete_security_group(group.name) - print "Deleted security group " + group.name + # It is needed to use group_id to make it work with VPC + conn.delete_security_group(group_id=group.id) + print("Deleted security group %s" % group.name) except boto.exception.EC2ResponseError: success = False - print "Failed to delete security group " + group.name + print("Failed to delete security group %s" % group.name) # Unfortunately, group.revoke() returns True even if a rule was not # deleted, so this needs to be rerun if something fails @@ -1196,18 +1435,21 @@ def real_main(): attempt += 1 if not success: - print "Failed to delete all security groups after 3 tries." - print "Try re-running in a few minutes." + print("Failed to delete all security groups after 3 tries.") + print("Try re-running in a few minutes.") elif action == "login": (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - master = master_nodes[0].public_dns_name - print "Logging into master " + master + "..." - proxy_opt = [] - if opts.proxy_port is not None: - proxy_opt = ['-D', opts.proxy_port] - subprocess.check_call( - ssh_command(opts) + proxy_opt + ['-t', '-t', "%s@%s" % (opts.user, master)]) + if not master_nodes[0].public_dns_name and not opts.private_ips: + print("Master has no public DNS name. Maybe you meant to specify --private-ips?") + else: + master = get_dns_name(master_nodes[0], opts.private_ips) + print("Logging into master " + master + "...") + proxy_opt = [] + if opts.proxy_port is not None: + proxy_opt = ['-D', opts.proxy_port] + subprocess.check_call( + ssh_command(opts) + proxy_opt + ['-t', '-t', "%s@%s" % (opts.user, master)]) elif action == "reboot-slaves": response = raw_input( @@ -1217,15 +1459,18 @@ def real_main(): if response == "y": (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) - print "Rebooting slaves..." + print("Rebooting slaves...") for inst in slave_nodes: if inst.state not in ["shutting-down", "terminated"]: - print "Rebooting " + inst.id + print("Rebooting " + inst.id) inst.reboot() elif action == "get-master": (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - print master_nodes[0].public_dns_name + if not master_nodes[0].public_dns_name and not opts.private_ips: + print("Master has no public DNS name. Maybe you meant to specify --private-ips?") + else: + print(get_dns_name(master_nodes[0], opts.private_ips)) elif action == "stop": response = raw_input( @@ -1238,11 +1483,11 @@ def real_main(): if response == "y": (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) - print "Stopping master..." + print("Stopping master...") for inst in master_nodes: if inst.state not in ["shutting-down", "terminated"]: inst.stop() - print "Stopping slaves..." + print("Stopping slaves...") for inst in slave_nodes: if inst.state not in ["shutting-down", "terminated"]: if inst.spot_instance_request_id: @@ -1252,11 +1497,11 @@ def real_main(): elif action == "start": (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) - print "Starting slaves..." + print("Starting slaves...") for inst in slave_nodes: if inst.state not in ["shutting-down", "terminated"]: inst.start() - print "Starting master..." + print("Starting master...") for inst in master_nodes: if inst.state not in ["shutting-down", "terminated"]: inst.start() @@ -1266,18 +1511,29 @@ def real_main(): cluster_instances=(master_nodes + slave_nodes), cluster_state='ssh-ready' ) + + # Determine types of running instances + existing_master_type = master_nodes[0].instance_type + existing_slave_type = slave_nodes[0].instance_type + # Setting opts.master_instance_type to the empty string indicates we + # have the same instance type for the master and the slaves + if existing_master_type == existing_slave_type: + existing_master_type = "" + opts.master_instance_type = existing_master_type + opts.instance_type = existing_slave_type + setup_cluster(conn, master_nodes, slave_nodes, opts, False) else: - print >> stderr, "Invalid action: %s" % action + print("Invalid action: %s" % action, file=stderr) sys.exit(1) def main(): try: real_main() - except UsageError, e: - print >> stderr, "\nError:\n", e + except UsageError as e: + print("\nError:\n", e, file=stderr) sys.exit(1) From 83fcbcba62dcec3d50bb768135f8eae888467e49 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 13 Aug 2015 16:16:32 -0300 Subject: [PATCH 18/80] get_spark_ami fix --- tools/spark-ec2/spark_ec2.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 8cc44d30..4fbf5bd8 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -459,21 +459,20 @@ def get_validate_spark_version(version, repo): def get_tachyon_version(spark_version): return SPARK_TACHYON_MAP.get(spark_version, "") - # Attempt to resolve an appropriate AMI given the architecture and region of the request. -def get_spark_ami(opts): - if opts.instance_type in EC2_INSTANCE_TYPES: - instance_type = EC2_INSTANCE_TYPES[opts.instance_type] +def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branch): + if instance_type in EC2_INSTANCE_TYPES: + instance_type = EC2_INSTANCE_TYPES[instance_type] else: instance_type = "pvm" - print("Don't recognize %s, assuming type is pvm" % opts.instance_type, file=stderr) + print("Don't recognize %s, assuming type is pvm" % instance_type, file=stderr) # URL prefix from which to fetch AMI information ami_prefix = "{r}/{b}/ami-list".format( - r=opts.spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), - b=opts.spark_ec2_git_branch) + r=spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), + b=spark_ec2_git_branch) - ami_path = "%s/%s/%s" % (ami_prefix, opts.region, instance_type) + ami_path = "%s/%s/%s" % (ami_prefix, region, instance_type) reader = codecs.getreader("ascii") try: ami = reader(urlopen(ami_path)).read().strip() @@ -484,7 +483,6 @@ def get_spark_ami(opts): print("Spark AMI: " + ami) return ami - # Launch a cluster of the given name, by setting up its security groups, # and then starting new instances in them. # Returns a tuple of EC2 reservation objects for the master and slaves @@ -584,10 +582,10 @@ def launch_cluster(conn, opts, cluster_name): # Figure out Spark AMI if opts.ami is None: - opts.ami = get_spark_ami(opts) + opts.ami = get_spark_ami(opts.instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) if opts.master_ami is None: - opts.master_ami = get_spark_ami(opts.master_instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) + opts.master_ami = get_spark_ami(opts.master_instance_type, opts.region, opts.spark_ec2_git_repo, opts.spark_ec2_git_branch) # we use group ids to work around https://github.com/boto/boto/issues/350 additional_group_ids = [] From 807f0f616973d74a51998caadfc1bc1b17b7a306 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 14 Aug 2015 13:51:39 -0300 Subject: [PATCH 19/80] remove user data, spark-ec2 takes care on formatting disks --- tools/cluster.py | 3 --- tools/scripts/S05mount-disks | 11 ----------- 2 files changed, 14 deletions(-) delete mode 100644 tools/scripts/S05mount-disks diff --git a/tools/cluster.py b/tools/cluster.py index 3cf1828a..6ebd2386 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -51,7 +51,6 @@ default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' default_collect_results_dir = '/tmp' -default_user_data = os.path.join(script_path, 'scripts', 'S05mount-disks') default_defaults_filename = 'cluster_defaults.json' default_spark_ec2_git_repo = 'https://github.com/chaordic/spark-ec2' @@ -202,7 +201,6 @@ def launch(cluster_name, slaves, key_id=default_key_id, region=default_region, zone=default_zone, instance_type=default_instance_type, ondemand=False, spot_price=default_spot_price, - user_data=default_user_data, security_group = None, vpc = None, vpc_subnet = None, @@ -272,7 +270,6 @@ def launch(cluster_name, slaves, '--master-opts', '-Dspark.worker.timeout={0}'.format(worker_timeout), '--spark-git-repo', spark_repo, '-v', spark_version, - '--user-data', user_data, 'launch', cluster_name] + spot_params + resume_param + diff --git a/tools/scripts/S05mount-disks b/tools/scripts/S05mount-disks deleted file mode 100644 index 8f129a30..00000000 --- a/tools/scripts/S05mount-disks +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -echo 'Mounting disks' >> /tmp/mount-disks.log -mkdir -p /mnt -mkdir -p /mnt{2,3,4} -chmod -R 777 /mnt* -[ -r /dev/xvdb ] && mkfs.ext4 /dev/xvdb && mount /dev/xvdb /mnt -[ -r /dev/xvdc ] && mkfs.ext4 /dev/xvdc && mount /dev/xvdc /mnt2 -[ -r /dev/xvdd ] && mkfs.ext4 /dev/xvdd && mount /dev/xvdd /mnt3 -[ -r /dev/xvde ] && mkfs.ext4 /dev/xvde && mount /dev/xvde /mnt4 - From 637ab060de8b564d6b7a6021ef493b84152af350 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 14 Aug 2015 14:06:31 -0300 Subject: [PATCH 20/80] fix variable replacement --- .../spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh index 3570891b..4f3e8da8 100644 --- a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh +++ b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh @@ -25,8 +25,10 @@ export MAPRED_LOCAL_DIRS="{{mapred_local_dirs}}" export SPARK_LOCAL_DIRS="{{spark_local_dirs}}" export MODULES="{{modules}}" export SPARK_VERSION="{{spark_version}}" -export SHARK_VERSION="{{shark_version}}" +export TACHYON_VERSION="{{tachyon_version}}" export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" export SWAP_MB="{{swap}}" export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" export SPARK_MASTER_OPTS="{{spark_master_opts}}" +export AWS_ACCESS_KEY_ID="{{aws_access_key_id}}" +export AWS_SECRET_ACCESS_KEY="{{aws_secret_access_key}}" From f6d5d0dd7cebb0bc32a9c13f04959015f3e36427 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 14 Aug 2015 14:07:32 -0300 Subject: [PATCH 21/80] remove rstudio and some fixes --- tools/spark-ec2/spark_ec2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 4fbf5bd8..f5bbaac1 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -357,15 +357,15 @@ def get_or_make_group(conn, name, vpc_id): return conn.create_security_group(name, "Spark EC2 group", vpc_id) def check_if_http_resource_exists(resource): - request = urllib2.Request(resource) + request = Request(resource) request.get_method = lambda: 'HEAD' try: - response = urllib2.urlopen(request) + response = urlopen(request) if response.getcode() == 200: return True else: raise RuntimeError("Resource {resource} not found. Error: {code}".format(resource, response.getcode())) - except urllib2.HTTPError, e: + except HTTPError, e: print >> stderr, "Unable to check if HTTP resource {url} exists. Error: {code}".format( url=resource, code=e.code) @@ -831,7 +831,7 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar) modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', - 'mapreduce', 'spark-standalone', 'tachyon', 'rstudio'] + 'mapreduce', 'spark-standalone', 'tachyon'] if opts.hadoop_major_version == "1": modules = list(filter(lambda x: x != "mapreduce", modules)) From 7787045de3c1ff132c17f341b3cdecae60ceade0 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 14 Aug 2015 14:49:14 -0300 Subject: [PATCH 22/80] update spark-ec2 version --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 6ebd2386..23b3bed9 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -54,7 +54,7 @@ default_defaults_filename = 'cluster_defaults.json' default_spark_ec2_git_repo = 'https://github.com/chaordic/spark-ec2' -default_spark_ec2_git_branch = 'v4-yarn' +default_spark_ec2_git_branch = 'branch-1.4-merge' master_post_create_commands = [ From ccfed3f661b0bae939dae704204767a3ef899ad1 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 17 Aug 2015 11:08:23 -0300 Subject: [PATCH 23/80] pr review, fix removed feature and added noop user-data --- tools/cluster.py | 3 +++ tools/scripts/noop | 1 + 2 files changed, 4 insertions(+) create mode 100644 tools/scripts/noop diff --git a/tools/cluster.py b/tools/cluster.py index 23b3bed9..d6a0263d 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -51,6 +51,7 @@ default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' default_collect_results_dir = '/tmp' +default_user_data = os.path.join(script_path, 'scripts', 'noop') default_defaults_filename = 'cluster_defaults.json' default_spark_ec2_git_repo = 'https://github.com/chaordic/spark-ec2' @@ -201,6 +202,7 @@ def launch(cluster_name, slaves, key_id=default_key_id, region=default_region, zone=default_zone, instance_type=default_instance_type, ondemand=False, spot_price=default_spot_price, + user_data=default_user_data, security_group = None, vpc = None, vpc_subnet = None, @@ -270,6 +272,7 @@ def launch(cluster_name, slaves, '--master-opts', '-Dspark.worker.timeout={0}'.format(worker_timeout), '--spark-git-repo', spark_repo, '-v', spark_version, + '--user-data', user_data, 'launch', cluster_name] + spot_params + resume_param + diff --git a/tools/scripts/noop b/tools/scripts/noop new file mode 100644 index 00000000..cc1f786e --- /dev/null +++ b/tools/scripts/noop @@ -0,0 +1 @@ +#!/bin/bash \ No newline at end of file From 9bbcd181723dcdb1d275a5a6040c0eda2c540569 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 18 Aug 2015 16:43:51 -0300 Subject: [PATCH 24/80] added heap size param for driver --- remote_hook.sh | 3 ++- tools/cluster.py | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 305a0ff6..65e71070 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -11,6 +11,7 @@ CONTROL_DIR="${5?Please give the Control Directory}" SPARK_MEM_PARAM="${6?Please give the Job Memory Size to use}" USE_YARN="${7?Please tell if we should use YARN (yes/no)}" NOTIFY_ON_ERRORS="${8?Please tell if we will notify on errors (yes/no)}" +DRIVER_HEAP_SIZE="${9?Please tell driver heap size to use in MB}" JOB_WITH_TAG=${JOB_NAME}.${JOB_TAG} JOB_CONTROL_DIR="${CONTROL_DIR}/${JOB_WITH_TAG}" @@ -80,7 +81,7 @@ if [[ "${JOB_NAME}" == "shell" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory 25000M --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}M" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" diff --git a/tools/cluster.py b/tools/cluster.py index d6a0263d..3ac89be8 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -39,6 +39,7 @@ default_spot_price = '0.10' default_worker_instances = '1' default_master_instance_type = 'm3.xlarge' +default_driver_heap_size = '25000' default_region = 'us-east-1' default_zone = default_region + 'b' default_key_id = 'ignition_key' @@ -372,7 +373,9 @@ def job_run(cluster_name, job_name, job_mem, disable_assembly_build=False, run_tests=False, kill_on_failure=False, - destroy_cluster=False, region=default_region): + destroy_cluster=False, + region=default_region, + driver_heap_size=default_driver_heap_size): utc_job_date_example = '2014-05-04T13:13:10Z' if utc_job_date and len(utc_job_date) != len(utc_job_date_example): @@ -394,9 +397,9 @@ def job_run(cluster_name, job_name, job_mem, job_tag = job_tag or job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') tmux_wait_command = ';(echo Press enter to keep the session open && /bin/bash -c "read -t 5" && sleep 7d)' if not detached else '' tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {tmux_wait_command}' >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, tmux_wait_command=tmux_wait_command) + aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, tmux_wait_command=tmux_wait_command) non_tmux_arg = ". /etc/profile; . ~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} >& /tmp/commandoutput".format( - aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param) + aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size) if not disable_assembly_build: From a2d5af977c37bd7e14fa6b304bf17d4ffd25e231 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 18 Aug 2015 17:03:21 -0300 Subject: [PATCH 25/80] parameterized memory unit --- remote_hook.sh | 4 ++-- tools/cluster.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 65e71070..48ba9735 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -11,7 +11,7 @@ CONTROL_DIR="${5?Please give the Control Directory}" SPARK_MEM_PARAM="${6?Please give the Job Memory Size to use}" USE_YARN="${7?Please tell if we should use YARN (yes/no)}" NOTIFY_ON_ERRORS="${8?Please tell if we will notify on errors (yes/no)}" -DRIVER_HEAP_SIZE="${9?Please tell driver heap size to use in MB}" +DRIVER_HEAP_SIZE="${9?Please tell driver heap size to use}" JOB_WITH_TAG=${JOB_NAME}.${JOB_TAG} JOB_CONTROL_DIR="${CONTROL_DIR}/${JOB_WITH_TAG}" @@ -81,7 +81,7 @@ if [[ "${JOB_NAME}" == "shell" ]]; then else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & - sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}M" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" + sudo -E "${SPARK_HOME}/bin/spark-submit" --master "${JOB_MASTER}" --driver-memory "${DRIVER_HEAP_SIZE}" --driver-java-options "-Djava.io.tmpdir=/mnt -verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps" --class "${MAIN_CLASS}" ${JAR_PATH} "${JOB_NAME}" --runner-date "${JOB_DATE}" --runner-tag "${JOB_TAG}" --runner-user "${JOB_USER}" --runner-master "${JOB_MASTER}" --runner-executor-memory "${SPARK_MEM_PARAM}" >& "${JOB_OUTPUT}" || notify_error_and_exit "Execution failed for job ${JOB_WITH_TAG}" fi touch "${JOB_CONTROL_DIR}/SUCCESS" diff --git a/tools/cluster.py b/tools/cluster.py index 3ac89be8..81dc9b2d 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -39,7 +39,7 @@ default_spot_price = '0.10' default_worker_instances = '1' default_master_instance_type = 'm3.xlarge' -default_driver_heap_size = '25000' +default_driver_heap_size = '25G' default_region = 'us-east-1' default_zone = default_region + 'b' default_key_id = 'ignition_key' From f165937d608288be7fd673301256f2f6e122bc2e Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 19 Aug 2015 16:12:21 -0300 Subject: [PATCH 26/80] fix default memory size to match default master instance type --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 81dc9b2d..f796f53c 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -39,7 +39,7 @@ default_spot_price = '0.10' default_worker_instances = '1' default_master_instance_type = 'm3.xlarge' -default_driver_heap_size = '25G' +default_driver_heap_size = '12G' default_region = 'us-east-1' default_zone = default_region + 'b' default_key_id = 'ignition_key' From 980a2784ccabcf435d2df575fcf9c650c820349c Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Wed, 19 Aug 2015 17:39:17 -0300 Subject: [PATCH 27/80] Use the driver heap size param --- tools/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index f796f53c..1f6fdaa5 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -396,9 +396,9 @@ def job_run(cluster_name, job_name, job_mem, job_date = utc_job_date or datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') job_tag = job_tag or job_date.replace(':', '_').replace('-', '_').replace('Z', 'UTC') tmux_wait_command = ';(echo Press enter to keep the session open && /bin/bash -c "read -t 5" && sleep 7d)' if not detached else '' - tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {tmux_wait_command}' >& /tmp/commandoutput".format( + tmux_arg = ". /etc/profile; . ~/.profile;tmux new-session {detached} -s spark.{job_name}.{job_tag} '{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} {tmux_wait_command}' >& /tmp/commandoutput".format( aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, detached='-d' if detached else '', yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size, tmux_wait_command=tmux_wait_command) - non_tmux_arg = ". /etc/profile; . ~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} >& /tmp/commandoutput".format( + non_tmux_arg = ". /etc/profile; . ~/.profile;{aws_vars} {remote_hook} {job_name} {job_date} {job_tag} {job_user} {remote_control_dir} {spark_mem} {yarn_param} {notify_param} {driver_heap_size} >& /tmp/commandoutput".format( aws_vars=get_aws_keys_str(), job_name=job_name, job_date=job_date, job_tag=job_tag, job_user=job_user, remote_control_dir=remote_control_dir, remote_hook=remote_hook, spark_mem=job_mem, yarn_param=yarn_param, notify_param=notify_param, driver_heap_size=driver_heap_size) From c78c319a5e2c900888ddc512f4166ee3b5f553fc Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 26 Aug 2015 10:56:53 -0300 Subject: [PATCH 28/80] Update spark_ec2.py --- tools/spark-ec2/spark_ec2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index f5bbaac1..c81d794b 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -51,7 +51,7 @@ raw_input = input xrange = range -SPARK_EC2_VERSION = "1.4.0" +SPARK_EC2_VERSION = "1.4.1" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -71,6 +71,7 @@ "1.3.0", "1.3.1", "1.4.0", + "1.4.1, ]) SPARK_TACHYON_MAP = { @@ -84,6 +85,7 @@ "1.3.0": "0.5.0", "1.3.1": "0.5.0", "1.4.0": "0.6.4", + "1.4.1": "0.6.4", } DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION From a5379a0a4a54d2e35b83c2cb4c9b4a467b8091d5 Mon Sep 17 00:00:00 2001 From: "Allan Douglas R. de Oliveira" Date: Wed, 26 Aug 2015 10:57:36 -0300 Subject: [PATCH 29/80] Update spark_ec2.py --- tools/spark-ec2/spark_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index c81d794b..5c6458f9 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -71,7 +71,7 @@ "1.3.0", "1.3.1", "1.4.0", - "1.4.1, + "1.4.1", ]) SPARK_TACHYON_MAP = { From 59ba13280fdc49f95bc1e5f3878c8384a5d3d865 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 27 Aug 2015 10:59:16 -0300 Subject: [PATCH 30/80] Use Spark 1.4.1 --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 7eb2bffe..476dd3bb 100644 --- a/build.sbt +++ b/build.sbt @@ -13,7 +13,7 @@ ideaExcludeFolders += ".idea_modules" // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.3.0" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.4.1" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") From 63e867a9de1a11f48a8a72906b38f693cccee52c Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 28 Aug 2015 16:38:44 -0300 Subject: [PATCH 31/80] Increase group to avoid slowdowns --- src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index a1090d20..baf80bc2 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -52,7 +52,7 @@ object SparkContextUtils { if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - val rdds = splittedPaths.grouped(50).map(pathGroup => f(pathGroup.mkString(","))) + val rdds = splittedPaths.grouped(5000).map(pathGroup => f(pathGroup.mkString(","))) new UnionRDD(sc, rdds.toList) } From f12dfdc9d2029941bd293f3aa3ba90c83bbd885a Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 31 Aug 2015 13:52:32 -0300 Subject: [PATCH 32/80] Updated core to ignore spark ec2 boto --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index cfe2c08a..bcf8c0f8 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,6 @@ project/plugins/project/ # Node node_modules + +# Spark-ec2 boto +tools/spark-ec2/lib From cae677fc26ef20ff46f22c098b2cf903db239e5c Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 31 Aug 2015 16:21:58 -0300 Subject: [PATCH 33/80] Make spark 1.4.1 the default --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 1f6fdaa5..e312d842 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -47,7 +47,7 @@ default_ami = None # will be decided based on spark-ec2 list default_master_ami = None default_env = 'dev' -default_spark_version = '1.3.0' +default_spark_version = '1.4.1' default_spark_repo = 'https://github.com/chaordic/spark' default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' From 14324a2f3a7456b6aae993d0b68f02aa9402924a Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 1 Sep 2015 15:44:27 -0300 Subject: [PATCH 34/80] Added IntBag --- .../ignition/core/utils/CollectionUtils.scala | 6 +++ .../scala/ignition/core/utils/IntBag.scala | 42 +++++++++++++++++++ .../core/utils/CollectionUtilsSpec.scala | 2 - .../ignition/core/utils/IntBagSpec.scala | 23 ++++++++++ 4 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 src/main/scala/ignition/core/utils/IntBag.scala create mode 100644 src/test/scala/ignition/core/utils/IntBagSpec.scala diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index 52828ca7..eea4755e 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -22,6 +22,12 @@ object CollectionUtils { } } + + + implicit class TraversableOnceLong(xs: TraversableOnce[Long]) { + def toBag(): IntBag = IntBag.from(xs) + } + implicit class TraversableLikeImprovements[A, Repr](xs: TraversableLike[A, Repr]) { def distinctBy[B, That](f: A => B)(implicit cbf: CanBuildFrom[Repr, A, That]) = { val builder = cbf(xs.repr) diff --git a/src/main/scala/ignition/core/utils/IntBag.scala b/src/main/scala/ignition/core/utils/IntBag.scala new file mode 100644 index 00000000..2a36da6e --- /dev/null +++ b/src/main/scala/ignition/core/utils/IntBag.scala @@ -0,0 +1,42 @@ +package ignition.core.utils + +object IntBag { + def from(numbers: TraversableOnce[Long]): IntBag = { + val histogram = scala.collection.mutable.HashMap.empty[Long, Long] + numbers.foreach(n => histogram += (n -> (histogram.getOrElse(n, 0L) + 1))) + new IntBag(histogram) + } + + val empty = from(Seq.empty) +} + +class IntBag(val histogram: collection.Map[Long, Long]) { + def ++(other: IntBag): IntBag = { + val newHistogram = scala.collection.mutable.HashMap.empty[Long, Long] + (histogram.keySet ++ other.histogram.keySet).foreach(k => newHistogram += (k -> (histogram.getOrElse(k, 0L) + other.histogram.getOrElse(k, 0L)))) + new IntBag(newHistogram) + } + + + def median: Option[Long] = { + if (histogram.nonEmpty) { + val total = histogram.values.sum + val half = total / 2 + val max = histogram.keys.max + + val accumulatedFrequency = (0L to max).scanLeft(0L) { case (sumFreq, k) => sumFreq + histogram.getOrElse(k, 0L) }.zipWithIndex + accumulatedFrequency.collectFirst { case (sum, k) if sum >= half => k } + } else { + None + } + } + + def avg: Option[Long] = { + if (histogram.nonEmpty) { + val sum = histogram.map { case (k, f) => k * f }.sum + val count = histogram.values.sum + Option(sum / count) + } else + None + } +} diff --git a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala index c19579ce..f01b8a34 100644 --- a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala @@ -33,6 +33,4 @@ class CollectionUtilsSpec extends FlatSpec with ShouldMatchers { } - - } diff --git a/src/test/scala/ignition/core/utils/IntBagSpec.scala b/src/test/scala/ignition/core/utils/IntBagSpec.scala new file mode 100644 index 00000000..b6694b12 --- /dev/null +++ b/src/test/scala/ignition/core/utils/IntBagSpec.scala @@ -0,0 +1,23 @@ +package ignition.core.utils + +import org.scalatest._ + +import scala.util.Random + +class IntBagSpec extends FlatSpec with ShouldMatchers { + + "IntBag" should "be built from sequence" in { + IntBag.from(Seq(1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4)).histogram shouldBe Map(1 -> 2, 2 -> 3, 3 -> 1, 4 -> 5) + } + + it should "calculate the median and average" in { + val size = 1000 + val numbers = (0 until 1000).map(_ => Random.nextInt(400).toLong).toList + val bag = IntBag.from(numbers) + + bag.avg.get shouldBe numbers.sum / size + + // TODO: the median is only approximate and it could be better, improve it + } + +} From 3cb2ef5d74282391674d1afea9be30f2eb1a5463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matheus=20Weber=20da=20Concei=C3=A7=C3=A3o?= Date: Tue, 1 Sep 2015 17:17:47 -0300 Subject: [PATCH 35/80] Adds an option to launch the cluster master as spot --- tools/cluster.py | 5 +- tools/spark-ec2/spark_ec2.py | 95 +++++++++++++++++++++++++++++------- 2 files changed, 82 insertions(+), 18 deletions(-) diff --git a/tools/cluster.py b/tools/cluster.py index e312d842..ed348fbb 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -202,7 +202,7 @@ def launch(cluster_name, slaves, tag=[], key_id=default_key_id, region=default_region, zone=default_zone, instance_type=default_instance_type, - ondemand=False, spot_price=default_spot_price, + ondemand=False, spot_price=default_spot_price, master_spot=False, user_data=default_user_data, security_group = None, vpc = None, @@ -252,6 +252,8 @@ def launch(cluster_name, slaves, ]) spot_params = ['--spot-price', spot_price] if not ondemand else [] + master_spot_params = ['--master-spot'] if not ondemand and master_spot else [] + ami_params = ['--ami', ami] if ami else [] master_ami_params = ['--master-ami', master_ami] if master_ami else [] @@ -276,6 +278,7 @@ def launch(cluster_name, slaves, '--user-data', user_data, 'launch', cluster_name] + spot_params + + master_spot_params + resume_param + auth_params + ami_params + diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 5c6458f9..3583bf1d 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -260,6 +260,10 @@ def parse_args(): "--spot-price", metavar="PRICE", type="float", help="If specified, launch slaves as spot instances with the given " + "maximum price (in dollars)") + parser.add_option( + "--master-spot", action="store_true", default=False, + help="If specified, launch master as spot instance using the same " + + "bid and instance type of the slave ones") parser.add_option( "--ganglia", action="store_true", default=True, help="Setup Ganglia monitoring on cluster (default: %default). NOTE: " + @@ -729,26 +733,83 @@ def launch_cluster(conn, opts, cluster_name): master_nodes = existing_masters else: master_type = opts.master_instance_type - if master_type == "": + if master_type == "" or opts.master_spot: master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name - master_res = master_image.run( - key_name=opts.key_pair, - security_group_ids=[master_group.id] + additional_group_ids, - instance_type=master_type, - placement=opts.zone, - min_count=1, - max_count=1, - block_device_map=block_map, - subnet_id=opts.subnet_id, - placement_group=opts.placement_group, - user_data=user_data_content, - instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, - instance_profile_name=opts.instance_profile_name) - - master_nodes = master_res.instances - print("Launched master in %s, regid = %s" % (zone, master_res.id)) + if opts.master_spot: + # Launch spot master instance with the requested price + # Note: The spot_price*1.5 is present to ensure a higher bid price to + # the master spot instance, so the master instance will be the + # last one to be terminated in a spot market price increase + print("Requesting master as spot instance with price $%.3f" % + (opts.spot_price)) + master_req = conn.request_spot_instances( + price=(opts.spot_price * 1.5), + image_id=opts.master_ami, + placement=opts.zone, + count=1, + key_name=opts.key_pair, + security_group_ids=[master_group.id] + additional_group_ids, + instance_type=master_type, + block_device_map=block_map, + subnet_id=opts.subnet_id, + placement_group=opts.placement_group, + user_data=user_data_content, + instance_profile_name=opts.instance_profile_name) + my_master_req_id = [req.id for req in master_req] + + start_time = datetime.now() + print("Waiting for master spot instance to be granted... Request ID: %s " % my_master_req_id) + try: + while True: + time.sleep(10) + reqs = conn.get_all_spot_instance_requests(my_master_req_id) + active_instance_ids = filter(lambda req: req.state == "active", reqs) + invalid_states = ["capacity-not-available", "capacity-oversubscribed", "price-too-low"] + invalid = filter(lambda req: req.status.code in invalid_states, reqs) + if len(invalid) > 0: + raise Exception("Invalid state for spot request: %s - status: %s" % + (invalid[0].id, invalid[0].status.message)) + if len(active_instance_ids) == 1: + print("Master spot instance granted") + master_res = conn.get_all_reservations([r.instance_id for r in active_instance_ids]) + master_nodes = master_res[0].instances + break + else: + print("Master spot instance not granted yet, waiting longer") + + if (datetime.now() - start_time).seconds > opts.spot_timeout * 60: + raise Exception("Timed out while waiting for master spot instance") + except: + print("Error: %s" % sys.exc_info()[1]) + print("Canceling master spot instance requests") + conn.cancel_spot_instance_requests(my_master_req_id) + # Log a warning if any of these requests actually launched instances: + (master_nodes, slave_nodes) = get_existing_cluster( + conn, opts, cluster_name, die_on_error=False) + running = len(master_nodes) + len(slave_nodes) + if running: + print(("WARNING: %d instances are still running" % running), file=stderr) + sys.exit(0) + else: + # Launch ondemand instance + master_res = master_image.run( + key_name=opts.key_pair, + security_group_ids=[master_group.id] + additional_group_ids, + instance_type=master_type, + placement=opts.zone, + min_count=1, + max_count=1, + block_device_map=block_map, + subnet_id=opts.subnet_id, + placement_group=opts.placement_group, + user_data=user_data_content, + instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, + instance_profile_name=opts.instance_profile_name) + + master_nodes = master_res.instances + print("Launched master in %s, regid = %s" % (zone, master_res.id)) # This wait time corresponds to SPARK-4983 print("Waiting for AWS to propagate instance metadata...") From 38213b49e13492536ddafe6fe70408552014c52b Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Wed, 2 Sep 2015 16:30:17 -0300 Subject: [PATCH 36/80] Fix serialization --- src/main/scala/ignition/core/utils/IntBag.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/IntBag.scala b/src/main/scala/ignition/core/utils/IntBag.scala index 2a36da6e..a322f6f7 100644 --- a/src/main/scala/ignition/core/utils/IntBag.scala +++ b/src/main/scala/ignition/core/utils/IntBag.scala @@ -4,13 +4,13 @@ object IntBag { def from(numbers: TraversableOnce[Long]): IntBag = { val histogram = scala.collection.mutable.HashMap.empty[Long, Long] numbers.foreach(n => histogram += (n -> (histogram.getOrElse(n, 0L) + 1))) - new IntBag(histogram) + IntBag(histogram) } val empty = from(Seq.empty) } -class IntBag(val histogram: collection.Map[Long, Long]) { +case class IntBag(histogram: collection.Map[Long, Long]) { def ++(other: IntBag): IntBag = { val newHistogram = scala.collection.mutable.HashMap.empty[Long, Long] (histogram.keySet ++ other.histogram.keySet).foreach(k => newHistogram += (k -> (histogram.getOrElse(k, 0L) + other.histogram.getOrElse(k, 0L)))) From d668f40fd93f93a9409d97781e34ce9e1d0d8ea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Guilherme=20Fernandes=20Pereira?= Date: Fri, 4 Sep 2015 15:16:30 -0300 Subject: [PATCH 37/80] Date between helper --- src/main/scala/ignition/core/utils/DateUtils.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/scala/ignition/core/utils/DateUtils.scala b/src/main/scala/ignition/core/utils/DateUtils.scala index c3fb5163..8ebf3b13 100644 --- a/src/main/scala/ignition/core/utils/DateUtils.scala +++ b/src/main/scala/ignition/core/utils/DateUtils.scala @@ -20,6 +20,9 @@ object DateUtils { def isEqualOrBefore(other: DateTime) = dateTime.isBefore(other) || dateTime.saneEqual(other) + + def isBetween(start: DateTime, end: DateTime) = + dateTime.isAfter(start) && dateTime.isEqualOrBefore(end) } implicit class SecondsImprovements(val seconds: Seconds) { From b2a602556c699f19da79b4ec6bf442a0a777862a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matheus=20Weber=20da=20Concei=C3=A7=C3=A3o?= Date: Tue, 8 Sep 2015 15:29:16 -0300 Subject: [PATCH 38/80] Adds a TODO! --- tools/spark-ec2/spark_ec2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 3583bf1d..52c21c3f 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -759,6 +759,7 @@ def launch_cluster(conn, opts, cluster_name): instance_profile_name=opts.instance_profile_name) my_master_req_id = [req.id for req in master_req] + # TODO: refactor duplicated spot waiting code start_time = datetime.now() print("Waiting for master spot instance to be granted... Request ID: %s " % my_master_req_id) try: From 08ae1dd35b7540218c2744c259b5d4c9ee6ae9cf Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 9 Sep 2015 16:01:17 -0300 Subject: [PATCH 39/80] some kind of hack to parallel read and list files using spark cluster slaves --- .../core/jobs/utils/SparkContextUtils.scala | 112 +++++++++++++++++- 1 file changed, 107 insertions(+), 5 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index baf80bc2..f421b614 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,21 +1,26 @@ package ignition.core.jobs.utils -import java.util.Date - import ignition.core.utils.ByteUtils +import ignition.core.utils.CollectionUtils._ +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.LongWritable -import org.apache.spark.SparkContext +import org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.spark.{Partitioner, SparkContext} import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} import org.apache.spark.rdd.{UnionRDD, RDD} -import org.joda.time.{DateTimeZone, DateTime} +import org.joda.time.DateTime import ignition.core.utils.DateUtils._ +import scala.collection.mutable.ArrayBuffer +import scala.io.Source import scala.reflect.ClassTag import scala.util.Try - object SparkContextUtils { + case class Bucket(var size: Long, paths: ArrayBuffer[String]) + case class S3File(path: String, isDir: Boolean, size: Long) + implicit class SparkContextImprovements(sc: SparkContext) { private def getFileSystem(path: Path): FileSystem = { @@ -194,5 +199,102 @@ object SparkContextUtils { else objectHadoopFile(paths, minimumPaths) } + + def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long): RDD[String] = { + val s3Paths = parallelListFiles(paths) + val buckets = buildBuckets(s3Paths, maxBytesPerPartition) + val files = buckets.flatMap(_.paths) + + val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") + val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") + + val partitionedFiles = sc.parallelize(files).map(file => file -> ()).partitionBy(createPartitioner(buckets)) + + partitionedFiles.mapPartitions { files => + val conf = new Configuration() + conf.set("fs.s3n.awsAccessKeyId", s3Key) + conf.set("fs.s3n.awsSecretAccessKey", s3Secret) + val codecFactory = new CompressionCodecFactory(conf) + files.map { case (path, _) => path } flatMap { s3Path => + val fileSystem = FileSystem.get(new java.net.URI(s3Path), conf) + val path = new Path(s3Path) + val inputStream = Option(codecFactory.getCodec(path)) match { + case Some(compression) => compression.createInputStream(fileSystem.open(path)) + case None => fileSystem.open(path) + } + Source.fromInputStream(inputStream).getLines() + } + } + } + + private def createPartitioner(buckets: Seq[Bucket]): Partitioner = { + val size = buckets.size + val partitions: Map[Any, Int] = buckets.zipWithIndex.flatMap { case (bucket, index) => bucket.paths.map(path => path -> index) }.toMap + new Partitioner { + override def numPartitions: Int = size + override def getPartition(key: Any): Int = partitions(key) + } + } + + private def buildBuckets(files: Seq[S3File], maxBytesPerPartition: Long): Seq[Bucket] = { + val buckets = ArrayBuffer.empty[Bucket] + files.distinctBy(_.path).foreach { file => + val size = file.size + val bucket = buckets.find(bucket => bucket.size + size < maxBytesPerPartition) match { + case Some(bucketFound) => bucketFound + case None => + val newBucket = Bucket(0, ArrayBuffer.empty) + buckets += newBucket + newBucket + } + bucket.size += size + bucket.paths += file.path + } + buckets + } + + def parallelListFiles(paths: Seq[String]): Seq[S3File] = { + val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") + val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") + + val remainingDirectories = new scala.collection.mutable.ArrayBuffer[S3File] + remainingDirectories ++= paths.map(S3File(_, isDir = true, 0)) + val allFiles = new scala.collection.mutable.ArrayBuffer[S3File] + + while (remainingDirectories.nonEmpty) { + val newDirs = sc.parallelize(remainingDirectories.map(_.path)) + val currentBatch = newDirs.flatMap { path => + val conf = new Configuration() + conf.set("fs.s3n.awsAccessKeyId", s3Key) + conf.set("fs.s3n.awsSecretAccessKey", s3Secret) + val fileSystem = FileSystem.get(new java.net.URI(path), conf) + try { + val hadoopPath = new Path(path) + if (fileSystem.isDirectory(hadoopPath)) { + val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } else if (fileSystem.isFile(hadoopPath)) { + val status = fileSystem.getFileStatus(hadoopPath) + Seq(S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } else { // Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } + } catch { + case e: java.io.FileNotFoundException => + println(s"File $path not found.") + e.printStackTrace() + Nil + } + }.collect() + val (dirs, files) = currentBatch.partition(_.isDir) + remainingDirectories.clear() + remainingDirectories ++= dirs + allFiles ++= files + } + + allFiles + } + } } From c56c0273b7b875329ff300d93af484f2beaf045f Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 14 Sep 2015 17:31:59 -0300 Subject: [PATCH 40/80] some of pr reivews --- .../core/jobs/utils/SparkContextUtils.scala | 122 +++++++++--------- 1 file changed, 62 insertions(+), 60 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index f421b614..03801857 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -18,7 +18,7 @@ import scala.util.Try object SparkContextUtils { - case class Bucket(var size: Long, paths: ArrayBuffer[String]) + case class S3FilePartition(var size: Long, paths: ArrayBuffer[String]) case class S3File(path: String, isDir: Boolean, size: Long) implicit class SparkContextImprovements(sc: SparkContext) { @@ -201,14 +201,13 @@ object SparkContextUtils { } def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long): RDD[String] = { - val s3Paths = parallelListFiles(paths) - val buckets = buildBuckets(s3Paths, maxBytesPerPartition) - val files = buckets.flatMap(_.paths) + val foundFiles = parallelListFiles(paths) + val files = foundFiles.map(_.path) val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") - val partitionedFiles = sc.parallelize(files).map(file => file -> ()).partitionBy(createPartitioner(buckets)) + val partitionedFiles = sc.parallelize(files).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition)) partitionedFiles.mapPartitions { files => val conf = new Configuration() @@ -222,78 +221,81 @@ object SparkContextUtils { case Some(compression) => compression.createInputStream(fileSystem.open(path)) case None => fileSystem.open(path) } - Source.fromInputStream(inputStream).getLines() + try { + Source.fromInputStream(inputStream).getLines().toList + } finally { + Try { inputStream.close() } + } } } } - private def createPartitioner(buckets: Seq[Bucket]): Partitioner = { - val size = buckets.size - val partitions: Map[Any, Int] = buckets.zipWithIndex.flatMap { case (bucket, index) => bucket.paths.map(path => path -> index) }.toMap - new Partitioner { - override def numPartitions: Int = size - override def getPartition(key: Any): Int = partitions(key) - } - } - - private def buildBuckets(files: Seq[S3File], maxBytesPerPartition: Long): Seq[Bucket] = { - val buckets = ArrayBuffer.empty[Bucket] + private def createPartitioner(files: Seq[S3File], maxBytesPerPartition: Long): Partitioner = { + val partitions = ArrayBuffer.empty[S3FilePartition] files.distinctBy(_.path).foreach { file => val size = file.size - val bucket = buckets.find(bucket => bucket.size + size < maxBytesPerPartition) match { - case Some(bucketFound) => bucketFound + val partition = partitions.find(bucket => bucket.size + size < maxBytesPerPartition) match { + case Some(partitionFound) => partitionFound case None => - val newBucket = Bucket(0, ArrayBuffer.empty) - buckets += newBucket - newBucket + val newPartition = S3FilePartition(0, ArrayBuffer.empty) + partitions += newPartition + newPartition } - bucket.size += size - bucket.paths += file.path + partition.size += size + partition.paths += file.path + } + + val indexedPartitions: Map[Any, Int] = partitions.zipWithIndex.flatMap { + case (bucket, index) => bucket.paths.map(path => path -> index) + }.toMap + + new Partitioner { + override def numPartitions: Int = partitions.size + override def getPartition(key: Any): Int = indexedPartitions(key) } - buckets } def parallelListFiles(paths: Seq[String]): Seq[S3File] = { val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") - val remainingDirectories = new scala.collection.mutable.ArrayBuffer[S3File] - remainingDirectories ++= paths.map(S3File(_, isDir = true, 0)) - val allFiles = new scala.collection.mutable.ArrayBuffer[S3File] - - while (remainingDirectories.nonEmpty) { - val newDirs = sc.parallelize(remainingDirectories.map(_.path)) - val currentBatch = newDirs.flatMap { path => - val conf = new Configuration() - conf.set("fs.s3n.awsAccessKeyId", s3Key) - conf.set("fs.s3n.awsSecretAccessKey", s3Secret) - val fileSystem = FileSystem.get(new java.net.URI(path), conf) - try { - val hadoopPath = new Path(path) - if (fileSystem.isDirectory(hadoopPath)) { - val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } else if (fileSystem.isFile(hadoopPath)) { - val status = fileSystem.getFileStatus(hadoopPath) - Seq(S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } else { // Maybe is glob or not found - val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + val directories = paths.map(S3File(_, isDir = true, 0)) + + def innerListFiles(remainingDirectories: Seq[S3File]): Seq[S3File] = { + if (remainingDirectories.isEmpty) { + Nil + } else { + val newDirs = sc.parallelize(remainingDirectories.map(_.path)) + val currentBatch = newDirs.flatMap { path => + val conf = new Configuration() + conf.set("fs.s3n.awsAccessKeyId", s3Key) + conf.set("fs.s3n.awsSecretAccessKey", s3Secret) + val fileSystem = FileSystem.get(new java.net.URI(path), conf) + try { + val hadoopPath = new Path(path) + if (fileSystem.isDirectory(hadoopPath)) { + val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } else if (fileSystem.isFile(hadoopPath)) { + val status = fileSystem.getFileStatus(hadoopPath) + Seq(S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } else { + // Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) + } + } catch { + case e: java.io.FileNotFoundException => + println(s"File $path not found.") + Nil } - } catch { - case e: java.io.FileNotFoundException => - println(s"File $path not found.") - e.printStackTrace() - Nil - } - }.collect() - val (dirs, files) = currentBatch.partition(_.isDir) - remainingDirectories.clear() - remainingDirectories ++= dirs - allFiles ++= files - } + }.collect() - allFiles + val (dirs, files) = currentBatch.partition(_.isDir) + files ++ innerListFiles(dirs) + } + } + innerListFiles(directories) } } From 8ffee27d402a33bffd54fad2b11d4b092709d9f4 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 15 Sep 2015 14:26:20 -0300 Subject: [PATCH 41/80] pr review --- .../core/jobs/utils/SparkContextUtils.scala | 107 ++++++++---------- 1 file changed, 49 insertions(+), 58 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 03801857..3fdfea09 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,16 +1,17 @@ package ignition.core.jobs.utils import ignition.core.utils.ByteUtils -import ignition.core.utils.CollectionUtils._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.spark.broadcast.Broadcast import org.apache.spark.{Partitioner, SparkContext} import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} import org.apache.spark.rdd.{UnionRDD, RDD} import org.joda.time.DateTime import ignition.core.utils.DateUtils._ +import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer import scala.io.Source import scala.reflect.ClassTag @@ -18,8 +19,9 @@ import scala.util.Try object SparkContextUtils { - case class S3FilePartition(var size: Long, paths: ArrayBuffer[String]) - case class S3File(path: String, isDir: Boolean, size: Long) + case class HadoopFile(path: String, isDir: Boolean, size: Long) + + private case class HadoopFilePartition(size: Long, paths: Seq[String]) implicit class SparkContextImprovements(sc: SparkContext) { @@ -201,18 +203,13 @@ object SparkContextUtils { } def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long): RDD[String] = { - val foundFiles = parallelListFiles(paths) - val files = foundFiles.map(_.path) - - val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") - val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") + val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) - val partitionedFiles = sc.parallelize(files).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition)) + val foundFiles = parallelListFiles(paths) + val partitionedFiles = sc.parallelize(foundFiles.map(_.path)).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition)) partitionedFiles.mapPartitions { files => - val conf = new Configuration() - conf.set("fs.s3n.awsAccessKeyId", s3Key) - conf.set("fs.s3n.awsSecretAccessKey", s3Secret) + val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) files.map { case (path, _) => path } flatMap { s3Path => val fileSystem = FileSystem.get(new java.net.URI(s3Path), conf) @@ -222,7 +219,7 @@ object SparkContextUtils { case None => fileSystem.open(path) } try { - Source.fromInputStream(inputStream).getLines().toList + Source.fromInputStream(inputStream).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } finally { Try { inputStream.close() } } @@ -230,19 +227,15 @@ object SparkContextUtils { } } - private def createPartitioner(files: Seq[S3File], maxBytesPerPartition: Long): Partitioner = { - val partitions = ArrayBuffer.empty[S3FilePartition] - files.distinctBy(_.path).foreach { file => - val size = file.size - val partition = partitions.find(bucket => bucket.size + size < maxBytesPerPartition) match { - case Some(partitionFound) => partitionFound - case None => - val newPartition = S3FilePartition(0, ArrayBuffer.empty) - partitions += newPartition - newPartition - } - partition.size += size - partition.paths += file.path + private def createPartitioner(files: Seq[HadoopFile], maxBytesPerPartition: Long): Partitioner = { + val partitions = files.foldLeft(Seq.empty[HadoopFilePartition]) { + case (acc, file) => + acc.find(bucket => bucket.size + file.size < maxBytesPerPartition) match { + case Some(found) => + val updated = found.copy(size = found.size + file.size, paths = file.path +: found.paths) + acc.updated(acc.indexOf(found), updated) + case None => acc :+ HadoopFilePartition(file.size, Seq(file.path)) + } } val indexedPartitions: Map[Any, Int] = partitions.zipWithIndex.flatMap { @@ -255,43 +248,41 @@ object SparkContextUtils { } } - def parallelListFiles(paths: Seq[String]): Seq[S3File] = { - val s3Key = sc.hadoopConfiguration.get("fs.s3n.awsAccessKeyId") - val s3Secret = sc.hadoopConfiguration.get("fs.s3n.awsSecretAccessKey") + private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): Seq[HadoopFile] = { + paths.flatMap { path => + val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } + val fileSystem = FileSystem.get(new java.net.URI(path), conf) + try { + val hadoopPath = new Path(path) + if (fileSystem.isDirectory(hadoopPath)) { + val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + } else if (fileSystem.isFile(hadoopPath)) { + val status = fileSystem.getFileStatus(hadoopPath) + Seq(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + } else { + // Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + } + } catch { + case e: java.io.FileNotFoundException => + println(s"File $path not found.") + Nil + } + }.collect().toSeq + } - val directories = paths.map(S3File(_, isDir = true, 0)) + def parallelListFiles(paths: Seq[String]): Seq[HadoopFile] = { + val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + val directories = paths.map(HadoopFile(_, isDir = true, 0)) - def innerListFiles(remainingDirectories: Seq[S3File]): Seq[S3File] = { + def innerListFiles(remainingDirectories: Seq[HadoopFile]): Seq[HadoopFile] = { if (remainingDirectories.isEmpty) { Nil } else { - val newDirs = sc.parallelize(remainingDirectories.map(_.path)) - val currentBatch = newDirs.flatMap { path => - val conf = new Configuration() - conf.set("fs.s3n.awsAccessKeyId", s3Key) - conf.set("fs.s3n.awsSecretAccessKey", s3Secret) - val fileSystem = FileSystem.get(new java.net.URI(path), conf) - try { - val hadoopPath = new Path(path) - if (fileSystem.isDirectory(hadoopPath)) { - val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } else if (fileSystem.isFile(hadoopPath)) { - val status = fileSystem.getFileStatus(hadoopPath) - Seq(S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } else { - // Maybe is glob or not found - val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => S3File(status.getPath.toString, status.isDirectory, status.getLen)) - } - } catch { - case e: java.io.FileNotFoundException => - println(s"File $path not found.") - Nil - } - }.collect() - - val (dirs, files) = currentBatch.partition(_.isDir) + val pathsRDD = sc.parallelize(remainingDirectories.map(_.path)) + val (dirs, files) = executeListOnWorkers(hadoopConf, pathsRDD).partition(_.isDir) files ++ innerListFiles(dirs) } } From 7234254729377453b7750166765554cb3eb22951 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 16 Sep 2015 10:14:09 -0300 Subject: [PATCH 42/80] logging input stream close failure --- .../core/jobs/utils/SparkContextUtils.scala | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 3fdfea09..2259d622 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -16,6 +16,7 @@ import scala.collection.mutable.ArrayBuffer import scala.io.Source import scala.reflect.ClassTag import scala.util.Try +import scala.util.control.NonFatal object SparkContextUtils { @@ -211,17 +212,23 @@ object SparkContextUtils { partitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) - files.map { case (path, _) => path } flatMap { s3Path => - val fileSystem = FileSystem.get(new java.net.URI(s3Path), conf) - val path = new Path(s3Path) - val inputStream = Option(codecFactory.getCodec(path)) match { - case Some(compression) => compression.createInputStream(fileSystem.open(path)) - case None => fileSystem.open(path) + files.map { case (path, _) => path } flatMap { path => + val fileSystem = FileSystem.get(new java.net.URI(path), conf) + val hadoopPath = new Path(path) + val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { + case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) + case None => fileSystem.open(hadoopPath) } try { Source.fromInputStream(inputStream).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } finally { - Try { inputStream.close() } + try { + inputStream.close() + } catch { + case NonFatal(ex) => + println(s"Fail to close resource from '$path'") + ex.printStackTrace() + } } } } From af00eefa0975159ae760bbfec4444638b9862293 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 16 Sep 2015 10:34:33 -0300 Subject: [PATCH 43/80] better exception report --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 2259d622..0d7f0742 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -226,8 +226,7 @@ object SparkContextUtils { inputStream.close() } catch { case NonFatal(ex) => - println(s"Fail to close resource from '$path'") - ex.printStackTrace() + println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") } } } From 89630eb7f990c36ae749cc8312e6bc199b8b15e7 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 17 Sep 2015 13:53:36 -0300 Subject: [PATCH 44/80] setting UTF-8 codec to read file content (same behavior of hadoop client) --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 0d7f0742..ab20c83b 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -13,7 +13,7 @@ import ignition.core.utils.DateUtils._ import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer -import scala.io.Source +import scala.io.{Codec, Source} import scala.reflect.ClassTag import scala.util.Try import scala.util.control.NonFatal @@ -220,7 +220,7 @@ object SparkContextUtils { case None => fileSystem.open(hadoopPath) } try { - Source.fromInputStream(inputStream).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) + Source.fromInputStream(inputStream)(Codec.UTF8).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } finally { try { inputStream.close() From 06ac774d980a47c05670760bd9c3d8725aabc45f Mon Sep 17 00:00:00 2001 From: Fernando Rodrigues Date: Mon, 21 Sep 2015 19:31:45 -0300 Subject: [PATCH 45/80] will delete SG's after cluster destroy --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index ed348fbb..3f4065e3 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -313,7 +313,7 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {} after failures'.format(cluster_name)) -def destroy(cluster_name, delete_groups=False, region=default_region): +def destroy(cluster_name, delete_groups=True, region=default_region): delete_sg_param = ['--delete-groups'] if delete_groups else [] ec2_script_path = chdir_to_ec2_script_and_get_path() From 10b086e745f9d304df3e77af1215c34cf0c5b59c Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 15 Oct 2015 14:41:58 -0300 Subject: [PATCH 46/80] spark 1.5.1 update --- build.sbt | 2 +- tools/cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 476dd3bb..acdef9cb 100644 --- a/build.sbt +++ b/build.sbt @@ -13,7 +13,7 @@ ideaExcludeFolders += ".idea_modules" // Because we can't run two spark contexts on same VM parallelExecution in Test := false -libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.4.1" % "provided") +libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided") .exclude("org.apache.hadoop", "hadoop-client") .exclude("org.slf4j", "slf4j-log4j12") diff --git a/tools/cluster.py b/tools/cluster.py index 3f4065e3..cd972951 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -47,7 +47,7 @@ default_ami = None # will be decided based on spark-ec2 list default_master_ami = None default_env = 'dev' -default_spark_version = '1.4.1' +default_spark_version = '1.5.1' default_spark_repo = 'https://github.com/chaordic/spark' default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' From a59f2eb92416915304edad9fd5f72f5c048e78a5 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 15 Oct 2015 15:34:38 -0300 Subject: [PATCH 47/80] fix spark_ec2.py --- tools/spark-ec2/spark_ec2.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 52c21c3f..89ade820 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -51,7 +51,7 @@ raw_input = input xrange = range -SPARK_EC2_VERSION = "1.4.1" +SPARK_EC2_VERSION = "1.5.1" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -72,6 +72,8 @@ "1.3.1", "1.4.0", "1.4.1", + "1.5.0", + "1.5.1", ]) SPARK_TACHYON_MAP = { @@ -86,6 +88,8 @@ "1.3.1": "0.5.0", "1.4.0": "0.6.4", "1.4.1": "0.6.4", + "1.5.0": "0.7.1", + "1.5.1": "0.7.1", } DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION From b176cc51a513ac6c1d8155ce98e6ea345f6d9abd Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Thu, 15 Oct 2015 17:03:58 -0300 Subject: [PATCH 48/80] Added executor instances option --- tools/cluster.py | 6 +++++- .../deploy.generic/root/spark-ec2/ec2-variables.sh | 1 + tools/spark-ec2/spark_ec2.py | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index cd972951..0af46ebe 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -38,6 +38,7 @@ default_instance_type = 'r3.xlarge' default_spot_price = '0.10' default_worker_instances = '1' +default_executor_instances = '1' default_master_instance_type = 'm3.xlarge' default_driver_heap_size = '12G' default_region = 'us-east-1' @@ -209,7 +210,9 @@ def launch(cluster_name, slaves, vpc_subnet = None, master_instance_type=default_master_instance_type, wait_time='180', hadoop_major_version='2', - worker_instances=default_worker_instances, retries_on_same_cluster=5, + worker_instances=default_worker_instances, + executor_instances=default_executor_instances, + retries_on_same_cluster=5, max_clusters_to_create=5, minimum_percentage_healthy_slaves=0.9, remote_user=default_remote_user, @@ -272,6 +275,7 @@ def launch(cluster_name, slaves, '--spark-ec2-git-repo', spark_ec2_git_repo, '--spark-ec2-git-branch', spark_ec2_git_branch, '--worker-instances', worker_instances, + '--executor-instances', executor_instances, '--master-opts', '-Dspark.worker.timeout={0}'.format(worker_timeout), '--spark-git-repo', spark_repo, '-v', spark_version, diff --git a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh index 4f3e8da8..bd3b656f 100644 --- a/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh +++ b/tools/spark-ec2/deploy.generic/root/spark-ec2/ec2-variables.sh @@ -29,6 +29,7 @@ export TACHYON_VERSION="{{tachyon_version}}" export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" export SWAP_MB="{{swap}}" export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" +export SPARK_EXECUTOR_INSTANCES="{{spark_executor_instances}}" export SPARK_MASTER_OPTS="{{spark_master_opts}}" export AWS_ACCESS_KEY_ID="{{aws_access_key_id}}" export AWS_SECRET_ACCESS_KEY="{{aws_secret_access_key}}" diff --git a/tools/spark-ec2/spark_ec2.py b/tools/spark-ec2/spark_ec2.py index 89ade820..e9442448 100755 --- a/tools/spark-ec2/spark_ec2.py +++ b/tools/spark-ec2/spark_ec2.py @@ -288,6 +288,10 @@ def parse_args(): "--worker-instances", type="int", default=1, help="Number of instances per worker: variable SPARK_WORKER_INSTANCES. Not used if YARN " + "is used as Hadoop major version (default: %default)") + parser.add_option( + "--executor-instances", type="int", default=1, + help="Number of executor instances per worker: variable SPARK_EXECUTOR_INSTANCES. Not used if YARN " + + "is used as Hadoop major version (default: %default)") parser.add_option( "--master-opts", type="string", default="", help="Extra options to give to master through SPARK_MASTER_OPTS variable " + @@ -1161,6 +1165,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes] slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes] worker_instances_str = "%d" % opts.worker_instances if opts.worker_instances else "" + executor_instances_str = "%d" % opts.executor_instances if opts.executor_instances else "" template_vars = { "master_list": '\n'.join(master_addresses), "active_master": active_master, @@ -1175,6 +1180,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): "tachyon_version": tachyon_v, "hadoop_major_version": opts.hadoop_major_version, "spark_worker_instances": worker_instances_str, + "spark_executor_instances": executor_instances_str, "spark_master_opts": opts.master_opts } From 437e2644463157d5df0328df63de8cb8d68bdef7 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 27 Oct 2015 16:50:59 -0200 Subject: [PATCH 49/80] Adding filterAndGetParallelTextFiles --- .../core/jobs/utils/SparkContextUtils.scala | 42 +++++++++++++++++-- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index ab20c83b..00cbb347 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -36,7 +36,7 @@ object SparkContextUtils { for { path <- paths status <- Option(fs.globStatus(path)).getOrElse(Array.empty).toSeq - if status.isDirectory || !removeEmpty || status.getLen > 0 // remove empty files if necessary + if !removeEmpty || status.getLen > 0 || status.isDirectory // remove empty files if necessary } yield status } @@ -69,6 +69,14 @@ object SparkContextUtils { processPaths((p) => sc.textFile(p), paths, minimumPaths) } + private def processParallelTextFiles(paths: Seq[String], minimumPaths: Int, maxBytesPerPartition: Long): RDD[String] = { + val splittedPaths = paths.flatMap(ignition.core.utils.HadoopUtils.getPathStrings) + if (splittedPaths.size < minimumPaths) + throw new Exception(s"Not enough paths found for $paths") + + parallelTextFiles(splittedPaths, maxBytesPerPartition) + } + private def filterPaths(paths: Seq[String], requireSuccess: Boolean, inclusiveStartDate: Boolean, @@ -145,6 +153,14 @@ object SparkContextUtils { processTextFiles(paths, minimumPaths) } + def getParallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long, synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { + if (synchLocally) + processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition) + else + processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition) + } + + @deprecated("It may incur in heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") def filterAndGetTextFiles(path: String, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, @@ -162,6 +178,24 @@ object SparkContextUtils { getTextFiles(paths, synchLocally, forceSynch, minimumPaths) } + def filterAndGetParallelTextFiles(path: String, + maxBytesPerPartition: Long = 64 * 1000 * 1000, + requireSuccess: Boolean = false, + inclusiveStartDate: Boolean = true, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = true, + endDate: Option[DateTime] = None, + lastN: Option[Int] = None, + synchLocally: Boolean = false, + forceSynch: Boolean = false, + ignoreMalformedDates: Boolean = false, + minimumPaths: Int = 1)(implicit dateExtractor: PathDateExtractor): RDD[String] = { + val paths = getFilteredPaths(Seq(path), requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) + if (paths.size < minimumPaths) + throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") + getParallelTextFiles(paths, maxBytesPerPartition, synchLocally, forceSynch, minimumPaths) + } + private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { processPaths((p) => sc.sequenceFile(p, classOf[LongWritable], classOf[org.apache.hadoop.io.BytesWritable]) .map({ case (k, v) => Try { ByteUtils.toString(v.getBytes, 0, v.getLength, "UTF-8") } }), paths, minimumPaths) @@ -260,11 +294,11 @@ object SparkContextUtils { val fileSystem = FileSystem.get(new java.net.URI(path), conf) try { val hadoopPath = new Path(path) - if (fileSystem.isDirectory(hadoopPath)) { + val status = fileSystem.getFileStatus(hadoopPath) + if (status.isDirectory) { val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) - } else if (fileSystem.isFile(hadoopPath)) { - val status = fileSystem.getFileStatus(hadoopPath) + } else if (status.isFile) { Seq(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) } else { // Maybe is glob or not found From 637b80d9995e646b5260605c049a76487a054d03 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 27 Oct 2015 19:57:00 -0200 Subject: [PATCH 50/80] Many improvements --- .../core/jobs/utils/SparkContextUtils.scala | 45 ++++++++++++------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 00cbb347..490cda2c 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -13,6 +13,7 @@ import ignition.core.utils.DateUtils._ import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable import scala.io.{Codec, Source} import scala.reflect.ClassTag import scala.util.Try @@ -69,12 +70,12 @@ object SparkContextUtils { processPaths((p) => sc.textFile(p), paths, minimumPaths) } - private def processParallelTextFiles(paths: Seq[String], minimumPaths: Int, maxBytesPerPartition: Long): RDD[String] = { + private def processParallelTextFiles(paths: Seq[String], minimumPaths: Int, maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { val splittedPaths = paths.flatMap(ignition.core.utils.HadoopUtils.getPathStrings) if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelTextFiles(splittedPaths, maxBytesPerPartition) + parallelTextFiles(splittedPaths, maxBytesPerPartition, minPartitions) } private def filterPaths(paths: Seq[String], @@ -146,6 +147,7 @@ object SparkContextUtils { } + @deprecated("It may incur heavy S3 costs and/or be slow with small files, use getParallelTextFiles instead", "2015-10-27") def getTextFiles(paths: Seq[String], synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { if (synchLocally) processTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths) @@ -153,14 +155,17 @@ object SparkContextUtils { processTextFiles(paths, minimumPaths) } - def getParallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long, synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { + def getParallelTextFiles(paths: Seq[String], + maxBytesPerPartition: Long = 64 * 1000 * 1000, + minPartitions: Int = 500, + synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { if (synchLocally) - processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition) + processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, minPartitions) else - processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition) + processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition, minPartitions) } - @deprecated("It may incur in heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") + @deprecated("It may incur heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") def filterAndGetTextFiles(path: String, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, @@ -180,6 +185,7 @@ object SparkContextUtils { def filterAndGetParallelTextFiles(path: String, maxBytesPerPartition: Long = 64 * 1000 * 1000, + minPartitions: Int = 500, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, startDate: Option[DateTime] = None, @@ -193,7 +199,7 @@ object SparkContextUtils { val paths = getFilteredPaths(Seq(path), requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) if (paths.size < minimumPaths) throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") - getParallelTextFiles(paths, maxBytesPerPartition, synchLocally, forceSynch, minimumPaths) + getParallelTextFiles(paths, maxBytesPerPartition, minPartitions, synchLocally, forceSynch, minimumPaths) } private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { @@ -237,17 +243,18 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long): RDD[String] = { + def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { + require(paths.nonEmpty, "At least one path is required") val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) val foundFiles = parallelListFiles(paths) - val partitionedFiles = sc.parallelize(foundFiles.map(_.path)).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition)) + val partitionedFiles = sc.parallelize(foundFiles.map(_.path)).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition, minPartitions)) partitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) + val fileSystem = FileSystem.get(new java.net.URI(paths.head), conf) files.map { case (path, _) => path } flatMap { path => - val fileSystem = FileSystem.get(new java.net.URI(path), conf) val hadoopPath = new Path(path) val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) @@ -267,16 +274,22 @@ object SparkContextUtils { } } - private def createPartitioner(files: Seq[HadoopFile], maxBytesPerPartition: Long): Partitioner = { - val partitions = files.foldLeft(Seq.empty[HadoopFilePartition]) { + private def createPartitioner(files: Seq[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long): Partitioner = { + implicit val ordering: Ordering[HadoopFilePartition] = Ordering.by(p => -p.size) // Small partitions come first (highest priority) + + val pq: mutable.PriorityQueue[HadoopFilePartition] = mutable.PriorityQueue.empty + + (0L until minPartitions).foreach(_ => pq += HadoopFilePartition(0, Seq.empty)) + + val partitions = files.foldLeft(pq) { case (acc, file) => - acc.find(bucket => bucket.size + file.size < maxBytesPerPartition) match { + acc.headOption.filter(bucket => bucket.size + file.size < maxBytesPerPartition) match { case Some(found) => val updated = found.copy(size = found.size + file.size, paths = file.path +: found.paths) - acc.updated(acc.indexOf(found), updated) - case None => acc :+ HadoopFilePartition(file.size, Seq(file.path)) + acc.tail += updated + case None => acc += HadoopFilePartition(file.size, Seq(file.path)) } - } + }.filter(_.size > 0).toList // Remove empty partitions val indexedPartitions: Map[Any, Int] = partitions.zipWithIndex.flatMap { case (bucket, index) => bucket.paths.map(path => path -> index) From 0563fab5fa7bbf900436a0bb95b303f62a22bf12 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Wed, 28 Oct 2015 15:53:45 -0200 Subject: [PATCH 51/80] Small improvements --- .../ignition/core/jobs/utils/RDDUtils.scala | 4 ++ .../core/jobs/utils/SparkContextUtils.scala | 37 ++++++++++--------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index 7e75d5ec..57069bae 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -80,6 +80,10 @@ object RDDUtils { }, preservesPartitioning = true) } + def collectValues[U: ClassTag](f: PartialFunction[V, U]): RDD[(K, U)] = { + rdd.filter { case (k, v) => f.isDefinedAt(v) }.mapValues(f) + } + def groupByKeyAndTake(n: Int): RDD[(K, List[V])] = rdd.aggregateByKey(List.empty[V])( (lst, v) => diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 490cda2c..8e4ec35c 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -75,7 +75,7 @@ object SparkContextUtils { if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelTextFiles(splittedPaths, maxBytesPerPartition, minPartitions) + parallelTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions) } private def filterPaths(paths: Seq[String], @@ -243,7 +243,7 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - def parallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { + def parallelTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { require(paths.nonEmpty, "At least one path is required") val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) @@ -274,7 +274,7 @@ object SparkContextUtils { } } - private def createPartitioner(files: Seq[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long): Partitioner = { + private def createPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long): Partitioner = { implicit val ordering: Ordering[HadoopFilePartition] = Ordering.by(p => -p.size) // Small partitions come first (highest priority) val pq: mutable.PriorityQueue[HadoopFilePartition] = mutable.PriorityQueue.empty @@ -289,7 +289,7 @@ object SparkContextUtils { acc.tail += updated case None => acc += HadoopFilePartition(file.size, Seq(file.path)) } - }.filter(_.size > 0).toList // Remove empty partitions + }.filter(_.paths.nonEmpty).toList // Remove empty partitions val indexedPartitions: Map[Any, Int] = partitions.zipWithIndex.flatMap { case (bucket, index) => bucket.paths.map(path => path -> index) @@ -301,36 +301,39 @@ object SparkContextUtils { } } - private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): Seq[HadoopFile] = { + private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): List[HadoopFile] = { paths.flatMap { path => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val fileSystem = FileSystem.get(new java.net.URI(path), conf) - try { - val hadoopPath = new Path(path) + val hadoopPath = new Path(path) + val tryFind = try { val status = fileSystem.getFileStatus(hadoopPath) if (status.isDirectory) { val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + Option(sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList) } else if (status.isFile) { - Seq(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + Option(List(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen))) } else { - // Maybe is glob or not found - val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)) + None } } catch { case e: java.io.FileNotFoundException => - println(s"File $path not found.") - Nil + None + } + + tryFind.getOrElse { + // Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList } - }.collect().toSeq + }.collect().toList } - def parallelListFiles(paths: Seq[String]): Seq[HadoopFile] = { + def parallelListFiles(paths: List[String]): List[HadoopFile] = { val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) val directories = paths.map(HadoopFile(_, isDir = true, 0)) - def innerListFiles(remainingDirectories: Seq[HadoopFile]): Seq[HadoopFile] = { + def innerListFiles(remainingDirectories: List[HadoopFile]): List[HadoopFile] = { if (remainingDirectories.isEmpty) { Nil } else { From cc4f716f7485e6ca314318006fb3fb943937e3e1 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Wed, 28 Oct 2015 17:29:44 -0200 Subject: [PATCH 52/80] Fix file system issues in corner cases --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 8e4ec35c..6e4d0bc8 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -253,9 +253,9 @@ object SparkContextUtils { partitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) - val fileSystem = FileSystem.get(new java.net.URI(paths.head), conf) files.map { case (path, _) => path } flatMap { path => val hadoopPath = new Path(path) + val fileSystem = hadoopPath.getFileSystem(conf) val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) case None => fileSystem.open(hadoopPath) @@ -304,8 +304,8 @@ object SparkContextUtils { private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): List[HadoopFile] = { paths.flatMap { path => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } - val fileSystem = FileSystem.get(new java.net.URI(path), conf) val hadoopPath = new Path(path) + val fileSystem = hadoopPath.getFileSystem(conf) val tryFind = try { val status = fileSystem.getFileStatus(hadoopPath) if (status.isDirectory) { From 5a4916489e43d7d3e67818e83969a17c25e6aecb Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 3 Nov 2015 09:09:23 -0200 Subject: [PATCH 53/80] Make it faster in some situations --- .../core/jobs/utils/SparkContextUtils.scala | 90 +++++++++++++++---- 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 6e4d0bc8..842ced37 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -27,6 +27,8 @@ object SparkContextUtils { implicit class SparkContextImprovements(sc: SparkContext) { + lazy val _hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + private def getFileSystem(path: Path): FileSystem = { path.getFileSystem(sc.hadoopConfiguration) } @@ -70,12 +72,16 @@ object SparkContextUtils { processPaths((p) => sc.textFile(p), paths, minimumPaths) } - private def processParallelTextFiles(paths: Seq[String], minimumPaths: Int, maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { + private def processParallelTextFiles(paths: Seq[String], + minimumPaths: Int, + maxBytesPerPartition: Long, + minPartitions: Int, + listOnWorkers: Boolean): RDD[String] = { val splittedPaths = paths.flatMap(ignition.core.utils.HadoopUtils.getPathStrings) if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions) + parallelTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) } private def filterPaths(paths: Seq[String], @@ -158,11 +164,11 @@ object SparkContextUtils { def getParallelTextFiles(paths: Seq[String], maxBytesPerPartition: Long = 64 * 1000 * 1000, minPartitions: Int = 500, - synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1): RDD[String] = { + synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1, listOnWorkers: Boolean = false): RDD[String] = { if (synchLocally) - processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, minPartitions) + processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) else - processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition, minPartitions) + processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) } @deprecated("It may incur heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") @@ -195,11 +201,12 @@ object SparkContextUtils { synchLocally: Boolean = false, forceSynch: Boolean = false, ignoreMalformedDates: Boolean = false, - minimumPaths: Int = 1)(implicit dateExtractor: PathDateExtractor): RDD[String] = { + minimumPaths: Int = 1, + listOnWorkers: Boolean = false)(implicit dateExtractor: PathDateExtractor): RDD[String] = { val paths = getFilteredPaths(Seq(path), requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) if (paths.size < minimumPaths) throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") - getParallelTextFiles(paths, maxBytesPerPartition, minPartitions, synchLocally, forceSynch, minimumPaths) + getParallelTextFiles(paths, maxBytesPerPartition, minPartitions, synchLocally, forceSynch, minimumPaths, listOnWorkers) } private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { @@ -243,13 +250,12 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - def parallelTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int): RDD[String] = { - require(paths.nonEmpty, "At least one path is required") - val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + def parallelTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int, listOnWorkers: Boolean): RDD[String] = { - val foundFiles = parallelListFiles(paths) - val partitionedFiles = sc.parallelize(foundFiles.map(_.path)).map(file => file -> ()).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition, minPartitions)) + val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) + val partitionedFiles = sc.parallelize(foundFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition, minPartitions)) + val hadoopConf = _hadoopConf partitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) @@ -262,6 +268,10 @@ object SparkContextUtils { } try { Source.fromInputStream(inputStream)(Codec.UTF8).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) + } catch { + case NonFatal(ex) => + println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") + throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") } finally { try { inputStream.close() @@ -301,7 +311,9 @@ object SparkContextUtils { } } - private def executeListOnWorkers(hadoopConf: Broadcast[Map[String, String]], paths: RDD[String]): List[HadoopFile] = { + + private def executeListOnWorkers(paths: RDD[String]): List[HadoopFile] = { + val hadoopConf = _hadoopConf paths.flatMap { path => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val hadoopPath = new Path(path) @@ -329,16 +341,62 @@ object SparkContextUtils { }.collect().toList } + def parallelListFiles(paths: List[String]): List[HadoopFile] = { - val hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) + + val directories = paths.map(HadoopFile(_, isDir = true, 0)) + + def innerListFiles(remainingDirectories: List[HadoopFile]): List[HadoopFile] = { + if (remainingDirectories.isEmpty) { + Nil + } else { + val remainingPaths = remainingDirectories.map(_.path) + val pathsRDD = sc.parallelize(remainingPaths, remainingPaths.size / 2) + val (dirs, files) = executeListOnWorkers(pathsRDD).partition(_.isDir) + files ++ innerListFiles(dirs) + } + } + innerListFiles(directories) + } + + + private def executeDriverList(paths: Seq[String]): List[HadoopFile] = { + val conf = _hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } + paths.flatMap { path => + val hadoopPath = new Path(path) + val fileSystem = hadoopPath.getFileSystem(conf) + val tryFind = try { + val status = fileSystem.getFileStatus(hadoopPath) + if (status.isDirectory) { + val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) + Option(sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList) + } else if (status.isFile) { + Option(List(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen))) + } else { + None + } + } catch { + case e: java.io.FileNotFoundException => + None + } + + tryFind.getOrElse { + // Maybe is glob or not found + val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) + sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList + } + }.toList + } + + def driverListFiles(paths: List[String]): List[HadoopFile] = { + val directories = paths.map(HadoopFile(_, isDir = true, 0)) def innerListFiles(remainingDirectories: List[HadoopFile]): List[HadoopFile] = { if (remainingDirectories.isEmpty) { Nil } else { - val pathsRDD = sc.parallelize(remainingDirectories.map(_.path)) - val (dirs, files) = executeListOnWorkers(hadoopConf, pathsRDD).partition(_.isDir) + val (dirs, files) = executeDriverList(remainingDirectories.map(_.path)).partition(_.isDir) files ++ innerListFiles(dirs) } } From 506bd1c72affb05c4ebfc001440cfe178e1d30ba Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 9 Nov 2015 13:47:19 -0200 Subject: [PATCH 54/80] Split gzip files and other improvements --- build.sbt | 2 + .../ignition/core/jobs/CoreJobRunner.scala | 2 + .../core/jobs/utils/SparkContextUtils.scala | 93 +++++++++++++++---- tools/cluster.py | 5 + 4 files changed, 85 insertions(+), 17 deletions(-) diff --git a/build.sbt b/build.sbt index acdef9cb..528d30cf 100644 --- a/build.sbt +++ b/build.sbt @@ -19,6 +19,8 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") +libraryDependencies += "nl.basjes.hadoop" % "splittablegzip" % "1.2" + libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.2.4" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index aa4dcc76..ec5d9039 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -67,6 +67,8 @@ object CoreJobRunner { val sparkConf = new SparkConf() sparkConf.set("spark.executor.memory", config.executorMemory) + sparkConf.set("spark.hadoop.io.compression.codecs", + "org.apache.hadoop.io.compress.DefaultCodec,nl.basjes.hadoop.io.compress.SplittableGzipCodec,org.apache.hadoop.io.compress.BZip2Codec") sparkConf.setMaster(config.master) sparkConf.setAppName(appName) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 842ced37..78b6ec9b 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -2,8 +2,9 @@ package ignition.core.jobs.utils import ignition.core.utils.ByteUtils import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.io.LongWritable +import org.apache.hadoop.io.{Text, LongWritable} import org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.broadcast.Broadcast import org.apache.spark.{Partitioner, SparkContext} import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} @@ -21,6 +22,10 @@ import scala.util.control.NonFatal object SparkContextUtils { + case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { + override def getPartition(key: Any): Int = index(key) + } + case class HadoopFile(path: String, isDir: Boolean, size: Long) private case class HadoopFilePartition(size: Long, paths: Seq[String]) @@ -162,8 +167,8 @@ object SparkContextUtils { } def getParallelTextFiles(paths: Seq[String], - maxBytesPerPartition: Long = 64 * 1000 * 1000, - minPartitions: Int = 500, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1, listOnWorkers: Boolean = false): RDD[String] = { if (synchLocally) processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) @@ -190,8 +195,8 @@ object SparkContextUtils { } def filterAndGetParallelTextFiles(path: String, - maxBytesPerPartition: Long = 64 * 1000 * 1000, - minPartitions: Int = 500, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, requireSuccess: Boolean = false, inclusiveStartDate: Boolean = true, startDate: Option[DateTime] = None, @@ -250,13 +255,27 @@ object SparkContextUtils { objectHadoopFile(paths, minimumPaths) } - def parallelTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int, listOnWorkers: Boolean): RDD[String] = { + case class SizeBasedFileHandling(averageEstimatedCompressionRatio: Int = 8, + compressedExtensions: Set[String] = Set(".gz")) { + + def isBig(f: HadoopFile, uncompressedBigSize: Long): Boolean = estimatedSize(f) >= uncompressedBigSize + + def estimatedSize(f: HadoopFile) = if (isCompressed(f)) + f.size * averageEstimatedCompressionRatio + else + f.size + + def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) + } - val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) - val partitionedFiles = sc.parallelize(foundFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createPartitioner(foundFiles, maxBytesPerPartition, minPartitions)) + def readSmallFiles(smallFiles: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf - partitionedFiles.mapPartitions { files => + smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val codecFactory = new CompressionCodecFactory(conf) files.map { case (path, _) => path } flatMap { path => @@ -284,7 +303,48 @@ object SparkContextUtils { } } - private def createPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long): Partitioner = { + def readBigFiles(bigFiles: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { + def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq("mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) + .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } + + def read(file: HadoopFile, conf: Configuration) = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](conf = conf, fClass = classOf[TextInputFormat], + kClass = classOf[LongWritable], vClass = classOf[Text], path = file.path).map(pair => pair._2.toString) + + val confCompressed = confWith(maxBytesPerPartition / sizeBasedFileHandling.averageEstimatedCompressionRatio) + val confUncompressed = confWith(maxBytesPerPartition) + + val union = new UnionRDD(sc, bigFiles.map { file => + val conf = if (sizeBasedFileHandling.isCompressed(file)) + confCompressed + else + confUncompressed + read(file, conf) + }) + + if (union.partitions.size < minPartitions) + union.coalesce(minPartitions) + else + union + } + + def parallelTextFiles(paths: List[String], + maxBytesPerPartition: Long, + minPartitions: Int, + listOnWorkers: Boolean, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + + val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) + val (bigFiles, smallFiles) = foundFiles.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) + + sc.union( + readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), + readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + } + + private def createPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long, sizeBasedFileHandling: SizeBasedFileHandling): Partitioner = { implicit val ordering: Ordering[HadoopFilePartition] = Ordering.by(p => -p.size) // Small partitions come first (highest priority) val pq: mutable.PriorityQueue[HadoopFilePartition] = mutable.PriorityQueue.empty @@ -293,11 +353,13 @@ object SparkContextUtils { val partitions = files.foldLeft(pq) { case (acc, file) => - acc.headOption.filter(bucket => bucket.size + file.size < maxBytesPerPartition) match { + val fileSize = sizeBasedFileHandling.estimatedSize(file) + + acc.headOption.filter(bucket => bucket.size + fileSize < maxBytesPerPartition) match { case Some(found) => - val updated = found.copy(size = found.size + file.size, paths = file.path +: found.paths) + val updated = found.copy(size = found.size + fileSize, paths = file.path +: found.paths) acc.tail += updated - case None => acc += HadoopFilePartition(file.size, Seq(file.path)) + case None => acc += HadoopFilePartition(fileSize, Seq(file.path)) } }.filter(_.paths.nonEmpty).toList // Remove empty partitions @@ -305,10 +367,7 @@ object SparkContextUtils { case (bucket, index) => bucket.paths.map(path => path -> index) }.toMap - new Partitioner { - override def numPartitions: Int = partitions.size - override def getPartition(key: Any): Int = indexedPartitions(key) - } + IndexedPartitioner(partitions.size, indexedPartitions) } diff --git a/tools/cluster.py b/tools/cluster.py index 0af46ebe..7daf9617 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -49,6 +49,9 @@ default_master_ami = None default_env = 'dev' default_spark_version = '1.5.1' +custom_builds = { + '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' +} default_spark_repo = 'https://github.com/chaordic/spark' default_remote_user = 'ec2-user' default_remote_control_dir = '/tmp/Ignition' @@ -260,6 +263,8 @@ def launch(cluster_name, slaves, ami_params = ['--ami', ami] if ami else [] master_ami_params = ['--master-ami', master_ami] if master_ami else [] + spark_version = custom_builds.get(spark_version, spark_version) + for i in range(retries_on_same_cluster): log.info('Running script, try %d of %d', i + 1, retries_on_same_cluster) try: From dc12d2a696e5cf847360097f86b33588b8b4cf84 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 10 Nov 2015 17:06:14 -0200 Subject: [PATCH 55/80] Use SplittableGzipCodec only for big files --- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 2 -- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index ec5d9039..aa4dcc76 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -67,8 +67,6 @@ object CoreJobRunner { val sparkConf = new SparkConf() sparkConf.set("spark.executor.memory", config.executorMemory) - sparkConf.set("spark.hadoop.io.compression.codecs", - "org.apache.hadoop.io.compress.DefaultCodec,nl.basjes.hadoop.io.compress.SplittableGzipCodec,org.apache.hadoop.io.compress.BZip2Codec") sparkConf.setMaster(config.master) sparkConf.setAppName(appName) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 78b6ec9b..d18d5f76 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -307,7 +307,9 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq("mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) + def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq( + "io.compression.codecs" -> "org.apache.hadoop.io.compress.DefaultCodec,nl.basjes.hadoop.io.compress.SplittableGzipCodec,org.apache.hadoop.io.compress.BZip2Codec", + "mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } def read(file: HadoopFile, conf: Configuration) = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](conf = conf, fClass = classOf[TextInputFormat], @@ -317,10 +319,12 @@ object SparkContextUtils { val confUncompressed = confWith(maxBytesPerPartition) val union = new UnionRDD(sc, bigFiles.map { file => + val conf = if (sizeBasedFileHandling.isCompressed(file)) confCompressed else confUncompressed + read(file, conf) }) From b52eceea89e95ba3416ba1f8c13d77249129e9d6 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Wed, 11 Nov 2015 16:39:09 -0200 Subject: [PATCH 56/80] Dont use build with updated hadoop client --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 7daf9617..2fe6b245 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -50,7 +50,7 @@ default_env = 'dev' default_spark_version = '1.5.1' custom_builds = { - '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' +# '1.5.1': 'https://s3.amazonaws.com/chaordic-ignition-public/spark-1.5.1-bin-cdh4.7.1.tgz' } default_spark_repo = 'https://github.com/chaordic/spark' default_remote_user = 'ec2-user' From f1075e8d5acd6bc9b03d8974c32adf3d146e2d9b Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 17 Nov 2015 17:27:00 -0200 Subject: [PATCH 57/80] s3 list --- build.sbt | 2 + .../core/jobs/utils/SparkContextUtils.scala | 49 +++++++--- .../scala/ignition/core/utils/S3Utils.scala | 91 +++++++++++++++++++ 3 files changed, 131 insertions(+), 11 deletions(-) create mode 100644 src/main/scala/ignition/core/utils/S3Utils.scala diff --git a/build.sbt b/build.sbt index 528d30cf..7231704b 100644 --- a/build.sbt +++ b/build.sbt @@ -35,6 +35,8 @@ libraryDependencies += "joda-time" % "joda-time" % "2.7" libraryDependencies += "org.joda" % "joda-convert" % "1.7" +libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.9.6" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index d18d5f76..96f2341d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,16 +1,18 @@ package ignition.core.jobs.utils +import com.amazonaws.services.s3.AmazonS3Client +import com.amazonaws.services.s3.model.{S3ObjectSummary, S3Object} import ignition.core.utils.ByteUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.{Text, LongWritable} import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.hadoop.mapreduce.lib.input.TextInputFormat -import org.apache.spark.broadcast.Broadcast import org.apache.spark.{Partitioner, SparkContext} import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} import org.apache.spark.rdd.{UnionRDD, RDD} import org.joda.time.DateTime import ignition.core.utils.DateUtils._ +import ignition.core.utils.S3Utils._ import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer @@ -26,6 +28,11 @@ object SparkContextUtils { override def getPartition(key: Any): Int = index(key) } + implicit class S3ObjectSummaryExtensions(s3Object: S3ObjectSummary) { + def toHadoopFile: HadoopFile = + HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) + } + case class HadoopFile(path: String, isDir: Boolean, size: Long) private case class HadoopFilePartition(size: Long, paths: Seq[String]) @@ -86,7 +93,7 @@ object SparkContextUtils { if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) + parallelListEndReadTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) } private def filterPaths(paths: Seq[String], @@ -257,14 +264,14 @@ object SparkContextUtils { case class SizeBasedFileHandling(averageEstimatedCompressionRatio: Int = 8, compressedExtensions: Set[String] = Set(".gz")) { - + def isBig(f: HadoopFile, uncompressedBigSize: Long): Boolean = estimatedSize(f) >= uncompressedBigSize - + def estimatedSize(f: HadoopFile) = if (isCompressed(f)) f.size * averageEstimatedCompressionRatio else f.size - + def isCompressed(f: HadoopFile): Boolean = compressedExtensions.exists(f.path.endsWith) } @@ -334,15 +341,21 @@ object SparkContextUtils { union } - def parallelTextFiles(paths: List[String], - maxBytesPerPartition: Long, - minPartitions: Int, - listOnWorkers: Boolean, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + def parallelListEndReadTextFiles(paths: List[String], + maxBytesPerPartition: Long, + minPartitions: Int, + listOnWorkers: Boolean, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) - val (bigFiles, smallFiles) = foundFiles.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) + parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) + } + def parallelReadTextFiles(files: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + val (bigFiles, smallFiles) = files.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) sc.union( readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) @@ -466,5 +479,19 @@ object SparkContextUtils { innerListFiles(directories) } + def s3FilterAndGetParallelTextFiles(bucket: String, + prefix: String, + startDate: Option[DateTime] = None, + endDate: Option[DateTime] = None, + endsWith: Option[String] = None, + predicate: S3ObjectSummary => Boolean = _ => true, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) + (implicit s3Client: AmazonS3Client, dateExtractor: PathDateExtractor): RDD[String] = { + val foundFiles = s3ListAndFilterFiles(bucket, prefix, startDate, endDate, predicate = predicate)(s3Client, dateExtractor).map(_.toHadoopFile) + parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) + } + } } diff --git a/src/main/scala/ignition/core/utils/S3Utils.scala b/src/main/scala/ignition/core/utils/S3Utils.scala new file mode 100644 index 00000000..28866c4c --- /dev/null +++ b/src/main/scala/ignition/core/utils/S3Utils.scala @@ -0,0 +1,91 @@ +package ignition.core.utils + +import com.amazonaws.auth.EnvironmentVariableCredentialsProvider +import com.amazonaws.services.s3.AmazonS3Client +import com.amazonaws.services.s3.model.{S3ObjectSummary, ObjectListing} +import ignition.core.jobs.utils.PathDateExtractor +import ignition.core.utils.DateUtils._ +import org.joda.time.DateTime + +import scala.collection.JavaConversions._ +import scala.collection.mutable +import scala.util.Try + +object S3Utils { + + implicit lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) + + def s3List(bucket: String, prefix: String, predicate: S3ObjectSummary => Boolean = _ => true) + (implicit s3: AmazonS3Client): List[S3ObjectSummary] = { + def inner(acc: mutable.ArrayBuffer[S3ObjectSummary], listing: ObjectListing): List[S3ObjectSummary] = { + acc ++= listing.getObjectSummaries.toList.filter(predicate) + if (listing.isTruncated) + inner(acc, s3.listNextBatchOfObjects(listing)) + else + acc.toList + } + + inner(new mutable.ArrayBuffer[S3ObjectSummary], s3.listObjects(bucket, prefix)) + } + + def s3ListAndFilterFiles(bucket: String, + prefix: String, + start: Option[DateTime] = None, + end: Option[DateTime] = None, + endsWith: Option[String] = None, + exclusionPattern: Option[String] = Option("_$folder$"), + predicate: S3ObjectSummary => Boolean = _ => true) + (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): List[S3ObjectSummary] = { + + def excludePatternValidation(s3Object: S3ObjectSummary, exclusionPatternOption: Option[String]): Option[S3ObjectSummary] = + exclusionPatternOption match { + case Some(pattern) if s3Object.getKey.contains(pattern) => None + case Some(_) | None => Option(s3Object) + } + + def endsWithValidation(s3Object: S3ObjectSummary, endsWithOption: Option[String]): Option[S3ObjectSummary] = + endsWithOption match { + case Some(pattern) if s3Object.getKey.endsWith(pattern) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def extractDateFromKey(s3Object: S3ObjectSummary): Option[DateTime] = + Try(pathDateExtractor.extractFromPath(s"s3://$bucket/${s3Object.getKey}")).toOption + + def startValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, startOption: Option[DateTime]): Option[S3ObjectSummary] = + startOption match { + case Some(startDate) if startDate.isEqualOrBefore(extractedDate) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def endValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, endOption: Option[DateTime]): Option[S3ObjectSummary] = + endOption match { + case Some(endDate) if endDate.isEqualOrAfter(extractedDate) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def applyPredicate(s3Object: S3ObjectSummary): Option[S3ObjectSummary] = + if (predicate(s3Object)) + Option(s3Object) + else + None + + val allValidations: S3ObjectSummary => Boolean = s3Object => { + val validatedS3Object = for { + withValidPattern <- excludePatternValidation(s3Object, exclusionPattern) + withValidEndsWith <- endsWithValidation(withValidPattern, endsWith) + extractedDate <- extractDateFromKey(withValidEndsWith) + withValidStart <- startValidation(withValidEndsWith, extractedDate, start) + withValidEnd <- endValidation(withValidStart, extractedDate, end) + valid <- applyPredicate(withValidEnd) + } yield valid + validatedS3Object.isDefined + } + + s3List(bucket, prefix, allValidations)(s3) + } + +} From 909136626ecf4daf300a66893087d8c06609c7e1 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 20 Nov 2015 18:26:42 -0200 Subject: [PATCH 58/80] Split compressed big files --- .../core/jobs/utils/SparkContextUtils.scala | 92 ++++++++++++++----- .../core/utils/AutoCloseableIterator.scala | 67 ++++++++++++++ 2 files changed, 138 insertions(+), 21 deletions(-) create mode 100644 src/main/scala/ignition/core/utils/AutoCloseableIterator.scala diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index d18d5f76..dec5ca13 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,20 +1,21 @@ package ignition.core.jobs.utils -import ignition.core.utils.ByteUtils +import java.io.InputStream + +import ignition.core.utils.DateUtils._ +import ignition.core.utils.{AutoCloseableIterator, ByteUtils} import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.io.{Text, LongWritable} +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapreduce.lib.input.TextInputFormat -import org.apache.spark.broadcast.Broadcast +import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.{Partitioner, SparkContext} -import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} -import org.apache.spark.rdd.{UnionRDD, RDD} import org.joda.time.DateTime -import ignition.core.utils.DateUtils._ import scala.collection.JavaConversions._ -import scala.collection.mutable.ArrayBuffer import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import scala.io.{Codec, Source} import scala.reflect.ClassTag import scala.util.Try @@ -22,6 +23,17 @@ import scala.util.control.NonFatal object SparkContextUtils { + def close(inputStream: InputStream, path: String): Unit = { + try { + inputStream.close() + } catch { + case NonFatal(ex) => + println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") + } + } + + case class BigFileSlice(index: Int) + case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { override def getPartition(key: Any): Int = index(key) } @@ -273,7 +285,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -292,13 +304,54 @@ object SparkContextUtils { println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") } finally { - try { - inputStream.close() - } catch { - case NonFatal(ex) => - println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") - } + close(inputStream, path) + } + } + } + } + + def readCompressedBigFile(file: HadoopFile, maxBytesPerPartition: Long, minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling, sampleCount: Int = 100): RDD[String] = { + val estimatedSize = sizeBasedFileHandling.estimatedSize(file) + val totalSlices = (estimatedSize / maxBytesPerPartition + 1).toInt + val slices = (0 until totalSlices).map(BigFileSlice.apply) + + val partitioner = { + val indexedPartitions: Map[Any, Int] = slices.map(s => s -> s.index).toMap + IndexedPartitioner(totalSlices, indexedPartitions) + } + val hadoopConf = _hadoopConf + + val partitionedSlices = sc.parallelize(slices.map(s => s -> ()), 2).partitionBy(partitioner) + + partitionedSlices.mapPartitions { slices => + val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } + val codecFactory = new CompressionCodecFactory(conf) + val hadoopPath = new Path(file.path) + val fileSystem = hadoopPath.getFileSystem(conf) + slices.flatMap { case (slice, _) => + val inputStream = Option(codecFactory.getCodec(hadoopPath)) match { + case Some(compression) => compression.createInputStream(fileSystem.open(hadoopPath)) + case None => fileSystem.open(hadoopPath) } + val lines = Source.fromInputStream(inputStream)(Codec.UTF8).getLines() + + val lineSample = lines.take(sampleCount).toList + val linesPerSlice = { + val sampleSize = lineSample.map(_.size).sum + val estimatedAverageLineSize = Math.round(sampleSize / sampleCount.toFloat) + val estimatedTotalLines = Math.round(estimatedSize / estimatedAverageLineSize.toFloat) + estimatedTotalLines / totalSlices + 1 + } + + val linesAfterSeek = (lineSample.toIterator ++ lines).drop(linesPerSlice * slice.index) + + val finalLines = if (slice.index + 1 == totalSlices) // last slice, read until the end + linesAfterSeek + else + linesAfterSeek.take(linesPerSlice) + + AutoCloseableIterator.wrap(finalLines, () => close(inputStream, s"${file.path}, slice $slice")) } } } @@ -315,17 +368,14 @@ object SparkContextUtils { def read(file: HadoopFile, conf: Configuration) = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](conf = conf, fClass = classOf[TextInputFormat], kClass = classOf[LongWritable], vClass = classOf[Text], path = file.path).map(pair => pair._2.toString) - val confCompressed = confWith(maxBytesPerPartition / sizeBasedFileHandling.averageEstimatedCompressionRatio) val confUncompressed = confWith(maxBytesPerPartition) val union = new UnionRDD(sc, bigFiles.map { file => - val conf = if (sizeBasedFileHandling.isCompressed(file)) - confCompressed + if (sizeBasedFileHandling.isCompressed(file)) + readCompressedBigFile(file, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) else - confUncompressed - - read(file, conf) + read(file, confUncompressed) }) if (union.partitions.size < minPartitions) @@ -348,7 +398,7 @@ object SparkContextUtils { readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) } - private def createPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long, sizeBasedFileHandling: SizeBasedFileHandling): Partitioner = { + private def createSmallFilesPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long, sizeBasedFileHandling: SizeBasedFileHandling): Partitioner = { implicit val ordering: Ordering[HadoopFilePartition] = Ordering.by(p => -p.size) // Small partitions come first (highest priority) val pq: mutable.PriorityQueue[HadoopFilePartition] = mutable.PriorityQueue.empty diff --git a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala new file mode 100644 index 00000000..b3f054ba --- /dev/null +++ b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala @@ -0,0 +1,67 @@ +package ignition.core.utils + +import scala.util.Try +import scala.util.control.NonFatal + +object AutoCloseableIterator { + case object empty extends AutoCloseableIterator[Nothing] { + override def naiveHasNext() = false + override def naiveNext() = throw new Exception("Empty AutoCloseableIterator") + override def naiveClose() = {} + } + + def wrap[T](iterator: Iterator[T], close: () => Unit = () => ()): AutoCloseableIterator[T] = new AutoCloseableIterator[T] { + override def naiveClose(): Unit = close() + override def naiveHasNext(): Boolean = iterator.hasNext + override def naiveNext(): T = iterator.next() + } +} + +trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { + // Naive functions should be implemented by the user as in a standard Iterator/AutoCloseable + def naiveHasNext(): Boolean + def naiveNext(): T + def naiveClose(): Unit + + var closed = false + + // hasNext closes the iterator and handles the case where it is already closed + override def hasNext(): Boolean = if (closed) + false + else { + val naiveResult = try { + naiveHasNext + } catch { + case NonFatal(e) => + Try { close } + throw e + } + if (naiveResult) + true + else { + close // auto close when exhausted + false + } + } + + // next closes the iterator and handles the case where it is already closed + override def next(): T = if (closed) + throw new RuntimeException("Trying to get next element on a closed iterator") + else if (hasNext()) + try { + naiveNext + } catch { + case NonFatal(e) => + Try { close } + throw e + } + else + throw new RuntimeException("Trying to get next element on an exhausted iterator") + + override def close() = if (!closed) { + closed = true + naiveClose + } + + override def finalize() = Try { close } +} From 368a9986fa019f15d7f303b403cf96821899609f Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 20 Nov 2015 18:28:33 -0200 Subject: [PATCH 59/80] Removed unused dependency --- build.sbt | 2 -- src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala | 1 - 2 files changed, 3 deletions(-) diff --git a/build.sbt b/build.sbt index 528d30cf..acdef9cb 100644 --- a/build.sbt +++ b/build.sbt @@ -19,8 +19,6 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "nl.basjes.hadoop" % "splittablegzip" % "1.2" - libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.2.4" libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index dec5ca13..06ea71ee 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -361,7 +361,6 @@ object SparkContextUtils { minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq( - "io.compression.codecs" -> "org.apache.hadoop.io.compress.DefaultCodec,nl.basjes.hadoop.io.compress.SplittableGzipCodec,org.apache.hadoop.io.compress.BZip2Codec", "mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } From 016de5b8e9db0e8aa51bf6c00f1de880938de1e3 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 23 Nov 2015 10:48:56 -0200 Subject: [PATCH 60/80] pr review --- .../core/jobs/utils/SparkContextUtils.scala | 110 +++++++++++++++--- .../scala/ignition/core/utils/S3Utils.scala | 91 --------------- 2 files changed, 93 insertions(+), 108 deletions(-) delete mode 100644 src/main/scala/ignition/core/utils/S3Utils.scala diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 96f2341d..de5cdfca 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -1,22 +1,22 @@ package ignition.core.jobs.utils +import com.amazonaws.auth.EnvironmentVariableCredentialsProvider import com.amazonaws.services.s3.AmazonS3Client -import com.amazonaws.services.s3.model.{S3ObjectSummary, S3Object} +import com.amazonaws.services.s3.model.{ObjectListing, S3ObjectSummary} import ignition.core.utils.ByteUtils +import ignition.core.utils.DateUtils._ import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.io.{Text, LongWritable} +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.io.compress.CompressionCodecFactory +import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapreduce.lib.input.TextInputFormat +import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.{Partitioner, SparkContext} -import org.apache.hadoop.fs.{FileStatus, Path, FileSystem} -import org.apache.spark.rdd.{UnionRDD, RDD} import org.joda.time.DateTime -import ignition.core.utils.DateUtils._ -import ignition.core.utils.S3Utils._ import scala.collection.JavaConversions._ -import scala.collection.mutable.ArrayBuffer import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import scala.io.{Codec, Source} import scala.reflect.ClassTag import scala.util.Try @@ -24,15 +24,12 @@ import scala.util.control.NonFatal object SparkContextUtils { + implicit lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) + case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { override def getPartition(key: Any): Int = index(key) } - implicit class S3ObjectSummaryExtensions(s3Object: S3ObjectSummary) { - def toHadoopFile: HadoopFile = - HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) - } - case class HadoopFile(path: String, isDir: Boolean, size: Long) private case class HadoopFilePartition(size: Long, paths: Seq[String]) @@ -93,7 +90,7 @@ object SparkContextUtils { if (splittedPaths.size < minimumPaths) throw new Exception(s"Not enough paths found for $paths") - parallelListEndReadTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) + parallelListAndReadTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) } private def filterPaths(paths: Seq[String], @@ -341,7 +338,7 @@ object SparkContextUtils { union } - def parallelListEndReadTextFiles(paths: List[String], + def parallelListAndReadTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int, listOnWorkers: Boolean, @@ -479,17 +476,96 @@ object SparkContextUtils { innerListFiles(directories) } + private def s3List(bucket: String, prefix: String, predicate: S3ObjectSummary => Boolean = _ => true) + (implicit s3: AmazonS3Client): List[S3ObjectSummary] = { + def inner(acc: mutable.ArrayBuffer[S3ObjectSummary], listing: ObjectListing): List[S3ObjectSummary] = { + acc ++= listing.getObjectSummaries.toList.filter(predicate) + if (listing.isTruncated) + inner(acc, s3.listNextBatchOfObjects(listing)) + else + acc.toList + } + + inner(new mutable.ArrayBuffer[S3ObjectSummary], s3.listObjects(bucket, prefix)) + } + + def s3ListAndFilterFiles(bucket: String, + prefix: String, + start: Option[DateTime] = None, + end: Option[DateTime] = None, + endsWith: Option[String] = None, + exclusionPattern: Option[String] = Option("_$folder$"), + predicate: HadoopFile => Boolean = _ => true) + (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): List[HadoopFile] = { + + def excludePatternValidation(s3Object: S3ObjectSummary, exclusionPatternOption: Option[String]): Option[S3ObjectSummary] = + exclusionPatternOption match { + case Some(pattern) if s3Object.getKey.contains(pattern) => None + case Some(_) | None => Option(s3Object) + } + + def endsWithValidation(s3Object: S3ObjectSummary, endsWithOption: Option[String]): Option[S3ObjectSummary] = + endsWithOption match { + case Some(pattern) if s3Object.getKey.endsWith(pattern) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def extractDateFromKey(s3Object: S3ObjectSummary): Option[DateTime] = + Try(pathDateExtractor.extractFromPath(s"s3n://$bucket/${s3Object.getKey}")).toOption + + def startValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, startOption: Option[DateTime]): Option[S3ObjectSummary] = + startOption match { + case Some(startDate) if startDate.isEqualOrBefore(extractedDate) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def endValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, endOption: Option[DateTime]): Option[S3ObjectSummary] = + endOption match { + case Some(endDate) if endDate.isEqualOrAfter(extractedDate) => Option(s3Object) + case Some(_) => None + case None => Option(s3Object) + } + + def applyPredicate(file: HadoopFile): Option[HadoopFile] = + if (predicate(file)) + Option(file) + else + None + + def toHadoopFile(s3Object: S3ObjectSummary): HadoopFile = + HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) + + val allValidations: S3ObjectSummary => Boolean = s3Object => { + val validatedFile = for { + withValidPattern <- excludePatternValidation(s3Object, exclusionPattern) + withValidEndsWith <- endsWithValidation(withValidPattern, endsWith) + extractedDate <- extractDateFromKey(withValidEndsWith) + withValidStart <- startValidation(withValidEndsWith, extractedDate, start) + withValidEnd <- endValidation(withValidStart, extractedDate, end) + hadoopFile = toHadoopFile(withValidEnd) + valid <- applyPredicate(hadoopFile) + } yield valid + validatedFile.isDefined + } + + s3List(bucket, prefix, allValidations)(s3).map(toHadoopFile) + } + + def s3FilterAndGetParallelTextFiles(bucket: String, prefix: String, startDate: Option[DateTime] = None, endDate: Option[DateTime] = None, endsWith: Option[String] = None, - predicate: S3ObjectSummary => Boolean = _ => true, + predicate: HadoopFile => Boolean = _ => true, maxBytesPerPartition: Long = 256 * 1000 * 1000, minPartitions: Int = 100, sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) - (implicit s3Client: AmazonS3Client, dateExtractor: PathDateExtractor): RDD[String] = { - val foundFiles = s3ListAndFilterFiles(bucket, prefix, startDate, endDate, predicate = predicate)(s3Client, dateExtractor).map(_.toHadoopFile) + (implicit s3Client: AmazonS3Client = amazonS3ClientFromEnvironmentVariables, + dateExtractor: PathDateExtractor): RDD[String] = { + val foundFiles = s3ListAndFilterFiles(bucket, prefix, startDate, endDate, predicate = predicate)(s3Client, dateExtractor) parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) } diff --git a/src/main/scala/ignition/core/utils/S3Utils.scala b/src/main/scala/ignition/core/utils/S3Utils.scala deleted file mode 100644 index 28866c4c..00000000 --- a/src/main/scala/ignition/core/utils/S3Utils.scala +++ /dev/null @@ -1,91 +0,0 @@ -package ignition.core.utils - -import com.amazonaws.auth.EnvironmentVariableCredentialsProvider -import com.amazonaws.services.s3.AmazonS3Client -import com.amazonaws.services.s3.model.{S3ObjectSummary, ObjectListing} -import ignition.core.jobs.utils.PathDateExtractor -import ignition.core.utils.DateUtils._ -import org.joda.time.DateTime - -import scala.collection.JavaConversions._ -import scala.collection.mutable -import scala.util.Try - -object S3Utils { - - implicit lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) - - def s3List(bucket: String, prefix: String, predicate: S3ObjectSummary => Boolean = _ => true) - (implicit s3: AmazonS3Client): List[S3ObjectSummary] = { - def inner(acc: mutable.ArrayBuffer[S3ObjectSummary], listing: ObjectListing): List[S3ObjectSummary] = { - acc ++= listing.getObjectSummaries.toList.filter(predicate) - if (listing.isTruncated) - inner(acc, s3.listNextBatchOfObjects(listing)) - else - acc.toList - } - - inner(new mutable.ArrayBuffer[S3ObjectSummary], s3.listObjects(bucket, prefix)) - } - - def s3ListAndFilterFiles(bucket: String, - prefix: String, - start: Option[DateTime] = None, - end: Option[DateTime] = None, - endsWith: Option[String] = None, - exclusionPattern: Option[String] = Option("_$folder$"), - predicate: S3ObjectSummary => Boolean = _ => true) - (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): List[S3ObjectSummary] = { - - def excludePatternValidation(s3Object: S3ObjectSummary, exclusionPatternOption: Option[String]): Option[S3ObjectSummary] = - exclusionPatternOption match { - case Some(pattern) if s3Object.getKey.contains(pattern) => None - case Some(_) | None => Option(s3Object) - } - - def endsWithValidation(s3Object: S3ObjectSummary, endsWithOption: Option[String]): Option[S3ObjectSummary] = - endsWithOption match { - case Some(pattern) if s3Object.getKey.endsWith(pattern) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) - } - - def extractDateFromKey(s3Object: S3ObjectSummary): Option[DateTime] = - Try(pathDateExtractor.extractFromPath(s"s3://$bucket/${s3Object.getKey}")).toOption - - def startValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, startOption: Option[DateTime]): Option[S3ObjectSummary] = - startOption match { - case Some(startDate) if startDate.isEqualOrBefore(extractedDate) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) - } - - def endValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, endOption: Option[DateTime]): Option[S3ObjectSummary] = - endOption match { - case Some(endDate) if endDate.isEqualOrAfter(extractedDate) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) - } - - def applyPredicate(s3Object: S3ObjectSummary): Option[S3ObjectSummary] = - if (predicate(s3Object)) - Option(s3Object) - else - None - - val allValidations: S3ObjectSummary => Boolean = s3Object => { - val validatedS3Object = for { - withValidPattern <- excludePatternValidation(s3Object, exclusionPattern) - withValidEndsWith <- endsWithValidation(withValidPattern, endsWith) - extractedDate <- extractDateFromKey(withValidEndsWith) - withValidStart <- startValidation(withValidEndsWith, extractedDate, start) - withValidEnd <- endValidation(withValidStart, extractedDate, end) - valid <- applyPredicate(withValidEnd) - } yield valid - validatedS3Object.isDefined - } - - s3List(bucket, prefix, allValidations)(s3) - } - -} From 7c23316dd058acce0a607b6e16ba0ff35460a28a Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 25 Nov 2015 14:10:27 -0200 Subject: [PATCH 61/80] fix lambda ref to close resources --- .../scala/ignition/core/utils/AutoCloseableIterator.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala index b3f054ba..bc294f6f 100644 --- a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala +++ b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala @@ -10,8 +10,8 @@ object AutoCloseableIterator { override def naiveClose() = {} } - def wrap[T](iterator: Iterator[T], close: () => Unit = () => ()): AutoCloseableIterator[T] = new AutoCloseableIterator[T] { - override def naiveClose(): Unit = close() + def wrap[T](iterator: Iterator[T], doClose: () => Unit = () => ()): AutoCloseableIterator[T] = new AutoCloseableIterator[T] { + override def naiveClose(): Unit = doClose() override def naiveHasNext(): Boolean = iterator.hasNext override def naiveNext(): T = iterator.next() } From 358459f9ea32143d2c63ef460986cef0d75345d7 Mon Sep 17 00:00:00 2001 From: Leandro Date: Fri, 4 Dec 2015 19:43:09 -0200 Subject: [PATCH 62/80] Small Xlint fixes --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- .../scala/ignition/core/utils/AutoCloseableIterator.scala | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 97aed619..de3bf3ae 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -289,7 +289,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> Unit ), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -326,7 +326,7 @@ object SparkContextUtils { } val hadoopConf = _hadoopConf - val partitionedSlices = sc.parallelize(slices.map(s => s -> ()), 2).partitionBy(partitioner) + val partitionedSlices = sc.parallelize(slices.map(s => s -> Unit ), 2).partitionBy(partitioner) partitionedSlices.mapPartitions { slices => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } diff --git a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala index bc294f6f..4e3db808 100644 --- a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala +++ b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala @@ -26,7 +26,7 @@ trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { var closed = false // hasNext closes the iterator and handles the case where it is already closed - override def hasNext(): Boolean = if (closed) + override def hasNext: Boolean = if (closed) false else { val naiveResult = try { @@ -47,7 +47,7 @@ trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { // next closes the iterator and handles the case where it is already closed override def next(): T = if (closed) throw new RuntimeException("Trying to get next element on a closed iterator") - else if (hasNext()) + else if (hasNext) try { naiveNext } catch { From 5f54641cb7d6448148a0570599504125fb976eaa Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Dec 2015 15:40:17 -0200 Subject: [PATCH 63/80] Make it partially compatible with scala 2.11 and Xlint free and minor cleanups --- build.sbt | 8 ++++---- .../ignition/core/jobs/utils/SparkContextUtils.scala | 12 +++++++----- .../ignition/core/utils/AutoCloseableIterator.scala | 4 ++-- src/main/scala/ignition/core/utils/BetterTrace.scala | 3 ++- src/main/scala/ignition/core/utils/FutureUtils.scala | 2 +- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/build.sbt b/build.sbt index ec4d70bf..d0e2b029 100644 --- a/build.sbt +++ b/build.sbt @@ -4,7 +4,7 @@ version := "1.0" scalaVersion := "2.10.4" -scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings") +scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code") ideaExcludeFolders += ".idea" @@ -19,9 +19,7 @@ libraryDependencies += ("org.apache.spark" %% "spark-core" % "1.5.1" % "provided libraryDependencies += ("org.apache.hadoop" % "hadoop-client" % "2.0.0-cdh4.7.1" % "provided") -libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.2.4" - -libraryDependencies += "org.scalaj" %% "scalaj-http" % "0.3.16" +libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" libraryDependencies += "org.scalaz" %% "scalaz-core" % "7.0.6" @@ -35,6 +33,8 @@ libraryDependencies += "org.joda" % "joda-convert" % "1.7" libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.9.6" +libraryDependencies += "commons-lang" % "commons-lang" % "2.6" + resolvers += "Akka Repository" at "http://repo.akka.io/releases/" resolvers += "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/" diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 97aed619..08f4a39d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -23,6 +23,8 @@ import scala.io.{Codec, Source} import scala.reflect.ClassTag import scala.util.Try import scala.util.control.NonFatal +import ignition.core.utils.ExceptionUtils._ + object SparkContextUtils { @@ -31,7 +33,7 @@ object SparkContextUtils { inputStream.close() } catch { case NonFatal(ex) => - println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") + println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") } } @@ -289,7 +291,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> ()), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> Unit), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -305,8 +307,8 @@ object SparkContextUtils { Source.fromInputStream(inputStream)(Codec.UTF8).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } catch { case NonFatal(ex) => - println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") - throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getStackTraceString}") + println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") + throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") } finally { close(inputStream, path) } @@ -326,7 +328,7 @@ object SparkContextUtils { } val hadoopConf = _hadoopConf - val partitionedSlices = sc.parallelize(slices.map(s => s -> ()), 2).partitionBy(partitioner) + val partitionedSlices = sc.parallelize(slices.map(s => s -> Unit), 2).partitionBy(partitioner) partitionedSlices.mapPartitions { slices => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } diff --git a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala index bc294f6f..4e3db808 100644 --- a/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala +++ b/src/main/scala/ignition/core/utils/AutoCloseableIterator.scala @@ -26,7 +26,7 @@ trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { var closed = false // hasNext closes the iterator and handles the case where it is already closed - override def hasNext(): Boolean = if (closed) + override def hasNext: Boolean = if (closed) false else { val naiveResult = try { @@ -47,7 +47,7 @@ trait AutoCloseableIterator[T] extends Iterator[T] with AutoCloseable { // next closes the iterator and handles the case where it is already closed override def next(): T = if (closed) throw new RuntimeException("Trying to get next element on a closed iterator") - else if (hasNext()) + else if (hasNext) try { naiveNext } catch { diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala index 158e261e..32d5ea5f 100644 --- a/src/main/scala/ignition/core/utils/BetterTrace.scala +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -1,5 +1,6 @@ package ignition.core.utils +import ignition.core.utils.ExceptionUtils._ // Used mainly to augment scalacheck traces in scalatest trait BetterTrace { def fail(message: String): Nothing @@ -7,7 +8,7 @@ trait BetterTrace { try { block } catch { - case t: Throwable => fail(s"${t.getMessage}: ${t.getStackTraceString}") + case t: Throwable => fail(s"${t.getMessage}: ${t.getFullStacktraceString}") } } diff --git a/src/main/scala/ignition/core/utils/FutureUtils.scala b/src/main/scala/ignition/core/utils/FutureUtils.scala index 95b44c2f..4523a94f 100644 --- a/src/main/scala/ignition/core/utils/FutureUtils.scala +++ b/src/main/scala/ignition/core/utils/FutureUtils.scala @@ -6,7 +6,7 @@ import scala.util.{Failure, Success, Try} object FutureUtils { - def blockingFuture[T](body: =>T)(implicit ec: ExecutionContext): Future[T] = future { blocking { body } } + def blockingFuture[T](body: =>T)(implicit ec: ExecutionContext): Future[T] = Future { blocking { body } } implicit class FutureImprovements[V](future: Future[V]) { def toOptionOnFailure(errorHandler: (Throwable) => Option[V])(implicit ec: ExecutionContext): Future[Option[V]] = { From 0ec37240db44a57e0a2117f53bf8d577b4a71037 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Dec 2015 15:41:30 -0200 Subject: [PATCH 64/80] Make it partially compatible with scala 2.11 and Xlint free and minor cleanups --- src/main/scala/ignition/core/utils/ExceptionUtils.scala | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 src/main/scala/ignition/core/utils/ExceptionUtils.scala diff --git a/src/main/scala/ignition/core/utils/ExceptionUtils.scala b/src/main/scala/ignition/core/utils/ExceptionUtils.scala new file mode 100644 index 00000000..e2626764 --- /dev/null +++ b/src/main/scala/ignition/core/utils/ExceptionUtils.scala @@ -0,0 +1,9 @@ +package ignition.core.utils + +object ExceptionUtils { + + implicit class ExceptionImprovements(e: Throwable) { + def getFullStacktraceString(): String = org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(e) + } + +} From b66d05dfa1cd328442c52c80d07b19f7643b67a5 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Dec 2015 15:48:44 -0200 Subject: [PATCH 65/80] Renaming --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 6 +++--- src/main/scala/ignition/core/utils/BetterTrace.scala | 2 +- src/main/scala/ignition/core/utils/ExceptionUtils.scala | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 08f4a39d..6765009d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -33,7 +33,7 @@ object SparkContextUtils { inputStream.close() } catch { case NonFatal(ex) => - println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") + println(s"Fail to close resource from '$path': ${ex.getMessage} -- ${ex.getFullStackTraceString}") } } @@ -307,8 +307,8 @@ object SparkContextUtils { Source.fromInputStream(inputStream)(Codec.UTF8).getLines().foldLeft(ArrayBuffer.empty[String])(_ += _) } catch { case NonFatal(ex) => - println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") - throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStacktraceString}") + println(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStackTraceString}") + throw new Exception(s"Failed to read resource from '$path': ${ex.getMessage} -- ${ex.getFullStackTraceString}") } finally { close(inputStream, path) } diff --git a/src/main/scala/ignition/core/utils/BetterTrace.scala b/src/main/scala/ignition/core/utils/BetterTrace.scala index 32d5ea5f..387f49f7 100644 --- a/src/main/scala/ignition/core/utils/BetterTrace.scala +++ b/src/main/scala/ignition/core/utils/BetterTrace.scala @@ -8,7 +8,7 @@ trait BetterTrace { try { block } catch { - case t: Throwable => fail(s"${t.getMessage}: ${t.getFullStacktraceString}") + case t: Throwable => fail(s"${t.getMessage}: ${t.getFullStackTraceString}") } } diff --git a/src/main/scala/ignition/core/utils/ExceptionUtils.scala b/src/main/scala/ignition/core/utils/ExceptionUtils.scala index e2626764..1ae33568 100644 --- a/src/main/scala/ignition/core/utils/ExceptionUtils.scala +++ b/src/main/scala/ignition/core/utils/ExceptionUtils.scala @@ -3,7 +3,7 @@ package ignition.core.utils object ExceptionUtils { implicit class ExceptionImprovements(e: Throwable) { - def getFullStacktraceString(): String = org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(e) + def getFullStackTraceString(): String = org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(e) } } From 2f6741dfd3d5ff30738cda6c2cc3279cda06fe0e Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 7 Dec 2015 16:11:10 -0200 Subject: [PATCH 66/80] Use null instead of Unit because Unit isnt serialiable --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 6765009d..648da060 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -291,7 +291,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> Unit), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> null), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -328,7 +328,7 @@ object SparkContextUtils { } val hadoopConf = _hadoopConf - val partitionedSlices = sc.parallelize(slices.map(s => s -> Unit), 2).partitionBy(partitioner) + val partitionedSlices = sc.parallelize(slices.map(s => s -> null), 2).partitionBy(partitioner) partitionedSlices.mapPartitions { slices => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } From 84e98f490d409f8b9de741eada91d932979b1eff Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Fri, 27 Nov 2015 10:54:12 -0200 Subject: [PATCH 67/80] new filter and get text files --- .../core/jobs/utils/SparkContextUtils.scala | 448 ++++++++++-------- 1 file changed, 250 insertions(+), 198 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 648da060..1afbd74f 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -4,9 +4,9 @@ import java.io.InputStream import com.amazonaws.auth.EnvironmentVariableCredentialsProvider import com.amazonaws.services.s3.AmazonS3Client -import com.amazonaws.services.s3.model.{ObjectListing, S3ObjectSummary} -import ignition.core.utils.{AutoCloseableIterator, ByteUtils} +import com.amazonaws.services.s3.model.{ListObjectsRequest, ObjectListing, S3ObjectSummary} import ignition.core.utils.DateUtils._ +import ignition.core.utils.{AutoCloseableIterator, ByteUtils, HadoopUtils} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.io.compress.CompressionCodecFactory @@ -21,14 +21,24 @@ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.io.{Codec, Source} import scala.reflect.ClassTag -import scala.util.Try +import scala.util.{Failure, Success, Try} import scala.util.control.NonFatal import ignition.core.utils.ExceptionUtils._ object SparkContextUtils { - def close(inputStream: InputStream, path: String): Unit = { + private case class BigFileSlice(index: Int) + + private case class HadoopFilePartition(size: Long, paths: Seq[String]) + + private case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { + override def getPartition(key: Any): Int = index(key) + } + + private lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) + + private def close(inputStream: InputStream, path: String): Unit = { try { inputStream.close() } catch { @@ -37,17 +47,8 @@ object SparkContextUtils { } } - case class BigFileSlice(index: Int) - implicit lazy val amazonS3ClientFromEnvironmentVariables = new AmazonS3Client(new EnvironmentVariableCredentialsProvider()) - - case class IndexedPartitioner(numPartitions: Int, index: Map[Any, Int]) extends Partitioner { - override def getPartition(key: Any): Int = index(key) - } - case class HadoopFile(path: String, isDir: Boolean, size: Long) - private case class HadoopFilePartition(size: Long, paths: Seq[String]) - implicit class SparkContextImprovements(sc: SparkContext) { lazy val _hadoopConf = sc.broadcast(sc.hadoopConfiguration.iterator().map { case entry => entry.getKey -> entry.getValue }.toMap) @@ -95,18 +96,6 @@ object SparkContextUtils { processPaths((p) => sc.textFile(p), paths, minimumPaths) } - private def processParallelTextFiles(paths: Seq[String], - minimumPaths: Int, - maxBytesPerPartition: Long, - minPartitions: Int, - listOnWorkers: Boolean): RDD[String] = { - val splittedPaths = paths.flatMap(ignition.core.utils.HadoopUtils.getPathStrings) - if (splittedPaths.size < minimumPaths) - throw new Exception(s"Not enough paths found for $paths") - - parallelListAndReadTextFiles(splittedPaths.toList, maxBytesPerPartition, minPartitions, listOnWorkers) - } - private def filterPaths(paths: Seq[String], requireSuccess: Boolean, inclusiveStartDate: Boolean, @@ -141,7 +130,6 @@ object SparkContextUtils { } - def getFilteredPaths(paths: Seq[String], requireSuccess: Boolean, inclusiveStartDate: Boolean, @@ -154,7 +142,6 @@ object SparkContextUtils { filterPaths(paths, requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) } - lazy val hdfsPathPrefix = sc.master.replaceFirst("spark://(.*):7077", "hdfs://$1:9000/") def synchToHdfs(paths: Seq[String], pathsToRdd: (Seq[String], Int) => RDD[String], forceSynch: Boolean): Seq[String] = { @@ -184,16 +171,6 @@ object SparkContextUtils { processTextFiles(paths, minimumPaths) } - def getParallelTextFiles(paths: Seq[String], - maxBytesPerPartition: Long = 256 * 1000 * 1000, - minPartitions: Int = 100, - synchLocally: Boolean = false, forceSynch: Boolean = false, minimumPaths: Int = 1, listOnWorkers: Boolean = false): RDD[String] = { - if (synchLocally) - processParallelTextFiles(synchToHdfs(paths, processTextFiles, forceSynch), minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) - else - processParallelTextFiles(paths, minimumPaths, maxBytesPerPartition, minPartitions, listOnWorkers) - } - @deprecated("It may incur heavy S3 costs and/or be slow with small files, use filterAndGetParallelTextFiles instead", "2015-10-27") def filterAndGetTextFiles(path: String, requireSuccess: Boolean = false, @@ -212,26 +189,6 @@ object SparkContextUtils { getTextFiles(paths, synchLocally, forceSynch, minimumPaths) } - def filterAndGetParallelTextFiles(path: String, - maxBytesPerPartition: Long = 256 * 1000 * 1000, - minPartitions: Int = 100, - requireSuccess: Boolean = false, - inclusiveStartDate: Boolean = true, - startDate: Option[DateTime] = None, - inclusiveEndDate: Boolean = true, - endDate: Option[DateTime] = None, - lastN: Option[Int] = None, - synchLocally: Boolean = false, - forceSynch: Boolean = false, - ignoreMalformedDates: Boolean = false, - minimumPaths: Int = 1, - listOnWorkers: Boolean = false)(implicit dateExtractor: PathDateExtractor): RDD[String] = { - val paths = getFilteredPaths(Seq(path), requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, endDate, lastN, ignoreMalformedDates) - if (paths.size < minimumPaths) - throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of paths $paths is less than the required") - getParallelTextFiles(paths, maxBytesPerPartition, minPartitions, synchLocally, forceSynch, minimumPaths, listOnWorkers) - } - private def stringHadoopFile(paths: Seq[String], minimumPaths: Int): RDD[Try[String]] = { processPaths((p) => sc.sequenceFile(p, classOf[LongWritable], classOf[org.apache.hadoop.io.BytesWritable]) .map({ case (k, v) => Try { ByteUtils.toString(v.getBytes, 0, v.getLength, "UTF-8") } }), paths, minimumPaths) @@ -287,11 +244,11 @@ object SparkContextUtils { } - def readSmallFiles(smallFiles: List[HadoopFile], - maxBytesPerPartition: Long, - minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> null), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + private def readSmallFiles(smallFiles: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file ->()), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -316,8 +273,8 @@ object SparkContextUtils { } } - def readCompressedBigFile(file: HadoopFile, maxBytesPerPartition: Long, minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling, sampleCount: Int = 100): RDD[String] = { + private def readCompressedBigFile(file: HadoopFile, maxBytesPerPartition: Long, minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling, sampleCount: Int = 100): RDD[String] = { val estimatedSize = sizeBasedFileHandling.estimatedSize(file) val totalSlices = (estimatedSize / maxBytesPerPartition + 1).toInt val slices = (0 until totalSlices).map(BigFileSlice.apply) @@ -362,10 +319,10 @@ object SparkContextUtils { } } - def readBigFiles(bigFiles: List[HadoopFile], - maxBytesPerPartition: Long, - minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { + private def readBigFiles(bigFiles: List[HadoopFile], + maxBytesPerPartition: Long, + minPartitions: Int, + sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { def confWith(maxSplitSize: Long): Configuration = (_hadoopConf.value ++ Seq( "mapreduce.input.fileinputformat.split.maxsize" -> maxSplitSize.toString)) .foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } @@ -392,21 +349,27 @@ object SparkContextUtils { def parallelListAndReadTextFiles(paths: List[String], maxBytesPerPartition: Long, minPartitions: Int, - listOnWorkers: Boolean, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { - - val foundFiles = (if (listOnWorkers) parallelListFiles(paths) else driverListFiles(paths)).filter(_.size > 0) - parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) + (implicit dateExtractor: PathDateExtractor): RDD[String] = { + val foundFiles = paths.flatMap(smartList(_)).filter(_.size > 0) + parallelReadTextFiles(foundFiles, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, sizeBasedFileHandling = sizeBasedFileHandling) } def parallelReadTextFiles(files: List[HadoopFile], - maxBytesPerPartition: Long, - minPartitions: Int, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { - val (bigFiles, smallFiles) = files.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) - sc.union( - readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), - readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling(), + synchLocally: Option[String] = None, + forceSynch: Boolean = false): RDD[String] = { + if (synchLocally.isDefined) + doSync(files, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, synchLocally = synchLocally.get, + sizeBasedFileHandling = sizeBasedFileHandling, forceSynch = forceSynch) + else { + val (bigFiles, smallFiles) = files.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) + sc.union( + readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), + readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + } } private def createSmallFilesPartitioner(files: List[HadoopFile], maxBytesPerPartition: Long, minPartitions: Long, sizeBasedFileHandling: SizeBasedFileHandling): Partitioner = { @@ -435,11 +398,9 @@ object SparkContextUtils { IndexedPartitioner(partitions.size, indexedPartitions) } - - private def executeListOnWorkers(paths: RDD[String]): List[HadoopFile] = { - val hadoopConf = _hadoopConf + private def executeDriverList(paths: Seq[String]): List[HadoopFile] = { + val conf = _hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } paths.flatMap { path => - val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } val hadoopPath = new Path(path) val fileSystem = hadoopPath.getFileSystem(conf) val tryFind = try { @@ -462,162 +423,253 @@ object SparkContextUtils { val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList } - }.collect().toList + }.toList } - - def parallelListFiles(paths: List[String]): List[HadoopFile] = { - - val directories = paths.map(HadoopFile(_, isDir = true, 0)) - + private def driverListFiles(path: String): List[HadoopFile] = { def innerListFiles(remainingDirectories: List[HadoopFile]): List[HadoopFile] = { if (remainingDirectories.isEmpty) { Nil } else { - val remainingPaths = remainingDirectories.map(_.path) - val pathsRDD = sc.parallelize(remainingPaths, remainingPaths.size / 2) - val (dirs, files) = executeListOnWorkers(pathsRDD).partition(_.isDir) + val (dirs, files) = executeDriverList(remainingDirectories.map(_.path)).partition(_.isDir) files ++ innerListFiles(dirs) } } - innerListFiles(directories) + innerListFiles(List(HadoopFile(path, isDir = true, 0))) } + def s3ListCommonPrefixes(bucket: String, prefix: String, delimiter: String = "/") + (implicit s3: AmazonS3Client): Stream[String] = { + def inner(current: ObjectListing): Stream[String] = + if (current.isTruncated) + current.getCommonPrefixes.toStream ++ inner(s3.listNextBatchOfObjects(current)) + else + current.getCommonPrefixes.toStream - private def executeDriverList(paths: Seq[String]): List[HadoopFile] = { - val conf = _hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } - paths.flatMap { path => - val hadoopPath = new Path(path) - val fileSystem = hadoopPath.getFileSystem(conf) - val tryFind = try { - val status = fileSystem.getFileStatus(hadoopPath) - if (status.isDirectory) { - val sanitize = Option(fileSystem.listStatus(hadoopPath)).getOrElse(Array.empty) - Option(sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList) - } else if (status.isFile) { - Option(List(HadoopFile(status.getPath.toString, status.isDirectory, status.getLen))) - } else { - None - } - } catch { - case e: java.io.FileNotFoundException => - None - } - - tryFind.getOrElse { - // Maybe is glob or not found - val sanitize = Option(fileSystem.globStatus(hadoopPath)).getOrElse(Array.empty) - sanitize.map(status => HadoopFile(status.getPath.toString, status.isDirectory, status.getLen)).toList - } - }.toList + val request = new ListObjectsRequest(bucket, prefix, null, delimiter, 1000) + inner(s3.listObjects(request)) } - def driverListFiles(paths: List[String]): List[HadoopFile] = { - - val directories = paths.map(HadoopFile(_, isDir = true, 0)) + def s3ListObjects(bucket: String, prefix: String) + (implicit s3: AmazonS3Client): Stream[S3ObjectSummary] = { + def inner(current: ObjectListing): Stream[S3ObjectSummary] = + if (current.isTruncated) + current.getObjectSummaries.toStream ++ inner(s3.listNextBatchOfObjects(current)) + else + current.getObjectSummaries.toStream + + inner(s3.listObjects(bucket, prefix)) + } + + def s3NarrowPaths(bucket: String, + prefix: String, + delimiter: String = "/", + inclusiveStartDate: Boolean = true, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = true, + endDate: Option[DateTime] = None, + ignoreHours: Boolean = true) + (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): Stream[String] = { + + def isGoodDate(date: DateTime): Boolean = { + val startDateToCompare = startDate.map(date => if (ignoreHours) date.withTimeAtStartOfDay() else date) + val endDateToCompare = endDate.map(date => if (ignoreHours) date.withTime(23, 59, 59, 999) else date) + val goodStartDate = startDateToCompare.isEmpty || (inclusiveStartDate && date.saneEqual(startDateToCompare.get) || date.isAfter(startDateToCompare.get)) + val goodEndDate = endDateToCompare.isEmpty || (inclusiveEndDate && date.saneEqual(endDateToCompare.get) || date.isBefore(endDateToCompare.get)) + goodStartDate && goodEndDate + } - def innerListFiles(remainingDirectories: List[HadoopFile]): List[HadoopFile] = { - if (remainingDirectories.isEmpty) { - Nil - } else { - val (dirs, files) = executeDriverList(remainingDirectories.map(_.path)).partition(_.isDir) - files ++ innerListFiles(dirs) + def classifyPath(path: String): Either[String, (String, DateTime)] = + Try(pathDateExtractor.extractFromPath(s"s3n://$bucket/$path")) match { + case Success(date) => Right(path -> date) + case Failure(_) => Left(path) } + + s3ListCommonPrefixes(bucket, prefix, delimiter).map(classifyPath).flatMap { + case Left(prefixWithoutDate) => s3NarrowPaths(bucket, prefixWithoutDate, delimiter, inclusiveStartDate, + startDate, inclusiveEndDate, endDate, ignoreHours) + case Right((prefixWithDate, date)) if isGoodDate(date) => List(s"s3n://$bucket/$prefixWithDate") + case Right(_) => List.empty } - innerListFiles(directories) } - private def s3List(bucket: String, prefix: String, predicate: S3ObjectSummary => Boolean = _ => true) - (implicit s3: AmazonS3Client): List[S3ObjectSummary] = { - def inner(acc: mutable.ArrayBuffer[S3ObjectSummary], listing: ObjectListing): List[S3ObjectSummary] = { - acc ++= listing.getObjectSummaries.toList.filter(predicate) - if (listing.isTruncated) - inner(acc, s3.listNextBatchOfObjects(listing)) - else - acc.toList + private def s3List(path: String, + inclusiveStartDate: Boolean, + startDate: Option[DateTime], + inclusiveEndDate: Boolean, + endDate: Option[DateTime], + exclusionPattern: Option[String]) + (implicit s3: AmazonS3Client, dateExtractor: PathDateExtractor): Stream[S3ObjectSummary] = { + + val s3Pattern = "s3n?://([^/]+)(.+)".r + + def extractBucketAndPrefix(path: String): Option[(String, String)] = path match { + case s3Pattern(bucket, prefix) => Option(bucket -> prefix.dropWhile(_ == '/')) + case _ => None } - inner(new mutable.ArrayBuffer[S3ObjectSummary], s3.listObjects(bucket, prefix)) + extractBucketAndPrefix(path) match { + case Some((pathBucket, pathPrefix)) => + s3NarrowPaths(pathBucket, pathPrefix, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, + startDate = startDate, endDate = endDate).flatMap(extractBucketAndPrefix).flatMap { + case (bucket, prefix) => s3ListObjects(bucket, prefix) + } + case _ => Stream.empty + } } - def s3ListAndFilterFiles(bucket: String, - prefix: String, - start: Option[DateTime] = None, - end: Option[DateTime] = None, - endsWith: Option[String] = None, - exclusionPattern: Option[String] = Option("_$folder$"), - predicate: HadoopFile => Boolean = _ => true) - (implicit s3: AmazonS3Client, pathDateExtractor: PathDateExtractor): List[HadoopFile] = { - - def excludePatternValidation(s3Object: S3ObjectSummary, exclusionPatternOption: Option[String]): Option[S3ObjectSummary] = - exclusionPatternOption match { - case Some(pattern) if s3Object.getKey.contains(pattern) => None - case Some(_) | None => Option(s3Object) + def listAndFilterFiles(path: String, + requireSuccess: Boolean = false, + inclusiveStartDate: Boolean = true, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = true, + endDate: Option[DateTime] = None, + lastN: Option[Int] = None, + ignoreMalformedDates: Boolean = false, + endsWith: Option[String] = None, + exclusionPattern: Option[String] = Option(".*_temporary.*|.*_\\$folder.*"), + predicate: HadoopFile => Boolean = _ => true) + (implicit dateExtractor: PathDateExtractor): List[HadoopFile] = { + + def isSuccessFile(file: HadoopFile): Boolean = + file.path.endsWith("_SUCCESS") || file.path.endsWith("_FINISHED") + + def extractDateFromFile(file: HadoopFile): Option[DateTime] = + Try(dateExtractor.extractFromPath(file.path)).toOption + + def excludePatternValidation(file: HadoopFile): Option[HadoopFile] = + exclusionPattern match { + case Some(pattern) if file.path.matches(pattern) => None + case Some(_) | None => Option(file) } - def endsWithValidation(s3Object: S3ObjectSummary, endsWithOption: Option[String]): Option[S3ObjectSummary] = - endsWithOption match { - case Some(pattern) if s3Object.getKey.endsWith(pattern) => Option(s3Object) + def endsWithValidation(file: HadoopFile): Option[HadoopFile] = + endsWith match { + case Some(pattern) if file.path.endsWith(pattern) => Option(file) + case Some(_) if isSuccessFile(file) => Option(file) case Some(_) => None - case None => Option(s3Object) + case None => Option(file) } - def extractDateFromKey(s3Object: S3ObjectSummary): Option[DateTime] = - Try(pathDateExtractor.extractFromPath(s"s3n://$bucket/${s3Object.getKey}")).toOption + def applyPredicate(file: HadoopFile): Option[HadoopFile] = + if (predicate(file)) Option(file) else None - def startValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, startOption: Option[DateTime]): Option[S3ObjectSummary] = - startOption match { - case Some(startDate) if startDate.isEqualOrBefore(extractedDate) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) + def dateValidation(file: HadoopFile): Option[HadoopFile] = { + val tryDate = extractDateFromFile(file) + if (tryDate.isEmpty && ignoreMalformedDates) + None + else { + val date = tryDate.get + val goodStartDate = startDate.isEmpty || (inclusiveStartDate && date.saneEqual(startDate.get) || date.isAfter(startDate.get)) + val goodEndDate = endDate.isEmpty || (inclusiveEndDate && date.saneEqual(endDate.get) || date.isBefore(endDate.get)) + if (goodStartDate && goodEndDate) Some(file) else None } + } - def endValidation(s3Object: S3ObjectSummary, extractedDate: DateTime, endOption: Option[DateTime]): Option[S3ObjectSummary] = - endOption match { - case Some(endDate) if endDate.isEqualOrAfter(extractedDate) => Option(s3Object) - case Some(_) => None - case None => Option(s3Object) - } + val preValidations: HadoopFile => Boolean = hadoopFile => { + val validatedFile = for { + _ <- excludePatternValidation(hadoopFile) + _ <- endsWithValidation(hadoopFile) + _ <- dateValidation(hadoopFile) + valid <- applyPredicate(hadoopFile) + } yield valid + validatedFile.isDefined + } - def applyPredicate(file: HadoopFile): Option[HadoopFile] = - if (predicate(file)) - Option(file) + val preFilteredFiles = smartList(path, inclusiveStartDate = inclusiveStartDate, inclusiveEndDate = inclusiveEndDate, + startDate = startDate, endDate = endDate, exclusionPattern = exclusionPattern).filter(preValidations) + + val filesByDate = preFilteredFiles.groupBy(extractDateFromFile).collect { + case (Some(date), files) => date -> files + } + + val posFilteredFiles = + if (requireSuccess) + filesByDate.filter { case (_, files) => files.exists(isSuccessFile) } else - None + filesByDate + + val allFiles = if (lastN.isDefined) + posFilteredFiles.toList.sortBy(_._1).reverse.take(lastN.get).flatMap(_._2) + else + posFilteredFiles.toList.flatMap(_._2) + + allFiles.sortBy(_.path) + } + + def smartList(path: String, + inclusiveStartDate: Boolean = false, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = false, + endDate: Option[DateTime] = None, + exclusionPattern: Option[String] = None)(implicit pathDateExtractor: PathDateExtractor): Stream[HadoopFile] = { def toHadoopFile(s3Object: S3ObjectSummary): HadoopFile = HadoopFile(s"s3n://${s3Object.getBucketName}/${s3Object.getKey}", isDir = false, s3Object.getSize) - val allValidations: S3ObjectSummary => Boolean = s3Object => { - val validatedFile = for { - withValidPattern <- excludePatternValidation(s3Object, exclusionPattern) - withValidEndsWith <- endsWithValidation(withValidPattern, endsWith) - extractedDate <- extractDateFromKey(withValidEndsWith) - withValidStart <- startValidation(withValidEndsWith, extractedDate, start) - withValidEnd <- endValidation(withValidStart, extractedDate, end) - hadoopFile = toHadoopFile(withValidEnd) - valid <- applyPredicate(hadoopFile) - } yield valid - validatedFile.isDefined + def listPath(path: String): Stream[HadoopFile] = { + if (path.startsWith("s3")) { + s3List(path, inclusiveStartDate = inclusiveStartDate, startDate = startDate, inclusiveEndDate = inclusiveEndDate, + endDate = endDate, exclusionPattern = exclusionPattern)(amazonS3ClientFromEnvironmentVariables, pathDateExtractor ).map(toHadoopFile) + } else { + driverListFiles(path).toStream + } } - s3List(bucket, prefix, allValidations)(s3).map(toHadoopFile) + HadoopUtils.getPathStrings(path).toStream.flatMap(listPath) } + def filterAndGetParallelTextFiles(path: String, + requireSuccess: Boolean = false, + inclusiveStartDate: Boolean = true, + startDate: Option[DateTime] = None, + inclusiveEndDate: Boolean = true, + endDate: Option[DateTime] = None, + lastN: Option[Int] = None, + ignoreMalformedDates: Boolean = false, + endsWith: Option[String] = None, + predicate: HadoopFile => Boolean = _ => true, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling(), + minimumFiles: Int = 1, + synchLocally: Option[String] = None, + forceSynch: Boolean = false) + (implicit dateExtractor: PathDateExtractor): RDD[String] = { + + val foundFiles = listAndFilterFiles(path, requireSuccess, inclusiveStartDate, startDate, inclusiveEndDate, + endDate, lastN, ignoreMalformedDates, endsWith, predicate = predicate) + + if (foundFiles.size < minimumFiles) + throw new Exception(s"Tried with start/end time equals to $startDate/$endDate for path $path but but the resulting number of files $foundFiles is less than the required") + + parallelReadTextFiles(foundFiles, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, + sizeBasedFileHandling = sizeBasedFileHandling, synchLocally = synchLocally, forceSynch = forceSynch) + } + + private def doSync(hadoopFiles: List[HadoopFile], + synchLocally: String, + forceSynch: Boolean, + maxBytesPerPartition: Long = 256 * 1000 * 1000, + minPartitions: Int = 100, + sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()): RDD[String] = { + require(!synchLocally.contains("*"), "Globs are not supported on the sync key") + + def syncPath(suffix: String) = s"$hdfsPathPrefix/_core_ignition_sync_hdfs_cache/$suffix" + + val hashKey = Integer.toHexString(hadoopFiles.toSet.hashCode()) + + lazy val foundLocalPaths = getStatus(syncPath(s"$synchLocally/$hashKey/{_SUCCESS,_FINISHED}"), removeEmpty = false) + + val cacheKey = syncPath(s"$synchLocally/$hashKey") + + if (forceSynch || foundLocalPaths.isEmpty) { + delete(new Path(syncPath(s"$synchLocally/"))) + val data = parallelReadTextFiles(hadoopFiles, maxBytesPerPartition, minPartitions, synchLocally = None) + data.saveAsTextFile(cacheKey) + } - def s3FilterAndGetParallelTextFiles(bucket: String, - prefix: String, - startDate: Option[DateTime] = None, - endDate: Option[DateTime] = None, - endsWith: Option[String] = None, - predicate: HadoopFile => Boolean = _ => true, - maxBytesPerPartition: Long = 256 * 1000 * 1000, - minPartitions: Int = 100, - sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) - (implicit s3Client: AmazonS3Client = amazonS3ClientFromEnvironmentVariables, - dateExtractor: PathDateExtractor): RDD[String] = { - val foundFiles = s3ListAndFilterFiles(bucket, prefix, startDate, endDate, predicate = predicate)(s3Client, dateExtractor) - parallelReadTextFiles(foundFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling) + sc.textFile(cacheKey) } } From a1d226a8cdf018f0652d06de85ffa11b632531a7 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 8 Dec 2015 09:55:21 -0200 Subject: [PATCH 68/80] merge --- src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 1afbd74f..9a96f78d 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -248,7 +248,7 @@ object SparkContextUtils { maxBytesPerPartition: Long, minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling): RDD[String] = { - val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file ->()), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) + val smallPartitionedFiles = sc.parallelize(smallFiles.map(_.path).map(file => file -> null), 2).partitionBy(createSmallFilesPartitioner(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) val hadoopConf = _hadoopConf smallPartitionedFiles.mapPartitions { files => val conf = hadoopConf.value.foldLeft(new Configuration()) { case (acc, (k, v)) => acc.set(k, v); acc } From f5ad7f29afdd1040d0ca54e94ebf44137dd286f9 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 8 Dec 2015 13:38:16 -0200 Subject: [PATCH 69/80] fix empty file filter --- .../scala/ignition/core/jobs/utils/SparkContextUtils.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index 9a96f78d..bed7e8f0 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -351,7 +351,7 @@ object SparkContextUtils { minPartitions: Int, sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling()) (implicit dateExtractor: PathDateExtractor): RDD[String] = { - val foundFiles = paths.flatMap(smartList(_)).filter(_.size > 0) + val foundFiles = paths.flatMap(smartList(_)) parallelReadTextFiles(foundFiles, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, sizeBasedFileHandling = sizeBasedFileHandling) } @@ -361,11 +361,12 @@ object SparkContextUtils { sizeBasedFileHandling: SizeBasedFileHandling = SizeBasedFileHandling(), synchLocally: Option[String] = None, forceSynch: Boolean = false): RDD[String] = { + val filteredFiles = files.filter(_.size > 0) if (synchLocally.isDefined) - doSync(files, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, synchLocally = synchLocally.get, + doSync(filteredFiles, maxBytesPerPartition = maxBytesPerPartition, minPartitions = minPartitions, synchLocally = synchLocally.get, sizeBasedFileHandling = sizeBasedFileHandling, forceSynch = forceSynch) else { - val (bigFiles, smallFiles) = files.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) + val (bigFiles, smallFiles) = filteredFiles.partition(f => sizeBasedFileHandling.isBig(f, maxBytesPerPartition)) sc.union( readSmallFiles(smallFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling), readBigFiles(bigFiles, maxBytesPerPartition, minPartitions, sizeBasedFileHandling)) From 5587537b7e42136daf6ffcae53a9754c19b55fd2 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Thu, 10 Dec 2015 09:51:40 -0200 Subject: [PATCH 70/80] fix narrow paths for paths without common prefixes (like final folders) --- .../core/jobs/utils/SparkContextUtils.scala | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala index bed7e8f0..4eab7baf 100644 --- a/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/SparkContextUtils.scala @@ -486,12 +486,16 @@ object SparkContextUtils { case Failure(_) => Left(path) } - s3ListCommonPrefixes(bucket, prefix, delimiter).map(classifyPath).flatMap { - case Left(prefixWithoutDate) => s3NarrowPaths(bucket, prefixWithoutDate, delimiter, inclusiveStartDate, - startDate, inclusiveEndDate, endDate, ignoreHours) - case Right((prefixWithDate, date)) if isGoodDate(date) => List(s"s3n://$bucket/$prefixWithDate") - case Right(_) => List.empty - } + val commonPrefixes = s3ListCommonPrefixes(bucket, prefix, delimiter).map(classifyPath) + + if (commonPrefixes.isEmpty) + Stream(s"s3n://$bucket/$prefix") + else + commonPrefixes.toStream.flatMap { + case Left(prefixWithoutDate) => s3NarrowPaths(bucket, prefixWithoutDate, delimiter, inclusiveStartDate, startDate, inclusiveEndDate, endDate, ignoreHours) + case Right((prefixWithDate, date)) if isGoodDate(date) => Stream(s"s3n://$bucket/$prefixWithDate") + case Right(_) => Stream.empty + } } private def s3List(path: String, From b253f29f66b2ba6858c46e681237e5f1f6c1cf1c Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Mon, 21 Dec 2015 13:34:59 -0200 Subject: [PATCH 71/80] Added some new utils --- .../ignition/core/utils/CollectionUtils.scala | 26 +++++++++++++++++++ .../core/utils/CollectionUtilsSpec.scala | 13 ++++++++++ 2 files changed, 39 insertions(+) diff --git a/src/main/scala/ignition/core/utils/CollectionUtils.scala b/src/main/scala/ignition/core/utils/CollectionUtils.scala index eea4755e..f98fb7ec 100644 --- a/src/main/scala/ignition/core/utils/CollectionUtils.scala +++ b/src/main/scala/ignition/core/utils/CollectionUtils.scala @@ -6,7 +6,32 @@ import scalaz.Validation object CollectionUtils { + + + implicit class SeqImprovements[A](xs: Seq[A]) { + def orElseIfEmpty[B >: A](alternative: => Seq[B]): Seq[B] = { + if (xs.nonEmpty) + xs + else + alternative + } + } + implicit class TraversableOnceImprovements[A](xs: TraversableOnce[A]) { + def maxOption(implicit cmp: Ordering[A]): Option[A] = { + if (xs.isEmpty) + None + else + Option(xs.max) + } + + def minOption(implicit cmp: Ordering[A]): Option[A] = { + if (xs.isEmpty) + None + else + Option(xs.min) + } + def maxByOption[B](f: A => B)(implicit cmp: Ordering[B]): Option[A] = { if (xs.isEmpty) None @@ -65,6 +90,7 @@ object CollectionUtils { builder.result } + } implicit class ValidatedIterableLike[T, R, Repr <: IterableLike[Validation[R, T], Repr]](seq: IterableLike[Validation[R, T], Repr]) { diff --git a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala index f01b8a34..548b2423 100644 --- a/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala +++ b/src/test/scala/ignition/core/utils/CollectionUtilsSpec.scala @@ -32,5 +32,18 @@ class CollectionUtilsSpec extends FlatSpec with ShouldMatchers { list.compressBy(_.value) shouldBe List(MyObj("p1", "v1"), MyObj("p1", "v2")) } + it should "provide orElseIfEmpty" in { + Seq.empty[String].orElseIfEmpty(Seq("something")) shouldBe Seq("something") + Seq("not empty").orElseIfEmpty(Seq("something")) shouldBe Seq("not empty") + } + + it should "provide maxOption and minOption" in { + Seq.empty[Int].maxOption shouldBe None + Seq(1, 3, 2).maxOption shouldBe Some(3) + + Seq.empty[Int].minOption shouldBe None + Seq(1, 3, 2).minOption shouldBe Some(1) + } + } From 352ee0b4d584c4d38ef8bf3bd1b4d8320f0adf4a Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Tue, 22 Dec 2015 11:26:40 -0200 Subject: [PATCH 72/80] Minor change --- build.sbt | 2 +- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/build.sbt b/build.sbt index d0e2b029..5de79888 100644 --- a/build.sbt +++ b/build.sbt @@ -4,7 +4,7 @@ version := "1.0" scalaVersion := "2.10.4" -scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code") +scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings", "-Xlint", "-Ywarn-dead-code", "-Xmax-classfile-name", "130") ideaExcludeFolders += ".idea" diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index aa4dcc76..8430d4ef 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -13,9 +13,11 @@ object CoreJobRunner { // Used to provide contextual logging def setLoggingContextValues(config: RunnerConfig): Unit = { - org.slf4j.MDC.put("setupName", config.setupName) - org.slf4j.MDC.put("tag", config.tag) - org.slf4j.MDC.put("user", config.user) + Try { // yes, this may fail but we don't want everything to shut down + org.slf4j.MDC.put("setupName", config.setupName) + org.slf4j.MDC.put("tag", config.tag) + org.slf4j.MDC.put("user", config.user) + } } case class RunnerConfig(setupName: String = "nosetup", From d780ea589d90f4d5683de05a8ca3339ce66a1fd1 Mon Sep 17 00:00:00 2001 From: Allan Oliveira Date: Fri, 15 Jan 2016 14:03:32 -0200 Subject: [PATCH 73/80] Make try work even if the exception is fatall --- src/main/scala/ignition/core/jobs/CoreJobRunner.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala index 8430d4ef..bbede553 100644 --- a/src/main/scala/ignition/core/jobs/CoreJobRunner.scala +++ b/src/main/scala/ignition/core/jobs/CoreJobRunner.scala @@ -13,10 +13,13 @@ object CoreJobRunner { // Used to provide contextual logging def setLoggingContextValues(config: RunnerConfig): Unit = { - Try { // yes, this may fail but we don't want everything to shut down + try { // yes, this may fail but we don't want everything to shut down org.slf4j.MDC.put("setupName", config.setupName) org.slf4j.MDC.put("tag", config.tag) org.slf4j.MDC.put("user", config.user) + } catch { + case e: Throwable => + // cry } } From 400b1f0d9cfdfb54183f744e9a5f5cf3f3a03df9 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Mon, 1 Feb 2016 10:37:44 -0200 Subject: [PATCH 74/80] zeppelin setup --- remote_hook.sh | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/remote_hook.sh b/remote_hook.sh index 48ba9735..5d4bbad1 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -49,6 +49,23 @@ on_trap_exit() { rm -f "${RUNNING_FILE}" } +install_and_run_zeppelin() { + if [[ ! -d "zeppelin" ]]; then + wget "http://www.us.apache.org/dist/incubator/zeppelin/0.5.6-incubating/zeppelin-0.5.6-incubating-bin-all.tgz" -O zeppelin.tar.gz + tar xvzf zeppelin.tar.gz > /tmp/zeppelin_install.log + mv `ls -d zeppelin-*` zeppelin + fi + if [[ -f "zeppelin/bin/zeppelin.sh" ]]; then + export MASTER="${JOB_MASTER}" + export ZEPPELIN_PORT="8081" + export SPARK_HOME="/root/spark" + export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH}" + sudo -E zeppelin/bin/zeppelin.sh + else + notify_error_and_exit "Not found zeppelin installation" + fi +} + trap "on_trap_exit" EXIT @@ -74,10 +91,11 @@ if [[ "${USE_YARN}" == "yes" ]]; then export SPARK_WORKER_MEMORY=${SPARK_MEM_PARAM} fi - if [[ "${JOB_NAME}" == "shell" ]]; then export ADD_JARS=${JAR_PATH} sudo -E ${SPARK_HOME}/bin/spark-shell || notify_error_and_exit "Execution failed for shell" +elif [[ "${JOB_NAME}" == "zeppelin" ]]; then + install_and_run_zeppelin else JOB_OUTPUT="${JOB_CONTROL_DIR}/output.log" tail -F "${JOB_OUTPUT}" & From 333127927fe9581228a12f57f2c8d1a29c474908 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Tue, 2 Feb 2016 09:36:49 -0200 Subject: [PATCH 75/80] pr review --- remote_hook.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 5d4bbad1..7d8ed36e 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -52,8 +52,8 @@ on_trap_exit() { install_and_run_zeppelin() { if [[ ! -d "zeppelin" ]]; then wget "http://www.us.apache.org/dist/incubator/zeppelin/0.5.6-incubating/zeppelin-0.5.6-incubating-bin-all.tgz" -O zeppelin.tar.gz - tar xvzf zeppelin.tar.gz > /tmp/zeppelin_install.log - mv `ls -d zeppelin-*` zeppelin + mkdir zepplin + tar xvzf zeppelin.tar.gz -C zeppelin --strip-components 1 > /tmp/zeppelin_install.log fi if [[ -f "zeppelin/bin/zeppelin.sh" ]]; then export MASTER="${JOB_MASTER}" @@ -62,7 +62,7 @@ install_and_run_zeppelin() { export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH}" sudo -E zeppelin/bin/zeppelin.sh else - notify_error_and_exit "Not found zeppelin installation" + notify_error_and_exit "Zepellin installation not found" fi } From 33aa47e2cde896bfd32feaa2e9726c9cd3475871 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 3 Feb 2016 15:09:53 -0200 Subject: [PATCH 76/80] rdd.filterNot --- src/main/scala/ignition/core/jobs/utils/RDDUtils.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala index 57069bae..60bddc9a 100644 --- a/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala +++ b/src/main/scala/ignition/core/jobs/utils/RDDUtils.scala @@ -57,6 +57,8 @@ object RDDUtils { def incrementCounterIf(cond: (V) => Boolean, acc: spark.Accumulator[Int]): RDD[V] = { rdd.map(x => { if (cond(x)) acc += 1; x }) } + + def filterNot(p: V => Boolean): RDD[V] = rdd.filter(!p(_)) } implicit class PairRDDImprovements[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) { From 93964db2d79c6b84f172712b9ce62eaa9fa44687 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 3 Feb 2016 15:45:24 -0200 Subject: [PATCH 77/80] open a browser for zepplin web ui --- tools/cluster.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/cluster.py b/tools/cluster.py index 2fe6b245..4a81eaa9 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -436,6 +436,9 @@ def job_run(cluster_name, job_name, job_mem, src_local=remote_hook_local, remote_path=with_leading_slash(remote_path)) + if job_name == "zeppelin": + subprocess.Popen(["xdg-open", "http://{master}:8081".format(master=master)]) + log.info('Will run job in remote host') if disable_tmux: ssh_call(user=remote_user, host=master, key_file=key_file, args=[non_tmux_arg], allocate_terminal=False) From 5137e43546660658dfe17beb0ff54c80877f16b1 Mon Sep 17 00:00:00 2001 From: Fernando Luiz Parisotto Date: Wed, 3 Feb 2016 18:10:59 -0200 Subject: [PATCH 78/80] using webbrowser lib --- tools/cluster.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 4a81eaa9..daf03d91 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -23,6 +23,7 @@ import getpass import json import glob +import webbrowser log = logging.getLogger() @@ -437,7 +438,7 @@ def job_run(cluster_name, job_name, job_mem, remote_path=with_leading_slash(remote_path)) if job_name == "zeppelin": - subprocess.Popen(["xdg-open", "http://{master}:8081".format(master=master)]) + webbrowser.open("http://{master}:8081".format(master=master)) log.info('Will run job in remote host') if disable_tmux: From b0c323c3f283f4514a644b222a8c2a07dbb6c52c Mon Sep 17 00:00:00 2001 From: Leandro Date: Mon, 22 Feb 2016 10:54:23 -0300 Subject: [PATCH 79/80] Do not delete the security group by default --- tools/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cluster.py b/tools/cluster.py index 2fe6b245..d9e37533 100755 --- a/tools/cluster.py +++ b/tools/cluster.py @@ -322,7 +322,7 @@ def launch(cluster_name, slaves, raise CommandError('Failed to created cluster {} after failures'.format(cluster_name)) -def destroy(cluster_name, delete_groups=True, region=default_region): +def destroy(cluster_name, delete_groups=False, region=default_region): delete_sg_param = ['--delete-groups'] if delete_groups else [] ec2_script_path = chdir_to_ec2_script_and_get_path() From ce911f6153d238f1db4c74c056c590ad730d636d Mon Sep 17 00:00:00 2001 From: Leandro Date: Wed, 24 Feb 2016 17:10:44 -0300 Subject: [PATCH 80/80] Fixing typo and adding driver heap param --- remote_hook.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/remote_hook.sh b/remote_hook.sh index 7d8ed36e..dd76933a 100755 --- a/remote_hook.sh +++ b/remote_hook.sh @@ -52,14 +52,14 @@ on_trap_exit() { install_and_run_zeppelin() { if [[ ! -d "zeppelin" ]]; then wget "http://www.us.apache.org/dist/incubator/zeppelin/0.5.6-incubating/zeppelin-0.5.6-incubating-bin-all.tgz" -O zeppelin.tar.gz - mkdir zepplin + mkdir zeppelin tar xvzf zeppelin.tar.gz -C zeppelin --strip-components 1 > /tmp/zeppelin_install.log fi if [[ -f "zeppelin/bin/zeppelin.sh" ]]; then export MASTER="${JOB_MASTER}" export ZEPPELIN_PORT="8081" export SPARK_HOME="/root/spark" - export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH}" + export SPARK_SUBMIT_OPTIONS="--jars ${JAR_PATH} --runner-executor-memory ${SPARK_MEM_PARAM}" sudo -E zeppelin/bin/zeppelin.sh else notify_error_and_exit "Zepellin installation not found"