Current server performance

RDD lambda transform performance for JavaScript (EclairJS), Java, and Python.

RDD Lambda Performance

Dataset size (bytes)    JavaScript    Java         Python
2,580,392               691 ms        671 ms       8,486 ms
620,204,630             33,531 ms     10,405 ms    73,394 ms

Dataset used

To compare the performance of EclairJS with Java and Python, we used two versions of the MovieLens movie-rating data (ratings.csv): a small dataset and a large one. The files were obtained from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip and http://files.grouplens.org/datasets/movielens/ml-latest.zip.
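
If you want to reproduce the setup, the archives can be fetched and unpacked with a few lines of Python. This is a minimal sketch using only the standard library; the target directory examples/data/mllib is an assumption based on the paths in the code below.

import urllib.request
import zipfile

# MovieLens archives listed above (small and complete versions)
urls = [
    "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip",
    "http://files.grouplens.org/datasets/movielens/ml-latest.zip",
]

for url in urls:
    # download to a temporary file, then extract next to the example data
    zip_path, _ = urllib.request.urlretrieve(url)
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall("examples/data/mllib")  # assumed target directory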

Environment

  • Spark 2.0 (local[*])
  • macOS 10.12, 2.8 GHz Intel Core i7, 16 GB 1600 MHz DDR3
  • Java version 1.8.0_101

The log4j configuration used for testing (typically placed in Spark's conf/log4j.properties):

# Set everything to be logged to the console
log4j.rootCategory=WARN, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark-project.jetty=WARN
log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=WARN
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=WARN

# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR

log4j.logger.org.eclairjs.nashorn=ERROR

The Python code

from pyspark import SparkContext
import time

# Start the timer; note that this is before the SparkContext is created,
# so context startup time is included in the measurement.
tic = time.time()

# Note: the master here is "local" (a single worker thread).
sc = SparkContext("local", "Simple App")
small_movies_file = '/Users/billreed/eclairjs_dev/eclairjs-nashorn/examples/data/mllib/ml-latest/ratings.csv'

small_movies_raw_data = sc.textFile(small_movies_file)
# The first line of the CSV is the header row.
small_movies_raw_data_header = small_movies_raw_data.take(1)[0]

# Filter out the header, then keep (userId, movieId) pairs.
small_movies_data = small_movies_raw_data.filter(lambda line: line != small_movies_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0], tokens[1])).cache()

print('There are {0} recommendations in the complete dataset'.format(small_movies_data.count()))
toc = time.time()
print('Execution time: {0} milliseconds'.format((toc - tic) * 1000))

To submit the application, we used spark-submit.
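
The exact command line is not recorded on this page; a minimal sketch, assuming the script above is saved as ratings_test.py (a hypothetical name), would be:

# ratings_test.py is a hypothetical file name for the script above
spark-submit ratings_test.py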

The Java code

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.recommendation.Rating;
import java.util.Date;

public class LargeDataset {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("movie recommender").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // Start the timer after the JavaSparkContext has been created.
        long start = new Date().getTime();
        JavaRDD<String> complete_ratings_raw_data = jsc.textFile("examples/data/mllib/ml-latest/ratings.csv");
        //JavaRDD<String> complete_ratings_raw_data = jsc.textFile("examples/data/mllib/ml-latest-small/ratings.csv");
        // The first line of the CSV is the header row.
        String complete_ratings_raw_data_header = complete_ratings_raw_data.take(1).get(0);
        JavaRDD<Rating> complete_ratings_data = complete_ratings_raw_data.filter(new Function<String, Boolean>() {
            public Boolean call(String line) {
                // Filter out the header row.
                return !line.equals(complete_ratings_raw_data_header);
            }
        })
        .map(new Function<String, Rating>() {
            public Rating call(String line) {
                // Parse "userId,movieId,rating,timestamp" into a Rating.
                String[] fields = line.split(",");
                int userId = Integer.parseInt(fields[0]);
                int movieId = Integer.parseInt(fields[1]);
                double rating = Double.parseDouble(fields[2]);
                return new Rating(userId, movieId, rating);
            }
        }).cache();

        System.out.println("There are " + complete_ratings_data.count() + " recommendations in the complete dataset");
        long end = new Date().getTime();
        long time = end - start;
        System.out.println("Execution time: " + time + " milliseconds");
    }
}

To submit the application, we used spark-submit.
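
Again, the exact command is not recorded here; a minimal sketch, assuming the class above is packaged into a hypothetical ratings-test.jar, would be:

# ratings-test.jar is a hypothetical jar containing the LargeDataset class
spark-submit --class LargeDataset ratings-test.jar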

The JavaScript code

function run(sc) {

    var Tuple2 = require('eclairjs/Tuple2');

    // Use the complete dataset; the small-dataset alternative is commented out below.
    var complete_ratings_raw_data = sc.textFile("examples/data/mllib/ml-latest/ratings.csv");
    //var complete_ratings_raw_data = sc.textFile("examples/data/mllib/ml-latest-small/ratings.csv");

    // The first line of the CSV is the header row. take() runs a Spark job,
    // so the file is read once before the timer starts.
    var complete_ratings_raw_data_header = complete_ratings_raw_data.take(1)[0];
    var start = new Date().getTime();

    var complete_ratings_data = complete_ratings_raw_data
        .filter(function(line) {
            // Filter out the header row. The header string is hardcoded rather
            // than referencing complete_ratings_raw_data_header because the
            // lambda is serialized and evaluated on the workers, where that
            // local variable is not in scope.
            return line != "userId,movieId,rating,timestamp";
        })
        .map(function(line, Tuple2) {
            // Keep (userId, movieId) pairs; Tuple2 is made available to the
            // serialized lambda through the bind-arguments array.
            var tokens = line.split(",");
            return new Tuple2(tokens[0], tokens[1]);
        }, [Tuple2]).cache();

    print("There are " + complete_ratings_data.count() + " recommendations in the complete dataset");

    var end = new Date().getTime();
    var time = end - start;
    print('Execution time: ' + time + " milliseconds");

}

/*
 check if SparkContext is defined, if it is we are being run from Unit Test
 */

if (typeof sparkContext === 'undefined') {
    var SparkConf = require('eclairjs/SparkConf');
    var SparkContext = require('eclairjs/SparkContext');
    var sparkConf = new SparkConf().setAppName("JavaScript Dataset test");
    var sc = new SparkContext(sparkConf);
    run(sc);

    sc.stop();
}

To submit the application, we used eclairjs.sh.
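
The exact command is not recorded here either; based on how the EclairJS examples are run, an invocation of the following form is a reasonable sketch (the script name is hypothetical):

# ratings_test.js is a hypothetical file name for the script above
bin/eclairjs.sh ratings_test.js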
