Server current performance

| Dataset size (bytes) | JavaScript (ms) | Java (ms) | Python (ms) |
|---|---|---|---|
| 2,580,392 | 691 | 671 | 8,486 |
| 620,204,630 | 33,531 | 10,405 | 73,394 |
To compare the performance of EclairJS with Java and Python, we used two versions of the MovieLens movie rating data (ratings.csv): a small dataset and a large dataset. The files were obtained from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip and http://files.grouplens.org/datasets/movielens/ml-latest.zip.
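For reference, the datasets can be fetched and unpacked like this (a sketch, assuming `curl` and `unzip` are available; the unzip target matches the paths used by the applications below):

```bash
# Download the small and full MovieLens datasets
curl -O http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
curl -O http://files.grouplens.org/datasets/movielens/ml-latest.zip

# Unpack them where the applications below expect them
unzip ml-latest-small.zip -d examples/data/mllib
unzip ml-latest.zip -d examples/data/mllib
```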
Test environment:

- Spark 2.0 (local[*])
- OSX 10.12, 2.8 GHz Intel Core i7, 16 GB 1600 MHz DDR3
- Java version 1.8.0_101
The following `log4j.properties` settings were used to limit console logging during the runs:

```properties
# Set everything to be logged to the console
log4j.rootCategory=WARN, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark-project.jetty=WARN
log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=WARN
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=WARN

# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
log4j.logger.org.eclairjs.nashorn=ERROR
```
The Python application:

```python
from pyspark import SparkContext
import time

tic = time.time()
sc = SparkContext("local", "Simple App")

small_movies_file = '/Users/billreed/eclairjs_dev/eclairjs-nashorn/examples/data/mllib/ml-latest/ratings.csv'
small_movies_raw_data = sc.textFile(small_movies_file)
small_movies_raw_data_header = small_movies_raw_data.take(1)[0]

# Drop the header row, then keep (userId, movieId) pairs
small_movies_data = small_movies_raw_data.filter(lambda line: line != small_movies_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0], tokens[1])).cache()

print('There are {0} recommendations in the complete dataset'.format(small_movies_data.count()))
toc = time.time()
print('Execution time: {0} milliseconds'.format((toc - tic) * 1000))
```
To submit the Python application we used `spark-submit`.
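A minimal invocation might look like this (the script file name is illustrative, not part of the original page):

```bash
# Run the Python application above with spark-submit
spark-submit ratings_count.py
```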
The Java application:

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.recommendation.Rating;

import java.util.Date;

public class LargeDataset {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("movie recommender").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        long start = new Date().getTime();

        JavaRDD<String> complete_ratings_raw_data = jsc.textFile("examples/data/mllib/ml-latest/ratings.csv");
        //JavaRDD<String> complete_ratings_raw_data = jsc.textFile("examples/data/mllib/ml-latest-small/ratings.csv");
        String complete_ratings_raw_data_header = complete_ratings_raw_data.take(1).get(0);

        JavaRDD<Rating> complete_ratings_data = complete_ratings_raw_data
            .filter(new Function<String, Boolean>() {
                public Boolean call(String line) {
                    // drop the header row
                    return !line.equals(complete_ratings_raw_data_header);
                }
            })
            .map(new Function<String, Rating>() {
                public Rating call(String line) {
                    String[] fields = line.split(",");
                    int userId = Integer.parseInt(fields[0]);
                    int movieId = Integer.parseInt(fields[1]);
                    double rating = Double.parseDouble(fields[2]);
                    return new Rating(userId, movieId, rating);
                }
            }).cache();

        System.out.println("There are " + complete_ratings_data.count() + " recommendations in the complete dataset");

        long end = new Date().getTime();
        long time = end - start;
        System.out.println("Execution time: " + time + " milliseconds");
    }
}
```
To submit the Java application we used `spark-submit`.
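A minimal invocation might look like this (the jar name is illustrative):

```bash
# Run the compiled Java application with spark-submit
spark-submit --class LargeDataset large-dataset.jar
```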
The JavaScript (EclairJS) application:

```javascript
function run(sc) {
    var Tuple2 = require('eclairjs/Tuple2');

    /*
     In order to build our recommender model, we will use the complete dataset.
     */
    var complete_ratings_raw_data = sc.textFile("examples/data/mllib/ml-latest/ratings.csv");
    //var complete_ratings_raw_data = sc.textFile("examples/data/mllib/ml-latest-small/ratings.csv");
    var complete_ratings_raw_data_header = complete_ratings_raw_data.take(1)[0];
    var start = new Date().getTime();

    var complete_ratings_data = complete_ratings_raw_data
        .filter(function(line) {
            // filter out the header row
            return line != "userId,movieId,rating,timestamp"; // complete_ratings_raw_data_header
        })
        .map(function(line, Tuple2) {
            var tokens = line.split(",");
            return new Tuple2(tokens[0], tokens[1]);
        }, [Tuple2]).cache();

    print("There are " + complete_ratings_data.count() + " recommendations in the complete dataset");

    var end = new Date().getTime();
    var time = end - start;
    print('Execution time: ' + time + " milliseconds");
}

/*
 Check if sparkContext is defined; if it is, we are being run from a unit test.
 */
if (typeof sparkContext === 'undefined') {
    var SparkConf = require('eclairjs/SparkConf');
    var SparkContext = require('eclairjs/SparkContext');
    var sparkConf = new SparkConf().setAppName("JavaScript Dataset test");
    var sc = new SparkContext(sparkConf);
    var result = run(sc);
    sc.stop();
}
```
To submit the JavaScript application we used `eclairjs.sh`.
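A minimal invocation might look like this (assuming `eclairjs.sh` is on the PATH; the script file name is illustrative):

```bash
# Run the JavaScript application with the EclairJS launcher
eclairjs.sh ratings_count.js
```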