-
Notifications
You must be signed in to change notification settings - Fork 2
Tutorial: 5. Example Use Cases
To make getting started as easy as possible, you can check out our example use cases.
The first example is a complex mlr benchmark. With traditional benchmark visualization techniques it is a lot of work to create the number of charts integrated in the benchmarkVis package.
library(mlr)
library(benchmarkVis)

# Learners: one mlr learner per classification algorithm we want to compare
lrns <- lapply(
  c("classif.lda", "classif.rpart", "classif.xgboost", "classif.randomForest",
    "classif.naiveBayes", "classif.ksvm", "classif.knn"),
  makeLearner
)

# Resampling strategies
rdescs <- list(
  makeResampleDesc("Subsample"),
  makeResampleDesc("Holdout"),
  makeResampleDesc("CV", iters = 5),
  makeResampleDesc("LOO"),
  makeResampleDesc("Bootstrap")
)

# Tasks
ring.task <- convertMLBenchObjToTask("mlbench.ringnorm", n = 600)
wave.task <- convertMLBenchObjToTask("mlbench.waveform", n = 600)
data(BreastCancer, package = "mlbench") # Load data
df <- BreastCancer
df$Id <- NULL      # Remove the ID column (no predictive value)
df <- na.omit(df)  # Remove rows which contain NAs
# Convert the nine predictor columns to numerics
df[, 1:9] <- lapply(df[, 1:9], as.numeric)
breast.cancer <- makeClassifTask(id = "BreastCancer", data = df, target = "Class")
tasks <- list(iris.task, sonar.task, pid.task, ring.task, wave.task, breast.cancer)

# Measures
meas <- list(mmce, ber, timetrain, acc, kappa, timepredict)

# Create benchmarks (you can just choose one resampling strategy for each
# benchmark) and combine the single benchmarkVis tables into one data table.
# rbindlist() on a list of results avoids growing the table inside a loop.
large.benchmark <- rbindlist(lapply(rdescs, function(rdesc) {
  bmr <- benchmark(lrns, tasks, rdesc, meas, show.info = TRUE)
  # Transform the mlr result into a benchmarkVis compatible data table
  useMlrBenchmarkWrapper(bmr)
}))
# Check the structure of the combined data table
checkStructure(large.benchmark)
The resulting benchmarkVis data table has 18 columns and 210 rows.
For an easy interpretation, the following boxplot could be created:

If you want to play with the result yourself you can load the example by:
data(large.benchmark)
The second example shows how the batchtools package can be combined with microbenchmark to compare and visualize the results of running different sort algorithms with benchmarkVis.
library(batchtools)
library(microbenchmark)
library(data.table)
library(benchmarkVis)

# Create registry
registry <- makeExperimentRegistry()

# Define problem: 100000 random integers drawn from 1..500 with replacement.
# Use TRUE, never T (T is an ordinary variable and can be reassigned).
values <- sample(1:500, 100000, replace = TRUE)
addProblem(name = "values", data = values, seed = 42)

# Define an algorithm for each of the sort algorithms we want to benchmark.
# Each wrapper times 10 runs of sort() with a fixed method. The method name
# is kept as a literal inside microbenchmark() so the expression label in
# the result identifies the algorithm.
shell.wrapper <- function(data, job, instance, ...) {
  microbenchmark(sort(data, method = "shell"), times = 10)
}
addAlgorithm(name = "shell", fun = shell.wrapper)

radix.wrapper <- function(data, job, instance, ...) {
  microbenchmark(sort(data, method = "radix"), times = 10)
}
addAlgorithm(name = "radix", fun = radix.wrapper)

quick.wrapper <- function(data, job, instance, ...) {
  microbenchmark(sort(data, method = "quick"), times = 10)
}
addAlgorithm(name = "quick", fun = quick.wrapper)

# Add experiments and run jobs
addExperiments()
submitJobs()
waitForJobs()

# Convert each job result to a benchmarkVis compatible data table and bind
# them in one step (rbindlist() instead of Reduce(rbind, ...) avoids
# repeated copying).
convert.to.benchmark <- lapply(reduceResultsDataTable()$result, useMicrobenchmarkWrapper)
sort.benchmark <- rbindlist(convert.to.benchmark)
# Check the structure
checkStructure(sort.benchmark)
Now you can use sort.benchmark as input for the shiny application (you only have to export it to a file) or just use the benchmarkVis functions on it directly from your console.
With this demonstration we want to show how our package can easily be combined with tools outside of the R language. For this example we will use the Python packages Penn Machine Learning Benchmarks and scikit-learn.
Python code (example from pmlb):
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from pmlb import fetch_data, classification_dataset_names
import pandas as pd
import csv

# Number of PMLB classification datasets to benchmark. Defining it once keeps
# the column lengths in the data frame below consistent by construction.
N_DATASETS = 12
datasets = classification_dataset_names[:N_DATASETS]

logit_test_scores = []  # test-set accuracy of logistic regression, per dataset
gnb_test_scores = []    # test-set accuracy of Gaussian naive Bayes, per dataset
data_size = []          # number of samples, per dataset

# Classify multiple problem datasets
for classification_dataset in datasets:
    X, y = fetch_data(classification_dataset, return_X_y=True)
    train_X, test_X, train_y, test_y = train_test_split(X, y)

    # Our algorithms
    logit = LogisticRegression()
    gnb = GaussianNB()
    logit.fit(train_X, train_y)
    gnb.fit(train_X, train_y)

    # Save results (score() yields accuracy on the held-out test split)
    logit_test_scores.append(logit.score(test_X, test_y))
    gnb_test_scores.append(gnb.score(test_X, test_y))
    data_size.append(len(X))

# Create dataframe in the benchmarkVis layout: one row per (problem,
# algorithm) pair. problem.parameter holds an R list literal as a string.
problem_parameter = ["list(size = %s)" % size for size in data_size]
df = pd.DataFrame(data={
    "problem": datasets * 2,
    "problem.parameter": problem_parameter * 2,
    "algorithm": ["logisticRegression"] * N_DATASETS + ["GaussianNB"] * N_DATASETS,
    "measure.accuracy": logit_test_scores + gnb_test_scores,
})
# Save CSV
df.to_csv("PATH/pmlb.csv", index = False, quoting = csv.QUOTE_NONNUMERIC)
All you have to do now is to load the created CSV file into the shiny application, or import it with the following command from R:
# Load the benchmarkVis package so csvImport() is available
library(benchmarkVis)
csvImport("PATH/pmlb.csv")
As you can see, the data structure is very flexible and can be used from all kinds of programming languages.