mlr3oml

mlr3oml allows you to create mlr3 tasks directly from OpenML data sets. Furthermore, you can also obtain the data and the resampling for a given OpenML task. Caching can be enabled by setting the option "mlr3oml.cache". Uploading to OpenML is currently not supported; use the OpenML package for this.

Short Demo

library("mlr3")
library("mlr3oml")

# be less verbose
lgr::get_logger("mlr3oml")$set_threshold("warn")

# retrieve data set as task from OML
tsk("oml", data_id = 31)

## <TaskClassif:credit-g> (1000 x 21)
## * Target: class
## * Properties: twoclass
## * Features (20):
##   - fct (13): checking_status, credit_history, employment,
##     foreign_worker, housing, job, other_parties, other_payment_plans,
##     own_telephone, personal_status, property_magnitude, purpose,
##     savings_status
##   - int (7): age, credit_amount, duration, existing_credits,
##     installment_commitment, num_dependents, residence_since

# retrieve a regular task from OML
tsk("oml", task_id = 59)

## <TaskClassif:Task 59: iris (Supervised Classification)> (150 x 5)
## * Target: class
## * Properties: multiclass
## * Features (4):
##   - dbl (4): petallength, petalwidth, sepallength, sepalwidth

# retrieve resampling from OML
rsmp("oml", task_id = 59)

## <ResamplingCustom> with 10 iterations
## * Instantiated: TRUE
## * Parameters: list()

# R6 class for data sets
oml_data = OMLData$new(61)
oml_data$name

## [1] "iris"

oml_data$nrow

## [1] 150

oml_data$ncol

## [1] 5

oml_data$data

##      sepallength sepalwidth petallength petalwidth          class
##   1:         5.1        3.5         1.4        0.2    Iris-setosa
##   2:         4.9        3.0         1.4        0.2    Iris-setosa
##   3:         4.7        3.2         1.3        0.2    Iris-setosa
##   4:         4.6        3.1         1.5        0.2    Iris-setosa
##   5:         5.0        3.6         1.4        0.2    Iris-setosa
##  ---                                                             
## 146:         6.7        3.0         5.2        2.3 Iris-virginica
## 147:         6.3        2.5         5.0        1.9 Iris-virginica
## 148:         6.5        3.0         5.2        2.0 Iris-virginica
## 149:         6.2        3.4         5.4        2.3 Iris-virginica
## 150:         5.9        3.0         5.1        1.8 Iris-virginica

# R6 class for tasks
oml_task = OMLTask$new(31)
oml_task$name

## [1] "Task 31: credit-g (Supervised Classification)"

oml_task$nrow

## [1] 1000

oml_task$ncol

## [1] 21

oml_task$task

## <TaskClassif:Task 31: credit-g (Supervised Classification)> (1000 x 21)
## * Target: class
## * Properties: twoclass
## * Features (20):
##   - fct (13): checking_status, credit_history, employment,
##     foreign_worker, housing, job, other_parties, other_payment_plans,
##     own_telephone, personal_status, property_magnitude, purpose,
##     savings_status
##   - int (7): age, credit_amount, duration, existing_credits,
##     installment_commitment, num_dependents, residence_since

oml_task$resampling

## <ResamplingCustom> with 10 iterations
## * Instantiated: TRUE
## * Parameters: list()

# list oml data sets with 5 features and 50 - 200 instances
tab = list_oml_data_sets(number_features = 5, number_instances = c(50, 200))
head(tab[, .(data_id, name)])

##    data_id               name
## 1:      61               iris
## 2:     199           fruitfly
## 3:     214           baskball
## 4:     329         hayes-roth
## 5:     346               aids
## 6:     668 witmer_census_1980

# list first 10 oml tasks
tab = list_oml_tasks(limit = 10)
tab[, .(task_id, data_id, name)]

##     task_id data_id            name
##  1:       2       2          anneal
##  2:       3       3        kr-vs-kp
##  3:       4       4           labor
##  4:       5       5      arrhythmia
##  5:       6       6          letter
##  6:       7       7       audiology
##  7:       8       8 liver-disorders
##  8:       9       9           autos
##  9:      10      10           lymph
## 10:      11      11   balance-scale