library("quanteda.textmodels")
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "unpackedMatrix" of class "mMatrix"; definition not updated
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "unpackedMatrix" of class "replValueSp"; definition not updated
library("quanteda")
## Package version: 3.2.3
## Unicode version: 14.0
## ICU version: 70.1
## Parallel computing: 10 of 10 threads used.
## See https://quanteda.io for tutorials and examples.
quanteda.textmodels implements fast methods for fitting and predicting Naive Bayes textmodels built especially for sparse document-feature matrices from textual data. It implements two models: multinomial and Bernoulli. (See Manning, Raghavan, and Schütze 2008, Chapter 13.)
Here, we compare the performance of the two models with each other, and then with the performance of two other packages that fit the same models.
For these tests, we will use the dataset of 50,000 movie reviews from Maas et al. (2011). We will use their partition into training and test sets for training and testing our models.
# Large Movie Review Dataset: 50,000 movie reviews (Maas et al. 2011).
# Loading the .rda file creates the corpus object `data_corpus_LMRD`.
load(url("https://www.dropbox.com/s/sjdfmx8ggwfda5o/data_corpus_LMRD.rda?dl=1"))

# Tokenize the corpus and build the document-feature matrix.
dfmat <- tokens(data_corpus_LMRD) %>%
  dfm()

# Partition the dfm on the `set` variable into training and test matrices.
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")
Comparing the performance of fitting the model:
library("microbenchmark")

# Time model fitting on the training dfm for both distributions,
# 20 evaluations per expression.
microbenchmark(
  multi = textmodel_nb(
    dfmat_train,
    dfmat_train$polarity,
    distribution = "multinomial"
  ),
  bern = textmodel_nb(
    dfmat_train,
    dfmat_train$polarity,
    distribution = "Bernoulli"
  ),
  times = 20
)
## Warning in microbenchmark(multi = textmodel_nb(dfmat_train,
## dfmat_train$polarity, : less accurate nanosecond times to avoid potential
## integer overflows
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 62.41414 64.25366 66.52999 65.41474 67.34633 78.29245 20
## bern 71.60478 74.20902 79.91439 76.87525 84.47343 95.97387 20
And for prediction:
# Each timed expression refits the model and then predicts on the test dfm,
# so these timings cover fitting plus prediction rather than prediction alone.
microbenchmark(
  multi = predict(
    textmodel_nb(
      dfmat_train,
      dfmat_train$polarity,
      distribution = "multinomial"
    ),
    newdata = dfmat_test
  ),
  bern = predict(
    textmodel_nb(
      dfmat_train,
      dfmat_train$polarity,
      distribution = "Bernoulli"
    ),
    newdata = dfmat_test
  ),
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 75.72544 77.07598 80.92487 77.65695 83.66464 97.13835 20
## bern 105.35114 106.60486 114.39208 110.25710 122.09777 135.75563 20
Now let’s see how textmodel_nb() compares to equivalent functions from other packages. Multinomial:
library("fastNaiveBayes")
library("naivebayes")
## naivebayes 0.9.7 loaded

# Fit a multinomial Naive Bayes model on the training dfm and predict the
# test dfm with each package, 20 evaluations per expression.
# fastNaiveBayes and naivebayes are given the matrices coerced to "dgCMatrix".
microbenchmark(
  textmodels = {
    tmod <- textmodel_nb(dfmat_train, dfmat_train$polarity,
                         smooth = 1, distribution = "multinomial")
    pred <- predict(tmod, newdata = dfmat_test)
  },
  fastNaiveBayes = {
    tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"),
                            y = dfmat_train$polarity,
                            laplace = 1, sparse = TRUE)
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  naivebayes = {
    tmod <- multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"),
                                    dfmat_train$polarity, laplace = 1)
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max
## textmodels 73.95273 75.67505 77.60979 76.48378 78.78431 90.96727
## fastNaiveBayes 110.19373 111.39462 123.75988 118.87378 126.13449 212.40657
## naivebayes 82.59011 84.51685 92.99198 85.35819 87.62442 189.94287
## neval
## 20
## 20
## 20
And now Bernoulli. Note that although we supply the boolean matrix to textmodel_nb(), this re-weighting from the count matrix would have been performed automatically within the function had we not done so in advance; it is done here only for comparison.
# Convert the count matrices to boolean (feature present/absent) weighting
# ahead of the Bernoulli benchmarks.
dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")
# Fit a Bernoulli Naive Bayes model on the boolean training dfm and predict
# the test set with each package, 20 evaluations per expression.
# fastNaiveBayes and naivebayes are given the matrices coerced to "dgCMatrix".
microbenchmark(
  textmodels = {
    tmod <- textmodel_nb(dfmat_train_bern, dfmat_train$polarity,
                         smooth = 1, distribution = "Bernoulli")
    # NOTE(review): this predicts on the count dfm `dfmat_test`, whereas the
    # other two packages predict on `dfmat_test_bern` -- confirm intentional.
    pred <- predict(tmod, newdata = dfmat_test)
  },
  fastNaiveBayes = {
    tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"),
                          y = dfmat_train$polarity,
                          laplace = 1, sparse = TRUE)
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  naivebayes = {
    tmod <- bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"),
                                  dfmat_train$polarity, laplace = 1)
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## textmodels 104.79825 106.6659 114.4119 113.63574 120.6006 129.0051 20
## fastNaiveBayes 121.86442 124.4619 134.2806 127.64854 138.9119 174.1937 20
## naivebayes 95.26547 97.4695 103.0717 99.67471 109.5496 121.5826 20
Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). “Learning Word Vectors for Sentiment Analysis”. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
Majka M (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7, <URL: https://CRAN.R-project.org/package=naivebayes>. Date: 2020-03-08.
Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.
Skogholt, Martin (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.0. https://github.com/mskogholt/fastNaiveBayes. Date: 2020-02-23.