Santoku is a package for cutting data into intervals. It provides a
replacement for base R’s `cut()` function.
To install santoku, run:
install.packages("santoku")
Use `chop()` like `cut()` to cut your data up:
library(santoku)
x <- runif(10, 0, 10)
(chopped <- chop(x, breaks = 0:10))
#> [1] [4, 5) [8, 9) [3, 4) [4, 5) [7, 8) [9, 10) [6, 7) [8, 9) [1, 2)
#> [10] [4, 5)
#> Levels: [1, 2) [3, 4) [4, 5) [6, 7) [7, 8) [8, 9) [9, 10)
data.frame(x, chopped)
#> x chopped
#> 1 4.978305 [4, 5)
#> 2 8.969989 [8, 9)
#> 3 3.391823 [3, 4)
#> 4 4.676785 [4, 5)
#> 5 7.057042 [7, 8)
#> 6 9.707687 [9, 10)
#> 7 6.713807 [6, 7)
#> 8 8.376589 [8, 9)
#> 9 1.086165 [1, 2)
#> 10 4.495479 [4, 5)
`chop()` returns a factor.
If data lies beyond the limits of `breaks`, the breaks will be
extended automatically:
chopped <- chop(x, breaks = 3:7)
data.frame(x, chopped)
#> x chopped
#> 1 4.978305 [4, 5)
#> 2 8.969989 [7, 9.708]
#> 3 3.391823 [3, 4)
#> 4 4.676785 [4, 5)
#> 5 7.057042 [7, 9.708]
#> 6 9.707687 [7, 9.708]
#> 7 6.713807 [6, 7)
#> 8 8.376589 [7, 9.708]
#> 9 1.086165 [1.086, 3)
#> 10 4.495479 [4, 5)
To chop a single number into a separate category, put the number
twice in `breaks`:
x_fives <- x
x_fives[1:5] <- 5
chopped <- chop(x_fives, c(2, 5, 5, 8))
data.frame(x_fives, chopped)
#> x_fives chopped
#> 1 5.000000 {5}
#> 2 5.000000 {5}
#> 3 5.000000 {5}
#> 4 5.000000 {5}
#> 5 5.000000 {5}
#> 6 9.707687 [8, 9.708]
#> 7 6.713807 (5, 8)
#> 8 8.376589 [8, 9.708]
#> 9 1.086165 [1.086, 2)
#> 10 4.495479 [2, 5)
To quickly produce a table of chopped data, use `tab()`:
tab(1:10, c(2, 5, 8))
#> [1, 2) [2, 5) [5, 8) [8, 10]
#> 1 3 3 3
To chop into fixed-width intervals, starting at the minimum value,
use `chop_width()`:
chopped <- chop_width(x, 2)
data.frame(x, chopped)
#> x chopped
#> 1 4.978305 [3.086, 5.086)
#> 2 8.969989 [7.086, 9.086)
#> 3 3.391823 [3.086, 5.086)
#> 4 4.676785 [3.086, 5.086)
#> 5 7.057042 [5.086, 7.086)
#> 6 9.707687 [9.086, 11.09)
#> 7 6.713807 [5.086, 7.086)
#> 8 8.376589 [7.086, 9.086)
#> 9 1.086165 [1.086, 3.086)
#> 10 4.495479 [3.086, 5.086)
To chop into exactly `intervals` fixed-width intervals, use
`chop_evenly()`:
chopped <- chop_evenly(x, intervals = 3)
data.frame(x, chopped)
#> x chopped
#> 1 4.978305 [3.96, 6.834)
#> 2 8.969989 [6.834, 9.708]
#> 3 3.391823 [1.086, 3.96)
#> 4 4.676785 [3.96, 6.834)
#> 5 7.057042 [6.834, 9.708]
#> 6 9.707687 [6.834, 9.708]
#> 7 6.713807 [3.96, 6.834)
#> 8 8.376589 [6.834, 9.708]
#> 9 1.086165 [1.086, 3.96)
#> 10 4.495479 [3.96, 6.834)
To chop into groups with a fixed number of members, use
`chop_n()`:
chopped <- chop_n(x, 4)
table(chopped)
#> chopped
#> [1.086, 4.978) [4.978, 8.97) [8.97, 9.708]
#> 4 4 2
To chop into a fixed number of equal-sized groups, use
`chop_equally()`:
chopped <- chop_equally(x, groups = 5)
table(chopped)
#> chopped
#> [1.086, 4.275) [4.275, 4.858) [4.858, 6.851) [6.851, 8.495) [8.495, 9.708]
#> 2 2 2 2 2
To chop data up by quantiles, use `chop_quantiles()`:
chopped <- chop_quantiles(x, c(0.25, 0.5, 0.75))
data.frame(x, chopped)
#> x chopped
#> 1 4.978305 [25%, 50%)
#> 2 8.969989 (75%, 100%]
#> 3 3.391823 [0%, 25%)
#> 4 4.676785 [25%, 50%)
#> 5 7.057042 [50%, 75%]
#> 6 9.707687 (75%, 100%]
#> 7 6.713807 [50%, 75%]
#> 8 8.376589 (75%, 100%]
#> 9 1.086165 [0%, 25%)
#> 10 4.495479 [0%, 25%)
To chop data by standard deviations around the mean, use
`chop_mean_sd()`:
chopped <- chop_mean_sd(x)
data.frame(x, chopped)
#> x chopped
#> 1 4.978305 [-1 sd, 0 sd)
#> 2 8.969989 [1 sd, 2 sd)
#> 3 3.391823 [-1 sd, 0 sd)
#> 4 4.676785 [-1 sd, 0 sd)
#> 5 7.057042 [0 sd, 1 sd)
#> 6 9.707687 [1 sd, 2 sd)
#> 7 6.713807 [0 sd, 1 sd)
#> 8 8.376589 [0 sd, 1 sd)
#> 9 1.086165 [-2 sd, -1 sd)
#> 10 4.495479 [-1 sd, 0 sd)
To chop data into attractive intervals, use `chop_pretty()`. This
selects intervals which are a multiple of 2, 5 or 10. It’s useful for
producing bar plots.
chopped <- chop_pretty(x)
data.frame(x, chopped)
#> x chopped
#> 1 4.978305 [4, 6)
#> 2 8.969989 [8, 10)
#> 3 3.391823 [2, 4)
#> 4 4.676785 [4, 6)
#> 5 7.057042 [6, 8)
#> 6 9.707687 [8, 10)
#> 7 6.713807 [6, 8)
#> 8 8.376589 [8, 10)
#> 9 1.086165 [0, 2)
#> 10 4.495479 [4, 6)
`tab_n()`, `tab_width()`, and friends act similarly to `tab()`,
calling the related `chop_*` function and then `table()` on the result.
tab_n(x, 4)
#> [1.086, 4.978) [4.978, 8.97) [8.97, 9.708]
#> 4 4 2
tab_width(x, 2)
#> [1.086, 3.086) [3.086, 5.086) [5.086, 7.086) [7.086, 9.086) [9.086, 11.09)
#> 1 4 2 2 1
tab_evenly(x, 5)
#> [1.086, 2.81) [2.81, 4.535) [4.535, 6.259) [6.259, 7.983) [7.983, 9.708]
#> 1 2 2 2 3
tab_mean_sd(x)
#> [-2 sd, -1 sd) [-1 sd, 0 sd) [0 sd, 1 sd) [1 sd, 2 sd)
#> 1 4 3 2
You can chop dates too:
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
y2k <- as.Date("2000-01-01") + 0:365
months <- chop_width(y2k, months(1))
table(months)
#> months
#> [2000-01-01, 2000-02-01) [2000-02-01, 2000-03-01) [2000-03-01, 2000-04-01)
#> 31 29 31
#> [2000-04-01, 2000-05-01) [2000-05-01, 2000-06-01) [2000-06-01, 2000-07-01)
#> 30 31 30
#> [2000-07-01, 2000-08-01) [2000-08-01, 2000-09-01) [2000-09-01, 2000-10-01)
#> 31 31 30
#> [2000-10-01, 2000-11-01) [2000-11-01, 2000-12-01) [2000-12-01, 2001-01-01)
#> 31 30 31
You can change factor labels with the `labels` argument:
chopped <- chop(x, c(2, 5, 8), labels = c("Lowest", "Low", "Higher", "Highest"))
data.frame(x, chopped)
#> x chopped
#> 1 4.978305 Low
#> 2 8.969989 Highest
#> 3 3.391823 Low
#> 4 4.676785 Low
#> 5 7.057042 Higher
#> 6 9.707687 Highest
#> 7 6.713807 Higher
#> 8 8.376589 Highest
#> 9 1.086165 Lowest
#> 10 4.495479 Low
You need as many labels as there are intervals - one fewer than
`length(breaks)` if your data doesn’t extend beyond `breaks`, one more
than `length(breaks)` if it does.
To label intervals with a dash, use `lbl_dash()`:
chopped <- chop(x, c(2, 5, 8), labels = lbl_dash())
data.frame(x, chopped)
#> x chopped
#> 1 4.978305 2—5
#> 2 8.969989 8—9.708
#> 3 3.391823 2—5
#> 4 4.676785 2—5
#> 5 7.057042 5—8
#> 6 9.707687 8—9.708
#> 7 6.713807 5—8
#> 8 8.376589 8—9.708
#> 9 1.086165 1.086—2
#> 10 4.495479 2—5
To label integer data, use `lbl_discrete()`. It uses more
informative right endpoints:
chopped <- chop(1:10, c(2, 5, 8), labels = lbl_discrete())
chopped2 <- chop(1:10, c(2, 5, 8), labels = lbl_dash())
data.frame(x = 1:10, lbl_discrete = chopped, lbl_dash = chopped2)
#> x lbl_discrete lbl_dash
#> 1 1 1 1—2
#> 2 2 2—4 2—5
#> 3 3 2—4 2—5
#> 4 4 2—4 2—5
#> 5 5 5—7 5—8
#> 6 6 5—7 5—8
#> 7 7 5—7 5—8
#> 8 8 8—10 8—10
#> 9 9 8—10 8—10
#> 10 10 8—10 8—10
You can customize the first or last labels:
chopped <- chop(x, c(2, 5, 8), labels = lbl_dash(first = "< 2", last = "8+"))
data.frame(x, chopped)
#> x chopped
#> 1 4.978305 2—5
#> 2 8.969989 8+
#> 3 3.391823 2—5
#> 4 4.676785 2—5
#> 5 7.057042 5—8
#> 6 9.707687 8+
#> 7 6.713807 5—8
#> 8 8.376589 8+
#> 9 1.086165 < 2
#> 10 4.495479 2—5
To label intervals in order, use `lbl_seq()`:
chopped <- chop(x, c(2, 5, 8), labels = lbl_seq())
data.frame(x, chopped)
#> x chopped
#> 1 4.978305 b
#> 2 8.969989 d
#> 3 3.391823 b
#> 4 4.676785 b
#> 5 7.057042 c
#> 6 9.707687 d
#> 7 6.713807 c
#> 8 8.376589 d
#> 9 1.086165 a
#> 10 4.495479 b
You can use numerals or even roman numerals:
chop(x, c(2, 5, 8), labels = lbl_seq("(1)"))
#> [1] (2) (4) (2) (2) (3) (4) (3) (4) (1) (2)
#> Levels: (1) (2) (3) (4)
chop(x, c(2, 5, 8), labels = lbl_seq("i."))
#> [1] ii. iv. ii. ii. iii. iv. iii. iv. i. ii.
#> Levels: i. ii. iii. iv.
Other labelling functions include:
* `lbl_endpoints()` - use left endpoints as labels
* `lbl_midpoints()` - use interval midpoints as labels
* `lbl_manual()` - specify labels manually
* `lbl_glue()` - specify labels flexibly with the `{glue}` package

By default, `chop()` extends `breaks` if necessary. If you don’t want
that, set `extend = FALSE`:
chopped <- chop(x, c(3, 5, 7), extend = FALSE)
data.frame(x, chopped)
#> x chopped
#> 1 4.978305 [3, 5)
#> 2 8.969989 <NA>
#> 3 3.391823 [3, 5)
#> 4 4.676785 [3, 5)
#> 5 7.057042 <NA>
#> 6 9.707687 <NA>
#> 7 6.713807 [5, 7)
#> 8 8.376589 <NA>
#> 9 1.086165 <NA>
#> 10 4.495479 [3, 5)
Data outside the range of `breaks` will become `NA`.
By default, intervals are closed on the left, i.e. they include their
left endpoints. If you want right-closed intervals, set
`left = FALSE`:
y <- 1:5
data.frame(
  y = y,
  left_closed = chop(y, 1:5),
  right_closed = chop(y, 1:5, left = FALSE)
)
#> y left_closed right_closed
#> 1 1 [1, 2) {1}
#> 2 2 [2, 3) (1, 2]
#> 3 3 [3, 4) (2, 3]
#> 4 4 [4, 5) (3, 4]
#> 5 5 {5} (4, 5]
If you want to close off the last interval, set
`close_end = TRUE`:
data.frame(
  y = y,
  rightmost_open = chop(y, 1:5),
  rightmost_closed = chop(y, 1:5, close_end = TRUE)
)
#> y rightmost_open rightmost_closed
#> 1 1 [1, 2) [1, 2)
#> 2 2 [2, 3) [2, 3)
#> 3 3 [3, 4) [3, 4)
#> 4 4 [4, 5) [4, 5]
#> 5 5 {5} [4, 5]