Example 2: Join tables

This post has referred to a vignette from dplyr, you can find it in https://dplyr.tidyverse.org/articles/two-table.html. We’ll try to display how to join data tables in this vignette. First, load the packages we need and get some data.

library(tidyfst)
library(nycflights13)

flights2 <- flights %>% 
  select_dt(year,month,day, hour, origin, dest, tailnum, carrier)

Do a left join with a simple:

flights2 %>% 
  left_join_dt(airlines)
#> Joining by: carrier
#> Key: <carrier>
#>         carrier  year month   day  hour origin   dest tailnum
#>          <char> <int> <int> <int> <num> <char> <char>  <char>
#>      1:      9E  2013     1     1     8    JFK    MSP  N915XJ
#>      2:      9E  2013     1     1    15    JFK    IAD  N8444F
#>      3:      9E  2013     1     1    14    JFK    BUF  N920XJ
#>      4:      9E  2013     1     1    15    JFK    SYR  N8409N
#>      5:      9E  2013     1     1    15    JFK    ROC  N8631E
#>     ---                                                      
#> 336772:      YV  2013     9    29    16    LGA    IAD  N518LR
#> 336773:      YV  2013     9    29    17    LGA    CLT  N932LR
#> 336774:      YV  2013     9    30    16    LGA    IAD  N510MJ
#> 336775:      YV  2013     9    30    17    LGA    CLT  N905FJ
#> 336776:      YV  2013     9    30    20    LGA    CLT  N924FJ
#> 1 variable not shown: [name <char>]

Controlling how the tables are matched

Join works the same as dplyr:

flights2 %>% left_join_dt(weather)
#> Joining by: year,month,day,hour,origin
#> Key: <year, month, day, hour, origin>
#>          year month   day  hour origin   dest tailnum carrier  temp  dewp
#>         <int> <int> <int> <int> <char> <char>  <char>  <char> <num> <num>
#>      1:  2013     1     1     5    EWR    IAH  N14228      UA 39.02 28.04
#>      2:  2013     1     1     5    EWR    ORD  N39463      UA 39.02 28.04
#>      3:  2013     1     1     5    JFK    MIA  N619AA      AA 39.02 26.96
#>      4:  2013     1     1     5    JFK    BQN  N804JB      B6 39.02 26.96
#>      5:  2013     1     1     5    JFK    BOS  N708JB      B6 39.02 26.96
#>     ---                                                                  
#> 336772:  2013    12    31    23    EWR    SJU  N651JB      B6    NA    NA
#> 336773:  2013    12    31    23    JFK    BQN  N566JB      B6    NA    NA
#> 336774:  2013    12    31    23    JFK    SJU  N713TW      DL    NA    NA
#> 336775:  2013    12    31    23    JFK    SJU  N509JB      B6    NA    NA
#> 336776:  2013    12    31    23    JFK    PSE  N665JB      B6    NA    NA
#> 8 variables not shown: [humid <num>, wind_dir <num>, wind_speed <num>, wind_gust <num>, precip <num>, pressure <num>, visib <num>, time_hour <POSc>]
flights2 %>% left_join_dt(planes, by = "tailnum")
#> Key: <tailnum>
#>         tailnum year.x month   day  hour origin   dest carrier year.y   type
#>          <char>  <int> <int> <int> <num> <char> <char>  <char>  <int> <char>
#>      1:    <NA>   2013     1     2    15    JFK    LAX      AA     NA   <NA>
#>      2:    <NA>   2013     1     2    16    EWR    ORD      UA     NA   <NA>
#>      3:    <NA>   2013     1     3     8    EWR    MIA      UA     NA   <NA>
#>      4:    <NA>   2013     1     3     6    EWR    DFW      UA     NA   <NA>
#>      5:    <NA>   2013     1     4     8    JFK    DCA      9E     NA   <NA>
#>     ---                                                                     
#> 336772:  N9EAMQ   2013     9    27    16    LGA    ATL      MQ     NA   <NA>
#> 336773:  N9EAMQ   2013     9    29    12    LGA    BNA      MQ     NA   <NA>
#> 336774:  N9EAMQ   2013     9    29    18    LGA    CMH      MQ     NA   <NA>
#> 336775:  N9EAMQ   2013     9    30    11    JFK    DCA      MQ     NA   <NA>
#> 336776:  N9EAMQ   2013     9    30    14    JFK    TPA      MQ     NA   <NA>
#> 6 variables not shown: [manufacturer <char>, model <char>, engines <int>, seats <int>, speed <int>, engine <char>]
flights2 %>% left_join_dt(airports, c("dest" = "faa"))
#> Key: <dest>
#>           dest  year month   day  hour origin tailnum carrier
#>         <char> <int> <int> <int> <num> <char>  <char>  <char>
#>      1:    ABQ  2013    10     1    20    JFK  N554JB      B6
#>      2:    ABQ  2013    10     2    20    JFK  N607JB      B6
#>      3:    ABQ  2013    10     3    20    JFK  N591JB      B6
#>      4:    ABQ  2013    10     4    20    JFK  N662JB      B6
#>      5:    ABQ  2013    10     5    19    JFK  N580JB      B6
#>     ---                                                      
#> 336772:    XNA  2013     9    29    17    LGA  N725MQ      MQ
#> 336773:    XNA  2013     9    30     7    LGA  N735MQ      MQ
#> 336774:    XNA  2013     9    30     8    EWR  N14117      EV
#> 336775:    XNA  2013     9    30    15    LGA  N725MQ      MQ
#> 336776:    XNA  2013     9    30    17    LGA  N720MQ      MQ
#> 7 variables not shown: [name <char>, lat <num>, lon <num>, alt <num>, tz <num>, dst <char>, tzone <char>]
flights2 %>% left_join_dt(airports, c("origin" = "faa"))
#> Key: <origin>
#>         origin  year month   day  hour   dest tailnum carrier
#>         <char> <int> <int> <int> <num> <char>  <char>  <char>
#>      1:    EWR  2013     1     1     5    IAH  N14228      UA
#>      2:    EWR  2013     1     1     5    ORD  N39463      UA
#>      3:    EWR  2013     1     1     6    FLL  N516JB      B6
#>      4:    EWR  2013     1     1     6    SFO  N53441      UA
#>      5:    EWR  2013     1     1     6    LAS  N76515      UA
#>     ---                                                      
#> 336772:    LGA  2013     9    30    18    BNA  N740EV      EV
#> 336773:    LGA  2013     9    30    22    SYR    <NA>      9E
#> 336774:    LGA  2013     9    30    12    BNA  N535MQ      MQ
#> 336775:    LGA  2013     9    30    11    CLE  N511MQ      MQ
#> 336776:    LGA  2013     9    30     8    RDU  N839MQ      MQ
#> 7 variables not shown: [name <char>, lat <num>, lon <num>, alt <num>, tz <num>, dst <char>, tzone <char>]

Types of join

df1 <- data.table(x = c(1, 2), y = 2:1)
df2 <- data.table(x = c(1, 3), a = 10, b = "a")

df1 %>% inner_join_dt(df2) 
#> Joining by: x
#> Key: <x>
#>        x     y     a      b
#>    <num> <int> <num> <char>
#> 1:     1     2    10      a
df1 %>% left_join_dt(df2)
#> Joining by: x
#> Key: <x>
#>        x     y     a      b
#>    <num> <int> <num> <char>
#> 1:     1     2    10      a
#> 2:     2     1    NA   <NA>
df1 %>% right_join_dt(df2)
#> Joining by: x
#> Key: <x>
#>        x     y     a      b
#>    <num> <int> <num> <char>
#> 1:     1     2    10      a
#> 2:     3    NA    10      a
df1 %>% full_join_dt(df2)
#> Joining by: x
#> Key: <x>
#>        x     y     a      b
#>    <num> <int> <num> <char>
#> 1:     1     2    10      a
#> 2:     2     1    NA   <NA>
#> 3:     3    NA    10      a

If all you have is a data.frame or tibble, you have no need to change the format. Feed the data directly:

df1 <- data.frame(x = c(1, 1, 2), y = 1:3)
df2 <- data.frame(x = c(1, 1, 2), z = c("a", "b", "a"))

df1 %>% left_join_dt(df2)
#> Joining by: x
#> Key: <x>
#>        x     y      z
#>    <num> <int> <char>
#> 1:     1     1      a
#> 2:     1     1      b
#> 3:     1     2      a
#> 4:     1     2      b
#> 5:     2     3      a

The “_dt” suffix should remind you that this is backed up by data.table and will always return a data.table in the end.

Filtering joins

Filtering joins have also been supported in tidyfst.

flights %>% 
  anti_join_dt(planes, by = "tailnum") %>% 
  count_dt(tailnum, sort = TRUE)
#>      tailnum     n
#>       <char> <int>
#>   1:    <NA>  2512
#>   2:  N725MQ   575
#>   3:  N722MQ   513
#>   4:  N723MQ   507
#>   5:  N713MQ   483
#>  ---              
#> 718:  N7BKAA     1
#> 719:  N7CAAA     1
#> 720:  N5FCAA     1
#> 721:  N5ERAA     1
#> 722:  N647MQ     1

Other examples (semi_join_dt() and anti_join_dt() never duplicate; they only ever remove observations.):

df1 <- data.frame(x = c(1, 1, 3, 4), y = 1:4)
df2 <- data.frame(x = c(1, 1, 2), z = c("a", "b", "a"))

# Four rows to start with:
df1 %>% nrow()
#> [1] 4

# And we get four rows after the join
df1 %>% inner_join_dt(df2, by = "x") %>% nrow()
#> [1] 4

# But only two rows actually match
df1 %>% semi_join_dt(df2, by = "x") %>% nrow()
#> [1] 2

Set operations

For set operations, wrap data.table’s function directly, but the functions will automatically turn any data.frame into data.table. Examples are listed as below:

x = iris[c(2,3,3,4),]
x2 = iris[2:4,]
y = iris[c(3:5),]

intersect_dt(x, y)            # intersect
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>           <num>       <num>        <num>       <num>  <fctr>
#> 1:          4.7         3.2          1.3         0.2  setosa
#> 2:          4.6         3.1          1.5         0.2  setosa
intersect_dt(x, y, all=TRUE)  # intersect all
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>           <num>       <num>        <num>       <num>  <fctr>
#> 1:          4.7         3.2          1.3         0.2  setosa
#> 2:          4.6         3.1          1.5         0.2  setosa
setdiff_dt(x, y)              # except
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>           <num>       <num>        <num>       <num>  <fctr>
#> 1:          4.9           3          1.4         0.2  setosa
setdiff_dt(x, y, all=TRUE)    # except all
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>           <num>       <num>        <num>       <num>  <fctr>
#> 1:          4.9         3.0          1.4         0.2  setosa
#> 2:          4.7         3.2          1.3         0.2  setosa
union_dt(x, y)                # union
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>           <num>       <num>        <num>       <num>  <fctr>
#> 1:          4.9         3.0          1.4         0.2  setosa
#> 2:          4.7         3.2          1.3         0.2  setosa
#> 3:          4.6         3.1          1.5         0.2  setosa
#> 4:          5.0         3.6          1.4         0.2  setosa
union_dt(x, y, all=TRUE)      # union all
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>           <num>       <num>        <num>       <num>  <fctr>
#> 1:          4.9         3.0          1.4         0.2  setosa
#> 2:          4.7         3.2          1.3         0.2  setosa
#> 3:          4.7         3.2          1.3         0.2  setosa
#> 4:          4.6         3.1          1.5         0.2  setosa
#> 5:          4.7         3.2          1.3         0.2  setosa
#> 6:          4.6         3.1          1.5         0.2  setosa
#> 7:          5.0         3.6          1.4         0.2  setosa
setequal_dt(x, x2, all=FALSE) # setequal
#> [1] TRUE
setequal_dt(x, x2)     
#> [1] FALSE

For more details, just find the help from data.table using ?setops.