README

Record linkage and distinguishing between index, duplicate and recurrent events are common tasks in epidemiological analyses and other fields of research, particularly as part of a case definition. Implementing these tasks in R can be complex and challenging. The diyar package provides a convenient and flexible way of performing them.

Installation

# Install the latest CRAN release 
install.packages("diyar")

# Or, install the development version from GitHub
install.packages("devtools")
devtools::install_github("OlisaNsonwu/diyar")

Number line

Use number_line() to create number_line objects, each representing a range of numeric values. These can be split or manipulated in several ways.

library(diyar)
nl <- number_line(1, 10); nl
#> [1] "1 -> 10"
invert_number_line(nl)
#> [1] "-1 <- -10"
seq(nl, length.out = 3)
#> [1] "1 -> 4"  "4 -> 7"  "7 -> 10"

overlap_method(nl, nl); reverse(nl, nl)
#> [1] "exact"
#> [1] FALSE
nl2 <- reverse_number_line(nl); nl2
#> [1] "10 <- 1"
overlap_method(nl, nl2); reverse(nl, nl2)
#> [1] "reverse"
#> [1] TRUE

Set operations such as union_number_lines() are also possible for pairs of number_line objects.

nl3 <- number_line(1, 20)
nl4 <- number_line(3, 6)
nl3; nl4
#> [1] "1 -> 20"
#> [1] "3 -> 6"
overlap_method(nl3, nl4)
#> [1] "y_inbetween_x"
intersect_number_lines(nl3, nl4)
#> [1] "3 -> 6"
subtract_number_lines(nl3, nl4)
#> $n1
#> [1] "1 -> 3"
#> 
#> $n2
#> [1] "6 -> 20"

Record linkage

Use links() to create a unique identifier for matching records based on a multistage deterministic approach to record linkage.

attr_1 <- c(1, 1, 1, NA, NA, NA, NA, NA)
attr_2 <- c(NA, NA, 2, 2, 2, NA, NA, NA)
links(list(attr_1, attr_2))
#> [1] "P.1 (CRI 001)" "P.1 (CRI 001)" "P.1 (CRI 001)" "P.1 (CRI 002)"
#> [5] "P.1 (CRI 002)" "P.6 (No hits)" "P.7 (No hits)" "P.8 (No hits)"

Use link_records() to implement both deterministic and probabilistic record linkage by comparing every possible record-pair.

data(missing_staff_id)
dfr <- missing_staff_id[c("staff_id",  "initials", "hair_colour", "branch_office")]
p1 <- link_records(as.list(dfr), score_threshold = -4.2)
p1$pid
#> [1] "P.1 (CRI 001)" "P.2 (No hits)" "P.3 (No hits)" "P.4 (No hits)"
#> [5] "P.5 (No hits)" "P.1 (CRI 001)" "P.1 (CRI 001)"
subset(p1$pid_weights, record.match)
#>    sn_x sn_y cmp.staff_id cmp.initials cmp.hair_colour cmp.branch_office
#> 6     1    7            0            1               1                 1
#> 21    6    7            1            1               0                 0
#>    cmp.weight prb.staff_id prb.initials prb.hair_colour prb.branch_office
#> 6           3    -3.358454     1.074391        1.659354          1.659354
#> 21          2     1.659354     1.074391       -3.298333         -3.298333
#>    prb.weight record.match
#> 6    1.034645         TRUE
#> 21  -3.862921         TRUE

links_wf_probabilistic() is a wrapper around links() and an alternative to link_records(). It is less memory-intensive than link_records() but can be slower.

p2 <- links_wf_probabilistic(as.list(dfr), score_threshold = -4.2, recursive = TRUE)
p2$pid
#> [1] "P.1 (CRI 001)" "P.2 (No hits)" "P.3 (No hits)" "P.4 (No hits)"
#> [5] "P.5 (No hits)" "P.1 (CRI 001)" "P.1 (CRI 001)"
subset(p2$pid_weights, record.match)
#>   sn_x sn_y cmp.staff_id cmp.initials cmp.hair_colour cmp.branch_office
#> 1    1    1            0            1               1                 1
#> 6    6    7            1            1               0                 0
#> 7    7    1            0            1               1                 1
#>   cmp.weight prb.staff_id prb.initials prb.hair_colour prb.branch_office
#> 1          3    -4.321928     1.148392        1.733354          1.733354
#> 6          2     1.733354     1.148392       -3.298333         -3.298333
#> 7          3    -3.836501     1.148392        1.733354          1.733354
#>   prb.weight record.match
#> 1  0.2931724         TRUE
#> 6 -3.7149198         TRUE
#> 7  0.7785993         TRUE

Case definitions

Use episodes() to create a unique identifier for related events based on a case definition.

dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-07"), by = 1)
episodes(dates, case_length = 2, group_stats = TRUE)
#> [1] "E.1 2020-01-01 -> 2020-01-03 (C)" "E.1 2020-01-01 -> 2020-01-03 (D)"
#> [3] "E.1 2020-01-01 -> 2020-01-03 (D)" "E.4 2020-01-04 -> 2020-01-06 (C)"
#> [5] "E.4 2020-01-04 -> 2020-01-06 (D)" "E.4 2020-01-04 -> 2020-01-06 (D)"
#> [7] "E.7 2020-01-07 == 2020-01-07 (C)"
episodes(dates, case_length = 2, episode_type = "rolling", group_stats = TRUE)
#> [1] "E.1 2020-01-01 -> 2020-01-07 (C)" "E.1 2020-01-01 -> 2020-01-07 (D)"
#> [3] "E.1 2020-01-01 -> 2020-01-07 (D)" "E.1 2020-01-01 -> 2020-01-07 (R)"
#> [5] "E.1 2020-01-01 -> 2020-01-07 (D)" "E.1 2020-01-01 -> 2020-01-07 (R)"
#> [7] "E.1 2020-01-01 -> 2020-01-07 (D)"

Use partitions() to create a unique identifier for events within the same period or numerical interval.

partitions(dates, by = 2, separate = TRUE, group_stats = TRUE)
#> [1] "PN.1 2020-01-01 -> 2020-01-02 (I)" "PN.1 2020-01-01 -> 2020-01-02 (D)"
#> [3] "PN.3 2020-01-03 -> 2020-01-04 (I)" "PN.3 2020-01-03 -> 2020-01-04 (D)"
#> [5] "PN.5 2020-01-05 -> 2020-01-07 (I)" "PN.5 2020-01-05 -> 2020-01-07 (D)"
#> [7] "PN.5 2020-01-05 -> 2020-01-07 (D)"
partitions(dates, length.out = 3, separate = TRUE, group_stats = TRUE)
#> [1] "PN.1 2020-01-01 -> 2020-01-02 (I)" "PN.1 2020-01-01 -> 2020-01-02 (D)"
#> [3] "PN.3 2020-01-03 -> 2020-01-04 (I)" "PN.3 2020-01-03 -> 2020-01-04 (D)"
#> [5] "PN.5 2020-01-05 -> 2020-01-07 (I)" "PN.5 2020-01-05 -> 2020-01-07 (D)"
#> [7] "PN.5 2020-01-05 -> 2020-01-07 (D)"

diyar

Overview

Installation

Number line

Record linkage

Case definitions

Bugs and issues