Overview

Record linkage and distinguishing between index, duplicate and recurrent events are common tasks in epidemiological analyses and other fields of research, particularly as part of a case definition. Implementing these tasks in R can be complex and challenging. The diyar package provides a convenient and flexible way of doing so.

Installation

# Install the latest CRAN release 
install.packages("diyar")

# Or, install the development version from GitHub
install.packages("devtools")
devtools::install_github("OlisaNsonwu/diyar")

Usage

Number line

Use number_line() to create number_line objects, each of which represents a range of numeric values. These can be split or manipulated in several ways.

library(diyar)
nl <- number_line(1, 10); nl
#> [1] "1 -> 10"
invert_number_line(nl)
#> [1] "-1 <- -10"
seq(nl, length.out = 3)
#> [1] "1 -> 4"  "4 -> 7"  "7 -> 10"

overlap() and related functions test how number_line objects overlap.

overlap_method(nl, nl); reverse(nl, nl)
#> [1] "exact"
#> [1] FALSE
nl2 <- reverse_number_line(nl); nl2
#> [1] "10 <- 1"
overlap_method(nl, nl2); reverse(nl, nl2)
#> [1] "reverse"
#> [1] TRUE

Set operations such as union_number_lines() are also possible for pairs of number_line objects.

nl3 <- number_line(1, 20)
nl4 <- number_line(3, 6)
nl3; nl4
#> [1] "1 -> 20"
#> [1] "3 -> 6"
overlap_method(nl3, nl4)
#> [1] "inbetween"
intersect_number_lines(nl3, nl4)
#> [1] "3 -> 6"
subtract_number_lines(nl3, nl4)
#> $n1
#> [1] "1 -> 3"
#> 
#> $n2
#> [1] "6 -> 20"

Record linkage

Use links() to create a unique identifier for matching records based on a multistage deterministic approach to record linkage.

attr_1 <- c(1, 1, 1, NA, NA, NA, NA, NA)
attr_2 <- c(NA, NA, 2, 2, 2, NA, NA, NA)
links(list(attr_1, attr_2))
#> [1] "P.1 (CRI 001)" "P.1 (CRI 001)" "P.1 (CRI 001)" "P.1 (CRI 002)"
#> [5] "P.1 (CRI 002)" "P.6 (No hits)" "P.7 (No hits)" "P.8 (No hits)"

links_wf_probabilistic() is a wrapper around links() that implements probabilistic record linkage.

data(missing_staff_id)
dfr <- missing_staff_id[c("staff_id",  "initials", "hair_colour", "branch_office")]
links_wf_probabilistic(as.list(dfr), score_threshold = -4.2)
#> $pid
#> [1] "P.1 (CRI 001)" "P.2 (No hits)" "P.3 (No hits)" "P.4 (No hits)"
#> [5] "P.5 (No hits)" "P.6 (No hits)" "P.1 (CRI 001)"
#> 
#> $pid_weights
#>      sn_x sn_y cmp.staff_id cmp.initials cmp.hair_colour cmp.branch_office
#> [1,]    1    7            0            1               1                 1
#> [2,]    2    2           NA           NA              NA                NA
#> [3,]    3    3           NA           NA              NA                NA
#> [4,]    4    4           NA           NA              NA                NA
#> [5,]    5    5           NA           NA              NA                NA
#> [6,]    6    6           NA           NA              NA                NA
#> [7,]    7    7            1            1               1                 1
#>      cmp.weight cmp.threshold prb.staff_id prb.initials prb.hair_colour
#> [1,]          3            NA    -4.321928     1.148392        1.733354
#> [2,]         NA            NA           NA           NA              NA
#> [3,]         NA            NA           NA           NA              NA
#> [4,]         NA            NA           NA           NA              NA
#> [5,]         NA            NA           NA           NA              NA
#> [6,]         NA            NA           NA           NA              NA
#> [7,]          4            NA     1.733354     1.148392        1.733354
#>      prb.branch_office prb.weight prb.threshold
#> [1,]          1.733354  0.2931724             1
#> [2,]                NA         NA            NA
#> [3,]                NA         NA            NA
#> [4,]                NA         NA            NA
#> [5,]                NA         NA            NA
#> [6,]                NA         NA            NA
#> [7,]          1.733354  6.3484549             1

Episode tracking

Use episodes() to create a unique identifier for related events based on a case definition.

episodes(1:7, case_length = 2)
#> [1] "E.1 (C)" "E.1 (D)" "E.1 (D)" "E.4 (C)" "E.4 (D)" "E.4 (D)" "E.7 (C)"
episodes(1:7, case_length = 2, episode_type = "rolling")
#> [1] "E.1 (C)" "E.1 (D)" "E.1 (D)" "E.1 (R)" "E.1 (D)" "E.1 (R)" "E.1 (D)"

Use partitions() to create a unique identifier for events within the same time or numerical interval.

partitions(1:7, by = 2, separate = TRUE)
#> [1] "PN.1 (I)" "PN.1 (D)" "PN.3 (I)" "PN.3 (D)" "PN.5 (I)" "PN.5 (D)" "PN.5 (D)"
partitions(1:7, length.out = 3, separate = TRUE)
#> [1] "PN.1 (I)" "PN.1 (D)" "PN.3 (I)" "PN.3 (D)" "PN.5 (I)" "PN.5 (D)" "PN.5 (D)"

Find out more in the package documentation, e.g. by running help(package = "diyar") after installation.

Bugs and issues

Please report any bugs or issues with using this package on the GitHub issue tracker: https://github.com/OlisaNsonwu/diyar/issues