Deterministic and probabilistic record linkage Assign unique identifiers to records based on partial, nested or calculated probabilities.
links_af_probabilistic(
attribute,
blocking_attribute = NULL,
cmp_func = diyar::exact_match,
attr_threshold = 1,
probabilistic = TRUE,
m_probability = 0.95,
u_probability = NULL,
score_threshold = 1,
repeats_allowed = FALSE,
permutations_allowed = FALSE,
data_source = NULL,
ignore_same_source = TRUE,
display = "none"
)
links_wf_probabilistic(
attribute,
blocking_attribute = NULL,
cmp_func = diyar::exact_match,
attr_threshold = 1,
probabilistic = TRUE,
m_probability = 0.95,
u_probability = NULL,
score_threshold = 1,
id_1 = NULL,
id_2 = NULL,
return_weights = FALSE,
...
)
prob_score_range(attribute, m_probability = 0.95, u_probability = NULL)
[atomic|list|data.frame|matrix|d_attribute]
. Attributes to compare.
[atomic]
. Passed to criteria
in links
.
[list|function]
. String comparators for each attribute
. See Details
.
[list|numeric|number_line]
. Weight-thresholds for each cmp_func
. See Details
.
[logical]
. If TRUE
, scores are assigned based on the Fellegi-Sunter model for probabilistic record linkage. See Details
.
[list|numeric]
. The probability that matching records are the same entity.
[list|numeric]
. The probability that matching records are not the same entity.
[numeric|number_line]
. Score-threshold for linked records. See Details
.
[logical]
Passed to repeats_allowed
in links
.
[logical]
Passed to permutations_allowed
in links
.
[character]
. Passed to data_source
in links
.
[logical]
Passed to ignore_same_source
in links
.
[character]
. Passed to display
in links
.
[list|numeric]
. Record id or index of one half of a record-pair.
[list|numeric]
. Record id or index of one half of a record-pair.
If TRUE
, returns the match-weights and score-thresholds for record pairs.
Arguments passed to links
pid
; list
links_wf_probabilistic()
- A wrapper function of links
with a specific sub_criteria
to achieve probabilistic record linkage.
It excludes functionalities for the nested and multi-stage linkage.
links_wf_probabilistic()
requires a score_threshold
in advance.
To help with this, prob_score_range()
can be used to return the range of scores attainable for a given set of attribute
, m
and u
-probabilities.
Additionally, id_1
and id_2
can be used to link specific record pairs, aiding the review of potential scores.
links_af_probabilistic()
- A simpler version of links
.
It excludes functionalities for the batched, nested and multi-stage linkage.
links_af_probabilistic()
requires a score_threshold
in advance.
However, since it exports the match weights, the score_threshold
can be changed after the analysis.
Fellegi, I. P., & Sunter, A. B. (1969). A Theory for Record Linkage. Journal of the American Statistical Association, 64(328), 1183 - 1210. https://doi.org/10.1080/01621459.1969.10501049
Asher, J., Resnick, D., Brite, J., Brackbill, R., & Cone, J. (2020). An Introduction to Probabilistic Record Linkage with a Focus on Linkage Processing for WTC Registries. International journal of environmental research and public health, 17(18), 6937. https://doi.org/10.3390/ijerph17186937.
See vignette("links")
for more information.
# Load the `patient_records` example dataset used throughout these examples
data(patient_records)
# Attributes for the weighted (probabilistic) comparison:
# forename, middlename and surname
criteria_1 <- as.list(
  patient_records[c("forename", "middlename", "surname")]
)
# Range of scores attainable when the m-probability is 0.95
prob_scores <- prob_score_range(
  attribute = criteria_1,
  m_probability = 0.95,
  u_probability = NULL
)
if (FALSE) {
  # NOTE(review): `mid_scorce` is reproduced exactly as spelled in the original
  # example; presumably it is the element name returned by `prob_score_range()`
  # - confirm before renaming.

  # Probabilistic record linkage with `links_af_probabilistic()`
  pids_1a <- links_af_probabilistic(
    attribute = criteria_1,
    cmp_func = exact_match,
    attr_threshold = 1,
    probabilistic = TRUE,
    m_probability = 0.95,
    score_threshold = prob_scores$mid_scorce,
    display = "stats"
  )

  # Equivalent linkage with `links_wf_probabilistic()`
  pids_1b <- links_wf_probabilistic(
    attribute = criteria_1,
    cmp_func = exact_match,
    attr_threshold = 1,
    probabilistic = TRUE,
    m_probability = 0.95,
    score_threshold = prob_scores$mid_scorce,
    display = "progress",
    recursive = TRUE,
    check_duplicates = TRUE
  )

  # Less thorough but faster equivalent with `links_wf_probabilistic()`
  pids_1c <- links_wf_probabilistic(
    attribute = criteria_1,
    cmp_func = exact_match,
    attr_threshold = 1,
    probabilistic = TRUE,
    m_probability = 0.95,
    score_threshold = prob_scores$mid_scorce,
    display = "progress",
    recursive = FALSE,
    check_duplicates = FALSE
  )

  # The three implementations are not guaranteed to give identical results
  summary(pids_1a$pid)
  summary(pids_1b$pid)
  summary(pids_1c$pid)
}
# Attributes for the weighted (non-probabilistic) comparison:
# forename, middlename and age difference (derived from dateofbirth)
criteria_2 <- as.list(
  patient_records[c("forename", "middlename", "dateofbirth")]
)
# age_diff(): element-wise comparator for two vectors coercible to numeric.
# Returns TRUE where the absolute difference is a whole number between
# 0 and 3650 (365 * 10), and FALSE otherwise, including where it is missing.
age_diff <- function(x, y) {
  gap <- abs(as.numeric(x) - as.numeric(y))
  allowed_gaps <- 0:(365 * 10)
  within_limit <- gap %in% allowed_gaps & !is.na(gap)
  within_limit
}
# Non-probabilistic linkage using `surname` as the blocking attribute.
# One comparator is supplied per attribute in `criteria_2`.
pids_2a <- links_af_probabilistic(
  attribute = criteria_2,
  blocking_attribute = patient_records$surname,
  cmp_func = c(exact_match, exact_match, age_diff),
  score_threshold = number_line(3, 5),
  probabilistic = FALSE,
  display = "stats"
)
#> Data validation: 1699828800.89
#> Pairs created: 1699828800.89
#> Weights calculated: 1699828800.9
#> `pid` created: 1699828800.91
#> Records linked in < 1 secs!
# Larger weights can be assigned to particular attributes through `cmp_func`
# For example, a smaller age difference can contribute a higher score (e.g. 0 to 3)
#
# age_diff_2(): element-wise comparator returning an integer score per pair.
#   - 0 where the absolute difference is missing or is not a whole number
#     between 0 and 3650 (365 * 10).
#   - 1 to 3 otherwise: the in-range differences are cut into three
#     equal-width bins over their observed range, and the bin holding the
#     smallest differences scores 3.
age_diff_2 <- function(x, y) {
  gap <- as.numeric(abs(x - y))
  in_range <- gap %in% 0:(365 * 10) & !is.na(gap)
  score <- in_range
  # Coerce up front so the return type is integer whether or not any pair
  # is in range.
  storage.mode(score) <- "integer"
  # `cut()` cannot derive breaks from an empty vector, so only bin when at
  # least one pair is in range; otherwise every score stays 0.
  if (any(in_range)) {
    score[in_range] <- match(as.numeric(cut(gap[in_range], 3)), 3:1)
  }
  score
}
# Same linkage as `pids_2a`, but with `age_diff_2` as the comparator for
# `dateofbirth`
pids_2b <- links_af_probabilistic(
  attribute = criteria_2,
  blocking_attribute = patient_records$surname,
  cmp_func = c(exact_match, exact_match, age_diff_2),
  score_threshold = number_line(3, 5),
  probabilistic = FALSE,
  display = "stats"
)
#> Data validation: 1699828800.92
#> Pairs created: 1699828800.92
#> Weights calculated: 1699828800.93
#> `pid` created: 1699828800.93
#> Records linked in < 1 secs!
# First ten rows of the `pid_weights` element of `pids_2a`
head(pids_2a$pid_weights, n = 10)
#> sn_x sn_y sn_x sn_y cmp.forename cmp.middlename cmp.dateofbirth cmp.weight
#> 1 1604 2 1604 2 0 1 0 1
#> 2 3999 2 3999 2 0 1 1 2
#> 3 3999 1604 3999 1604 0 1 0 1
#> 4 279 3 279 3 0 1 1 2
#> 5 8422 3 8422 3 0 1 1 2
#> 6 8422 279 8422 279 0 1 1 2
#> 7 451 4 451 4 0 0 0 0
#> 8 3176 4 3176 4 0 0 0 0
#> 9 3411 4 3411 4 0 0 0 0
#> 10 5697 4 5697 4 0 0 0 0
#> record.match
#> 1 0
#> 2 0
#> 3 0
#> 4 0
#> 5 0
#> 6 0
#> 7 0
#> 8 0
#> 9 0
#> 10 0
# First ten rows of the `pid_weights` element of `pids_2b`
head(pids_2b$pid_weights, n = 10)
#> sn_x sn_y sn_x sn_y cmp.forename cmp.middlename cmp.dateofbirth cmp.weight
#> 1 1604 2 1604 2 0 1 0 1
#> 2 3999 2 3999 2 0 1 3 4
#> 3 3999 1604 3999 1604 0 1 0 1
#> 4 279 3 279 3 0 1 3 4
#> 5 8422 3 8422 3 0 1 3 4
#> 6 8422 279 8422 279 0 1 3 4
#> 7 451 4 451 4 0 0 0 0
#> 8 3176 4 3176 4 0 0 0 0
#> 9 3411 4 3411 4 0 0 0 0
#> 10 5697 4 5697 4 0 0 0 0
#> record.match
#> 1 0
#> 2 1
#> 3 0
#> 4 1
#> 5 1
#> 6 1
#> 7 0
#> 8 0
#> 9 0
#> 10 0