Cleaning Data in R
Maggie Matsui
Content Developer @ DataCamp
df_A
name zip state
1 Christine M. Conner 10456 NY
2 Keaton Z Snyder 15020 PA
3 Arthur Potts 07799 NJ
4 Maia Collier 07960 NJ
5 Atkins, Alice W. 10603 NY
df_B
name zip state
1 Jerome A. Yates 11743 NY
2 Garrison, Brenda 08611 NJ
3 Keaton Snyder 15020 PA
4 Stuart, Bert F 12211 NY
5 Hayley Peck 19134 PA
pair_blocking(df_A, df_B, blocking_var = "state") %>%
compare_pairs(by = c("name", "zip"), default_comparator = lcs())
x y name zip
1 1 1 0.3529412 0.4
2 1 4 0.3030303 0.2
3 2 3 0.9285714 1.0
4 2 5 0.2307692 0.2
5 3 2 0.2142857 0.2
6 4 2 0.2857143 0.4
7 5 1 0.1935484 0.4
8 5 4 0.3333333 0.2
x y name zip
1 1 1 0.3529412 + 0.4 =
2 1 4 0.3030303 + 0.2 =
3 2 3 0.9285714 + 1.0 =
4 2 5 0.2307692 + 0.2 =
5 3 2 0.2142857 + 0.2 =
6 4 2 0.2857143 + 0.4 =
7 5 1 0.1935484 + 0.4 =
8 5 4 0.3333333 + 0.2 =
pair_blocking(df_A, df_B, blocking_var = "state") %>% compare_pairs(by = c("name", "zip"), default_comparator = lcs()) %>%
score_simsum()
x y name zip simsum
1 1 1 0.3529412 0.4 0.7529412
2 1 4 0.3030303 0.2 0.5030303
3 2 3 0.9285714 1.0 1.9285714
4 2 5 0.2307692 0.2 0.4307692
5 3 2 0.2142857 0.2 0.4142857
6 4 2 0.2857143 0.4 0.6857143
7 5 1 0.1935484 0.4 0.5935484
8 5 4 0.3333333 0.2 0.5333333
pair_blocking(df_A, df_B, blocking_var = "state") %>% compare_pairs(by = c("name", "zip"), default_comparator = lcs()) %>%
score_simsum()
x y name zip simsum
1 1 1 0.3529412 0.4 0.7529412
2 1 4 0.3030303 0.2 0.5030303
3 2 3 0.9285714 1.0 1.9285714 <--
4 2 5 0.2307692 0.2 0.4307692
5 3 2 0.2142857 0.2 0.4142857
6 4 2 0.2857143 0.4 0.6857143
7 5 1 0.1935484 0.4 0.5935484
8 5 4 0.3333333 0.2 0.5333333
pair_blocking(df_A, df_B, blocking_var = "state") %>% compare_pairs(by = c("name", "zip"), default_comparator = lcs()) %>%
score_problink()
x y name zip weight
1 1 1 0.3529412 0.4 -1.011599
2 1 4 0.3030303 0.2 -2.219198
3 2 3 0.9285714 1.0 16.019278
4 2 5 0.2307692 0.2 -2.590260
5 3 2 0.2142857 0.2 -2.685570
6 4 2 0.2857143 0.4 -1.321753
7 5 1 0.1935484 0.4 -1.832576
8 5 4 0.3333333 0.2 -2.079436
pair_blocking(df_A, df_B, blocking_var = "state") %>% compare_pairs(by = c("name", "zip"), default_comparator = lcs()) %>% score_problink() %>%
select_n_to_m()
x y name zip weight select
1 1 1 0.3529412 0.4 -1.011599 FALSE
2 1 4 0.3030303 0.2 -2.219198 FALSE
3 2 3 0.9285714 1.0 16.019278 TRUE
4 2 5 0.2307692 0.2 -2.590260 FALSE
5 3 2 0.2142857 0.2 -2.685570 FALSE
6 4 2 0.2857143 0.4 -1.321753 FALSE
...
pair_blocking(df_A, df_B, blocking_var = "state") %>% compare_pairs(by = c("name", "zip"), default_comparator = lcs()) %>% score_problink() %>% select_n_to_m() %>%
link()
name.x zip.x state.x name.y zip.y state.y
1 Keaton Z Snyder 15020 PA Keaton Snyder 15020 PA
2 Christine M. Conner 10456 NY <NA> <NA> <NA>
3 Arthur Potts 07799 NJ <NA> <NA> <NA>
4 Maia Collier 07960 NJ <NA> <NA> <NA>
5 Atkins, Alice W. 10603 NY <NA> <NA> <NA>
6 <NA> <NA> <NA> Jerome A. Yates 11743 NY
7 <NA> <NA> <NA> Garrison, Brenda 08611 NJ
8 <NA> <NA> <NA> Stuart, Bert F 12211 NY
9 <NA> <NA> <NA> Hayley Peck 19134 PA
Cleaning Data in R