Review: beforehand we have identified the potential children for our sample in ppfad_igene_long
.
Goal: now it is time to find parental pointers from various SOEP datasets for the identified persons.
preparation: read data needed in the script
# data of children
ppfad_igene_long <- import(here::here("data", "ppfad_igene_long.rds")) # pupils 11-14 yrs
# data with parental pointers
biobirth_dict <- import(here::here("data", "biobirth_dict_v36.rds"))
kid_dict <- import(here::here("data", "kid_dict_v36.rds"))
bioparen_dict <- import(here::here("data", "bioparen_dict_v36.rds"))
bioage_mother_dict <- import(here::here("data", "bioage_mother_dict_v36.rds")) # data for pupils only
bioage_father_dict <- import(here::here("data", "bioage_father_dict_v36.rds")) # data for pupils only
hhh_fathers <- import(here::here("data", "hhh_fathers_v36.rds"))
hhh_mothers <- import(here::here("data", "hhh_mothers_v36.rds"))
# metadata
pbrutto_long <- import(here::here("data/pbrutto_long_v36.rds"))
goal: create dataset children_parents
containing people from the igene sample and the identified parental pointers with the source and the relation we think the child has with the mother or father identified (social or genetic or unknown)
fist we build a dataset that includes all potential children and add all possible combinations for possible matches.
from each of the four sources we will identify mothers and fathers and therefore we need one row for each child for all of the possible combinations. Over the course of this script, we will populate this dataset of chilren with the matched mothers and fathers and define the relation of the child to the identified person depending on the source. We allow for up to three pointers per child and pointer and source
# unique identifier keys: child_id, pointer, source
pot_children <- ppfad_igene_long %>%
# distinct only keeps one person per id and removes douplicates by survey year (the source data is in long format and has multiple observations over time per child)
tidylog::distinct(pid) %>%
# get a vector with child-id's
pull(pid) %>%
# make a dataset of all possible combinations
expand_grid(child_id = .,
pointer = c("mother", "father"),
source = c("biobirth", "bioage", "bioparen", "kid", "hhh"),
pointer_nr = c(1:3))
pot_children_parents <- pot_children %>%
# generate the variables that are oing to be populated
mutate(pointer_id = NA_real_,
child_match_rel = NA_character_,
child_match_type_minor = NA_character_
)
# before we start: define source to be used in the code for this section (this source dataset)
source_param <- "biobirth"
source_data <- biobirth_dict
pointer_mother_id <- "mother_id"
pointer_father_id <- "father_id"
goal: let’s see how many mother and father pointers we can find in biobirth
Biobirth-Doku: https://www.diw.de/documents/publikationen/73/diw_01.c.787430.de/diw_ssp0839.pdf
now we get the parental pointers and match our pupils with the possible mothers and fathers from the pointer source dataset
pointer_mothers <- source_data %>% select_pointers(pointer = "mother", pointer_id_name = pointer_mother_id, source = source_param)
pot_children_source_mothers <- pot_children %>% match_child_pointer(pointer = "mother", source = source_param, pointer_data = pointer_mothers)
pointer_fathers <- source_data %>% select_pointers(pointer = "father", pointer_id_name = pointer_father_id, source = source_param)
pot_children_source_fathers <- pot_children %>% match_child_pointer(pointer = "father", source = source_param, pointer_data = pointer_fathers)
The maximum number of parents identified in this source dataset is 1 for mothers and 1 for fathers.
in this dataset, we define the relation between the pupil and the match depending on some characteristics of the source dataset. this part is unique to every source dataset maybe it would be cleaner to do this definition bit later on, not sure
new variables: child_match_rel, child_match_type_minor
keys: child_id, pointer, pointer_nr
# just matches, no missings
pot_children_source_matches <-
# combine matches for mothers and fathers
pot_children_source_mothers %>% bind_rows(pot_children_source_fathers) %>%
# here we drop the pupils with non-dentified parents, because the will have NA's for the child_rel and child_match_type_minor
drop_na(pointer_id) %>%
mutate(child_match_rel = "child",
child_match_type_minor = "genetic_known") %>%
# select the variables that will go into the final dataset
select(child_id, pointer_id, pointer_nr, pointer, source, child_match_rel, child_match_type_minor)
test: do we have missings in the child-match variable?
testthat::expect_equal(sum(is.na(pot_children_source_matches$child_match_type_minor)), 0)
test: make sure the coding captured all possible outcomes of child-parent relationships
testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_type_minor == "mistake")), 0)
testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_rel == "mistake")), 0)
now we add all the gained information on to our main dataset pot_children
new variables: child_match_rel, child_match_type_minor
keys: child_id, source, pointer, pointer_nr
# data specific for the source
pot_children_source_parents <- pot_children %>%
filter(source == !!source_param) %>%
tidylog::left_join(pot_children_source_matches, by = c("child_id", "pointer", "source", "pointer_nr"))
# overall data that is being populated throughout the script
pot_children_parents[pot_children_parents$source == source_param, ] <- pot_children_source_parents
test: did we replace the pot_children_parents data with the same amount of rows or more if there are multiple parents for one child?
# It should just not me less (expect_lte : less than or equal)
testthat::expect_lte(nrow(pot_children_parents[pot_children_parents$source == source_param, ]), nrow(pot_children_source_parents))
summary_parents <- pot_children_parents %>%
group_by(child_id, pointer, source) %>%
summarize(n_avail = sum(!is.na(pointer_id))) %>%
mutate(flag_avail = case_when(n_avail >= 1 ~ TRUE,
n_avail == 0 ~ FALSE,
TRUE ~ NA))
summary_source_parents <- summary_parents %>%
filter(source == !!source_param)
Let’s see how many parents we found in this specific source dataset
print <- summary_source_parents %>% tabyl(pointer, flag_avail)
print_mother <- print$'TRUE'[print$pointer == "mother"]
print_father <- print$'TRUE'[print$pointer == "father"]
print %>% adorn_title()
for 414 we have at least one mother for 330 we have at least one father
Note: even though there is no identified father, there could still be an identified mother for the same child (this doesn’t come across in this table). Lets look at the availablity for mothers and fathers per child
And how many mothers and fathers per child?
The first table shows how many parents per child
print <- summary_source_parents %>% summary_by_child(kind = "specific") %>% tabyl(n_father, n_mother)
print %>% adorn_title()
The second table shows for how many children, there is at least one father and/or mother (Note: the two tables are identical, if every child only has at most one identified parent)
summary_source_parents %>% summary_by_child(kind = "specific", values_from = "flag_avail", names_prefix = "avail_") %>% tabyl(avail_father, avail_mother) %>% adorn_title()
We can see that for 2111 cases, there is no corresponding child_id in the source data and therefore also no parental ID’s
Let’s take a look and compare all the source datasets
For the sources that we already used, we can now see the number of identified parents
summary_parents %>% tabyl(source, n_avail, pointer) %>% adorn_title()
## $father
## n_avail
## source 0 1
## bioage 2541 0
## biobirth 2211 330
## bioparen 2541 0
## hhh 2541 0
## kid 2541 0
##
## $mother
## n_avail
## source 0 1
## bioage 2541 0
## biobirth 2127 414
## bioparen 2541 0
## hhh 2541 0
## kid 2541 0
For the sources that we already used, we can now see whether at least one mother or father was identified
summary_parents %>% tabyl(source, flag_avail, pointer) %>% adorn_title()
## $father
## flag_avail
## source FALSE TRUE
## bioage 2541 0
## biobirth 2211 330
## bioparen 2541 0
## hhh 2541 0
## kid 2541 0
##
## $mother
## flag_avail
## source FALSE TRUE
## bioage 2541 0
## biobirth 2127 414
## bioparen 2541 0
## hhh 2541 0
## kid 2541 0
How many mothers were identified in which source data?
summary_parents %>%
filter(pointer == "mother") %>% summary_by_child(kind = "all") %>%
count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))
How many fathers were identified in which source data?
summary_parents %>%
filter(pointer == "father") %>% summary_by_child(kind = "all") %>%
count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))
Last but not least lets look at a summary of the whole partial thing for the biobirth source dataset
pot_children_source_parents %>%
# drop_na(pointer_id) %>%
group_by(pointer, pointer_nr) %>%
skim()
Name | Piped data |
Number of rows | 15246 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
character | 3 |
numeric | 2 |
________________________ | |
Group variables | pointer, pointer_nr |
Variable type: character
skim_variable | pointer | pointer_nr | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|---|---|
source | father | 1 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | father | 2 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | father | 3 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | mother | 1 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | mother | 2 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | mother | 3 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
child_match_rel | father | 1 | 2211 | 0.13 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | father | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_rel | father | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_rel | mother | 1 | 2127 | 0.16 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | mother | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_rel | mother | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | father | 1 | 2211 | 0.13 | 13 | 13 | 0 | 1 | 0 |
child_match_type_minor | father | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | father | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | mother | 1 | 2127 | 0.16 | 13 | 13 | 0 | 1 | 0 |
child_match_type_minor | mother | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | mother | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
Variable type: numeric
skim_variable | pointer | pointer_nr | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|---|---|
child_id | father | 1 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | father | 2 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | father | 3 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 1 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 2 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 3 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
pointer_id | father | 1 | 2211 | 0.13 | 21684752 | 16540273 | 1161603 | 8766326 | 9591301 | 40682101 | 41723901 | ▇▃▁▁▇ |
pointer_id | father | 2 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | father | 3 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | mother | 1 | 2127 | 0.16 | 22146233 | 16679840 | 1233702 | 8766327 | 9608802 | 40882151 | 41723902 | ▇▂▁▁▇ |
pointer_id | mother | 2 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | mother | 3 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA |
# pot_children_parents1 %>% count("mother available" = !is.na(mother_id), "father avaiable" = !is.na(father_id)) %>% arrange(desc(n))
# before we start: define source to be used in the code for this section (this source dataset)
source_param <- "bioage"
source_mother_data <- bioage_mother_dict
source_father_data <- bioage_father_dict
pointer_mother_id <- "mother_id"
pointer_father_id <- "father_id"
additional_mother_vars <- c("mother_biochild") # needed to define child_match_type-minor later on
additional_father_vars <- c("father_biochild") # needed to define child_match_type-minor later on
goal: let’s see how many mother and father pointers we can find in bioage
Bioage-Doku: https://www.diw.de/documents/publikationen/73/diw_01.c.673305.de/diw_ssp0747.pdf
now we get the parental pointers and match our pupils with the possible mothers and fathers from the pointer source dataset
pointer_mothers <- source_mother_data %>% select_pointers(pointer = "mother", pointer_id_name = pointer_mother_id, source = source_param, additional_mother_vars)
pot_children_source_mothers <- pot_children %>% match_child_pointer(pointer = "mother", source = source_param, pointer_data = pointer_mothers)
pointer_fathers <- source_father_data %>% select_pointers(pointer = "father", pointer_id_name = pointer_father_id, source = source_param, additional_father_vars)
pot_children_source_fathers <- pot_children %>% match_child_pointer(pointer = "father", source = source_param, pointer_data = pointer_fathers)
The maximum number of parents identified in this source dataset is 1 for mothers and 1 for fathers.
in this dataset, we define the relation between the pupil and the match depending on some characteristics of the source dataset. this part is unique to every source dataset maybe it would be cleaner to do this definition bit later on, not sure
new variables: child_match_rel, child_match_type_minor
keys: child_id, pointer, pointer_nr
# just matches, no missings
pot_children_source_matches <- pot_children_source_mothers %>% bind_rows(pot_children_source_fathers) %>%
# here we drop the pupils with non-dentified parents, because the will have NA's for the child_rel and child_match_type_minor
drop_na(pointer_id) %>%
mutate(child_match_rel = "child",
child_match_type_minor = case_when(pointer_biochild == 1 ~ "genetic_known",
pointer_biochild == 2 ~ "social_known",
is.na(pointer_biochild) ~ "unknown",
TRUE ~ "mistake")
) %>%
# select the variables that will go into the final dataset
select(child_id, pointer_id, pointer_nr, pointer, source, child_match_rel, child_match_type_minor, pointer_biochild)
pot_children_source_matches %>% tabyl(pointer_biochild, child_match_type_minor, pointer)
## $father
## pointer_biochild genetic_known
## 1 6
##
## $mother
## pointer_biochild genetic_known
## 1 8
test: do we have missings in the child-match variable?
testthat::expect_equal(sum(is.na(pot_children_source_matches$child_match_type_minor)), 0)
test: make sure the coding captured all possible outcomes of child-parent relationships
testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_type_minor == "mistake")), 0)
now we add all the gained information on to our main dataset pot_children
new variables: child_match_rel, child_match_type_minor
keys: child_id, source, pointer, pointer_nr
# data specific for the source
pot_children_source_parents <- pot_children %>%
filter(source == !!source_param) %>%
tidylog::left_join(pot_children_source_matches, by = c("child_id", "pointer", "source", "pointer_nr")) %>%
select(-pointer_biochild)
# overall data that is being populated throughout the script
pot_children_parents[pot_children_parents$source == source_param, ] <- pot_children_source_parents
test: did we replace the pot_children_parents data with the same amount of rows or more if there are multiple parents for one child?
# It should just not me less (expect_lte : less than or equal)
testthat::expect_lte(nrow(pot_children_parents[pot_children_parents$source == source_param, ]), nrow(pot_children_source_parents))
summary_parents <- pot_children_parents %>%
group_by(child_id, pointer, source) %>%
summarize(n_avail = sum(!is.na(pointer_id))) %>%
mutate(flag_avail = case_when(n_avail >= 1 ~ TRUE,
n_avail == 0 ~ FALSE,
TRUE ~ NA))
summary_source_parents <- summary_parents %>%
filter(source == !!source_param)
Let’s see how many parents we found in this specific source dataset
print <- summary_source_parents %>% tabyl(pointer, flag_avail)
print_mother <- print$'TRUE'[print$pointer == "mother"]
print_father <- print$'TRUE'[print$pointer == "father"]
print %>% adorn_title()
for 8 we have at least one mother for 6 we have at least one father
Note: even though there is no identified father, there could still be an identified mother for the same child (this doesn’t come across in this table). Lets look at the availablity for mothers and fathers per child
And how many mothers and fathers per child?
The first table shows how many parents per child
print <- summary_source_parents %>% summary_by_child(kind = "specific") %>% tabyl(n_father, n_mother)
print %>% adorn_title()
The second table shows for how many children, there is at least one father and/or mother (Note: the two tables are identical, if every child only has at most one identified parent)
summary_source_parents %>% summary_by_child(kind = "specific", values_from = "flag_avail", names_prefix = "avail_") %>% tabyl(avail_father, avail_mother) %>% adorn_title()
We can see that for 2533 cases, there is no corresponding child_id in the source data and therefore also no parental ID’s
Let’s take a look and compare all the source datasets
For the sources that we already used, we can now see the number of identified parents
summary_parents %>% tabyl(source, n_avail, pointer) %>% adorn_title()
## $father
## n_avail
## source 0 1
## bioage 2535 6
## biobirth 2211 330
## bioparen 2541 0
## hhh 2541 0
## kid 2541 0
##
## $mother
## n_avail
## source 0 1
## bioage 2533 8
## biobirth 2127 414
## bioparen 2541 0
## hhh 2541 0
## kid 2541 0
For the sources that we already used, we can now see whether at least one mother or father was identified
summary_parents %>% tabyl(source, flag_avail, pointer) %>% adorn_title()
## $father
## flag_avail
## source FALSE TRUE
## bioage 2535 6
## biobirth 2211 330
## bioparen 2541 0
## hhh 2541 0
## kid 2541 0
##
## $mother
## flag_avail
## source FALSE TRUE
## bioage 2533 8
## biobirth 2127 414
## bioparen 2541 0
## hhh 2541 0
## kid 2541 0
How many mothers were identified in which source data?
summary_parents %>%
filter(pointer == "mother") %>% summary_by_child(kind = "all") %>%
count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))
How many fathers were identified in which source data?
summary_parents %>%
filter(pointer == "father") %>% summary_by_child(kind = "all") %>%
count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))
Last but not least lets look at a summary of the whole partial thing for the bioage source dataset
pot_children_source_parents %>%
# drop_na(pointer_id) %>%
group_by(pointer, pointer_nr) %>%
skim()
Name | Piped data |
Number of rows | 15246 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
character | 3 |
numeric | 2 |
________________________ | |
Group variables | pointer, pointer_nr |
Variable type: character
skim_variable | pointer | pointer_nr | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|---|---|
source | father | 1 | 0 | 1 | 6 | 6 | 0 | 1 | 0 |
source | father | 2 | 0 | 1 | 6 | 6 | 0 | 1 | 0 |
source | father | 3 | 0 | 1 | 6 | 6 | 0 | 1 | 0 |
source | mother | 1 | 0 | 1 | 6 | 6 | 0 | 1 | 0 |
source | mother | 2 | 0 | 1 | 6 | 6 | 0 | 1 | 0 |
source | mother | 3 | 0 | 1 | 6 | 6 | 0 | 1 | 0 |
child_match_rel | father | 1 | 2535 | 0 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | father | 2 | 2541 | 0 | NA | NA | 0 | 0 | 0 |
child_match_rel | father | 3 | 2541 | 0 | NA | NA | 0 | 0 | 0 |
child_match_rel | mother | 1 | 2533 | 0 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | mother | 2 | 2541 | 0 | NA | NA | 0 | 0 | 0 |
child_match_rel | mother | 3 | 2541 | 0 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | father | 1 | 2535 | 0 | 13 | 13 | 0 | 1 | 0 |
child_match_type_minor | father | 2 | 2541 | 0 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | father | 3 | 2541 | 0 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | mother | 1 | 2533 | 0 | 13 | 13 | 0 | 1 | 0 |
child_match_type_minor | mother | 2 | 2541 | 0 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | mother | 3 | 2541 | 0 | NA | NA | 0 | 0 | 0 |
Variable type: numeric
skim_variable | pointer | pointer_nr | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|---|---|
child_id | father | 1 | 0 | 1 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | father | 2 | 0 | 1 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | father | 3 | 0 | 1 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 1 | 0 | 1 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 2 | 0 | 1 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 3 | 0 | 1 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
pointer_id | father | 1 | 2535 | 0 | 5415535 | 3629204 | 2072701 | 2119802 | 5328252 | 8708752 | 8878101 | ▇▁▁▁▇ |
pointer_id | father | 2 | 2541 | 0 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | father | 3 | 2541 | 0 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | mother | 1 | 2533 | 0 | 6232552 | 3420203 | 2072702 | 2119801 | 8577502 | 8753202 | 8878102 | ▅▁▁▁▇ |
pointer_id | mother | 2 | 2541 | 0 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | mother | 3 | 2541 | 0 | NaN | NA | NA | NA | NA | NA | NA |
# pot_children_parents1 %>% count("mother available" = !is.na(mother_id), "father avaiable" = !is.na(father_id)) %>% arrange(desc(n))
# before we start: define source to be used in the code for this section (this source dataset)
source_param <- "bioparen"
source_data <- bioparen_dict
pointer_mother_id <- "mother_id"
pointer_father_id <- "father_id"
goal: let’s see how many mother and father pointers we can find in bioparen
bioparen-Doku: https://www.diw.de/documents/publikationen/73/diw_01.c.787430.de/diw_ssp0839.pdf
now we get the parental pointers and match our pupils with the possible mothers and fathers from the pointer source dataset
pointer_mothers <- source_data %>% select_pointers(pointer = "mother", pointer_id_name = pointer_mother_id, source = source_param)
pot_children_source_mothers <- pot_children %>% match_child_pointer(pointer = "mother", source = source_param, pointer_data = pointer_mothers)
pointer_fathers <- source_data %>% select_pointers(pointer = "father", pointer_id_name = pointer_father_id, source = source_param)
pot_children_source_fathers <- pot_children %>% match_child_pointer(pointer = "father", source = source_param, pointer_data = pointer_fathers)
The maximum number of parents identified in this source dataset is 1 for mothers and 1 for fathers.
in this dataset, we define the relation between the pupil and the match depending on some characteristics of the source dataset. this part is unique to every source dataset maybe it would be cleaner to do this definition bit later on, not sure
new variables: child_match_rel, child_match_type_minor
keys: child_id, pointer, pointer_nr
# just matches, no missings
pot_children_source_matches <-
# combine matches for mothers and fathers
pot_children_source_mothers %>% bind_rows(pot_children_source_fathers) %>%
# here we drop the pupils with non-dentified parents, because the will have NA's for the child_rel and child_match_type_minor
drop_na(pointer_id) %>%
mutate(child_match_rel = "child",
child_match_type_minor = "genetic_known") %>%
# select the variables that will go into the final dataset
select(child_id, pointer_id, pointer_nr, pointer, source, child_match_rel, child_match_type_minor)
test: do we have missings in the child-match variable?
testthat::expect_equal(sum(is.na(pot_children_source_matches$child_match_type_minor)), 0)
test: make sure the coding captured all possible outcomes of child-parent relationships
testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_type_minor == "mistake")), 0)
testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_rel == "mistake")), 0)
now we add all the gained information on to our main dataset pot_children
new variables: child_match_rel, child_match_type_minor
keys: child_id, source, pointer, pointer_nr
# data specific for the source
pot_children_source_parents <- pot_children %>%
filter(source == !!source_param) %>%
tidylog::left_join(pot_children_source_matches, by = c("child_id", "pointer", "source", "pointer_nr"))
# overall data that is being populated throughout the script
pot_children_parents[pot_children_parents$source == source_param, ] <- pot_children_source_parents
test: did we replace the pot_children_parents data with the same amount of rows or more if there are multiple parents for one child?
# It should just not me less (expect_lte : less than or equal)
testthat::expect_lte(nrow(pot_children_parents[pot_children_parents$source == source_param, ]), nrow(pot_children_source_parents))
summary_parents <- pot_children_parents %>%
group_by(child_id, pointer, source) %>%
summarize(n_avail = sum(!is.na(pointer_id))) %>%
mutate(flag_avail = case_when(n_avail >= 1 ~ TRUE,
n_avail == 0 ~ FALSE,
TRUE ~ NA))
summary_source_parents <- summary_parents %>%
filter(source == !!source_param)
Let’s see how many parents we found in this specific source dataset
print <- summary_source_parents %>% tabyl(pointer, flag_avail)
print_mother <- print$'TRUE'[print$pointer == "mother"]
print_father <- print$'TRUE'[print$pointer == "father"]
print %>% adorn_title()
for 255 we have at least one mother for 206 we have at least one father
Note: even though there is no identified father, there could still be an identified mother for the same child (this doesn’t come across in this table). Lets look at the availablity for mothers and fathers per child
And how many mothers and fathers per child?
The first table shows how many parents per child
print <- summary_source_parents %>% summary_by_child(kind = "specific") %>% tabyl(n_father, n_mother)
print %>% adorn_title()
The second table shows for how many children, there is at least one father and/or mother (Note: the two tables are identical, if every child only has at most one identified parent)
summary_source_parents %>% summary_by_child(kind = "specific", values_from = "flag_avail", names_prefix = "avail_") %>% tabyl(avail_father, avail_mother) %>% adorn_title()
We can see that for 2276 cases, there is no corresponding child_id in the source data and therefore also no parental ID’s
Let’s take a look and compare all the source datasets
For the sources that we already used, we can now see the number of identified parents
summary_parents %>% tabyl(source, n_avail, pointer) %>% adorn_title()
## $father
## n_avail
## source 0 1
## bioage 2535 6
## biobirth 2211 330
## bioparen 2335 206
## hhh 2541 0
## kid 2541 0
##
## $mother
## n_avail
## source 0 1
## bioage 2533 8
## biobirth 2127 414
## bioparen 2286 255
## hhh 2541 0
## kid 2541 0
For the sources that we already used, we can now see whether at least one mother or father was identified
summary_parents %>% tabyl(source, flag_avail, pointer) %>% adorn_title()
## $father
## flag_avail
## source FALSE TRUE
## bioage 2535 6
## biobirth 2211 330
## bioparen 2335 206
## hhh 2541 0
## kid 2541 0
##
## $mother
## flag_avail
## source FALSE TRUE
## bioage 2533 8
## biobirth 2127 414
## bioparen 2286 255
## hhh 2541 0
## kid 2541 0
How many mothers were identified in which source data?
summary_parents %>%
filter(pointer == "mother") %>% summary_by_child(kind = "all") %>%
count(pointer, avail_bioparen, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))
How many fathers were identified in which source data?
summary_parents %>%
filter(pointer == "father") %>% summary_by_child(kind = "all") %>%
count(pointer, avail_bioparen, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))
Last but not least lets look at a summary of the whole partial thing for the bioparen source dataset
pot_children_source_parents %>%
# drop_na(pointer_id) %>%
group_by(pointer, pointer_nr) %>%
skim()
Name | Piped data |
Number of rows | 15246 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
character | 3 |
numeric | 2 |
________________________ | |
Group variables | pointer, pointer_nr |
Variable type: character
skim_variable | pointer | pointer_nr | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|---|---|
source | father | 1 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | father | 2 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | father | 3 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | mother | 1 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | mother | 2 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | mother | 3 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
child_match_rel | father | 1 | 2335 | 0.08 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | father | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_rel | father | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_rel | mother | 1 | 2286 | 0.10 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | mother | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_rel | mother | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | father | 1 | 2335 | 0.08 | 13 | 13 | 0 | 1 | 0 |
child_match_type_minor | father | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | father | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | mother | 1 | 2286 | 0.10 | 13 | 13 | 0 | 1 | 0 |
child_match_type_minor | mother | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | mother | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
Variable type: numeric
skim_variable | pointer | pointer_nr | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|---|---|
child_id | father | 1 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | father | 2 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | father | 3 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 1 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 2 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 3 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
pointer_id | father | 1 | 2335 | 0.08 | 19334605 | 16504634 | 2006703 | 8616901 | 9293552 | 40419076 | 41723901 | ▇▁▁▁▅ |
pointer_id | father | 2 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | father | 3 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | mother | 1 | 2286 | 0.10 | 20212886 | 16492255 | 1233702 | 8670552 | 9352302 | 40551302 | 41723902 | ▇▂▁▁▆ |
pointer_id | mother | 2 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | mother | 3 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA |
# pot_children_parents1 %>% count("mother available" = !is.na(mother_id), "father avaiable" = !is.na(father_id)) %>% arrange(desc(n))
# before we start: define source to be used in the code for this section (this source dataset)
source_param <- "kid"
source_data <- kid_dict
pointer_mother_id <- "mother_id"
pointer_father_id <- "father_id"
goal: let’s see how many mother and father pointers we can find in kid
DOKU: https://www.diw.de/documents/publikationen/73/diw_01.c.789369.de/diw_ssp0857.pdf
now we get the parental pointers and match our pupils with the possible mothers and fathers from the pointer source dataset
pointer_mothers <- source_data %>% select_pointers(pointer = "mother", pointer_id_name = pointer_mother_id, source = source_param)
pot_children_source_mothers <- pot_children %>% match_child_pointer(pointer = "mother", source = source_param, pointer_data = pointer_mothers)
pointer_fathers <- source_data %>% select_pointers(pointer = "father", pointer_id_name = pointer_father_id, source = source_param)
pot_children_source_fathers <- pot_children %>% match_child_pointer(pointer = "father", source = source_param, pointer_data = pointer_fathers)
The maximum number of parents identified in this source dataset is 2 for mothers and 2 for fathers.
in this dataset, we define the relation between the pupil and the match depending on some characteristics of the source dataset. this part is unique to every source dataset maybe it would be cleaner to do this definition bit later on, not sure
new variables: child_match_rel, child_match_type_minor
keys: child_id, pointer, pointer_nr
# just matches, no missings
pot_children_source_matches <- pot_children_source_mothers %>% bind_rows(pot_children_source_fathers) %>%
# here we drop the pupils with non-dentified parents, because the will have NA's for the child_rel and child_match_type_minor
drop_na(pointer_id) %>%
mutate(child_match_rel = "child",
# in the kidlong data, only the mother can be consideres to be the biological mother, the father pointer only points towards the partner of the father
child_match_type_minor = "unknown") %>%
# select the variables that will go into the final dataset
select(child_id, pointer_id, pointer_nr, pointer, source, child_match_rel, child_match_type_minor)
test: do we have missings in the child-match variable?
testthat::expect_equal(sum(is.na(pot_children_source_matches$child_match_type_minor)), 0)
test: make sure the coding captured all possible outcomes of child-parent relationships
testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_type_minor == "mistake")), 0)
now we add all the gained information on to our main dataset pot_children
new variables: child_match_rel, child_match_type_minor
keys: child_id, source, pointer, pointer_nr
# data specific for the source
pot_children_source_parents <- pot_children %>%
filter(source == !!source_param) %>%
tidylog::left_join(pot_children_source_matches, by = c("child_id", "pointer", "source", "pointer_nr"))
# overall data that is being populated throughout the script
pot_children_parents[pot_children_parents$source == source_param, ] <- pot_children_source_parents
test: did we replace the pot_children_parents data with the same amount of rows or more if there are multiple parents for one child?
# It should just not me less (expect_lte : less than or equal)
testthat::expect_lte(nrow(pot_children_parents[pot_children_parents$source == source_param, ]), nrow(pot_children_source_parents))
summary_parents <- pot_children_parents %>%
group_by(child_id, pointer, source) %>%
summarize(n_avail = sum(!is.na(pointer_id))) %>%
mutate(flag_avail = case_when(n_avail >= 1 ~ TRUE,
n_avail == 0 ~ FALSE,
TRUE ~ NA))
summary_source_parents <- summary_parents %>%
filter(source == !!source_param)
Let’s see how many parents we found in this specific source dataset
print <- summary_source_parents %>% tabyl(pointer, flag_avail)
print_mother <- print$'TRUE'[print$pointer == "mother"]
print_father <- print$'TRUE'[print$pointer == "father"]
print %>% adorn_title()
for 335 we have at least one mother for 298 we have at least one father
Note: even though there is no identified father, there could still be an identified mother for the same child (this doesn’t come across in this table). Lets look at the availablity for mothers and fathers per child
And how many mothers and fathers per child?
The first table shows how many parents per child
print <- summary_source_parents %>% summary_by_child(kind = "specific") %>% tabyl(n_father, n_mother)
print %>% adorn_title()
The second table shows for how many children, there is at least one father and/or mother (Note: the two tables are identical, if every child only has at most one identified parent)
summary_source_parents %>% summary_by_child(kind = "specific", values_from = "flag_avail", names_prefix = "avail_") %>% tabyl(avail_father, avail_mother) %>% adorn_title()
We can see that for 2205 cases, there is no corresponding child_id in the source data and therefore also no parental ID’s
Let’s take a look and compare all the source datasets
For the sources that we already used, we can now see the number of identified parents
summary_parents %>% tabyl(source, n_avail, pointer) %>% adorn_title()
## $father
## n_avail
## source 0 1 2
## bioage 2535 6 0
## biobirth 2211 330 0
## bioparen 2335 206 0
## hhh 2541 0 0
## kid 2243 292 6
##
## $mother
## n_avail
## source 0 1 2
## bioage 2533 8 0
## biobirth 2127 414 0
## bioparen 2286 255 0
## hhh 2541 0 0
## kid 2206 334 1
For the sources that we already used, we can now see whether at least one mother or father was identified
summary_parents %>% tabyl(source, flag_avail, pointer) %>% adorn_title()
## $father
## flag_avail
## source FALSE TRUE
## bioage 2535 6
## biobirth 2211 330
## bioparen 2335 206
## hhh 2541 0
## kid 2243 298
##
## $mother
## flag_avail
## source FALSE TRUE
## bioage 2533 8
## biobirth 2127 414
## bioparen 2286 255
## hhh 2541 0
## kid 2206 335
How many mothers were identified in which source data?
summary_parents %>%
filter(pointer == "mother") %>% summary_by_child(kind = "all") %>%
count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))
How many fathers were identified in which source data?
summary_parents %>%
filter(pointer == "father") %>% summary_by_child(kind = "all") %>%
count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))
Last but not least lets look at a summary of the whole partial thing for the kid source dataset
pot_children_source_parents %>%
# drop_na(pointer_id) %>%
group_by(pointer, pointer_nr) %>%
skim()
Name | Piped data |
Number of rows | 15246 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
character | 3 |
numeric | 2 |
________________________ | |
Group variables | pointer, pointer_nr |
Variable type: character
skim_variable | pointer | pointer_nr | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|---|---|
source | father | 1 | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
source | father | 2 | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
source | father | 3 | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
source | mother | 1 | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
source | mother | 2 | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
source | mother | 3 | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
child_match_rel | father | 1 | 2243 | 0.12 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | father | 2 | 2535 | 0.00 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | father | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_rel | mother | 1 | 2206 | 0.13 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | mother | 2 | 2540 | 0.00 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | mother | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | father | 1 | 2243 | 0.12 | 7 | 7 | 0 | 1 | 0 |
child_match_type_minor | father | 2 | 2535 | 0.00 | 7 | 7 | 0 | 1 | 0 |
child_match_type_minor | father | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | mother | 1 | 2206 | 0.13 | 7 | 7 | 0 | 1 | 0 |
child_match_type_minor | mother | 2 | 2540 | 0.00 | 7 | 7 | 0 | 1 | 0 |
child_match_type_minor | mother | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
Variable type: numeric
skim_variable | pointer | pointer_nr | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|---|---|
child_id | father | 1 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | father | 2 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | father | 3 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 1 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 2 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 3 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
pointer_id | father | 1 | 2243 | 0.12 | 21439650 | 16842689 | 1161603 | 8660901 | 9597201 | 40623301 | 41723901 | ▇▂▁▁▇ |
pointer_id | father | 2 | 2535 | 0.00 | 9554890 | 15650210 | 1395206 | 1571107 | 2139808 | 7577056 | 40883505 | ▇▂▁▁▂ |
pointer_id | father | 3 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | mother | 1 | 2206 | 0.13 | 22247840 | 16880717 | 1233702 | 8731652 | 9620301 | 40900651 | 41723902 | ▇▂▁▁▇ |
pointer_id | mother | 2 | 2540 | 0.00 | 9747705 | NA | 9747705 | 9747705 | 9747705 | 9747705 | 9747705 | ▁▁▇▁▁ |
pointer_id | mother | 3 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA |
# pot_children_parents1 %>% count("mother available" = !is.na(mother_id), "father avaiable" = !is.na(father_id)) %>% arrange(desc(n))
# before we start: define source to be used in the code for this section (this source dataset)
source_param <- "hhh"
source_mother_data <- hhh_mothers
source_father_data <- hhh_fathers
pointer_mother_id <- "mother_id"
pointer_father_id <- "father_id"
additional_vars <- c("child_stell", "child_stell_l") # needed to define child_match_type-minor later on
goal: let’s see how many mother and father pointers we can find in hhh now we identify parents via the children’s relation to the head of hh
now we get the parental pointers and match our pupils with the possible mothers and fathers from the pointer source dataset
pointer_mothers <- source_mother_data %>% select_pointers(pointer = "mother",
pointer_id_name = pointer_mother_id,
source = source_param,
all_of(additional_vars))
pot_children_source_mothers <- pot_children %>% match_child_pointer(pointer = "mother", source = source_param, pointer_data = pointer_mothers)
pointer_fathers <- source_father_data %>% select_pointers(pointer = "father", pointer_id_name = pointer_father_id, source = source_param, additional_vars)
pot_children_source_fathers <- pot_children %>% match_child_pointer(pointer = "father", source = source_param, pointer_data = pointer_fathers)
The maximum number of parents identified in this source dataset is 2 for mothers and 1 for fathers.
in this dataset, we define the relation between the pupil and the match depending on some characteristics of the source dataset.
this part is unique to every source dataset maybe it would be cleaner to do this definition bit later on, not sure
new variables: child_match_rel, child_match_type_minor
keys: child_id, pointer, pointer_nr
# just matches, no missings
pot_children_source_matches <- pot_children_source_mothers %>% bind_rows(pot_children_source_fathers) %>%
# here we drop the pupils with non-dentified parents, because they will have NA's for the child_rel and child_match_type_minor
drop_na(pointer_id) %>%
# the matched id's are from the head of hh. via the rel of the child to the head of hh, we can get the relation between the child and the parent who is head of hh
mutate(child_match_rel = case_when(child_stell %in% c(20:24) ~ "child",
child_stell == 25 ~ "grandchild",
child_stell == 27 ~ "child", # [27] Schwsohn,-tocher (Ehe-/LPartner v.Kind)
TRUE ~ "mistake"),
child_match_type_minor = case_when(child_stell == 20 ~ "genetic_label", # 20 = those are possibly adoptive and genetic children (until 2011)
child_stell == 21 ~ "genetic_known", # 21 = genetic children
child_stell == 25 ~ "genetic_known", # 25 = grandchildren
child_stell == 27 ~ "social_known", # [27] Schwsohn,-tocher (Ehe-/LPartner v.Kind)
child_stell %in% c(22:24) ~ "social_known", # adoptive, foster and step children
TRUE ~ "mistake")
) %>%
# select the variables that will go into the final dataset
select(child_id, pointer_id, pointer_nr, pointer, source, child_match_rel, child_match_type_minor)
test: do we have missings in the child-match variable?
testthat::expect_equal(sum(is.na(pot_children_source_matches$child_match_type_minor)), 0)
test: make sure the coding captured all possible outcomes of child-parent relationships
testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_type_minor == "mistake")), 0)
testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_rel == "mistake")), 0)
now we add all the gained information on to our main dataset pot_children
new variables: child_match_rel, child_match_type_minor
keys: child_id, source, pointer, pointer_nr
# data specific for the source
pot_children_source_parents <- pot_children %>%
filter(source == !!source_param) %>%
tidylog::left_join(pot_children_source_matches, by = c("child_id", "pointer", "source", "pointer_nr"))
# overall data that is being populated throughout the script
pot_children_parents[pot_children_parents$source == source_param, ] <- pot_children_source_parents
test: did we replace the pot_children_parents data with the same amount of rows or more if there are multiple parents for one child?
# It should just not me less (expect_lte : less than or equal)
testthat::expect_lte(nrow(pot_children_parents[pot_children_parents$source == source_param, ]), nrow(pot_children_source_parents))
summary_parents <- pot_children_parents %>%
group_by(child_id, pointer, source) %>%
summarize(n_avail = sum(!is.na(pointer_id))) %>%
mutate(flag_avail = case_when(n_avail >= 1 ~ TRUE,
n_avail == 0 ~ FALSE,
TRUE ~ NA))
summary_source_parents <- summary_parents %>%
filter(source == !!source_param)
Let’s see how many parents we found in this specific source dataset
print <- summary_source_parents %>% tabyl(pointer, flag_avail)
print_mother <- print$'TRUE'[print$pointer == "mother"]
print_father <- print$'TRUE'[print$pointer == "father"]
print %>% adorn_title()
for 235 we have at least one mother for 254 we have at least one father
Note: even though there is no identified father, there could still be an identified mother for the same child (this doesn’t come across in this table). Lets look at the availablity for mothers and fathers per child
And how many mothers and fathers per child?
The first table shows how many parents per child
print <- summary_source_parents %>% summary_by_child(kind = "specific") %>% tabyl(n_father, n_mother)
print %>% adorn_title()
The second table shows for how many children, there is at least one father and/or mother (Note: the two tables are identical, if every child only has at most one identified parent)
summary_source_parents %>% summary_by_child(kind = "specific", values_from = "flag_avail", names_prefix = "avail_") %>% tabyl(avail_father, avail_mother) %>% adorn_title()
We can see that for 2079 cases, there is no corresponding child_id in the source data and therefore also no parental ID’s
Let’s take a look and compare all the source datasets
For the sources that we already used, we can now see the number of identified parents
summary_parents %>% tabyl(source, n_avail, pointer) %>% adorn_title()
## $father
## n_avail
## source 0 1 2
## bioage 2535 6 0
## biobirth 2211 330 0
## bioparen 2335 206 0
## hhh 2287 254 0
## kid 2243 292 6
##
## $mother
## n_avail
## source 0 1 2
## bioage 2533 8 0
## biobirth 2127 414 0
## bioparen 2286 255 0
## hhh 2306 234 1
## kid 2206 334 1
For the sources that we already used, we can now see whether at least one mother or father was identified
summary_parents %>% tabyl(source, flag_avail, pointer) %>% adorn_title()
## $father
## flag_avail
## source FALSE TRUE
## bioage 2535 6
## biobirth 2211 330
## bioparen 2335 206
## hhh 2287 254
## kid 2243 298
##
## $mother
## flag_avail
## source FALSE TRUE
## bioage 2533 8
## biobirth 2127 414
## bioparen 2286 255
## hhh 2306 235
## kid 2206 335
How many mothers were identified in which source data?
summary_parents %>%
filter(pointer == "mother") %>% summary_by_child(kind = "all") %>%
count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))
How many fathers were identified in which source data?
summary_parents %>%
filter(pointer == "father") %>% summary_by_child(kind = "all") %>%
count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))
Last but not least lets look at a summary of the whole partial thing for the hhh source dataset
pot_children_source_parents %>%
# drop_na(pointer_id) %>%
group_by(pointer, pointer_nr) %>%
skim()
Name | Piped data |
Number of rows | 15246 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
character | 3 |
numeric | 2 |
________________________ | |
Group variables | pointer, pointer_nr |
Variable type: character
skim_variable | pointer | pointer_nr | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|---|---|
source | father | 1 | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
source | father | 2 | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
source | father | 3 | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
source | mother | 1 | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
source | mother | 2 | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
source | mother | 3 | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
child_match_rel | father | 1 | 2287 | 0.10 | 5 | 10 | 0 | 2 | 0 |
child_match_rel | father | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_rel | father | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_rel | mother | 1 | 2306 | 0.09 | 5 | 10 | 0 | 2 | 0 |
child_match_rel | mother | 2 | 2540 | 0.00 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | mother | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | father | 1 | 2287 | 0.10 | 12 | 13 | 0 | 3 | 0 |
child_match_type_minor | father | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | father | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | mother | 1 | 2306 | 0.09 | 12 | 13 | 0 | 3 | 0 |
child_match_type_minor | mother | 2 | 2540 | 0.00 | 13 | 13 | 0 | 1 | 0 |
child_match_type_minor | mother | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
Variable type: numeric
skim_variable | pointer | pointer_nr | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|---|---|
child_id | father | 1 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | father | 2 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | father | 3 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 1 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 2 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 3 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
pointer_id | father | 1 | 2287 | 0.10 | 22393198 | 16998811 | 2017501 | 8753226 | 9661101 | 40703926 | 41723901 | ▇▁▁▁▆ |
pointer_id | father | 2 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | father | 3 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | mother | 1 | 2306 | 0.09 | 22334190 | 16545633 | 1395202 | 8776001 | 9620301 | 40940951 | 41692401 | ▇▂▁▁▇ |
pointer_id | mother | 2 | 2540 | 0.00 | 40690802 | NA | 40690802 | 40690802 | 40690802 | 40690802 | 40690802 | ▁▁▇▁▁ |
pointer_id | mother | 3 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA |
# pot_children_parents1 %>% count("mother available" = !is.na(mother_id), "father avaiable" = !is.na(father_id)) %>% arrange(desc(n))
lets look at the output
keys: child_id, source, pointer, pointer_nr (those should have no missings)
pot_children_parents %>%
filter(source == "biobirth") %>%
group_by(pointer, pointer_nr) %>%
skim
Name | Piped data |
Number of rows | 15246 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
character | 3 |
numeric | 2 |
________________________ | |
Group variables | pointer, pointer_nr |
Variable type: character
skim_variable | pointer | pointer_nr | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|---|---|
source | father | 1 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | father | 2 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | father | 3 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | mother | 1 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | mother | 2 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
source | mother | 3 | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
child_match_rel | father | 1 | 2211 | 0.13 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | father | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_rel | father | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_rel | mother | 1 | 2127 | 0.16 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | mother | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_rel | mother | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | father | 1 | 2211 | 0.13 | 13 | 13 | 0 | 1 | 0 |
child_match_type_minor | father | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | father | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | mother | 1 | 2127 | 0.16 | 13 | 13 | 0 | 1 | 0 |
child_match_type_minor | mother | 2 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
child_match_type_minor | mother | 3 | 2541 | 0.00 | NA | NA | 0 | 0 | 0 |
Variable type: numeric
skim_variable | pointer | pointer_nr | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|---|---|
child_id | father | 1 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | father | 2 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | father | 3 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 1 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 2 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
child_id | mother | 3 | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
pointer_id | father | 1 | 2211 | 0.13 | 21684752 | 16540273 | 1161603 | 8766326 | 9591301 | 40682101 | 41723901 | ▇▃▁▁▇ |
pointer_id | father | 2 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | father | 3 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | mother | 1 | 2127 | 0.16 | 22146233 | 16679840 | 1233702 | 8766327 | 9608802 | 40882151 | 41723902 | ▇▂▁▁▇ |
pointer_id | mother | 2 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
pointer_id | mother | 3 | 2541 | 0.00 | NaN | NA | NA | NA | NA | NA | NA |
export(pot_children_parents, file = here::here("data/pot_children_parents.rds"))
it is time to look at the quality of the collected parental pointers and whether they all agree when it comes to the relationship between the child and the parental pointer
here, we check whether the child_match_types agree if there are multiple child_parent_pairs (from different sources)
pot_children_parents_pointers <- pot_children_parents %>%
drop_na(pointer_id)
child_parent_sources_pointers_wide <- pot_children_parents_pointers %>%
select(child_id, pointer_id, pointer, source, child_match_type_minor) %>%
pivot_wider(names_from = source, values_from = child_match_type_minor, names_prefix = "rel_") %>%
arrange(child_id, pointer)
child_parent_sources_pointers_wide %>%
count(rel_biobirth, rel_bioparen, rel_bioage, rel_hhh, rel_kid) %>%
arrange(desc(n)) %>% head(7)
n = 39
child_parent_sources_pointers_wide %>%
count(rel_biobirth, rel_bioage, rel_hhh, rel_kid) %>%
filter(rel_biobirth == "genetic_known" & (rel_bioage == "social_known" | rel_hhh == "social_known") |
(rel_hhh == "genetic_label" & rel_bioage == "social_known")|
(rel_hhh == "genetic_label" & rel_kid == "unknown" & is.na(rel_biobirth))) %>% arrange(n)
there are no ambivalent cases, only 2 cases where the relationship remains unknown
While in this project there are no conflict, they can and have appear in with other samples. Therefore, the next chunk marks and resolves conflicts. It is a code chunk that might have to be adapted by hand
sources_crossvalid_merge <- child_parent_sources_pointers_wide %>%
# filter cases that conflict
filter(rel_biobirth == "genetic_known" & (rel_bioage == "social_known" | rel_hhh == "social_known") |
(rel_hhh == "genetic_label" & rel_bioage == "social_known")) %>%
# label conflictes and resolve them in merge_conflict
mutate(sources_crossvalid = case_when(# conflicts
rel_bioage == "social_known" & rel_hhh == "genetic_known" ~ "bioage hhh conflict",
rel_bioage == "genetic_known" & rel_hhh == "social_known" ~ "bioage hhh conflict",
# possible mistakes in biobirth
rel_biobirth == "genetic_known" & rel_hhh == "social_known" ~ "hhh over biobirth",
rel_biobirth == "genetic_known" & rel_bioage == "social_known" ~ "bioage over biobirth",
# probably adoptive children
rel_bioage == "social_known" & rel_hhh == "genetic_label" ~ "bioage_social over hhh_genetic_label",
TRUE ~ "needs description"
),
merge_conflict = case_when(rel_bioage == "genetic_known" & rel_hhh == "social_known" ~ "unknown",
rel_bioage == "social_known" & rel_hhh == "genetic_known" ~ "unknown",
rel_biobirth == "genetic_known" & rel_bioage == "social_known" ~ rel_bioage,
rel_biobirth == "genetic_known" & rel_hhh == "social_known" ~ rel_hhh,
rel_bioage == "social_known" & rel_hhh == "genetic_label" ~ rel_bioage,
TRUE ~ "conflict"
)
)
sources_crossvalid_merge <- sources_crossvalid_merge %>%
select(child_id, pointer_id, pointer, sources_crossvalid, merge_conflict) %>%
tidylog::distinct(child_id, pointer_id, .keep_all = T)
child_parent_sources_pointers_correction <- pot_children_parents_pointers %>%
tidylog::left_join(sources_crossvalid_merge, by = c("child_id", "pointer_id", "pointer")) %>%
# update child_match_type_minor
mutate(child_match_type_minor_c = case_when(is.na(sources_crossvalid) ~ child_match_type_minor,
!is.na(sources_crossvalid) ~ merge_conflict,
TRUE ~ "need specification"
)) %>%
# update source
mutate(source_c = case_when(is.na(sources_crossvalid) ~ source,
sources_crossvalid == "bioage over biobirth" ~ "bioage",
sources_crossvalid == "bioage hhh conflict" ~ source,
sources_crossvalid == "hhh over biobirth" ~ "hhh",
sources_crossvalid == "biobirth_mistake" ~ "hhh",
sources_crossvalid == "bioage_social over hhh_genetic_label" ~ "bioage",
TRUE ~ "need specification"
))
did we define solutions for the problems?
testthat::expect_equal(child_parent_sources_pointers_correction %>% filter(child_match_type_minor_c == "need specification") %>% nrow(), 0)
testthat::expect_equal(child_parent_sources_pointers_correction %>% filter(source_c == "need specification") %>% nrow(), 0)
child_parent_sources_pointers_correction %>% tabyl(source, source_c) %>% adorn_title()
# child_parent_sources_pointers_correction %>% tabyl(source_c, child_match_type_minor_c) %>% adorn_title()
# child_parent_sources_pointers_correction %>% tabyl(child_match_type_minor, child_match_type_minor_c) %>% adorn_title()
the table above let’s us know that there were no conflicts.
now we replace the child_match_type_minor variable with the corrected values
child_parent_sources_pointers_corrected <- child_parent_sources_pointers_correction %>%
# now that we made sure, we defined rules for all conflicts, we can overwrite the old variables
select(-merge_conflict, -child_match_type_minor, -source) %>%
select(child_match_type_minor = child_match_type_minor_c,
source = source_c,
everything()) %>%
select(-starts_with("select_")) %>%
# get rid of the now double child_match_type_minors
tidylog::distinct(child_id, pointer_id, source, child_match_type_minor, .keep_all = T) %>%
# set preferred order for selecting genetic parents based on child_match_type_minor
mutate(select_source = case_when(source == "biobirth" ~ 1,
source == "bioage" ~ 2,
source == "kid" ~ 3,
source == "hhh" ~ 4,
source == "bioparen" ~ 5,
TRUE ~ 0),
select_match = case_when(child_match_type_minor == "unknown" ~ 4,
child_match_type_minor == "social_known" ~ 3,
child_match_type_minor == "genetic_label" ~ 2,
child_match_type_minor == "genetic_known" ~ 1,
TRUE ~ 0
)) %>%
unite(select_match_source, select_match, select_source, remove = F, sep = ".") %>%
unite(select_match_source_l, child_match_type_minor, source, remove = F, sep = ".") %>%
arrange(child_id, pointer_id, select_match, select_source)
# 0 would mean there are unspecified sources or rel-types which is not supposed to happen
testthat::expect_equal(child_parent_sources_pointers_corrected %>% filter(select_source == 0) %>% nrow(), 0)
testthat::expect_equal(child_parent_sources_pointers_corrected %>% filter(select_match == 0) %>% nrow(), 0)
child_parent_sources_pointers_corrected %>% drop_na(sources_crossvalid) %>% distinct(child_id, .keep_all = T) %>% tabyl(sources_crossvalid) %>% adorn_totals
goal: now that we have pointer_id’s from multiple sources, we need to decide which id’s are the best.
genetic parents: For each child we want the genetic_known parents, if available, and if not, we want the “genetic_label” parents and if that is not possible we want the “identified but unknown” parents. But we also just want to pick one, even if the same pointer and match type is found in two source datasets.
The problem is that if there are multiple sources with the same ID, then one child_parent pair still gets multiple values. but we just want to keep the best source. (e.g. biobirth and kidlong produce the same pointer_id. but one match_type_minor is “genetic_known” and the other is “unknown”. then we want to keep “genetic_known” and drop “unknown”.
How to solve this? Probably merge it piece by piece? But that only comes at a later stage
we start out with a dataset that includes all children we want to select parents for, each child has two rows, one for the father pointer, one for the mother.
child_parent_base <- pot_children %>%
# leave out source so that its just up to three mothers and fathers per child
tidylog::distinct(child_id, pointer, pointer_nr)
export(child_parent_base, file = here::here("data/child_parent_base.rds"))
parent datasets: children, child_parent_sources_pointers_corrected
keys: child_id, pointer, pointer_nr
genetic_child_parents <- child_parent_sources_pointers_corrected %>%
# for now we dont want grandparents
filter(child_match_rel == "child") %>%
# the arrange ensures that we only keep our "best" match (e.g. genetic_known over genetic_label)
arrange(child_id, pointer_id, select_match, select_source) %>%
tidylog::distinct(child_id, pointer_id, .keep_all = T) %>%
# only keep genetics and unknown (but only those without a conflict? for now all)
tidylog::filter(child_match_type_minor != "social_known")
# merge selected genetic parents to children dataset
child_genetic_parents_multi <- child_parent_base %>%
tidylog::left_join(genetic_child_parents %>%
# even though it shouldnt be the case we assume that there might be some double parents
group_by(child_id, pointer) %>%
mutate(pointer_nr = row_number()),
by = c("child_id", "pointer", "pointer_nr")) %>%
mutate(child_match_type_major = "genetic")
quick check: how many parents found?
child_genetic_parents_multi %>%
drop_na(pointer_id) %>% tidylog::distinct(child_id, pointer_id, .keep_all = T) %>%
tabyl(child_match_type_minor, pointer_nr, pointer)
## $father
## child_match_type_minor 1 2
## genetic_known 370 4
## genetic_label 1 0
## unknown 13 2
##
## $mother
## child_match_type_minor 1 2
## genetic_known 439 0
## genetic_label 2 0
## unknown 9 2
multi_genetic_parents_wide1 <- child_genetic_parents_multi %>%
drop_na(pointer_id) %>%
group_by(child_id, pointer) %>% add_tally() %>% ungroup() %>%
filter(n > 1) %>%
select(child_id, pointer, pointer_id, type.source = select_match_source_l, pointer_nr) %>%
pivot_wider(names_from = pointer_nr, values_from = c("type.source", "pointer_id"), names_prefix = "pointer") %>%
select(child_id, pointer, contains("pointer"))
How many mothers, how many fathers?
multi_genetic_parents_wide1 %>%
distinct(child_id, pointer, .keep_all = T) %>%
tabyl(pointer)
mostly fathers
what are the most common combinations?
# multi_genetic_parents_wide %>% View()
multi_genetic_parents_wide1 %>%
count(pointer, type.source_pointer1, type.source_pointer2) %>%
arrange(desc(n))
the most conflicts stem from biobirth is “genetic_known” and kid is “unknown” (but for a different possible parent)
we do another reshape to get the separate type and source
multi_genetic_parents_wide2 <- child_genetic_parents_multi %>%
drop_na(pointer_id) %>%
group_by(child_id, pointer) %>% add_tally() %>% ungroup() %>%
filter(n > 1) %>%
select(child_id, pointer, pointer_id, type = child_match_type_minor, source, pointer_nr) %>%
pivot_wider(names_from = pointer_nr, values_from = c("type", "source", "pointer_id"), names_prefix = "pointer") %>%
select(child_id, pointer, contains("pointer"))
multi_genetic_parents_wide2 %>% tabyl(type_pointer1, type_pointer2) %>% adorn_title
# multi_genetic_parents_wide %>% View()
multi_genetic_parents_wide2 %>%
count(type_pointer1, type_pointer2) %>%
arrange(n)
multigenetic_merge_conflict <- multi_genetic_parents_wide2 %>%
mutate(genetic_conflict = case_when(type_pointer1 == "genetic_known" & type_pointer2 == "genetic_known" ~ "multi known",
type_pointer1 == "genetic_known" & type_pointer2 == "unknown" ~ "genknown over unknown",
type_pointer1 == "unknown" & type_pointer2 == "genetic_known" ~ "genknown over unknown",
TRUE ~ "needs description"
),
merge_conflict = case_when(type_pointer1 == "genetic_known" & type_pointer2 == "genetic_known" ~ NA_real_,
type_pointer1 == "genetic_known" & type_pointer2 == "unknown" ~ pointer_id_pointer1,
type_pointer1 == "unknown" & type_pointer2 == "genetic_known" ~ pointer_id_pointer2,
TRUE ~ 0
))
make sure we resolved all issues
testthat::expect_equal(multigenetic_merge_conflict %>% filter(genetic_conflict == "need description") %>% nrow(), 0)
testthat::expect_equal(multigenetic_merge_conflict %>% filter(merge_conflict == 0) %>% nrow(), 0)
merge_conflict <- multigenetic_merge_conflict %>%
select(child_id, pointer, genetic_conflict, merge_conflict) %>%
tidylog::distinct(child_id, pointer, .keep_all = T)
child_genetic_parents_corrected <- child_genetic_parents_multi %>%
tidylog::left_join(merge_conflict, by = c("child_id", "pointer")) %>%
mutate(select_bestchoice = case_when(is.na(pointer_id) ~ "drop_me",
is.na(genetic_conflict) ~ "best_choice",
!is.na(genetic_conflict) & pointer_id == merge_conflict ~ "best_choice",
!is.na(genetic_conflict) & pointer_id != merge_conflict & !is.na(pointer_id) ~ "drop_me",
genetic_conflict == "multi unknown" ~ "drop_me",
genetic_conflict == "multi known" ~ "drop_me",
TRUE ~ "need specification"
)) %>%
select(-merge_conflict)
# child_genetic_parents_corrected %>% filter(select_bestchoice == "need specification") %>% View()
did we define solutions for the problems?
testthat::expect_equal(child_genetic_parents_corrected %>% filter(select_bestchoice == "need specification") %>% nrow(), 0)
now we replace the child_match_type_minor variable with the corrected values
child_genetic_parents_selected <- child_genetic_parents_corrected %>%
drop_na(pointer_id) %>%
tidylog::filter(select_bestchoice == "best_choice") %>%
select(child_id, pointer, pointer_id, child_match_type_major, child_match_type_minor, child_match_rel, source, sources_crossvalid, genetic_conflict)
quick check: any children who are also their parents?
testthat::expect_equal(child_genetic_parents_selected %>% filter(child_id == pointer_id) %>% nrow(), 0)
test whether we have more than one parent for any child
testthat::expect_lte(child_genetic_parents_selected %>%
# the drop_na should not remove any rows
drop_na(pointer_id) %>%
group_by(child_id, pointer_id) %>%
summarize(max_parents = max(row_number())) %>%
# get max number of max parents from all children
ungroup() %>%
summarize(overall_max = max(max_parents)) %>% pull(overall_max), 1)
save data
export(child_genetic_parents_selected, file = here::here("data/child_genetic_parents_selected.rds"))
testthat::expect_equal(child_genetic_parents_selected %>% semi_join(child_social_parents_selected, by = c("child_id", "pointer_id")) %>% nrow(), 0)
# child_parent_sources_pointers_corrected %>% semi_join(child_social_parents_selected, by = c("child_id", "pointer_id")) %>% tabyl(child_match_type_minor)
keys: child_id, pointer_id
# child_social_parents <- import(file = here::here("data/child_social_parents.rds"))
# child_genetic_parents_multi <- import(file = here::here("data/child_genetic_parents_multi.rds"))
children_all_parents_selected <- child_genetic_parents_selected %>% bind_rows(child_social_parents_selected) %>%
# this distinct should not remove any rows
tidylog::distinct(child_id, pointer_id, .keep_all = T) %>%
add_count(child_id, pointer, name = "pointer_nr") %>%
select(child_id, pointer, pointer_nr, pointer_id, child_match_type_major, child_match_type_minor, everything()) %>%
arrange(child_id, pointer, child_match_type_major)
children_all_source_parents <- child_parent_base %>%
left_join(children_all_parents_selected, by = c("child_id","pointer", "pointer_nr"))
export(children_all_source_parents, file = here::here("data/children_all_source_parents.rds"))
check out the children with more than one parent
children_parents_wide <- children_all_source_parents %>%
drop_na(pointer_id) %>%
group_by(child_id, pointer) %>%
mutate(pointer_nr = row_number()) %>%
add_tally() %>% ungroup() %>%
filter(n > 1) %>%
select(child_id, pointer, pointer_id, type.minor = child_match_type_minor, pointer_nr) %>%
pivot_wider(names_from = pointer_nr, values_from = c("type.minor", "pointer_id"), names_prefix = "pointer") %>%
select(child_id, pointer, contains("pointer"))
children_parents_wide %>% tabyl(type.minor_pointer1, type.minor_pointer2, pointer) %>% adorn_title
## $father
## type.minor_pointer2
## type.minor_pointer1 social_known
## unknown 1
for those children where we have multiple pointers, it is only genetic and social pointers which is possible and no conflict
children_all_source_parents %>% drop_na(pointer_id) %>% group_by(pointer, child_match_type_major) %>% skim
Name | Piped data |
Number of rows | 845 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
character | 5 |
numeric | 3 |
________________________ | |
Group variables | pointer, child_match_type_major |
Variable type: character
skim_variable | pointer | child_match_type_major | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|---|---|
child_match_type_minor | father | genetic | 0 | 1.00 | 7 | 13 | 0 | 3 | 0 |
child_match_type_minor | father | social | 0 | 1.00 | 12 | 12 | 0 | 1 | 0 |
child_match_type_minor | mother | genetic | 0 | 1.00 | 7 | 13 | 0 | 3 | 0 |
child_match_type_minor | mother | social | 0 | 1.00 | 12 | 12 | 0 | 1 | 0 |
child_match_rel | father | genetic | 0 | 1.00 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | father | social | 0 | 1.00 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | mother | genetic | 0 | 1.00 | 5 | 5 | 0 | 1 | 0 |
child_match_rel | mother | social | 0 | 1.00 | 5 | 5 | 0 | 1 | 0 |
source | father | genetic | 0 | 1.00 | 3 | 8 | 0 | 4 | 0 |
source | father | social | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
source | mother | genetic | 0 | 1.00 | 3 | 8 | 0 | 4 | 0 |
source | mother | social | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
sources_crossvalid | father | genetic | 382 | 0.00 | NA | NA | 0 | 0 | 0 |
sources_crossvalid | father | social | 8 | 0.00 | NA | NA | 0 | 0 | 0 |
sources_crossvalid | mother | genetic | 450 | 0.00 | NA | NA | 0 | 0 | 0 |
sources_crossvalid | mother | social | 5 | 0.00 | NA | NA | 0 | 0 | 0 |
genetic_conflict | father | genetic | 378 | 0.01 | 21 | 21 | 0 | 1 | 0 |
genetic_conflict | father | social | 8 | 0.00 | NA | NA | 0 | 0 | 0 |
genetic_conflict | mother | genetic | 448 | 0.00 | 21 | 21 | 0 | 1 | 0 |
genetic_conflict | mother | social | 5 | 0.00 | NA | NA | 0 | 0 | 0 |
Variable type: numeric
skim_variable | pointer | child_match_type_major | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|---|---|
child_id | father | genetic | 0 | 1 | 21441213.6 | 16682436.61 | 1161602 | 8709278 | 9556303 | 40623305 | 41723904 | ▇▃▁▁▇ |
child_id | father | social | 0 | 1 | 27213716.4 | 19283109.73 | 1342705 | 7612029 | 40561503 | 41037504 | 41677504 | ▃▂▁▁▇ |
child_id | mother | genetic | 0 | 1 | 22301160.4 | 16713312.84 | 1161602 | 8767729 | 9608704 | 40874804 | 41723904 | ▇▂▁▁▇ |
child_id | mother | social | 0 | 1 | 13700563.6 | 15183249.61 | 1395204 | 8618804 | 8645003 | 9633305 | 40210502 | ▇▂▁▁▂ |
pointer_nr | father | genetic | 0 | 1 | 1.0 | 0.05 | 1 | 1 | 1 | 1 | 2 | ▇▁▁▁▁ |
pointer_nr | father | social | 0 | 1 | 1.1 | 0.35 | 1 | 1 | 1 | 1 | 2 | ▇▁▁▁▁ |
pointer_nr | mother | genetic | 0 | 1 | 1.0 | 0.00 | 1 | 1 | 1 | 1 | 1 | ▁▁▇▁▁ |
pointer_nr | mother | social | 0 | 1 | 1.0 | 0.00 | 1 | 1 | 1 | 1 | 1 | ▁▁▇▁▁ |
pointer_id | father | genetic | 0 | 1 | 21439090.6 | 16681114.46 | 1161603 | 8701052 | 9573351 | 40623301 | 41723901 | ▇▃▁▁▇ |
pointer_id | father | social | 0 | 1 | 27403439.1 | 18993524.99 | 2098804 | 7802452 | 40561501 | 41037501 | 41677501 | ▅▁▁▁▇ |
pointer_id | mother | genetic | 0 | 1 | 22362849.2 | 16735211.85 | 1233702 | 8766327 | 9620701 | 40882151 | 41723902 | ▇▂▁▁▇ |
pointer_id | mother | social | 0 | 1 | 13700561.2 | 15183250.27 | 1395202 | 8618801 | 8645001 | 9633301 | 40210501 | ▇▂▁▁▂ |
children_all_source_parents %>% tabyl(pointer, sources_crossvalid)
there are no conflicts that needed merging
children_all_source_parents %>%
drop_na(pointer_id) %>%
tabyl(child_match_type_minor, pointer, child_match_type_major)
## $genetic
## child_match_type_minor father mother
## genetic_known 370 439
## genetic_label 1 2
## social_known 0 0
## unknown 11 9
##
## $social
## child_match_type_minor father mother
## genetic_known 0 0
## genetic_label 0 0
## social_known 8 5
## unknown 0 0
children_all_source_parents %>%
drop_na(pointer_id) %>%
tabyl(child_match_type_minor, source, pointer)
## $father
## child_match_type_minor biobirth bioparen hhh kid
## genetic_known 329 27 14 0
## genetic_label 0 0 1 0
## social_known 0 0 8 0
## unknown 0 0 0 11
##
## $mother
## child_match_type_minor biobirth bioparen hhh kid
## genetic_known 414 16 9 0
## genetic_label 0 0 2 0
## social_known 0 0 5 0
## unknown 0 0 0 9
For how many children did we not identify any parent?
n = 25
children, we identified a father, but no mother, and for n = 84
we identified a mother but no fathern = 718
children, there are identifiers for both parents availablechildren_all_source_parents %>%
tidylog::distinct(child_id, pointer, pointer_id, .keep_all = T) %>%
add_count(child_id, "parent_avail" = !is.na(pointer_id)) %>%
tidylog::distinct(child_id, pointer, .keep_all = T) %>%
tabyl(pointer, parent_avail, n) %>% adorn_title()
## $`1`
## parent_avail
## pointer FALSE TRUE
## father 0 16
## mother 0 82
##
## $`2`
## parent_avail
## pointer FALSE TRUE
## father 2153 372
## mother 2086 372
##
## $`3`
## parent_avail
## pointer FALSE TRUE
## father 0 0
## mother 0 1
How many genetic parents were identified per child?
n = 2104
potential childrenchild_genetic_parent_summary <- child_parent_base %>%
# drop the pointer_nr 1:3
tidylog::distinct(child_id, pointer) %>%
left_join(child_genetic_parents_selected, by = c("child_id", "pointer")) %>%
# this distince should not remove any rown
tidylog::distinct(child_id, pointer, pointer_id, .keep_all = T) %>%
group_by(child_id, pointer) %>%
mutate(n_missing = sum(is.na(pointer_id)),
n_rows = max(row_number()),
n_parents_avail = n_rows - n_missing) %>%
ungroup()
child_genetic_parent_summary %>%
tabyl(pointer, n_parents_avail) %>%
adorn_totals("col") %>%
adorn_title
Who are the children with no identified parent?
child_genetic_parent_summary %>%
filter(n_parents_avail == 0) %>%
tidylog::left_join(pbrutto_long %>% select(syear, pid, stell_l), by = c("child_id" = "pid")) %>%
tidylog::distinct(child_id, pointer, .keep_all = T) %>%
tabyl(stell_l, pointer, show_missing_levels = F)
Let’s see wether we can identify parents via the hh-members and their relations with the hhh head.
social child-parent
parent datasets: children, child_parent_sources_pointers_corrected
keys: child_id, pointer, pointer_nr