Review: beforehand we have identified the potential children for our sample in ppfad_igene_long.

Goal: now it is time to find parental pointers from various SOEP datasets for the identified persons.

preparation: read data needed in the script

# data of children
ppfad_igene_long <-     import(here::here("data", "ppfad_igene_long.rds"))    # pupils 11-14 yrs

# data with parental pointers
biobirth_dict <-        import(here::here("data", "biobirth_dict_v36.rds")) 
kid_dict <-             import(here::here("data", "kid_dict_v36.rds")) 
bioparen_dict <-        import(here::here("data", "bioparen_dict_v36.rds")) 
bioage_mother_dict <-   import(here::here("data", "bioage_mother_dict_v36.rds")) # data for pupils only
bioage_father_dict <-   import(here::here("data", "bioage_father_dict_v36.rds")) # data for pupils only
hhh_fathers <-          import(here::here("data", "hhh_fathers_v36.rds"))
hhh_mothers <-          import(here::here("data", "hhh_mothers_v36.rds"))

# metadata
pbrutto_long <-         import(here::here("data/pbrutto_long_v36.rds"))

step1: ppfad_igene_long (igene sample)

goal: create dataset children_parents containing people from the igene sample and the identified parental pointers with the source and the relation we think the child has with the mother or father identified (social or genetic or unknown)

fist we build a dataset that includes all potential children and add all possible combinations for possible matches.

  • child_id: all pupils from the data
  • pointer: mother, father
  • source: biobirth, bioage, kid, hhh
  • pointer_nr: 1,2,3

from each of the four sources we will identify mothers and fathers and therefore we need one row for each child for all of the possible combinations. Over the course of this script, we will populate this dataset of chilren with the matched mothers and fathers and define the relation of the child to the identified person depending on the source. We allow for up to three pointers per child and pointer and source

# unique identifier keys: child_id, pointer, source
pot_children <- ppfad_igene_long %>% 
   # distinct only keeps one person per id and removes douplicates by survey year (the source data is in long format and has multiple observations over time per child)
   tidylog::distinct(pid) %>%
   # get a vector with child-id's 
   pull(pid) %>% 
   # make a dataset of all possible combinations
   expand_grid(child_id = ., 
               pointer = c("mother", "father"), 
               source = c("biobirth", "bioage", "bioparen", "kid", "hhh"),
               pointer_nr = c(1:3)) 

pot_children_parents <- pot_children %>% 
   # generate the variables that are oing to be populated
   mutate(pointer_id = NA_real_,
          child_match_rel = NA_character_,
          child_match_type_minor = NA_character_
          )

pot_children_parents: add BIOBIRTH pointers

# before we start: define source to be used in the code for this section (this source dataset)
source_param <- "biobirth"
source_data <- biobirth_dict
pointer_mother_id <- "mother_id" 
pointer_father_id <- "father_id" 

goal: let’s see how many mother and father pointers we can find in biobirth

Biobirth-Doku: https://www.diw.de/documents/publikationen/73/diw_01.c.787430.de/diw_ssp0839.pdf

pot_children_source_parents

  • parent datasets: biobirth
  • keys: child_id, pointer_id, pointer_nr

now we get the parental pointers and match our pupils with the possible mothers and fathers from the pointer source dataset

pointer_mothers <- source_data %>% select_pointers(pointer = "mother", pointer_id_name = pointer_mother_id, source = source_param)
pot_children_source_mothers <- pot_children %>% match_child_pointer(pointer = "mother", source = source_param, pointer_data = pointer_mothers)
pointer_fathers <- source_data %>% select_pointers(pointer = "father", pointer_id_name = pointer_father_id, source = source_param)
pot_children_source_fathers <- pot_children %>% match_child_pointer(pointer = "father", source = source_param, pointer_data = pointer_fathers)

The maximum number of parents identified in this source dataset is 1 for mothers and 1 for fathers.

pot_children_source_matches: add biobirth pointers

in this dataset, we define the relation between the pupil and the match depending on some characteristics of the source dataset. this part is unique to every source dataset maybe it would be cleaner to do this definition bit later on, not sure

  • parent datasets: pot_children_source_mothers, pot_children_source_fathers
  • new variables: child_match_rel, child_match_type_minor

  • keys: child_id, pointer, pointer_nr

# just matches, no missings
pot_children_source_matches <- 
      # combine matches for mothers and fathers 
      pot_children_source_mothers %>% bind_rows(pot_children_source_fathers) %>% 
      # here we drop the pupils with non-dentified parents, because the will have NA's for the child_rel and child_match_type_minor
      drop_na(pointer_id) %>% 
      mutate(child_match_rel = "child",
          child_match_type_minor = "genetic_known") %>% 
   # select the variables that will go into the final dataset
   select(child_id, pointer_id, pointer_nr, pointer, source, child_match_rel, child_match_type_minor)

test: do we have missings in the child-match variable?

testthat::expect_equal(sum(is.na(pot_children_source_matches$child_match_type_minor)), 0)

test: make sure the coding captured all possible outcomes of child-parent relationships

testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_type_minor == "mistake")), 0)
testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_rel == "mistake")), 0)

pot_children_parents: add child_match_type

now we add all the gained information on to our main dataset pot_children

  • parent datasets: pot_children, pot_children_matches
  • new variables: child_match_rel, child_match_type_minor

  • keys: child_id, source, pointer, pointer_nr

# data specific for the source
pot_children_source_parents <- pot_children %>% 
   filter(source == !!source_param) %>% 
   tidylog::left_join(pot_children_source_matches, by = c("child_id", "pointer", "source", "pointer_nr"))

# overall data that is being populated throughout the script
pot_children_parents[pot_children_parents$source == source_param, ] <- pot_children_source_parents

test: did we replace the pot_children_parents data with the same amount of rows or more if there are multiple parents for one child?

# It should just not me less (expect_lte : less than or equal)
testthat::expect_lte(nrow(pot_children_parents[pot_children_parents$source == source_param, ]), nrow(pot_children_source_parents))

summary stats

summary_parents <- pot_children_parents %>% 
   group_by(child_id, pointer, source) %>% 
   summarize(n_avail = sum(!is.na(pointer_id))) %>% 
   mutate(flag_avail = case_when(n_avail >= 1 ~ TRUE,
                                 n_avail == 0 ~ FALSE,
                                 TRUE ~ NA))

summary_source_parents <- summary_parents %>% 
   filter(source == !!source_param) 

source_specific

Let’s see how many parents we found in this specific source dataset

print <- summary_source_parents %>% tabyl(pointer, flag_avail)

print_mother <- print$'TRUE'[print$pointer == "mother"]
print_father <- print$'TRUE'[print$pointer == "father"]

print %>% adorn_title() 

for 414 we have at least one mother for 330 we have at least one father

Note: even though there is no identified father, there could still be an identified mother for the same child (this doesn’t come across in this table). Lets look at the availablity for mothers and fathers per child

And how many mothers and fathers per child?

The first table shows how many parents per child

print <- summary_source_parents %>% summary_by_child(kind = "specific") %>% tabyl(n_father, n_mother)

print %>% adorn_title()
  • for 314 we have matches for one mother and one father
  • for 100 only mothers and 16 only fathers
  • this does not neccesarily mean that there are no parents, its just that they are not listed in the source datasets

The second table shows for how many children, there is at least one father and/or mother (Note: the two tables are identical, if every child only has at most one identified parent)

summary_source_parents %>% summary_by_child(kind = "specific", values_from = "flag_avail", names_prefix = "avail_") %>% tabyl(avail_father, avail_mother) %>% adorn_title()

We can see that for 2111 cases, there is no corresponding child_id in the source data and therefore also no parental ID’s

compare all sources

Let’s take a look and compare all the source datasets

For the sources that we already used, we can now see the number of identified parents

summary_parents %>% tabyl(source, n_avail, pointer) %>% adorn_title()
## $father
##           n_avail    
##    source       0   1
##    bioage    2541   0
##  biobirth    2211 330
##  bioparen    2541   0
##       hhh    2541   0
##       kid    2541   0
## 
## $mother
##           n_avail    
##    source       0   1
##    bioage    2541   0
##  biobirth    2127 414
##  bioparen    2541   0
##       hhh    2541   0
##       kid    2541   0

For the sources that we already used, we can now see whether at least one mother or father was identified

summary_parents %>% tabyl(source, flag_avail, pointer) %>% adorn_title()
## $father
##           flag_avail     
##    source      FALSE TRUE
##    bioage       2541    0
##  biobirth       2211  330
##  bioparen       2541    0
##       hhh       2541    0
##       kid       2541    0
## 
## $mother
##           flag_avail     
##    source      FALSE TRUE
##    bioage       2541    0
##  biobirth       2127  414
##  bioparen       2541    0
##       hhh       2541    0
##       kid       2541    0

How many mothers were identified in which source data?

summary_parents %>% 
   filter(pointer == "mother")  %>% summary_by_child(kind = "all") %>% 
   count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))

How many fathers were identified in which source data?

summary_parents %>% 
   filter(pointer == "father")  %>% summary_by_child(kind = "all") %>% 
   count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))

Last but not least lets look at a summary of the whole partial thing for the biobirth source dataset

pot_children_source_parents %>% 
   # drop_na(pointer_id) %>%
   group_by(pointer, pointer_nr) %>% 
   skim()
Data summary
Name Piped data
Number of rows 15246
Number of columns 7
_______________________
Column type frequency:
character 3
numeric 2
________________________
Group variables pointer, pointer_nr

Variable type: character

skim_variable pointer pointer_nr n_missing complete_rate min max empty n_unique whitespace
source father 1 0 1.00 8 8 0 1 0
source father 2 0 1.00 8 8 0 1 0
source father 3 0 1.00 8 8 0 1 0
source mother 1 0 1.00 8 8 0 1 0
source mother 2 0 1.00 8 8 0 1 0
source mother 3 0 1.00 8 8 0 1 0
child_match_rel father 1 2211 0.13 5 5 0 1 0
child_match_rel father 2 2541 0.00 NA NA 0 0 0
child_match_rel father 3 2541 0.00 NA NA 0 0 0
child_match_rel mother 1 2127 0.16 5 5 0 1 0
child_match_rel mother 2 2541 0.00 NA NA 0 0 0
child_match_rel mother 3 2541 0.00 NA NA 0 0 0
child_match_type_minor father 1 2211 0.13 13 13 0 1 0
child_match_type_minor father 2 2541 0.00 NA NA 0 0 0
child_match_type_minor father 3 2541 0.00 NA NA 0 0 0
child_match_type_minor mother 1 2127 0.16 13 13 0 1 0
child_match_type_minor mother 2 2541 0.00 NA NA 0 0 0
child_match_type_minor mother 3 2541 0.00 NA NA 0 0 0

Variable type: numeric

skim_variable pointer pointer_nr n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
child_id father 1 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id father 2 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id father 3 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 1 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 2 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 3 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
pointer_id father 1 2211 0.13 21684752 16540273 1161603 8766326 9591301 40682101 41723901 ▇▃▁▁▇
pointer_id father 2 2541 0.00 NaN NA NA NA NA NA NA
pointer_id father 3 2541 0.00 NaN NA NA NA NA NA NA
pointer_id mother 1 2127 0.16 22146233 16679840 1233702 8766327 9608802 40882151 41723902 ▇▂▁▁▇
pointer_id mother 2 2541 0.00 NaN NA NA NA NA NA NA
pointer_id mother 3 2541 0.00 NaN NA NA NA NA NA NA
# pot_children_parents1 %>% count("mother available" = !is.na(mother_id), "father avaiable" = !is.na(father_id)) %>% arrange(desc(n))

pot_children_parents: add BIOAGE pointers

# before we start: define source to be used in the code for this section (this source dataset)
source_param <- "bioage"
source_mother_data <- bioage_mother_dict
source_father_data <- bioage_father_dict
pointer_mother_id <- "mother_id" 
pointer_father_id <- "father_id" 
additional_mother_vars <- c("mother_biochild") # needed to define child_match_type-minor later on
additional_father_vars <- c("father_biochild") # needed to define child_match_type-minor later on

goal: let’s see how many mother and father pointers we can find in bioage

  • Here it is noteworthy that even though survey year info is available in the bioage dataset, it only contains the childrens data from when they were still younger than 11. therefore the survey years do not overlap between bioage and pot_children and are not useful for joining.
  • The information that we now join is basically, who the mother pointer was when the children in pot_children were still younger.

Bioage-Doku: https://www.diw.de/documents/publikationen/73/diw_01.c.673305.de/diw_ssp0747.pdf

pot_children_source_parents

  • parent datasets: bioage
  • keys: child_id, pointer_id, pointer_nr

now we get the parental pointers and match our pupils with the possible mothers and fathers from the pointer source dataset

pointer_mothers <- source_mother_data %>% select_pointers(pointer = "mother", pointer_id_name = pointer_mother_id, source = source_param, additional_mother_vars)
pot_children_source_mothers <- pot_children %>% match_child_pointer(pointer = "mother", source = source_param, pointer_data = pointer_mothers)
pointer_fathers <- source_father_data %>% select_pointers(pointer = "father", pointer_id_name = pointer_father_id, source = source_param, additional_father_vars)
pot_children_source_fathers <- pot_children %>% match_child_pointer(pointer = "father", source = source_param, pointer_data = pointer_fathers)

The maximum number of parents identified in this source dataset is 1 for mothers and 1 for fathers.

pot_children_source_matches: add bioage pointers

in this dataset, we define the relation between the pupil and the match depending on some characteristics of the source dataset. this part is unique to every source dataset maybe it would be cleaner to do this definition bit later on, not sure

  • parent datasets: pot_children_source_mothers, pot_children_source_fathers
  • new variables: child_match_rel, child_match_type_minor

  • keys: child_id, pointer, pointer_nr

# just matches, no missings
pot_children_source_matches <- pot_children_source_mothers %>% bind_rows(pot_children_source_fathers) %>% 
      # here we drop the pupils with non-dentified parents, because the will have NA's for the child_rel and child_match_type_minor
      drop_na(pointer_id) %>% 
      mutate(child_match_rel = "child",
          child_match_type_minor = case_when(pointer_biochild == 1 ~ "genetic_known",
                                             pointer_biochild == 2 ~ "social_known",
                                             is.na(pointer_biochild) ~ "unknown",
                                             TRUE ~ "mistake")
          ) %>% 
   # select the variables that will go into the final dataset
   select(child_id, pointer_id, pointer_nr, pointer, source, child_match_rel, child_match_type_minor, pointer_biochild)
pot_children_source_matches %>% tabyl(pointer_biochild, child_match_type_minor, pointer)
## $father
##  pointer_biochild genetic_known
##                 1             6
## 
## $mother
##  pointer_biochild genetic_known
##                 1             8

test: do we have missings in the child-match variable?

testthat::expect_equal(sum(is.na(pot_children_source_matches$child_match_type_minor)), 0)

test: make sure the coding captured all possible outcomes of child-parent relationships

testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_type_minor == "mistake")), 0)

pot_children_parents: add child_match_type

now we add all the gained information on to our main dataset pot_children

  • parent datasets: pot_children, pot_children_matches
  • new variables: child_match_rel, child_match_type_minor

  • keys: child_id, source, pointer, pointer_nr

# data specific for the source
pot_children_source_parents <- pot_children %>% 
   filter(source == !!source_param) %>% 
   tidylog::left_join(pot_children_source_matches, by = c("child_id", "pointer", "source", "pointer_nr")) %>% 
   select(-pointer_biochild)

# overall data that is being populated throughout the script
pot_children_parents[pot_children_parents$source == source_param, ] <- pot_children_source_parents

test: did we replace the pot_children_parents data with the same amount of rows or more if there are multiple parents for one child?

# It should just not me less (expect_lte : less than or equal)
testthat::expect_lte(nrow(pot_children_parents[pot_children_parents$source == source_param, ]), nrow(pot_children_source_parents))

summary stats

summary_parents <- pot_children_parents %>% 
   group_by(child_id, pointer, source) %>% 
   summarize(n_avail = sum(!is.na(pointer_id))) %>% 
   mutate(flag_avail = case_when(n_avail >= 1 ~ TRUE,
                                        n_avail == 0 ~ FALSE,
                                        TRUE ~ NA))

summary_source_parents <- summary_parents %>% 
   filter(source == !!source_param) 

source_specific

Let’s see how many parents we found in this specific source dataset

print <- summary_source_parents %>% tabyl(pointer, flag_avail)

print_mother <- print$'TRUE'[print$pointer == "mother"]
print_father <- print$'TRUE'[print$pointer == "father"]

print %>% adorn_title() 

for 8 we have at least one mother for 6 we have at least one father

Note: even though there is no identified father, there could still be an identified mother for the same child (this doesn’t come across in this table). Lets look at the availablity for mothers and fathers per child

And how many mothers and fathers per child?

The first table shows how many parents per child

print <- summary_source_parents %>% summary_by_child(kind = "specific") %>% tabyl(n_father, n_mother)

print %>% adorn_title()
  • for 6 we have matches for one mother and one father
  • for 2 only mothers and 0 only fathers
  • this does not neccesarily mean that there are no parents, its just that they are not listed in the source datasets

The second table shows for how many children, there is at least one father and/or mother (Note: the two tables are identical, if every child only has at most one identified parent)

summary_source_parents %>% summary_by_child(kind = "specific", values_from = "flag_avail", names_prefix = "avail_") %>% tabyl(avail_father, avail_mother) %>% adorn_title()

We can see that for 2533 cases, there is no corresponding child_id in the source data and therefore also no parental ID’s

compare all sources

Let’s take a look and compare all the source datasets

For the sources that we already used, we can now see the number of identified parents

summary_parents %>% tabyl(source, n_avail, pointer) %>% adorn_title()
## $father
##           n_avail    
##    source       0   1
##    bioage    2535   6
##  biobirth    2211 330
##  bioparen    2541   0
##       hhh    2541   0
##       kid    2541   0
## 
## $mother
##           n_avail    
##    source       0   1
##    bioage    2533   8
##  biobirth    2127 414
##  bioparen    2541   0
##       hhh    2541   0
##       kid    2541   0

For the sources that we already used, we can now see whether at least one mother or father was identified

summary_parents %>% tabyl(source, flag_avail, pointer) %>% adorn_title()
## $father
##           flag_avail     
##    source      FALSE TRUE
##    bioage       2535    6
##  biobirth       2211  330
##  bioparen       2541    0
##       hhh       2541    0
##       kid       2541    0
## 
## $mother
##           flag_avail     
##    source      FALSE TRUE
##    bioage       2533    8
##  biobirth       2127  414
##  bioparen       2541    0
##       hhh       2541    0
##       kid       2541    0

How many mothers were identified in which source data?

summary_parents %>% 
   filter(pointer == "mother")  %>% summary_by_child(kind = "all") %>% 
   count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))

How many fathers were identified in which source data?

summary_parents %>% 
   filter(pointer == "father")  %>% summary_by_child(kind = "all") %>% 
   count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))

Last but not least lets look at a summary of the whole partial thing for the bioage source dataset

pot_children_source_parents %>% 
   # drop_na(pointer_id) %>%
   group_by(pointer, pointer_nr) %>% 
   skim()
Data summary
Name Piped data
Number of rows 15246
Number of columns 7
_______________________
Column type frequency:
character 3
numeric 2
________________________
Group variables pointer, pointer_nr

Variable type: character

skim_variable pointer pointer_nr n_missing complete_rate min max empty n_unique whitespace
source father 1 0 1 6 6 0 1 0
source father 2 0 1 6 6 0 1 0
source father 3 0 1 6 6 0 1 0
source mother 1 0 1 6 6 0 1 0
source mother 2 0 1 6 6 0 1 0
source mother 3 0 1 6 6 0 1 0
child_match_rel father 1 2535 0 5 5 0 1 0
child_match_rel father 2 2541 0 NA NA 0 0 0
child_match_rel father 3 2541 0 NA NA 0 0 0
child_match_rel mother 1 2533 0 5 5 0 1 0
child_match_rel mother 2 2541 0 NA NA 0 0 0
child_match_rel mother 3 2541 0 NA NA 0 0 0
child_match_type_minor father 1 2535 0 13 13 0 1 0
child_match_type_minor father 2 2541 0 NA NA 0 0 0
child_match_type_minor father 3 2541 0 NA NA 0 0 0
child_match_type_minor mother 1 2533 0 13 13 0 1 0
child_match_type_minor mother 2 2541 0 NA NA 0 0 0
child_match_type_minor mother 3 2541 0 NA NA 0 0 0

Variable type: numeric

skim_variable pointer pointer_nr n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
child_id father 1 0 1 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id father 2 0 1 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id father 3 0 1 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 1 0 1 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 2 0 1 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 3 0 1 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
pointer_id father 1 2535 0 5415535 3629204 2072701 2119802 5328252 8708752 8878101 ▇▁▁▁▇
pointer_id father 2 2541 0 NaN NA NA NA NA NA NA
pointer_id father 3 2541 0 NaN NA NA NA NA NA NA
pointer_id mother 1 2533 0 6232552 3420203 2072702 2119801 8577502 8753202 8878102 ▅▁▁▁▇
pointer_id mother 2 2541 0 NaN NA NA NA NA NA NA
pointer_id mother 3 2541 0 NaN NA NA NA NA NA NA
# pot_children_parents1 %>% count("mother available" = !is.na(mother_id), "father avaiable" = !is.na(father_id)) %>% arrange(desc(n))

pot_children_parents: add BIOPAREN pointers

# before we start: define source to be used in the code for this section (this source dataset)
source_param <- "bioparen"
source_data <- bioparen_dict
pointer_mother_id <- "mother_id" 
pointer_father_id <- "father_id" 

goal: let’s see how many mother and father pointers we can find in bioparen

bioparen-Doku: https://www.diw.de/documents/publikationen/73/diw_01.c.787430.de/diw_ssp0839.pdf

pot_children_source_parents

  • parent datasets: bioparen
  • keys: child_id, pointer_id, pointer_nr

now we get the parental pointers and match our pupils with the possible mothers and fathers from the pointer source dataset

pointer_mothers <- source_data %>% select_pointers(pointer = "mother", pointer_id_name = pointer_mother_id, source = source_param)
pot_children_source_mothers <- pot_children %>% match_child_pointer(pointer = "mother", source = source_param, pointer_data = pointer_mothers)
pointer_fathers <- source_data %>% select_pointers(pointer = "father", pointer_id_name = pointer_father_id, source = source_param)
pot_children_source_fathers <- pot_children %>% match_child_pointer(pointer = "father", source = source_param, pointer_data = pointer_fathers)

The maximum number of parents identified in this source dataset is 1 for mothers and 1 for fathers.

pot_children_source_matches: add bioparen pointers

in this dataset, we define the relation between the pupil and the match depending on some characteristics of the source dataset. this part is unique to every source dataset maybe it would be cleaner to do this definition bit later on, not sure

  • parent datasets: pot_children_source_mothers, pot_children_source_fathers
  • new variables: child_match_rel, child_match_type_minor

  • keys: child_id, pointer, pointer_nr

# just matches, no missings
pot_children_source_matches <- 
      # combine matches for mothers and fathers 
      pot_children_source_mothers %>% bind_rows(pot_children_source_fathers) %>% 
      # here we drop the pupils with non-dentified parents, because the will have NA's for the child_rel and child_match_type_minor
      drop_na(pointer_id) %>% 
      mutate(child_match_rel = "child",
          child_match_type_minor = "genetic_known") %>% 
   # select the variables that will go into the final dataset
   select(child_id, pointer_id, pointer_nr, pointer, source, child_match_rel, child_match_type_minor)

test: do we have missings in the child-match variable?

testthat::expect_equal(sum(is.na(pot_children_source_matches$child_match_type_minor)), 0)

test: make sure the coding captured all possible outcomes of child-parent relationships

testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_type_minor == "mistake")), 0)
testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_rel == "mistake")), 0)

pot_children_parents: add child_match_type

now we add all the gained information on to our main dataset pot_children

  • parent datasets: pot_children, pot_children_matches
  • new variables: child_match_rel, child_match_type_minor

  • keys: child_id, source, pointer, pointer_nr

# data specific for the source
pot_children_source_parents <- pot_children %>% 
   filter(source == !!source_param) %>% 
   tidylog::left_join(pot_children_source_matches, by = c("child_id", "pointer", "source", "pointer_nr"))

# overall data that is being populated throughout the script
pot_children_parents[pot_children_parents$source == source_param, ] <- pot_children_source_parents

test: did we replace the pot_children_parents data with the same amount of rows or more if there are multiple parents for one child?

# It should just not me less (expect_lte : less than or equal)
testthat::expect_lte(nrow(pot_children_parents[pot_children_parents$source == source_param, ]), nrow(pot_children_source_parents))

summary stats

summary_parents <- pot_children_parents %>% 
   group_by(child_id, pointer, source) %>% 
   summarize(n_avail = sum(!is.na(pointer_id))) %>% 
   mutate(flag_avail = case_when(n_avail >= 1 ~ TRUE,
                                 n_avail == 0 ~ FALSE,
                                 TRUE ~ NA))

summary_source_parents <- summary_parents %>% 
   filter(source == !!source_param) 

source_specific

Let’s see how many parents we found in this specific source dataset

print <- summary_source_parents %>% tabyl(pointer, flag_avail)

print_mother <- print$'TRUE'[print$pointer == "mother"]
print_father <- print$'TRUE'[print$pointer == "father"]

print %>% adorn_title() 

for 255 we have at least one mother for 206 we have at least one father

Note: even though there is no identified father, there could still be an identified mother for the same child (this doesn’t come across in this table). Lets look at the availablity for mothers and fathers per child

And how many mothers and fathers per child?

The first table shows how many parents per child

print <- summary_source_parents %>% summary_by_child(kind = "specific") %>% tabyl(n_father, n_mother)

print %>% adorn_title()
  • for 196 we have matches for one mother and one father
  • for 59 only mothers and 10 only fathers
  • this does not neccesarily mean that there are no parents, its just that they are not listed in the source datasets

The second table shows for how many children, there is at least one father and/or mother (Note: the two tables are identical, if every child only has at most one identified parent)

summary_source_parents %>% summary_by_child(kind = "specific", values_from = "flag_avail", names_prefix = "avail_") %>% tabyl(avail_father, avail_mother) %>% adorn_title()

We can see that for 2276 cases, there is no corresponding child_id in the source data and therefore also no parental ID’s

compare all sources

Let’s take a look and compare all the source datasets

For the sources that we already used, we can now see the number of identified parents

summary_parents %>% tabyl(source, n_avail, pointer) %>% adorn_title()
## $father
##           n_avail    
##    source       0   1
##    bioage    2535   6
##  biobirth    2211 330
##  bioparen    2335 206
##       hhh    2541   0
##       kid    2541   0
## 
## $mother
##           n_avail    
##    source       0   1
##    bioage    2533   8
##  biobirth    2127 414
##  bioparen    2286 255
##       hhh    2541   0
##       kid    2541   0

For the sources that we already used, we can now see whether at least one mother or father was identified

summary_parents %>% tabyl(source, flag_avail, pointer) %>% adorn_title()
## $father
##           flag_avail     
##    source      FALSE TRUE
##    bioage       2535    6
##  biobirth       2211  330
##  bioparen       2335  206
##       hhh       2541    0
##       kid       2541    0
## 
## $mother
##           flag_avail     
##    source      FALSE TRUE
##    bioage       2533    8
##  biobirth       2127  414
##  bioparen       2286  255
##       hhh       2541    0
##       kid       2541    0

How many mothers were identified in which source data?

summary_parents %>% 
   filter(pointer == "mother")  %>% summary_by_child(kind = "all") %>% 
   count(pointer, avail_bioparen, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))

How many fathers were identified in which source data?

summary_parents %>% 
   filter(pointer == "father")  %>% summary_by_child(kind = "all") %>% 
   count(pointer, avail_bioparen, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))

Last but not least lets look at a summary of the whole partial thing for the bioparen source dataset

pot_children_source_parents %>% 
   # drop_na(pointer_id) %>%
   group_by(pointer, pointer_nr) %>% 
   skim()
Data summary
Name Piped data
Number of rows 15246
Number of columns 7
_______________________
Column type frequency:
character 3
numeric 2
________________________
Group variables pointer, pointer_nr

Variable type: character

skim_variable pointer pointer_nr n_missing complete_rate min max empty n_unique whitespace
source father 1 0 1.00 8 8 0 1 0
source father 2 0 1.00 8 8 0 1 0
source father 3 0 1.00 8 8 0 1 0
source mother 1 0 1.00 8 8 0 1 0
source mother 2 0 1.00 8 8 0 1 0
source mother 3 0 1.00 8 8 0 1 0
child_match_rel father 1 2335 0.08 5 5 0 1 0
child_match_rel father 2 2541 0.00 NA NA 0 0 0
child_match_rel father 3 2541 0.00 NA NA 0 0 0
child_match_rel mother 1 2286 0.10 5 5 0 1 0
child_match_rel mother 2 2541 0.00 NA NA 0 0 0
child_match_rel mother 3 2541 0.00 NA NA 0 0 0
child_match_type_minor father 1 2335 0.08 13 13 0 1 0
child_match_type_minor father 2 2541 0.00 NA NA 0 0 0
child_match_type_minor father 3 2541 0.00 NA NA 0 0 0
child_match_type_minor mother 1 2286 0.10 13 13 0 1 0
child_match_type_minor mother 2 2541 0.00 NA NA 0 0 0
child_match_type_minor mother 3 2541 0.00 NA NA 0 0 0

Variable type: numeric

skim_variable pointer pointer_nr n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
child_id father 1 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id father 2 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id father 3 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 1 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 2 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 3 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
pointer_id father 1 2335 0.08 19334605 16504634 2006703 8616901 9293552 40419076 41723901 ▇▁▁▁▅
pointer_id father 2 2541 0.00 NaN NA NA NA NA NA NA
pointer_id father 3 2541 0.00 NaN NA NA NA NA NA NA
pointer_id mother 1 2286 0.10 20212886 16492255 1233702 8670552 9352302 40551302 41723902 ▇▂▁▁▆
pointer_id mother 2 2541 0.00 NaN NA NA NA NA NA NA
pointer_id mother 3 2541 0.00 NaN NA NA NA NA NA NA
# pot_children_parents1 %>% count("mother available" = !is.na(mother_id), "father avaiable" = !is.na(father_id)) %>% arrange(desc(n))

pot_children_parents: add KIDLONG pointers

# before we start: define source to be used in the code for this section (this source dataset)
source_param <- "kid"
source_data <- kid_dict
pointer_mother_id <- "mother_id" 
pointer_father_id <- "father_id" 

goal: let’s see how many mother and father pointers we can find in kid

DOKU: https://www.diw.de/documents/publikationen/73/diw_01.c.789369.de/diw_ssp0857.pdf

pot_children_source_parents

  • parent datasets: kid
  • keys: child_id, pointer_id, pointer_nr

now we get the parental pointers and match our pupils with the possible mothers and fathers from the pointer source dataset

pointer_mothers <- source_data %>% select_pointers(pointer = "mother", pointer_id_name = pointer_mother_id, source = source_param)
pot_children_source_mothers <- pot_children %>% match_child_pointer(pointer = "mother", source = source_param, pointer_data = pointer_mothers)
pointer_fathers <- source_data %>% select_pointers(pointer = "father", pointer_id_name = pointer_father_id, source = source_param)
pot_children_source_fathers <- pot_children %>% match_child_pointer(pointer = "father", source = source_param, pointer_data = pointer_fathers)

The maximum number of parents identified in this source dataset is 2 for mothers and 2 for fathers.

pot_children_source_matches: add kid pointers

in this dataset, we define the relation between the pupil and the match depending on some characteristics of the source dataset. this part is unique to every source dataset maybe it would be cleaner to do this definition bit later on, not sure

  • parent datasets: pot_children_source_mothers, pot_children_source_fathers
  • new variables: child_match_rel, child_match_type_minor

  • keys: child_id, pointer, pointer_nr

# just matches, no missings
pot_children_source_matches <- pot_children_source_mothers %>% bind_rows(pot_children_source_fathers) %>% 
      # here we drop the pupils with non-dentified parents, because the will have NA's for the child_rel and child_match_type_minor
      drop_na(pointer_id) %>% 
      mutate(child_match_rel = "child",
          # in the kidlong data, only the mother can be consideres to be the biological mother, the father pointer only points towards the partner of the father
          child_match_type_minor = "unknown") %>% 
   # select the variables that will go into the final dataset
   select(child_id, pointer_id, pointer_nr, pointer, source, child_match_rel, child_match_type_minor)

test: do we have missings in the child-match variable?

testthat::expect_equal(sum(is.na(pot_children_source_matches$child_match_type_minor)), 0)

test: make sure the coding captured all possible outcomes of child-parent relationships

testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_type_minor == "mistake")), 0)

pot_children_parents: add child_match_type

now we add all the gained information on to our main dataset pot_children

  • parent datasets: pot_children, pot_children_matches
  • new variables: child_match_rel, child_match_type_minor

  • keys: child_id, source, pointer, pointer_nr

# data specific for the source
pot_children_source_parents <- pot_children %>% 
   filter(source == !!source_param) %>% 
   tidylog::left_join(pot_children_source_matches, by = c("child_id", "pointer", "source", "pointer_nr"))

# overall data that is being populated throughout the script
pot_children_parents[pot_children_parents$source == source_param, ] <- pot_children_source_parents

test: did we replace the pot_children_parents data with the same amount of rows or more if there are multiple parents for one child?

# It should just not me less (expect_lte : less than or equal)
testthat::expect_lte(nrow(pot_children_parents[pot_children_parents$source == source_param, ]), nrow(pot_children_source_parents))

summary stats

summary_parents <- pot_children_parents %>% 
   group_by(child_id, pointer, source) %>% 
   summarize(n_avail = sum(!is.na(pointer_id))) %>% 
   mutate(flag_avail = case_when(n_avail >= 1 ~ TRUE,
                                        n_avail == 0 ~ FALSE,
                                        TRUE ~ NA))

summary_source_parents <- summary_parents %>% 
   filter(source == !!source_param) 

source_specific

Let’s see how many parents we found in this specific source dataset

print <- summary_source_parents %>% tabyl(pointer, flag_avail)

print_mother <- print$'TRUE'[print$pointer == "mother"]
print_father <- print$'TRUE'[print$pointer == "father"]

print %>% adorn_title() 

for 335 we have at least one mother for 298 we have at least one father

Note: even though there is no identified father, there could still be an identified mother for the same child (this doesn’t come across in this table). Lets look at the availablity for mothers and fathers per child

And how many mothers and fathers per child?

The first table shows how many parents per child

print <- summary_source_parents %>% summary_by_child(kind = "specific") %>% tabyl(n_father, n_mother)

print %>% adorn_title()
  • for 290 we have matches for one mother and one father
  • for 38 only mothers and 1 only fathers
  • this does not neccesarily mean that there are no parents, its just that they are not listed in the source datasets

The second table shows for how many children, there is at least one father and/or mother (Note: the two tables are identical, if every child only has at most one identified parent)

summary_source_parents %>% summary_by_child(kind = "specific", values_from = "flag_avail", names_prefix = "avail_") %>% tabyl(avail_father, avail_mother) %>% adorn_title()

We can see that for 2205 cases, there is no corresponding child_id in the source data and therefore also no parental ID’s

compare all sources

Let’s take a look and compare all the source datasets

For the sources that we already used, we can now see the number of identified parents

summary_parents %>% tabyl(source, n_avail, pointer) %>% adorn_title()
## $father
##           n_avail      
##    source       0   1 2
##    bioage    2535   6 0
##  biobirth    2211 330 0
##  bioparen    2335 206 0
##       hhh    2541   0 0
##       kid    2243 292 6
## 
## $mother
##           n_avail      
##    source       0   1 2
##    bioage    2533   8 0
##  biobirth    2127 414 0
##  bioparen    2286 255 0
##       hhh    2541   0 0
##       kid    2206 334 1

For the sources that we already used, we can now see whether at least one mother or father was identified

summary_parents %>% tabyl(source, flag_avail, pointer) %>% adorn_title()
## $father
##           flag_avail     
##    source      FALSE TRUE
##    bioage       2535    6
##  biobirth       2211  330
##  bioparen       2335  206
##       hhh       2541    0
##       kid       2243  298
## 
## $mother
##           flag_avail     
##    source      FALSE TRUE
##    bioage       2533    8
##  biobirth       2127  414
##  bioparen       2286  255
##       hhh       2541    0
##       kid       2206  335

How many mothers were identified in which source data?

summary_parents %>% 
   filter(pointer == "mother")  %>% summary_by_child(kind = "all") %>% 
   count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))

How many fathers were identified in which source data?

summary_parents %>% 
   filter(pointer == "father")  %>% summary_by_child(kind = "all") %>% 
   count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))

Last but not least lets look at a summary of the whole partial thing for the kid source dataset

pot_children_source_parents %>% 
   # drop_na(pointer_id) %>%
   group_by(pointer, pointer_nr) %>% 
   skim()
Data summary
Name Piped data
Number of rows 15246
Number of columns 7
_______________________
Column type frequency:
character 3
numeric 2
________________________
Group variables pointer, pointer_nr

Variable type: character

skim_variable pointer pointer_nr n_missing complete_rate min max empty n_unique whitespace
source father 1 0 1.00 3 3 0 1 0
source father 2 0 1.00 3 3 0 1 0
source father 3 0 1.00 3 3 0 1 0
source mother 1 0 1.00 3 3 0 1 0
source mother 2 0 1.00 3 3 0 1 0
source mother 3 0 1.00 3 3 0 1 0
child_match_rel father 1 2243 0.12 5 5 0 1 0
child_match_rel father 2 2535 0.00 5 5 0 1 0
child_match_rel father 3 2541 0.00 NA NA 0 0 0
child_match_rel mother 1 2206 0.13 5 5 0 1 0
child_match_rel mother 2 2540 0.00 5 5 0 1 0
child_match_rel mother 3 2541 0.00 NA NA 0 0 0
child_match_type_minor father 1 2243 0.12 7 7 0 1 0
child_match_type_minor father 2 2535 0.00 7 7 0 1 0
child_match_type_minor father 3 2541 0.00 NA NA 0 0 0
child_match_type_minor mother 1 2206 0.13 7 7 0 1 0
child_match_type_minor mother 2 2540 0.00 7 7 0 1 0
child_match_type_minor mother 3 2541 0.00 NA NA 0 0 0

Variable type: numeric

skim_variable pointer pointer_nr n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
child_id father 1 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id father 2 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id father 3 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 1 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 2 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 3 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
pointer_id father 1 2243 0.12 21439650 16842689 1161603 8660901 9597201 40623301 41723901 ▇▂▁▁▇
pointer_id father 2 2535 0.00 9554890 15650210 1395206 1571107 2139808 7577056 40883505 ▇▂▁▁▂
pointer_id father 3 2541 0.00 NaN NA NA NA NA NA NA
pointer_id mother 1 2206 0.13 22247840 16880717 1233702 8731652 9620301 40900651 41723902 ▇▂▁▁▇
pointer_id mother 2 2540 0.00 9747705 NA 9747705 9747705 9747705 9747705 9747705 ▁▁▇▁▁
pointer_id mother 3 2541 0.00 NaN NA NA NA NA NA NA
# pot_children_parents1 %>% count("mother available" = !is.na(mother_id), "father avaiable" = !is.na(father_id)) %>% arrange(desc(n))

pot_children_parents: add HHH pointers

# before we start: define source to be used in the code for this section (this source dataset)
source_param <- "hhh"
source_mother_data <- hhh_mothers
source_father_data <- hhh_fathers
pointer_mother_id <- "mother_id" 
pointer_father_id <- "father_id" 
additional_vars <- c("child_stell", "child_stell_l") # needed to define child_match_type-minor later on

goal: let’s see how many mother and father pointers we can find in hhh now we identify parents via the children’s relation to the head of hh

pot_children_source_parents

  • parent datasets: hhh
  • keys: child_id, pointer_id, pointer_nr

now we get the parental pointers and match our pupils with the possible mothers and fathers from the pointer source dataset

pointer_mothers <- source_mother_data %>% select_pointers(pointer = "mother", 
                                                          pointer_id_name = pointer_mother_id,
                                                          source = source_param, 
                                                          all_of(additional_vars))
pot_children_source_mothers <- pot_children %>% match_child_pointer(pointer = "mother", source = source_param, pointer_data = pointer_mothers)
pointer_fathers <- source_father_data %>% select_pointers(pointer = "father", pointer_id_name = pointer_father_id, source = source_param, additional_vars)
pot_children_source_fathers <- pot_children %>% match_child_pointer(pointer = "father", source = source_param, pointer_data = pointer_fathers)

The maximum number of parents identified in this source dataset is 2 for mothers and 1 for fathers.

pot_children_source_matches: add hhh pointers

in this dataset, we define the relation between the pupil and the match depending on some characteristics of the source dataset.

this part is unique to every source dataset maybe it would be cleaner to do this definition bit later on, not sure

  • parent datasets: pot_children_source_mothers, pot_children_source_fathers
  • new variables: child_match_rel, child_match_type_minor

  • keys: child_id, pointer, pointer_nr

# just matches, no missings
pot_children_source_matches <- pot_children_source_mothers %>% bind_rows(pot_children_source_fathers) %>% 
      # here we drop the pupils with non-dentified parents, because they will have NA's for the child_rel and child_match_type_minor
      drop_na(pointer_id) %>% 
      # the matched id's are from the head of hh. via the rel of the child to the head of hh, we can get the relation between the child and the parent who is head of hh
      mutate(child_match_rel = case_when(child_stell %in% c(20:24) ~ "child",
                                      child_stell == 25 ~ "grandchild", 
                                      child_stell == 27 ~ "child", # [27] Schwsohn,-tocher (Ehe-/LPartner v.Kind)
                                      TRUE ~ "mistake"),
          child_match_type_minor = case_when(child_stell == 20  ~ "genetic_label", # 20 = those are possibly adoptive and genetic children (until 2011)
                                             child_stell == 21  ~ "genetic_known", # 21 = genetic children
                                             child_stell == 25  ~ "genetic_known", # 25 = grandchildren
                                             child_stell == 27  ~ "social_known", # [27] Schwsohn,-tocher (Ehe-/LPartner v.Kind)
                                             child_stell %in% c(22:24)  ~ "social_known", # adoptive, foster and step children
                                             TRUE ~ "mistake") 
          ) %>% 
   # select the variables that will go into the final dataset
   select(child_id, pointer_id, pointer_nr, pointer, source, child_match_rel, child_match_type_minor)

test: do we have missings in the child-match variable?

testthat::expect_equal(sum(is.na(pot_children_source_matches$child_match_type_minor)), 0)

test: make sure the coding captured all possible outcomes of child-parent relationships

testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_type_minor == "mistake")), 0)
testthat::expect_equal(nrow(pot_children_source_matches %>% filter(child_match_rel == "mistake")), 0)

pot_children_parents: add child_match_type

now we add all the gained information on to our main dataset pot_children

  • parent datasets: pot_children, pot_children_matches
  • new variables: child_match_rel, child_match_type_minor

  • keys: child_id, source, pointer, pointer_nr

# data specific for the source
pot_children_source_parents <- pot_children %>% 
   filter(source == !!source_param) %>% 
   tidylog::left_join(pot_children_source_matches, by = c("child_id", "pointer", "source", "pointer_nr"))

# overall data that is being populated throughout the script
pot_children_parents[pot_children_parents$source == source_param, ] <- pot_children_source_parents

test: did we replace the pot_children_parents data with the same amount of rows or more if there are multiple parents for one child?

# It should just not me less (expect_lte : less than or equal)
testthat::expect_lte(nrow(pot_children_parents[pot_children_parents$source == source_param, ]), nrow(pot_children_source_parents))

summary stats

summary_parents <- pot_children_parents %>% 
   group_by(child_id, pointer, source) %>% 
   summarize(n_avail = sum(!is.na(pointer_id))) %>% 
   mutate(flag_avail = case_when(n_avail >= 1 ~ TRUE,
                                        n_avail == 0 ~ FALSE,
                                        TRUE ~ NA))

summary_source_parents <- summary_parents %>% 
   filter(source == !!source_param) 

source_specific

Let’s see how many parents we found in this specific source dataset

print <- summary_source_parents %>% tabyl(pointer, flag_avail)

print_mother <- print$'TRUE'[print$pointer == "mother"]
print_father <- print$'TRUE'[print$pointer == "father"]

print %>% adorn_title() 

for 235 we have at least one mother for 254 we have at least one father

Note: even though there is no identified father, there could still be an identified mother for the same child (this doesn’t come across in this table). Lets look at the availablity for mothers and fathers per child

And how many mothers and fathers per child?

The first table shows how many parents per child

print <- summary_source_parents %>% summary_by_child(kind = "specific") %>% tabyl(n_father, n_mother)

print %>% adorn_title()
  • for 27 we have matches for one mother and one father
  • for 207 only mothers and 227 only fathers
  • this does not neccesarily mean that there are no parents, its just that they are not listed in the source datasets

The second table shows for how many children, there is at least one father and/or mother (Note: the two tables are identical, if every child only has at most one identified parent)

summary_source_parents %>% summary_by_child(kind = "specific", values_from = "flag_avail", names_prefix = "avail_") %>% tabyl(avail_father, avail_mother) %>% adorn_title()

We can see that for 2079 cases, there is no corresponding child_id in the source data and therefore also no parental ID’s

compare all sources

Let’s take a look and compare all the source datasets

For the sources that we already used, we can now see the number of identified parents

summary_parents %>% tabyl(source, n_avail, pointer) %>% adorn_title()
## $father
##           n_avail      
##    source       0   1 2
##    bioage    2535   6 0
##  biobirth    2211 330 0
##  bioparen    2335 206 0
##       hhh    2287 254 0
##       kid    2243 292 6
## 
## $mother
##           n_avail      
##    source       0   1 2
##    bioage    2533   8 0
##  biobirth    2127 414 0
##  bioparen    2286 255 0
##       hhh    2306 234 1
##       kid    2206 334 1

For the sources that we already used, we can now see whether at least one mother or father was identified

summary_parents %>% tabyl(source, flag_avail, pointer) %>% adorn_title()
## $father
##           flag_avail     
##    source      FALSE TRUE
##    bioage       2535    6
##  biobirth       2211  330
##  bioparen       2335  206
##       hhh       2287  254
##       kid       2243  298
## 
## $mother
##           flag_avail     
##    source      FALSE TRUE
##    bioage       2533    8
##  biobirth       2127  414
##  bioparen       2286  255
##       hhh       2306  235
##       kid       2206  335

How many mothers were identified in which source data?

summary_parents %>% 
   filter(pointer == "mother")  %>% summary_by_child(kind = "all") %>% 
   count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))

How many fathers were identified in which source data?

summary_parents %>% 
   filter(pointer == "father")  %>% summary_by_child(kind = "all") %>% 
   count(pointer, avail_biobirth, avail_bioage, avail_kid, avail_hhh) %>% arrange(desc(n))

Last but not least lets look at a summary of the whole partial thing for the hhh source dataset

pot_children_source_parents %>% 
   # drop_na(pointer_id) %>%
   group_by(pointer, pointer_nr) %>% 
   skim()
Data summary
Name Piped data
Number of rows 15246
Number of columns 7
_______________________
Column type frequency:
character 3
numeric 2
________________________
Group variables pointer, pointer_nr

Variable type: character

skim_variable pointer pointer_nr n_missing complete_rate min max empty n_unique whitespace
source father 1 0 1.00 3 3 0 1 0
source father 2 0 1.00 3 3 0 1 0
source father 3 0 1.00 3 3 0 1 0
source mother 1 0 1.00 3 3 0 1 0
source mother 2 0 1.00 3 3 0 1 0
source mother 3 0 1.00 3 3 0 1 0
child_match_rel father 1 2287 0.10 5 10 0 2 0
child_match_rel father 2 2541 0.00 NA NA 0 0 0
child_match_rel father 3 2541 0.00 NA NA 0 0 0
child_match_rel mother 1 2306 0.09 5 10 0 2 0
child_match_rel mother 2 2540 0.00 5 5 0 1 0
child_match_rel mother 3 2541 0.00 NA NA 0 0 0
child_match_type_minor father 1 2287 0.10 12 13 0 3 0
child_match_type_minor father 2 2541 0.00 NA NA 0 0 0
child_match_type_minor father 3 2541 0.00 NA NA 0 0 0
child_match_type_minor mother 1 2306 0.09 12 13 0 3 0
child_match_type_minor mother 2 2540 0.00 13 13 0 1 0
child_match_type_minor mother 3 2541 0.00 NA NA 0 0 0

Variable type: numeric

skim_variable pointer pointer_nr n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
child_id father 1 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id father 2 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id father 3 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 1 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 2 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 3 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
pointer_id father 1 2287 0.10 22393198 16998811 2017501 8753226 9661101 40703926 41723901 ▇▁▁▁▆
pointer_id father 2 2541 0.00 NaN NA NA NA NA NA NA
pointer_id father 3 2541 0.00 NaN NA NA NA NA NA NA
pointer_id mother 1 2306 0.09 22334190 16545633 1395202 8776001 9620301 40940951 41692401 ▇▂▁▁▇
pointer_id mother 2 2540 0.00 40690802 NA 40690802 40690802 40690802 40690802 40690802 ▁▁▇▁▁
pointer_id mother 3 2541 0.00 NaN NA NA NA NA NA NA
# pot_children_parents1 %>% count("mother available" = !is.na(mother_id), "father avaiable" = !is.na(father_id)) %>% arrange(desc(n))

pot_children_parents

lets look at the output

keys: child_id, source, pointer, pointer_nr (those should have no missings)

pot_children_parents %>% 
   filter(source == "biobirth") %>% 
   group_by(pointer, pointer_nr) %>% 
   skim
Data summary
Name Piped data
Number of rows 15246
Number of columns 7
_______________________
Column type frequency:
character 3
numeric 2
________________________
Group variables pointer, pointer_nr

Variable type: character

skim_variable pointer pointer_nr n_missing complete_rate min max empty n_unique whitespace
source father 1 0 1.00 8 8 0 1 0
source father 2 0 1.00 8 8 0 1 0
source father 3 0 1.00 8 8 0 1 0
source mother 1 0 1.00 8 8 0 1 0
source mother 2 0 1.00 8 8 0 1 0
source mother 3 0 1.00 8 8 0 1 0
child_match_rel father 1 2211 0.13 5 5 0 1 0
child_match_rel father 2 2541 0.00 NA NA 0 0 0
child_match_rel father 3 2541 0.00 NA NA 0 0 0
child_match_rel mother 1 2127 0.16 5 5 0 1 0
child_match_rel mother 2 2541 0.00 NA NA 0 0 0
child_match_rel mother 3 2541 0.00 NA NA 0 0 0
child_match_type_minor father 1 2211 0.13 13 13 0 1 0
child_match_type_minor father 2 2541 0.00 NA NA 0 0 0
child_match_type_minor father 3 2541 0.00 NA NA 0 0 0
child_match_type_minor mother 1 2127 0.16 13 13 0 1 0
child_match_type_minor mother 2 2541 0.00 NA NA 0 0 0
child_match_type_minor mother 3 2541 0.00 NA NA 0 0 0

Variable type: numeric

skim_variable pointer pointer_nr n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
child_id father 1 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id father 2 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id father 3 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 1 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 2 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
child_id mother 3 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
pointer_id father 1 2211 0.13 21684752 16540273 1161603 8766326 9591301 40682101 41723901 ▇▃▁▁▇
pointer_id father 2 2541 0.00 NaN NA NA NA NA NA NA
pointer_id father 3 2541 0.00 NaN NA NA NA NA NA NA
pointer_id mother 1 2127 0.16 22146233 16679840 1233702 8766327 9608802 40882151 41723902 ▇▂▁▁▇
pointer_id mother 2 2541 0.00 NaN NA NA NA NA NA NA
pointer_id mother 3 2541 0.00 NaN NA NA NA NA NA NA
export(pot_children_parents, file = here::here("data/pot_children_parents.rds"))

step4: quality check 1

it is time to look at the quality of the collected parental pointers and whether they all agree when it comes to the relationship between the child and the parental pointer

here, we check whether the child_match_types agree if there are multiple child_parent_pairs (from different sources)

pot_children_parents_pointers <- pot_children_parents %>% 
   drop_na(pointer_id)

child_parent_sources_pointers_wide <- pot_children_parents_pointers %>% 
   select(child_id, pointer_id, pointer, source, child_match_type_minor) %>% 
   pivot_wider(names_from = source, values_from = child_match_type_minor, names_prefix = "rel_") %>% 
   arrange(child_id, pointer)
child_parent_sources_pointers_wide %>% 
   count(rel_biobirth, rel_bioparen, rel_bioage, rel_hhh, rel_kid) %>% 
   arrange(desc(n)) %>% head(7)
  • we can see that most child-parent pairs show consistent patterns of “genetic_known” and “unknown” and “genetic_label”
  • but there are also some parents who were identified as parents, but the relationship is unclear n = 39
  • Next, we check for inconsistent combinations (e.g. once child-parent pair was categorized as “genetic_known” and “social_known”)
  • whether the same child_parent pair has different child-match types
child_parent_sources_pointers_wide %>% 
   count(rel_biobirth, rel_bioage, rel_hhh, rel_kid) %>% 
   filter(rel_biobirth == "genetic_known" & (rel_bioage == "social_known" | rel_hhh == "social_known") |
             (rel_hhh == "genetic_label" & rel_bioage == "social_known")|
             (rel_hhh == "genetic_label" & rel_kid == "unknown" & is.na(rel_biobirth))) %>% arrange(n)
  • there are no ambivalent cases, only 2 cases where the relationship remains unknown

  • While in this project there are no conflict, they can and have appear in with other samples. Therefore, the next chunk marks and resolves conflicts. It is a code chunk that might have to be adapted by hand

mark conflicts

sources_crossvalid_merge <- child_parent_sources_pointers_wide %>% 
   # filter cases that conflict
   filter(rel_biobirth == "genetic_known" & (rel_bioage == "social_known" | rel_hhh == "social_known") |
             (rel_hhh == "genetic_label" & rel_bioage == "social_known")) %>% 
   # label conflictes and resolve them in merge_conflict
   mutate(sources_crossvalid = case_when(# conflicts
                                    rel_bioage == "social_known" & rel_hhh == "genetic_known" ~ "bioage hhh conflict",
                                    rel_bioage == "genetic_known" & rel_hhh == "social_known" ~ "bioage hhh conflict",
                                    # possible mistakes in biobirth
                                    rel_biobirth == "genetic_known" & rel_hhh == "social_known" ~ "hhh over biobirth",
                                    rel_biobirth == "genetic_known" & rel_bioage == "social_known" ~ "bioage over biobirth",
                                    # probably adoptive children
                                    rel_bioage == "social_known" & rel_hhh == "genetic_label" ~ "bioage_social over hhh_genetic_label",
                                    TRUE ~ "needs description"
                                    ), 
          merge_conflict = case_when(rel_bioage == "genetic_known" & rel_hhh == "social_known" ~ "unknown",
                                     rel_bioage == "social_known" & rel_hhh == "genetic_known" ~ "unknown",
                                     rel_biobirth == "genetic_known" & rel_bioage == "social_known" ~ rel_bioage,
                                     rel_biobirth == "genetic_known" & rel_hhh == "social_known" ~ rel_hhh,
                                     rel_bioage == "social_known" & rel_hhh == "genetic_label" ~ rel_bioage,
                                     TRUE ~ "conflict"
                                     )
          )

resolve conflicts

sources_crossvalid_merge <- sources_crossvalid_merge %>% 
   select(child_id, pointer_id, pointer, sources_crossvalid, merge_conflict) %>% 
   tidylog::distinct(child_id, pointer_id, .keep_all  = T) 

child_parent_sources_pointers_correction <- pot_children_parents_pointers %>% 
   tidylog::left_join(sources_crossvalid_merge, by  = c("child_id", "pointer_id", "pointer")) %>% 
   # update child_match_type_minor
   mutate(child_match_type_minor_c = case_when(is.na(sources_crossvalid) ~ child_match_type_minor,
                                               !is.na(sources_crossvalid) ~ merge_conflict,
                                               TRUE ~ "need specification"
                                               )) %>% 
   # update source
   mutate(source_c = case_when(is.na(sources_crossvalid) ~ source,
                               sources_crossvalid == "bioage over biobirth" ~ "bioage",
                               sources_crossvalid == "bioage hhh conflict" ~ source,
                               sources_crossvalid == "hhh over biobirth" ~ "hhh",
                               sources_crossvalid == "biobirth_mistake" ~ "hhh",
                               sources_crossvalid == "bioage_social over hhh_genetic_label" ~ "bioage",
                               TRUE ~ "need specification"
                                               )) 

did we define solutions for the problems?

testthat::expect_equal(child_parent_sources_pointers_correction %>% filter(child_match_type_minor_c == "need specification") %>% nrow(), 0)
testthat::expect_equal(child_parent_sources_pointers_correction %>% filter(source_c == "need specification") %>% nrow(), 0)

child_parent_sources_pointers_correction %>% tabyl(source, source_c) %>% adorn_title()
# child_parent_sources_pointers_correction %>% tabyl(source_c, child_match_type_minor_c) %>% adorn_title()
# child_parent_sources_pointers_correction %>% tabyl(child_match_type_minor, child_match_type_minor_c) %>% adorn_title()

the table above let’s us know that there were no conflicts.

now we replace the child_match_type_minor variable with the corrected values

child_parent_sources_pointers_corrected <- child_parent_sources_pointers_correction %>% 
   # now that we made sure, we defined rules for all conflicts, we can overwrite the old variables
   select(-merge_conflict, -child_match_type_minor, -source) %>% 
   select(child_match_type_minor = child_match_type_minor_c,
          source = source_c,
          everything()) %>% 
   select(-starts_with("select_")) %>% 
   # get rid of the now double child_match_type_minors
   tidylog::distinct(child_id, pointer_id, source, child_match_type_minor, .keep_all = T) %>% 
   # set preferred order for selecting genetic parents based on child_match_type_minor
   mutate(select_source = case_when(source == "biobirth" ~ 1,
                                    source == "bioage" ~ 2,
                                    source == "kid" ~ 3,
                                    source == "hhh" ~ 4,
                                    source == "bioparen" ~ 5,
                                    TRUE ~ 0),
      select_match = case_when(child_match_type_minor == "unknown" ~ 4,
                               child_match_type_minor == "social_known" ~ 3,
                               child_match_type_minor == "genetic_label" ~ 2,
                               child_match_type_minor == "genetic_known" ~ 1,
                                 TRUE ~ 0
                                     )) %>% 
   unite(select_match_source, select_match, select_source, remove = F, sep = ".") %>% 
   unite(select_match_source_l, child_match_type_minor, source, remove = F, sep = ".") %>% 
   arrange(child_id, pointer_id, select_match, select_source) 
# 0 would mean there are unspecified sources or rel-types which is not supposed to happen
testthat::expect_equal(child_parent_sources_pointers_corrected %>% filter(select_source == 0) %>% nrow(), 0)
testthat::expect_equal(child_parent_sources_pointers_corrected %>% filter(select_match == 0) %>% nrow(), 0)
child_parent_sources_pointers_corrected %>% drop_na(sources_crossvalid) %>% distinct(child_id, .keep_all = T) %>% tabyl(sources_crossvalid) %>% adorn_totals

step5: identify best match

goal: now that we have pointer_id’s from multiple sources, we need to decide which id’s are the best.

genetic parents: For each child we want the genetic_known parents, if available, and if not, we want the “genetic_label” parents and if that is not possible we want the “identified but unknown” parents. But we also just want to pick one, even if the same pointer and match type is found in two source datasets.

The problem is that if there are multiple sources with the same ID, then one child_parent pair still gets multiple values. but we just want to keep the best source. (e.g. biobirth and kidlong produce the same pointer_id. but one match_type_minor is “genetic_known” and the other is “unknown”. then we want to keep “genetic_known” and drop “unknown”.

How to solve this? Probably merge it piece by piece? But that only comes at a later stage

generate child_parent_base data to be populated

we start out with a dataset that includes all children we want to select parents for, each child has two rows, one for the father pointer, one for the mother.

child_parent_base <- pot_children %>% 
    # leave out source so that its just up to three mothers and fathers per child
   tidylog::distinct(child_id, pointer, pointer_nr)

export(child_parent_base, file = here::here("data/child_parent_base.rds"))

social child-parent

parent datasets: children, child_parent_sources_pointers_corrected

keys: child_id, pointer, pointer_nr

social_child_parents <- child_parent_sources_pointers_corrected %>% 
   # for now we dont want grandparents
   filter(child_match_rel == "child") %>% 
   # the arrange ensures that we only keep our "best" match (e.g. genetic_known over genetic_label)
   arrange(child_id, pointer_id, select_match, select_source) %>%
   tidylog::distinct(child_id, pointer_id, .keep_all = T) %>% 
   # only keep social 
   filter(child_match_type_minor == "social_known") 


# merge selected genetic parents to children dataset
child_social_parents_multi <- child_parent_base %>% 
   tidylog::left_join(social_child_parents %>% 
                        # we assume that there might be some double parents
                        group_by(child_id, pointer) %>%
                        mutate(pointer_nr = row_number()),
                      by = c("child_id", "pointer", "pointer_nr")) %>% 
   mutate(child_match_type_major = "social") 

quality check multi social parent pointers

quick check: how many parents found?

child_social_parents_multi %>%
   drop_na(pointer_id) %>%
   tidylog::distinct(child_id, pointer_id, .keep_all = T) %>% 
   tabyl(child_match_type_minor, pointer_nr, pointer)
## $father
##  child_match_type_minor 1
##            social_known 8
## 
## $mother
##  child_match_type_minor 1
##            social_known 5
child_social_parents_selected <- child_social_parents_multi %>% 
   drop_na(pointer_id) %>% 
   select(child_id, pointer, pointer_id, child_match_type_major, child_match_type_minor, child_match_rel, source, sources_crossvalid)

save data

export(child_social_parents_selected, file = here::here("data/child_social_parents_selected.rds"))

check if key uniquely identify al rows

testthat::expect_equal(nrow(child_social_parents_multi), nrow(child_social_parents_multi %>% distinct(child_id, pointer, pointer_nr)))

check it out

child_social_parents_selected %>% tabyl(child_match_type_minor, pointer)

genetic child-parent

parent datasets: children, child_parent_sources_pointers_corrected

keys: child_id, pointer, pointer_nr

genetic_child_parents <- child_parent_sources_pointers_corrected %>% 
   # for now we dont want grandparents
   filter(child_match_rel == "child") %>% 
   # the arrange ensures that we only keep our "best" match (e.g. genetic_known over genetic_label)
   arrange(child_id, pointer_id, select_match, select_source) %>%
   tidylog::distinct(child_id, pointer_id, .keep_all = T)  %>% 
   # only keep genetics and unknown (but only those without a conflict? for now all)
   tidylog::filter(child_match_type_minor != "social_known")

# merge selected genetic parents to children dataset
child_genetic_parents_multi <- child_parent_base %>% 
   tidylog::left_join(genetic_child_parents %>% 
                        # even though it shouldnt be the case we assume that there might be some double parents
                        group_by(child_id, pointer) %>%
                        mutate(pointer_nr = row_number()),
                      by = c("child_id", "pointer", "pointer_nr")) %>% 
   mutate(child_match_type_major = "genetic")

quality check multi genetic parent pointers

quick check: how many parents found?

child_genetic_parents_multi %>% 
   drop_na(pointer_id) %>% tidylog::distinct(child_id, pointer_id, .keep_all = T) %>% 
   tabyl(child_match_type_minor, pointer_nr, pointer)
## $father
##  child_match_type_minor   1 2
##           genetic_known 370 4
##           genetic_label   1 0
##                 unknown  13 2
## 
## $mother
##  child_match_type_minor   1 2
##           genetic_known 439 0
##           genetic_label   2 0
##                 unknown   9 2
multi_genetic_parents_wide1 <- child_genetic_parents_multi %>% 
   drop_na(pointer_id) %>% 
   group_by(child_id, pointer) %>% add_tally() %>% ungroup() %>% 
   filter(n > 1) %>% 
   select(child_id, pointer, pointer_id, type.source = select_match_source_l, pointer_nr) %>% 
   pivot_wider(names_from = pointer_nr, values_from = c("type.source", "pointer_id"), names_prefix = "pointer") %>% 
   select(child_id, pointer, contains("pointer"))

How many mothers, how many fathers?

multi_genetic_parents_wide1 %>% 
   distinct(child_id, pointer, .keep_all = T) %>% 
   tabyl(pointer)

mostly fathers

what are the most common combinations?

# multi_genetic_parents_wide %>% View()
multi_genetic_parents_wide1 %>% 
   count(pointer, type.source_pointer1, type.source_pointer2) %>% 
   arrange(desc(n)) 

the most conflicts stem from biobirth is “genetic_known” and kid is “unknown” (but for a different possible parent)

we do another reshape to get the separate type and source

multi_genetic_parents_wide2 <- child_genetic_parents_multi %>% 
   drop_na(pointer_id) %>% 
   group_by(child_id, pointer) %>% add_tally() %>% ungroup() %>% 
   filter(n > 1) %>% 
   select(child_id, pointer, pointer_id, type = child_match_type_minor, source, pointer_nr) %>% 
   pivot_wider(names_from = pointer_nr, values_from = c("type", "source", "pointer_id"), names_prefix = "pointer") %>% 
   select(child_id, pointer, contains("pointer"))
multi_genetic_parents_wide2 %>% tabyl(type_pointer1, type_pointer2) %>% adorn_title
# multi_genetic_parents_wide %>% View()
multi_genetic_parents_wide2 %>% 
   count(type_pointer1, type_pointer2) %>% 
   arrange(n)

resolve conflict

multigenetic_merge_conflict <- multi_genetic_parents_wide2 %>% 
   mutate(genetic_conflict = case_when(type_pointer1 == "genetic_known" & type_pointer2 == "genetic_known" ~ "multi known",
                                       type_pointer1 == "genetic_known" & type_pointer2 == "unknown" ~ "genknown over unknown",
                                       type_pointer1 == "unknown" & type_pointer2 == "genetic_known" ~ "genknown over unknown",
                                    TRUE ~ "needs description"
                                    ), 
          merge_conflict = case_when(type_pointer1 == "genetic_known" & type_pointer2 == "genetic_known" ~ NA_real_,
                                     type_pointer1 == "genetic_known" & type_pointer2 == "unknown" ~ pointer_id_pointer1,
                                     type_pointer1 == "unknown" & type_pointer2 == "genetic_known" ~ pointer_id_pointer2,
                                     TRUE ~ 0
                                     ))

make sure we resolved all issues

testthat::expect_equal(multigenetic_merge_conflict %>% filter(genetic_conflict == "need description") %>% nrow(), 0)
testthat::expect_equal(multigenetic_merge_conflict %>% filter(merge_conflict == 0) %>% nrow(), 0)

resolve conflicts and mark them

merge_conflict <- multigenetic_merge_conflict %>% 
   select(child_id, pointer, genetic_conflict, merge_conflict) %>% 
   tidylog::distinct(child_id, pointer, .keep_all  = T) 

child_genetic_parents_corrected <- child_genetic_parents_multi %>% 
   tidylog::left_join(merge_conflict, by  = c("child_id", "pointer")) %>% 
   mutate(select_bestchoice = case_when(is.na(pointer_id) ~ "drop_me",
                                        is.na(genetic_conflict) ~ "best_choice",
                                        !is.na(genetic_conflict) & pointer_id == merge_conflict ~ "best_choice",
                                        !is.na(genetic_conflict) & pointer_id != merge_conflict & !is.na(pointer_id) ~ "drop_me",
                                        genetic_conflict == "multi unknown" ~ "drop_me",
                                        genetic_conflict == "multi known" ~ "drop_me",
                                        TRUE ~ "need specification"
                                               )) %>% 
   select(-merge_conflict)
# child_genetic_parents_corrected %>% filter(select_bestchoice == "need specification") %>% View()

did we define solutions for the problems?

testthat::expect_equal(child_genetic_parents_corrected %>% filter(select_bestchoice == "need specification") %>% nrow(), 0)

now we replace the child_match_type_minor variable with the corrected values

child_genetic_parents_selected <- child_genetic_parents_corrected %>% 
   drop_na(pointer_id) %>% 
   tidylog::filter(select_bestchoice == "best_choice") %>%  
   select(child_id, pointer, pointer_id, child_match_type_major, child_match_type_minor, child_match_rel, source, sources_crossvalid, genetic_conflict)

quick check: any children who are also their parents?

testthat::expect_equal(child_genetic_parents_selected %>% filter(child_id == pointer_id) %>% nrow(), 0)

test whether we have more than one parent for any child

testthat::expect_lte(child_genetic_parents_selected %>% 
                        # the drop_na should not remove any rows
                        drop_na(pointer_id) %>% 
                        group_by(child_id, pointer_id) %>% 
                        summarize(max_parents = max(row_number())) %>% 
                        # get max number of max parents from all children
                        ungroup() %>% 
                        summarize(overall_max = max(max_parents)) %>% pull(overall_max), 1)

save data

export(child_genetic_parents_selected, file = here::here("data/child_genetic_parents_selected.rds"))

children_all_source_parents

testthat::expect_equal(child_genetic_parents_selected %>% semi_join(child_social_parents_selected, by = c("child_id", "pointer_id")) %>% nrow(), 0)
# child_parent_sources_pointers_corrected %>% semi_join(child_social_parents_selected, by = c("child_id", "pointer_id")) %>% tabyl(child_match_type_minor)

keys: child_id, pointer_id

# child_social_parents <- import(file = here::here("data/child_social_parents.rds"))
# child_genetic_parents_multi <- import(file = here::here("data/child_genetic_parents_multi.rds"))

children_all_parents_selected <- child_genetic_parents_selected %>% bind_rows(child_social_parents_selected) %>% 
   # this distinct should not remove any rows
   tidylog::distinct(child_id, pointer_id, .keep_all = T) %>% 
   add_count(child_id, pointer, name = "pointer_nr") %>% 
   select(child_id, pointer, pointer_nr, pointer_id, child_match_type_major, child_match_type_minor, everything()) %>% 
   arrange(child_id, pointer, child_match_type_major) 

children_all_source_parents <- child_parent_base %>% 
   left_join(children_all_parents_selected, by = c("child_id","pointer", "pointer_nr"))
export(children_all_source_parents, file = here::here("data/children_all_source_parents.rds"))

check out the children with more than one parent

children_parents_wide <- children_all_source_parents %>% 
   drop_na(pointer_id) %>% 
   group_by(child_id, pointer) %>% 
   mutate(pointer_nr = row_number()) %>% 
   add_tally() %>% ungroup() %>% 
   filter(n > 1) %>% 
   select(child_id, pointer, pointer_id, type.minor = child_match_type_minor, pointer_nr) %>% 
   pivot_wider(names_from = pointer_nr, values_from = c("type.minor", "pointer_id"), names_prefix = "pointer") %>% 
   select(child_id, pointer, contains("pointer"))

children_parents_wide %>% tabyl(type.minor_pointer1, type.minor_pointer2, pointer) %>% adorn_title
## $father
##                      type.minor_pointer2
##  type.minor_pointer1        social_known
##              unknown                   1

for those children where we have multiple pointers, it is only genetic and social pointers which is possible and no conflict

children_all_source_parents %>% drop_na(pointer_id) %>% group_by(pointer, child_match_type_major) %>% skim
Data summary
Name Piped data
Number of rows 845
Number of columns 10
_______________________
Column type frequency:
character 5
numeric 3
________________________
Group variables pointer, child_match_type_major

Variable type: character

skim_variable pointer child_match_type_major n_missing complete_rate min max empty n_unique whitespace
child_match_type_minor father genetic 0 1.00 7 13 0 3 0
child_match_type_minor father social 0 1.00 12 12 0 1 0
child_match_type_minor mother genetic 0 1.00 7 13 0 3 0
child_match_type_minor mother social 0 1.00 12 12 0 1 0
child_match_rel father genetic 0 1.00 5 5 0 1 0
child_match_rel father social 0 1.00 5 5 0 1 0
child_match_rel mother genetic 0 1.00 5 5 0 1 0
child_match_rel mother social 0 1.00 5 5 0 1 0
source father genetic 0 1.00 3 8 0 4 0
source father social 0 1.00 3 3 0 1 0
source mother genetic 0 1.00 3 8 0 4 0
source mother social 0 1.00 3 3 0 1 0
sources_crossvalid father genetic 382 0.00 NA NA 0 0 0
sources_crossvalid father social 8 0.00 NA NA 0 0 0
sources_crossvalid mother genetic 450 0.00 NA NA 0 0 0
sources_crossvalid mother social 5 0.00 NA NA 0 0 0
genetic_conflict father genetic 378 0.01 21 21 0 1 0
genetic_conflict father social 8 0.00 NA NA 0 0 0
genetic_conflict mother genetic 448 0.00 21 21 0 1 0
genetic_conflict mother social 5 0.00 NA NA 0 0 0

Variable type: numeric

skim_variable pointer child_match_type_major n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
child_id father genetic 0 1 21441213.6 16682436.61 1161602 8709278 9556303 40623305 41723904 ▇▃▁▁▇
child_id father social 0 1 27213716.4 19283109.73 1342705 7612029 40561503 41037504 41677504 ▃▂▁▁▇
child_id mother genetic 0 1 22301160.4 16713312.84 1161602 8767729 9608704 40874804 41723904 ▇▂▁▁▇
child_id mother social 0 1 13700563.6 15183249.61 1395204 8618804 8645003 9633305 40210502 ▇▂▁▁▂
pointer_nr father genetic 0 1 1.0 0.05 1 1 1 1 2 ▇▁▁▁▁
pointer_nr father social 0 1 1.1 0.35 1 1 1 1 2 ▇▁▁▁▁
pointer_nr mother genetic 0 1 1.0 0.00 1 1 1 1 1 ▁▁▇▁▁
pointer_nr mother social 0 1 1.0 0.00 1 1 1 1 1 ▁▁▇▁▁
pointer_id father genetic 0 1 21439090.6 16681114.46 1161603 8701052 9573351 40623301 41723901 ▇▃▁▁▇
pointer_id father social 0 1 27403439.1 18993524.99 2098804 7802452 40561501 41037501 41677501 ▅▁▁▁▇
pointer_id mother genetic 0 1 22362849.2 16735211.85 1233702 8766327 9620701 40882151 41723902 ▇▂▁▁▇
pointer_id mother social 0 1 13700561.2 15183250.27 1395202 8618801 8645001 9633301 40210501 ▇▂▁▁▂

Explore

children_all_source_parents %>% tabyl(pointer, sources_crossvalid)

there are no conflicts that needed merging

children_all_source_parents %>% 
   drop_na(pointer_id) %>% 
   tabyl(child_match_type_minor, pointer, child_match_type_major)
## $genetic
##  child_match_type_minor father mother
##           genetic_known    370    439
##           genetic_label      1      2
##            social_known      0      0
##                 unknown     11      9
## 
## $social
##  child_match_type_minor father mother
##           genetic_known      0      0
##           genetic_label      0      0
##            social_known      8      5
##                 unknown      0      0
children_all_source_parents %>% 
   drop_na(pointer_id) %>%
   tabyl(child_match_type_minor, source, pointer)
## $father
##  child_match_type_minor biobirth bioparen hhh kid
##           genetic_known      329       27  14   0
##           genetic_label        0        0   1   0
##            social_known        0        0   8   0
##                 unknown        0        0   0  11
## 
## $mother
##  child_match_type_minor biobirth bioparen hhh kid
##           genetic_known      414       16   9   0
##           genetic_label        0        0   2   0
##            social_known        0        0   5   0
##                 unknown        0        0   0   9

Number of children

For how many children did we not identify any parent?

  • We identified either a father or a mother for all children (which does not mean that we identified all possible parents)
  • for n = 25 children, we identified a father, but no mother, and for n = 84we identified a mother but no father
  • for n = 718 children, there are identifiers for both parents available
children_all_source_parents %>% 
      tidylog::distinct(child_id, pointer, pointer_id, .keep_all = T) %>% 
      add_count(child_id, "parent_avail" = !is.na(pointer_id)) %>%
      tidylog::distinct(child_id, pointer, .keep_all = T) %>% 
      tabyl(pointer, parent_avail, n) %>% adorn_title()
## $`1`
##          parent_avail     
##  pointer        FALSE TRUE
##   father            0   16
##   mother            0   82
## 
## $`2`
##          parent_avail     
##  pointer        FALSE TRUE
##   father         2153  372
##   mother         2086  372
## 
## $`3`
##          parent_avail     
##  pointer        FALSE TRUE
##   father            0    0
##   mother            0    1

Number of mothers

How many genetic parents were identified per child?

  • there is no mother identified for n = 2104 potential children
child_genetic_parent_summary <- child_parent_base %>% 
   # drop the pointer_nr 1:3
   tidylog::distinct(child_id, pointer) %>% 
   left_join(child_genetic_parents_selected, by = c("child_id", "pointer")) %>% 
   # this distince should not remove any rown
   tidylog::distinct(child_id, pointer, pointer_id, .keep_all = T) %>% 
   group_by(child_id, pointer) %>% 
   mutate(n_missing = sum(is.na(pointer_id)),
          n_rows = max(row_number()),
          n_parents_avail = n_rows - n_missing) %>% 
   ungroup()

child_genetic_parent_summary %>%
   tabyl(pointer, n_parents_avail) %>% 
   adorn_totals("col") %>% 
   adorn_title

Who are the children with no identified parent?

child_genetic_parent_summary %>%   
      filter(n_parents_avail == 0) %>% 
      tidylog::left_join(pbrutto_long %>% select(syear, pid, stell_l), by = c("child_id" = "pid")) %>%
      tidylog::distinct(child_id, pointer, .keep_all = T) %>% 
      tabyl(stell_l, pointer, show_missing_levels = F)

Let’s see wether we can identify parents via the hh-members and their relations with the hhh head.