Review: until now we:

  • have identified possible parents and their rel. to the child through various parental pointers, and harmonized them
  • have identified the relationship of iGene-members to each member in their households.

Goal: Now, for each igene-member, we want to add all identified relations of the igene-members as potential children to parents.

Preparation: read data needed in the script

# data of child-parent dictionaries
children_all_source_parents <- import(here::here("data/children_all_source_parents.rds"))

# data of iGENE Members
ppfad_igene_long <- import(here::here("data/ppfad_igene_long_v36.rds"))
ppfad_igene_wide <- ppfad_igene_long %>% distinct(pid, .keep_all = T)
child_parent_base <- import(here::here("data/child_parent_base.rds"))
pid_igene_sample <- import(here::here("data/pid_igene_sample.rds")) %>% tibble(pid = .)

# data of children and their relation to other hh members
igene_hhmember_relations <- import(here::here("data/igene_hhmember_relations.rds"))

# metadata
ppfad_long_cl <- import(here::here("data/ppfad_long_cl_v36.rds"))

step1: add parent pointers to children over time

goal: find out which relation children have to all identified parents

child_source_parents

first, we join the information from the sources to the igene-members in the sample over time

parent datasets: ppfad_igene_long, children_all_source_parents keys: syear, child_id, pointer, pointer_id

# note: social parents can change over time so the join might distort that n
child_source_parents <- ppfad_igene_long %>% 
      select(syear, child_id = pid, child_hid = hid, child_cid = cid) %>%
      tidylog::left_join(children_all_source_parents %>% 
                               drop_na(pointer_id) %>%
                               select(child_id, pointer_id, pointer, 
                                      child_match_type_major, 
                                      child_match_type_minor, 
                                      pointer_source = source, 
                                      child_match_rel), 
                      by = c("child_id")) %>% 
   # this should not remove any rows  
   # add_count(syear, pid, pointer_id) %>% 
   tidylog::distinct(syear, child_id, pointer_id, .keep_all = T)

child_outhh_source_parents

now we use the information on the members of the igene-members households at every survey year to subset those parent_pointers identified before who are not in the household at a given survey year.

parent datasets: child_source_parents, igene_hhmember_relations keys: syear, child_id, pointer, pointer_id

# keys: syear, pid, pointer_id
child_outhh_source_parents <- child_source_parents %>% 
      # this drop_na is important
      drop_na(pointer_id) %>% 
      # drops the rows where the pointer from the sources has a match with one of the persons that live with a child in a certain year
      tidylog::anti_join(igene_hhmember_relations, by = c("syear", "child_id" = "pid", "pointer_id" = "match_id")) %>% 
      # joins hid's  for the survey year in question
      tidylog::left_join(ppfad_long_cl %>% select(syear, pointer_id = pid, pointer_hid = hid), 
                by = c("syear", "pointer_id")) 
child_outhh_source_parents %>% count(is.na(pointer_hid))

we cannot find hid’s for n = 1318 parents. to find out why we will look up the last identified hid and the survey year and netto of that year

now we try to locate if the parents had an hid at any point in their time in the soep and what their position was

parent datasets: child_source_parents,

child_outhh_source_parents_long <- child_outhh_source_parents %>% 
      tidylog::distinct(child_id, pointer_id, .keep_all = T) %>% 
      select(-syear, -pointer_hid) %>% 
      tidylog::left_join(ppfad_long_cl %>% select(syear, pointer_id = pid, pointer_hid = hid), 
                by = c("pointer_id"))

child_outhh_source_parents_long %>% count(is.na(pointer_hid))
summary_years_inhh <- child_outhh_source_parents_long %>% 
      mutate(pointer_inhh = ifelse(child_hid == pointer_hid, TRUE, FALSE)) %>% 
      group_by(child_id, pointer_id) %>%
      summarize(years_inhh = sum(pointer_inhh)) %>% 
      ungroup()

summary_lastyear_inhh <- child_outhh_source_parents_long %>%
      filter(child_hid == pointer_hid) %>% 
      group_by(child_id, pointer_id) %>%
      summarize(pointer_last_inhh = max(syear)) %>% 
      ungroup()

summary_lastyear_anyhh <- child_outhh_source_parents_long %>%
      # drop_na(pointer_hid) %>% 
      group_by(child_id, pointer_id) %>%
      summarize(pointer_last_anyhh = max(syear)) %>% 
      ungroup()

check out parents who were in the same hh for some time

child_outhh_source_parents_info <- child_outhh_source_parents %>%  
      # add the information about years in hh
      tidylog::left_join(summary_years_inhh, by = c("child_id", "pointer_id")) %>% 
      # add the information about the last year in hh
      tidylog::left_join(summary_lastyear_inhh, by = c("child_id", "pointer_id")) %>%
      # add information about the last year in any hh
      tidylog::left_join(summary_lastyear_anyhh, by = c("child_id", "pointer_id")) %>% 
      tidylog::left_join(ppfad_long_cl %>% select(syear, pointer_id = pid, pointer_netto_lastinhh = netto_l), 
                by = c("pointer_last_inhh" = "syear", "pointer_id")) %>% 
      tidylog::left_join(ppfad_long_cl %>% select(syear, pointer_id = pid, pointer_hid_lastanyhh = hid, pointer_netto_lastanyhh = netto_l), 
                by = c("pointer_last_anyhh" = "syear", "pointer_id")) %>% 
      tidylog::distinct(syear, child_id, pointer_id, .keep_all = T) %>% 
      mutate(pointer_inhh = case_when(child_hid == pointer_hid ~ TRUE, 
                              child_hid != pointer_hid & !is.na(pointer_hid) ~ FALSE,
                              is.na(pointer_hid) ~ FALSE,
                              TRUE ~ NA) ,
             ever_inhh = case_when(years_inhh == 0 ~ FALSE,
                                   years_inhh > 0 ~ TRUE,
                                   TRUE ~ NA))

# child_outhh_source_parents_info %>% View()
child_outhh_source_parents_info %>% skim(years_inhh)
Data summary
Name Piped data
Number of rows 1483
Number of columns 19
_______________________
Column type frequency:
numeric 1
________________________
Group variables

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
years_inhh 1107 0.25 0.63 1.9 0 0 0 0 10 ▇▁▁▁▁
child_outhh_source_parents_info %>% 
      tidylog::distinct(child_id, pointer_id, .keep_all = T) %>% 
      tabyl(ever_inhh, pointer_inhh) %>% adorn_title()
child_outhh_source_parents_info %>% 
      distinct(child_id, pointer_id, .keep_all = T) %>% 
      tabyl(pointer_netto_lastinhh, pointer, show_missing_levels = F) %>% adorn_title()
child_outhh_source_parents_info %>% 
      distinct(child_id, pointer_id, .keep_all = T) %>%  
      tabyl(child_match_type_minor, pointer, pointer_inhh)
## $`FALSE`
##  child_match_type_minor father mother
##           genetic_known    113    124
##           genetic_label      0      2
##            social_known      5      3
##                 unknown      2      0
child_outhh_source_parents %>% 
      distinct(child_id, pointer_id, .keep_all = T) %>%
      tabyl(pointer_source, pointer)
child_outhh_source_parents %>% names
##  [1] "syear"                  "child_id"               "child_hid"              "child_cid"              "pointer_id"             "pointer"               
##  [7] "child_match_type_major" "child_match_type_minor" "pointer_source"         "child_match_rel"        "pointer_hid"

child_inhh_source_parents

now we turn to the source_parent_pointers who match with one person inside the childs household for a given year

child_inhh_source_parents <- child_source_parents %>% 
   # subset filter out the parent_pointers who are not in hh
   tidylog::anti_join(child_outhh_source_parents, by =  c("syear", "child_id", "pointer_id")) %>% 
   select(syear, pointer, child_id, child_hid, pointer_id, 
          child_match_type_major, child_match_type_minor, pointer_source) %>% 
   # join hh_member_id's
   tidylog::left_join(igene_hhmember_relations %>% select(syear, 
                                                          child_id = pid, 
                                                          child_hid = hid, 
                                                          match_id, 
                                                          child_match_rel = igene_match_rel, 
                                                          match_gender, 
                                                          child_match_type_minor = igene_match_type_minor, 
                                                          child_match_type_major = igene_match_type_major, 
                                                          match_hh_position5,
                                                          child_hh_position5 = igene_hh_position5, 
                                                          child_stell_l = igene_stell_l, 
                                                          match_stell_l), 
                      by = c("syear", "child_id", "child_hid"), suffix = c(".source", ".hh")) %>% 
   # the reason that pointer_id and match  id are not included in the join is that we want to add the child_hh members for  those  children who do not have  a parent pointer yet in order to find more parents via the hh-composition
   tidylog::filter(pointer_id == match_id | is.na(pointer_id)) %>% 
   mutate(pointer_inhh = TRUE)

filter those where pointer_id is missing

child_inhh_source_parents %>% filter(is.na(pointer_id)) %>% count(syear) 
#child_inhh_source_parents %>% filter(pointer_id == match_id | is.na(pointer_id)) %>% View()
# # child_inhh_source_parents %>% count(syear, child_id, pointer_id,  test = pointer_id == match_id) %>% pivot_wider(names_from = test, values_from = n)
# #    count(pointer, child_match_type_minor.source) %>% pivot_wider(names_from = "pointer", values_from = n)
#    
# child_inhh_source_parents %>% 
#    group_by(syear, child_id, pointer_id) %>% 
#    summarize(n_pointer_match = sum(match_id == pointer_id)) %>% tabyl(n_pointer_match)

do the matched parent pointers from the source-datasets and the match_id’s from the hh-members agree on the child_match_type?

child_inhh_source_parents %>% 
   drop_na(pointer_id) %>% 
   filter(!str_detect(child_match_type_major.hh, "missing")) %>% 
   distinct(child_id, match_id, .keep_all = T) %>% 
   tabyl(child_match_type_minor.source, child_match_type_major.hh, pointer) %>%
   adorn_totals %>% adorn_title() 
## $father
##                                child_match_type_major.hh       
##  child_match_type_minor.source                   genetic social
##                  genetic_known                       358     12
##                  genetic_label                         1      0
##                   social_known                         0      8
##                        unknown                        11      0
##                          Total                       370     20
## 
## $mother
##                                child_match_type_major.hh       
##  child_match_type_minor.source                   genetic social
##                  genetic_known                       432      7
##                  genetic_label                         2      0
##                   social_known                         1      4
##                        unknown                         9      0
##                          Total                       444     11
child_inhh_source_parents %>% 
   drop_na(pointer_id) %>% 
   filter(!str_detect(child_match_type_major.hh, "missing")) %>% 
   distinct(child_id, match_id, .keep_all = T) %>% 
   tabyl(child_match_type_minor.source, child_match_type_major.hh) %>% 
   adorn_title(row = "Relation based on dictionary",
               col = "Relation based on stell variable") 
  • there are some conflicts, but in most cases the classificaitions from both datasets agree

lets see for which igene-members there are new possible parents to be identified

child_inhh_source_parents %>% 
   filter(is.na(child_match_type_major.source)) %>%
   tidylog::distinct(child_id, match_id, .keep_all = T) %>% 
   count(child_match_rel, child_match_type_major.hh) %>% arrange(desc(n))

cross_validation

  • now we will use both child_match definitions to update unknown cases and resolve conflicts
# we bind those together later
# this filter filters out all child_pointer pairs where the pointer is not in the household, as well as child_hhmatches that are not in the parents from the sources
child_inhh_parents_validation <- child_inhh_source_parents %>% 
      mutate(source_pointer_hh_validation = case_when(
                  # conflicts
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "social_known" ~ paste0("conflict: ", pointer_source, " mistake possible"),
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_known" ~ "conflict: hh_member mistake possible",
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_stepchild" ~ "conflict: hh_member mistake possible",

                  # validating hh members via source
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "genetic_stepchild" ~ "confirm: stepchild is genetic",
                  child_match_type_minor.source  == "genetic_label" & child_match_type_minor.hh == "genetic_stepchild" ~ "confirm: stepchild is genetic",
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "social_multigen" ~ "confirm: child is genetic",
                  child_match_type_minor.source  == "unknown" & child_match_type_minor.hh == "social_multigen" ~ "relation stays unknown",
                  child_match_type_minor.hh == "missing match_stell" ~ "keep source pointer, missing in hh",
                  
                  # validating found partners
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "genetic_partner" ~ "confirm: partner is genetic",
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_partner" ~ "update: partner is social",
                  
                  # finding previously missing parents
                  child_match_type_minor.source  == "unknown" & child_match_type_major.hh == "social" ~ "new: found social parent in hh",
                  child_match_type_minor.source  == "unknown" & child_match_type_major.hh == "genetic" ~ "new: found genetic parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_known" ~ "new: found genetic parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_label" ~ "new: found genetic parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_stepchild" ~ "new: found genetic parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_multigen" ~ "new: found genetic parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "social_multigen" ~ "new: found genetic parent in hh", 
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_partner" ~ "new: found partner parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "social_known" ~ "new: found social parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel != "child" & !is.na(child_match_rel) ~ "new: found other pot. caretaker in hh",
                  
                  # label and adoptive children
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_label" ~ "update: label seems adoptive child",
                  child_match_type_minor.source  == "genetic_label" & child_match_type_minor.hh == "social_known" ~ "update: label seems adoptive child",
                  
                  # agreements
                  child_match_type_major.source  == "genetic" & child_match_type_major.hh == "genetic" ~ "confirm: child is genetic",
                  child_match_type_major.source  == "social" & child_match_type_major.hh == "social" ~ "confirm: child is social",
                  is.na(child_match_type_minor.hh) ~ "keep source pointer, missing in hh",
                  child_match_type_minor.hh == "missing_match_stell" ~ "keep source pointer, missing in hh",
                  TRUE ~ "unresolved combi"),
             merge_conflict = case_when(
                  # conflicts
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "social_known" ~ "unknown",
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_known" ~ "unknown",
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_stepchild" ~ "unknown",

                  # validating hh members via source
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "genetic_stepchild" ~ child_match_type_minor.source,
                  child_match_type_minor.source  == "genetic_label" & child_match_type_minor.hh == "genetic_stepchild" ~ child_match_type_minor.source,
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "social_multigen" ~ child_match_type_minor.source,
                  child_match_type_minor.source  == "unknown" & child_match_type_minor.hh == "social_multigen" ~ child_match_type_minor.source,
                  child_match_type_minor.hh == "missing match_stell" ~ child_match_type_minor.source,

                  # validating found partners
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "genetic_partner" ~ child_match_type_minor.source,
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_partner" ~ child_match_type_minor.source,
                  
                  # finding previously missing parents
                  child_match_type_minor.source  == "unknown" & child_match_type_major.hh == "social" ~ child_match_type_minor.hh,
                  child_match_type_minor.source  == "unknown" & child_match_type_major.hh == "genetic" ~ child_match_type_minor.hh,
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & !is.na(child_match_type_minor.hh) ~ child_match_type_minor.hh,
                  is.na(child_match_type_minor.source) & child_match_rel != "child" & !is.na(child_match_rel) ~ child_match_type_minor.hh,
                  
                  # label and adoptive children
                  child_match_type_minor.source  == "genetic_label" & child_match_type_minor.hh == "social_known" ~ child_match_type_minor.hh,
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_label" ~ child_match_type_minor.source,
                  
                  # agreements
                  child_match_type_major.source  == "genetic" & child_match_type_major.hh == "genetic" ~ child_match_type_minor.source,
                  child_match_type_major.source  == "social" & child_match_type_major.hh == "social" ~ child_match_type_minor.source,
                  is.na(child_match_type_minor.hh) ~ child_match_type_minor.source,
                  child_match_type_minor.hh == "missing_match_stell" ~ child_match_type_minor.source,
                  TRUE ~ "unresolved combi"),
             merge_source = case_when(
                  # conflicts
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "social_known" ~ "hh_members",
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_known" ~ pointer_source,
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_stepchild" ~ pointer_source,

                  # validating hh members via source
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "genetic_stepchild" ~ pointer_source,
                  child_match_type_minor.source  == "genetic_label" & child_match_type_minor.hh == "genetic_stepchild" ~ pointer_source,
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "social_multigen" ~ pointer_source,
                  child_match_type_minor.source  == "unknown" & child_match_type_minor.hh == "social_multigen" ~ pointer_source,
                  child_match_type_minor.hh == "missing match_stell" ~ pointer_source,
                  
                  # validating found partners
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "genetic_partner" ~ pointer_source,
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_partner" ~ pointer_source,
                  
                  # finding previously missing parents
                  child_match_type_minor.source  == "unknown" & child_match_type_major.hh == "social" ~ "hh_members",
                  child_match_type_minor.source  == "unknown" & child_match_type_major.hh == "genetic" ~ "hh_members",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & !is.na(child_match_type_minor.hh) ~ "hh_members",
                  is.na(child_match_type_minor.source) & child_match_rel != "child" & !is.na(child_match_rel) ~ "hh_members",
                  
                  # label and adoptive children
                  child_match_type_minor.source  == "genetic_label" & child_match_type_minor.hh == "social_known" ~ "hh_members",
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_label" ~ pointer_source,
                  
                  # agreements
                  child_match_type_major.source  == "genetic" & child_match_type_major.hh == "genetic" ~ pointer_source,
                  child_match_type_major.source  == "social" & child_match_type_major.hh == "social" ~ pointer_source,
                  
                  is.na(child_match_type_minor.hh) ~ pointer_source,
                  child_match_type_minor.hh == "missing_match_stell" ~ pointer_source,
                  TRUE ~ "unresolved combi"),
             merge_pointer = case_when(is.na(pointer) & match_gender == "female" ~ "mother",
                                       is.na(pointer) & match_gender == "male" ~ "father",
                                       is.na(match_gender) & !is.na(pointer) ~ pointer,
                                       # there are four cases where the pointer for a female match in the kidlong source was labelled father (see next chunk for explanation)
                                       # match == "female" & pointer == "father" ~ match_gender, 
                                       # pointer != match & !is.na(pointer) & !is.na(match) ~ "gender problem",
                                       !is.na(pointer) & !is.na(match_gender) ~ pointer,
                                       is.na(pointer) & is.na(match_gender) ~ NA_character_,
                                       TRUE ~ "unresolved combi"))
testthat::expect_equal(child_inhh_parents_validation %>% filter(merge_conflict == "unresolved combi") %>% nrow(), 0) 
testthat::expect_equal(child_inhh_parents_validation %>% filter(merge_source == "unresolved combi") %>% nrow(), 0) 
testthat::expect_equal(child_inhh_parents_validation %>% filter(source_pointer_hh_validation == "unresolved combi") %>% nrow(), 0) 
testthat::expect_equal(child_inhh_parents_validation %>% filter(merge_pointer == "unresolved combi") %>% nrow(), 0) 
testthat::expect_equal(child_inhh_parents_validation %>% filter(merge_pointer == "gender problem") %>% nrow(), 0)

# child_inhh_parents_validation %>% filter(merge_conflict == "unresolved combi") %>% count(source_pointer_hh_validation, child_match_type_minor.source, child_match_type_minor.hh, source) %>% arrange(n)

# child_inhh_parents_validation %>% filter(merge_source == "unresolved combi") %>% tabyl(child_match_type_minor.source, child_match_type_minor.hh)
# child_inhh_parents_validation %>% filter(merge_pointer == "gender problem") %>% View()
# there are four cases where the pointer for a female match in the kidlong source was labelled father, even though the person is female. this is due to the coding of father_id = partner of mother and the assumption in this coding, that partners of mothers must be male (aka fathers)

let’s see

What are the frequencies for reasons and the source from which we updated the child_match_type_minor?

child_inhh_parents_validation %>% 
      distinct(child_id, pointer_id, .keep_all = T) %>% 
      count(source_pointer_hh_validation, merge_source) %>% arrange(desc(n))
# child_inhh_parents_validation %>% tabyl(source_pointer_hh_validation, merge_source, show_missing_levels = F) %>% adorn_title()
child_inhh_parents_validation %>% 
   distinct(child_id, pointer_id, .keep_all = T) %>% 
   count(source_pointer_hh_validation, merge_pointer) %>% pivot_wider(names_from = merge_pointer, values_from = n)

What are the frequencies for reasons and the child_type_minor which we updated?

child_inhh_parents_validation %>% 
   filter(!is.na(pointer_source)) %>% 
   distinct(child_id, pointer_id, .keep_all = T) %>% 
   count(merge_pointer, source_pointer_hh_validation, merge_conflict) %>% 
   pivot_wider(names_from = merge_pointer, values_from = n) %>% head(15)

child_inhh_parents_validated

child_inhh_parents_validated <- child_inhh_parents_validation %>% 
      select(-child_match_type_major.source, -child_match_type_minor.source, -child_match_type_minor.hh, -child_match_type_major.hh, -pointer_source, -pointer) %>% 
      select(pointer_source = merge_source, child_match_type_minor = merge_conflict, pointer = merge_pointer, everything()) %>% 
      mutate(child_match_type_major = case_when(child_match_type_minor == "genetic_known" ~ "genetic",
                                                child_match_type_minor == "unknown" ~ "genetic",
                                                child_match_type_minor == "genetic_label" ~ "genetic",
                                                child_match_type_minor == "genetic_stepchild" ~ "genetic",
                                                child_match_type_minor == "genetic_partner" ~ "genetic",
                                                child_match_type_minor == "genetic_multigen" ~ "genetic",
                                                child_match_type_minor == "social_multigen" ~ "genetic",
                                                child_match_type_minor == "social_known" ~ "social",
                                                child_match_type_minor == "missing_match_stell" ~ NA_character_,
                                                is.na(child_match_type_minor) ~ NA_character_,
                                                TRUE ~ "needs specification"
                                                ),
             pointer_hid = child_hid) %>% 
      rename_at(vars(contains("match_"), -match_id), list(~str_replace(., "match_", "pointer_"))) %>% 
      select(syear, child_id, child_hid, pointer_hid,
             pointer, pointer_source, source_pointer_hh_validation, child_stell_l, pointer_stell_l, pointer_hh_position5, child_hh_position5,  pointer_inhh,
             pointer_id = match_id, 
             child_pointer_rel,
             child_pointer_type_major, 
             child_pointer_type_minor
             )
testthat::expect_equal(child_inhh_parents_validated %>% filter(child_pointer_type_major == "needs specification") %>% nrow(), 0)
# child_inhh_parents_validated %>% filter(child_match_type_major == "needs specification") %>% tabyl(child_match_type_minor)
child_inhh_parents_validated %>% 
   distinct(child_id, pointer_id, .keep_all = T) %>% 
   filter(!is.na(pointer)) %>% 
   count(pointer, child_pointer_rel, child_pointer_type_major) %>% pivot_wider(names_from = pointer, values_from = n)

lets add the other, previously filtered out cases

children_all_parents <- child_inhh_parents_validated %>% 
   # select important variables
   select(syear, child_id, child_hid, pointer, pointer_id, pointer_hid, pointer_source, source_pointer_hh_validation, pointer_inhh, child_pointer_rel, child_pointer_type_major, child_pointer_type_minor, pointer_hh_position5, child_hh_position5) %>% 
   bind_rows(child_outhh_source_parents_info %>% 
                select(syear, child_id, child_hid, pointer, pointer_id, pointer_hid, pointer_source,
                       pointer_inhh, pointer_last_inhh, pointer_netto_lastinhh, pointer_last_anyhh, pointer_netto_lastanyhh, pointer_hid_lastanyhh,
                       child_pointer_rel = child_match_rel, 
                       child_pointer_type_major = child_match_type_major, 
                       child_pointer_type_minor = child_match_type_minor)
                ) %>% 
   arrange(syear, child_id, pointer, pointer_id)
children_all_parents %>% 
   distinct(child_id, pointer_id, .keep_all = T) %>% 
   tabyl(source_pointer_hh_validation, child_pointer_type_minor)
export(children_all_parents, file = here::here("data/children_all_parents_v36.rds"))
# children_all_parents %>%
#    distinct(child_id, pointer_id, .keep_all = T) %>% 
#    tabyl(child_pointer_type_major, inhh, pointer) %>% adorn_title()

options("tidylog.display" = list())

children_all_parents %>%  
   distinct(child_id, pointer_id, .keep_all = T) %>%
   filter(child_pointer_rel == "child") %>% 
   count(child_pointer_type_major, child_pointer_rel, pointer_inhh, pointer) %>% pivot_wider(names_from = pointer, values_from = n) %>%
   arrange(child_pointer_type_major, desc(pointer_inhh))
# children_all_parents %>%
#    distinct(child_id, pointer_id, .keep_all = T) %>% 
#    count(child_pointer_type_major, pointer_inhh, pointer) %>% 
#    pivot_wider(names_from = c("pointer_inhh", "pointer"), values_from = n, names_prefix = "pointer_inhh: ", names_sep = " ") %>% 
#    select(1,2,4,3,5)

step2: make pedigree

genetic predigree

key: child_id, mother_id, father_id

options("tidylog.display" = NULL)

genetic_pedigree <- child_parent_base %>% distinct(child_id, pointer) %>% 
   left_join(children_all_parents %>%  
                filter(child_pointer_type_major == "genetic",
                       child_pointer_rel == "child") %>% 
                tidylog::distinct(syear, child_id, pointer, .keep_all = T), by = c("child_id", "pointer"))

genetic_pedigree_wide <- genetic_pedigree %>%
   distinct(child_id, pointer, pointer_id) %>% 
   pivot_wider(names_from = pointer, values_from = pointer_id) %>% 
   # tidyr::unnest() %>%
         rename(child_id = child_id,
                   mother_id = mother,
                   father_id = father)

options("tidylog.display" = list())
export(genetic_pedigree, file = here::here("data/genetic_pedigree_v36.rds"))
export(genetic_pedigree_wide, file = here::here("data/genetic_pedigree_wide_v36.rds"))
genetic_pedigree_wide %>% skim()
Data summary
Name Piped data
Number of rows 2541
Number of columns 3
_______________________
Column type frequency:
numeric 3
________________________
Group variables

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
child_id 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
mother_id 2074 0.18 22173870 16723266 1233702 8764301 9608802 40871502 41723902 ▇▂▁▁▇
father_id 2158 0.15 21405128 16671574 1161603 8700202 9561301 40623301 41723901 ▇▃▁▁▇

genetic pedigree iGENE only

now we still need to subset the pedigree in a way that only iGENE members are chidlren as well as parents

genetic_pedigree_justigene <- genetic_pedigree %>%
   semi_join(pid_igene_sample, by = c("pointer_id" = "pid")) 

genetic_pedigree_wide_justigene <- genetic_pedigree_justigene %>%
   distinct(child_id, pointer, pointer_id) %>% 
   pivot_wider(names_from = pointer, values_from = pointer_id) %>% 
   # tidyr::unnest() %>%
         rename(child_id = child_id,
                   mother_id = mother,
                   father_id = father)
export(genetic_pedigree_justigene, file = here::here("data/genetic_pedigree_justigene_v36.rds"))
export(genetic_pedigree_wide_justigene, file = here::here("data/genetic_pedigree_wide_justigene_v36.rds"))

social pedigree

social_pedigree <- child_parent_base %>% distinct(child_id, pointer) %>% 
   left_join(children_all_parents %>%  
                filter(child_pointer_type_major == "social",
                       child_pointer_rel == "child") %>% 
                distinct(syear, child_id, pointer, .keep_all = T), by = c("child_id", "pointer"))

social_pedigree_wide <- social_pedigree %>% 
   distinct(child_id, pointer, pointer_id) %>% 
   pivot_wider(names_from = pointer, values_from = pointer_id) %>% 
   # tidyr::unnest() %>% 
         rename(child_id = child_id,
                   social_mother_id = mother,
                   social_father_id = father)
export(social_pedigree, file = here::here("data/social_pedigree_v36.rds"))
export(social_pedigree_wide, file = here::here("data/social_pedigree_wide_v36.rds"))
social_pedigree_wide %>% skim()
Data summary
Name Piped data
Number of rows 2541
Number of columns 3
_______________________
Column type frequency:
numeric 3
________________________
Group variables

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
child_id 0 1.00 25434918 16550687 1161602 8987701 40116302 41060101 41724002 ▅▂▁▁▇
social_mother_id 2528 0.01 18128833 15693671 1395202 8629501 9572803 40210501 40867303 ▇▃▁▁▅
social_father_id 2532 0.00 25312902 18841254 2098804 8588602 40291501 40831501 41677501 ▆▁▁▁▇

social pedigree iGENE only

social_pedigree_justigene <- social_pedigree %>% 
   semi_join(pid_igene_sample, by = c("pointer_id" = "pid"))

social_pedigree_wide_justigene <- social_pedigree_justigene %>% 
   distinct(child_id, pointer, pointer_id) %>% 
   pivot_wider(names_from = pointer, values_from = pointer_id) %>% 
   # tidyr::unnest() %>% 
         rename(child_id = child_id,
                   social_mother_id = mother,
                   social_father_id = father)
export(social_pedigree_justigene, file = here::here("data/social_pedigree_justigene_v36.rds"))
export(social_pedigree_wide_justigene, file = here::here("data/social_pedigree_wide_justigene_v36.rds"))

step3: describe pedigree

genetic pedigree

genetic_pedigree_wide %>% 
   mutate(mom_avail = !is.na(mother_id),
          dad_avail = !is.na(father_id),
          one_avail = ifelse(mom_avail | dad_avail, TRUE, FALSE)
          ) %>% 
   count(one_avail)
genetic_pedigree_wide %>% count("one parent available" = !(is.na(mother_id) & is.na(father_id)))

genetic pedigree iGENE only

How many child-parent triplets do we have?

genetic_pedigree_wide_justigene %>% 
   count("mother avail" = !(is.na(mother_id)), 
         "father avail" = !(is.na(father_id)))
genetic_pedigree_wide_justigene %>% count("one parent available" = !(is.na(mother_id) & is.na(father_id)))

social pedigree

social_pedigree_wide %>% count("one parent available" = !(is.na(social_mother_id) & is.na(social_father_id)))

social pedigree iGENE only

social_pedigree_wide_justigene %>% count("one parent available" = !(is.na(social_mother_id) & is.na(social_father_id)))

How many child-parent triplets do we have?

social_pedigree_wide_justigene %>% 
   count("mother avail" = !(is.na(social_mother_id)),
         "father avail" = !(is.na(social_father_id)))