Review: until now we:

have identified possible parents and their rel. to the child through various parental pointers, and harmonized them
have identified the relationship of iGene-members to each member in their households.

Goal: Now, for each igene-member, we want to add all identified relations of the igene-members as potential children to parents.

Preparation: read data needed in the script

# data of child-parent dictionaries
children_all_source_parents <- import(here::here("data/children_all_source_parents.rds"))

# data of iGENE Members
ppfad_igene_long <- import(here::here("data/ppfad_igene_long_v36.rds"))
ppfad_igene_wide <- ppfad_igene_long %>% distinct(pid, .keep_all = T)
child_parent_base <- import(here::here("data/child_parent_base.rds"))
pid_igene_sample <- import(here::here("data/pid_igene_sample.rds")) %>% tibble(pid = .)

# data of children and their relation to other hh members
igene_hhmember_relations <- import(here::here("data/igene_hhmember_relations.rds"))

# metadata
ppfad_long_cl <- import(here::here("data/ppfad_long_cl_v36.rds"))

step1: add parent pointers to children over time

goal: find out which relation children have to all identified parents

child_source_parents

first, we join the information from the sources to the igene-members in the sample over time

parent datasets: ppfad_igene_long, children_all_source_parents keys: syear, child_id, pointer, pointer_id

# note: social parents can change over time so the join might distort that n
child_source_parents <- ppfad_igene_long %>% 
      select(syear, child_id = pid, child_hid = hid, child_cid = cid) %>%
      tidylog::left_join(children_all_source_parents %>% 
                               drop_na(pointer_id) %>%
                               select(child_id, pointer_id, pointer, 
                                      child_match_type_major, 
                                      child_match_type_minor, 
                                      pointer_source = source, 
                                      child_match_rel), 
                      by = c("child_id")) %>% 
   # this should not remove any rows  
   # add_count(syear, pid, pointer_id) %>% 
   tidylog::distinct(syear, child_id, pointer_id, .keep_all = T)

child_outhh_source_parents

now we use the information on the members of the igene-members households at every survey year to subset those parent_pointers identified before who are not in the household at a given survey year.

parent datasets: child_source_parents, igene_hhmember_relations keys: syear, child_id, pointer, pointer_id

# keys: syear, pid, pointer_id
child_outhh_source_parents <- child_source_parents %>% 
      # this drop_na is important
      drop_na(pointer_id) %>% 
      # drops the rows where the pointer from the sources has a match with one of the persons that live with a child in a certain year
      tidylog::anti_join(igene_hhmember_relations, by = c("syear", "child_id" = "pid", "pointer_id" = "match_id")) %>% 
      # joins hid's  for the survey year in question
      tidylog::left_join(ppfad_long_cl %>% select(syear, pointer_id = pid, pointer_hid = hid), 
                by = c("syear", "pointer_id"))

child_outhh_source_parents %>% count(is.na(pointer_hid))

we cannot find hid’s for n = 1318 parents. to find out why we will look up the last identified hid and the survey year and netto of that year

now we try to locate if the parents had an hid at any point in their time in the soep and what their position was

parent datasets: child_source_parents,

child_outhh_source_parents_long <- child_outhh_source_parents %>% 
      tidylog::distinct(child_id, pointer_id, .keep_all = T) %>% 
      select(-syear, -pointer_hid) %>% 
      tidylog::left_join(ppfad_long_cl %>% select(syear, pointer_id = pid, pointer_hid = hid), 
                by = c("pointer_id"))

child_outhh_source_parents_long %>% count(is.na(pointer_hid))

summary_years_inhh <- child_outhh_source_parents_long %>% 
      mutate(pointer_inhh = ifelse(child_hid == pointer_hid, TRUE, FALSE)) %>% 
      group_by(child_id, pointer_id) %>%
      summarize(years_inhh = sum(pointer_inhh)) %>% 
      ungroup()

summary_lastyear_inhh <- child_outhh_source_parents_long %>%
      filter(child_hid == pointer_hid) %>% 
      group_by(child_id, pointer_id) %>%
      summarize(pointer_last_inhh = max(syear)) %>% 
      ungroup()

summary_lastyear_anyhh <- child_outhh_source_parents_long %>%
      # drop_na(pointer_hid) %>% 
      group_by(child_id, pointer_id) %>%
      summarize(pointer_last_anyhh = max(syear)) %>% 
      ungroup()

check out parents who were in the same hh for some time

child_outhh_source_parents_info <- child_outhh_source_parents %>%  
      # add the information about years in hh
      tidylog::left_join(summary_years_inhh, by = c("child_id", "pointer_id")) %>% 
      # add the information about the last year in hh
      tidylog::left_join(summary_lastyear_inhh, by = c("child_id", "pointer_id")) %>%
      # add information about the last year in any hh
      tidylog::left_join(summary_lastyear_anyhh, by = c("child_id", "pointer_id")) %>% 
      tidylog::left_join(ppfad_long_cl %>% select(syear, pointer_id = pid, pointer_netto_lastinhh = netto_l), 
                by = c("pointer_last_inhh" = "syear", "pointer_id")) %>% 
      tidylog::left_join(ppfad_long_cl %>% select(syear, pointer_id = pid, pointer_hid_lastanyhh = hid, pointer_netto_lastanyhh = netto_l), 
                by = c("pointer_last_anyhh" = "syear", "pointer_id")) %>% 
      tidylog::distinct(syear, child_id, pointer_id, .keep_all = T) %>% 
      mutate(pointer_inhh = case_when(child_hid == pointer_hid ~ TRUE, 
                              child_hid != pointer_hid & !is.na(pointer_hid) ~ FALSE,
                              is.na(pointer_hid) ~ FALSE,
                              TRUE ~ NA) ,
             ever_inhh = case_when(years_inhh == 0 ~ FALSE,
                                   years_inhh > 0 ~ TRUE,
                                   TRUE ~ NA))

# child_outhh_source_parents_info %>% View()

child_outhh_source_parents_info %>% skim(years_inhh)

Data summary
Name	Piped data
Number of rows	1483
Number of columns	19
_______________________
Column type frequency:
numeric	1
________________________
Group variables

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
years_inhh	1107	0.25	0.63	1.9	0	0	0	0	10	▇▁▁▁▁

child_outhh_source_parents_info %>% 
      tidylog::distinct(child_id, pointer_id, .keep_all = T) %>% 
      tabyl(ever_inhh, pointer_inhh) %>% adorn_title()

child_outhh_source_parents_info %>% 
      distinct(child_id, pointer_id, .keep_all = T) %>% 
      tabyl(pointer_netto_lastinhh, pointer, show_missing_levels = F) %>% adorn_title()

child_outhh_source_parents_info %>% 
      distinct(child_id, pointer_id, .keep_all = T) %>%  
      tabyl(child_match_type_minor, pointer, pointer_inhh)

## $`FALSE`
##  child_match_type_minor father mother
##           genetic_known    113    124
##           genetic_label      0      2
##            social_known      5      3
##                 unknown      2      0

child_outhh_source_parents %>% 
      distinct(child_id, pointer_id, .keep_all = T) %>%
      tabyl(pointer_source, pointer)

child_outhh_source_parents %>% names

##  [1] "syear"                  "child_id"               "child_hid"              "child_cid"              "pointer_id"             "pointer"               
##  [7] "child_match_type_major" "child_match_type_minor" "pointer_source"         "child_match_rel"        "pointer_hid"

child_inhh_source_parents

now we turn to the source_parent_pointers who match with one person inside the childs household for a given year

child_inhh_source_parents <- child_source_parents %>% 
   # subset filter out the parent_pointers who are not in hh
   tidylog::anti_join(child_outhh_source_parents, by =  c("syear", "child_id", "pointer_id")) %>% 
   select(syear, pointer, child_id, child_hid, pointer_id, 
          child_match_type_major, child_match_type_minor, pointer_source) %>% 
   # join hh_member_id's
   tidylog::left_join(igene_hhmember_relations %>% select(syear, 
                                                          child_id = pid, 
                                                          child_hid = hid, 
                                                          match_id, 
                                                          child_match_rel = igene_match_rel, 
                                                          match_gender, 
                                                          child_match_type_minor = igene_match_type_minor, 
                                                          child_match_type_major = igene_match_type_major, 
                                                          match_hh_position5,
                                                          child_hh_position5 = igene_hh_position5, 
                                                          child_stell_l = igene_stell_l, 
                                                          match_stell_l), 
                      by = c("syear", "child_id", "child_hid"), suffix = c(".source", ".hh")) %>% 
   # the reason that pointer_id and match  id are not included in the join is that we want to add the child_hh members for  those  children who do not have  a parent pointer yet in order to find more parents via the hh-composition
   tidylog::filter(pointer_id == match_id | is.na(pointer_id)) %>% 
   mutate(pointer_inhh = TRUE)

filter those where pointer_id is missing

child_inhh_source_parents %>% filter(is.na(pointer_id)) %>% count(syear)

#child_inhh_source_parents %>% filter(pointer_id == match_id | is.na(pointer_id)) %>% View()

# # child_inhh_source_parents %>% count(syear, child_id, pointer_id,  test = pointer_id == match_id) %>% pivot_wider(names_from = test, values_from = n)
# #    count(pointer, child_match_type_minor.source) %>% pivot_wider(names_from = "pointer", values_from = n)
#    
# child_inhh_source_parents %>% 
#    group_by(syear, child_id, pointer_id) %>% 
#    summarize(n_pointer_match = sum(match_id == pointer_id)) %>% tabyl(n_pointer_match)

do the matched parent pointers from the source-datasets and the match_id’s from the hh-members agree on the child_match_type?

child_inhh_source_parents %>% 
   drop_na(pointer_id) %>% 
   filter(!str_detect(child_match_type_major.hh, "missing")) %>% 
   distinct(child_id, match_id, .keep_all = T) %>% 
   tabyl(child_match_type_minor.source, child_match_type_major.hh, pointer) %>%
   adorn_totals %>% adorn_title()

## $father
##                                child_match_type_major.hh       
##  child_match_type_minor.source                   genetic social
##                  genetic_known                       358     12
##                  genetic_label                         1      0
##                   social_known                         0      8
##                        unknown                        11      0
##                          Total                       370     20
## 
## $mother
##                                child_match_type_major.hh       
##  child_match_type_minor.source                   genetic social
##                  genetic_known                       432      7
##                  genetic_label                         2      0
##                   social_known                         1      4
##                        unknown                         9      0
##                          Total                       444     11

child_inhh_source_parents %>% 
   drop_na(pointer_id) %>% 
   filter(!str_detect(child_match_type_major.hh, "missing")) %>% 
   distinct(child_id, match_id, .keep_all = T) %>% 
   tabyl(child_match_type_minor.source, child_match_type_major.hh) %>% 
   adorn_title(row = "Relation based on dictionary",
               col = "Relation based on stell variable")

there are some conflicts, but in most cases the classificaitions from both datasets agree

lets see for which igene-members there are new possible parents to be identified

child_inhh_source_parents %>% 
   filter(is.na(child_match_type_major.source)) %>%
   tidylog::distinct(child_id, match_id, .keep_all = T) %>% 
   count(child_match_rel, child_match_type_major.hh) %>% arrange(desc(n))

cross_validation

now we will use both child_match definitions to update unknown cases and resolve conflicts

# we bind those together later
# this filter filters out all child_pointer pairs where the pointer is not in the household, as well as child_hhmatches that are not in the parents from the sources
child_inhh_parents_validation <- child_inhh_source_parents %>% 
      mutate(source_pointer_hh_validation = case_when(
                  # conflicts
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "social_known" ~ paste0("conflict: ", pointer_source, " mistake possible"),
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_known" ~ "conflict: hh_member mistake possible",
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_stepchild" ~ "conflict: hh_member mistake possible",

                  # validating hh members via source
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "genetic_stepchild" ~ "confirm: stepchild is genetic",
                  child_match_type_minor.source  == "genetic_label" & child_match_type_minor.hh == "genetic_stepchild" ~ "confirm: stepchild is genetic",
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "social_multigen" ~ "confirm: child is genetic",
                  child_match_type_minor.source  == "unknown" & child_match_type_minor.hh == "social_multigen" ~ "relation stays unknown",
                  child_match_type_minor.hh == "missing match_stell" ~ "keep source pointer, missing in hh",
                  
                  # validating found partners
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "genetic_partner" ~ "confirm: partner is genetic",
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_partner" ~ "update: partner is social",
                  
                  # finding previously missing parents
                  child_match_type_minor.source  == "unknown" & child_match_type_major.hh == "social" ~ "new: found social parent in hh",
                  child_match_type_minor.source  == "unknown" & child_match_type_major.hh == "genetic" ~ "new: found genetic parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_known" ~ "new: found genetic parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_label" ~ "new: found genetic parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_stepchild" ~ "new: found genetic parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_multigen" ~ "new: found genetic parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "social_multigen" ~ "new: found genetic parent in hh", 
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_partner" ~ "new: found partner parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "social_known" ~ "new: found social parent in hh",
                  is.na(child_match_type_minor.source) & child_match_rel != "child" & !is.na(child_match_rel) ~ "new: found other pot. caretaker in hh",
                  
                  # label and adoptive children
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_label" ~ "update: label seems adoptive child",
                  child_match_type_minor.source  == "genetic_label" & child_match_type_minor.hh == "social_known" ~ "update: label seems adoptive child",
                  
                  # agreements
                  child_match_type_major.source  == "genetic" & child_match_type_major.hh == "genetic" ~ "confirm: child is genetic",
                  child_match_type_major.source  == "social" & child_match_type_major.hh == "social" ~ "confirm: child is social",
                  is.na(child_match_type_minor.hh) ~ "keep source pointer, missing in hh",
                  child_match_type_minor.hh == "missing_match_stell" ~ "keep source pointer, missing in hh",
                  TRUE ~ "unresolved combi"),
             merge_conflict = case_when(
                  # conflicts
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "social_known" ~ "unknown",
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_known" ~ "unknown",
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_stepchild" ~ "unknown",

                  # validating hh members via source
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "genetic_stepchild" ~ child_match_type_minor.source,
                  child_match_type_minor.source  == "genetic_label" & child_match_type_minor.hh == "genetic_stepchild" ~ child_match_type_minor.source,
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "social_multigen" ~ child_match_type_minor.source,
                  child_match_type_minor.source  == "unknown" & child_match_type_minor.hh == "social_multigen" ~ child_match_type_minor.source,
                  child_match_type_minor.hh == "missing match_stell" ~ child_match_type_minor.source,

                  # validating found partners
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "genetic_partner" ~ child_match_type_minor.source,
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_partner" ~ child_match_type_minor.source,
                  
                  # finding previously missing parents
                  child_match_type_minor.source  == "unknown" & child_match_type_major.hh == "social" ~ child_match_type_minor.hh,
                  child_match_type_minor.source  == "unknown" & child_match_type_major.hh == "genetic" ~ child_match_type_minor.hh,
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & !is.na(child_match_type_minor.hh) ~ child_match_type_minor.hh,
                  is.na(child_match_type_minor.source) & child_match_rel != "child" & !is.na(child_match_rel) ~ child_match_type_minor.hh,
                  
                  # label and adoptive children
                  child_match_type_minor.source  == "genetic_label" & child_match_type_minor.hh == "social_known" ~ child_match_type_minor.hh,
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_label" ~ child_match_type_minor.source,
                  
                  # agreements
                  child_match_type_major.source  == "genetic" & child_match_type_major.hh == "genetic" ~ child_match_type_minor.source,
                  child_match_type_major.source  == "social" & child_match_type_major.hh == "social" ~ child_match_type_minor.source,
                  is.na(child_match_type_minor.hh) ~ child_match_type_minor.source,
                  child_match_type_minor.hh == "missing_match_stell" ~ child_match_type_minor.source,
                  TRUE ~ "unresolved combi"),
             merge_source = case_when(
                  # conflicts
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "social_known" ~ "hh_members",
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_known" ~ pointer_source,
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_stepchild" ~ pointer_source,

                  # validating hh members via source
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "genetic_stepchild" ~ pointer_source,
                  child_match_type_minor.source  == "genetic_label" & child_match_type_minor.hh == "genetic_stepchild" ~ pointer_source,
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "social_multigen" ~ pointer_source,
                  child_match_type_minor.source  == "unknown" & child_match_type_minor.hh == "social_multigen" ~ pointer_source,
                  child_match_type_minor.hh == "missing match_stell" ~ pointer_source,
                  
                  # validating found partners
                  child_match_type_minor.source  == "genetic_known" & child_match_type_minor.hh == "genetic_partner" ~ pointer_source,
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_partner" ~ pointer_source,
                  
                  # finding previously missing parents
                  child_match_type_minor.source  == "unknown" & child_match_type_major.hh == "social" ~ "hh_members",
                  child_match_type_minor.source  == "unknown" & child_match_type_major.hh == "genetic" ~ "hh_members",
                  is.na(child_match_type_minor.source) & child_match_rel == "child" & !is.na(child_match_type_minor.hh) ~ "hh_members",
                  is.na(child_match_type_minor.source) & child_match_rel != "child" & !is.na(child_match_rel) ~ "hh_members",
                  
                  # label and adoptive children
                  child_match_type_minor.source  == "genetic_label" & child_match_type_minor.hh == "social_known" ~ "hh_members",
                  child_match_type_minor.source  == "social_known" & child_match_type_minor.hh == "genetic_label" ~ pointer_source,
                  
                  # agreements
                  child_match_type_major.source  == "genetic" & child_match_type_major.hh == "genetic" ~ pointer_source,
                  child_match_type_major.source  == "social" & child_match_type_major.hh == "social" ~ pointer_source,
                  
                  is.na(child_match_type_minor.hh) ~ pointer_source,
                  child_match_type_minor.hh == "missing_match_stell" ~ pointer_source,
                  TRUE ~ "unresolved combi"),
             merge_pointer = case_when(is.na(pointer) & match_gender == "female" ~ "mother",
                                       is.na(pointer) & match_gender == "male" ~ "father",
                                       is.na(match_gender) & !is.na(pointer) ~ pointer,
                                       # there are four cases where the pointer for a female match in the kidlong source was labelled father (see next chunk for explanation)
                                       # match == "female" & pointer == "father" ~ match_gender, 
                                       # pointer != match & !is.na(pointer) & !is.na(match) ~ "gender problem",
                                       !is.na(pointer) & !is.na(match_gender) ~ pointer,
                                       is.na(pointer) & is.na(match_gender) ~ NA_character_,
                                       TRUE ~ "unresolved combi"))

testthat::expect_equal(child_inhh_parents_validation %>% filter(merge_conflict == "unresolved combi") %>% nrow(), 0) 
testthat::expect_equal(child_inhh_parents_validation %>% filter(merge_source == "unresolved combi") %>% nrow(), 0) 
testthat::expect_equal(child_inhh_parents_validation %>% filter(source_pointer_hh_validation == "unresolved combi") %>% nrow(), 0) 
testthat::expect_equal(child_inhh_parents_validation %>% filter(merge_pointer == "unresolved combi") %>% nrow(), 0) 
testthat::expect_equal(child_inhh_parents_validation %>% filter(merge_pointer == "gender problem") %>% nrow(), 0)

# child_inhh_parents_validation %>% filter(merge_conflict == "unresolved combi") %>% count(source_pointer_hh_validation, child_match_type_minor.source, child_match_type_minor.hh, source) %>% arrange(n)

# child_inhh_parents_validation %>% filter(merge_source == "unresolved combi") %>% tabyl(child_match_type_minor.source, child_match_type_minor.hh)

# child_inhh_parents_validation %>% filter(merge_pointer == "gender problem") %>% View()
# there are four cases where the pointer for a female match in the kidlong source was labelled father, even though the person is female. this is due to the coding of father_id = partner of mother and the assumption in this coding, that partners of mothers must be male (aka fathers)

let’s see

What are the frequencies for reasons and the source from which we updated the child_match_type_minor?

child_inhh_parents_validation %>% 
      distinct(child_id, pointer_id, .keep_all = T) %>% 
      count(source_pointer_hh_validation, merge_source) %>% arrange(desc(n))

# child_inhh_parents_validation %>% tabyl(source_pointer_hh_validation, merge_source, show_missing_levels = F) %>% adorn_title()
child_inhh_parents_validation %>% 
   distinct(child_id, pointer_id, .keep_all = T) %>% 
   count(source_pointer_hh_validation, merge_pointer) %>% pivot_wider(names_from = merge_pointer, values_from = n)

What are the frequencies for reasons and the child_type_minor which we updated?

child_inhh_parents_validation %>% 
   filter(!is.na(pointer_source)) %>% 
   distinct(child_id, pointer_id, .keep_all = T) %>% 
   count(merge_pointer, source_pointer_hh_validation, merge_conflict) %>% 
   pivot_wider(names_from = merge_pointer, values_from = n) %>% head(15)

child_inhh_parents_validated

child_inhh_parents_validated <- child_inhh_parents_validation %>% 
      select(-child_match_type_major.source, -child_match_type_minor.source, -child_match_type_minor.hh, -child_match_type_major.hh, -pointer_source, -pointer) %>% 
      select(pointer_source = merge_source, child_match_type_minor = merge_conflict, pointer = merge_pointer, everything()) %>% 
      mutate(child_match_type_major = case_when(child_match_type_minor == "genetic_known" ~ "genetic",
                                                child_match_type_minor == "unknown" ~ "genetic",
                                                child_match_type_minor == "genetic_label" ~ "genetic",
                                                child_match_type_minor == "genetic_stepchild" ~ "genetic",
                                                child_match_type_minor == "genetic_partner" ~ "genetic",
                                                child_match_type_minor == "genetic_multigen" ~ "genetic",
                                                child_match_type_minor == "social_multigen" ~ "genetic",
                                                child_match_type_minor == "social_known" ~ "social",
                                                child_match_type_minor == "missing_match_stell" ~ NA_character_,
                                                is.na(child_match_type_minor) ~ NA_character_,
                                                TRUE ~ "needs specification"
                                                ),
             pointer_hid = child_hid) %>% 
      rename_at(vars(contains("match_"), -match_id), list(~str_replace(., "match_", "pointer_"))) %>% 
      select(syear, child_id, child_hid, pointer_hid,
             pointer, pointer_source, source_pointer_hh_validation, child_stell_l, pointer_stell_l, pointer_hh_position5, child_hh_position5,  pointer_inhh,
             pointer_id = match_id, 
             child_pointer_rel,
             child_pointer_type_major, 
             child_pointer_type_minor
             )

testthat::expect_equal(child_inhh_parents_validated %>% filter(child_pointer_type_major == "needs specification") %>% nrow(), 0)
# child_inhh_parents_validated %>% filter(child_match_type_major == "needs specification") %>% tabyl(child_match_type_minor)

child_inhh_parents_validated %>% 
   distinct(child_id, pointer_id, .keep_all = T) %>% 
   filter(!is.na(pointer)) %>% 
   count(pointer, child_pointer_rel, child_pointer_type_major) %>% pivot_wider(names_from = pointer, values_from = n)

lets add the other, previously filtered out cases

children_all_parents <- child_inhh_parents_validated %>% 
   # select important variables
   select(syear, child_id, child_hid, pointer, pointer_id, pointer_hid, pointer_source, source_pointer_hh_validation, pointer_inhh, child_pointer_rel, child_pointer_type_major, child_pointer_type_minor, pointer_hh_position5, child_hh_position5) %>% 
   bind_rows(child_outhh_source_parents_info %>% 
                select(syear, child_id, child_hid, pointer, pointer_id, pointer_hid, pointer_source,
                       pointer_inhh, pointer_last_inhh, pointer_netto_lastinhh, pointer_last_anyhh, pointer_netto_lastanyhh, pointer_hid_lastanyhh,
                       child_pointer_rel = child_match_rel, 
                       child_pointer_type_major = child_match_type_major, 
                       child_pointer_type_minor = child_match_type_minor)
                ) %>% 
   arrange(syear, child_id, pointer, pointer_id)

children_all_parents %>% 
   distinct(child_id, pointer_id, .keep_all = T) %>% 
   tabyl(source_pointer_hh_validation, child_pointer_type_minor)

export(children_all_parents, file = here::here("data/children_all_parents_v36.rds"))

# children_all_parents %>%
#    distinct(child_id, pointer_id, .keep_all = T) %>% 
#    tabyl(child_pointer_type_major, inhh, pointer) %>% adorn_title()

options("tidylog.display" = list())

children_all_parents %>%  
   distinct(child_id, pointer_id, .keep_all = T) %>%
   filter(child_pointer_rel == "child") %>% 
   count(child_pointer_type_major, child_pointer_rel, pointer_inhh, pointer) %>% pivot_wider(names_from = pointer, values_from = n) %>%
   arrange(child_pointer_type_major, desc(pointer_inhh))

# children_all_parents %>%
#    distinct(child_id, pointer_id, .keep_all = T) %>% 
#    count(child_pointer_type_major, pointer_inhh, pointer) %>% 
#    pivot_wider(names_from = c("pointer_inhh", "pointer"), values_from = n, names_prefix = "pointer_inhh: ", names_sep = " ") %>% 
#    select(1,2,4,3,5)

step2: make pedigree

genetic predigree

key: child_id, mother_id, father_id

options("tidylog.display" = NULL)

genetic_pedigree <- child_parent_base %>% distinct(child_id, pointer) %>% 
   left_join(children_all_parents %>%  
                filter(child_pointer_type_major == "genetic",
                       child_pointer_rel == "child") %>% 
                tidylog::distinct(syear, child_id, pointer, .keep_all = T), by = c("child_id", "pointer"))

genetic_pedigree_wide <- genetic_pedigree %>%
   distinct(child_id, pointer, pointer_id) %>% 
   pivot_wider(names_from = pointer, values_from = pointer_id) %>% 
   # tidyr::unnest() %>%
         rename(child_id = child_id,
                   mother_id = mother,
                   father_id = father)

options("tidylog.display" = list())

export(genetic_pedigree, file = here::here("data/genetic_pedigree_v36.rds"))
export(genetic_pedigree_wide, file = here::here("data/genetic_pedigree_wide_v36.rds"))

genetic_pedigree_wide %>% skim()

Data summary
Name	Piped data
Number of rows	2541
Number of columns	3
_______________________
Column type frequency:
numeric	3
________________________
Group variables

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
child_id	0	1.00	25434918	16550687	1161602	8987701	40116302	41060101	41724002	▅▂▁▁▇
mother_id	2074	0.18	22173870	16723266	1233702	8764301	9608802	40871502	41723902	▇▂▁▁▇
father_id	2158	0.15	21405128	16671574	1161603	8700202	9561301	40623301	41723901	▇▃▁▁▇

genetic pedigree iGENE only

now we still need to subset the pedigree in a way that only iGENE members are chidlren as well as parents

genetic_pedigree_justigene <- genetic_pedigree %>%
   semi_join(pid_igene_sample, by = c("pointer_id" = "pid")) 

genetic_pedigree_wide_justigene <- genetic_pedigree_justigene %>%
   distinct(child_id, pointer, pointer_id) %>% 
   pivot_wider(names_from = pointer, values_from = pointer_id) %>% 
   # tidyr::unnest() %>%
         rename(child_id = child_id,
                   mother_id = mother,
                   father_id = father)

export(genetic_pedigree_justigene, file = here::here("data/genetic_pedigree_justigene_v36.rds"))
export(genetic_pedigree_wide_justigene, file = here::here("data/genetic_pedigree_wide_justigene_v36.rds"))

social pedigree

social_pedigree <- child_parent_base %>% distinct(child_id, pointer) %>% 
   left_join(children_all_parents %>%  
                filter(child_pointer_type_major == "social",
                       child_pointer_rel == "child") %>% 
                distinct(syear, child_id, pointer, .keep_all = T), by = c("child_id", "pointer"))

social_pedigree_wide <- social_pedigree %>% 
   distinct(child_id, pointer, pointer_id) %>% 
   pivot_wider(names_from = pointer, values_from = pointer_id) %>% 
   # tidyr::unnest() %>% 
         rename(child_id = child_id,
                   social_mother_id = mother,
                   social_father_id = father)

export(social_pedigree, file = here::here("data/social_pedigree_v36.rds"))
export(social_pedigree_wide, file = here::here("data/social_pedigree_wide_v36.rds"))

social_pedigree_wide %>% skim()

Data summary
Name	Piped data
Number of rows	2541
Number of columns	3
_______________________
Column type frequency:
numeric	3
________________________
Group variables

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
child_id	0	1.00	25434918	16550687	1161602	8987701	40116302	41060101	41724002	▅▂▁▁▇
social_mother_id	2528	0.01	18128833	15693671	1395202	8629501	9572803	40210501	40867303	▇▃▁▁▅
social_father_id	2532	0.00	25312902	18841254	2098804	8588602	40291501	40831501	41677501	▆▁▁▁▇

social pedigree iGENE only

social_pedigree_justigene <- social_pedigree %>% 
   semi_join(pid_igene_sample, by = c("pointer_id" = "pid"))

social_pedigree_wide_justigene <- social_pedigree_justigene %>% 
   distinct(child_id, pointer, pointer_id) %>% 
   pivot_wider(names_from = pointer, values_from = pointer_id) %>% 
   # tidyr::unnest() %>% 
         rename(child_id = child_id,
                   social_mother_id = mother,
                   social_father_id = father)

export(social_pedigree_justigene, file = here::here("data/social_pedigree_justigene_v36.rds"))
export(social_pedigree_wide_justigene, file = here::here("data/social_pedigree_wide_justigene_v36.rds"))

step3: describe pedigree

genetic pedigree

genetic_pedigree_wide %>% 
   mutate(mom_avail = !is.na(mother_id),
          dad_avail = !is.na(father_id),
          one_avail = ifelse(mom_avail | dad_avail, TRUE, FALSE)
          ) %>% 
   count(one_avail)

genetic_pedigree_wide %>% count("one parent available" = !(is.na(mother_id) & is.na(father_id)))

genetic pedigree iGENE only

How many child-parent triplets do we have?

genetic_pedigree_wide_justigene %>% 
   count("mother avail" = !(is.na(mother_id)), 
         "father avail" = !(is.na(father_id)))

genetic_pedigree_wide_justigene %>% count("one parent available" = !(is.na(mother_id) & is.na(father_id)))

social pedigree

social_pedigree_wide %>% count("one parent available" = !(is.na(social_mother_id) & is.na(social_father_id)))

social pedigree iGENE only

social_pedigree_wide_justigene %>% count("one parent available" = !(is.na(social_mother_id) & is.na(social_father_id)))

How many child-parent triplets do we have?

social_pedigree_wide_justigene %>% 
   count("mother avail" = !(is.na(social_mother_id)),
         "father avail" = !(is.na(social_father_id)))

Data wrangling: Match Relations between iGENE-Members and Parental Pointers

step1: add parent pointers to children over time

child_source_parents

child_outhh_source_parents

child_inhh_source_parents

cross_validation

child_inhh_parents_validated

step2: make pedigree

genetic predigree

genetic pedigree iGENE only

social pedigree

social pedigree iGENE only

step3: describe pedigree

genetic pedigree

genetic pedigree iGENE only

social pedigree

social pedigree iGENE only