Review: until now we:
Goal: Now, for each igene-member, we want to add all identified relations of the igene-members as potential children to parents.
Preparation: read data needed in the script
# Child-parent dictionaries: the parent pointers collected from the source
# data sets (joined to the children by `child_id` further down).
children_all_source_parents <- import(here::here("data/children_all_source_parents.rds"))

# iGENE members; used below with one `syear` per row (long format).
ppfad_igene_long <- import(here::here("data/ppfad_igene_long_v36.rds"))
# One row per iGENE member: keep the first row of every pid.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
ppfad_igene_wide <- ppfad_igene_long %>% distinct(pid, .keep_all = TRUE)
child_parent_base <- import(here::here("data/child_parent_base.rds"))
# Wrap the imported object in a one-column tibble so it can be used as the
# right-hand side of a join on `pid`.
pid_igene_sample <- import(here::here("data/pid_igene_sample.rds")) %>% tibble(pid = .)

# Children and their relation to the other household members
# (read below with the columns syear, pid, hid and match_id, among others).
igene_hhmember_relations <- import(here::here("data/igene_hhmember_relations.rds"))

# Metadata: used below as the lookup for hid and netto_l by pid and syear.
ppfad_long_cl <- import(here::here("data/ppfad_long_cl_v36.rds"))
goal: find out which relation children have to all identified parents
first, we join the information from the sources to the igene-members in the sample over time
parent datasets: ppfad_igene_long, children_all_source_parents keys: syear, child_id, pointer, pointer_id
# Attach every parent pointer from the source dictionaries to every survey
# year of the iGENE member (child).
# Note: the join key is `child_id` only, so each pointer is repeated for all
# survey years of the child. Social parents can change over time, so counts
# based on this table may distort the number of such relations.
child_source_parents <- ppfad_igene_long %>%
  select(syear, child_id = pid, child_hid = hid, child_cid = cid) %>%
  tidylog::left_join(
    children_all_source_parents %>%
      # dictionary rows without a pointer carry no parent information
      drop_na(pointer_id) %>%
      select(
        child_id, pointer_id, pointer,
        child_match_type_major,
        child_match_type_minor,
        pointer_source = source,
        child_match_rel
      ),
    by = c("child_id")
  ) %>%
  # This should not remove any rows; the tidylog message reports whether it
  # did. `TRUE` replaces the reassignable shorthand `T`.
  # add_count(syear, pid, pointer_id) %>%
  tidylog::distinct(syear, child_id, pointer_id, .keep_all = TRUE)
Now we use the information on the members of the iGENE members' households in every survey year to pick out those previously identified parent pointers who are not in the child's household in a given survey year.
parent datasets: child_source_parents, igene_hhmember_relations keys: syear, child_id, pointer, pointer_id
# Source-parent pointers that are NOT found among the child's household
# members in a given survey year.
# Row key: syear, child_id, pointer_id.
child_outhh_source_parents <- child_source_parents %>%
  # Important: rows without a pointer must go first. They would find no
  # partner in the anti_join below and would otherwise be kept as if they
  # were out-of-household parents.
  drop_na(pointer_id) %>%
  # Remove every child/pointer/year row whose pointer matches a person
  # living with the child in that year.
  tidylog::anti_join(
    igene_hhmember_relations,
    by = c("syear", "child_id" = "pid", "pointer_id" = "match_id")
  ) %>%
  # Look up the pointer's own household id for the same survey year.
  tidylog::left_join(
    ppfad_long_cl %>%
      select(syear, pointer_id = pid, pointer_hid = hid),
    by = c("syear", "pointer_id")
  )

# How many rows have no household id for the pointer?
count(child_outhh_source_parents, is.na(pointer_hid))
We cannot find hid's for n = 1318 parents. To find out why, we look up the last identified hid together with the survey year and the netto code of that year.
Now we try to find out whether the parents had an hid at any point during their time in the SOEP and what their position was.
parent datasets: child_outhh_source_parents, ppfad_long_cl
# One row per child/pointer pair and per survey year in which the pointer is
# found in ppfad_long_cl. Pointers that never appear there keep a single row
# with NA in `syear` and `pointer_hid`.
# NOTE(review): `child_hid` is taken from the first row of each child/pointer
# pair (distinct() keeps the first row) and is then compared with the
# pointer's hid in every year. If a child's hid changes over time this
# comparison uses a single, arbitrary child hid - confirm that this is
# intended.
child_outhh_source_parents_long <- child_outhh_source_parents %>%
  tidylog::distinct(child_id, pointer_id, .keep_all = TRUE) %>%
  select(-syear, -pointer_hid) %>%
  tidylog::left_join(
    ppfad_long_cl %>% select(syear, pointer_id = pid, pointer_hid = hid),
    by = c("pointer_id")
  )

child_outhh_source_parents_long %>% count(is.na(pointer_hid))

# Number of survey years in which the pointer shared the child's household id.
# A plain sum() returns NA as soon as one year of the pair has a missing hid,
# which discards the information of all other years. Count the observed years
# instead and return NA only when not a single year could be compared.
summary_years_inhh <- child_outhh_source_parents_long %>%
  mutate(pointer_inhh = child_hid == pointer_hid) %>%
  group_by(child_id, pointer_id) %>%
  summarize(
    years_inhh = if (all(is.na(pointer_inhh))) {
      NA_integer_
    } else {
      sum(pointer_inhh, na.rm = TRUE)
    }
  ) %>%
  ungroup()

# Last survey year in which the pointer shared the child's household id.
# Pairs that never shared one have no row here and get NA when joined later.
summary_lastyear_inhh <- child_outhh_source_parents_long %>%
  filter(child_hid == pointer_hid) %>%
  group_by(child_id, pointer_id) %>%
  summarize(pointer_last_inhh = max(syear)) %>%
  ungroup()

# Last survey year in which the pointer appears in ppfad_long_cl at all.
# Rows with a missing hid are deliberately not dropped (see the disabled
# drop_na), so a pointer without any record yields NA here.
summary_lastyear_anyhh <- child_outhh_source_parents_long %>%
  # drop_na(pointer_hid) %>%
  group_by(child_id, pointer_id) %>%
  summarize(pointer_last_anyhh = max(syear)) %>%
  ungroup()
check out parents who were in the same hh for some time
# Enrich the out-of-household pointers with the per-pair summaries and with
# the netto code / hid of the pointer in the relevant "last" years.
child_outhh_source_parents_info <- child_outhh_source_parents %>%
  # number of years the pointer shared the child's household id
  tidylog::left_join(summary_years_inhh, by = c("child_id", "pointer_id")) %>%
  # last year the pointer shared the child's household id
  tidylog::left_join(summary_lastyear_inhh, by = c("child_id", "pointer_id")) %>%
  # last year the pointer appears in ppfad_long_cl at all
  tidylog::left_join(summary_lastyear_anyhh, by = c("child_id", "pointer_id")) %>%
  # netto code of the pointer in the last year in the child's household
  tidylog::left_join(
    ppfad_long_cl %>%
      select(syear, pointer_id = pid, pointer_netto_lastinhh = netto_l),
    by = c("pointer_last_inhh" = "syear", "pointer_id")
  ) %>%
  # hid and netto code of the pointer in the last year in any household
  tidylog::left_join(
    ppfad_long_cl %>%
      select(
        syear,
        pointer_id = pid,
        pointer_hid_lastanyhh = hid,
        pointer_netto_lastanyhh = netto_l
      ),
    by = c("pointer_last_anyhh" = "syear", "pointer_id")
  ) %>%
  # restore the row key in case one of the lookups duplicated rows;
  # `TRUE` replaces the reassignable shorthand `T`
  tidylog::distinct(syear, child_id, pointer_id, .keep_all = TRUE) %>%
  mutate(
    # Is the pointer in the child's household in this survey year?
    # A missing pointer hid counts as "not in the household"; NA remains only
    # when the pointer has an hid but the child's hid is missing.
    pointer_inhh = case_when(
      child_hid == pointer_hid ~ TRUE,
      child_hid != pointer_hid & !is.na(pointer_hid) ~ FALSE,
      is.na(pointer_hid) ~ FALSE,
      TRUE ~ NA
    ),
    # Did the pointer ever share the child's household id?
    # Stays NA when years_inhh is NA.
    ever_inhh = case_when(
      years_inhh == 0 ~ FALSE,
      years_inhh > 0 ~ TRUE,
      TRUE ~ NA
    )
  )
# child_outhh_source_parents_info %>% View()
child_outhh_source_parents_info %>% skim(years_inhh)
Name | Piped data |
Number of rows | 1483 |
Number of columns | 19 |
_______________________ | |
Column type frequency: | |
numeric | 1 |
________________________ | |
Group variables |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
years_inhh | 1107 | 0.25 | 0.63 | 1.9 | 0 | 0 | 0 | 0 | 10 | ▇▁▁▁▁ |
# Descriptive tables, each based on one row per child/pointer pair.
# `TRUE`/`FALSE` are spelled out: `T` and `F` are ordinary variables that can
# be reassigned.

# ever in the child's household vs. in the household in the retained year
child_outhh_source_parents_info %>%
  tidylog::distinct(child_id, pointer_id, .keep_all = TRUE) %>%
  tabyl(ever_inhh, pointer_inhh) %>%
  adorn_title()

# netto code in the last shared-household year, by mother/father
child_outhh_source_parents_info %>%
  distinct(child_id, pointer_id, .keep_all = TRUE) %>%
  tabyl(pointer_netto_lastinhh, pointer, show_missing_levels = FALSE) %>%
  adorn_title()

# relation subtype by mother/father, one table per value of pointer_inhh
child_outhh_source_parents_info %>%
  distinct(child_id, pointer_id, .keep_all = TRUE) %>%
  tabyl(child_match_type_minor, pointer, pointer_inhh)
## $`FALSE`
## child_match_type_minor father mother
## genetic_known 113 124
## genetic_label 0 2
## social_known 5 3
## unknown 2 0
# Which source data set did the out-of-household pointers come from?
# One row per child/pointer pair; `TRUE` replaces the reassignable `T`.
child_outhh_source_parents %>%
  distinct(child_id, pointer_id, .keep_all = TRUE) %>%
  tabyl(pointer_source, pointer)

# Column overview of the out-of-household table.
child_outhh_source_parents %>% names()
## [1] "syear" "child_id" "child_hid" "child_cid" "pointer_id" "pointer"
## [7] "child_match_type_major" "child_match_type_minor" "pointer_source" "child_match_rel" "pointer_hid"
Now we turn to the source parent pointers that match one of the persons inside the child's household in a given year.
# Source-parent pointers that live in the child's household, plus the
# household members of children who have no source pointer at all.
child_inhh_source_parents <- child_source_parents %>%
  # keep only what was NOT classified as out-of-household above
  tidylog::anti_join(
    child_outhh_source_parents,
    by = c("syear", "child_id", "pointer_id")
  ) %>%
  select(
    syear, pointer, child_id, child_hid, pointer_id,
    child_match_type_major, child_match_type_minor, pointer_source
  ) %>%
  # Attach all household members of the child in that year. Columns that
  # exist on both sides get the suffix ".source" (dictionary) or ".hh"
  # (household composition).
  tidylog::left_join(
    igene_hhmember_relations %>%
      select(
        syear,
        child_id = pid,
        child_hid = hid,
        match_id,
        child_match_rel = igene_match_rel,
        match_gender,
        child_match_type_minor = igene_match_type_minor,
        child_match_type_major = igene_match_type_major,
        match_hh_position5,
        child_hh_position5 = igene_hh_position5,
        child_stell_l = igene_stell_l,
        match_stell_l
      ),
    by = c("syear", "child_id", "child_hid"),
    suffix = c(".source", ".hh")
  ) %>%
  # pointer_id / match_id are intentionally not join keys: children without
  # any parent pointer keep all their household members, so that further
  # parents can be found via the household composition. For children with a
  # pointer, only the household member that IS the pointer is kept.
  tidylog::filter(is.na(pointer_id) | pointer_id == match_id) %>%
  mutate(pointer_inhh = TRUE)
filter those where pointer_id is missing
# Rows without a source pointer, counted per survey year.
count(filter(child_inhh_source_parents, is.na(pointer_id)), syear)

# Disabled checks kept for reference:
#child_inhh_source_parents %>% filter(pointer_id == match_id | is.na(pointer_id)) %>% View()
# # child_inhh_source_parents %>% count(syear, child_id, pointer_id, test = pointer_id == match_id) %>% pivot_wider(names_from = test, values_from = n)
# # count(pointer, child_match_type_minor.source) %>% pivot_wider(names_from = "pointer", values_from = n)
#
# child_inhh_source_parents %>%
# group_by(syear, child_id, pointer_id) %>%
# summarize(n_pointer_match = sum(match_id == pointer_id)) %>% tabyl(n_pointer_match)
do the matched parent pointers from the source-datasets and the match_id’s from the hh-members agree on the child_match_type?
# Cross-tabulate the relation subtype from the dictionaries against the major
# type from the household composition, separately for mothers and fathers.
# Rows whose household-based type is flagged as missing are left out.
# `TRUE` replaces the reassignable shorthand `T`.
child_inhh_source_parents %>%
  drop_na(pointer_id) %>%
  filter(!str_detect(child_match_type_major.hh, "missing")) %>%
  distinct(child_id, match_id, .keep_all = TRUE) %>%
  tabyl(child_match_type_minor.source, child_match_type_major.hh, pointer) %>%
  adorn_totals() %>%
  adorn_title()
## $father
## child_match_type_major.hh
## child_match_type_minor.source genetic social
## genetic_known 358 12
## genetic_label 1 0
## social_known 0 8
## unknown 11 0
## Total 370 20
##
## $mother
## child_match_type_major.hh
## child_match_type_minor.source genetic social
## genetic_known 432 7
## genetic_label 2 0
## social_known 1 4
## unknown 9 0
## Total 444 11
# Same comparison as above, pooled over mothers and fathers and with
# descriptive axis titles. `TRUE` replaces the reassignable shorthand `T`.
child_inhh_source_parents %>%
  drop_na(pointer_id) %>%
  filter(!str_detect(child_match_type_major.hh, "missing")) %>%
  distinct(child_id, match_id, .keep_all = TRUE) %>%
  tabyl(child_match_type_minor.source, child_match_type_major.hh) %>%
  adorn_title(
    row = "Relation based on dictionary",
    col = "Relation based on stell variable"
  )
Let's see for which iGENE members new possible parents can be identified.
# Household members of children without a dictionary-based relation type:
# which relations do they have to the child, and how often?
# `TRUE` replaces the reassignable shorthand `T`.
child_inhh_source_parents %>%
  filter(is.na(child_match_type_major.source)) %>%
  tidylog::distinct(child_id, match_id, .keep_all = TRUE) %>%
  count(child_match_rel, child_match_type_major.hh) %>%
  arrange(desc(n))
# Reconcile, row by row, the relation type from the source dictionaries
# (columns ending in ".source") with the relation type derived from the
# household composition (columns ending in ".hh"). Four columns are added:
#   source_pointer_hh_validation - text label describing the outcome of the comparison
#   merge_conflict               - the relation subtype that is kept after the comparison
#   merge_source                 - origin of the kept value: pointer_source or "hh_members"
#   merge_pointer                - "mother" / "father": pointer if present, else derived from match_gender
# Inside each case_when() the rules are evaluated top to bottom and the first
# matching rule wins, so the ORDER of the rules is part of the logic.
# The first three case_when() calls mirror each other section by section;
# a change to the conditions of one must be repeated in the other two.
# Rows that match no rule are labelled "unresolved combi".
# The result is bound together with the out-of-household pointers later on.
child_inhh_parents_validation <- child_inhh_source_parents %>%
mutate(source_pointer_hh_validation = case_when(
# conflicts
child_match_type_minor.source == "genetic_known" & child_match_type_minor.hh == "social_known" ~ paste0("conflict: ", pointer_source, " mistake possible"),
child_match_type_minor.source == "social_known" & child_match_type_minor.hh == "genetic_known" ~ "conflict: hh_member mistake possible",
child_match_type_minor.source == "social_known" & child_match_type_minor.hh == "genetic_stepchild" ~ "conflict: hh_member mistake possible",
# validating hh members via source
child_match_type_minor.source == "genetic_known" & child_match_type_minor.hh == "genetic_stepchild" ~ "confirm: stepchild is genetic",
child_match_type_minor.source == "genetic_label" & child_match_type_minor.hh == "genetic_stepchild" ~ "confirm: stepchild is genetic",
child_match_type_minor.source == "genetic_known" & child_match_type_minor.hh == "social_multigen" ~ "confirm: child is genetic",
child_match_type_minor.source == "unknown" & child_match_type_minor.hh == "social_multigen" ~ "relation stays unknown",
# NOTE(review): this literal is spelled with a space ("missing match_stell"),
# while the rule near the end of this case_when() tests "missing_match_stell"
# (underscore). One of the two spellings is presumably a typo - confirm
# against the values that actually occur in igene_hhmember_relations.
child_match_type_minor.hh == "missing match_stell" ~ "keep source pointer, missing in hh",
# validating found partners
child_match_type_minor.source == "genetic_known" & child_match_type_minor.hh == "genetic_partner" ~ "confirm: partner is genetic",
child_match_type_minor.source == "social_known" & child_match_type_minor.hh == "genetic_partner" ~ "update: partner is social",
# finding previously missing parents
child_match_type_minor.source == "unknown" & child_match_type_major.hh == "social" ~ "new: found social parent in hh",
child_match_type_minor.source == "unknown" & child_match_type_major.hh == "genetic" ~ "new: found genetic parent in hh",
# no source information at all: rely on the household-based subtype
is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_known" ~ "new: found genetic parent in hh",
is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_label" ~ "new: found genetic parent in hh",
is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_stepchild" ~ "new: found genetic parent in hh",
is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_multigen" ~ "new: found genetic parent in hh",
# NOTE(review): "social_multigen" is labelled as a genetic parent here - confirm this is intended.
is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "social_multigen" ~ "new: found genetic parent in hh",
is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "genetic_partner" ~ "new: found partner parent in hh",
is.na(child_match_type_minor.source) & child_match_rel == "child" & child_match_type_minor.hh == "social_known" ~ "new: found social parent in hh",
# household members whose relation to the child is something other than "child"
is.na(child_match_type_minor.source) & child_match_rel != "child" & !is.na(child_match_rel) ~ "new: found other pot. caretaker in hh",
# label and adoptive children
child_match_type_minor.source == "social_known" & child_match_type_minor.hh == "genetic_label" ~ "update: label seems adoptive child",
child_match_type_minor.source == "genetic_label" & child_match_type_minor.hh == "social_known" ~ "update: label seems adoptive child",
# agreements
child_match_type_major.source == "genetic" & child_match_type_major.hh == "genetic" ~ "confirm: child is genetic",
child_match_type_major.source == "social" & child_match_type_major.hh == "social" ~ "confirm: child is social",
# nothing known from the household side: keep what the source says
is.na(child_match_type_minor.hh) ~ "keep source pointer, missing in hh",
child_match_type_minor.hh == "missing_match_stell" ~ "keep source pointer, missing in hh",
# fall-through for combinations not covered above
TRUE ~ "unresolved combi"),
# Relation subtype kept after the comparison. Same conditions, in the same
# order, as in source_pointer_hh_validation above.
merge_conflict = case_when(
# conflicts
child_match_type_minor.source == "genetic_known" & child_match_type_minor.hh == "social_known" ~ "unknown",
child_match_type_minor.source == "social_known" & child_match_type_minor.hh == "genetic_known" ~ "unknown",
child_match_type_minor.source == "social_known" & child_match_type_minor.hh == "genetic_stepchild" ~ "unknown",
# validating hh members via source
child_match_type_minor.source == "genetic_known" & child_match_type_minor.hh == "genetic_stepchild" ~ child_match_type_minor.source,
child_match_type_minor.source == "genetic_label" & child_match_type_minor.hh == "genetic_stepchild" ~ child_match_type_minor.source,
child_match_type_minor.source == "genetic_known" & child_match_type_minor.hh == "social_multigen" ~ child_match_type_minor.source,
child_match_type_minor.source == "unknown" & child_match_type_minor.hh == "social_multigen" ~ child_match_type_minor.source,
# NOTE(review): spelled with a space here, with an underscore further down - see the note in the first case_when().
child_match_type_minor.hh == "missing match_stell" ~ child_match_type_minor.source,
# validating found partners
child_match_type_minor.source == "genetic_known" & child_match_type_minor.hh == "genetic_partner" ~ child_match_type_minor.source,
child_match_type_minor.source == "social_known" & child_match_type_minor.hh == "genetic_partner" ~ child_match_type_minor.source,
# finding previously missing parents
child_match_type_minor.source == "unknown" & child_match_type_major.hh == "social" ~ child_match_type_minor.hh,
child_match_type_minor.source == "unknown" & child_match_type_major.hh == "genetic" ~ child_match_type_minor.hh,
is.na(child_match_type_minor.source) & child_match_rel == "child" & !is.na(child_match_type_minor.hh) ~ child_match_type_minor.hh,
is.na(child_match_type_minor.source) & child_match_rel != "child" & !is.na(child_match_rel) ~ child_match_type_minor.hh,
# label and adoptive children
child_match_type_minor.source == "genetic_label" & child_match_type_minor.hh == "social_known" ~ child_match_type_minor.hh,
child_match_type_minor.source == "social_known" & child_match_type_minor.hh == "genetic_label" ~ child_match_type_minor.source,
# agreements
child_match_type_major.source == "genetic" & child_match_type_major.hh == "genetic" ~ child_match_type_minor.source,
child_match_type_major.source == "social" & child_match_type_major.hh == "social" ~ child_match_type_minor.source,
is.na(child_match_type_minor.hh) ~ child_match_type_minor.source,
child_match_type_minor.hh == "missing_match_stell" ~ child_match_type_minor.source,
TRUE ~ "unresolved combi"),
# Origin of the kept value: the source data set (pointer_source) or the
# household composition ("hh_members"). Same conditions and order again.
merge_source = case_when(
# conflicts
child_match_type_minor.source == "genetic_known" & child_match_type_minor.hh == "social_known" ~ "hh_members",
child_match_type_minor.source == "social_known" & child_match_type_minor.hh == "genetic_known" ~ pointer_source,
child_match_type_minor.source == "social_known" & child_match_type_minor.hh == "genetic_stepchild" ~ pointer_source,
# validating hh members via source
child_match_type_minor.source == "genetic_known" & child_match_type_minor.hh == "genetic_stepchild" ~ pointer_source,
child_match_type_minor.source == "genetic_label" & child_match_type_minor.hh == "genetic_stepchild" ~ pointer_source,
child_match_type_minor.source == "genetic_known" & child_match_type_minor.hh == "social_multigen" ~ pointer_source,
child_match_type_minor.source == "unknown" & child_match_type_minor.hh == "social_multigen" ~ pointer_source,
# NOTE(review): spelled with a space here, with an underscore further down - see the note in the first case_when().
child_match_type_minor.hh == "missing match_stell" ~ pointer_source,
# validating found partners
child_match_type_minor.source == "genetic_known" & child_match_type_minor.hh == "genetic_partner" ~ pointer_source,
child_match_type_minor.source == "social_known" & child_match_type_minor.hh == "genetic_partner" ~ pointer_source,
# finding previously missing parents
child_match_type_minor.source == "unknown" & child_match_type_major.hh == "social" ~ "hh_members",
child_match_type_minor.source == "unknown" & child_match_type_major.hh == "genetic" ~ "hh_members",
is.na(child_match_type_minor.source) & child_match_rel == "child" & !is.na(child_match_type_minor.hh) ~ "hh_members",
is.na(child_match_type_minor.source) & child_match_rel != "child" & !is.na(child_match_rel) ~ "hh_members",
# label and adoptive children
child_match_type_minor.source == "genetic_label" & child_match_type_minor.hh == "social_known" ~ "hh_members",
child_match_type_minor.source == "social_known" & child_match_type_minor.hh == "genetic_label" ~ pointer_source,
# agreements
child_match_type_major.source == "genetic" & child_match_type_major.hh == "genetic" ~ pointer_source,
child_match_type_major.source == "social" & child_match_type_major.hh == "social" ~ pointer_source,
is.na(child_match_type_minor.hh) ~ pointer_source,
child_match_type_minor.hh == "missing_match_stell" ~ pointer_source,
TRUE ~ "unresolved combi"),
# Parent role. Without a source pointer the role is derived from the gender of
# the household member; whenever a source pointer exists it is kept as is.
merge_pointer = case_when(is.na(pointer) & match_gender == "female" ~ "mother",
is.na(pointer) & match_gender == "male" ~ "father",
is.na(match_gender) & !is.na(pointer) ~ pointer,
# there are four cases where the pointer for a female match in the kidlong source was labelled father (see next chunk for explanation)
# The two rules below are disabled, so such cases keep the source pointer.
# match == "female" & pointer == "father" ~ match_gender,
# pointer != match & !is.na(pointer) & !is.na(match) ~ "gender problem",
!is.na(pointer) & !is.na(match_gender) ~ pointer,
is.na(pointer) & is.na(match_gender) ~ NA_character_,
TRUE ~ "unresolved combi"))
# Guard rails: no row may carry one of the fall-through labels. Each call
# stops the script if the number of offending rows is not zero.
testthat::expect_equal(
  nrow(filter(child_inhh_parents_validation, merge_conflict == "unresolved combi")),
  0
)
testthat::expect_equal(
  nrow(filter(child_inhh_parents_validation, merge_source == "unresolved combi")),
  0
)
testthat::expect_equal(
  nrow(filter(child_inhh_parents_validation, source_pointer_hh_validation == "unresolved combi")),
  0
)
testthat::expect_equal(
  nrow(filter(child_inhh_parents_validation, merge_pointer == "unresolved combi")),
  0
)
# The rule that assigns "gender problem" is currently commented out in the
# merge_pointer definition, so this check can only fail once it is re-enabled.
testthat::expect_equal(
  nrow(filter(child_inhh_parents_validation, merge_pointer == "gender problem")),
  0
)

# Disabled helpers for inspecting offending rows:
# child_inhh_parents_validation %>% filter(merge_conflict == "unresolved combi") %>% count(source_pointer_hh_validation, child_match_type_minor.source, child_match_type_minor.hh, source) %>% arrange(n)
# child_inhh_parents_validation %>% filter(merge_source == "unresolved combi") %>% tabyl(child_match_type_minor.source, child_match_type_minor.hh)
# child_inhh_parents_validation %>% filter(merge_pointer == "gender problem") %>% View()
# There are four cases where the pointer for a female match in the kidlong
# source was labelled father, even though the person is female. This is due to
# the coding of father_id = partner of mother and the assumption in this
# coding that partners of mothers must be male (aka fathers).
let’s see
What are the frequencies for reasons and the source from which we updated the child_match_type_minor?
# Frequency of each validation outcome by the source the kept value came
# from, one row per child/pointer pair. `TRUE` replaces the reassignable `T`.
child_inhh_parents_validation %>%
  distinct(child_id, pointer_id, .keep_all = TRUE) %>%
  count(source_pointer_hh_validation, merge_source) %>%
  arrange(desc(n))
# child_inhh_parents_validation %>% tabyl(source_pointer_hh_validation, merge_source, show_missing_levels = F) %>% adorn_title()

# Frequency of each validation outcome, with one column per parent role.
child_inhh_parents_validation %>%
  distinct(child_id, pointer_id, .keep_all = TRUE) %>%
  count(source_pointer_hh_validation, merge_pointer) %>%
  pivot_wider(names_from = merge_pointer, values_from = n)
What are the frequencies for reasons and the child_type_minor which we updated?
# Validation outcome and kept relation subtype for rows that have a source
# pointer, one column per parent role; only the first 15 rows are shown.
# `TRUE` replaces the reassignable shorthand `T`.
child_inhh_parents_validation %>%
  filter(!is.na(pointer_source)) %>%
  distinct(child_id, pointer_id, .keep_all = TRUE) %>%
  count(merge_pointer, source_pointer_hh_validation, merge_conflict) %>%
  pivot_wider(names_from = merge_pointer, values_from = n) %>%
  head(15)
# Turn the validation table into the final in-household pointer table:
# the merge_* columns replace the raw source / hh columns, the major relation
# type is re-derived from the kept subtype, and "match_" columns are renamed
# to "pointer_" columns.
child_inhh_parents_validated <- child_inhh_parents_validation %>%
  # drop the columns that the merge_* columns supersede
  select(
    -child_match_type_major.source,
    -child_match_type_minor.source,
    -child_match_type_minor.hh,
    -child_match_type_major.hh,
    -pointer_source,
    -pointer
  ) %>%
  # promote the merged values to the canonical column names
  select(
    pointer_source = merge_source,
    child_match_type_minor = merge_conflict,
    pointer = merge_pointer,
    everything()
  ) %>%
  mutate(
    # major type implied by the kept subtype; anything unexpected is flagged
    child_match_type_major = case_when(
      child_match_type_minor %in% c(
        "genetic_known", "unknown", "genetic_label", "genetic_stepchild",
        "genetic_partner", "genetic_multigen", "social_multigen"
      ) ~ "genetic",
      child_match_type_minor == "social_known" ~ "social",
      child_match_type_minor == "missing_match_stell" |
        is.na(child_match_type_minor) ~ NA_character_,
      TRUE ~ "needs specification"
    ),
    # in-household pointers share the child's household id
    pointer_hid = child_hid
  ) %>%
  # "match_" -> "pointer_" in every column name except match_id, which is
  # renamed explicitly in the select below
  rename_at(
    vars(contains("match_"), -match_id),
    list(~str_replace(., "match_", "pointer_"))
  ) %>%
  select(
    syear, child_id, child_hid, pointer_hid,
    pointer, pointer_source, source_pointer_hh_validation,
    child_stell_l, pointer_stell_l,
    pointer_hh_position5, child_hh_position5,
    pointer_inhh,
    pointer_id = match_id,
    child_pointer_rel,
    child_pointer_type_major,
    child_pointer_type_minor
  )

# Every kept subtype must map to a known major type.
testthat::expect_equal(
  nrow(filter(child_inhh_parents_validated, child_pointer_type_major == "needs specification")),
  0
)
# child_inhh_parents_validated %>% filter(child_match_type_major == "needs specification") %>% tabyl(child_match_type_minor)
# Relation and major type of the validated in-household pointers, with one
# column per parent role; one row per child/pointer pair.
# `TRUE` replaces the reassignable shorthand `T`.
child_inhh_parents_validated %>%
  distinct(child_id, pointer_id, .keep_all = TRUE) %>%
  filter(!is.na(pointer)) %>%
  count(pointer, child_pointer_rel, child_pointer_type_major) %>%
  pivot_wider(names_from = pointer, values_from = n)
Let's add the other, previously filtered-out cases.
# Stack the validated in-household pointers on top of the out-of-household
# pointers. Columns that exist in only one of the two tables are filled with
# NA for the rows of the other table.
children_all_parents <- bind_rows(
  # in-household pointers: keep the important variables only
  child_inhh_parents_validated %>%
    select(
      syear, child_id, child_hid,
      pointer, pointer_id, pointer_hid, pointer_source,
      source_pointer_hh_validation, pointer_inhh,
      child_pointer_rel, child_pointer_type_major, child_pointer_type_minor,
      pointer_hh_position5, child_hh_position5
    ),
  # out-of-household pointers: rename the "match" columns to the "pointer"
  # naming used for the in-household rows
  child_outhh_source_parents_info %>%
    select(
      syear, child_id, child_hid,
      pointer, pointer_id, pointer_hid, pointer_source,
      pointer_inhh,
      pointer_last_inhh, pointer_netto_lastinhh,
      pointer_last_anyhh, pointer_netto_lastanyhh, pointer_hid_lastanyhh,
      child_pointer_rel = child_match_rel,
      child_pointer_type_major = child_match_type_major,
      child_pointer_type_minor = child_match_type_minor
    )
) %>%
  arrange(syear, child_id, pointer, pointer_id)
# Validation outcome by relation subtype, one row per child/pointer pair.
# `TRUE` replaces the reassignable shorthand `T`.
children_all_parents %>%
  distinct(child_id, pointer_id, .keep_all = TRUE) %>%
  tabyl(source_pointer_hh_validation, child_pointer_type_minor)

# Save the combined child-parent table.
export(children_all_parents, file = here::here("data/children_all_parents_v36.rds"))

# children_all_parents %>%
# distinct(child_id, pointer_id, .keep_all = T) %>%
# tabyl(child_pointer_type_major, inhh, pointer) %>% adorn_title()

# An empty list of display functions switches the tidylog messages off.
options("tidylog.display" = list())

# Parents whose relation to the child is "child": counts by major type and
# household status, with one column per parent role.
children_all_parents %>%
  distinct(child_id, pointer_id, .keep_all = TRUE) %>%
  filter(child_pointer_rel == "child") %>%
  count(child_pointer_type_major, child_pointer_rel, pointer_inhh, pointer) %>%
  pivot_wider(names_from = pointer, values_from = n) %>%
  arrange(child_pointer_type_major, desc(pointer_inhh))

# children_all_parents %>%
# distinct(child_id, pointer_id, .keep_all = T) %>%
# count(child_pointer_type_major, pointer_inhh, pointer) %>%
# pivot_wider(names_from = c("pointer_inhh", "pointer"), values_from = n, names_prefix = "pointer_inhh: ", names_sep = " ") %>%
# select(1,2,4,3,5)
key: child_id, mother_id, father_id
# Switch the tidylog messages back on for the pedigree construction.
options("tidylog.display" = NULL)

# Genetic pedigree in long format: every child/pointer combination of the
# base table, with the genetic parents (relation "child") attached where one
# was identified. Combinations without a genetic parent keep NA.
genetic_pedigree <- child_parent_base %>%
  distinct(child_id, pointer) %>%
  left_join(
    children_all_parents %>%
      filter(
        child_pointer_type_major == "genetic",
        child_pointer_rel == "child"
      ) %>%
      # `TRUE` replaces the reassignable shorthand `T`
      tidylog::distinct(syear, child_id, pointer, .keep_all = TRUE),
    by = c("child_id", "pointer")
  )

# Wide format: one row per child with the columns mother_id and father_id.
# NOTE(review): if a child has more than one distinct pointer_id for the same
# parent role, pivot_wider() returns list-columns (with a warning) instead of
# plain id columns - check the output type after data updates.
genetic_pedigree_wide <- genetic_pedigree %>%
  distinct(child_id, pointer, pointer_id) %>%
  pivot_wider(names_from = pointer, values_from = pointer_id) %>%
  # tidyr::unnest() %>%
  # (the former `child_id = child_id` entry renamed a column to itself and
  # has been dropped)
  rename(
    mother_id = mother,
    father_id = father
  )

# Silence tidylog again.
options("tidylog.display" = list())

export(genetic_pedigree, file = here::here("data/genetic_pedigree_v36.rds"))
export(genetic_pedigree_wide, file = here::here("data/genetic_pedigree_wide_v36.rds"))

genetic_pedigree_wide %>% skim()
Name | Piped data |
Number of rows | 2541 |
Number of columns | 3 |
_______________________ | |
Column type frequency: | |
numeric | 3 |
________________________ | |
Group variables |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
child_id | 0 | 1.00 | 25434918 | 16550687 | 1161602 | 8987701 | 40116302 | 41060101 | 41724002 | ▅▂▁▁▇ |
mother_id | 2074 | 0.18 | 22173870 | 16723266 | 1233702 | 8764301 | 9608802 | 40871502 | 41723902 | ▇▂▁▁▇ |
father_id | 2158 | 0.15 | 21405128 | 16671574 | 1161603 | 8700202 | 9561301 | 40623301 | 41723901 | ▇▃▁▁▇ |
Now we still need to subset the pedigree so that only iGENE members appear as children as well as parents.
# Keep only pedigree rows whose parent (pointer_id) is part of the iGENE
# sample. Rows without an identified parent are dropped as well, because an
# NA pointer_id has no match in the sample.
genetic_pedigree_justigene <- semi_join(
  genetic_pedigree,
  pid_igene_sample,
  by = c("pointer_id" = "pid")
)

# Wide format of the restricted pedigree: one row per child with the columns
# mother_id and father_id.
genetic_pedigree_wide_justigene <- genetic_pedigree_justigene %>%
  distinct(child_id, pointer, pointer_id) %>%
  pivot_wider(names_from = pointer, values_from = pointer_id) %>%
  # tidyr::unnest() %>%
  rename(
    mother_id = mother,
    father_id = father
  )

export(genetic_pedigree_justigene, file = here::here("data/genetic_pedigree_justigene_v36.rds"))
export(genetic_pedigree_wide_justigene, file = here::here("data/genetic_pedigree_wide_justigene_v36.rds"))

# In the full pedigree: for how many children is at least one parent known?
genetic_pedigree_wide %>%
  mutate(
    mom_avail = !is.na(mother_id),
    dad_avail = !is.na(father_id),
    one_avail = mom_avail | dad_avail
  ) %>%
  count(one_avail)

# The same count as a one-liner with a descriptive column name.
count(
  genetic_pedigree_wide,
  "one parent available" = !is.na(mother_id) | !is.na(father_id)
)
How many child-parent triplets do we have?
# Availability of mother and father among the iGENE-only pedigree; the row
# with both flags TRUE is the number of complete child-parent triplets.
count(
  genetic_pedigree_wide_justigene,
  "mother avail" = !is.na(mother_id),
  "father avail" = !is.na(father_id)
)

# Children with at least one iGENE parent.
count(
  genetic_pedigree_wide_justigene,
  "one parent available" = !is.na(mother_id) | !is.na(father_id)
)
social pedigree
Variable type: numeric