Review: Beforehand (insert link) we have identified all possible parental pointers for the igene-members in our sample.
Goal: Now we want to identify the relations that igene-members have with each member in their households.
preparation: read data needed in the script
# Read the inputs used throughout this script.
# iGENE sample: long person file plus the sample member ids
# (the ids are wrapped into a one-column tibble named `pid`)
ppfad_igene_long <- import(here::here("data/ppfad_igene_long_v36.rds"))
pid_igene_sample <- tibble(pid = import(here::here("data/pid_igene_sample.rds")))
# long files that carry the parental pointers
pbrutto_long_cl <- import(here::here("data/pbrutto_long_cl_v36.rds"))
ppfad_long_cl <- import(here::here("data/ppfad_long_cl_v36.rds"))
One important piece of information in this regard is the relation to the head of household (available in: pbrutto_long$stell_l). It is almost the only reliable source which contains information on whether a sample member is biologically or socially related to other people in the household. You can see the values in the table below. The codes 20-29 contain possible relations of children to the head of the household.
goal: create dataset rel_hhh containing the relation to the head of household (hhh) in various forms.
# Frequency of every relation-to-head label over all rows of pbrutto
pbrutto_long_cl %>%
  tabyl(stell_l, show_missing_levels = FALSE) %>%
  adorn_pct_formatting()
# ppfad_long_cl %>% arrange(desc(syear)) %>% distinct(pid, .keep_all = T) %>% tabyl(netto_l, show_missing_levels = F) %>% adorn_pct_formatting()
parent dataset: pbrutto_long_cl (generated in 00_import_data.Rmd) new variables:
unique keys: syear, pid
# Build rel_hhh: the relation of every row in pbrutto to the head of household
# (hhh), derived from the numeric code `stell`.
#   hh_position8 / hh_position5: coarse position labels based on code ranges
#   flag_ischild:                TRUE for codes 20-29 (this range also holds
#                                the grandchild codes 25 and 26)
#   hhh_rel_type:                "<relation>_<major>_<minor1>_<minor2>" label,
#                                split below into hhh_rel, hhh_type_major and
#                                hhh_type_minor ("<minor1>_<minor2>")
# Codes that are not listed in the hhh_rel_type rules are labelled "mistake".
rel_hhh <- pbrutto_long_cl %>%
  mutate(
    hh_position8 = case_when(
      stell == 0 ~ "head",
      dplyr::between(stell, 10, 19) ~ "partner",
      dplyr::between(stell, 20, 29) ~ "child",
      dplyr::between(stell, 30, 39) ~ "parent",
      dplyr::between(stell, 40, 49) ~ "sibling",
      dplyr::between(stell, 50, 59) ~ "sibling-in-law",
      dplyr::between(stell, 60, 69) ~ "relative",
      dplyr::between(stell, 70, 99) ~ "other/unknown",
      TRUE ~ NA_character_
    ),
    hh_position5 = case_when(
      stell == 0 ~ "head",
      dplyr::between(stell, 10, 19) ~ "partner",
      dplyr::between(stell, 20, 29) ~ "child",
      dplyr::between(stell, 30, 39) ~ "relative",
      dplyr::between(stell, 40, 49) ~ "relative",
      dplyr::between(stell, 50, 59) ~ "other/unknown",
      dplyr::between(stell, 60, 69) ~ "relative",
      dplyr::between(stell, 70, 99) ~ "other/unknown",
      TRUE ~ NA_character_
    ),
    flag_ischild = dplyr::between(stell, 20, 29),
    hhh_rel_type = case_when(
      stell == 0 ~ "head_genetic_genetic_known",
      # partner
      stell %in% c(11, 12, 13) ~ "partner_social_social_known",
      # children and grandchildren
      stell %in% c(20) ~ "child_genetic_genetic_label",
      stell %in% c(21) ~ "child_genetic_genetic_known",
      stell %in% c(25, 26) ~ "grandchild_genetic_genetic_known",
      stell %in% c(22, 23, 24, 27) ~ "child_social_social_known",
      # parents and grandparents
      stell %in% c(30) ~ "parent_genetic_genetic_label",
      stell %in% c(31, 36) ~ "parent_genetic_genetic_known",
      stell %in% c(32, 33, 34, 35) ~ "parent_social_social_known",
      # siblings
      stell %in% c(40) ~ "sibling_genetic_genetic_label",
      stell %in% c(41, 42) ~ "sibling_genetic_genetic_known",
      stell %in% c(43, 44, 45, 51, 52) ~ "sibling_social_social_known",
      # other relatives
      stell %in% c(60, 61, 62, 63) ~ "relative_genetic_genetic_known",
      stell %in% c(64, 70, 71, 99) ~ "other_social_social_known",
      is.na(stell) ~ NA_character_,
      # any code without a rule above; caught by a test further down
      TRUE ~ "mistake"
    )
  ) %>%
  separate(
    hhh_rel_type,
    into = c("hhh_rel", "hhh_type_major", "hhh_type_minor1", "hhh_type_minor2"),
    remove = FALSE
  ) %>%
  unite(hhh_type_minor, hhh_type_minor1, hhh_type_minor2) %>%
  # unite() pastes missing pieces as the text "NA", so a missing relation would
  # end up as the literal string "NA_NA" while hhh_type_major is a real NA;
  # turn it back into a proper missing value so both columns agree
  mutate(hhh_type_minor = dplyr::na_if(hhh_type_minor, "NA_NA")) %>%
  select(
    pid, hid, cid, syear, stell, stell_l, flag_ischild,
    contains("hh_position"),
    hhh_rel_type, hhh_rel, hhh_type_major, hhh_type_minor
  )
rio::export(rel_hhh, here::here("data/rel_hhh.rds"))
check if rel_hhh sample-members are unique by survey year and pid
# each (syear, pid) combination may occur only once in rel_hhh
testthat::expect_equal(
  nrow(rel_hhh),
  nrow(distinct(rel_hhh, syear, pid))
)
check if we forgot to code any of the stell categories (in case new ones are added over time)
# no row may carry the fallback label "mistake" (i.e. an uncoded stell value)
testthat::expect_equal(
  nrow(filter(rel_hhh, hhh_rel_type == "mistake")),
  0
)
Frequencies of Relations to the Head of Household (hhh)
In the table below we used to see the frequencies of the relation to the head of hh types accumulated over all available years (from 1998)
If we run the commented-out code for each year, we see that the case numbers increase over time: they are in the 300s in each category before 2009 and then go up to over 3000 and then 8000.
# cross table: minor relation type (rows) by major relation type (columns)
rel_hhh %>%
  tabyl(hhh_type_minor, hhh_type_major)
# rel_hhh %>% tabyl(hhh_type_minor, hhh_type_major, syear)
Now we want to connect the igene-members to each of their household members.
goal: create one row for each pair of igene-member and household member in an igene-member’s household
parent dataset(s): ppfad_igene_long, ppfad_long_cl
new variables:
unique keys: syear, pid, match_id
# Pair every iGENE person-year with all persons sharing the same household id
# in the same survey year, then drop the self-pairs (pid == match_id).
other_hhmembers <- ppfad_igene_long %>%
  tidylog::distinct(syear, pid, hid) %>%
  tidylog::left_join(
    ppfad_long_cl %>%
      select(
        syear,
        match_id = pid,
        hid,
        match_sex = sex,
        match_sex_l = sex_l
      ),
    by = c("syear", "hid")
  ) %>%
  # rows without any match (match_id is NA) are dropped here as well,
  # because the comparison evaluates to NA
  tidylog::filter(pid != match_id) %>%
  mutate(
    match_gender = case_when(
      match_sex == 1 ~ "male",   # match_sex = [1] male
      match_sex == 2 ~ "female", # match_sex = [2] female
      TRUE ~ NA_character_
    )
  ) %>%
  select(-match_sex, -match_sex_l)
## distinct: no rows removed
## left_join: added 3 columns (match_id, match_sex, match_sex_l)
## > rows only in x 3
## > rows only in y (351,313)
## > matched rows 52,510 (includes duplicates)
## > =========
## > rows total 52,513
## filter: removed 20,575 rows (39%), 31,938 rows remaining
# Re-attach the pairs to the full list of iGENE person-years, so that persons
# without any other household member are kept (flag_singlehh = TRUE).
igene_hhmembers <- ppfad_igene_long %>%
  tidylog::distinct(syear, pid, hid) %>%
  tidylog::left_join(
    other_hhmembers,
    by = c("pid", "syear", "hid")
  ) %>%
  tidylog::mutate(flag_singlehh = is.na(match_id))
## distinct: no rows removed
## left_join: added 2 columns (match_id, match_gender)
## > rows only in x 4,422
## > rows only in y ( 0)
## > matched rows 31,938 (includes duplicates)
## > ========
## > rows total 36,360
## mutate: new variable 'flag_singlehh' with 2 unique values and 0% NA
check whether all iGENE members are still in the sample
# the number of distinct iGENE pids must be the same as in the source file
testthat::expect_equal(
  nrow(distinct(igene_hhmembers, pid)),
  nrow(distinct(ppfad_igene_long, pid))
)
# child_hhmembers %>% skim
# gender of the distinct household members (single households excluded)
igene_hhmembers %>%
  filter(!flag_singlehh) %>%
  distinct(match_id, .keep_all = TRUE) %>%
  tabyl(match_gender, show_missing_levels = FALSE)
check if igene-members are unique by survey year, pid, match_id
# (syear, pid, match_id) must identify a row uniquely
testthat::expect_equal(
  nrow(igene_hhmembers),
  nrow(distinct(igene_hhmembers, syear, pid, match_id))
)
Lets start to add the relations to head of household info! First, for the igene-members
goal: add relation to hhh for igene-member
parent dataset(s): igene_hhmembers, rel_hhh
new variables:
unique keys: syear, pid, match_id
# Attach the iGENE member's own relation to the head of household.
# All rel_hhh columns get the prefix "igene_"; join keys are pid and syear.
relations1 <- igene_hhmembers %>%
  left_join(
    rel_hhh %>%
      select(
        syear,
        pid,
        igene_stell = stell,
        igene_stell_l = stell_l,
        igene_hh_position8 = hh_position8,
        igene_hh_position5 = hh_position5,
        igene_hhh_rel_type = hhh_rel_type,
        igene_hhh_rel = hhh_rel,
        igene_hhh_type_major = hhh_type_major,
        igene_hhh_type_minor = hhh_type_minor
      ),
    by = c("pid", "syear")
  )
check if children are unique by survey year, child_id, match_id
# the join must not have duplicated any (syear, pid, match_id) row
testthat::expect_equal(
  nrow(relations1),
  nrow(distinct(relations1, syear, pid, match_id))
)
For all igene-members, what is their relationship to the hhh?
# one row per iGENE member (first row kept): minor by major relation type
relations1 %>%
  distinct(pid, .keep_all = TRUE) %>%
  tabyl(igene_hhh_type_minor, igene_hhh_type_major) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_title()
Let’s look at the distribution of the igene-sample members' position relative to the head of household (igene_stell_l).
# one row per iGENE member: relation-to-head labels, most frequent first
relations1 %>%
  distinct(pid, .keep_all = TRUE) %>%
  tabyl(igene_stell_l, show_missing_levels = FALSE) %>%
  arrange(desc(n)) %>%
  adorn_totals() %>%
  adorn_pct_formatting()
And in simpler categories, are the igene-sample members relations to the head of hh biological or social?
# one row per iGENE member, heads of household (stell 0) left out
relations1 %>%
  distinct(pid, .keep_all = TRUE) %>%
  filter(igene_stell != 0) %>%
  tabyl(igene_hhh_type_minor, show_missing_levels = FALSE) %>%
  arrange(desc(n)) %>%
  adorn_pct_formatting()
Are all the genetic children also children? What is the relation to the hhh?
n = 19 igene-members seem to be social children
options("tidylog.display" = list()) # turn off tidylog
# one row per iGENE member: relation to the hhh by type of that relation
relations1 %>%
  distinct(pid, .keep_all = TRUE) %>%
  tabyl(igene_hhh_rel, igene_hhh_type_minor, show_missing_levels = FALSE) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_title(
    row = "iGENE Sample Member is ... of hhh",
    col = "Type of relation"
  )
options("tidylog.display" = NULL) # turn on tidylog
Lets also add the relations of other hh-members to head of household!
goal: add relation to hhh for hh-members
parent dataset(s): relations1, rel_hhh
new variables:
unique keys: syear, pid, match_id
# Attach the relation to the head of household for the other household member.
# All rel_hhh columns get the prefix "match_"; join keys are match_id and syear.
relations2 <- relations1 %>%
  left_join(
    rel_hhh %>%
      select(
        syear,
        match_id = pid,
        match_stell = stell,
        match_stell_l = stell_l,
        match_hh_position8 = hh_position8,
        match_hh_position5 = hh_position5,
        match_hhh_rel_type = hhh_rel_type,
        match_hhh_rel = hhh_rel,
        match_hhh_type_major = hhh_type_major,
        match_hhh_type_minor = hhh_type_minor
      ),
    by = c("match_id", "syear")
  )
check if igene-members are unique by survey year, child_id, match_id
# the join must not have duplicated any (syear, pid, match_id) row
testthat::expect_equal(
  nrow(relations2),
  nrow(distinct(relations2, syear, pid, match_id))
)
# major relation type of the household members, one row per (pid, match_id)
relations2 %>%
  filter(!flag_singlehh) %>%
  distinct(pid, match_id, .keep_all = TRUE) %>%
  tabyl(match_hhh_type_major) %>%
  adorn_pct_formatting()
# missingness of both major types, summarised per survey year
relations2 %>%
  select(syear, igene_hhh_type_major, match_hhh_type_major) %>%
  group_by(syear) %>%
  skim()
Name | Piped data |
Number of rows | 36360 |
Number of columns | 3 |
_______________________ | |
Column type frequency: | |
character | 2 |
________________________ | |
Group variables | syear |
Variable type: character
skim_variable | syear | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|---|
igene_hhh_type_major | 1998 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 1999 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2000 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2001 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2002 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2003 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2004 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2005 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2006 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2007 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2008 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2009 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2010 | 4 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2011 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2012 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2013 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2014 | 1 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2015 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2016 | 1 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2017 | 1 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2018 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
igene_hhh_type_major | 2019 | 0 | 1.00 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 1998 | 18 | 0.94 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 1999 | 20 | 0.93 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2000 | 20 | 0.93 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2001 | 23 | 0.92 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2002 | 22 | 0.92 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2003 | 19 | 0.94 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2004 | 23 | 0.93 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2005 | 24 | 0.92 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2006 | 26 | 0.92 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2007 | 33 | 0.89 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2008 | 33 | 0.89 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2009 | 104 | 0.91 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2010 | 117 | 0.90 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2011 | 123 | 0.90 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2012 | 209 | 0.90 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2013 | 315 | 0.89 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2014 | 433 | 0.88 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2015 | 454 | 0.87 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2016 | 581 | 0.86 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2017 | 597 | 0.86 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2018 | 620 | 0.86 | 6 | 7 | 0 | 2 | 0 |
match_hhh_type_major | 2019 | 641 | 0.85 | 6 | 7 | 0 | 2 | 0 |
It seems that each year we have a couple of cases where pbrutto contains no information (i.e. no relation to the hhh) for household members of the children (igene-members) that are listed in ppath. Who are they?
goal: find non-matches that are available in ppfad, but not in pbrutto, and check what their netto is.
we find: the missing cases come mostly from persons who did not take part in the specific year
options("tidylog.display" = list()) # turn off tidylog
# rows of ppfad that have no counterpart (same syear and pid) in pbrutto
missing_pbrutto <- ppfad_long_cl %>%
  anti_join(pbrutto_long_cl, by = c("syear", "pid"))
# household members of iGENE persons that belong to that group
missings_rel2 <- relations2 %>%
  distinct(syear, match_id) %>%
  semi_join(missing_pbrutto, by = c("syear", "match_id" = "pid"))
# look up their netto label in ppfad and tabulate it
missings_rel2 %>%
  distinct(syear, match_id) %>%
  left_join(
    ppfad_long_cl %>%
      select(syear, pid, match_netto_l = netto_l),
    by = c("syear", "match_id" = "pid")
  ) %>%
  tabyl(match_netto_l, show_missing_levels = FALSE) %>%
  adorn_totals() %>%
  adorn_pct_formatting()
# missings_rel2 %>% tabyl(match_stell_l, show_missing_levels = F)
options("tidylog.display" = NULL) # turn on tidylog
Who are the people living with igene-members?
# one row per (pid, match_id): relation of both persons to the hhh
relations2 %>%
  filter(!flag_singlehh) %>%
  distinct(pid, match_id, .keep_all = TRUE) %>%
  tabyl(igene_hhh_rel, match_hhh_rel) %>%
  adorn_totals("col") %>%
  adorn_title(
    col = "HH-Member's Rel. to hhh ",
    row = "iGENE-Member is ... of the hhh"
  )
How does this look if we only look at family members who are also in the igene sample?
# ids of all iGENE members present in relations2
igene_pids <- relations2 %>%
  distinct(pid)
# same cross table as before, restricted to household members whose id is
# itself one of the iGENE pids
relations2 %>%
  # filter here
  semi_join(igene_pids, by = c("match_id" = "pid")) %>%
  distinct(pid, match_id, .keep_all = TRUE) %>%
  tabyl(igene_hhh_rel, match_hhh_rel) %>%
  adorn_totals("col") %>%
  adorn_title(
    col = "HH-Member's Rel. to hhh ",
    row = "iGENE-Member is ... of the hhh"
  )
goal: generate the relation of igene-members and hhmember-match pairs based on both of their relations to the head of household
parent dataset(s): relations2
new variable:
the relations can be both, social or genetic
help: you can read the code as follows: example third line of case_when statement -> if the relation of the igene-member is the “child” of the hhh and the match is also the “child” of the hhh, then the igene-member is a sibling of other hh-member
unique keys: syear, pid, match_id
# Derive igene_match_rel: what the iGENE member is TO the other household
# member (match), inferred from both persons' relations to the head (hhh).
# Read every rule as
#   "igene is <igene_hhh_rel> of the hhh, match is <match_hhh_rel> of the hhh
#    -> igene is <value> of the match".
# Rules are checked from top to bottom and the first one that applies wins.
# Combinations without a rule end up as "mistake" (tested further down).
relations3 <- relations2 %>%
  mutate(
    igene_match_rel = case_when(
      # first the specific definitions for each combination of relations to the hhh
      # igene = child
      igene_hhh_rel == "child" & match_hhh_rel == "child" ~ "sibling",
      igene_hhh_rel == "child" & match_hhh_rel == "grandchild" ~ "parent",
      igene_hhh_rel == "child" & match_hhh_rel == "parent" ~ "grandchild",
      igene_hhh_rel == "child" & match_hhh_rel == "partner" ~ "child",
      igene_hhh_rel == "child" & match_hhh_rel %in% c("relative", "sibling") ~ "relative",
      # igene = sibling
      igene_hhh_rel == "sibling" & match_hhh_rel == "child" ~ "relative",
      igene_hhh_rel == "sibling" & match_hhh_rel == "grandchild" ~ "relative",
      igene_hhh_rel == "sibling" & match_hhh_rel == "parent" ~ "child",
      igene_hhh_rel == "sibling" & match_hhh_rel == "partner" ~ "relative",
      igene_hhh_rel == "sibling" & match_hhh_rel == "sibling" ~ "sibling",
      igene_hhh_rel == "sibling" & match_hhh_rel == "relative" ~ "relative",
      # igene = grandchild
      igene_hhh_rel == "grandchild" & match_hhh_rel == "child" ~ "child",
      igene_hhh_rel == "grandchild" & match_hhh_rel == "grandchild" ~ "sibling",
      # NOTE(review): the match is three generations above igene here, yet the
      # label is "grandchild" - confirm whether that is intended
      igene_hhh_rel == "grandchild" & match_hhh_rel == "parent" ~ "grandchild",
      igene_hhh_rel == "grandchild" & match_hhh_rel == "partner" ~ "grandchild",
      igene_hhh_rel == "grandchild" & match_hhh_rel %in% c("relative", "sibling") ~ "relative",
      # igene = head
      igene_hhh_rel == "head" & match_hhh_rel == "child" ~ "parent",
      igene_hhh_rel == "head" & match_hhh_rel == "grandchild" ~ "grandparent",
      igene_hhh_rel == "head" & match_hhh_rel == "parent" ~ "child",
      igene_hhh_rel == "head" & match_hhh_rel == "partner" ~ "partner",
      igene_hhh_rel == "head" & match_hhh_rel %in% c("relative", "sibling") ~ "relative",
      # igene = partner
      igene_hhh_rel == "partner" & match_hhh_rel == "child" ~ "parent",
      # the partner of the hhh is a grandparent of the hhh's grandchild,
      # in line with the head/grandchild rule above (was coded as "parent")
      igene_hhh_rel == "partner" & match_hhh_rel == "grandchild" ~ "grandparent",
      igene_hhh_rel == "partner" & match_hhh_rel == "parent" ~ "child",
      igene_hhh_rel == "partner" & match_hhh_rel == "partner" ~ "partner",
      igene_hhh_rel == "partner" & match_hhh_rel %in% c("relative", "sibling") ~ "relative",
      # igene = relative
      igene_hhh_rel == "relative" & match_hhh_rel == "child" ~ "relative",
      igene_hhh_rel == "relative" & match_hhh_rel == "grandchild" ~ "relative",
      igene_hhh_rel == "relative" & match_hhh_rel == "parent" ~ "relative",
      igene_hhh_rel == "relative" & match_hhh_rel == "partner" ~ "relative",
      igene_hhh_rel == "relative" & match_hhh_rel %in% c("relative", "sibling") ~ "relative",
      # igene = parent
      igene_hhh_rel == "parent" & match_hhh_rel == "child" ~ "grandparent",
      # igene is the older person in this pair, so the label has to name the
      # ancestor side (was coded as "grandgrandchild", i.e. the wrong direction)
      igene_hhh_rel == "parent" & match_hhh_rel == "grandchild" ~ "grandgrandparent",
      igene_hhh_rel == "parent" & match_hhh_rel == "parent" ~ "partner",
      igene_hhh_rel == "parent" & match_hhh_rel == "partner" ~ "parent",
      igene_hhh_rel == "parent" & match_hhh_rel == "relative" ~ "relative",
      igene_hhh_rel == "parent" & match_hhh_rel == "sibling" ~ "parent",
      # now some general rules:
      # if the match is the hhh, igene's relation to the hhh is also the
      # relation to the match
      !is.na(igene_hhh_rel) & match_hhh_rel == "head" ~ igene_hhh_rel,
      # the relation is always "other" if match to hhh is "other"
      !is.na(igene_hhh_rel) & match_hhh_rel == "other" ~ "other",
      # the relation is always "other" if igene to hhh is "other"
      igene_hhh_rel == "other" & !is.na(match_hhh_rel) ~ "other",
      # no information on one of the two persons
      is.na(match_hhh_rel) & is.na(match_stell) ~ "missing match_stell",
      is.na(igene_hhh_rel) ~ "missing igene_stell",
      TRUE ~ "mistake"
    )
  )
check if igene-members are unique by survey year, pid, match_id
# (syear, pid, match_id) must still identify a row uniquely
testthat::expect_equal(
  nrow(relations3),
  nrow(distinct(relations3, syear, pid, match_id))
)
did we miss any combination?
# no pair may carry the fallback label "mistake"
testthat::expect_equal(
  nrow(filter(relations3, igene_match_rel == "mistake")),
  0
)
# relations3 %>%
# filter(igene_match_rel == "mistake") %>%
# tabyl(igene_hhh_rel, match_hhh_rel)
What are the direct relations between igene-members and their other hh-members?
# pairs with a known relation: relation to the match by relation to the hhh
relations3 %>%
  filter(
    !flag_singlehh,
    !str_detect(igene_match_rel, "missing")
  ) %>%
  distinct(pid, match_id, igene_match_rel, .keep_all = TRUE) %>%
  tabyl(igene_match_rel, igene_hhh_rel) %>%
  adorn_title(
    row = "iGENE-member is ... to hh-member",
    col = "iGENE-member is ... to hhh"
  )
parent dataset(s): relations3
new variable:
genetic_label: if the relation of the igene-member to the hhh is probably genetic but cannot be said for certain, due to an unclear label of the value pbrutto_long_cl$stell_h until 2011.
genetic_partner (for genetic there is also this category): if the relationship between igene-member and hhh is genetic and the other hh-member is the partner of the hhh. In this case the partner might be the second genetic parent. This cannot be said with certainty, but it also cannot be said with certainty that the relationship is definitely a social one.
unique keys: syear, pid, match_id, source, parent_nr
Now we will define the type of relationship between child and match and then we are done, and we will know the relation of the child to all other persons in the household for those where it is possible
# relation type of the iGENE member by single-household flag,
# sorted by the count in the FALSE column (largest first)
relations3 %>%
  tabyl(igene_hhh_rel_type, flag_singlehh) %>%
  arrange(desc(`FALSE`))
# relations3 %>% tabyl(child_stell_l, show_missing_levels = F) %>% adorn_pct_formatting()
what am I trying to do here?
i generate igene_match_type_minor = the type of relationship between igene member and other hhmember - it can be
# Classify the type of relation between iGENE member and household member:
#   igene_match_type_minor: detailed type (genetic_known, genetic_label,
#                           genetic_partner, genetic_stepchild,
#                           genetic_multigen, social_multigen, social_known)
#   igene_match_type_major: "genetic" or "social", derived from the minor type
#   igene_match_rel_type:   "<igene_match_rel>_<igene_match_type_minor>"
# Rules are checked from top to bottom and the first one that applies wins, so
# their order matters. Pairs without a rule are labelled "mistake".
relations4 <- relations3 %>%
  mutate(
    igene_match_type_minor = case_when(
      # first the specific cases:
      # special case with stepchildren and partners
      # [22] stepchild (child of the spouse/partner)
      igene_stell == 22 & match_hhh_rel == "partner" ~ "genetic_stepchild",
      # special case: igene is the hhh and the match is a parent of the hhh
      # [0] head of household, reference person
      # NOTE(review): this also applies when the match's own relation to the
      # hhh is coded as social - confirm that this is intended
      igene_stell == 0 & match_hhh_rel == "parent" ~ "genetic_known",
      # [25] grandchild + social relation of the match to the hhh
      # NOTE(review): match_hh_position5 == "child" is assigned to the whole
      # code range 20-29, which includes the grandchild codes - TODO confirm
      igene_stell == 25 &
        match_hh_position5 == "child" &
        match_hhh_type_major == "social" ~ "social_multigen",
      # [25] grandchild + genetic relation of the match to the hhh
      igene_stell == 25 &
        match_hh_position5 == "child" &
        match_hhh_type_major == "genetic" ~ "genetic_multigen",
      # if one of the two relations to the hhh is social and that person is
      # not the partner of the hhh, the relation is social
      match_hhh_type_minor == "social_known" &
        (match_hhh_rel != "partner" & !is.na(match_hhh_rel)) ~ "social_known",
      igene_hhh_type_minor == "social_known" &
        (igene_hhh_rel != "partner" & !is.na(match_hhh_rel)) ~ "social_known",
      # then the more general rules:
      # if the match is the hhh, the type equals igene's type towards the hhh
      match_hhh_rel == "head" ~ igene_hhh_type_minor,
      # identical relation types towards the hhh: keep igene's type
      igene_hhh_rel_type == match_hhh_rel_type ~ igene_hhh_type_minor,
      # genetic relatives of the hhh among each other; "label" on either
      # side makes the result "label"
      igene_hhh_type_minor == "genetic_known" &
        match_hhh_type_minor == "genetic_known" ~ "genetic_known",
      igene_hhh_type_minor == "genetic_label" &
        match_hhh_type_minor == "genetic_known" ~ "genetic_label",
      igene_hhh_type_minor == "genetic_known" &
        match_hhh_type_minor == "genetic_label" ~ "genetic_label",
      igene_hhh_type_minor == "genetic_label" &
        match_hhh_type_minor == "genetic_label" ~ "genetic_label",
      # partners of a (maybe) genetic hhh: called "partner" because they
      # might, but do not have to, be the biological parents
      igene_hhh_type_minor == "genetic_known" &
        match_hhh_rel_type == "partner_social_social_known" ~ "genetic_partner",
      igene_hhh_type_minor == "genetic_label" &
        match_hhh_rel_type == "partner_social_social_known" ~ "genetic_partner",
      igene_hhh_rel_type == "partner_social_social_known" &
        match_hhh_rel == "child" ~ "genetic_partner",
      igene_hhh_rel_type == "partner_social_social_known" &
        match_hhh_rel == "grandchild" ~ "genetic_partner",
      # match is sibling of the social partner
      igene_hhh_rel_type == "partner_social_social_known" &
        match_hhh_rel == "sibling" ~ "social_known",
      # match is parent of the social partner
      igene_hhh_rel_type == "partner_social_social_known" &
        match_hhh_rel == "parent" ~ "social_known",
      # igene_hhh_type_minor == "genetic_known" & match_hhh_type_minor == "social_known" ~ "social_known",
      is.na(match_stell) ~ "missing_match_stell",
      is.na(igene_stell) ~ "missing_igene_stell",
      TRUE ~ "mistake"
    ),
    igene_match_type_major = case_when(
      # all of these count as genetic:
      # - genetic_stepchild: igene is the child of the partner and the match
      #   is that partner
      # - social_multigen: in a multi-generation household we assume that,
      #   even if the relation between match and hhh is social
      #   (daughter-/son-in-law), igene is probably the biological grandchild
      #   and the child of that parent
      igene_match_type_minor %in% c(
        "genetic_known",
        "genetic_label",
        "genetic_partner",
        "genetic_stepchild",
        "genetic_multigen",
        "social_multigen"
      ) ~ "genetic",
      igene_match_type_minor == "social_known" ~ "social",
      is.na(match_stell) ~ "missing_match_stell",
      is.na(igene_stell) ~ "missing_igene_stell",
      TRUE ~ "mistake"
    )
  ) %>%
  unite(
    col = igene_match_rel_type,
    igene_match_rel, igene_match_type_minor,
    sep = "_",
    remove = FALSE,
    na.rm = FALSE
  )
# relations4 %>% filter(igene_match_type_minor == "mistake") %>% tabyl(igene_hhh_type_minor)
check if children are unique by survey year, pid, match_id
# (syear, pid, match_id) must still identify a row uniquely
testthat::expect_equal(
  nrow(relations4),
  nrow(distinct(relations4, syear, pid, match_id))
)
# relations4 %>% tabyl(igene_match_type_major, igene_match_type_minor, show_missing_levels = F) %>% adorn_title()
# neither type variable may carry the fallback label "mistake"
testthat::expect_equal(
  nrow(filter(relations4, igene_match_type_minor == "mistake")),
  0
)
testthat::expect_equal(
  nrow(filter(relations4, igene_match_type_major == "mistake")),
  0
)
# minor by major type, one row per (pid, match_id), missing types left out
relations4 %>%
  distinct(pid, match_id, .keep_all = TRUE) %>%
  filter(!str_detect(igene_match_type_major, "missing")) %>%
  tabyl(igene_match_type_minor, igene_match_type_major) %>%
  adorn_totals() %>%
  adorn_title()
Let's have a look at the somewhat weird case where the igene-member is the stepchild (child of the hhh's partner) and the other hh-member (match) is the partner of the hhh (so should be the parent). How often is this the case?
# stepchild of the hhh (stell 22) paired with the partner of the hhh, per year
relations4 %>%
  filter(
    igene_stell == 22,
    match_hhh_rel == "partner"
  ) %>%
  mutate_all(as.character) %>% # fix for bug that appeared april 2020
  tabyl(syear, igene_match_rel_type)
now we check out the cases, where the igene-member is head of hh and the match is the parent of the hhh
# head of household (stell 0) paired with a parent of the hhh, per year
relations4 %>%
  filter(
    igene_stell == 0,
    match_hhh_rel == "parent"
  ) %>%
  mutate_all(as.character) %>% # fix for bug that appeared april 2020
  tabyl(syear, igene_match_rel_type)
now we check out the cases, where the igene-member is sibling of hh and the match is the parent of the hhh
# sibling of the hhh paired with a parent of the hhh, per year
relations4 %>%
  filter(
    igene_stell == 41, # stell == 41 [41] Brother, Sister
    match_hhh_rel == "parent"
  ) %>%
  mutate_all(as.character) %>% # fix for bug that appeared april 2020
  tabyl(syear, igene_match_rel_type)
What are the relationships and the type of them for igene-members and their hh-members?
n = 1941 cases where the partner of the genetic parent might also be the genetic parent, but there is no way to find that out (igene_match_type_minor = “genetic_partner”).
Partners are always social (although there are supposed to be cases where this is not the case in the SOEP).
for this section it could be good to write some tests to ensure that the coding has no flaws
# relation by minor type, one table per major type;
# one row per (pid, match_id), missing types left out
relations4 %>%
  distinct(pid, match_id, .keep_all = TRUE) %>%
  filter(!str_detect(igene_match_type_major, "missing")) %>%
  tabyl(
    igene_match_rel, igene_match_type_minor, igene_match_type_major,
    show_missing_levels = FALSE
  ) %>%
  adorn_title()
## $genetic
## igene_match_type_minor
## igene_match_rel genetic_known genetic_label genetic_multigen genetic_partner genetic_stepchild social_multigen
## child 350 132 8 394 16 1
## grandchild 11 3 0 3 0 0
## grandgrandchild 0 1 0 0 0 0
## grandparent 20 5 0 0 0 0
## parent 635 210 0 550 0 0
## partner 0 1 0 974 0 0
## relative 26 2 0 2 0 0
## sibling 383 201 4 0 0 0
##
## $social
## igene_match_type_minor
## igene_match_rel social_known
## child 31
## grandchild 10
## grandparent 7
## other 88
## parent 95
## partner 723
## relative 3
## sibling 55
What are the frequencies of the different kinds of relationships?
# combined relation/type label, one row per (pid, match_id), most frequent first
relations4 %>%
  distinct(pid, match_id, .keep_all = TRUE) %>%
  tabyl(igene_match_rel_type) %>%
  arrange(desc(n)) %>%
  adorn_pct_formatting()
unique keys: syear, child_id, match_id
# Final dataset: keys first, then all igene_* columns, the household id of the
# match (a copy of hid), all match_* columns and the flags.
igene_hhmember_relations <- relations4 %>%
  mutate(match_hid = hid) %>%
  select(
    syear, pid, match_id, hid,
    starts_with("igene"),
    match_hid,
    starts_with("match"),
    starts_with("flag")
  )
# subset: only pairs whose match_id is listed in pid_igene_sample
igene_hhmember_relations_justigene <- igene_hhmember_relations %>%
  tidylog::semi_join(
    pid_igene_sample,
    by = c("match_id" = "pid")
  )
## semi_join: added no columns
## > rows only in x (16,432)
## > rows only in y ( 993)
## > matched rows 19,928
## > ========
## > rows total 19,928
# write both datasets to the data folder
export(
  igene_hhmember_relations,
  here::here("data/igene_hhmember_relations_v36.rds")
)
export(
  igene_hhmember_relations_justigene,
  here::here("data/igene_hhmember_relations_justigene_v36.rds")
)
# column names of the full dataset
igene_hhmember_relations %>%
  names()
## [1] "syear" "pid" "match_id" "hid" "igene_stell" "igene_stell_l"
## [7] "igene_hh_position8" "igene_hh_position5" "igene_hhh_rel_type" "igene_hhh_rel" "igene_hhh_type_major" "igene_hhh_type_minor"
## [13] "igene_match_rel_type" "igene_match_rel" "igene_match_type_minor" "igene_match_type_major" "match_hid" "match_gender"
## [19] "match_stell" "match_stell_l" "match_hh_position8" "match_hh_position5" "match_hhh_rel_type" "match_hhh_rel"
## [25] "match_hhh_type_major" "match_hhh_type_minor" "flag_singlehh"
check if children are unique by survey year
# (syear, pid, match_id) must identify a row uniquely in the final dataset
testthat::expect_equal(
  nrow(igene_hhmember_relations),
  nrow(distinct(igene_hhmember_relations, syear, pid, match_id))
)
I once thought about making the dataset into an even longer format: not having mother and father IDs in separate variables, but in one variable, with an identifier variable telling whether the match_id is male or female.
The code below comes from the part generate relations1 (insert link)