Review: Beforehand (insert link) we have identified all possible parental pointers for the igene-members in our sample.

Goal: Now we want to identify the relations that igene-members have with each member in their households.

preparation: read data needed in the script

# iGENE sample: long-format ppfad extract plus the person ids of the sample
ppfad_igene_long <- import(here::here("data/ppfad_igene_long_v36.rds"))
pid_igene_sample <- tibble(
  pid = import(here::here("data/pid_igene_sample.rds"))
)

# cleaned long-format pbrutto / ppfad data (these carry the parental pointers)
pbrutto_long_cl <- import(here::here("data/pbrutto_long_cl_v36.rds"))
ppfad_long_cl <- import(here::here("data/ppfad_long_cl_v36.rds"))

step1: get relation to head of hh

One important piece of information in this regard is the relation to the head of household (available in: pbrutto_long$stell_l). It is almost the only reliable source of information on whether a sample member is biologically or socially related to other people in the household. You can see the values in the table below. The codes 20-29 contain possible relations of children to the head of the household.

goal: create dataset rel_hhh containing the relation to the head of hhh in various forms.

# frequency table of the labelled relation-to-head variable (stell_l);
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables
pbrutto_long_cl %>%
  tabyl(stell_l, show_missing_levels = FALSE) %>%
  adorn_pct_formatting()
# ppfad_long_cl %>% arrange(desc(syear)) %>% distinct(pid, .keep_all = TRUE) %>% tabyl(netto_l, show_missing_levels = FALSE) %>% adorn_pct_formatting()

generate rel_hhh

parent dataset: pbrutto_long_cl (generated in 00_import_data.Rmd) new variables:

  • hh_position8: whether a person is one of 8 categories: head, child, relative, partner, other/unknown, parent, sibling, sibling-in-law
  • hh_position5: whether a person is one of 5 categories: head, child, relative, partner, other/unknown
  • flag_ischild: TRUE or FALSE depending on whether person is a child of head of household
  • hhh_rel_type: variable identifying the relationship and the type of relation (whether biological or social)
  • hhh_rel: just relation
  • hhh_type: just type of relationship - genetic is surely genetic (as sure as one can be with this type of data) - genetic_label is for cases like “child/adoptive child” from cases previous of 2011, where it was not distinguished. I would argue that probably most of those children are biological, since adoptive children are generally not common.

unique keys: syear, pid

# Build rel_hhh: one row per person-year (syear, pid) from pbrutto_long_cl,
# with the numeric relation-to-head code `stell` recoded into several
# classifications (see the variable list above).
rel_hhh <- pbrutto_long_cl %>%
  mutate(
    # coarse household position in 8 categories, by decade of the stell code;
    # codes outside the listed ranges (and missing stell) become NA
    hh_position8 = case_when(
      stell == 0 ~ "head",
      dplyr::between(stell, 10, 19) ~ "partner",
      dplyr::between(stell, 20, 29) ~ "child",
      dplyr::between(stell, 30, 39) ~ "parent",
      dplyr::between(stell, 40, 49) ~ "sibling",
      dplyr::between(stell, 50, 59) ~ "sibling-in-law",
      dplyr::between(stell, 60, 69) ~ "relative",
      dplyr::between(stell, 70, 99) ~ "other/unknown",
      TRUE ~ NA_character_
    ),
    # same idea in 5 categories: parents, siblings and the 60s collapse into
    # "relative"; the 50s (sibling-in-law) go to "other/unknown"
    hh_position5 = case_when(
      stell == 0 ~ "head",
      dplyr::between(stell, 10, 19) ~ "partner",
      dplyr::between(stell, 20, 29) ~ "child",
      dplyr::between(stell, 30, 39) ~ "relative",
      dplyr::between(stell, 40, 49) ~ "relative",
      dplyr::between(stell, 50, 59) ~ "other/unknown",
      dplyr::between(stell, 60, 69) ~ "relative",
      dplyr::between(stell, 70, 99) ~ "other/unknown",
      TRUE ~ NA_character_
    ),
    # NOTE(review): this range also covers codes 25/26, which hhh_rel_type
    # below labels "grandchild" - confirm that this is intended
    flag_ischild = dplyr::between(stell, 20, 29),
    # encoded as <relation>_<major type>_<minor type, two words>; the string
    # is split on "_" further down, so every label needs exactly four pieces
    hhh_rel_type = case_when(
      stell == 0 ~ "head_genetic_genetic_known",
      # partner
      stell %in% c(11, 12, 13) ~ "partner_social_social_known",
      # children and grandchildren
      stell %in% c(20) ~ "child_genetic_genetic_label",
      stell %in% c(21) ~ "child_genetic_genetic_known",
      stell %in% c(25, 26) ~ "grandchild_genetic_genetic_known",
      stell %in% c(22, 23, 24, 27) ~ "child_social_social_known",
      # parents and grandparents
      stell %in% c(30) ~ "parent_genetic_genetic_label",
      stell %in% c(31, 36) ~ "parent_genetic_genetic_known",
      stell %in% c(32, 33, 34, 35) ~ "parent_social_social_known",
      # siblings
      stell %in% c(40) ~ "sibling_genetic_genetic_label",
      stell %in% c(41, 42) ~ "sibling_genetic_genetic_known",
      stell %in% c(43, 44, 45, 51, 52) ~ "sibling_social_social_known",
      # other relatives
      stell %in% c(60, 61, 62, 63) ~ "relative_genetic_genetic_known",
      stell %in% c(64, 70, 71, 99) ~ "other_social_social_known",
      is.na(stell) ~ NA_character_,
      # any code not listed above is flagged so the check below can catch it
      TRUE ~ "mistake"
    )
  ) %>%
  separate(
    hhh_rel_type,
    into = c("hhh_rel", "hhh_type_major", "hhh_type_minor1", "hhh_type_minor2"),
    remove = FALSE
  ) %>%
  # Re-join the two minor pieces. unite() would paste missing pieces into the
  # literal string "NA_NA"; keep a real NA instead so is.na() works downstream.
  mutate(
    hhh_type_minor = if_else(
      is.na(hhh_type_minor1) | is.na(hhh_type_minor2),
      NA_character_,
      paste(hhh_type_minor1, hhh_type_minor2, sep = "_")
    )
  ) %>%
  select(
    pid, hid, cid, syear, stell, stell_l, flag_ischild,
    contains("hh_position"),
    hhh_rel_type, hhh_rel, hhh_type_major, hhh_type_minor
  )

# persist rel_hhh for later scripts
rel_hhh %>%
  rio::export(here::here("data/rel_hhh.rds"))

check if rel_hhh sample-members are unique by survey year and pid

# the number of rows must equal the number of distinct (syear, pid) keys
testthat::expect_equal(
  nrow(rel_hhh),
  rel_hhh %>% distinct(syear, pid) %>% nrow()
)

check if we forgot to code any of the stell categories (in case new ones are added over time)

# no stell code may have fallen through to the "mistake" fallback
testthat::expect_equal(
  nrow(filter(rel_hhh, hhh_rel_type == "mistake")),
  0
)

Frequencies of Relations to the Head of Household (hhh)

  • In the table below we used to see the frequencies of the relation to the head of hh types accumulated over all available years (from 1998)

  • If we run the commented out code for each year, we see that the case numbers increase over time: they are in the 300s in each category before 2009 and then go up to over 3000 and then 8000

# cross-tabulate minor by major relation type, pooled over all years
tabyl(rel_hhh, hhh_type_minor, hhh_type_major)
# tabyl(rel_hhh, hhh_type_minor, hhh_type_major, syear)

step2: add hh members per igene-member

Now we want to connect the igene-members to each of their household members.

goal: create one row for each igene-member and household member in a igene-member’s household

generate igene_hhmembers

parent dataset(s): ppfad_igene_long, ppfad_long_cl

new variables:

  • match_id: id of hh member in a igene-members household

unique keys: syear, pid, match_id

# For every iGENE person-year, list all other persons recorded in ppfad for
# the same household (hid) and survey year.
other_hhmembers <- ppfad_igene_long %>%
  tidylog::distinct(syear, pid, hid) %>%
  tidylog::left_join(
    ppfad_long_cl %>%
      select(
        syear, hid,
        match_id = pid,
        match_sex = sex,
        match_sex_l = sex_l
      ),
    by = c("syear", "hid")
  ) %>%
  # drop the self-match; rows without any match (match_id is NA) are dropped
  # here as well, because the comparison evaluates to NA
  tidylog::filter(pid != match_id) %>%
  mutate(
    # match_sex codes: [1] male, [2] female; anything else becomes NA
    match_gender = case_when(
      match_sex == 1 ~ "male",
      match_sex == 2 ~ "female",
      TRUE ~ NA_character_
    )
  ) %>%
  select(-match_sex, -match_sex_l)
## distinct: no rows removed
## left_join: added 3 columns (match_id, match_sex, match_sex_l)
##            > rows only in x         3
##            > rows only in y  (351,313)
##            > matched rows      52,510    (includes duplicates)
##            >                 =========
##            > rows total        52,513
## filter: removed 20,575 rows (39%), 31,938 rows remaining
# Re-attach the household members to the full list of iGENE person-years, so
# that person-years without any other household member are kept (match_id NA).
igene_hhmembers <- ppfad_igene_long %>%
  tidylog::distinct(syear, pid, hid) %>%
  tidylog::left_join(
    other_hhmembers,
    by = c("pid", "syear", "hid")
  ) %>%
  # TRUE when no other household member was found for this person-year
  tidylog::mutate(flag_singlehh = is.na(match_id))
## distinct: no rows removed
## left_join: added 2 columns (match_id, match_gender)
##            > rows only in x    4,422
##            > rows only in y  (     0)
##            > matched rows     31,938    (includes duplicates)
##            >                 ========
##            > rows total       36,360
## mutate: new variable 'flag_singlehh' with 2 unique values and 0% NA

check whether all iGENE members are still in the sample

# every pid of ppfad_igene_long must still be present in igene_hhmembers
testthat::expect_equal(
  nrow(distinct(igene_hhmembers, pid)),
  nrow(distinct(ppfad_igene_long, pid))
)
# child_hhmembers %>% skim

# gender of the household members, each match_id counted once,
# person-years without other household members excluded
igene_hhmembers %>%
  filter(!flag_singlehh) %>%
  distinct(match_id, .keep_all = TRUE) %>%
  tabyl(
    match_gender,
    show_missing_levels = FALSE
  )

check if igene-members are unique by survey year, pid, match_id

# the number of rows must equal the number of distinct (syear, pid, match_id) keys
testthat::expect_equal(
  nrow(igene_hhmembers),
  igene_hhmembers %>% distinct(syear, pid, match_id) %>% nrow()
)

step3: add child position information

Lets start to add the relations to head of household info! First, for the igene-members

goal: add relation to hhh for igene-member

generate relations1

parent dataset(s): igene_hhmembers, rel_hhh

new variables:

  • igene_stell, igene_stell_l: igene-members position to head of household, ending _l is the labelled version, the other one is numeric
  • all rel_hhh variables get the prefix igene_ so that we can tell them apart from the other hh-member’s stell_l variable

unique keys: syear, pid, match_id

# Attach the iGENE member's own relation-to-head variables; all rel_hhh
# columns are renamed with the prefix igene_ while selecting.
relations1 <- igene_hhmembers %>%
  left_join(
    rel_hhh %>%
      select(
        syear,
        pid,
        igene_stell = stell,
        igene_stell_l = stell_l,
        igene_hh_position8 = hh_position8,
        igene_hh_position5 = hh_position5,
        igene_hhh_rel_type = hhh_rel_type,
        igene_hhh_rel = hhh_rel,
        igene_hhh_type_major = hhh_type_major,
        igene_hhh_type_minor = hhh_type_minor
      ),
    by = c("pid", "syear")
  )

check if igene-members are unique by survey year, pid, match_id

# the join must not have duplicated any (syear, pid, match_id) row
testthat::expect_equal(
  nrow(relations1),
  relations1 %>% distinct(syear, pid, match_id) %>% nrow()
)

For all igene-members, what is their relationship to the hhh?

# one row per iGENE member (first row kept), minor by major relation type;
# TRUE is spelled out because T is an ordinary, reassignable variable
relations1 %>%
  distinct(pid, .keep_all = TRUE) %>%
  tabyl(igene_hhh_type_minor, igene_hhh_type_major) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_title()

Let’s look at the distribution of the igene-sample members position to head of hh igene_stell_l.

  • About 75% of the igene-sample are head of hh themselves (54,7%) or Spouses of the hhh (20,4%)
  • About 12% are biological children, and for 132 children they could be biological or adoptive
  • the rest is made up of very small numbers of other relatives or social children
# distribution of the labelled position to the head of household, one row per
# iGENE member (first row kept), most frequent category first
relations1 %>%
  distinct(pid, .keep_all = TRUE) %>%
  tabyl(igene_stell_l, show_missing_levels = FALSE) %>%
  arrange(desc(n)) %>%
  adorn_totals() %>%
  adorn_pct_formatting()

And in simpler categories, are the igene-sample members relations to the head of hh biological or social?

  • about two third of the relations to the head of hh are social (these are probably mostly spouses and partners)
# type of relation to the head of household, heads themselves (stell 0) excluded;
# rows with a missing igene_stell are dropped by the filter as well
relations1 %>%
  distinct(pid, .keep_all = TRUE) %>%
  filter(igene_stell != 0) %>%
  tabyl(igene_hhh_type_minor, show_missing_levels = FALSE) %>%
  arrange(desc(n)) %>%
  adorn_pct_formatting()

Are all the genetic children also children? What is the relation to the hhh ?

  • Here we can see that most igene-sample members who are children of the head of hh are (or probably are) genetic children
  • only n = 19 igene-members seem to be social children
options("tidylog.display" = list()) # turn off tidylog

# relation to the head of household by type of relation, one row per iGENE member;
# adorn_title() arguments are given by their full names (no partial matching)
relations1 %>%
  distinct(pid, .keep_all = TRUE) %>%
  tabyl(igene_hhh_rel, igene_hhh_type_minor, show_missing_levels = FALSE) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_title(
    row_name = "iGENE Sample Member is ... of hhh",
    col_name = "Type of relation"
  )
options("tidylog.display" = NULL) # turn on tidylog

step4: add hh member position info

Lets also add the relations of other hh-members to head of household!

goal: add relation to hhh for hh-members

generate relations2

parent dataset(s): relations1, rel_hhh

new variables:

  • match_stell, match_stell_l: the other hh-member's (match) position to head of household, ending _l is the labelled version, the other one is numeric
  • all rel_hhh variables get the prefix match_ so that we can tell them apart from the igene-member's stell_l variable

unique keys: syear, pid, match_id

# Attach the relation-to-head variables of the other household member
# (match_id); all rel_hhh columns are renamed with the prefix match_.
relations2 <- relations1 %>%
  left_join(
    rel_hhh %>%
      select(
        syear,
        match_id = pid,
        match_stell = stell,
        match_stell_l = stell_l,
        match_hh_position8 = hh_position8,
        match_hh_position5 = hh_position5,
        match_hhh_rel_type = hhh_rel_type,
        match_hhh_rel = hhh_rel,
        match_hhh_type_major = hhh_type_major,
        match_hhh_type_minor = hhh_type_minor
      ),
    by = c("match_id", "syear")
  )

check if igene-members are unique by survey year, pid, match_id

# the join must not have duplicated any (syear, pid, match_id) row
testthat::expect_equal(
  nrow(relations2),
  nrow(relations2 %>% distinct(syear, pid, match_id))
)
# major relation type of the household members, each pair counted once
relations2 %>%
  filter(!flag_singlehh) %>%
  distinct(pid, match_id, .keep_all = TRUE) %>%
  tabyl(match_hhh_type_major) %>%
  adorn_pct_formatting()
# missingness of both major types by survey year
relations2 %>%
  select(syear, igene_hhh_type_major, match_hhh_type_major) %>%
  group_by(syear) %>%
  skim()
Data summary
Name Piped data
Number of rows 36360
Number of columns 3
_______________________
Column type frequency:
character 2
________________________
Group variables syear

Variable type: character

skim_variable syear n_missing complete_rate min max empty n_unique whitespace
igene_hhh_type_major 1998 0 1.00 6 7 0 2 0
igene_hhh_type_major 1999 0 1.00 6 7 0 2 0
igene_hhh_type_major 2000 0 1.00 6 7 0 2 0
igene_hhh_type_major 2001 0 1.00 6 7 0 2 0
igene_hhh_type_major 2002 0 1.00 6 7 0 2 0
igene_hhh_type_major 2003 0 1.00 6 7 0 2 0
igene_hhh_type_major 2004 0 1.00 6 7 0 2 0
igene_hhh_type_major 2005 0 1.00 6 7 0 2 0
igene_hhh_type_major 2006 0 1.00 6 7 0 2 0
igene_hhh_type_major 2007 0 1.00 6 7 0 2 0
igene_hhh_type_major 2008 0 1.00 6 7 0 2 0
igene_hhh_type_major 2009 0 1.00 6 7 0 2 0
igene_hhh_type_major 2010 4 1.00 6 7 0 2 0
igene_hhh_type_major 2011 0 1.00 6 7 0 2 0
igene_hhh_type_major 2012 0 1.00 6 7 0 2 0
igene_hhh_type_major 2013 0 1.00 6 7 0 2 0
igene_hhh_type_major 2014 1 1.00 6 7 0 2 0
igene_hhh_type_major 2015 0 1.00 6 7 0 2 0
igene_hhh_type_major 2016 1 1.00 6 7 0 2 0
igene_hhh_type_major 2017 1 1.00 6 7 0 2 0
igene_hhh_type_major 2018 0 1.00 6 7 0 2 0
igene_hhh_type_major 2019 0 1.00 6 7 0 2 0
match_hhh_type_major 1998 18 0.94 6 7 0 2 0
match_hhh_type_major 1999 20 0.93 6 7 0 2 0
match_hhh_type_major 2000 20 0.93 6 7 0 2 0
match_hhh_type_major 2001 23 0.92 6 7 0 2 0
match_hhh_type_major 2002 22 0.92 6 7 0 2 0
match_hhh_type_major 2003 19 0.94 6 7 0 2 0
match_hhh_type_major 2004 23 0.93 6 7 0 2 0
match_hhh_type_major 2005 24 0.92 6 7 0 2 0
match_hhh_type_major 2006 26 0.92 6 7 0 2 0
match_hhh_type_major 2007 33 0.89 6 7 0 2 0
match_hhh_type_major 2008 33 0.89 6 7 0 2 0
match_hhh_type_major 2009 104 0.91 6 7 0 2 0
match_hhh_type_major 2010 117 0.90 6 7 0 2 0
match_hhh_type_major 2011 123 0.90 6 7 0 2 0
match_hhh_type_major 2012 209 0.90 6 7 0 2 0
match_hhh_type_major 2013 315 0.89 6 7 0 2 0
match_hhh_type_major 2014 433 0.88 6 7 0 2 0
match_hhh_type_major 2015 454 0.87 6 7 0 2 0
match_hhh_type_major 2016 581 0.86 6 7 0 2 0
match_hhh_type_major 2017 597 0.86 6 7 0 2 0
match_hhh_type_major 2018 620 0.86 6 7 0 2 0
match_hhh_type_major 2019 641 0.85 6 7 0 2 0

It seems that each year, we have a couple of cases where there is no information in pbrutto (their relation to the hhh) for the household members of igene-members found in ppath. Who are they?

excurse: finding non-matches

goal: find non-matches that are available in ppfad, but not in pbrutto

  • missing_pbrutto: subset of rows in ppath that do not have a match in pbrutto for pid and syear
  • missings_rel2: subset of our latest dataset relations2, but only those row that were missing in pbrutto
  • for those cases we join meta-info from ppath_long to see what their survey status netto is.

we find: the missing cases come mostly from persons who did not take part in the specific year

options("tidylog.display" = list()) # turn off tidylog

# person-years that exist in ppfad but have no row in pbrutto
missing_pbrutto <- ppfad_long_cl %>%
  anti_join(pbrutto_long_cl, by = c("syear", "pid"))

# household members (match_id) of iGENE members that fall into that gap
missings_rel2 <- relations2 %>%
  distinct(syear, match_id) %>%
  semi_join(missing_pbrutto, by = c("syear", "match_id" = "pid"))

# survey status (netto_l) of those household members; missings_rel2 is already
# distinct on (syear, match_id), so no further de-duplication is needed
missings_rel2 %>%
  left_join(
    ppfad_long_cl %>%
      select(syear, pid, match_netto_l = netto_l),
    by = c("syear", "match_id" = "pid")
  ) %>%
  tabyl(match_netto_l, show_missing_levels = FALSE) %>%
  adorn_totals() %>%
  adorn_pct_formatting()
# missings_rel2 %>% tabyl(match_stell_l, show_missing_levels = FALSE)
options("tidylog.display" = NULL) # turn on tidylog

Who are the people living with igene-members?

  • when the igene-member is the head of the hh, most other hh-members are partners or children
  • when the igene-member is a child of the hhh, most other members are also mostly children or the head themselves
  • when the igene-member is the partner of the hhh, most other hh-members are head of hh or children of the hhh
  • so basically it all makes sense
  • note: for these numbers, only one igene-hhmember pair over time is kept. in the next table you can see the same thing but with all observations over time
# iGENE member's relation to the hhh by the household member's relation to the
# hhh, each (pid, match_id) pair counted once (first row kept)
relations2 %>%
  filter(!flag_singlehh) %>%
  distinct(pid, match_id, .keep_all = TRUE) %>%
  tabyl(igene_hhh_rel, match_hhh_rel) %>%
  adorn_totals("col") %>%
  adorn_title(
    col_name = "HH-Member's Rel. to hhh ",
    row_name = "iGENE-Member is ... of the hhh"
  )

How does this look if we only look at family members who are also in the igene sample?

# all person ids of the iGENE sample
igene_pids <- relations2 %>% distinct(pid)

# same table as before, restricted to household members who are themselves
# part of the iGENE sample
relations2 %>%
  semi_join(igene_pids, by = c("match_id" = "pid")) %>%
  distinct(pid, match_id, .keep_all = TRUE) %>%
  tabyl(igene_hhh_rel, match_hhh_rel) %>%
  adorn_totals("col") %>%
  adorn_title(
    col_name = "HH-Member's Rel. to hhh ",
    row_name = "iGENE-Member is ... of the hhh"
  )

step5: generate igene-member to hh-member relations

goal: generate the relation of igene-members and hhmember-match pairs based on both of their relations to the head of household

generate relations3

parent dataset(s): relations2

new variable:

  • igene_match_rel: the relation of the child to the other hh-member (match), based on both relations to the head of hh
  • the relations can be both, social or genetic

  • help: you can read the code as follows: example third line of case_when statement -> if the relation of the igene-member is the “child” of the hhh and the match is also the “child” of the hhh, then the igene-member is a sibling of other hh-member

unique keys: syear, pid, match_id

# Derive igene_match_rel: what the iGENE member is to the other household
# member (match), inferred from both persons' relation to the head of
# household (hhh). Read each rule as: "iGENE member is <igene_hhh_rel> of the
# hhh, match is <match_hhh_rel> of the hhh -> iGENE member is <value> of match".
# The rules are evaluated top to bottom; the first one that applies wins.
relations3 <- relations2 %>%
  mutate(
    igene_match_rel = case_when(
      # first the specific definitions for each combination
      # igene = child
      igene_hhh_rel == "child" & match_hhh_rel == "child" ~ "sibling",
      igene_hhh_rel == "child" & match_hhh_rel == "grandchild" ~ "parent",
      igene_hhh_rel == "child" & match_hhh_rel == "parent" ~ "grandchild",
      igene_hhh_rel == "child" & match_hhh_rel == "partner" ~ "child",
      igene_hhh_rel == "child" & match_hhh_rel %in% c("relative", "sibling") ~ "relative",
      # igene = sibling
      igene_hhh_rel == "sibling" & match_hhh_rel == "child" ~ "relative",
      igene_hhh_rel == "sibling" & match_hhh_rel == "grandchild" ~ "relative",
      igene_hhh_rel == "sibling" & match_hhh_rel == "parent" ~ "child",
      igene_hhh_rel == "sibling" & match_hhh_rel == "partner" ~ "relative",
      igene_hhh_rel == "sibling" & match_hhh_rel == "sibling" ~ "sibling",
      igene_hhh_rel == "sibling" & match_hhh_rel == "relative" ~ "relative",
      # igene = grandchild
      igene_hhh_rel == "grandchild" & match_hhh_rel == "child" ~ "child",
      igene_hhh_rel == "grandchild" & match_hhh_rel == "grandchild" ~ "sibling",
      # NOTE(review): a grandchild of the hhh is one generation further away
      # from the hhh's parent than "grandchild" says - confirm whether the
      # coarser label is intended here
      igene_hhh_rel == "grandchild" & match_hhh_rel == "parent" ~ "grandchild",
      igene_hhh_rel == "grandchild" & match_hhh_rel == "partner" ~ "grandchild",
      igene_hhh_rel == "grandchild" & match_hhh_rel %in% c("relative", "sibling") ~ "relative",
      # igene = head
      igene_hhh_rel == "head" & match_hhh_rel == "child" ~ "parent",
      igene_hhh_rel == "head" & match_hhh_rel == "grandchild" ~ "grandparent",
      igene_hhh_rel == "head" & match_hhh_rel == "parent" ~ "child",
      igene_hhh_rel == "head" & match_hhh_rel == "partner" ~ "partner",
      igene_hhh_rel == "head" & match_hhh_rel %in% c("relative", "sibling") ~ "relative",
      # igene = partner
      igene_hhh_rel == "partner" & match_hhh_rel == "child" ~ "parent",
      # fixed: was "parent"; the partner of the hhh is a grandparent of the
      # hhh's grandchild, in line with the head/grandchild rule above
      igene_hhh_rel == "partner" & match_hhh_rel == "grandchild" ~ "grandparent",
      igene_hhh_rel == "partner" & match_hhh_rel == "parent" ~ "child",
      igene_hhh_rel == "partner" & match_hhh_rel == "partner" ~ "partner",
      igene_hhh_rel == "partner" & match_hhh_rel %in% c("relative", "sibling") ~ "relative",
      # igene = relative
      igene_hhh_rel == "relative" & match_hhh_rel == "child" ~ "relative",
      igene_hhh_rel == "relative" & match_hhh_rel == "grandchild" ~ "relative",
      igene_hhh_rel == "relative" & match_hhh_rel == "parent" ~ "relative",
      igene_hhh_rel == "relative" & match_hhh_rel == "partner" ~ "relative",
      igene_hhh_rel == "relative" & match_hhh_rel %in% c("relative", "sibling") ~ "relative",
      # igene = parent
      igene_hhh_rel == "parent" & match_hhh_rel == "child" ~ "grandparent",
      # fixed: was "grandgrandchild", which is the inverse direction; the
      # parent of the hhh is the older generation relative to the hhh's
      # grandchild
      igene_hhh_rel == "parent" & match_hhh_rel == "grandchild" ~ "grandgrandparent",
      igene_hhh_rel == "parent" & match_hhh_rel == "parent" ~ "partner",
      igene_hhh_rel == "parent" & match_hhh_rel == "partner" ~ "parent",
      igene_hhh_rel == "parent" & match_hhh_rel == "relative" ~ "relative",
      igene_hhh_rel == "parent" & match_hhh_rel == "sibling" ~ "parent",
      # now some general rules:
      # if the match is the hhh, the relation to the match equals the
      # iGENE member's relation to the hhh
      !is.na(igene_hhh_rel) & match_hhh_rel == "head" ~ igene_hhh_rel,
      # relation to the match is always "other" if match to hhh is "other"
      !is.na(igene_hhh_rel) & match_hhh_rel == "other" ~ "other",
      # relation to the match is always "other" if iGENE member to hhh is "other"
      igene_hhh_rel == "other" & !is.na(match_hhh_rel) ~ "other",
      # no relation-to-head information for the match (includes rows without
      # any household member, where match_id is NA)
      is.na(match_hhh_rel) & is.na(match_stell) ~ "missing match_stell",
      is.na(igene_hhh_rel) ~ "missing igene_stell",
      # any combination not covered above is flagged for the check below
      TRUE ~ "mistake"
    )
  )

check if igene-members are unique by survey year, pid, match_id

# adding igene_match_rel must not change the (syear, pid, match_id) key
testthat::expect_equal(
  nrow(relations3),
  relations3 %>% distinct(syear, pid, match_id) %>% nrow()
)

did we miss any combination?

# no pair may have fallen through to the "mistake" fallback
testthat::expect_equal(
  relations3 %>% filter(igene_match_rel == "mistake") %>% nrow(),
  0
)

# to inspect uncovered combinations:
# relations3 %>%
#    filter(igene_match_rel == "mistake") %>%
#    tabyl(igene_hhh_rel, match_hhh_rel)

What are the direct relations between igene-members and their other hh-members?

  • when the igene-member is the child or sibling of the other hh-member, most igene-members are also children of the hhh
  • when the igene-member is the partner or parent of the other hh-member, most other hh-members are head or partner of the hhh
  • so basically it all makes sense
# relation to the household member by relation to the hhh; pairs with missing
# stell information are excluded, each (pid, match_id, relation) counted once
relations3 %>%
  filter(
    !flag_singlehh,
    !str_detect(igene_match_rel, "missing")
  ) %>%
  distinct(pid, match_id, igene_match_rel, .keep_all = TRUE) %>%
  tabyl(igene_match_rel, igene_hhh_rel) %>%
  adorn_title(
    row_name = "iGENE-member is ... to hh-member",
    col_name = "iGENE-member is ... to hhh"
  )

generate relations4

parent dataset(s): relations3

new variable:

  • igene_match_type: the type of relationship of the igene-member to the other hh-member (match), based on both type of relations to the head of hh
  • the relations can be both, social or genetic - for genetic we choose genetic_label if the rel. of the igene-member to the hhh is probably genetic but cannot be said for certain due to an unclear label of the value pbrutto_long_cl$stell_h until 2011 - for genetic there is also genetic_partner if the relationship between igene-member and hhh is genetic and the other hh-member is the partner of the hhh. In this case the partner might be the second genetic parent. It cannot be said with certainty. But it can also not be said with certainty that the relationship is definitely a social one.
  • help: you can read the code as follows: example first line of case_when statement -> if the relation of the igene-member is the “[22] step child” of the hhh and the match is also the “partner” of the hhh, then the igene-member is a genetic child of the other hh-member (match)

unique keys: syear, pid, match_id, source, parent_nr

Now we will define the type of relationship between child and match and then we are done, and we will know the relation of the child to all other persons in the household for those where it is possible

# iGENE member's relation/type to the hhh by single-household flag, sorted by
# the count in the FALSE column (rows that do have other household members)
relations3 %>%
  tabyl(igene_hhh_rel_type, flag_singlehh) %>%
  arrange(desc(`FALSE`))
# relations3 %>% tabyl(child_stell_l, show_missing_levels = F) %>% adorn_pct_formatting()

what am I trying to do here?

i generate igene_match_type_minor = the type of relationship between igene member and other hhmember - it can be

# Derive the type of relationship between the iGENE member and the other
# household member (match):
#   - igene_match_type_minor: detailed type; the rules are evaluated top to
#     bottom and the first one that applies wins, so their order matters
#   - igene_match_type_major: the minor type collapsed to genetic / social
#   - igene_match_rel_type: igene_match_rel and the minor type joined by "_"
relations4 <- relations3 %>%
      mutate(igene_match_type_minor = case_when(
            # first the specific cases:
            # special cases with stepchildren and partners
            igene_stell == 22 & match_hhh_rel == "partner" ~ "genetic_stepchild",  # [22] stepchild (child of the spouse/partner)
            # special case: iGENE member is the hhh and the match is a parent of the hhh
            # NOTE(review): match_hhh_rel == "parent" also covers the stell codes
            # that rel_hhh labels parent_social (32-35) and parent_genetic_label (30);
            # this rule fires before the social rules below - confirm this is intended
            igene_stell == 0 & match_hhh_rel == "parent" ~ "genetic_known", # [0] head of household, reference person
            igene_stell == 25 & match_hh_position5 == "child" & match_hhh_type_major == "social" ~ "social_multigen", # [25] grandchild + social match hhh rel
            igene_stell == 25 & match_hh_position5 == "child" & match_hhh_type_major == "genetic" ~ "genetic_multigen", # [25] grandchild + genetic match hhh rel
            # if either relation to the hhh is social and the person concerned is not the partner of the hhh, the relation is social
            match_hhh_type_minor == "social_known" & (match_hhh_rel != "partner" & !is.na(match_hhh_rel)) ~ "social_known",
            igene_hhh_type_minor == "social_known" & (igene_hhh_rel != "partner" & !is.na(match_hhh_rel)) ~ "social_known",
            # then the more general rules:
            # if match is hhh then igene_match is the same as the igene_hhh relation type
            match_hhh_rel == "head" ~ igene_hhh_type_minor, 
            # if igene_hhh and match_hhh have the identical relation-and-type label, the iGENE member's type is carried over
            igene_hhh_rel_type == match_hhh_rel_type ~ igene_hhh_type_minor,
            # genetic relations to the hhh on both sides: "genetic_label" as soon as one side is only genetic by label
            igene_hhh_type_minor == "genetic_known" & match_hhh_type_minor == "genetic_known" ~ "genetic_known",
            igene_hhh_type_minor == "genetic_label" & match_hhh_type_minor == "genetic_known" ~ "genetic_label", 
            igene_hhh_type_minor == "genetic_known" & match_hhh_type_minor == "genetic_label" ~ "genetic_label", 
            igene_hhh_type_minor == "genetic_label" & match_hhh_type_minor == "genetic_label" ~ "genetic_label",
            # for partners of genetic or maybe genetic hhh, I call this partner because they might, but do not necessarily have to be the biological parents
            igene_hhh_type_minor == "genetic_known" & match_hhh_rel_type == "partner_social_social_known" ~ "genetic_partner",
            igene_hhh_type_minor == "genetic_label" & match_hhh_rel_type == "partner_social_social_known" ~ "genetic_partner",
            igene_hhh_rel_type == "partner_social_social_known" & match_hhh_rel == "child" ~ "genetic_partner",
            igene_hhh_rel_type == "partner_social_social_known" & match_hhh_rel == "grandchild" ~ "genetic_partner",
            igene_hhh_rel_type == "partner_social_social_known" & match_hhh_rel == "sibling" ~ "social_known", # match is sibling of social partner
            igene_hhh_rel_type == "partner_social_social_known" & match_hhh_rel == "parent" ~ "social_known", # match is parent of social partner
            # igene_hhh_type_minor == "genetic_known" & match_hhh_type_minor == "social_known" ~ "social_known",
            # no stell information for the match / for the iGENE member
            is.na(match_stell) ~ "missing_match_stell",
            is.na(igene_stell) ~ "missing_igene_stell",
            # anything not covered above is flagged for the checks that follow
            TRUE ~ "mistake"
            ),
            # collapse the minor type into genetic / social
            igene_match_type_major = case_when(
                  igene_match_type_minor == "genetic_known" ~ "genetic",
                  igene_match_type_minor == "genetic_label" ~ "genetic",
                  igene_match_type_minor == "genetic_partner" ~ "genetic",
                  # stepchild indicates those cases where the child is the child of the partner and the match is the partner
                  igene_match_type_minor == "genetic_stepchild" ~ "genetic",
                  igene_match_type_minor == "genetic_multigen" ~ "genetic",
                  # in a multigen household, we assume that even if the relation between the match and the hhh is social (daughter-/son-in-law), the child is probably the biological grandchild and child of that parent
                  igene_match_type_minor == "social_multigen" ~ "genetic", 
                  igene_match_type_minor == "social_known" ~ "social",
                  is.na(match_stell) ~ "missing_match_stell",
                  is.na(igene_stell) ~ "missing_igene_stell",
                  TRUE ~ "mistake" 
            )
      ) %>% 
      # combined label; remove = F keeps both input columns, and with
      # na.rm = FALSE a missing piece is pasted as the text "NA"
      unite(col = igene_match_rel_type, igene_match_rel, igene_match_type_minor, sep = "_", remove = F, na.rm = FALSE)
# relations4 %>% filter(igene_match_type_minor == "mistake") %>% tabyl(igene_hhh_type_minor) 

check if children are unique by survey year, pid, match_id

# the (syear, pid, match_id) key must still be unique
testthat::expect_equal(
  nrow(relations4),
  relations4 %>% distinct(syear, pid, match_id) %>% nrow()
)
# relations4 %>% tabyl(igene_match_type_major, igene_match_type_minor, show_missing_levels = F) %>% adorn_title()
# neither type variable may contain the "mistake" fallback
testthat::expect_equal(
  relations4 %>% filter(igene_match_type_minor == "mistake") %>% nrow(),
  0
)
testthat::expect_equal(
  relations4 %>% filter(igene_match_type_major == "mistake") %>% nrow(),
  0
)
# minor by major type, each (pid, match_id) pair counted once, pairs with
# missing stell information excluded
relations4 %>%
  distinct(pid, match_id, .keep_all = TRUE) %>%
  filter(!str_detect(igene_match_type_major, "missing")) %>%
  tabyl(igene_match_type_minor, igene_match_type_major) %>%
  adorn_totals() %>%
  adorn_title()

Let's have a look at the somewhat odd case where the igene-member is the stepchild (child of the hhh's partner) and the other hh-member (match) is the partner of the hhh (and so should be the parent). How often does this occur?

  • that's a couple of cases per year
# Per-year counts of each relation/type label for rows where the igene-member
# has stell code 22 and the match is the partner of the head of household.
relations4 %>%
  filter(igene_stell == 22 & match_hhh_rel == "partner") %>%
  # convert every column to character: fix for bug that appeared april 2020
  mutate_all(as.character) %>%
  tabyl(syear, igene_match_rel_type)

Now we look at the cases where the igene-member is the head of the household and the match is the parent of the hhh.

# Per-year counts of each relation/type label for rows where the igene-member
# has stell code 0 and the match is the parent of the head of household.
relations4 %>%
  filter(igene_stell == 0 & match_hhh_rel == "parent") %>%
  # convert every column to character: fix for bug that appeared april 2020
  mutate_all(as.character) %>%
  tabyl(syear, igene_match_rel_type)

Now we look at the cases where the igene-member is a sibling of the hhh and the match is the parent of the hhh.

# Per-year counts of each relation/type label for rows where the igene-member
# has stell code 41 ([41] Brother, Sister) and the match is the parent of the
# head of household.
relations4 %>%
  filter(igene_stell == 41 & match_hhh_rel == "parent") %>%
  # convert every column to character: fix for bug that appeared april 2020
  mutate_all(as.character) %>%
  tabyl(syear, igene_match_rel_type)

What are the relationships and the type of them for igene-members and their hh-members?

  • as we saw before, a lot of genetic siblings and genetic or maybe genetic parents
  • a problem might be those cases (n = 1941) where the partner of the genetic parent might also be a genetic parent, but there is no way to find that out (igene_match_type_minor = “genetic_partner”)
  • partners are always social (although there are supposed to be cases where this is not the case in the SOEP)

  • for this section it could be good to write some tests to ensure that the coding has no flaws

# Three-way cross-tabulation of relation x minor type, split by major type,
# with one row per pid/match_id pair. Pairs whose major type contains
# "missing" are excluded.
relations4 %>%
  distinct(pid, match_id, .keep_all = TRUE) %>%
  filter(!str_detect(igene_match_type_major, "missing")) %>%
  tabyl(
    igene_match_rel, igene_match_type_minor, igene_match_type_major,
    # spelled out as FALSE: the shorthand `F` is an ordinary variable that can
    # be reassigned, which would silently change this argument
    show_missing_levels = FALSE
  ) %>%
  adorn_title()
## $genetic
##                  igene_match_type_minor                                                                                 
##  igene_match_rel          genetic_known genetic_label genetic_multigen genetic_partner genetic_stepchild social_multigen
##            child                    350           132                8             394                16               1
##       grandchild                     11             3                0               3                 0               0
##  grandgrandchild                      0             1                0               0                 0               0
##      grandparent                     20             5                0               0                 0               0
##           parent                    635           210                0             550                 0               0
##          partner                      0             1                0             974                 0               0
##         relative                     26             2                0               2                 0               0
##          sibling                    383           201                4               0                 0               0
## 
## $social
##                  igene_match_type_minor
##  igene_match_rel           social_known
##            child                     31
##       grandchild                     10
##      grandparent                      7
##            other                     88
##           parent                     95
##          partner                    723
##         relative                      3
##          sibling                     55

What are the frequencies of the different kinds of relationships?

# Frequency table of the combined relation/type label, one row per
# pid/match_id pair, sorted from most to least frequent.
relations4 %>%
  # spelled out as TRUE: the shorthand `T` is an ordinary variable that can be
  # reassigned, which would silently drop the other columns
  distinct(pid, match_id, .keep_all = TRUE) %>%
  tabyl(igene_match_rel_type) %>%
  arrange(desc(n)) %>%
  adorn_pct_formatting()

generate igene_hhmember_relations

unique keys: syear, pid, match_id

# Final relations dataset: key columns first, then the igene-member columns,
# the match columns (with match_hid as a copy of hid) and the flag columns.
igene_hhmember_relations <- relations4 %>%
  mutate(match_hid = hid) %>%
  select(
    syear, pid, match_id, hid,
    starts_with("igene"),
    match_hid, starts_with("match"),
    starts_with("flag")
  )

# Subset keeping only rows whose match_id appears as a pid in
# pid_igene_sample; tidylog prints a summary of the join.
igene_hhmember_relations_justigene <- igene_hhmember_relations %>%
  tidylog::semi_join(pid_igene_sample, by = c(match_id = "pid"))
## semi_join: added no columns
##            > rows only in x  (16,432)
##            > rows only in y  (   993)
##            > matched rows     19,928
##            >                 ========
##            > rows total       19,928
# Write the full dataset and the igene-only subset to the data folder.
export(
  igene_hhmember_relations,
  here::here("data/igene_hhmember_relations_v36.rds")
)
export(
  igene_hhmember_relations_justigene,
  here::here("data/igene_hhmember_relations_justigene_v36.rds")
)

# Show the column names of the full dataset.
names(igene_hhmember_relations)
##  [1] "syear"                  "pid"                    "match_id"               "hid"                    "igene_stell"            "igene_stell_l"         
##  [7] "igene_hh_position8"     "igene_hh_position5"     "igene_hhh_rel_type"     "igene_hhh_rel"          "igene_hhh_type_major"   "igene_hhh_type_minor"  
## [13] "igene_match_rel_type"   "igene_match_rel"        "igene_match_type_minor" "igene_match_type_major" "match_hid"              "match_gender"          
## [19] "match_stell"            "match_stell_l"          "match_hh_position8"     "match_hh_position5"     "match_hhh_rel_type"     "match_hhh_rel"         
## [25] "match_hhh_type_major"   "match_hhh_type_minor"   "flag_singlehh"

Check that the rows of igene_hhmember_relations are unique by survey year, pid and match_id.

# Every (syear, pid, match_id) combination must occur exactly once in the
# final dataset.
testthat::expect_equal(
  nrow(igene_hhmember_relations),
  nrow(distinct(igene_hhmember_relations, syear, pid, match_id))
)

Cutouts

I once thought about converting the dataset into an even longer format, with mother and father IDs not kept separately but stored in a single variable, plus an identifier variable indicating whether the match_id is male or female.

The code below comes from the part generate relations1 (insert link)