Step 1: Identify potential children

goal

Our goal is to create a dataset ppfad_igene_long.rds containing all members of the IGENE sample and the survey years in which they have previously participated

Input

soepis_igene_raw: a dataset containing the information about the participants of the genetic sample (key identifier: pid)
ppfad_long: a dataset containing information about all participants who took part in SOEP-IS (key identifier: syear, pid)

# Read the three input datasets from the project's data/ directory.
# here::here() builds each path relative to the project root.
soepis_igene_raw <- rio::import(
  file = here::here("data", "soepis_igene_raw.rds")
)
ppfad_long <- rio::import(
  file = here::here("data", "ppfad_long_v36.rds")
)
pbrutto_long_cl <- rio::import(
  file = here::here("data", "pbrutto_long_cl_v36.rds")
)

N soepis_igene_raw = 4406

Generated within

igene_lab: modification of soepis_igene_raw.rds - the values are not represented by numbers but by their actual value labels. Practical for frequency tables.
tables_list: a list of frequency tables for all variables of soepis_igene_raw
tables_list_l: a copy of tables_list whose elements are named after the variable labels of soepis_igene_raw (tables_list itself is named after the variable names)

# Labelled copy of the raw data: every column is passed through
# sjlabelled::as_label() so that tables show value labels instead of codes.
igene_lab <- soepis_igene_raw %>%
  mutate(across(everything(), sjlabelled::as_label))

# One janitor::tabyl() table per column; purrr::map() keeps the column
# (variable) names of igene_lab as the list names.
tables_list <- purrr::map(igene_lab, janitor::tabyl)

# Same tables, but named after the variable labels. A column without a usable
# variable label keeps its variable name, so a single unlabelled column does
# not abort the whole step (get_label() returns NULL by default in that case,
# which a *_chr map cannot store).
tables_list_l <- tables_list
names(tables_list_l) <- purrr::imap_chr(
  igene_lab,
  function(column, column_name) {
    label <- sjlabelled::get_label(column)
    if (length(label) == 1L && !is.na(label) && nzchar(label)) {
      label
    } else {
      column_name
    }
  }
)

Output

ppfad_igene_long: a dataset containing the persons from the gene-sample for each year they participated in the SOEP-IS (only those where saliva was actually taken) (key identifier: syear, pid)
soepis_igene_long
soepis_igene_age

Identify IGENE Sample of interest

Let us take a look at the labelled version of soepis_igene_raw.

We can already exclude people who did not sign their agreement, although a more robust analysis will follow later.

# Show igene06 from the labelled data via the project helper show_table().
# To look up the variable label instead, run:
# sjlabelled::get_label(soepis_igene_raw %>% select(igene06))
igene_lab %>% show_table("igene06")

The NAs stem from children, who have a different variable for consent.

# Show igene_kind_final from the labelled data via show_table().
# To look up the variable label instead, run:
# sjlabelled::get_label(soepis_igene_raw["igene_kind_final"])
igene_lab %>% show_table("igene_kind_final")

It turns out that for children, there are multiple variables describing the consent and information process that participants went through before the saliva sampling (German: Speichel entnehmen).

Therefore it seems more straightforward to filter based on the information whether saliva was or was not sampled in the end (German: “Speichel wurde entnommen”).

Before we filter, we will take a look at the frequencies for adults and children.

Adults:

# Show igene15 from the labelled data via show_table().
# To look up the variable label instead, run:
# sjlabelled::get_label(soepis_igene_raw["igene15"])
igene_lab %>% show_table("igene15")

# Show igene15_kind from the labelled data via show_table().
# To look up the variable label instead, run:
# sjlabelled::get_label(soepis_igene_raw["igene15_kind"])
igene_lab %>% show_table("igene15_kind")

There is also information on whether the sample (variable probe) was sent to Rotterdam:

# Show probe from the labelled data via show_table().
igene_lab %>% show_table("probe")

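Now we only keep those individuals whose saliva sample was sent to the lab (probe == "ans Labor geschickt"). The earlier variant, which filtered on whether saliva was successfully sampled (igene15 / igene15_kind), is kept below as commented-out code.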

# Earlier variant, kept for reference only: count and filter on the saliva
# items igene15 / igene15_kind instead of probe.
# igene_lab %>%
#   count(igene15_kind == "Speichel entnommen",
#         igene15 == "Speichel entnommen")
#
# igene_sample <- igene_lab %>%
#   filter(igene15_kind == "Speichel entnommen" |
#            igene15 == "Speichel entnommen") %>%
#   mutate(pid = as_numeric(pid))

# Active rule: keep the rows whose probe value is "ans Labor geschickt", then
# convert pid with as_numeric() (presumably sjlabelled::as_numeric; pid went
# through as_label() together with every other column of igene_lab).
igene_sample <- mutate(
  filter(igene_lab, probe == "ans Labor geschickt"),
  pid = as_numeric(pid)
)

# Write the filtered sample to data/igene_sample.rds.
export(igene_sample, file = here::here("data/igene_sample.rds"))

Nice to have: select the variables that have the same content for adults and for children and pivot_longer() them, adding a column named q_type with two values: “adult” and “child”.

## Export pids

# One-column data frame with the pid of every row in igene_sample; it is the
# right-hand table of the semi-joins further down.
pid_igene_sample <- select(igene_sample, pid)

# The file receives the bare pid vector rather than the data frame: pull()
# without a column argument extracts the last (here: only) column.
export(pull(pid_igene_sample), file = here::here("data/pid_igene_sample.rds"))

subset ppath

Now we want to get information about the 2019 participants: in which years have they participated previously? For this we can use the previously imported ppfad_long (referred to as ppath in the heading above).

# Person-year rows of the IGENE sample: rows of ppfad_long with netto > 0
# whose pid also occurs in igene_sample. The tidylog:: versions of the verbs
# are used so each step reports what it did.
# rescue_attributes() is a project helper that receives ppfad_long as a
# reference - presumably to restore attributes lost by filter/join; TODO
# confirm.
ppfad_igene_long <- tidylog::semi_join(
  tidylog::filter(ppfad_long, netto > 0),
  igene_sample,
  by = "pid"
) %>%
  rescue_attributes(ppfad_long)

# Write the result to data/ppfad_igene_long_v36.rds.
export(
  ppfad_igene_long,
  file = here::here("data/ppfad_igene_long_v36.rds")
)

export soepis_igene_long

Also create and export the subset of soepis_long that belongs to the IGENE sample (skipped if the output file already exists).

# Build the IGENE subset of soepis_long only when the output file is missing.
# If the file already exists nothing is read or written, and
# soepis_igene_long is not created in this session.
if (!file.exists(here::here("data/soepis_igene_long_v36.rds"))) {
  # Keep the rows of soepis_long whose pid is in pid_igene_sample.
  soepis_igene_long <- semi_join(
    import(here::here("data/soepis_long_v36.rds")),
    pid_igene_sample,
    by = "pid"
  )
  rio::export(
    soepis_igene_long,
    file = here::here("data/soepis_igene_long_v36.rds")
  )
}

export soepis_igene_age

Also create and export the subset of soepis_age that belongs to the IGENE sample (skipped if the output file already exists).

# Build the IGENE subset of soepis_age only when the output file is missing.
# If the file already exists nothing is read or written, and
# soepis_igene_age is not created in this session.
if (!file.exists(here::here("data/soepis_igene_age_v36.rds"))) {
  # Keep the rows of soepis_age whose pid is in pid_igene_sample.
  soepis_igene_age <- semi_join(
    import(here::here("data/soepis_age_v36.rds")),
    pid_igene_sample,
    by = "pid"
  )
  rio::export(
    soepis_igene_age,
    file = here::here("data/soepis_igene_age_v36.rds")
  )
}

Helpful Resources - hide and show individual code blocks: https://bookdown.org/yihui/rmarkdown-cookbook/fold-show.html