This markdown outlines the process in which multiple SOEP-IS datasets are used to construct a dataset which includes all variables of interest. (SOEP-IS documentation is available on the website and via paneldata.)
Procedure
Data Structure (ID’s)
soepisv36 will be in SOEP-long format and on the person level, meaning that unique individuals may have multiple rows in the dataset for multiple timepoints (survey years). The rows in the data will NOT be uniquely identifiable by person ID ‘pid’ alone, but by the combination of person ID pid and time syear. ppfad (Person-related Meta-dataset, which also carries the household ID hid) is the most extensive dataset, including most people who ever participated in the SOEP-IS. It is therefore used as the “base” dataset, onto which the other information will be merged. (ppfad documentation)
it includes more ID vars and metadata than the others
  - pid (unique person ID)
  - cid (unique case id (hh level))
  - hid$$ (current household ID)
it also includes information about participants
  - sex: gender
  - gebjahr: year of birth
  - gebmonat: month of birth
  - psample: sample member
  - netto$$: current wave status
  
information that might further be of interest (not included for now)
  - todjahr: year of death
  - corigin: country of origin
  - migback: migration background
  - sampreg$$: East/West Germany
Data Structure (ID’s)
the data is in wide format (each person has one row), years are indicated in the variable names.
unique key: pid
# Build (or reload from cache) the wide ppfad extract and report its row count.
if (file.exists(here::here("data/ppfad_wide_v36.rds"))) {
  # a cached extract exists, so reuse it instead of re-reading the .dta file
  ppfad_wide <- import(here::here("data", "ppfad_wide_v36.rds"))
} else {
  ppfad_raw <- rio::import(paste0(soepisv36_en, "ppfad.dta"))

  ppfad_wide <- select(
    ppfad_raw,
    # time invariant
    pid,
    cid,
    psample,
    sex,
    gebjahr,
    gebmonat,
    # time variant
    starts_with("hid"),
    contains("netto")
  )

  export(ppfad_wide, here::here("data/ppfad_wide_v36.rds"))
  message("ppfad_wide exported to:", here::here("data/ppfad_wide_v36.rds"))
}
summarize(ppfad_wide, "Number of persons" = n())
## # A tibble: 1 x 1
##   `Number of persons`
##                 <int>
## 1               17315
the biobirth (Birth Biography of Female and Male Respondents) data contains links of personal ID’s to other respondents (such as children)
  - kidpnr## : PID of first, second, ... child
 
might be of further interest
  - sumkids : total number of kids
  - biokids : number of births from biography
  - kidmon## : month of birth for all children
  - kidgeb## : year of birth for all children
  - kidsex## : gender of all children
  
  
Data Structure (ID’s)
the data is in wide format (each person one row). info on multiple children in numbers (##) is indicated in the variable names
again, we read data in, and select wanted variables (also report number of rows)
# Path of the cached biobirth extract (despite its name, this holds a path,
# not a logical flag).
biobirth_wide_exists <- here::here("data/biobirth_wide_v36.rds")
if (file.exists(biobirth_wide_exists)) {
  biobirth_wide <- import(here::here("data", "biobirth_wide_v36.rds"))
} else {
  biobirth_raw <- rio::import(paste0(soepisv36_en, "biobirth.dta"))

  # keep the person/case IDs plus every kidpnr## column
  biobirth_wide <- select(biobirth_raw, pid, cid, starts_with("kidpnr"))

  export(biobirth_wide, here::here("data/biobirth_wide_v36.rds"))
  message(
    "biobirth_wide_v36 exported to:",
    here::here("data/biobirth_wide_v36.rds")
  )
}
summarize(biobirth_wide, "Number of persons" = n())
## # A tibble: 1 x 1
##   `Number of persons`
##                 <int>
## 1               12196
the bioparen(Biography Information for Respondents‘ Parents) dataset contains links to other respondents (especially parents).
  - fpid: pid father
  - mpid: pid mother
  - numb: number of brothers
  - nums: number of sisters
  
might further be of relevance:
  - geschw: siblings yes/no
  - living#: Number of years living with different(#) persons
Data Structure (ID’s)
the data is in wide format (each person has one row), years are indicated in the variable names.
again, we read data in, and select wanted variables (also report number of rows)
# Build (or reload from cache) the bioparen extract with the parent pointers
# and sibling counts, then report its row count.
if (file.exists(here::here("data/bioparen_wide_v36.rds"))) {
  bioparen_wide <- import(here::here("data", "bioparen_wide_v36.rds"))
} else {
  bioparen_raw <- rio::import(paste0(soepisv36_en, "bioparen.dta"))

  bioparen_wide <- select(bioparen_raw, pid, cid, fpid, mpid, nums, numb)

  export(bioparen_wide, here::here("data/bioparen_wide_v36.rds"))
  message(
    "bioparen_wide_v36 exported to:",
    here::here("data/bioparen_wide_v36.rds")
  )
}
summarize(bioparen_wide, "Number of persons" = n())
## # A tibble: 1 x 1
##   `Number of persons`
##                 <int>
## 1               11538
unique keys: syear, pid
# Reshape the wide ppfad extract into long format (one row per pid and syear)
# and additionally store a cleaned copy in which missing codes are set to NA.
# The cache is only used when BOTH files read in the else-branch are present.
if (!(file.exists(here::here("data", "ppfad_long_v36.rds")) &&
      file.exists(here::here("data", "ppfad_long_cl_v36.rds")))) {

  time_invariant_vars <- c("pid", "cid", "psample", "sex", "gebjahr", "gebmonat")
  # a single netto column is kept aside so that its attributes can be
  # restored on the reshaped `netto` column further down
  ppfad_labels <- ppfad_wide %>% select(netto = netto00)
  ppfad_long <- ppfad_wide %>%
    pivot_longer(cols = -all_of(time_invariant_vars),
                 names_to = "var_syear",
                 values_to = "value") %>%
    # the last two characters of each column name are the two-digit year
    separate(var_syear, into = c("var", "syear"), sep = -2) %>%
    # compare the suffix as a number instead of relying on implicit
    # character coercion: suffixes below 40 belong to 20xx, the rest to 19xx
    mutate(syear = ifelse(as.integer(syear) < 40L,
                          paste0("20", syear),
                          paste0("19", syear))) %>%
    pivot_wider(names_from = "var",
                values_from = "value") %>%
    labelled::set_variable_labels(syear = "Survey Year",
                                  netto = "Survey Status",
                                  hid = "Household Number") %>%
    codebook::rescue_attributes(ppfad_labels) %>%
    mutate(syear = as.numeric(syear),
           netto_l = sjlabelled::as_label(.$netto),
           sex_l = sjlabelled::as_label(.$sex))

  export(ppfad_long, here::here("data", "ppfad_long_v36.rds"))
  message(paste0("ppfad_long_v36 exported to:",
                 here::here("data/ppfad_long_v36.rds")))
  ppfad_long_cl <- ppfad_long %>%
    # recode negative values to NA
    codebook::detect_missing(learn_from_labels = TRUE,
                             negative_values_are_missing = TRUE,
                             ninety_nine_problems = FALSE,
                             only_labelled = FALSE)
  export(ppfad_long_cl, here::here("data", "ppfad_long_cl_v36.rds"))
  message(paste0("ppfad_long_cl_v36 exported to:",
                 here::here("data/ppfad_long_cl_v36.rds")))
} else {
  ppfad_long <- import(here::here("data", "ppfad_long_v36.rds"))
  ppfad_long_cl <- import(here::here("data", "ppfad_long_cl_v36.rds"))
}
ppfad_long %>% group_by(syear) %>% summarize("number of persons" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 20 x 2
##    syear `number of persons`
##    <dbl>               <int>
##  1  1998               17012
##  2  1999               17012
##  3  2000               17012
##  4  2001               17012
##  5  2002               17012
##  6  2003               17012
##  7  2004               17012
##  8  2005               17012
##  9  2006               17012
## 10  2007               17012
## 11  2008               17012
## 12  2009               17012
## 13  2010               17012
## 14  2011               17012
## 15  2012               17012
## 16  2013               17012
## 17  2014               17012
## 18  2015               17012
## 19  2016               17012
## 20  2017               17012
pbrutto (Person-related Gross File) contains information on:
  - stell : relationship to head of household
  - stell1: relationship to head of hh (until 2011)
  - salivaerg: results from saliva sample (consent)
  
might further be of interest
  - ewstatu: employment status for people who did not participate (via partner or other person in hh)
  - pzug  : Membership to household (whether moved away or currently lives there)
Data Structure (ID’s)
the data is in long format (each person has multiple rows for each year they participated).
Unique keys: pid, syear
now we will read the data into R and select the variables we wish to use. We will also check the number of years available for the participants
# Build (or reload from cache) the pbrutto extract and a cleaned copy of it.
# The cache check uses the same file names that are written below (the
# original tested "pbrutto_long_36.rds", which is never created, so the data
# was rebuilt on every run), and it requires both cached files to exist.
if (!(file.exists(here::here("data", "pbrutto_long_v36.rds")) &&
      file.exists(here::here("data", "pbrutto_long_cl_v36.rds")))) {
  pbrutto_raw <- rio::import(paste0(soepisv36_en, "pbrutto.dta")) %>%
    select(
      pid,
      cid,
      hid,
      syear,
      stell,
      stell1,
      salivaerg
    )
  # add the label text of stell as an extra column
  pbrutto_long <- pbrutto_raw %>%
    mutate(stell_l = sjlabelled::as_label(.$stell))

  pbrutto_long_cl <- pbrutto_long %>%
    # recode negative values to NA
    codebook::detect_missing(learn_from_labels = TRUE,
                             negative_values_are_missing = TRUE,
                             ninety_nine_problems = FALSE,
                             only_labelled = FALSE)

  # write to the project-rooted paths that the else-branch reads from and
  # that the message reports, rather than to the current working directory
  export(pbrutto_long, here::here("data", "pbrutto_long_v36.rds"))
  export(pbrutto_long_cl, here::here("data", "pbrutto_long_cl_v36.rds"))
  message(paste0("pbrutto_long_v36 & pbrutto_long_cl_v36 exported to:",
                 here::here("data/pbrutto_long_v36.rds")))
} else {
  pbrutto_long <- import(here::here("data", "pbrutto_long_v36.rds"))
  pbrutto_long_cl <- import(here::here("data", "pbrutto_long_cl_v36.rds"))
}
## Registered S3 method overwritten by 'codebook':
##   method          from 
##   print.knit_asis formr
## pbrutto_long_v36 & pbrutto_long_cl_v36 exported to:/home/reiber/Users/ReiberLisa/git/GeneAnalysis/data/pbrutto_long_v36.rds
# number of pbrutto rows in each survey year
summarize(group_by(pbrutto_long, syear), "Number of persons per year" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 22 x 2
##    syear `Number of persons per year`
##    <dbl>                        <int>
##  1  1998                          936
##  2  1999                          961
##  3  2000                          971
##  4  2001                          981
##  5  2002                          984
##  6  2003                         1004
##  7  2004                         1007
##  8  2005                         1011
##  9  2006                         1027
## 10  2007                         1026
## # … with 12 more rows
# pbrutto_long %>% tabyl(stell_l)
the inno (Variables from the Innovation Modules) dataset contains information about experiments and innovative questions
  - iabm1-6 : ADHD
  - iabaut1-10 : Autism
  
furthermore could be of interest:
  - im_seiat : self-esteem
  - self#_# : Self assessments (warmth, competent, helpful etc)
  
Data Structure (ID’s)
the data is in long format (each person has multiple rows for each year they participated).
Unique keys: pid, syear
again, we read data in, select wanted variables and report number of rows (observations) per year
# Build (or reload from cache) the inno extract restricted to the ID columns
# and the iabm* / iabaut* items, then count rows per survey year.
if (file.exists(here::here("data/inno_long_v36.rds"))) {
  inno_long <- import(here::here("data", "inno_long_v36.rds"))
} else {
  inno_raw <- rio::import(paste0(soepisv36_en, "inno.dta"))
  inno_long <- select(
    inno_raw,
    pid, cid, hid, syear,
    starts_with("iabm"),
    starts_with("iabaut")
  )

  export(inno_long, here::here("data", "inno_long_v36.rds"))
}
summarize(group_by(inno_long, syear), "Number of persons" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 8 x 2
##   syear `Number of persons`
##   <dbl>               <int>
## 1  2011                1701
## 2  2012                3696
## 3  2013                5141
## 4  2014                5868
## 5  2015                5897
## 6  2016                6358
## 7  2017                5464
## 8  2018                4860
How many times do people appear in the data over time?
# count the rows per person, then tabulate how often each count occurs
inno_long %>%
  group_by(pid) %>%
  summarise("Number of years" = n()) %>%
  tabyl("Number of years") %>%
  adorn_pct_formatting()
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 8 x 3
##   `Number of years`     n percent
##               <int> <dbl> <chr>  
## 1                 1  2964 28.4%  
## 2                 2  1139 10.9%  
## 3                 3  1511 14.5%  
## 4                 4   622 6.0%   
## 5                 5  1167 11.2%  
## 6                 6  1162 11.1%  
## 7                 7  1133 10.8%  
## 8                 8   748 7.2%
Most people appear once, some appear in all years (8 times from 2011 to 2018)
the cognit (Data on cognitive potential) dataset contains information on cognitive tests
  - f025r : sum of correct words
  - f099s30 : Sum of all numerical entries 30s
  - f099s60 : Sum of all numerical entries 60s
  - f099s90 : Sum of all numerical entries 90s
Data Structure (ID’s)
the data is in long format (each person has multiple rows for each year they participated).
Unique keys: pid, syear. Note: data is so far only from 2014 and 2018
again, we read data in, select wanted variables and report number of rows (observations)
# Build (or reload from cache) the cognit extract, renaming the test scores to
# cogtest / cogtest_01..03, then count rows per survey year.
if (file.exists(here::here("data/cognit_long_v36.rds"))) {
  cognit_long <- import(here::here("data", "cognit_long_v36.rds"))
} else {
  cognit_raw <- rio::import(paste0(soepisv36_en, "cognit.dta"))
  cognit_long <- select(
    cognit_raw,
    pid, cid, hid, syear,
    cogtest = f025r,
    cogtest_01 = f099s30,
    cogtest_02 = f099s60,
    cogtest_03 = f099s90
  )

  export(cognit_long, here::here("data", "cognit_long_v36.rds"))
  message("cognit_long exported to:", here::here("data/cognit_long_v36.rds"))
}
summarize(group_by(cognit_long, syear), "Number of persons" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
##   syear `Number of persons`
##   <dbl>               <int>
## 1  2014                4498
## 2  2018                 872
the pgen (Person-related Status and Generated Variables) dataset contains generated information on participants at the person level
  - pgbilzt : years of education
  - pgpartnr : pid of partner
  - pgpartz : partner indicator 
  
further might be of interest:
  - pgfamstd : family status
  - pglfs : labor force status
  - pgemplst : employment status
  
Data Structure (ID’s)
the data is in long format (each person has multiple rows for each year they participated). Unique identifier keys: pid, syear
again, we read data in, select wanted variables and report number of rows (observations)
# Build (or reload from cache) the pgen extract (IDs, years of education and
# the partner pointer/indicator), then count rows per survey year.
if (file.exists(here::here("data/pgen_long_v36.rds"))) {
  pgen_long <- import(here::here("data", "pgen_long_v36.rds"))
} else {
  pgen_raw <- rio::import(paste0(soepisv36_en, "pgen.dta"))
  pgen_long <- select(
    pgen_raw,
    pid, cid, hid, syear,
    pgbilzt, pgpartnr, pgpartz
  )
  export(pgen_long, here::here("data", "pgen_long_v36.rds"))
  message("pgen_long_v36 exported to:", here::here("data/pgen_long_v36.rds"))
}
summarize(group_by(pgen_long, syear), "Number of persons" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 22 x 2
##    syear `Number of persons`
##    <dbl>               <int>
##  1  1998                 724
##  2  1999                 750
##  3  2000                 755
##  4  2001                 766
##  5  2002                 780
##  6  2003                 795
##  7  2004                 792
##  8  2005                 799
##  9  2006                 797
## 10  2007                 797
## # … with 12 more rows
the p (Variables from the Individual Question Module) dataset contains information from the individual person questionnaire on the person level, which all adults get (no children)
Doku: Psychological Scales Manual SOEP: https://www.diw.de/sixcms/detail.php?id=diw_01.c.554370.de
  - plh0203 : personal willingness to take risks after winning lottery
  - plh0205 : personal willingness to take risks (wrong)
  
  - Tendency to Forgive (plh0142, plh0145)
  - Selbstwert (plh0146)
  - Optimismus (plh0188)
  - Vertrauen (plh0192-plh0194)
  - Reziprozität (plh0206 - plh0211)
  - Big 5 (plh0212 - plh0255)
        Adults:
              - Openness: plh0212, plh0215, plh0220, plh0225
              - Conscientiousness: plh0218, plh0222
              - Extraversion: plh0213, plh0219, plh0223
              - Agreeableness: plh0214, plh0217, plh0224
              - Neuroticism: plh0216, plh0221, plh0226
  - Kontrollüberzeugung (plh0245 - plh0252) also (plh0235 - plh0242)
  - Geduld/Impulsivität (plh0253/plh0254)
further might be of interest
  - pld00## : nature of relationship to  different people (mother, father, spouse)
  - pld0015 : grandparents present
  - pld0016 : number of grandparents
  - plh0134/136 : gift of 10.000 €, what to do with it
  - pld0020-117 : current spouse in other hh present - and then other persons
  - health related:
        - ple0008 : current health
        - ple0019 : depressive psychosis
        - ple0020 : dementia
        - ple0021 : joint disorder
        - ple0022 : chronic back complaints
        ...
        - ple0024 : no illness
        - ple0086/88 : number of cigarettes/cigars per day
  - Satisfaction
        - plh0182 : Current Life Satisfaction
Data Structure (ID’s)
the data is in long format (each person has multiple rows for each year they participated).
Unique keys: pid, syear
again, we read data in, select wanted variables and report number of rows (observations)
# Build (or reload from cache) the extract of the individual questionnaire:
# the plh0* items are read first and then renamed to descriptive names.
# The column order of the final select() is kept exactly as before.
if (file.exists(here::here("data/p_long_v36.rds"))) {
  p_long <- import(here::here("data", "p_long_v36.rds"))
} else {
  # reduce the raw file to the ID columns and all plh0* items
  p_raw <- select(
    rio::import(paste0(soepisv36_en, "p.dta")),
    pid, cid, hid, syear,
    starts_with("plh0")
  )
  p_long <- select(
    p_raw,
    # ID's
    pid, cid, hid, syear,
    # Personality
    forgive_01 = plh0142,
    forgive_02 = plh0145,
    selfworth = plh0146,
    optimism = plh0188,
    trust_1 = plh0192,
    trust_2 = plh0194,
    recip_1 = plh0206,
    recip_2 = plh0207,
    recip_3 = plh0208,
    recip_4 = plh0209,
    recip_5 = plh0210,
    recip_6 = plh0211,
    # Big 5: openness
    b5_open_1 = plh0212,
    b5_open_2 = plh0215,
    b5_open_3 = plh0220,
    b5_open_4 = plh0225,
    # Big 5: conscientiousness
    b5_consc_1 = plh0218,
    b5_consc_2 = plh0222,
    # Big 5: extraversion
    b5_extra_1 = plh0213,
    b5_extra_2 = plh0219,
    b5_extra_3 = plh0223,
    # Big 5: agreeableness
    b5_agree_1 = plh0214,
    b5_agree_2 = plh0217,
    b5_agree_3 = plh0224,
    # Big 5: neuroticism
    b5_neuro_1 = plh0216,
    b5_neuro_2 = plh0221,
    b5_neuro_3 = plh0226,
    # to do locus of control
    patience_1 = plh0253,
    patience_2 = plh0254,
    # risk
    risk_lottery = plh0203,
    risk_driving = plh0197,
    risk_finance = plh0198,
    risk_leisure = plh0199,
    risk_occupation = plh0200,
    risk_health = plh0201,
    risk_trust = plh0202,
    risk_general = plh0204
  )
  export(p_long, here::here("data", "p_long_v36.rds"))
  message("p_long_v36 exported to:", here::here("data/p_long_v36.rds"))
}
summarize(group_by(p_long, syear), "Number of persons" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 22 x 2
##    syear `Number of persons`
##    <dbl>               <int>
##  1  1998                 724
##  2  1999                 750
##  3  2000                 755
##  4  2001                 766
##  5  2002                 780
##  6  2003                 795
##  7  2004                 792
##  8  2005                 799
##  9  2006                 797
## 10  2007                 797
## # … with 12 more rows
the bioage (Variables from the Modules of Questions on Children) dataset contains information about the relationship to the child (biological or social)
again, we read data in, and select wanted variables
# Build (or reload from cache) the bioage extract. The ID columns are renamed
# with a child_ prefix; biochild and pidresp are kept as they are.
if (file.exists(here::here("data/bioage_long_v36.rds"))) {
  bioage_long <- import(here::here("data", "bioage_long_v36.rds"))
} else {
  bioage_raw <- rio::import(paste0(soepisv36_en, "bioage.dta"))
  bioage_long <- select(
    bioage_raw,
    child_id = pid,
    child_cid = cid,
    child_hid = hid,
    syear,
    biochild,
    pidresp
  )

  export(bioage_long, here::here("data", "bioage_long_v36.rds"))
  message("bioage_long_v36 exported to:", here::here("data/bioage_long_v36.rds"))
}
summarize(group_by(bioage_long, syear), "Number of persons per year" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 17 x 2
##    syear `Number of persons per year`
##    <dbl>                        <int>
##  1  2003                            8
##  2  2004                           11
##  3  2005                           14
##  4  2006                           17
##  5  2007                           12
##  6  2008                           18
##  7  2009                           23
##  8  2010                          156
##  9  2011                          421
## 10  2012                          803
## 11  2013                          709
## 12  2014                         1847
## 13  2015                         1898
## 14  2016                         1798
## 15  2017                         1910
## 16  2018                         1493
## 17  2019                         1482
the kid (Pooled Dataset on Children) dataset also contains links to other respondents. Note that in this case the information is about children, not adults.
  - k_phead : pointer to head of hh
  - k_pheadp: pointer to partner of head of hh
  - k_pmum: pointer to mother
  - k_pmump: pointer to partner of mother
  - k_rel: relationship to head of hh
  
might be of interest
  - k_size: household size
  - k_inhh: member of hh
  - k_nrkid: number of children in hh
Data Structure (ID’s)
the data is in long format (each person (in this case child) has multiple rows for each year they participated).
Unique keys: pid, syear
again, we read data in, and select wanted variables (also report number of rows)
# Build (or reload from cache) the kid extract. The ID columns are renamed
# with a child_ prefix; the pointer columns and k_rel keep their names.
# Note: a kid_long_v36.rds cached by an earlier run does not contain k_phead;
# delete the cached file to rebuild it with that column.
if (!file.exists(here::here("data/kid_long_v36.rds"))) {

  kid_raw <- rio::import(paste0(soepisv36_en, "kid.dta"))
  kid_long <- kid_raw %>%
    select(child_id = pid,
           child_cid = cid,
           child_hid = hid,
           syear,
           # pointer to the head of household; the original listed k_pheadp
           # twice here, so this column was silently left out
           k_phead,
           k_pheadp,
           k_pmum,
           k_pmump,
           k_rel)

  export(kid_long, here::here("data", "kid_long_v36.rds"))
  message(paste0("kid_long_v36 exported to:", here::here("data/kid_long_v36.rds")))
} else {
  kid_long <- import(here::here("data", "kid_long_v36.rds"))
}
kid_long %>% group_by(syear) %>% summarize("Number of children" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 22 x 2
##    syear `Number of children`
##    <dbl>                <int>
##  1  1998                  187
##  2  1999                  185
##  3  2000                  178
##  4  2001                  180
##  5  2002                  168
##  6  2003                  162
##  7  2004                  169
##  8  2005                  159
##  9  2006                  148
## 10  2007                  144
## # … with 12 more rows
# Merge the wide datasets: ppfad is the base, bioparen and then biobirth are
# attached by person and case ID.
soepis_wide <- left_join(
  left_join(ppfad_wide, bioparen_wide, by = c("pid", "cid")), # add bioparen data
  biobirth_wide,                                              # add biobirth data
  by = c("pid", "cid")
)
now we will re-format the wide data into the somewhat long format and make sure that all persons have the same amount of rows (for each possible year). This means that Variables which were given for each year (such as hid01, hid02 etc) will be reformatted into one variable (hid) and multiple rows for each year (01,02, etc).
# Turn the year-suffixed hid## / netto## columns into one hid and one netto
# column with one row per person and survey year.
soepis_long_partial1 <- soepis_wide %>%
  pivot_longer(cols = matches("^hid|^netto"),
               names_to = "key_syear",
               values_to = "value") %>%
  # separate var and syear column (the last two characters are the year)
  separate(col = key_syear, into = c("key", "syear"), sep = -2) %>%
  # turn the two-digit suffix into a four-digit numeric year. The suffix is
  # converted to a number first, so the comparison no longer depends on
  # implicit character coercion, and suffixes between 21 and 89 no longer
  # become NA: suffixes below 40 map to 20xx, all others to 19xx.
  mutate(syear = as.integer(syear),
         syear = ifelse(syear < 40L, 2000 + syear, 1900 + syear)) %>%
  pivot_wider(values_from = value,
              names_from = key)
soepis_long_partial1 %>% group_by(syear) %>% summarize("Number of persons per year" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 20 x 2
##    syear `Number of persons per year`
##    <dbl>                        <int>
##  1  1998                        17012
##  2  1999                        17012
##  3  2000                        17012
##  4  2001                        17012
##  5  2002                        17012
##  6  2003                        17012
##  7  2004                        17012
##  8  2005                        17012
##  9  2006                        17012
## 10  2007                        17012
## 11  2008                        17012
## 12  2009                        17012
## 13  2010                        17012
## 14  2011                        17012
## 15  2012                        17012
## 16  2013                        17012
## 17  2014                        17012
## 18  2015                        17012
## 19  2016                        17012
## 20  2017                        17012
# Attach the adult long datasets to the person-year table one after another
# (pbrutto, inno, cognit, pgen, p), always on the same four key columns.
soepisv36_adults <- Reduce(
  function(joined, next_tbl) {
    left_join(joined, next_tbl, by = c("pid", "hid", "cid", "syear"))
  },
  list(pbrutto_long, inno_long, cognit_long, pgen_long, p_long),
  soepis_long_partial1
)
rio::export(soepisv36_adults, here::here("data/soepisv36_adults.rds"))
# Attach the child datasets to the person-year table. Their ID columns carry
# a child_ prefix, so each key is mapped explicitly.
soepisv36_children <- left_join(
  # add bioage data
  left_join(
    soepis_long_partial1,
    bioage_long,
    by = c("pid" = "child_id", "hid" = "child_hid", "cid" = "child_cid", "syear")
  ),
  # add kid data
  kid_long,
  by = c("pid" = "child_id", "hid" = "child_hid", "cid" = "child_cid", "syear")
)
rio::export(soepisv36_children, here::here("data/soepisv36_children.rds"))
# Combine the adult and the child table into the final dataset.
# Both tables are left joins onto soepis_long_partial1, so each of them
# already holds every pid/syear row of that table. Stacking them with
# bind_rows() therefore repeated every person-year; joining on the columns the
# two tables have in common keeps one row per person-year and places the
# child columns next to the adult columns.
soepisv36 <- soepisv36_adults %>%
  full_join(soepisv36_children,
            by = intersect(names(soepisv36_adults),
                           names(soepisv36_children))) %>%
  mutate(syear = as.integer(syear)) %>%
  rescue_attributes(soepisv36_adults) %>%
  rescue_attributes(soepisv36_children) %>%
  set_variable_labels(syear = "Survey Year",
                      netto = "Survey Status",
                      hid = "Household Number")
rio::export(soepisv36, here::here("data/soepisv36.rds"))
# On the first run only: read the raw IGENE file and cache both the data and
# the vector of distinct pids it contains.
if (!file.exists(here::here("data/soepis_igene_raw.rds"))) {
  soepis_igene_raw <- rio::import(
    here::here("../../data/00_raw/SOEP/SOEP_v36/Gene/Inno19F_IGENE.dta")
  )
  # soepis_igene_raw2 <- rio::import(here::here("../../data/00_raw/SOEP/SOEP_v36/trios.dta"))
  pid_igene_raw <- pull(distinct(soepis_igene_raw, pid))

  rio::export(soepis_igene_raw, here::here("data/soepis_igene_raw.rds"))
  rio::export(pid_igene_raw, here::here("data/pid_igene_raw.rds"))
}
# old approach
# key_label <- map(soepisv34, attr, "label") %>% unlist # Gives you list of the labels
# key_labels <- as.data.frame(key_label) %>% add_rownames( var = "key_name")
# Collect the variable labels of soepisv36 as a two-column table
# (key_name, key_label); pivot_longer() replaces the superseded gather().
key_labels <- labelled::var_label(soepisv36) %>%
  as_tibble(rownames = c("key_name", "key_label")) %>%
  pivot_longer(cols = everything(),
               names_to = "key_name",
               values_to = "key_label")

# Variable groups used to categorise the keys below.
# "k_phead" replaces a second, duplicated "k_pheadp" entry.
id <- c("syear", "pid", "cid", "hid", "kidpnr",
        "fpid", "mpid", "pgpartnr", "pgpartz",
        "k_phead", "k_pheadp", "k_pmum", "k_pmump")
survey <- c("psample", "netto", "stell", "stell1", "salivaerg")
demogr <- c("sex", "gebjahr", "gebmonat", "pgbilzt")
# not referenced by the case_when() below: every key that matches none of the
# other groups falls through to "Psych. Measure"
psych <- c("iabm", "iabaut", "cogtest", "forgive", "selfworth",
           "optimism", "trust", "recip", "b5", "patience", "risk")
other <- c("nums", "numb", "biochild", "k_rel")

key_features <- key_labels %>%
  mutate(key_category = case_when(key_name %in% id ~ "ID's",
                                  # kidpnr01, kidpnr02, ... carry a numeric suffix
                                  str_detect(key_name, "kidpnr") ~ "ID's",
                                  key_name %in% survey ~ "Survey",
                                  key_name %in% demogr ~ "Demography",
                                  key_name %in% other ~ "Other",
                                  TRUE ~ "Psych. Measure"
                                  ),
         key_category = ordered(key_category,
                                levels = c("ID's", "Survey", "Demography",
                                           "Psych. Measure", "Other"))
  )
export(key_labels, here::here("data", "key_labels.rds"))
now we create an additional dataset which is long in a true sense, since all variables are represented in the key column and all corresponding values in a value column.
the data is uniquely identifiable by: syear, pid, key
# Build the fully long dataset (one row per syear, pid and key) unless a
# cached copy already exists on disk.
if (!file.exists(here::here("data/soepis_long_v36.rds"))) {
  soepis_long <- soepisv36 %>%
    select(-stell_l) %>%
    # recode negative values to NA
    # (TRUE/FALSE spelled out: T and F are ordinary variables that can be
    # reassigned)
    codebook::detect_missing(learn_from_labels = TRUE,
                             negative_values_are_missing = TRUE,
                             ninety_nine_problems = FALSE,
                             only_labelled = FALSE) %>%
    tidyr::pivot_longer(cols = c(-syear, -pid, -hid, -cid),
                        names_to = "key",
                        values_to = "value")

  # add labels from key_features generated above
  soepis_long <- soepis_long %>%
    left_join(key_features, by = c("key" = "key_name")) %>%
    mutate(key_name_label = paste0(key, ": ", key_label))

  rio::export(soepis_long, here::here("data/soepis_long_v36.rds"))

}
# https://stackoverflow.com/questions/39417003/long-vectors-not-supported-yet-error-in-rmd-but-not-in-r-script
to do some plotting by age, we need another kind of long format
#key_labels is defined above
# Build the age-based long dataset unless a cached copy already exists:
# age is syear minus gebjahr, age_k bins it in steps of 15, and every column
# except age, age_k, pid and cid is stacked into key/value pairs.
if (!file.exists(here::here("data/soepis_age_v36.rds"))) {
  soepis_age <- soepisv36 %>%
    select(-stell_l) %>%
    # recode negative values to NA
    # (TRUE/FALSE spelled out: T and F are ordinary variables that can be
    # reassigned)
    codebook::detect_missing(learn_from_labels = TRUE,
                             negative_values_are_missing = TRUE,
                             ninety_nine_problems = FALSE,
                             only_labelled = FALSE) %>%
    mutate(age = syear - gebjahr,
           age_k = cut_width(age, 15, boundary = 0)) %>%
    pivot_longer(cols = c(-age, -age_k, -pid, -cid),
                 names_to = "key",
                 values_to = "value")

  # add labels from key_features generated above
  soepis_age <- soepis_age %>%
    left_join(key_features, by = c("key" = "key_name")) %>%
    mutate(key_name_label = paste0(key, ": ", key_label))

  rio::export(soepis_age, here::here("data/soepis_age_v36.rds"))
}