This markdown outlines the process in which multiple SOEP-IS datasets are used to construct a dataset which includes all variables of interest. (SOEP-IS documentation is available on the website and via paneldata.)
Procedure
Data Structure (ID’s)
soepisv34
will be in SOEP-long format and on the person level, meaning that unique individuals may have multiple rows in the dataset for multiple timepoints (survey years). The rows in the data will NOT be uniquely identifiable by Person ID ‘pid’ alone, but in the combination of person ID pid
and time syear
.hid
.ppfad (Person-related Meta-dataset) is the most extensive dataset, including most people who ever participated in the SOEP-IS. It is therefore used as the “base” dataset, onto which the other information will be merged. (ppfad documentation)
it includes more ID vars and metadata than the others
- pid (unique person ID)
- cid (unique case id (hh level))
- hid$$ (current household ID)
it also includes information about participants
- sex: gender
- gebjahr: year of birth
- gebmonat: month of birth
- psample: sample member
- netto$$: current wave status
information that might further be of interest (not included for now)
- todjahr: year of death
- corigin: country of origin
- migback: migration background
- sampreg$$: East/West Germany
Data Structure (ID’s)
the data is in wide format (each person has one row), years are indicated in the variable names.
unique key: pid
# Load the cached ppfad subset when it exists; otherwise build it from the raw
# Stata file, cache it as .rds and report where it was written.
if (file.exists(here::here("data/ppfad_wide_v36.rds"))) {
  ppfad_wide <- import(here::here("data", "ppfad_wide_v36.rds"))
} else {
  ppfad_raw <- rio::import(paste0(soepisv36_en, "ppfad.dta"))
  ppfad_wide <- select(
    ppfad_raw,
    # time-invariant identifiers and person characteristics
    pid, cid, psample, sex, gebjahr, gebmonat,
    # time-variant columns (one column per survey year)
    starts_with("hid"),
    contains("netto")
  )
  export(ppfad_wide, here::here("data/ppfad_wide_v36.rds"))
  message("ppfad_wide exported to:", here::here("data/ppfad_wide_v36.rds"))
}
# Row count of the wide table.
summarize(ppfad_wide, "Number of persons" = n())
## # A tibble: 1 x 1
## `Number of persons`
## <int>
## 1 17315
the biobirth (Birth Biography of Female and Male Respondents) data contains links of personal ID’s to other respondents (such as children)
- kidpnr## : PID of first, second, ... child
might be of further interest
- sumkids : total number of kids
- biokids : number of births from biography
- kidmon## : month of birth for all children
- kidgeb## : year of birth for all children
- kidsex## : gender of all children
Data Structure (ID’s)
the data is in wide format (each person one row). info on multiple children in numbers (##) is indicated in the variable names
again, we read data in, and select wanted variables (also report number of rows)
# Path of the cached biobirth subset (the variable holds a path, not a logical).
biobirth_wide_exists <- here::here("data/biobirth_wide_v36.rds")
# Reuse the cache when present; otherwise read the raw file, keep the IDs and
# all kidpnr* columns, and write the cache.
if (file.exists(biobirth_wide_exists)) {
  biobirth_wide <- import(here::here("data", "biobirth_wide_v36.rds"))
} else {
  biobirth_raw <- rio::import(paste0(soepisv36_en, "biobirth.dta"))
  biobirth_wide <- select(biobirth_raw, pid, cid, starts_with("kidpnr"))
  export(biobirth_wide, here::here("data/biobirth_wide_v36.rds"))
  message(
    "biobirth_wide_v36 exported to:",
    here::here("data/biobirth_wide_v36.rds")
  )
}
# Row count of the wide table.
summarize(biobirth_wide, "Number of persons" = n())
## # A tibble: 1 x 1
## `Number of persons`
## <int>
## 1 12196
the bioparen (Biography Information for Respondents’ Parents) dataset contains links to other respondents (especially parents).
- fpid: pid father
- mpid: pid mother
- numb: number of brothers
- nums: number of sisters
might further be of relevance:
- geschw: siblings yes/no
- living#: Number of years living with different(#) persons
Data Structure (ID’s)
the data is in wide format (each person has one row), years are indicated in the variable names.
again, we read data in, and select wanted variables (also report number of rows)
# Load the cached bioparen subset when it exists; otherwise build it from the
# raw Stata file (IDs, parent pointers, sibling counts) and cache it.
if (file.exists(here::here("data/bioparen_wide_v36.rds"))) {
  bioparen_wide <- import(here::here("data", "bioparen_wide_v36.rds"))
} else {
  bioparen_raw <- rio::import(paste0(soepisv36_en, "bioparen.dta"))
  bioparen_wide <- select(bioparen_raw, pid, cid, fpid, mpid, nums, numb)
  export(bioparen_wide, here::here("data/bioparen_wide_v36.rds"))
  message(
    "bioparen_wide_v36 exported to:",
    here::here("data/bioparen_wide_v36.rds")
  )
}
# Row count of the wide table.
summarize(bioparen_wide, "Number of persons" = n())
## # A tibble: 1 x 1
## `Number of persons`
## <int>
## 1 11538
unique keys: syear, pid
# Reshape ppfad from wide (hid##/netto## columns) to long (one row per
# pid x syear) and derive a cleaned copy in which missing codes are NA.
# Both results are cached as .rds. The cache is only used when BOTH files
# exist, because the else-branch imports both of them.
if (!(file.exists(here::here("data", "ppfad_long_v36.rds")) &&
      file.exists(here::here("data", "ppfad_long_cl_v36.rds")))) {
  time_invariant_vars <- c("pid", "cid", "psample", "sex", "gebjahr", "gebmonat")
  # One-column frame passed to rescue_attributes() below as the attribute
  # donor for the reshaped `netto` column.
  ppfad_labels <- ppfad_wide %>% select(netto = netto00)
  ppfad_long <- ppfad_wide %>%
    pivot_longer(cols = -all_of(time_invariant_vars),
                 names_to = "var_syear",
                 values_to = "value") %>%
    # split the column name into variable stem and its last two characters
    separate(var_syear, into = c("var", "syear"), sep = -2) %>%
    # expand the two-digit suffix to a four-digit year; the suffix is a
    # character string, so convert it explicitly before comparing
    mutate(syear = ifelse(as.integer(syear) < 40L,
                          paste0("20", syear),
                          paste0("19", syear))) %>%
    pivot_wider(names_from = "var",
                values_from = "value") %>%
    labelled::set_variable_labels(syear = "Survey Year",
                                  netto = "Survey Status",
                                  hid = "Household Number") %>%
    codebook::rescue_attributes(ppfad_labels) %>%
    mutate(syear = as.numeric(syear),
           netto_l = sjlabelled::as_label(.$netto),
           sex_l = sjlabelled::as_label(.$sex))
  export(ppfad_long, here::here("data", "ppfad_long_v36.rds"))
  message(paste0("ppfad_long_v36 exported to:",
                 here::here("data/ppfad_long_v36.rds")))
  ppfad_long_cl <- ppfad_long %>%
    # recode negative values to NA
    codebook::detect_missing(learn_from_labels = TRUE,
                             negative_values_are_missing = TRUE,
                             ninety_nine_problems = FALSE,
                             only_labelled = FALSE)
  export(ppfad_long_cl, here::here("data", "ppfad_long_cl_v36.rds"))
  message(paste0("ppfad_long_cl_v36 exported to:",
                 here::here("data/ppfad_long_cl_v36.rds")))
} else {
  ppfad_long <- import(here::here("data", "ppfad_long_v36.rds"))
  ppfad_long_cl <- import(here::here("data", "ppfad_long_cl_v36.rds"))
}
# Rows per survey year in the long table.
ppfad_long %>% group_by(syear) %>% summarize("number of persons" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 20 x 2
## syear `number of persons`
## <dbl> <int>
## 1 1998 17012
## 2 1999 17012
## 3 2000 17012
## 4 2001 17012
## 5 2002 17012
## 6 2003 17012
## 7 2004 17012
## 8 2005 17012
## 9 2006 17012
## 10 2007 17012
## 11 2008 17012
## 12 2009 17012
## 13 2010 17012
## 14 2011 17012
## 15 2012 17012
## 16 2013 17012
## 17 2014 17012
## 18 2015 17012
## 19 2016 17012
## 20 2017 17012
pbrutto (Person-related Gross File) contains information on:
- stell : relationship to head of household
- stell1: relationship to head of hh (until 2011)
- salivaerg: results from saliva sample (consent)
might further be of interest
- ewstatu: employment status for people who did not participate (via partner or other person in hh)
- pzug : Membership to household (whether moved away or currently lives there)
Data Structure (ID’s)
the data is in long format (each person has multiple rows for each year they participated).
Unique keys: pid, syear
now we will read the data into R and select the variables we wish to use. We will also check the number of years available for the participants
# Build (or load from cache) the pbrutto subset and a cleaned copy in which
# missing codes are NA.
# The guard checks the same file names that are written below; the previous
# guard looked for "pbrutto_long_36.rds" (no "v"), which is never created, so
# the raw file was re-imported on every run.
if (!(file.exists(here::here("data", "pbrutto_long_v36.rds")) &&
      file.exists(here::here("data", "pbrutto_long_cl_v36.rds")))) {
  pbrutto_raw <- rio::import(paste0(soepisv36_en, "pbrutto.dta")) %>%
    select(
      pid,
      cid,
      hid,
      syear,
      stell,
      stell1,
      salivaerg
    )
  pbrutto_long <- pbrutto_raw %>%
    # labelled (factor-like) version of stell
    mutate(stell_l = sjlabelled::as_label(.$stell))
  pbrutto_long_cl <- pbrutto_long %>%
    # recode negative values to NA
    codebook::detect_missing(learn_from_labels = TRUE,
                             negative_values_are_missing = TRUE,
                             ninety_nine_problems = FALSE,
                             only_labelled = FALSE)
  # Write through here::here() so the files land where the import branch
  # (and the guard) look for them, independent of the working directory.
  export(pbrutto_long, here::here("data", "pbrutto_long_v36.rds"))
  export(pbrutto_long_cl, here::here("data", "pbrutto_long_cl_v36.rds"))
  message(paste0("pbrutto_long_v36 & pbrutto_long_cl_v36 exported to:",
                 here::here("data/pbrutto_long_v36.rds")))
} else {
  pbrutto_long <- import(here::here("data", "pbrutto_long_v36.rds"))
  pbrutto_long_cl <- import(here::here("data", "pbrutto_long_cl_v36.rds"))
}
## Registered S3 method overwritten by 'codebook':
## method from
## print.knit_asis formr
## pbrutto_long_v36 & pbrutto_long_cl_v36 exported to:/home/reiber/Users/ReiberLisa/git/GeneAnalysis/data/pbrutto_long_v36.rds
# Rows per survey year in pbrutto.
summarize(group_by(pbrutto_long, syear), "Number of persons per year" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 22 x 2
## syear `Number of persons per year`
## <dbl> <int>
## 1 1998 936
## 2 1999 961
## 3 2000 971
## 4 2001 981
## 5 2002 984
## 6 2003 1004
## 7 2004 1007
## 8 2005 1011
## 9 2006 1027
## 10 2007 1026
## # … with 12 more rows
# pbrutto_long %>% tabyl(stell_l)
the inno (Variables from the Innovation Modules) dataset contains information about experiments and innovative questions
- iabm1-6 : ADHD
- iabaut1-10 : Autism
furthermore could be of interest:
- im_seiat : self-esteem
- self#_# : Self assessments (warmth, competent, helpful etc)
Data Structure (ID’s)
the data is in long format (each person has multiple rows for each year they participated).
Unique keys: pid, syear
again, we read data in, select wanted variables and report number of rows (observations) per year
# Load the cached inno subset when it exists; otherwise read the raw file,
# keep the IDs plus all iabm* and iabaut* items, and cache the result.
if (file.exists(here::here("data/inno_long_v36.rds"))) {
  inno_long <- import(here::here("data", "inno_long_v36.rds"))
} else {
  inno_raw <- rio::import(paste0(soepisv36_en, "inno.dta"))
  inno_long <- select(
    inno_raw,
    pid, cid, hid, syear,
    starts_with("iabm"),
    starts_with("iabaut")
  )
  export(inno_long, here::here("data", "inno_long_v36.rds"))
}
# Rows per survey year.
summarize(group_by(inno_long, syear), "Number of persons" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 8 x 2
## syear `Number of persons`
## <dbl> <int>
## 1 2011 1701
## 2 2012 3696
## 3 2013 5141
## 4 2014 5868
## 5 2015 5897
## 6 2016 6358
## 7 2017 5464
## 8 2018 4860
How many times do people appear in the data over time?
# Count rows per person, then tabulate how many persons have 1, 2, ... rows
# and format the shares as percentages.
adorn_pct_formatting(
  tabyl(
    summarize(group_by(inno_long, pid), "Number of years" = n()),
    "Number of years"
  )
)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 8 x 3
## `Number of years` n percent
## <int> <dbl> <chr>
## 1 1 2964 28.4%
## 2 2 1139 10.9%
## 3 3 1511 14.5%
## 4 4 622 6.0%
## 5 5 1167 11.2%
## 6 6 1162 11.1%
## 7 7 1133 10.8%
## 8 8 748 7.2%
The largest group of people (28.4%) appears only once; some appear in all years (8 times, from 2011 to 2018)
the cognit (Data on cognitive potential) dataset contains information on cognitive tests
- f025r : sum of correct words
- f099s30 : Sum of all numerical entries 30s
- f099s60 : Sum of all numerical entries 60s
- f099s90 : Sum of all numerical entries 90s
Data Structure (ID’s)
the data is in long format (each person has multiple rows for each year they participated).
Unique keys: pid, syear. Note: data is so far only from 2014 and 2018
again, we read data in, select wanted variables and report number of rows (observations)
# Load the cached cognit subset when it exists; otherwise read the raw file,
# keep the IDs and the four test scores under shorter names, and cache it.
if (file.exists(here::here("data/cognit_long_v36.rds"))) {
  cognit_long <- import(here::here("data", "cognit_long_v36.rds"))
} else {
  cognit_raw <- rio::import(paste0(soepisv36_en, "cognit.dta"))
  cognit_long <- select(
    cognit_raw,
    pid, cid, hid, syear,
    cogtest    = f025r,
    cogtest_01 = f099s30,
    cogtest_02 = f099s60,
    cogtest_03 = f099s90
  )
  export(cognit_long, here::here("data", "cognit_long_v36.rds"))
  message("cognit_long exported to:", here::here("data/cognit_long_v36.rds"))
}
# Rows per survey year.
summarize(group_by(cognit_long, syear), "Number of persons" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## syear `Number of persons`
## <dbl> <int>
## 1 2014 4498
## 2 2018 872
the pgen (Person-related Status and Generated Variables) dataset contains generated information on participants at the person level
- pgbilzt : years of education
- pgpartnr : pid of partner
- pgpartz : partner indicator
further might be of interest:
- pgfamstd : family status
- pglfs : labor force status
- pgemplst : employment status
Data Structure (ID’s)
the data is in long format (each person has multiple rows for each year they participated). Unique identifier keys: pid, syear
again, we read data in, select wanted variables and report number of rows (observations)
# Load the cached pgen subset when it exists; otherwise read the raw file,
# keep the IDs, education years and partner columns, and cache the result.
if (file.exists(here::here("data/pgen_long_v36.rds"))) {
  pgen_long <- import(here::here("data", "pgen_long_v36.rds"))
} else {
  pgen_raw <- rio::import(paste0(soepisv36_en, "pgen.dta"))
  pgen_long <- select(
    pgen_raw,
    pid, cid, hid, syear,
    pgbilzt, pgpartnr, pgpartz
  )
  export(pgen_long, here::here("data", "pgen_long_v36.rds"))
  message("pgen_long_v36 exported to:", here::here("data/pgen_long_v36.rds"))
}
# Rows per survey year.
summarize(group_by(pgen_long, syear), "Number of persons" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 22 x 2
## syear `Number of persons`
## <dbl> <int>
## 1 1998 724
## 2 1999 750
## 3 2000 755
## 4 2001 766
## 5 2002 780
## 6 2003 795
## 7 2004 792
## 8 2005 799
## 9 2006 797
## 10 2007 797
## # … with 12 more rows
the p (Variables from the Individual Question Module) dataset contains information from the individual person questionnaire on the person level, which all adults get (no children)
Doku: Psychological Scales Manual SOEP: https://www.diw.de/sixcms/detail.php?id=diw_01.c.554370.de
- plh0203 : personal willingness to take risks after winning lottery
- plh0205 : personal willingness to take risks (wrong)
- Tendency to Forgive (plh0142, plh0145)
- Selbstwert (plh0146)
- Optimismus (plh0188)
- Vertrauen (plh0192-plh0194)
- Reziprozität (plh0206 - plh0211)
- Big 5 (plh0212 - plh0255)
Adults:
- Openness: plh0212, plh0215, plh0220, plh0225
- Conscientiousness: plh0218, plh0222
- Extraversion: plh0213, plh0219, plh0223
- Agreeableness: plh0214, plh0217, plh0224
- Neuroticism: plh0216, plh0221, plh0226
- Kontrollüberzeugung (plh0245 - plh0252) also (plh0235 - plh0242)
- Geduld/Impulsivität (plh0253/plh0254)
further might be of interest
- pld00## : nature of relationship to different people (mother, father, spouse)
- pld0015 : grandparents present
- pld0016 : number of grandparents
- plh0134/136 : gift of 10.000 €, what to do with it
- pld0020-117 : current spouse in other hh present - and then other persons
- health related:
- ple0008 : current health
- ple0019 : depressive psychosis
- ple0020 : dementia
- ple0021 : joint disorder
- ple0022 : chronic back complaints
...
- ple0024 : no illness
- ple0086/88 : number of cigarettes/cigars per day
- Satisfaction
- plh0182 : Current Life Satisfaction
Data Structure (ID’s)
the data is in long format (each person has multiple rows for each year they participated).
Unique keys: pid, syear
again, we read data in, select wanted variables and report number of rows (observations)
# Load the cached subset of the individual questionnaire when it exists;
# otherwise read the raw file, keep the IDs and all plh0* items, rename the
# selected items to readable names, and cache the result.
if (file.exists(here::here("data/p_long_v36.rds"))) {
  p_long <- import(here::here("data", "p_long_v36.rds"))
} else {
  p_raw <- select(
    rio::import(paste0(soepisv36_en, "p.dta")),
    pid, cid, hid, syear,
    starts_with("plh0")
  )
  p_long <- select(
    p_raw,
    # ID's
    pid, cid, hid, syear,
    # Personality
    forgive_01 = plh0142,
    forgive_02 = plh0145,
    selfworth  = plh0146,
    optimism   = plh0188,
    trust_1    = plh0192,
    trust_2    = plh0194,
    # reciprocity
    recip_1 = plh0206,
    recip_2 = plh0207,
    recip_3 = plh0208,
    recip_4 = plh0209,
    recip_5 = plh0210,
    recip_6 = plh0211,
    # Big 5
    b5_open_1  = plh0212,
    b5_open_2  = plh0215,
    b5_open_3  = plh0220,
    b5_open_4  = plh0225,
    b5_consc_1 = plh0218,
    b5_consc_2 = plh0222,
    b5_extra_1 = plh0213,
    b5_extra_2 = plh0219,
    b5_extra_3 = plh0223,
    b5_agree_1 = plh0214,
    b5_agree_2 = plh0217,
    b5_agree_3 = plh0224,
    b5_neuro_1 = plh0216,
    b5_neuro_2 = plh0221,
    b5_neuro_3 = plh0226,
    # to do locus of control
    patience_1 = plh0253,
    patience_2 = plh0254,
    # risk
    risk_lottery    = plh0203,
    risk_driving    = plh0197,
    risk_finance    = plh0198,
    risk_leisure    = plh0199,
    risk_occupation = plh0200,
    risk_health     = plh0201,
    risk_trust      = plh0202,
    risk_general    = plh0204
  )
  export(p_long, here::here("data", "p_long_v36.rds"))
  message("p_long_v36 exported to:", here::here("data/p_long_v36.rds"))
}
# Rows per survey year.
summarize(group_by(p_long, syear), "Number of persons" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 22 x 2
## syear `Number of persons`
## <dbl> <int>
## 1 1998 724
## 2 1999 750
## 3 2000 755
## 4 2001 766
## 5 2002 780
## 6 2003 795
## 7 2004 792
## 8 2005 799
## 9 2006 797
## 10 2007 797
## # … with 12 more rows
the bioage (Variables from the Modules of Questions on Children) dataset contains information about the relationship to the child (biological or social)
again, we read data in, and select wanted variables
# Load the cached bioage subset when it exists; otherwise read the raw file,
# rename the ID columns with a child_ prefix, keep biochild and pidresp, and
# cache the result.
if (file.exists(here::here("data/bioage_long_v36.rds"))) {
  bioage_long <- import(here::here("data", "bioage_long_v36.rds"))
} else {
  bioage_raw <- rio::import(paste0(soepisv36_en, "bioage.dta"))
  bioage_long <- select(
    bioage_raw,
    child_id  = pid,
    child_cid = cid,
    child_hid = hid,
    syear, biochild, pidresp
  )
  export(bioage_long, here::here("data", "bioage_long_v36.rds"))
  message("bioage_long_v36 exported to:", here::here("data/bioage_long_v36.rds"))
}
# Rows per survey year.
summarize(group_by(bioage_long, syear), "Number of persons per year" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 17 x 2
## syear `Number of persons per year`
## <dbl> <int>
## 1 2003 8
## 2 2004 11
## 3 2005 14
## 4 2006 17
## 5 2007 12
## 6 2008 18
## 7 2009 23
## 8 2010 156
## 9 2011 421
## 10 2012 803
## 11 2013 709
## 12 2014 1847
## 13 2015 1898
## 14 2016 1798
## 15 2017 1910
## 16 2018 1493
## 17 2019 1482
the kid (Pooled Dataset on Children) dataset also contains links to other respondents. Note that in this case the information is about children, not adults.
- k_phead : pointer to head of hh
- k_pheadp: pointer to partner of head of hh
- k_pmum: pointer to mother
- k_pmump: pointer to partner of mother
- k_rel: relationship to head of hh
might be of interest
- k_size: household size
- k_inhh: member of hh
- k_nrkid: number of children in hh
Data Structure (ID’s)
the data is in long format (each person (in this case child) has multiple rows for each year they participated).
Unique keys: pid, syear
again, we read data in, and select wanted variables (also report number of rows)
# Build (or load from cache) the subset of the pooled children dataset.
# ID columns are renamed with a child_ prefix; the pointer columns link the
# child to the head of household, the head's partner, the mother and the
# mother's partner.
if (!file.exists(here::here("data/kid_long_v36.rds"))) {
  kid_raw <- rio::import(paste0(soepisv36_en, "kid.dta"))
  kid_long <- kid_raw %>%
    select(child_id = pid,
           child_cid = cid,
           child_hid = hid,
           syear,
           # k_phead was missing: k_pheadp had been listed twice instead
           k_phead,
           k_pheadp,
           k_pmum,
           k_pmump,
           k_rel)
  export(kid_long, here::here("data", "kid_long_v36.rds"))
  message(paste0("kid_long_v36 exported to:", here::here("data/kid_long_v36.rds")))
} else {
  # NOTE(review): a cache written before k_phead was added will not contain
  # that column; delete the .rds file to rebuild it.
  kid_long <- import(here::here("data", "kid_long_v36.rds"))
}
# Rows per survey year.
kid_long %>% group_by(syear) %>% summarize("Number of children" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 22 x 2
## syear `Number of children`
## <dbl> <int>
## 1 1998 187
## 2 1999 185
## 3 2000 178
## 4 2001 180
## 5 2002 168
## 6 2003 162
## 7 2004 169
## 8 2005 159
## 9 2006 148
## 10 2007 144
## # … with 12 more rows
# Merge the three wide (one row per person) tables: ppfad is the base,
# bioparen is joined first, biobirth second, both on pid and cid.
soepis_wide <- left_join(
  left_join(ppfad_wide, bioparen_wide, by = c("pid", "cid")),
  biobirth_wide,
  by = c("pid", "cid")
)
now we will re-format the wide data into the somewhat long format and make sure that all persons have the same amount of rows (for each possible year). This means that Variables which were given for each year (such as hid01, hid02 etc) will be reformatted into one variable (hid) and multiple rows for each year (01,02, etc).
# Turn the per-year hid##/netto## columns into one row per person and survey
# year, with a single hid and a single netto column.
soepis_long_partial1 <- soepis_wide %>%
  pivot_longer(
    cols = matches("^hid|^netto"),
    values_to = "value",
    names_to = "key_syear"
  ) %>%
  # split the column name into the variable stem and its last two characters
  separate(col = key_syear, into = c("key", "syear"), sep = -2) %>%
  # prefix the two-digit suffix with the century and convert to numeric;
  # suffixes matching neither condition become NA
  mutate(
    syear = as.numeric(case_when(
      syear >= 90 ~ str_c("19", syear),
      syear <= 20 ~ str_c("20", syear)
    ))
  ) %>%
  pivot_wider(names_from = key, values_from = value)
# Rows per survey year.
summarize(group_by(soepis_long_partial1, syear), "Number of persons per year" = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 20 x 2
## syear `Number of persons per year`
## <dbl> <int>
## 1 1998 17012
## 2 1999 17012
## 3 2000 17012
## 4 2001 17012
## 5 2002 17012
## 6 2003 17012
## 7 2004 17012
## 8 2005 17012
## 9 2006 17012
## 10 2007 17012
## 11 2008 17012
## 12 2009 17012
## 13 2010 17012
## 14 2011 17012
## 15 2012 17012
## 16 2013 17012
## 17 2014 17012
## 18 2015 17012
## 19 2016 17012
## 20 2017 17012
# Starting from the person-year base table, left-join the long adult datasets
# one after another (pbrutto, inno, cognit, pgen, p), always on the same four
# key columns, then write the result to disk.
soepisv36_adults <- Reduce(
  function(joined, next_table) {
    left_join(joined, next_table, by = c("pid", "hid", "cid", "syear"))
  },
  list(pbrutto_long, inno_long, cognit_long, pgen_long, p_long),
  soepis_long_partial1
)
rio::export(soepisv36_adults, here::here("data/soepisv36_adults.rds"))
# Starting from the person-year base table, left-join the two child datasets
# (bioage first, then kid). Their ID columns carry a child_ prefix, so the
# keys are mapped explicitly. The result is written to disk.
soepisv36_children <- left_join(
  left_join(
    soepis_long_partial1,
    bioage_long,
    by = c("pid" = "child_id",
           "hid" = "child_hid",
           "cid" = "child_cid",
           "syear")
  ),
  kid_long,
  by = c("pid" = "child_id",
         "hid" = "child_hid",
         "cid" = "child_cid",
         "syear")
)
rio::export(soepisv36_children, here::here("data/soepisv36_children.rds"))
# Stack the adult and the child table into the final dataset, restore the
# variable attributes from both inputs, set three variable labels and write
# the result to disk.
# NOTE(review): soepisv36_adults and soepisv36_children are both built by
# left-joining onto soepis_long_partial1, so each of them contains every
# pid x syear row of that base table. bind_rows() therefore stacks each
# person-year at least twice -- confirm this is intended, since later text
# treats (syear, pid, key) as a unique key.
soepisv36 <- soepisv36_adults %>%
bind_rows(soepisv36_children) %>%
mutate(syear = as.integer(syear)) %>%
# re-attach attributes from the two source tables after the row bind
rescue_attributes(soepisv36_adults) %>%
rescue_attributes(soepisv36_children) %>%
set_variable_labels(syear = "Survey Year",
netto = "Survey Status",
hid = "Household Number")
rio::export(soepisv36, here::here("data/soepisv36.rds"))
# One-off extraction of the gene data: runs only while the cached copy is
# missing. Saves the raw table and the vector of distinct pids it contains.
if (!file.exists(here::here("data/soepis_igene_raw.rds"))) {
  soepis_igene_raw <- rio::import(here::here("../../data/00_raw/SOEP/SOEP_v36/Gene/Inno19F_IGENE.dta"))
  # soepis_igene_raw2 <- rio::import(here::here("../../data/00_raw/SOEP/SOEP_v36/trios.dta"))
  pid_igene_raw <- pull(distinct(select(soepis_igene_raw, pid)))
  rio::export(soepis_igene_raw, here::here("data/soepis_igene_raw.rds"))
  rio::export(pid_igene_raw, here::here("data/pid_igene_raw.rds"))
}
# old approach
# key_label <- map(soepisv34, attr, "label") %>% unlist # Gives you list of the labels
# key_labels <- as.data.frame(key_label) %>% add_rownames( var = "key_name")
# Build a lookup of variable name -> variable label for the merged dataset
# (one row per variable, columns key_name / key_label).
key_labels <- labelled::var_label(soepisv36) %>%
  as_tibble(rownames = c("key_name", "key_label")) %>%
  gather(key = key_name, value = key_label)
# Variable groups used to assign a category to each variable below.
# "k_phead" replaces a second, duplicated "k_pheadp" entry so that both
# household-head pointers are treated as IDs.
id <- c("syear", "pid", "cid", "hid", "kidpnr",
        "fpid", "mpid", "pgpartnr", "pgpartz",
        "k_phead", "k_pheadp", "k_pmum", "k_pmump")
survey <- c("psample", "netto", "stell", "stell1", "salivaerg")
demogr <- c("sex", "gebjahr", "gebmonat", "pgbilzt")
# NOTE(review): `psych` is not referenced in the case_when() below; every
# variable that matches no other group falls through to "Psych. Measure".
psych <- c("iabm", "iabaut", "cogtest", "forgive", "selfworth",
           "optimism", "trust", "recip", "b5", "patience", "risk")
other <- c("nums", "numb", "biochild", "k_rel")
key_features <- key_labels %>%
  mutate(key_category = case_when(key_name %in% id ~ "ID's",
                                  # numbered kidpnr## columns
                                  str_detect(key_name, "kidpnr") ~ "ID's",
                                  key_name %in% survey ~ "Survey",
                                  key_name %in% demogr ~ "Demography",
                                  key_name %in% other ~ "Other",
                                  TRUE ~ "Psych. Measure"
  ),
  key_category = ordered(key_category, levels = c("ID's", "Survey", "Demography",
                                                  "Psych. Measure", "Other") )
  )
# Only the name/label lookup is written to disk (not key_features).
export(key_labels, here::here("data", "key_labels.rds"))
now we create an additional dataset which is long in a true sense, since all variables are represented in the key
column and all corresponding values in a value
column.
the data is uniquely identifiable by: syear, pid, key
# Build the fully long dataset (one row per pid x syear x variable) unless it
# has already been written to disk. Nothing is loaded when the file exists.
if (!file.exists(here::here("data/soepis_long_v36.rds"))) {
  soepis_long <- soepisv36 %>%
    select(-stell_l) %>%
    # recode negative values to NA
    codebook::detect_missing(learn_from_labels = TRUE,
                             negative_values_are_missing = TRUE,
                             ninety_nine_problems = FALSE,
                             only_labelled = FALSE) %>%
    # everything except the four ID columns goes into key / value
    tidyr::pivot_longer(cols = c(-syear, -pid, -hid, -cid),
                        names_to = "key",
                        values_to = "value")
  # add labels from key_features generated above
  soepis_long <- soepis_long %>%
    left_join(key_features, by = c("key" = "key_name")) %>%
    mutate(key_name_label = paste0(key, ": ", key_label))
  rio::export(soepis_long, here::here("data/soepis_long_v36.rds"))
}
# https://stackoverflow.com/questions/39417003/long-vectors-not-supported-yet-error-in-rmd-but-not-in-r-script
to do some plotting by age, we need another kind of long format
#key_labels is defined above
# Build the age-oriented long dataset unless it has already been written to
# disk. Nothing is loaded when the file exists.
if (!file.exists(here::here("data/soepis_age_v36.rds"))) {
  soepis_age <- soepisv36 %>%
    select(-stell_l) %>%
    # recode negative values to NA
    codebook::detect_missing(learn_from_labels = TRUE,
                             negative_values_are_missing = TRUE,
                             ninety_nine_problems = FALSE,
                             only_labelled = FALSE) %>%
    # age at survey time and a binned version in 15-year intervals
    mutate(age = syear - gebjahr,
           age_k = cut_width(age, 15, boundary = 0)) %>%
    # NOTE(review): syear and hid are not excluded here, so they are pivoted
    # into key / value as well -- confirm this is intended.
    pivot_longer(cols = c(-age, -age_k, -pid, -cid),
                 names_to = "key",
                 values_to = "value")
  # add labels from key_features generated above
  soepis_age <- soepis_age %>%
    left_join(key_features, by = c("key" = "key_name")) %>%
    mutate(key_name_label = paste0(key, ": ", key_label))
  rio::export(soepis_age, here::here("data/soepis_age_v36.rds"))
}