set more off
capt clear
version 14
* 1. Load datasets and merge (also exercise 1.1)
********************************************************************************
use "_data/pgen.dta", clear
merge 1:1 pid syear using "_data/ppfadl.dta" , ///
keepusing(sampreg psample pop sex gebjahr phrf phrf0 phrf1) keep(match) nogen
* 2. Data cleaning
********************************************************************************
* 2.1 define missing values
mvdecode *, mv(-8/-1)
* 2.2 generierate variables (also exercise 1.2)
*************************************
** demographics
cap drop ost
recode sampreg 1 = 0 2 = 1, gen(ost)
lab def ost 0 "west" 1 "east" , replace
lab val ost ost
tab1 ost sampreg
cap drop frau
recode sex 1 = 0 2 = 1, gen(frau)
lab def frau 0 "male" 1 "femal" , replace
lab val frau frau
tab1 frau sex
cap drop alter
gen alter = syear - gebjahr
** Job Status
cap drop erwstatus
recode pgemplst 1 = 1 2 4 = 2 * = ., gen(erwstatus)
lab def erwstatus 1 "fulltime" 2 "part-time", replace
lab val erwstatus erwstatus
*** Consumer Price Index Jahr 2010 (also Exercise 1.2)
cap drop cpi
gen cpi=.
label var cpi Preisindex
replace cpi=(106.9) if syear==2015 // laut destatis
replace cpi=(106.6) if syear==2014 // laut destatis
replace cpi=(105.7) if syear==2013
replace cpi=(104.1) if syear==2012
replace cpi=(102.1) if syear==2011
replace cpi=(100) if syear==2010 // base year 2010
replace cpi=(98.9) if syear==2009
replace cpi=(98.6) if syear==2008
replace cpi=(96.1) if syear==2007
replace cpi=(93.9) if syear==2006
replace cpi=(92.5) if syear==2005
replace cpi=(91) if syear==2004
replace cpi=(89.6) if syear==2003
replace cpi=(88.6) if syear==2002
replace cpi=(87.4) if syear==2001
replace cpi=(85.7) if syear==2000
replace cpi=(84.4) if syear==1999 & ost==0
replace cpi=(85.1) if syear==1999 & ost==1
replace cpi=(83.8) if syear==1998 & ost==0
replace cpi=(84.8) if syear==1998 & ost==1
replace cpi=(83.1) if syear==1997 & ost==0
replace cpi=(83.9) if syear==1997 & ost==1
replace cpi=(81.5) if syear==1996 & ost==0
replace cpi=(82.0) if syear==1996 & ost==1
replace cpi=(80.5) if syear==1995 & ost==0
replace cpi=(80.5) if syear==1995 & ost==1
replace cpi=(79.2) if syear==1994 & ost==0
replace cpi=(79.0) if syear==1994 & ost==1
replace cpi=(77.1) if syear==1993 & ost==0
replace cpi=(76.2) if syear==1993 & ost==1
replace cpi=(74.5) if syear==1992 & ost==0
replace cpi=(68.9) if syear==1992 & ost==1
replace cpi=(71.6) if syear==1991 & ost==0
replace cpi=(60.8) if syear==1991 & ost==1
replace cpi=(69.1) if syear==1990 & ost == 0
replace cpi=(67.3) if syear==1989 & ost == 0
replace cpi=(65.4) if syear==1988 & ost == 0
replace cpi=(64.6) if syear==1987 & ost == 0
replace cpi=(64.5) if syear==1986 & ost == 0
replace cpi=(64.6) if syear==1985 & ost == 0
replace cpi=(63.3) if syear==1984 & ost == 0
*** Brutto Income in prices from 2010
cap drop breink10
gen breink10= pglabgro/cpi*100
lab var breink10 "Monthly Brutto Income in 2010 Prices"
*** Brutto Income in Prices from 2015
cap drop breink15
gen breink15= pglabgro/cpi*106.9
lab var breink15 "Monthly Brutto Income in 2015 Prices"
*** Hourly Wages in Prices from 2015
cap drop hwageb
gen hwageb = breink15/(pgtatzeit*(52/12))
lab var hwageb "Hourly Brutto Income in 2015 Prices"
*** Work Experience
cap drop erf
gen erf= pgexpft + .4*pgexppt
gen erfq= erf^2
*** Labor Market Segments
cap drop qualstand
gen qualstand= pgausb
recode qualstand 1/4=0 5/9=1
cap drop intern
gen intern = pgallbet
recode intern 1/2=0 3/5=1
tab intern qualstand
*** Centralize years of schooling and work experience
gen cpgbilzeit = pgbilzeit - r(mean)
gen cerf = erf -r(mean)
*** Log of Wage
gen lnwage= ln(hwageb)
lab var lnwage "Log. Hourly Brutto Wages in 2015 Prices"
* 4. Define Sample for Analysis
********************************************************************************
cap drop asample
gen asample=1 if pgtatzeit >= 6 & !missing(pgtatzeit) & ///
alter <= 65 & alter >= 18 & !missing(alter) & ///
(pgemplst==1 | pgemplst==2 | pgemplst==4) & ///
pop<3
* unplausible cases:
replace asample=. if pid==1380202 | ///
(pid== 8267202 & syear==2007) | ///
(pid==2633801 & syear==2006) | ///
(pid==2582901 & syear > 2006) | ///
pid==1380202 | ///
pid == 607602 | ///
pid==2555301
*** Save Analysis Sample Data
save "_data/ex_mydf.dta", replace
. set more off
. capt clear
. version 14
.
.
. * 1. Load datasets and merge (also exercise 1.1)
. *****************************************************************************
> ***
. use "_data/pgen.dta", clear
(PGEN: Feb 12, 2017 13:00:53-1 DBV32L)
.
. merge 1:1 pid syear using "_data/ppfadl.dta" , ///
> keepusing(sampreg psample pop sex gebjahr phrf phrf0 phrf1) keep(match)
> nogen
Result # of obs.
-----------------------------------------
not matched 0
matched 594,828
-----------------------------------------
.
. * 2. Data cleaning
. *****************************************************************************
> ***
. * 2.1 define missing values
. mvdecode *, mv(-8/-1)
pgerljob: 255844 missing values generated
pgbetr: 260221 missing values generated
pgoeffd: 261535 missing values generated
pgausb: 265058 missing values generated
pgpartz: 487 missing values generated
pgpartnr: 181355 missing values generated
pgnation: 91 missing values generated
pgpsbil: 13990 missing values generated
pgpbbil01: 231533 missing values generated
pgpbbil02: 493054 missing values generated
pgpbbil03: 425223 missing values generated
pgpsbila: 530357 missing values generated
pgpbbila: 561965 missing values generated
pgpsbilo: 482647 missing values generated
pgpbbilo: 514647 missing values generated
pgfamstd: 7236 missing values generated
pgbilzeit: 27340 missing values generated
pgerwzeit: 248371 missing values generated
pgtatzeit: 260674 missing values generated
pgvebzeit: 308715 missing values generated
pguebstd: 289557 missing values generated
pglfs: 6 missing values generated
pgisco88: 261674 missing values generated
pgisei88: 160505 missing values generated
pgmps92: 160698 missing values generated
pgnace: 270930 missing values generated
pgsiops88: 160505 missing values generated
pgegp88: 160505 missing values generated
pgkldb92: 260031 missing values generated
pgautono: 248630 missing values generated
pgisced97: 10137 missing values generated
pgcasmin: 13108 missing values generated
pgstib: 8372 missing values generated
pgmonth: 49 missing values generated
pgmode: 523 missing values generated
pglabgro: 245465 missing values generated
pgimpgro: 245465 missing values generated
pglabnet: 245466 missing values generated
pgimpnet: 245466 missing values generated
pgallbet: 260221 missing values generated
pgemplst: 24 missing values generated
pgexpft: 13804 missing values generated
pgexppt: 13804 missing values generated
pgexpue: 13804 missing values generated
pgjobch: 25 missing values generated
pgfield: 547715 missing values generated
pgdegree: 555441 missing values generated
pgtraina: 472005 missing values generated
pgtrainb: 571016 missing values generated
pgtrainc: 579023 missing values generated
pgtraind: 588756 missing values generated
pgfdt_f: 391212 missing values generated
pgbilztch: 111168 missing values generated
pgbilztev: 31076 missing values generated
pgsndjob: 565504 missing values generated
pgimpsnd: 565504 missing values generated
pgjobend: 547189 missing values generated
pgkldb10: 545448 missing values generated
pgisco08: 545413 missing values generated
pgisced11: 426010 missing values generated
gebjahr: 2 missing values generated
.
. * 2.2 generierate variables (also exercise 1.2)
. *************************************
. ** demographics
. cap drop ost
. recode sampreg 1 = 0 2 = 1, gen(ost)
(594828 differences between sampreg and ost)
. lab def ost 0 "west" 1 "east" , replace
. lab val ost ost
. tab1 ost sampreg
-> tabulation of ost
RECODE of |
sampreg |
(Aktuelle |
Stichproben |
region |
(Berlin |
West-Ost)) | Freq. Percent Cum.
------------+-----------------------------------
west | 466,335 78.40 78.40
east | 128,493 21.60 100.00
------------+-----------------------------------
Total | 594,828 100.00
-> tabulation of sampreg
Aktuelle Stichprobenregion (Berlin |
West-Ost) | Freq. Percent Cum.
----------------------------------------+-----------------------------------
[1] Westdeutschland, alte Bundeslaender | 466,335 78.40 78.40
[2] Ostdeutschland, neue Bundeslaender | 128,493 21.60 100.00
----------------------------------------+-----------------------------------
Total | 594,828 100.00
.
. cap drop frau
. recode sex 1 = 0 2 = 1, gen(frau)
(594828 differences between sex and frau)
. lab def frau 0 "male" 1 "femal" , replace
. lab val frau frau
. tab1 frau sex
-> tabulation of frau
RECODE of |
sex |
(Geschlecht |
) | Freq. Percent Cum.
------------+-----------------------------------
male | 282,865 47.55 47.55
femal | 311,963 52.45 100.00
------------+-----------------------------------
Total | 594,828 100.00
-> tabulation of sex
Geschlecht | Freq. Percent Cum.
----------------------------------------+-----------------------------------
[1] maennlich | 282,865 47.55 47.55
[2] weiblich | 311,963 52.45 100.00
----------------------------------------+-----------------------------------
Total | 594,828 100.00
.
. cap drop alter
. gen alter = syear - gebjahr
(2 missing values generated)
.
. ** Job Status
. cap drop erwstatus
. recode pgemplst 1 = 1 2 4 = 2 * = ., gen(erwstatus)
(288308 differences between pgemplst and erwstatus)
. lab def erwstatus 1 "fulltime" 2 "part-time", replace
. lab val erwstatus erwstatus
.
. *** Consumer Price Index Jahr 2010 (also Exercise 1.2)
. cap drop cpi
. gen cpi=.
(594,828 missing values generated)
. label var cpi Preisindex
. replace cpi=(106.9) if syear==2015 // laut destatis
(27,743 real changes made)
. replace cpi=(106.6) if syear==2014 // laut destatis
(28,042 real changes made)
. replace cpi=(105.7) if syear==2013
(31,523 real changes made)
. replace cpi=(104.1) if syear==2012
(28,520 real changes made)
. replace cpi=(102.1) if syear==2011
(29,264 real changes made)
. replace cpi=(100) if syear==2010 // base year 2010
(27,124 real changes made)
. replace cpi=(98.9) if syear==2009
(21,035 real changes made)
. replace cpi=(98.6) if syear==2008
(19,945 real changes made)
. replace cpi=(96.1) if syear==2007
(21,232 real changes made)
. replace cpi=(93.9) if syear==2006
(22,665 real changes made)
. replace cpi=(92.5) if syear==2005
(21,105 real changes made)
. replace cpi=(91) if syear==2004
(22,019 real changes made)
. replace cpi=(89.6) if syear==2003
(22,611 real changes made)
. replace cpi=(88.6) if syear==2002
(23,892 real changes made)
. replace cpi=(87.4) if syear==2001
(22,351 real changes made)
. replace cpi=(85.7) if syear==2000
(24,576 real changes made)
. replace cpi=(84.4) if syear==1999 & ost==0
(10,266 real changes made)
. replace cpi=(85.1) if syear==1999 & ost==1
(3,819 real changes made)
. replace cpi=(83.8) if syear==1998 & ost==0
(10,772 real changes made)
. replace cpi=(84.8) if syear==1998 & ost==1
(3,898 real changes made)
. replace cpi=(83.1) if syear==1997 & ost==0
(9,617 real changes made)
. replace cpi=(83.9) if syear==1997 & ost==1
(3,666 real changes made)
. replace cpi=(81.5) if syear==1996 & ost==0
(9,811 real changes made)
. replace cpi=(82.0) if syear==1996 & ost==1
(3,700 real changes made)
. replace cpi=(80.5) if syear==1995 & ost==0
(10,063 real changes made)
. replace cpi=(80.5) if syear==1995 & ost==1
(3,705 real changes made)
. replace cpi=(79.2) if syear==1994 & ost==0
(9,628 real changes made)
. replace cpi=(79.0) if syear==1994 & ost==1
(3,789 real changes made)
. replace cpi=(77.1) if syear==1993 & ost==0
(9,339 real changes made)
. replace cpi=(76.2) if syear==1993 & ost==1
(3,840 real changes made)
. replace cpi=(74.5) if syear==1992 & ost==0
(9,404 real changes made)
. replace cpi=(68.9) if syear==1992 & ost==1
(3,993 real changes made)
. replace cpi=(71.6) if syear==1991 & ost==0
(9,506 real changes made)
. replace cpi=(60.8) if syear==1991 & ost==1
(4,163 real changes made)
. replace cpi=(69.1) if syear==1990 & ost == 0
(9,519 real changes made)
. replace cpi=(67.3) if syear==1989 & ost == 0
(9,710 real changes made)
. replace cpi=(65.4) if syear==1988 & ost == 0
(10,023 real changes made)
. replace cpi=(64.6) if syear==1987 & ost == 0
(10,516 real changes made)
. replace cpi=(64.5) if syear==1986 & ost == 0
(10,646 real changes made)
. replace cpi=(64.6) if syear==1985 & ost == 0
(11,090 real changes made)
. replace cpi=(63.3) if syear==1984 & ost == 0
(12,245 real changes made)
.
.
. *** Brutto Income in prices from 2010
. cap drop breink10
. gen breink10= pglabgro/cpi*100
(248,902 missing values generated)
. lab var breink10 "Monthly Brutto Income in 2010 Prices"
.
. *** Brutto Income in Prices from 2015
. cap drop breink15
. gen breink15= pglabgro/cpi*106.9
(248,902 missing values generated)
. lab var breink15 "Monthly Brutto Income in 2015 Prices"
.
. *** Hourly Wages in Prices from 2015
. cap drop hwageb
. gen hwageb = breink15/(pgtatzeit*(52/12))
(263,911 missing values generated)
. lab var hwageb "Hourly Brutto Income in 2015 Prices"
.
. *** Work Experience
. cap drop erf
. gen erf= pgexpft + .4*pgexppt
(13,804 missing values generated)
. gen erfq= erf^2
(13,804 missing values generated)
.
. *** Labor Market Segments
. cap drop qualstand
. gen qualstand= pgausb
(265,058 missing values generated)
. recode qualstand 1/4=0 5/9=1
(qualstand: 329770 changes made)
.
. cap drop intern
. gen intern = pgallbet
(260,221 missing values generated)
. recode intern 1/2=0 3/5=1
(intern: 334607 changes made)
. tab intern qualstand
| qualstand
intern | 0 1 | Total
-----------+----------------------+----------
0 | 54,450 119,376 | 173,826
1 | 42,824 108,103 | 150,927
-----------+----------------------+----------
Total | 97,274 227,479 | 324,7.
. *** Centralize years of schooling and work experience
. gen cpgbilzeit = pgbilzeit - r(mean)
(594,828 missing values generated)
. gen cerf = erf -r(mean)
(594,828 missing values generated)
.
. *** Log of Wage
. gen lnwage= ln(hwageb)
(265,217 missing values generated)
. lab var lnwage "Log. Hourly Brutto Wages in 2015 Prices"
.
.
. * 4. Define Sample for Analysis
. *****************************************************************************
> ***
. cap drop asample
. gen asample=1 if pgtatzeit >= 6 & !missing(pgtatzeit) & ///
> alter <= 65 & alter >= 18 & !
> missing(alter) & ///
> (pgemplst==1 | pgemplst==2 |
> pgemplst==4) & ///
> pop<3
(285,903 missing values generated)
.
. * unplausible cases:
. replace asample=. if pid==1380202 | ///
> (pid== 8267202 & syear==2007) | ///
> (pid==2633801 & syear==2006) | ///
> (pid==2582901 & syear > 2006) | ///
> pid==1380202 | ///
> pid == 607602 | ///
> pid==2555301
(26 real changes made, 26 to missing)
.
. *** Save Analysis Sample Data
. save "_data/ex_mydf.dta", replace
file _data/ex_mydf.dta saved
Because SOEP Data are not available in R format we can read them into R with readstata13
or haven
and save them as .rds for future use.
#### 1. generate and merge dataset mydf_raw
# PGEN
if (file.exists(paste0("_data/pgen.rds"))) {
pgen <- readRDS("_data/pgen.rds")
} else{
cat("RDS Datei nicht vorhanden, wird aus Stata Dataset .dta erstellt")
pgen <- read_dta("_data/pgen.dta")
saveRDS(pgen, file = "_data/pgen.rds")
}
# PPFADL
if (file.exists(paste0("_data/ppfadl.rds"))) {
ppfadl <- readRDS("_data/ppfadl.rds")
} else{
cat("RDS Datei nicht vorhanden, wird aus Stata Dataset .dta erstellt")
ppfadl <- read_dta("_data/ppfadl.dta") %>%
tbl_df() %>%
select(sampreg,
psample,
pop,
sex,
gebjahr,
phrf,
phrf0,
pid,
syear,
phrf1)
saveRDS(ppfadl, file = "_data/ppfadl.rds")
}
Note: Separate Calculation of CPI in SOEP from 1991 to 2000 the SOEP provides separate cpi for former East and West- Germany from the Years 1991 to 2000. These are reflected in the cpi_s (_s for SOEP) variable. One thing to remark is that the SOEP does not provide CPI’s for East-Incomes in 1990. Those deflated incomes can therefore not be calculated.
# CPI (base 2010)
# download.file("http://research.stlouisfed.org/fred2/data/DEUCPIALLMINMEI.txt", "~/Desktop")
# look for how many lines to skip
cpi_url <- "https://research.stlouisfed.org/fred2/data/DEUCPIALLMINMEI.txt"
cpi_url2 <- "https://www.destatis.de/DE/Publikationen/Thematisch/Preise/Verbraucherpreise/VerbraucherpreisindexLangeReihenXLS_5611103.xlsx"
# cpi from the net
cpi_raw <-read.table(cpi_url, skip = 15, header = TRUE)
cpi <- cpi_raw %>%
mutate(cpi_year = year(.$DATE)) %>%
# convert from monthly to yearly cpi values:
group_by(cpi_year) %>%
dplyr::summarize(cpi = mean(VALUE)) %>%
# create adjustment factor relative 2010 year's prices:
mutate(adj_fac2010 = cpi/cpi[cpi_year == 2010],
adj_fac2015 = cpi/cpi[cpi_year == 2015])
# cpi from SOEP
cpi_raw_s <- read.table("_data/cpi.csv", header = T, na.strings = "-", dec = ".",
numerals = c("no.loss"))
cpi_s <- cpi_raw_s %>%
select(year, cpi_west, cpi_ost) %>%
# add missing values based on destatis data
add_row(year = 2014:2015,
cpi_ost = c(106.6, 106.9),
cpi_west = c(106.6, 106.9)) %>%
mutate(syear = as.numeric(year)
) %>%
select(-year)
# merge datasets
mydf_raw <- ppfadl %>%
inner_join(y = pgen, by= c("pid", "syear")) %>%
left_join(y = cpi, by = c("syear" = "cpi_year")) %>%
left_join(cpi_s, by = c("syear")) %>%
mutate(cpi_s = case_when(
sampreg == 2 ~ cpi_ost,
sampreg == 1 ~ cpi_west)) %>%
select(-cpi_west, -cpi_ost)
na_codes <- c(-1: -8)
for (i in seq_along(mydf_raw)) {
mydf_raw[[i]][mydf_raw[[i]] %in% na_codes] <- NA
}
Note:
Logarithm of wage differences in STATA and R.
example: - R mutate(data, lnwage = log(wage))
- STATA gen lnwage = ln(wage)
- In R the function log() calculates the natural logarithm of wage as described here, returns -Inf if wage = 0. - In STATA ln() returns missings for lnwage if wage is 0. - If you want the same results, you can code mutate(data, lnwage = ifelse(wage <0, log(wage), NA)
.
# 2.2 Generate Variables
# Employment Status
mydf_raw$erwstatus <- mydf_raw %>%
mutate(erwstatus = case_when(
pgemplst == 1 ~ 1,
pgemplst %in% c(2, 4) ~ 2
)
) %>%
select(erwstatus) %>% unlist() %>%
labelled(c("Vollzeit" = 1, "Teilzeit" = 2))
ex_mydf <- mydf_raw %>%
# Demographics
mutate(ost = as_factor(labelled(sampreg - 1, c("west" = 0, "east" = 1))),
# age
alter = syear - gebjahr,
# gender
frau = as_factor(labelled(sex - 1, c("male" = 0, "female" = 1))),
# Generate Brutto Income 2010 and 2015, Hourly Wage 2015
# Einkommen, Hourly Wage
breink10 = pglabgro / cpi_s * 100,
# deflate income for 2015
breink15 = pglabgro / cpi_s * 106.9,
# hourly wage in prices of 2015
hwageb = breink15/(pgtatzeit*(52/12)),
# Log of wage
lnwage = ifelse(hwageb > 0, log(hwageb), NA),
# job experience
erf = pgexpft + 0.4*pgexppt,
erfq = I(erf^2),
# Arbeitsmarktsegmente I
qualstand = case_when(
between(pgausb, 1,4) ~ 0,
between(pgausb, 5,9) ~ 1),
# Arbeitsmarktsegments II
intern = case_when(
between(pgallbet, 1,2) ~ 0,
between(pgallbet, 3,5) ~ 1),
# create id var
pid_syear = (paste(pid, syear, sep = "." ))
) %>%
group_by(syear) %>%
mutate(
# centralized values for education by year
cpgbilzeit = pgbilzeit - mean(pgbilzeit, na.rm = T),
cerf = erf - mean(erf, na.rm = T)
) %>%
ungroup()
## Warning: between() called on numeric vector with S3 class
## Warning: between() called on numeric vector with S3 class
## Warning: between() called on numeric vector with S3 class
## Warning: between() called on numeric vector with S3 class
saveRDS(ex_mydf, file = "_data/ex_mydf.rds")