Prepare Data for Exercises

Stata

set more off
capt clear
version 14


* 1. Load datasets and merge (also exercise 1.1)
********************************************************************************
use "_data/pgen.dta", clear

merge 1:1 pid syear using "_data/ppfadl.dta" , ///
      keepusing(sampreg psample pop sex gebjahr phrf phrf0 phrf1) keep(match) nogen

* 2. Data cleaning
********************************************************************************
* 2.1 define missing values
    mvdecode *, mv(-8/-1)
    
* 2.2 generierate variables (also exercise 1.2)
*************************************
** demographics
    cap drop ost
    recode sampreg 1 = 0 2 = 1, gen(ost)
    lab def ost 0 "west" 1 "east" , replace
    lab val ost ost
    tab1 ost sampreg

    cap drop frau
    recode sex 1 = 0 2 = 1, gen(frau)
    lab def frau 0 "male" 1 "femal" , replace
    lab val frau frau
    tab1 frau sex
    
    cap drop alter
    gen alter = syear - gebjahr

** Job Status
    cap drop erwstatus
    recode pgemplst 1 = 1  2 4 = 2 * = ., gen(erwstatus)
    lab def erwstatus 1 "fulltime" 2 "part-time", replace
    lab val erwstatus erwstatus

*** Consumer Price Index Jahr 2010 (also Exercise 1.2)
    cap drop cpi
    gen cpi=.
    label var cpi Preisindex
    replace cpi=(106.9) if syear==2015  // laut destatis
    replace cpi=(106.6) if syear==2014  // laut destatis
    replace cpi=(105.7) if syear==2013  
    replace cpi=(104.1) if syear==2012
    replace cpi=(102.1) if syear==2011
    replace cpi=(100)  if syear==2010   // base year 2010
    replace cpi=(98.9) if syear==2009
    replace cpi=(98.6) if syear==2008
    replace cpi=(96.1) if syear==2007
    replace cpi=(93.9) if syear==2006
    replace cpi=(92.5) if syear==2005   
    replace cpi=(91)   if syear==2004
    replace cpi=(89.6) if syear==2003
    replace cpi=(88.6) if syear==2002
    replace cpi=(87.4) if syear==2001
    replace cpi=(85.7) if syear==2000
    replace cpi=(84.4) if syear==1999 & ost==0
    replace cpi=(85.1) if syear==1999 & ost==1
    replace cpi=(83.8) if syear==1998 & ost==0
    replace cpi=(84.8) if syear==1998 & ost==1
    replace cpi=(83.1) if syear==1997 & ost==0
    replace cpi=(83.9) if syear==1997 & ost==1
    replace cpi=(81.5) if syear==1996 & ost==0
    replace cpi=(82.0) if syear==1996 & ost==1
    replace cpi=(80.5) if syear==1995 & ost==0
    replace cpi=(80.5) if syear==1995 & ost==1
    replace cpi=(79.2) if syear==1994 & ost==0
    replace cpi=(79.0) if syear==1994 & ost==1
    replace cpi=(77.1) if syear==1993 & ost==0
    replace cpi=(76.2) if syear==1993 & ost==1
    replace cpi=(74.5) if syear==1992 & ost==0
    replace cpi=(68.9) if syear==1992 & ost==1
    replace cpi=(71.6) if syear==1991 & ost==0
    replace cpi=(60.8) if syear==1991 & ost==1
    replace cpi=(69.1) if syear==1990 & ost == 0
    replace cpi=(67.3) if syear==1989 & ost == 0
    replace cpi=(65.4) if syear==1988 & ost == 0
    replace cpi=(64.6) if syear==1987 & ost == 0
    replace cpi=(64.5) if syear==1986 & ost == 0
    replace cpi=(64.6) if syear==1985 & ost == 0
    replace cpi=(63.3) if syear==1984 & ost == 0
    

*** Brutto Income in prices from 2010
    cap drop breink10
    gen breink10= pglabgro/cpi*100
    lab var breink10 "Monthly Brutto Income in 2010 Prices"

*** Brutto Income in Prices from 2015
    cap drop breink15
    gen breink15= pglabgro/cpi*106.9
    lab var breink15 "Monthly Brutto Income in 2015 Prices"

*** Hourly Wages in Prices from 2015
    cap drop hwageb
    gen hwageb = breink15/(pgtatzeit*(52/12))
    lab var hwageb "Hourly Brutto Income in 2015 Prices"

*** Work Experience
    cap drop erf
    gen erf= pgexpft + .4*pgexppt
    gen erfq= erf^2

*** Labor Market Segments
    cap drop qualstand
    gen qualstand= pgausb
    recode qualstand 1/4=0 5/9=1

    cap drop intern
    gen intern = pgallbet
    recode intern 1/2=0 3/5=1 
    tab intern qualstand
    
*** Centralize years of schooling and work experience
    gen cpgbilzeit = pgbilzeit - r(mean)
    gen cerf = erf -r(mean)
    
*** Log of Wage
    gen lnwage= ln(hwageb)
    lab var lnwage "Log. Hourly Brutto Wages in 2015 Prices"


* 4. Define Sample for Analysis
********************************************************************************
      cap drop asample
      gen asample=1 if  pgtatzeit >= 6 & !missing(pgtatzeit) & ///
                        alter <= 65 & alter >= 18 & !missing(alter) &   ///
                        (pgemplst==1 | pgemplst==2 | pgemplst==4) &  ///
                        pop<3
                     
* unplausible cases:
    replace asample=. if    pid==1380202 | ///
                          (pid== 8267202 & syear==2007) | ///
                          (pid==2633801 & syear==2006) | ///
                          (pid==2582901 & syear > 2006) | ///
                          pid==1380202 | ///
                          pid == 607602  | ///
                          pid==2555301
      
*** Save Analysis Sample Data
      save "_data/ex_mydf.dta", replace

. set more off

. capt clear

. version 14

. 
. 
. * 1. Load datasets and merge (also exercise 1.1)
. *****************************************************************************
> ***
. use "_data/pgen.dta", clear
(PGEN: Feb 12, 2017 13:00:53-1 DBV32L)

. 
. merge 1:1 pid syear using "_data/ppfadl.dta" , ///
>       keepusing(sampreg psample pop sex gebjahr phrf phrf0 phrf1) keep(match)
>  nogen

    Result                           # of obs.
    -----------------------------------------
    not matched                             0
    matched                           594,828  
    -----------------------------------------

. 
. * 2. Data cleaning
. *****************************************************************************
> ***
. * 2.1 define missing values
.         mvdecode *, mv(-8/-1)
    pgerljob: 255844 missing values generated
      pgbetr: 260221 missing values generated
     pgoeffd: 261535 missing values generated
      pgausb: 265058 missing values generated
     pgpartz: 487 missing values generated
    pgpartnr: 181355 missing values generated
    pgnation: 91 missing values generated
     pgpsbil: 13990 missing values generated
   pgpbbil01: 231533 missing values generated
   pgpbbil02: 493054 missing values generated
   pgpbbil03: 425223 missing values generated
    pgpsbila: 530357 missing values generated
    pgpbbila: 561965 missing values generated
    pgpsbilo: 482647 missing values generated
    pgpbbilo: 514647 missing values generated
    pgfamstd: 7236 missing values generated
   pgbilzeit: 27340 missing values generated
   pgerwzeit: 248371 missing values generated
   pgtatzeit: 260674 missing values generated
   pgvebzeit: 308715 missing values generated
    pguebstd: 289557 missing values generated
       pglfs: 6 missing values generated
    pgisco88: 261674 missing values generated
    pgisei88: 160505 missing values generated
     pgmps92: 160698 missing values generated
      pgnace: 270930 missing values generated
   pgsiops88: 160505 missing values generated
     pgegp88: 160505 missing values generated
    pgkldb92: 260031 missing values generated
    pgautono: 248630 missing values generated
   pgisced97: 10137 missing values generated
    pgcasmin: 13108 missing values generated
      pgstib: 8372 missing values generated
     pgmonth: 49 missing values generated
      pgmode: 523 missing values generated
    pglabgro: 245465 missing values generated
    pgimpgro: 245465 missing values generated
    pglabnet: 245466 missing values generated
    pgimpnet: 245466 missing values generated
    pgallbet: 260221 missing values generated
    pgemplst: 24 missing values generated
     pgexpft: 13804 missing values generated
     pgexppt: 13804 missing values generated
     pgexpue: 13804 missing values generated
     pgjobch: 25 missing values generated
     pgfield: 547715 missing values generated
    pgdegree: 555441 missing values generated
    pgtraina: 472005 missing values generated
    pgtrainb: 571016 missing values generated
    pgtrainc: 579023 missing values generated
    pgtraind: 588756 missing values generated
     pgfdt_f: 391212 missing values generated
   pgbilztch: 111168 missing values generated
   pgbilztev: 31076 missing values generated
    pgsndjob: 565504 missing values generated
    pgimpsnd: 565504 missing values generated
    pgjobend: 547189 missing values generated
    pgkldb10: 545448 missing values generated
    pgisco08: 545413 missing values generated
   pgisced11: 426010 missing values generated
     gebjahr: 2 missing values generated

.         
. * 2.2 generierate variables (also exercise 1.2)
. *************************************
. ** demographics
.         cap drop ost

.         recode sampreg 1 = 0 2 = 1, gen(ost)
(594828 differences between sampreg and ost)

.         lab def ost 0 "west" 1 "east" , replace

.         lab val ost ost

.         tab1 ost sampreg

-> tabulation of ost  

  RECODE of |
    sampreg |
  (Aktuelle |
Stichproben |
     region |
    (Berlin |
 West-Ost)) |      Freq.     Percent        Cum.
------------+-----------------------------------
       west |    466,335       78.40       78.40
       east |    128,493       21.60      100.00
------------+-----------------------------------
      Total |    594,828      100.00

-> tabulation of sampreg  

     Aktuelle Stichprobenregion (Berlin |
                              West-Ost) |      Freq.     Percent        Cum.
----------------------------------------+-----------------------------------
[1] Westdeutschland, alte Bundeslaender |    466,335       78.40       78.40
[2] Ostdeutschland, neue Bundeslaender  |    128,493       21.60      100.00
----------------------------------------+-----------------------------------
                                  Total |    594,828      100.00

. 
.         cap drop frau

.         recode sex 1 = 0 2 = 1, gen(frau)
(594828 differences between sex and frau)

.         lab def frau 0 "male" 1 "femal" , replace

.         lab val frau frau

.         tab1 frau sex

-> tabulation of frau  

  RECODE of |
        sex |
(Geschlecht |
          ) |      Freq.     Percent        Cum.
------------+-----------------------------------
       male |    282,865       47.55       47.55
      femal |    311,963       52.45      100.00
------------+-----------------------------------
      Total |    594,828      100.00

-> tabulation of sex  

                             Geschlecht |      Freq.     Percent        Cum.
----------------------------------------+-----------------------------------
                          [1] maennlich |    282,865       47.55       47.55
                           [2] weiblich |    311,963       52.45      100.00
----------------------------------------+-----------------------------------
                                  Total |    594,828      100.00

.         
.         cap drop alter

.         gen alter = syear - gebjahr
(2 missing values generated)

. 
. ** Job Status
.         cap drop erwstatus

.         recode pgemplst 1 = 1  2 4 = 2 * = ., gen(erwstatus)
(288308 differences between pgemplst and erwstatus)

.         lab def erwstatus 1 "fulltime" 2 "part-time", replace

.         lab val erwstatus erwstatus

. 
. *** Consumer Price Index Jahr 2010 (also Exercise 1.2)
.         cap drop cpi

.         gen cpi=.
(594,828 missing values generated)

.         label var cpi Preisindex

.         replace cpi=(106.9) if syear==2015  // laut destatis
(27,743 real changes made)

.         replace cpi=(106.6) if syear==2014  // laut destatis
(28,042 real changes made)

.         replace cpi=(105.7) if syear==2013  
(31,523 real changes made)

.         replace cpi=(104.1) if syear==2012
(28,520 real changes made)

.         replace cpi=(102.1) if syear==2011
(29,264 real changes made)

.         replace cpi=(100)  if syear==2010   // base year 2010
(27,124 real changes made)

.         replace cpi=(98.9) if syear==2009
(21,035 real changes made)

.         replace cpi=(98.6) if syear==2008
(19,945 real changes made)

.         replace cpi=(96.1) if syear==2007
(21,232 real changes made)

.         replace cpi=(93.9) if syear==2006
(22,665 real changes made)

.         replace cpi=(92.5) if syear==2005   
(21,105 real changes made)

.         replace cpi=(91)   if syear==2004
(22,019 real changes made)

.         replace cpi=(89.6) if syear==2003
(22,611 real changes made)

.         replace cpi=(88.6) if syear==2002
(23,892 real changes made)

.         replace cpi=(87.4) if syear==2001
(22,351 real changes made)

.         replace cpi=(85.7) if syear==2000
(24,576 real changes made)

.         replace cpi=(84.4) if syear==1999 & ost==0
(10,266 real changes made)

.         replace cpi=(85.1) if syear==1999 & ost==1
(3,819 real changes made)

.         replace cpi=(83.8) if syear==1998 & ost==0
(10,772 real changes made)

.         replace cpi=(84.8) if syear==1998 & ost==1
(3,898 real changes made)

.         replace cpi=(83.1) if syear==1997 & ost==0
(9,617 real changes made)

.         replace cpi=(83.9) if syear==1997 & ost==1
(3,666 real changes made)

.         replace cpi=(81.5) if syear==1996 & ost==0
(9,811 real changes made)

.         replace cpi=(82.0) if syear==1996 & ost==1
(3,700 real changes made)

.         replace cpi=(80.5) if syear==1995 & ost==0
(10,063 real changes made)

.         replace cpi=(80.5) if syear==1995 & ost==1
(3,705 real changes made)

.         replace cpi=(79.2) if syear==1994 & ost==0
(9,628 real changes made)

.         replace cpi=(79.0) if syear==1994 & ost==1
(3,789 real changes made)

.         replace cpi=(77.1) if syear==1993 & ost==0
(9,339 real changes made)

.         replace cpi=(76.2) if syear==1993 & ost==1
(3,840 real changes made)

.         replace cpi=(74.5) if syear==1992 & ost==0
(9,404 real changes made)

.         replace cpi=(68.9) if syear==1992 & ost==1
(3,993 real changes made)

.         replace cpi=(71.6) if syear==1991 & ost==0
(9,506 real changes made)

.         replace cpi=(60.8) if syear==1991 & ost==1
(4,163 real changes made)

.         replace cpi=(69.1) if syear==1990 & ost == 0
(9,519 real changes made)

.         replace cpi=(67.3) if syear==1989 & ost == 0
(9,710 real changes made)

.         replace cpi=(65.4) if syear==1988 & ost == 0
(10,023 real changes made)

.         replace cpi=(64.6) if syear==1987 & ost == 0
(10,516 real changes made)

.         replace cpi=(64.5) if syear==1986 & ost == 0
(10,646 real changes made)

.         replace cpi=(64.6) if syear==1985 & ost == 0
(11,090 real changes made)

.         replace cpi=(63.3) if syear==1984 & ost == 0
(12,245 real changes made)

.         
. 
. *** Brutto Income in prices from 2010
.         cap drop breink10

.         gen breink10= pglabgro/cpi*100
(248,902 missing values generated)

.         lab var breink10 "Monthly Brutto Income in 2010 Prices"

. 
. *** Brutto Income in Prices from 2015
.         cap drop breink15

.         gen breink15= pglabgro/cpi*106.9
(248,902 missing values generated)

.         lab var breink15 "Monthly Brutto Income in 2015 Prices"

. 
. *** Hourly Wages in Prices from 2015
.         cap drop hwageb

.         gen hwageb = breink15/(pgtatzeit*(52/12))
(263,911 missing values generated)

.         lab var hwageb "Hourly Brutto Income in 2015 Prices"

. 
. *** Work Experience
.         cap drop erf

.         gen erf= pgexpft + .4*pgexppt
(13,804 missing values generated)

.         gen erfq= erf^2
(13,804 missing values generated)

. 
. *** Labor Market Segments
.         cap drop qualstand

.         gen qualstand= pgausb
(265,058 missing values generated)

.         recode qualstand 1/4=0 5/9=1
(qualstand: 329770 changes made)

. 
.         cap drop intern

.         gen intern = pgallbet
(260,221 missing values generated)

.         recode intern 1/2=0 3/5=1 
(intern: 334607 changes made)

.         tab intern qualstand

           |       qualstand
    intern |         0          1 |     Total
-----------+----------------------+----------
         0 |    54,450    119,376 |   173,826 
         1 |    42,824    108,103 |   150,927 
-----------+----------------------+----------
     Total |    97,274    227,479 |   324,7.         
. *** Centralize years of schooling and work experience
.         gen cpgbilzeit = pgbilzeit - r(mean)
(594,828 missing values generated)

.         gen cerf = erf -r(mean)
(594,828 missing values generated)

.         
. *** Log of Wage
.         gen lnwage= ln(hwageb)
(265,217 missing values generated)

.         lab var lnwage "Log. Hourly Brutto Wages in 2015 Prices"

. 
. 
. * 4. Define Sample for Analysis
. *****************************************************************************
> ***
.       cap drop asample

.       gen asample=1 if  pgtatzeit >= 6 & !missing(pgtatzeit) & ///
>                                                 alter <= 65 & alter >= 18 & !
> missing(alter)     &       ///
>                                                 (pgemplst==1 | pgemplst==2 | 
> pgemplst==4) &  ///
>                                                 pop<3
(285,903 missing values generated)

.                                          
. * unplausible cases:
.         replace asample=. if    pid==1380202 | ///
>                               (pid== 8267202 & syear==2007) | ///
>                               (pid==2633801 & syear==2006) | ///
>                               (pid==2582901 & syear > 2006) | ///
>                               pid==1380202 | ///
>                               pid == 607602  | ///
>                               pid==2555301
(26 real changes made, 26 to missing)

.       
. *** Save Analysis Sample Data
.       save "_data/ex_mydf.dta", replace
file _data/ex_mydf.dta saved

R

Read SOEP Data into R

Because SOEP Data are not available in R format we can read them into R with readstata13 or haven and save them as .rds for future use.

#### 1. generate  and merge dataset mydf_raw  
# PGEN
  if (file.exists(paste0("_data/pgen.rds"))) {
        pgen <- readRDS("_data/pgen.rds")
  } else{
        cat("RDS Datei nicht vorhanden, wird aus Stata Dataset .dta erstellt")
        pgen <- read_dta("_data/pgen.dta")
        saveRDS(pgen, file = "_data/pgen.rds")
  }

# PPFADL
  if (file.exists(paste0("_data/ppfadl.rds"))) {
        ppfadl <- readRDS("_data/ppfadl.rds")
  } else{
        cat("RDS Datei nicht vorhanden, wird aus Stata Dataset .dta erstellt")
        ppfadl <- read_dta("_data/ppfadl.dta") %>% 
            tbl_df() %>% 
            select(sampreg,
                   psample,
                   pop,
                   sex,
                   gebjahr,
                   phrf,
                   phrf0,
                   pid,
                   syear,
                   phrf1)
        saveRDS(ppfadl, file = "_data/ppfadl.rds")
  }

Get Additional Data

CPI (base 2010)

Note: Separate Calculation of CPI in SOEP from 1991 to 2000 the SOEP provides separate cpi for former East and West- Germany from the Years 1991 to 2000. These are reflected in the cpi_s (_s for SOEP) variable. One thing to remark is that the SOEP does not provide CPI’s for East-Incomes in 1990. Those deflated incomes can therefore not be calculated.

# CPI (base 2010)
# download.file("http://research.stlouisfed.org/fred2/data/DEUCPIALLMINMEI.txt", "~/Desktop") 

# look for how many lines to skip
cpi_url <- "https://research.stlouisfed.org/fred2/data/DEUCPIALLMINMEI.txt"
cpi_url2 <- "https://www.destatis.de/DE/Publikationen/Thematisch/Preise/Verbraucherpreise/VerbraucherpreisindexLangeReihenXLS_5611103.xlsx"

# cpi from the net
cpi_raw <-read.table(cpi_url, skip = 15, header = TRUE)
cpi <- cpi_raw %>% 
      mutate(cpi_year = year(.$DATE)) %>% 
      # convert from monthly to yearly cpi values:
      group_by(cpi_year) %>% 
      dplyr::summarize(cpi = mean(VALUE)) %>% 
      # create adjustment factor relative 2010 year's prices:
      mutate(adj_fac2010 = cpi/cpi[cpi_year == 2010],
             adj_fac2015 = cpi/cpi[cpi_year == 2015])

# cpi from SOEP
cpi_raw_s <- read.table("_data/cpi.csv", header = T, na.strings = "-", dec = ".",
                        numerals = c("no.loss"))
cpi_s <- cpi_raw_s %>% 
      select(year, cpi_west, cpi_ost) %>%
      # add missing values based on destatis data
      add_row(year = 2014:2015, 
              cpi_ost = c(106.6, 106.9),
              cpi_west = c(106.6, 106.9)) %>% 
      mutate(syear = as.numeric(year)
             ) %>% 
      select(-year)

Create Dataset

1. Merge Data

#  merge datasets
mydf_raw <- ppfadl %>% 
      inner_join(y = pgen, by= c("pid", "syear")) %>% 
      left_join(y = cpi, by = c("syear" = "cpi_year")) %>% 
      left_join(cpi_s, by = c("syear")) %>% 
      mutate(cpi_s = case_when(
            sampreg == 2 ~ cpi_ost, 
            sampreg == 1 ~ cpi_west)) %>% 
      select(-cpi_west, -cpi_ost)

2. Clean Data

2.1 Define Missing Codes
      na_codes <- c(-1: -8)
      for (i in seq_along(mydf_raw)) {
            mydf_raw[[i]][mydf_raw[[i]] %in% na_codes] <- NA
      }
2.2 Generate Variables

Note:

Logarithm of wage differences in STATA and R.

example: - R mutate(data, lnwage = log(wage)) - STATA gen lnwage = ln(wage) - In R the function log() calculates the natural logarithm of wage as described here, returns -Inf if wage = 0. - In STATA ln() returns missings for lnwage if wage is 0. - If you want the same results, you can code mutate(data, lnwage = ifelse(wage <0, log(wage), NA).

# 2.2 Generate Variables
# Employment Status
mydf_raw$erwstatus <- mydf_raw %>% 
      mutate(erwstatus = case_when(
                              pgemplst == 1 ~ 1, 
                              pgemplst %in% c(2, 4) ~ 2
                              )
            ) %>%
      select(erwstatus) %>% unlist() %>% 
      labelled(c("Vollzeit" = 1, "Teilzeit" = 2)) 


ex_mydf <- mydf_raw %>% 
      # Demographics
      mutate(ost = as_factor(labelled(sampreg - 1, c("west" = 0, "east" = 1))), 
             # age
             alter = syear - gebjahr, 
             # gender
             frau = as_factor(labelled(sex - 1, c("male" = 0, "female" = 1))), 
      # Generate Brutto Income 2010 and 2015, Hourly Wage 2015
             # Einkommen, Hourly Wage
             breink10 = pglabgro / cpi_s * 100,
             # deflate income for 2015
             breink15 = pglabgro / cpi_s * 106.9,
             # hourly wage in prices of 2015
             hwageb = breink15/(pgtatzeit*(52/12)),
             # Log of wage
             lnwage = ifelse(hwageb > 0, log(hwageb), NA),
             # job experience
             erf = pgexpft + 0.4*pgexppt,
             erfq = I(erf^2),
             # Arbeitsmarktsegmente I
             qualstand = case_when(
                   between(pgausb, 1,4) ~ 0,
                   between(pgausb, 5,9) ~ 1),
             # Arbeitsmarktsegments II
             intern = case_when(
                   between(pgallbet, 1,2) ~ 0,
                   between(pgallbet, 3,5) ~ 1),
            # create id var
            pid_syear = (paste(pid, syear, sep = "." ))
                  ) %>% 
      group_by(syear) %>% 
      mutate( 
            # centralized values for education by year
            cpgbilzeit = pgbilzeit - mean(pgbilzeit, na.rm = T),
            cerf = erf - mean(erf, na.rm = T)
            ) %>% 
      ungroup()
## Warning: between() called on numeric vector with S3 class

## Warning: between() called on numeric vector with S3 class

## Warning: between() called on numeric vector with S3 class

## Warning: between() called on numeric vector with S3 class
2.3 save dataset
saveRDS(ex_mydf, file = "_data/ex_mydf.rds")