### Code for: Matching as non-parametric preprocessing for the estimation of equivalence scales ### Part 2: Estimates for low incomes ### Contact: christian.dudel@rub.de ### Clear workspace rm(list=ls()) ### Paths for data & figures path.data <- "C:/Data/" path.figures <- "C:/Figures/" ### Load libraries library(Matching) library(HapEstXXR) library(nnet) ### Read data dat <- read.csv2(paste0(path.data,"gf3_08.csv"),na.strings="",comment.char="#") ### Generate variable containing household types # Generate variable dat$hhtype <- NA # Couple dat$hhtype[dat$EF36==9|dat$EF36==10|dat$EF36==21|dat$EF36==22] <- 0 # Couple with one child (below 18) dat$hhtype[dat$EF36==11|dat$EF36==12|dat$EF36==23|dat$EF36==24] <- 1 # Drop other households dat <- dat[!is.na(dat$hhtype),] ### Generate age of household members # Age head of household dat$age1 <- NA dat$age1 <- 2008-dat$EF8U3 # Age second person dat$age2 <- NA dat$age2[is.na(dat$EF9U3)==F] <- 2008-dat$EF9U3[is.na(dat$EF9U3)==F] # Age of third person dat$age3 <- NA dat$age3[is.na(dat$EF10U3)==F] <- 2008-dat$EF10U3[is.na(dat$EF10U3)==F] # Age of child dat$agechild <- NA dat$agechild[dat$hhtype==1&dat$age2>dat$age3] <- dat$age3[dat$hhtype==1&dat$age2>dat$age3] dat$agechild[dat$hhtype==1&dat$age2=14 dat <- dat[dat$hhtype==0|dat$hhtype==1&dat$agechild<14,] ### Socio-demographic variables # Region (West/East) dat$east <- 0 dat$west <- 0 dat$east[dat$EF1==44] <- 1 dat$west[dat$EF1==33] <- 1 # Dual earner couple dat$dual <- 0 dat$dual[dat$EF45==2] <- 1 # Education hh-head dat$edu <- dat$EF8U6 # "Polytechnische Oberschule" roughly equivalent to "Realschule" dat$edu[dat$edu>3] <- dat$edu[dat$edu>3]-1 # This leaves 5 levels: # 1=No degree # 2=Haupt-/Volksschule # 3=Mittlere Reife # 4=Fachabitur # 5=Abitur # Employment dat$emp <- dat$EF8U13 # Region dat$reg <- dat$EF5 # Quartal dat$quart <- dat$EF6 ### Net household income # Generate variable dat$net <- dat$EF58 # Drop negative values dat$net[dat$net<=0] <- NA # Drop low values dat$net[dat$hhtype==0&dat$net<(337*2*3)] <- NA dat$net[dat$hhtype==1&dat$agechild<=6&dat$net<(3*2*337+219*3)] <- NA dat$net[dat$hhtype==1&dat$agechild>6 &dat$net<(3*2*337+251*3)] <- NA ### Generate welfare indicators # Expenditure share food dat$food <- dat$EF61/dat$net dat$food[dat$food>0.75] <- NA dat$food[dat$food<=0] <- NA # Clothing adults dat$cloth <- dat$EF224+dat$EF230+dat$EF225+dat$EF231 ### Restrictions # Both partners younger than 65 (retirement age) dat <- dat[dat$age1<65&dat$age2<65,] # At least one partner is employed dat <- dat[!dat$EF45==0,] dat <- dat[!(dat$hhtype==0&dat$EF8U13==0&dat$EF9U13==0),] dat <- dat[!(dat$hhtype==1&dat$EF8U13==0&dat$EF9U13==0&dat$EF10U13==0),] dat <- dat[!dat$EF8U13==5,] dat <- dat[!dat$EF9U13==5,] # No unemployment benefits (Hartz IV) dat <- dat[!dat$EF127U1>0,] dat <- dat[!dat$EF127U2>0,] dat <- dat[!dat$EF127U3>0,] # Only 'poor' couples with one child (poorest 20%) dat$poor_aac <- 0 poverty.threshold <- quantile(dat$net[dat$hhtype==1],probs=0.2,na.rm=T) dat$poor_aac[dat$net<=poverty.threshold&dat$hhtype==1] <- 1 dat <- dat[dat$hhtype==0|dat$poor_aac==1,] # Indicator for 'poor' couples without children (poorest 20%) dat.naive <- dat dat.naive$poor_aa <- 0 poverty.threshold <- quantile(dat.naive$net[dat.naive$hhtype==0],probs=0.2,na.rm=T) dat.naive$poor_aa[dat.naive$net<=poverty.threshold&dat.naive$hhtype==0] <- 1 dat.naive <- dat.naive[dat.naive$poor_aa==1|dat.naive$poor_aac==1,] ### Drop outliers dat <- dat[dat$food<0.35,] dat <- dat[dat$cloth<3000,] dat.naive <- dat.naive[dat.naive$food<0.35,] dat.naive <- dat.naive[dat.naive$cloth<3000,] ### Drop unnecessary variables dat <- dat[,c("hhtype","age1","net","food","cloth","east","west","dual","edu","emp","reg","quart")] dat <- na.omit(dat) dat.naive <- dat.naive[,c("hhtype","age1","net","food","cloth","east","west","dual","edu","emp","reg","quart")] dat.naive <- na.omit(dat.naive) ### Simple Engel scale (basic estimate) # Generate log household income dat$lnet <- log(dat$net) dat.naive$lnet <- log(dat.naive$net) # Generate age squared dat$age1s <- dat$age1^2 dat.naive$age1s <- dat.naive$age1^2 # Estimate regression fit1 <- lm(food~lnet+hhtype+age1+age1s+east+dual+factor(edu)+factor(emp)+factor(reg)+factor(quart),data=dat) # Calculate equivalence scale beta1 <- coef(fit1)[3] beta2 <- coef(fit1)[2] scale1 <- exp(-beta1/beta2) ### All possible models: Unmatched and naive estimates # Generate list of all possible models basic.indicators <- "lnet+hhtype" list.indicators <- c("age1","age1s","east","dual","factor(edu)","factor(emp)","factor(reg)","factor(quart)", "dual*factor(edu)","dual*factor(emp)","dual*factor(reg)","dual*factor(quart)","dual*factor(east)") models <- powerset(list.indicators) n.models <- length(models) # Unmatched estimates scales <- numeric(n.models) for(i in 1:n.models) { if(i%%1000==0) cat(".") exp.var <- paste(basic.indicators,models[[i]],sep="+",collapse="+") full.model <- formula(paste("food~",exp.var,sep="")) fit.tmp <- lm(full.model,data=dat) beta1 <- coef(fit.tmp)[3] beta2 <- coef(fit.tmp)[2] scales[i] <- exp(-beta1/beta2) } # Naive estimate: Poorest 20% of childless couples scales.naive <- numeric(n.models) for(i in 1:n.models) { if(i%%1000==0) cat(".") exp.var <- paste(basic.indicators,models[[i]],sep="+",collapse="+") full.model <- formula(paste("food~",exp.var,sep="")) fit.tmp <- lm(full.model,data=dat.naive) beta1 <- coef(fit.tmp)[3] beta2 <- coef(fit.tmp)[2] scales.naive[i] <- exp(-beta1/beta2) } ### All possible models: Matching set.seed(7391) # Prepare data dat.tmp <- dat[!(dat$hhtype==0&dat$net>6000*1.6),] edu <- class.ind(dat.tmp$edu)[,-1] reg <- class.ind(dat.tmp$reg)[,-1] emp <- class.ind(dat.tmp$emp)[,-1] quart <- class.ind(dat.tmp$quart)[,-1] match.tmp <- cbind(as.matrix(dat.tmp[,c("east","dual","age1","cloth")]),edu,reg,emp,quart) # Matching 1-1 Match1 <- Match(Y=dat.tmp$food,Tr=dat.tmp$hhtype,X=match.tmp,M=1,ties=FALSE,replace=F) # Matched data set dat.match <- dat.tmp[c(Match1$index.treated,Match1$index.control),] # Regression fit2 <- lm(food~lnet+hhtype+age1+age1s+east+dual+factor(edu)+factor(emp)+factor(reg)+factor(quart),data=dat.match) # Calculate equivalence scale beta1m <- coef(fit2)[3] beta2m <- coef(fit2)[2] exp(-beta1m/beta2m) # All Models scales.match <- numeric(n.models) for(i in 1:n.models) { if(i%%1000==0) cat(".") exp.var <- paste(basic.indicators,models[[i]],sep="+",collapse="+") full.model <- formula(paste("food~",exp.var,sep="")) fit.tmp <- lm(full.model,data=dat.match) beta1 <- coef(fit.tmp)[3] beta2 <- coef(fit.tmp)[2] scales.match[i] <- exp(-beta1/beta2) } ### Comparison of matched, unmatched, and naive results # Figure 3 in the paper pdf(file=paste0(path.figures,"ex3.pdf")) plot(density(scales),ylim=c(0,45),xlab="Scale value",main="",panel.first=grid(),xlim=c(1.15,1.5),lwd=2,lty=2) lines(density(scales.match),lwd=2,lty=1) lines(density(scales.naive),lwd=2,lty=4) legend(x=1.15,y=35,lty=c(1,4,2),legend=c("Matched","Naive","Unmatched"),bg="white",lwd=2) dev.off() # Range of scale estimates max(scales)-min(scales) max(scales.naive)-min(scales.naive) max(scales.match)-min(scales.match) # Standard deviation of scale estimates sd(scales) sd(scales.naive) sd(scales.match) ### Balancing # Age by(dat$age1,dat$hhtype,mean) by(dat.naive$age1,dat.naive$hhtype,mean) by(dat.match$age1,dat.match$hhtype,mean) # Expenditure clothing by(dat$cloth/3,dat$hhtype,mean) by(dat.naive$cloth/3,dat.naive$hhtype,mean) by(dat.match$cloth/3,dat.match$hhtype,mean) # East-Germany by(dat$east,dat$hhtype,mean) by(dat.naive$east,dat.naive$hhtype,mean) by(dat.match$east,dat.match$hhtype,mean) # Dual earner by(dat$dual,dat$hhtype,mean) by(dat.naive$dual,dat.naive$hhtype,mean) by(dat.match$dual,dat.match$hhtype,mean) # Income by(dat$net,dat$hhtype,mean) by(dat.naive$net,dat.naive$hhtype,mean) by(dat.match$net,dat.match$hhtype,mean) # Education p0 <- prop.table(table(dat$edu[dat$hhtype==0])) p1 <- prop.table(table(dat$edu[dat$hhtype==1])) 0.5*sum(abs(p0-p1)) p0 <- prop.table(table(dat.naive$edu[dat.naive$hhtype==0])) p1 <- prop.table(table(dat.naive$edu[dat.naive$hhtype==1])) 0.5*sum(abs(p0-p1)) p0m <- prop.table(table(dat.match$edu[dat.match$hhtype==0])) p1m <- prop.table(table(dat.match$edu[dat.match$hhtype==1])) 0.5*sum(abs(p0m-p1m)) # Region p0 <- prop.table(table(dat$reg[dat$hhtype==0])) p1 <- prop.table(table(dat$reg[dat$hhtype==1])) 0.5*sum(abs(p0-p1)) p0 <- prop.table(table(dat.naive$reg[dat.naive$hhtype==0])) p1 <- prop.table(table(dat.naive$reg[dat.naive$hhtype==1])) 0.5*sum(abs(p0-p1)) p0m <- prop.table(table(dat.match$reg[dat.match$hhtype==0])) p1m <- prop.table(table(dat.match$reg[dat.match$hhtype==1])) 0.5*sum(abs(p0m-p1m)) # Employment p0 <- prop.table(table(dat$emp[dat$hhtype==0])) p1 <- prop.table(table(dat$emp[dat$hhtype==1])) 0.5*sum(abs(p0-p1)) p0 <- prop.table(table(dat.naive$emp[dat.naive$hhtype==0])) p1 <- prop.table(table(dat.naive$emp[dat.naive$hhtype==1])) 0.5*sum(abs(p0-p1)) p0m <- prop.table(table(dat.match$emp[dat.match$hhtype==0])) p1m <- prop.table(table(dat.match$emp[dat.match$hhtype==1])) 0.5*sum(abs(p0m-p1m)) # Quartal p0 <- prop.table(table(dat$quart[dat$hhtype==0])) p1 <- prop.table(table(dat$quart[dat$hhtype==1])) 0.5*sum(abs(p0-p1)) p0 <- prop.table(table(dat.naive$quart[dat.naive$hhtype==0])) p1 <- prop.table(table(dat.naive$quart[dat.naive$hhtype==1])) 0.5*sum(abs(p0-p1)) p0m <- prop.table(table(dat.match$quart[dat.match$hhtype==0])) p1m <- prop.table(table(dat.match$quart[dat.match$hhtype==1])) 0.5*sum(abs(p0m-p1m)) ### Descriptives food round(by(dat$food,dat$hhtype,mean),digits=2) round(by(dat.naive$food,dat.naive$hhtype,mean),digits=2) round(by(dat.match$food,dat.match$hhtype,mean),digits=2) round(by(dat$food,dat$hhtype,sd),digits=2) round(by(dat.naive$food,dat.naive$hhtype,sd),digits=2) round(by(dat.match$food,dat.match$hhtype,sd),digits=2)