# Script for Generating Absolute Wealth Estimate (AWE) from DHS Wealth Index

# Introduction:   This script is intended as a guide to those who would like to generate the
#                 absolute wealth estimates (AWE) from DHS data, using the method described in:
#                 Hruschka, D.J., Gerkey, D. & Hadley, C. 2015. "Estimating the Absolute Wealth of Households."
#                 Bulletin of the World Health Organization. Article ID: BLT.147082

# Data Requirements:   
#1. DHS data from one or more surveys.  Each survey represents a combination of a country and year.
#   Specifically, at least three variables are needed from the DHS Data:
#     A. Country ID.  Hereafter = "country"
#     B. Year ID. Hereafter = "year"
#     C. DHS Wealth Factor Score.  Hereafter = "wealth.factor"
#2. Country GDP per capita based on purchasing power parity (PPP) in constant 2011 international dollars.  Hereafter = "GDPpc"
#      Source: World Bank:  http://data.worldbank.org/indicator/NY.GDP.PCAP.PP.KD
#3. Country Gini Coefficient for household wealth.  Hereafter = "gini"
#       Source: Davies, J.B., Sandstrom, S., Shorrocks, A.B., Wolff, E.N. 2009.
#               "The Level and Distribution of Global Household Wealth."
#               National Bureau of Economic Research Working Paper Series: Working Paper 15508
#               http://www.nber.org/papers/w15508
#               See Appendix V
#4. Oil rents (% of GDP).  Hereafter = "OilRentCode"
#      Source: World Bank:  http://data.worldbank.org/indicator/NY.GDP.PETR.RT.ZS

#Load Data:   The script below assumes you have loaded a data table in R, hereafter "my.dhs.data,"
#             with rows of individuals and columns for the following variables:
#1. DHS Country ID = "country"
#2. DHS Year ID = "year"
#3. DHS Wealth Factor = "wealth.factor"
#4. Country GDP per capita PPP 2011 International $ = "GDPpc"
#5. Country Gini for HH Wealth = "gini"
#6. Oil rents as a percentage of GDP = "OilRentCode"

##NOTE: The Hruschka et al. 2015 paper uses a specific formulation for deriving country level wealth per capita based on GDP and oil rents. 
##A researcher may choose a different method of calculating mean wealth per capita used in the formulas below (lnPPPwpc and PPPwpc)

# Load R Packages:
library(data.table)

# Build a survey identifier for every row. A "survey" is one country in one
# year, so the text key is the country ID immediately followed by the year ID.
my.dhs.data$country.year.id2 <- paste0(my.dhs.data$country, my.dhs.data$year)

# Numeric version of the same key: the position of each text key among the
# sorted unique keys (i.e. its factor level code).
my.dhs.data$country.year.id <- as.numeric(factor(my.dhs.data$country.year.id2))

# Distinct numeric survey IDs. This object is not referenced again in this
# script; it is kept so that the set of workspace objects the script creates
# stays the same. Note that it shares its name with base::names().
names <- unique(my.dhs.data$country.year.id)

# One row per survey: the numeric survey ID ("country.year.id") and the number
# of rows, i.e. individuals, recorded for that survey ("numIND").
holder <- aggregate(
  x = list(numIND = my.dhs.data$country.year.id),
  by = list(country.year.id = my.dhs.data$country.year.id),
  FUN = length
)

# Attach the survey size to every individual. all.x = TRUE keeps every row of
# the DHS data. merge() sorts on the key by default, so row order may change.
my.dhs.data <- merge(my.dhs.data, holder, by = "country.year.id", all.x = TRUE)

# Proportional wealth rank of each individual within their own survey.

# wrank1: within-survey rank of the wealth factor score, running from 1 (lowest
# score) to N (highest score, i.e. the wealthiest individual in the survey).
# rank() gives tied scores their average rank.
my.dhs.data$wrank1 <- ave(
  my.dhs.data$wealth.factor,
  my.dhs.data$country.year.id,
  FUN = rank
)

# wrank3: wrank1 rescaled to a proportion p with 0 <= p < 1. Dividing by N maps
# the rank into (0, 1]; the extra factor (N - 1) / N keeps the top-ranked
# individual strictly below 1.
my.dhs.data$wrank3 <- local({
  n.ind <- my.dhs.data$numIND
  (my.dhs.data$wrank1 / n.ind) * (n.ind - 1) / n.ind
})

# Country-level wealth per capita, derived from GDP per capita.
# The coefficients 1.129 and -0.293 come from the regression equation
# attributed to Davies et al 2009 & Davies et al 2011.

# GDPpc may have been read in as a factor; converting through character makes
# the printed values, rather than the internal factor codes, become the numbers.
my.dhs.data$GDPpc <- as.numeric(as.character(my.dhs.data$GDPpc))

# lnPPPwpc: natural log of country wealth per capita (PPP, constant 2011
# international $).
# NOTE(review): the non-oil share (1 - OilRentCode / 100) multiplies log(GDPpc)
# here, not GDPpc itself -- confirm against Hruschka et al. 2015 that this is
# the intended oil-rent adjustment.
my.dhs.data$lnPPPwpc <- local({
  log.gdp <- log(my.dhs.data$GDPpc)
  non.oil.share <- 1 - my.dhs.data$OilRentCode / 100
  1.129 * log.gdp * non.oil.share - 0.293
})

# PPPwpc: country wealth per capita back on the dollar scale (PPP, constant
# 2011 international $).
my.dhs.data$PPPwpc <- exp(my.dhs.data$lnPPPwpc)


### Generating Absolute Wealth Estimates ###

#Note: As explained in Hruschka et al. 2015, we generate AWE using one of three distributions:
#   1. Pareto Distribution
#   2. Log-normal Distribution
#   3. Combined Distribution (i.e. weighted average of AWE from Pareto and Log-normal Distribution)


#1. Absolute Wealth Estimate - Pareto
#
# Each individual is placed at quantile p = wrank3 of a Pareto distribution
# parameterised from the country's Gini coefficient ("gini") and the country's
# wealth per capita ("PPPwpc").

# Generating ICDF
# Shape parameter (alpha). For a Pareto distribution the Gini coefficient is
# G = 1 / (2 * alpha - 1), which rearranges to alpha = (1 + G) / (2 * G).
my.dhs.data$shape <- (1 + my.dhs.data$gini) / (2 * my.dhs.data$gini)

# Threshold (minimum value, x_m) that gives the distribution a mean of PPPwpc.
# The Pareto mean is alpha * x_m / (alpha - 1), so x_m = (1 - 1 / alpha) * mean.
# The outer parentheses are required: without them R's operator precedence
# evaluates the expression as 1 - (PPPwpc / shape), which is not the threshold.
my.dhs.data$threshold <- (1 - 1 / my.dhs.data$shape) * my.dhs.data$PPPwpc

# ICDF (quantile function) of the Pareto distribution evaluated at each
# individual's proportional rank: x_m / (1 - p)^(1 / alpha).
my.dhs.data$icdf <- my.dhs.data$threshold / ((1 - my.dhs.data$wrank3)^(1 / my.dhs.data$shape))

# Generating mean ICDF within each survey (country-year). tapply() returns one
# value per survey, named by country.year.id2; as.data.frame() turns those names
# into row names and labels the single column "mean.icdf.country".
# mean() is called without na.rm, so one missing icdf makes a survey's mean NA.
mean.icdf.country <- tapply(my.dhs.data$icdf, my.dhs.data$country.year.id2, mean)
mean.icdf.country <- as.data.frame(mean.icdf.country)

# Merging with Data: adds the column "mean.icdf.country" (mean ICDF of the
# individual's survey) to my.dhs.data, matching on the survey key.
my.dhs.data <- merge(x = my.dhs.data, y = mean.icdf.country, by.x = "country.year.id2", by.y = "row.names", all.x = TRUE)

# Absolute Wealth Estimate for Household (in PPP 2011 International Constant Dollars).
# Each ICDF value is rescaled by PPPwpc / (survey mean ICDF). When gini and
# PPPwpc are constant within a survey, the threshold cancels in this ratio and
# the survey mean of wealthpc equals PPPwpc.
my.dhs.data$wealthpc <- my.dhs.data$icdf * my.dhs.data$PPPwpc / my.dhs.data$mean.icdf.country

#The new variable "wealthpc" represents the Pareto Absolute Wealth Estimate


#2. Absolute Wealth Estimate - Log-normal
#
# Each individual is placed at quantile p = wrank3 of a log-normal distribution
# parameterised from the country's Gini coefficient ("gini") and the country's
# wealth per capita ("PPPwpc").

# sigma: standard deviation on the log scale (sdlog). For a log-normal
# distribution G = 2 * pnorm(sigma / sqrt(2)) - 1; this is that relation solved
# for sigma.
my.dhs.data$sigma <- sqrt(2) * qnorm((my.dhs.data$gini + 1) / 2)

# mu: mean on the log scale (meanlog). A log-normal mean is
# exp(mu + sigma^2 / 2), so setting the mean to PPPwpc gives this mu.
my.dhs.data$mu <- log(my.dhs.data$PPPwpc) - my.dhs.data$sigma^2 / 2

# ICDF (quantile function) of the log-normal distribution evaluated at each
# individual's proportional rank.
my.dhs.data$icdfln <- qlnorm(
  my.dhs.data$wrank3,
  meanlog = my.dhs.data$mu,
  sdlog = my.dhs.data$sigma
)

# Mean ICDF within each survey (country-year), stored as a one-column data
# frame whose row names are the survey keys (country.year.id2) and whose column
# is named "mean.icdf.country.ln".
mean.icdf.country.ln <- local({
  survey.mean <- tapply(my.dhs.data$icdfln, my.dhs.data$country.year.id2, mean)
  data.frame(
    mean.icdf.country.ln = as.vector(survey.mean),
    row.names = names(survey.mean)
  )
})

# Attach the survey mean to every individual; this adds the column
# "mean.icdf.country.ln" to my.dhs.data.
my.dhs.data <- merge(
  my.dhs.data,
  mean.icdf.country.ln,
  by.x = "country.year.id2",
  by.y = "row.names",
  all.x = TRUE
)

# Household wealth per capita (PPP, 2011 international constant $): each ICDF
# value rescaled by PPPwpc / (survey mean ICDF).
my.dhs.data$wealthpcln <- local({
  scaled.icdf <- my.dhs.data$icdfln * my.dhs.data$PPPwpc
  scaled.icdf / my.dhs.data$mean.icdf.country.ln
})

# The new variable "wealthpcln" represents the Log-normal Absolute Wealth Estimate


#3. Combined Pareto and Log-normal Absolute Wealth Estimates
#
# The combined estimate is a weighted geometric mean of the two estimates:
#   Wealth(G) = (Pareto AWE)^G x (Log-normal AWE)^(1 - G),  where 0 <= G <= 1

# The gamma weight (G) used here is 0.32: the Pareto estimate gets exponent
# 0.32 and the log-normal estimate gets exponent 1 - 0.32.
my.dhs.data$wealthpcg32 <- local({
  g.weight <- 0.32
  (my.dhs.data$wealthpc^g.weight) * (my.dhs.data$wealthpcln^(1 - g.weight))
})

# The new variable "wealthpcg32" represents the Combined AWE with a gamma weight = 0.32


## Summary of Results ##

# The code above generates three Absolute Wealth Estimates (AWE):
#Pareto AWE = my.dhs.data$wealthpc
#Log-normal AWE = my.dhs.data$wealthpcln
#Combined AWE (gamma weight=0.32) = my.dhs.data$wealthpcg32

# Of these three estimates, Hruschka et al 2015 suggests the Combined AWE provides the closest match with World Bank poverty headcounts