### GETTING STARTED
library(survey)
# Change this to wherever your data are stored
hbai.data.dir <- "~/data/HBAI/HBAI1213/tab/"
# Load the source data
hbai <- read.delim(sprintf("%s%s",
                           hbai.data.dir,
                           "hbai1213_g4.tab"))
# Set up the survey design, using person weights
ppl.svy <- svydesign(id=~1,
                     data=hbai,
                     weight=~GS_NEWPP)

# Some alternative weights:
# GS_NEWCH - numbers of children
# GS_NEWAD - numbers of adults
# GS_NEWPN - numbers pensioners
# Numbers of households - include only main benefit unit
hh.svy <- svydesign(id=~1,
                       data=subset(hbai, BENUNIT==1),
                       weight=~GS_NEWHH)

### THE OVERALL DISTRIBUTION OF INCOME (cf Table 2b, p32)
# Mean before-housing-costs income - £535.52 / week
bhc.mean <- svymean(~S_OE_BHC, ppl.svy)

# Deciles of BHC income
bhc.deciles <- svyquantile(~S_OE_BHC, ppl.svy,
                           quantiles=seq(0.1, 0.9, 0.1) )
bhc.deciles

### CHARTING THE INCOME DISTRIBUTION (cf Chart 2.4/BHC, p28)
# Group incomes up to £1,000 in £10 bands
hbai$bhc.inc.10grp <- cut(hbai$S_OE_BHC,
                          seq(0,1000,by=10),
                          right=FALSE, include.lowest=TRUE)
# Since we have added variables, survey design must be respecified
ppl.svy <- svydesign(id=~1, data=hbai, weight=~GS_NEWPP)
# Counts of all people in each band
bands.freq <- svytable(~bhc.inc.10grp, ppl.svy)

pdf(file="income_dist-bhc.pdf",width=4.7,height=3)
# From here on, set up the data specifically for plotting,
# as per the chart in the HBAI report
# First make a data.frame for plotting
bands <- data.frame(ppl=bands.freq,
                    lower=seq(0,990,10),
                    upper=seq(10,1000,10)  )

# Round deciles to nearest 10 and assign each £10 band to one
deciles.rnd <- signif(bhc.deciles,2)
bands$decile <- sapply(bands$upper, function(b) sum(b>deciles.rnd) + 1)
# Locations of the numeric labels of each decile
dec.labels <- data.frame(dec=1:10,
                         x=(c(0,bhc.deciles) + c(bhc.deciles, 1000))/2)
# Actually create the plot
library(ggplot2)
ggplot(bands)+
    geom_rect(aes(xmin=lower, xmax=upper,
                  ymin=0, ymax=ppl.Freq/10^6,
                  fill=factor(decile%%2)) ) +
    scale_fill_brewer("",type="qual", palette="Paired", guide=FALSE) +
    scale_y_continuous("People (millions)",
                       limits=c(0,1.5),
                       expand=c(0,0)) +
    scale_x_continuous("Equivalised BHC income (£/week, £10 bands)",
                       limits=c(0,1050),
                       breaks=seq(0,1000,100),
                       expand=c(0,0)) +
    geom_text(data=dec.labels, aes(x=x, y=0.05, label=dec),
              colour="white", fontface="bold", size=10/14*5)
dev.off()

### WITHIN-GROUP POVERTY RATES (cf Table 3.5db)
# According to economic status of adults
# A dummy variable for counting whole group populations
hbai$all <- 1
# Whether (at-risk-of) poverty according to various poverty thresholds:
# 50%, 60%, and 70% of median AHC income
hbai$ahc.poor.50 <- ( hbai$S_OE_AHC < hbai$MDOEAHC * 0.5 )
hbai$ahc.poor.60 <- ( hbai$S_OE_AHC < hbai$MDOEAHC * 0.6 )
hbai$ahc.poor.70 <- ( hbai$S_OE_AHC < hbai$MDOEAHC * 0.7 )

# Set up some labels for economic status
econ.statuses <- c("1+ F/T self-employed",
                   "Single/Couple, all in F/T work",
                   "Couple, 1 F/T, 1 P/T",
                   "Couple, 1 F/T, 1 not working",
                   "No full time, 1+ P/T",
                   "Workless, 1+ aged 60+",
                   "Workless, 1+ unemployed",
                   "Workless, other inactive")
hbai$ad.ec.stat <- factor(econ.statuses[hbai$ECOBU],
                          levels=econ.statuses)
# Redefine the survey
ppl.svy <- svydesign(id=~1, data=hbai, weight=~GS_NEWPP)

# Calculate the proportion in low income for each threshold & group
eact.pov <- svyby(~ahc.poor.50+ahc.poor.60+ahc.poor.70,
                  ~ad.ec.stat,
                  design=ppl.svy,
                  svyratio,
                  denominator=~all)
# eact.pov

### BETWEEN-YEAR COMPARISON OF INCOME DISTRIBUTION
# Use the HBAI deflators to match the published results.

# The income deflators given in the HBAI User Guide.
all.years <- data.frame(
    year1=1994:2012,
    ahc.deflator=c(148.8, 153.0, 155.9, 159.0, 159.7,
        162.5, 161.8, 164.5, 166.8, 169.4,
        171.5, 174.5, 179.8, 184.6, 192.7,
        198.8, 209.7, 222.0, 229.5) )

# Give each year its proper label (e.g. "2012/13")
all.years$label <- sapply(all.years$year1,
                         function(yr)
                         sprintf("%s/%s",
                                 yr,
                                 substring(sprintf("%i", yr+1), 3, 4 ) ) )

# Find the corresponding data files for each year. There are slight
# variations in how they are named.
all.years$data.file <-
    sapply(all.years$year1, function(yr1)
           Sys.glob(sprintf("%shbai%s*.tab",
                            hbai.data.dir,
                            substring(sprintf("%i",yr1), 3, 4) ) ) )

# Read all the data files in one go and keep them in memory. This can be
# convenient, but uses more memory (≈700MB). Otherwise, read and process
# one file at a time.
all.years.data <- sapply(all.years$data.file, read.delim)

# Function to give quintile mid-points and mean of AHC income
yr.quints.and.mean <- function(yr.data) {
    svy.ppl <- svydesign(ids=~1,
                         weights=~GS_NEWPP,
                         data=yr.data)
    c(svyquantile(~S_OE_AHC, svy.ppl, seq(0.1, 0.9, 0.2)),
      svymean(~S_OE_AHC, svy.ppl))
}
# Get the nominal (in-year) values and means
nominal.qiles <- sapply(all.years.data, yr.quints.and.mean)

# Convert all our values to 2012/13 terms (index 229.5)
# = value / source year's index * target year's index
real.qiles <- apply(nominal.qiles, 1,
                    function(nom.val)
                    mapply(prod, nom.val, 1/all.years$ahc.def, 229.5) )

# Label and show the output
rownames(real.qiles) <- all.years$label
colnames(real.qiles) <- c(paste("Quintile",1:5), "Mean")
# real.qiles

pdf(file="gini-series.pdf",width=4.7,height=3)
### MEASURES OF INCOME INEQUALITY
# Needed to calculate Gini coefficient from weighted data
library(reldist)
# Calculate BHC and AHC gini (all-people-weighted)
gini.bhc <- sapply(all.years.data,
                   function(d) gini(d$S_OE_BHC, d$GS_NEWPP))
gini.ahc <- sapply(all.years.data,
                   function(d) gini(d$S_OE_AHC, d$GS_NEWPP))

# A data frame for plotting
ginis <- data.frame(year=all.years$label,
                    gini.bhc=gini.bhc,
                    gini.ahc=gini.ahc)
# In order to convert to "long" format
library(reshape2)

# Note that ggplot2 (rightly) does not allow the use of multiple y-axes,
# so we can't plot the 90/10 ratio on the same chart
ggplot(melt(ginis),aes(x=year, y=round(value*100),
                       colour=variable, group=variable)) +
    geom_path() +
    geom_point() +
    scale_y_continuous("Gini coefficient",
                       limits=c(30,42), breaks=seq(30,42,2)) +
    scale_colour_brewer("Income variable",
                        type="qual", palette="Paired") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
dev.off()

### ESTIMATING ERROR OF ESTIMATES
# Estimates of children and all people in poverty
ppl.svy <- svydesign(id=~1,
                     data=hbai,
                     weight=~GS_NEWPP)
ppl.pov <- svyratio(~ahc.poor.60, denominator=~all,
                    design=ppl.svy, level=0.95)
kid.svy <- svydesign(id=~1,
                     data=hbai,
                     weight=~GS_NEWCH)
kid.pov <- svyratio(~ahc.poor.60, denominator=~all,
                    design=kid.svy, level=0.95)

# The assumed design factor
des.fac <- 1.3
# The adjusted lower and uppper CIs for all individuals
ppl.pov$ratio - ( SE(ppl.pov) * des.fac )
ppl.pov$ratio + ( SE(ppl.pov) * des.fac )

# Table showing this
cis <- data.frame(estimate=c(
                      ppl.pov$ratio,
                      kid.pov$ratio),
                  ci.low=c(
                      ppl.pov$ratio - ( SE(ppl.pov) * des.fac ),
                      kid.pov$ratio - ( SE(kid.pov) * des.fac ) ),
                  ci.high=c(
                      ppl.pov$ratio + ( SE(ppl.pov) * des.fac ),
                      kid.pov$ratio + ( SE(kid.pov) * des.fac ) )
                  )
# Convert to percentages
cis <- cis * 100
rownames(cis) <- c("All individual", "Children")