Redistribution means rearranging values such that totals remain the same: the original distribution can be recovered by re-aggregating from a perfectly contained (nested) disaggregate.
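
As a tiny sketch of this property (hypothetical numbers): splitting a total proportionally and summing the parts recovers the original.

# hypothetical: split a total of 100 by weights, then re-aggregate
total <- 100
weights <- c(2, 5, 3)
(parts <- total * weights / sum(weights))
#> [1] 20 50 30
sum(parts) == total
#> [1] TRUE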

This may be desirable if your goal is simply to display the same data in different forms.

If, however, your goal is to produce values at scales you don't have observations of, it may be better to combine observations at different scales and use them to form new estimates.

In the case of estimation, there are many more methods to consider, and validating their results is less straightforward, so this article compares a few estimation and validation methods.

Data

We’ll focus on a single county (Arlington, VA), with census block groups and the Public Use Microdata Sample (PUMS) as our primary source data.

# directory to save data in
base_dir <- "../estimate_comparison"
dir.create(base_dir, FALSE)
# geographies
library(catchment)
geography_bg <- download_census_shapes(base_dir, "va", "bg", year = 2021)
geography_bg <- geography_bg[grep("^51013", geography_bg$GEOID), ]
geography_tr <- download_census_shapes(base_dir, "va", "tr", year = 2021)
geography_tr <- geography_tr[grep("^51013", geography_tr$GEOID), ]

# ACS data

## block groups
block_groups <- tidycensus::get_acs(
  year = 2021,
  state = "51",
  county = "013",
  geography = "block group",
  output = "wide",
  variables = c(
    total = "B01001_001",
    race_white = "B02001_002",
    race_black = "B02001_003",
    race_native = "B02001_004",
    race_asian = "B02001_005",
    sex_male = "B01001_002",
    sex_female = "B01001_026",
    tenure_total = "B25003_001",
    tenure_owner = "B25003_002",
    tenure_renter = "B25003_003",
    income_total = "B19001_001",
    income_lt10 = "B19001_002",
    income_10_15 = "B19001_003",
    income_15_20 = "B19001_004",
    income_20_25 = "B19001_005",
    income_25_30 = "B19001_006",
    income_30_35 = "B19001_007",
    income_35_40 = "B19001_008",
    income_40_45 = "B19001_009",
    income_45_50 = "B19001_010",
    income_50_60 = "B19001_011",
    income_60_75 = "B19001_012",
    income_75_100 = "B19001_013",
    income_100_125 = "B19001_014",
    income_125_150 = "B19001_015",
    income_150_200 = "B19001_016",
    income_gt200 = "B19001_017",
    avg_hhsize = "B25010_001",
    avg_hhsize_own = "B25010_002",
    avg_hhsize_rent = "B25010_003"
  )
)[, -2]
#> Getting data from the 2017-2021 5-year ACS
colnames(block_groups)[2] <- "Total"
block_groups$race_other <- block_groups$Total - rowSums(
  block_groups[, grep("^race_.*E$", colnames(block_groups))]
)
colnames(block_groups) <- sub("E$", "", colnames(block_groups))

### impute missing average household sizes
block_groups$avg_hhsize[is.na(block_groups$avg_hhsize)] <- mean(
  block_groups$avg_hhsize,
  na.rm = TRUE
)
su <- is.na(block_groups$avg_hhsize_own)
block_groups$avg_hhsize_own[su] <- rowMeans(
  block_groups[su, c("avg_hhsize", "avg_hhsize_rent")],
  na.rm = TRUE
)
block_groups[is.na(block_groups)] <- 0

### make larger income groups
income_vars <- grep("income", colnames(block_groups), value = TRUE)
block_groups$income_lt50 <- rowSums(
  block_groups[, grep("_(lt1|[1-4][05]_).*\\d$", income_vars, value = TRUE)]
)
block_groups$income_50_100 <- rowSums(
  block_groups[, grep("income_[5-7].*\\d$", income_vars, value = TRUE)]
)
block_groups$income_100_200 <- rowSums(
  block_groups[, grep("_1\\d{2}.*\\d$", income_vars, value = TRUE)]
)
income_vars_select <- c(
  "income_lt50", "income_50_100", "income_100_200", "income_gt200"
)

library(sf)
rownames(geography_bg) <- geography_bg$GEOID
st_geometry(block_groups) <- st_geometry(geography_bg[block_groups$GEOID, ])

# parcel data
parcel_file <- paste0(base_dir, "/parcels.rds")
if (!file.exists(parcel_file)) {
  parcels <- st_read(paste0(
    "https://arlgis.arlingtonva.us/arcgis/rest/",
    "services/Open_Data/od_MHUD_Polygons/",
    "FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=json"
  ))
  saveRDS(parcels, parcel_file, compress = "xz")
}
parcels <- readRDS(parcel_file)
colnames(parcels)[colnames(parcels) == "Total_Units"] <- "Total"
parcels$OBJECTID <- as.character(parcels$OBJECTID)
parcels <- parcels[parcels$Year_Built <= 2021, ]

Ultimately, we’ll want to use these methods to get values for unobserved geographies, but for this comparison of methods, we’ll want a way to test their results against known values.

For this test case, we can simply start from a higher geolevel and make estimates at the block group level. To do that, we’ll need to aggregate our block group data up to tracts:

tracts <- redistribute(block_groups, geography_tr)

Redistribution

As a baseline, the first method to consider is proportional redistribution of the block group summary estimates alone.

Direct

We could do this from tracts to block groups directly:

# going from tracts to block groups with no information
estimate_tr_to_bg <- redistribute(
  tracts, block_groups,
  source_id = "id"
)

# mean absolute error between total population estimates
mean(abs(estimate_tr_to_bg$Total - block_groups$Total))
#> [1] 319.8405

We can see, in this case, the error introduced when we have no distributional information at the lower level; block groups that once varied within a tract now have the same value:
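
As a quick numeric check (a sketch building on the estimates above): if each tract’s total really is split evenly, the redistributed totals should not vary within tracts.

# within-tract variance of the redistributed totals should be near 0
tapply(
  estimate_tr_to_bg$Total,
  substring(estimate_tr_to_bg$id, 1, 11),
  var
)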

Down and Up

Or we could go from tracts down to parcels, then back up to block groups:

# function to apply tract to parcel to block group redistribution
redist_downup <- function(
    top, bottom, middle, map_tb = NULL, map_bm = NULL,
    weight = "Total", source_id = "id",
    target_id = "OBJECTID", target_id_mid = "GEOID") {
  to_bottom <- redistribute(
    top, bottom, map_tb,
    weight = weight, source_id = source_id, target_id = target_id,
    default_value = 0
  )
  redistribute(
    to_bottom, middle, map_bm,
    source_id = "id", target_id = target_id_mid,
    default_value = 0
  )
}

# pre-make maps

## from block groups to parcels
map_bg_to_parcel <- redistribute(
  block_groups, parcels,
  target_id = "OBJECTID", return_map = TRUE, overlaps = "first"
)
parcels$geoid <- structure(
  rep(names(map_bg_to_parcel), vapply(map_bg_to_parcel, length, 0)),
  names = names(unlist(unname(map_bg_to_parcel)))
)[parcels$OBJECTID]
block_groups <- block_groups[block_groups$GEOID %in% parcels$geoid, ]

## from tracts to parcels
map_tr_to_parcel <- split(unlist(unname(map_bg_to_parcel)), rep(
  substring(names(map_bg_to_parcel), 1, 11), vapply(map_bg_to_parcel, length, 0)
))

# check that down from and up to the same level works
all.equal(
  as.data.frame(redist_downup(
    block_groups, parcels, block_groups, map_bg_to_parcel, map_bg_to_parcel,
    source_id = "GEOID"
  )[, -1]),
  as.data.frame(block_groups[, -1])
)
#> [1] TRUE

# redistribute
estimate_redist <- redist_downup(
  tracts, parcels, block_groups, map_tr_to_parcel, map_bg_to_parcel
)

# mean absolute error between total population estimates
mean(abs(estimate_redist$Total - block_groups$Total))
#> [1] 231.1852

Now there is more of the original variation between block groups:

Because the parcel totals are living units rather than individuals, we may be able to refine our redistribution with some heuristic estimates of the number of people per living unit:

# 2.5 for multi-unit parcels, 5.5 for single-unit parcels
parcels$Residents <- parcels$Total * (2.5 + (parcels$Total == 1) * 3)

# now see if this improved our redistribution
estimate_redist_adj <- redist_downup(
  tracts, parcels, block_groups, map_tr_to_parcel, map_bg_to_parcel,
  weight = "Residents"
)

# mean absolute error between total population estimates
mean(abs(estimate_redist_adj$Total - block_groups$Total))
#> [1] 194.0749

For a better sense of what this resident adjustment is doing, we can look at the difference made in a single tract:

# identify the most different tract between variants
tract <- substring(block_groups$GEOID[
  which.max(abs(estimate_redist_adj$Total - estimate_redist$Total))
], 1, 11)
su <- substring(estimate_redist$id, 1, 11) == tract
tsu <- tracts$id == tract
psu <- substring(parcels$geoid, 1, 11) == tract

# recalculate parcel-level values within that tract
differences <- list(
  parcels = redistribute(
    tracts$Total[tsu], parcels[psu, ],
    target_id = "OBJECTID", weight = "Residents", return_geometry = FALSE
  )[, 2] - redistribute(
    tracts$Total[tsu], parcels[psu, ],
    target_id = "OBJECTID", weight = "Total", return_geometry = FALSE
  )[, 2],
  block_group = estimate_redist_adj$Total[su] - estimate_redist$Total[su]
)

Now we can look at the difference in the contained parcels and block groups:

In this case, the difference mostly comes down to two buildings with many units that get less relative weight with the resident values.

PUMS + Raking

So far, we’ve used some additional information (total units) at the parcel level to improve the distribution of values across block groups, relative to a uniform distribution.

We have some more information at the parcel level, such as unit type and the number of renter- versus owner-occupied units, but to make use of it, we need some association between source values and those features. This is where the census microdata sample comes in: the Public Use Microdata Sample (PUMS) consists of individual-level observations that include both demographic and housing features.

pums <- download_census_pums(base_dir, "va", 2021, one_year = FALSE)
#>  loading household sample: h.csv.xz
#>  loading person sample: p.csv.xz
#>  loading crosswalk 2010_Census_Tract_to_2010_PUMA.txt

# prepare IDs
pums$crosswalk$GEOID <- do.call(paste0, pums$crosswalk[, 1:3])
pums$person$ID <- do.call(paste0, pums$person[, c("SERIALNO", "SPORDER")])

# prepare variables

## survey categories
pums$person$sex_cat <- c("sex_male", "sex_female")[as.numeric(pums$person$SEX)]

pums$person$race_cat <- paste0("race_", c(
  "white", "black", "native", rep("other", 2), "asian", rep("other", 3)
))[as.numeric(pums$person$RAC1P)]

pums$household$income_cat <- as.character(cut(
  pums$household$HINCP, c(-Inf, 50, 100, 200, Inf) * 1e3, income_vars_select,
  right = FALSE
))

## unit categories
pums$household <- pums$household[
  !is.na(pums$household$BLD) & !is.na(pums$household$TEN),
]
pums$household$building_type <- "MULTI"
pums$household$building_type[pums$household$BLD == "02"] <- "SFD"
pums$household$building_type[pums$household$BLD == "03"] <- "SFA"

pums$household$status <- "RENTER"
pums$household$status[pums$household$TEN %in% 1:2] <- "OWNER"

The PUMS responses are only located down to Public Use Microdata Areas (PUMAs), which are made up of tracts, so this is where we can start for a baseline.
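
For reference, the included crosswalk links each tract to its containing PUMA; a quick look (assuming the crosswalk loads as a standard data frame):

# the crosswalk pairs tract GEOIDs with PUMA codes
head(pums$crosswalk[, c("GEOID", "PUMA5CE")])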

We will use associations between demographic and housing features within each PUMA to get probabilities at the parcel level, then redistribute our source values to them like before.
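
At its core, this comes down to weighted category proportions; for instance, a sketch of the kind of calculation the estimation functions below perform:

# weighted proportion of each race category among person records
props <- tapply(pums$person$PWGTP, pums$person$race_cat, sum) /
  sum(pums$person$PWGTP)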

The first step to this end is to prepare our data so it is easy to associate the individual PUMS variables with their per-level summaries at the tract level:

# define variables of interest
vars_house <- c(
  ID = "SERIALNO", weight = "WGTP", type = "building_type",
  status = "status", income = "income_cat"
)
vars_person <- c(
  ID = "ID", weight = "PWGTP", sex_cat = "sex_cat", race_cat = "race_cat"
)
vars_units <- c(
  type = "Unit_Type", renters = "Renter_Occupied", owners = "Owner_Occupied"
)

## get their levels
vars_list <- c(
  lapply(vars_person[-(1:2)], function(var) unique(pums$person[[var]])),
  income_cat = list(unique(pums$household$income_cat))
)
return_vars <- names(vars_list)
pvars <- unlist(vars_list, use.names = FALSE)
vars <- c(pvars, "avg_hhsize_own", "avg_hhsize_rent")

# prepare datasets split into PUMAs
pumas_focal <- unique(pums$crosswalk$PUMA5CE[pums$crosswalk$GEOID %in% tracts$id])
data <- lapply(structure(pumas_focal, names = pumas_focal), function(puma) {
  households <- as.data.frame(pums$household[pums$household$PUMA == puma, vars_house])
  rownames(households) <- households$SERIALNO
  ids <- pums$crosswalk$GEOID[
    pums$crosswalk$PUMA5CE == puma & pums$crosswalk$GEOID %in% tracts$id
  ]
  person <- pums$person[pums$person$SERIALNO %in% households$SERIALNO, ]
  list(
    individuals = cbind(households[person$SERIALNO, ], person),
    source = tracts[tracts$id %in% ids, c("id", vars), drop = TRUE],
    parcels = parcels[
      substring(parcels$geoid, 1, 11) %in% ids, c("geoid", "OBJECTID", vars_units),
      drop = TRUE
    ]
  )
})

# function to apply an estimation method to the prepared data;
# `method` takes tract-level `source` totals, parcel `target`s, and
# PUMS `individuals`, and returns parcel-level estimates
apply_method <- function(data, method, parcel_only = FALSE, rescale = TRUE) {
  p <- do.call(rbind, lapply(data, function(set) {
    set$source <- st_drop_geometry(set$source)
    do.call(rbind, lapply(unique(set$source$id), function(id) {
      source <- set$source[set$source$id == id, -1]
      target <- set$parcels[if ("map" %in% names(set)) {
        if ("geography" %in% names(set)) {
          set$parcels$geoid %in% set$map[[id]]
        } else {
          set$parcels$geoid == id
        }
      } else {
        substring(set$parcels$geoid, 1, 11) == id
      }, ]
      if (nrow(target)) {
        est <- if (sum(source)) method(source, target[, -1], set$individuals) else NULL
        if (length(est)) {
          if (rescale) {
            totals <- colSums(est)
            totals[!totals] <- 1
            est <- sweep(est, 2, totals, "/") * rep(
              as.numeric(source[, colnames(est)]),
              each = nrow(est)
            )
          }
          su <- !pvars %in% colnames(est)
          if (any(su)) est[, pvars[su]] <- 0
          cbind(target[, 1:2], est[as.character(target$OBJECTID), pvars])
        } else {
          est <- target[, 1:2]
          est[, pvars] <- 0
          est
        }
      }
    }))
  }))
  if (parcel_only) {
    p
  } else {
    list(parcels = p, block_groups = redistribute(
      p[, -1], block_groups, map_bg_to_parcel,
      source_id = "OBJECTID", target_id = "GEOID", default_value = 0
    ))
  }
}
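
The rescaling step inside apply_method normalizes each estimate column, then multiplies by the corresponding source total; a minimal sketch with made-up numbers:

# two parcels, two categories with source totals of 100 and 60
est <- matrix(c(1, 3, 2, 2), 2, dimnames = list(NULL, c("a", "b")))
sweep(est, 2, colSums(est), "/") * rep(c(100, 60), each = 2)
#>       a  b
#> [1,] 25 30
#> [2,] 75 30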

# function to calculate estimates from weights; note that it reads
# `source`, `target`, and `individuals` from the environment assigned
# to it by each method (via `environment(make_estimates) <- environment()`)
make_estimates <- function() {
  do.call(rbind, lapply(unique(target$Unit_Type), function(type) {
    d <- target[
      target$Unit_Type == type, c("OBJECTID", "Renter_Occupied", "Owner_Occupied")
    ]
    d$Renter_Occupied <- d$Renter_Occupied * source$avg_hhsize_rent
    d$Owner_Occupied <- d$Owner_Occupied * source$avg_hhsize_own
    nd <- nrow(d)
    colnames(d)[-1] <- c("RENTER", "OWNER")
    i <- individuals[
      individuals$building_type == type, c("weight", "status", return_vars)
    ]
    as.data.frame(Reduce("+", lapply(
      (if (is.factor(i$status)) levels else unique)(i$status),
      function(s) {
        ii <- i[i$status == s, ]
        weight_total <- sum(ii$weight)
        do.call(cbind, lapply(return_vars, function(cat) {
          props <- tapply(ii$weight, ii[[cat]], sum) / weight_total
          props[vars_list[[cat]][!vars_list[[cat]] %in% names(props)]] <- 0
          props[is.na(props)] <- 0
          props <- props[vars_list[[cat]]]
          matrix(
            d[[s]] * rep(props, each = nd),
            nd,
            dimnames = list(d$OBJECTID, names(props))
          )
        }))
      }
    )))
  }))
}

Now we can apply each method to this same set of data.

Before moving on to the new methods, we can also update our heuristic adjustment to the proportional redistribution method with PUMS information:

# make single-family-home indicators
pums$household$single_family_home <- pums$household$building_type != "MULTI"
parcels$single_family_home <- parcels$Unit_Type != "MULTI"

# update resident estimates with average household size for
# single family versus other home types within the county
pums$household$size <- table(pums$person$SERIALNO)[pums$household$SERIALNO]
households <- pums$household[pums$household$PUMA %in% pumas_focal, ]
parcels$Residents <- parcels$Total * tapply(
  households$size, households$single_family_home, mean
)[as.character(parcels$single_family_home)]

# recalculate the adjusted redistribution results
estimate_redist_adj <- redist_downup(
  tracts, parcels, block_groups, map_tr_to_parcel, map_bg_to_parcel,
  weight = "Residents"
)
mean(abs(estimate_redist_adj$Total - block_groups$Total))
#> [1] 201.2044

Initial Weights

As a baseline, we can calculate a set of proportions for each PUMA, then apply that to the parcels in each contained tract:

# function to apply the method:
# - source: the tract-level totals
# - target: the parcels
# - individuals: the individual PUM responses
est_baseline <- function(source, target, individuals) {
  individuals$weight <- individuals$PWGTP
  environment(make_estimates) <- environment()
  make_estimates()
}

# apply across PUMAs and tracts within them
# and aggregate to block groups
estimate_baseline <- apply_method(data, est_baseline)

Because this method does not make estimates that recover their initial values, we have to rescale them to be comparable. For illustration:

# adjust tract counts so they match
# unit totals * average household size across parcels
tr_adj <- st_drop_geometry(tracts[tracts$id %in% pums$crosswalk$GEOID, ])
pid <- substring(parcels$geoid, 1, 11)
parcels$renter_est <- parcels$Renter_Occupied * structure(
  tracts$avg_hhsize_rent,
  names = tracts$id
)[pid]
parcels$owner_est <- parcels$Owner_Occupied * structure(
  tracts$avg_hhsize_own,
  names = tracts$id
)[pid]
parcels$resident_est <- parcels$renter_est + parcels$owner_est
tr_adj$Total <- tapply(parcels$resident_est, pid, sum)[tr_adj$id]

for (g in vars_list) {
  tr_adj[, g] <- tr_adj[, g] / rowSums(tr_adj[, g]) * tr_adj$Total
}
tr_adj[is.na(tr_adj)] <- 0

# make baseline parcel-level estimates without rescaling
data_adj <- data
data_adj[[1]]$source <- tr_adj[tr_adj$id %in% data[[1]]$source$id, c("id", vars)]
data_adj[[2]]$source <- tr_adj[tr_adj$id %in% data[[2]]$source$id, c("id", vars)]
parcel_adj <- apply_method(
  data_adj, est_baseline,
  rescale = FALSE, parcel_only = TRUE
)

# aggregate back up to tracts and measure per-category error
tr_est <- redistribute(
  parcel_adj, tr_adj, map_tr_to_parcel,
  source_id = "OBJECTID", target_id = "id", default_value = 0
)
data.frame(MAE = round(vapply(
  pvars, function(var) mean(abs(tr_est[[var]] - tr_adj[[var]])), 0
), 5))
#>                       MAE
#> sex_female      304.75734
#> sex_male        304.75734
#> race_black      574.76484
#> race_white     1093.99980
#> race_other      698.47523
#> race_asian      471.04857
#> race_native      45.12699
#> income_lt50     840.36931
#> income_gt200    767.20051
#> income_100_200  904.24289
#> income_50_100   704.13201

## this method does recover the parcel-level totals and overall totals
g <- c("sex_female", "sex_male")
all(abs(rowSums(parcel_adj[, g]) - structure(
  parcels$resident_est,
  names = parcels$OBJECTID
)[parcel_adj$OBJECTID]) < 1e-12)
#> [1] TRUE
c(
  original = sum(
    rbind(data_adj[[1]]$source[, g], data_adj[[2]]$source[, g])
  ),
  parcel = sum(parcel_adj[, g]),
  tract = sum(tr_est[, g])
)
#> original   parcel    tract 
#> 691326.4 691326.4 691326.4

# rescaling makes the totals line up
parcel_adj_rescaled <- apply_method(
  data_adj, est_baseline,
  parcel_only = TRUE
)
tr_est_rescaled <- redistribute(
  parcel_adj_rescaled, tr_adj, map_tr_to_parcel,
  source_id = "OBJECTID", target_id = "id", default_value = 0
)
data.frame(MAE = round(vapply(
  pvars, function(var) mean(abs(tr_est_rescaled[[var]] - tr_adj[[var]])), 0
), 5))
#>                MAE
#> sex_female       0
#> sex_male         0
#> race_black       0
#> race_white       0
#> race_other       0
#> race_asian       0
#> race_native      0
#> income_lt50      0
#> income_gt200     0
#> income_100_200   0
#> income_50_100    0

# just like the redistribution methods
tr_redist <- redist_downup(
  tr_adj, parcels, tr_adj, map_tr_to_parcel, map_tr_to_parcel
)
data.frame(MAE = round(vapply(
  pvars, function(var) mean(abs(tr_redist[[var]] - tr_adj[[var]])), 0
), 5))
#>                MAE
#> sex_female       0
#> sex_male         0
#> race_black       0
#> race_white       0
#> race_other       0
#> race_asian       0
#> race_native      0
#> income_lt50      0
#> income_gt200     0
#> income_100_200   0
#> income_50_100    0

Raking

In the baseline, we used the PUMS weights to calculate proportions, but those weights are meant to bring totals in line with PUMA-level estimates; since we have lower-level information in the form of tract-level totals, we can adjust the weights to match each smaller area's summaries:
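
Raking iterates over the control variables, each time scaling the weights so that the weighted category proportions match the targets; a minimal sketch of a single pass over one variable (hypothetical values):

# four cases in two categories, with target proportions of .75 / .25
w <- c(1, 1, 1, 1)
categories <- c("a", "a", "b", "b")
target <- c(a = .75, b = .25)
(w <- w * (target / tapply(w, categories, sum) * sum(w))[categories])
#>   a   a   b   b 
#> 1.5 1.5 0.5 0.5
tapply(w, categories, sum) / sum(w)
#>    a    b 
#> 0.75 0.25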

library(anesrake)
# helper to prepare `totals` and `individuals` in the calling frame:
# aligns factor levels, drops empty categories, and converts the
# target totals to proportions
rake_prep <- function() {
  eval(expression({
    for (var in names(totals)) {
      individuals[[var]] <- as.factor(individuals[[var]])
      l <- levels(individuals[[var]])
      totals[[var]] <- totals[[var]][l]
      su <- is.na(totals[[var]]) | totals[[var]] == 0
      if (sum(su)) {
        individuals <- individuals[!individuals[[var]] %in% l[su], ]
        individuals[[var]] <- droplevels(individuals[[var]])
        totals[[var]] <- totals[[var]][!su]
      }
      total <- sum(totals[[var]])
      if (total) totals[[var]] <- totals[[var]] / total
    }
    totals <- Filter(length, totals)
  }), parent.frame())
}
est_rake <- function(source, target, individuals) {
  totals <- lapply(
    structure(names(vars_list)[1:2], names = names(vars_list)[1:2]),
    function(n) unlist(source[, vars_list[[n]]])
  )
  rake_prep()
  individuals$status <- as.factor(individuals$status)
  individuals$building_type <- as.factor(individuals$building_type)
  individuals$income_cat <- as.factor(individuals$income_cat)
  capture.output(individuals$weight <- tryCatch(
    anesrake(
      totals, individuals, individuals$ID, individuals$PWGTP,
      pctlim = .001
    )$weightvec[as.character(individuals$ID)],
    error = function(e) {
      warning(e$message)
      individuals$PWGTP
    }
  ))
  environment(make_estimates) <- environment()
  make_estimates()
}
estimate_rake <- apply_method(data, est_rake)

Two Step Raking

In the initial raking approach, we only considered the person-level variables. We also have household-level information, so we might account for it by first adjusting the household weights, then using those adjusted weights to initialize the person-level weight adjustment:
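
The link between the two steps is the household adjustment factor (raked over original household weight), which scales each member's person weight; a self-contained sketch with hypothetical records:

# person records from two hypothetical households
people <- data.frame(
  SERIALNO = c("100", "100", "101"), WGTP = c(10, 10, 10), PWGTP = c(9, 11, 10)
)
hh_raked <- c("100" = 12, "101" = 8) # raked household weights by SERIALNO
# scale person weights by their household's adjustment factor
people$PWGTP * hh_raked[people$SERIALNO] / people$WGTP
#>  100  100  101 
#> 10.8 13.2  8.0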

est_twostep <- function(source, target, individuals) {
  all <- individuals
  individuals <- individuals[!duplicated(individuals$SERIALNO), ]
  # step 1: initial weights at household level
  totals <- list(
    income_cat = unlist(source[, grep("^income_", colnames(source))]),
    status = structure(
      colSums(target[, vars_units[-1]]),
      names = c("RENTER", "OWNER")
    ),
    building_type = tapply(rowSums(target[, vars_units[-1]]), target$Unit_Type, sum)
  )
  rake_prep()
  capture.output(initial <- tryCatch(
    anesrake(
      totals, individuals, individuals$SERIALNO, individuals$WGTP,
      pctlim = .001
    )$weightvec,
    error = function(e) {
      warning(e$message)
      structure(individuals$WGTP, names = individuals$SERIALNO)
    }
  ))
  all$weight <- initial[as.character(all$SERIALNO)]
  all <- all[!is.na(all$weight), ]
  individuals <- all
  # step 2: adjust at person level
  totals <- lapply(
    structure(names(vars_list)[1:2], names = names(vars_list)[1:2]),
    function(n) unlist(source[, vars_list[[n]]])
  )
  rake_prep()
  initial <- individuals$weight
  capture.output(individuals$weight <- tryCatch(
    anesrake(
      totals, individuals, individuals$ID,
      individuals$PWGTP * initial / individuals$WGTP,
      pctlim = .001
    )$weightvec[as.character(individuals$ID)],
    error = function(e) {
      warning(e$message)
      individuals$PWGTP * initial / individuals$WGTP
    }
  ))
  environment(make_estimates) <- environment()
  make_estimates()
}
estimate_twostep <- apply_method(data, est_twostep)

N Step Raking

We can take the two-step method further by repeating the process to better fit both the household and individual data:

est_nstep <- function(source, target, individuals) {
  individuals$weight <- individuals$PWGTP
  all <- individuals
  iter <- function() {
    individuals <- individuals[!duplicated(individuals$SERIALNO), ]
    # step 1: initial weights at household level
    totals <- list(
      income_cat = unlist(source[, grep("^income_", colnames(source))]),
      status = structure(
        colSums(target[, vars_units[-1]]),
        names = c("RENTER", "OWNER")
      ),
      building_type = tapply(rowSums(target[, vars_units[-1]]), target$Unit_Type, sum)
    )
    rake_prep()
    household_totals <- totals
    capture.output(initial <- tryCatch(
      anesrake(
        totals, individuals, individuals$SERIALNO,
        individuals$weight / individuals$PWGTP * individuals$WGTP,
        pctlim = .001
      )$weightvec,
      error = function(e) {
        warning(e$message)
        structure(
          individuals$weight / individuals$PWGTP * individuals$WGTP,
          names = individuals$SERIALNO
        )
      }
    ))
    all$weight <- initial[as.character(all$SERIALNO)]
    all <- all[!is.na(all$weight), ]
    individuals <- all
    # step 2: adjust at person level
    totals <- lapply(
      structure(names(vars_list)[1:2], names = names(vars_list)[1:2]),
      function(n) unlist(source[, vars_list[[n]]])
    )
    rake_prep()
    initial <- individuals$weight
    capture.output(individuals$weight <- tryCatch(
      anesrake(
        totals, individuals, individuals$ID,
        individuals$PWGTP * initial / individuals$WGTP,
        pctlim = .001
      )$weightvec[as.character(individuals$ID)],
      error = function(e) {
        warning(e$message)
        individuals$PWGTP * initial / individuals$WGTP
      }
    ))
    hh <- individuals[!duplicated(individuals$SERIALNO), ]
    hh$weight[hh$weight == 0] <- 1
    weight <- hh$PWGTP / hh$weight * hh$WGTP
    list(
      error = mean(vapply(names(household_totals), function(v) {
        mean((household_totals[[v]] - tapply(weight, hh[[v]], sum) / sum(weight))^2)
      }, 0), na.rm = TRUE),
      data = individuals
    )
  }
  previous_error <- 0
  for (i in 1:20) {
    step <- iter()
    individuals <- step$data
    if (step$error < .001 || abs(previous_error - step$error) < .0005) break
    previous_error <- step$error
  }
  environment(make_estimates) <- environment()
  make_estimates()
}
estimate_nstep <- apply_method(data, est_nstep)

Generalized Raking

The two-step approach still mainly focuses on aligning the person-level weights: the household totals only match after the first step, and are not considered in the second. As an alternative, we might try to match the household- and person-level totals at the same time with a generalized raking approach:
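
mlfit takes these targets as control tables that pair category levels with target counts; an individual-level control might look like this (hypothetical counts):

# hypothetical individual-level control table for ml_problem
data.frame(sex_cat = c("sex_female", "sex_male"), count = c(510, 490))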

library(mlfit)
est_grake <- function(source, target, individuals) {
  totals <- list(
    person = sum(individuals$PWGTP),
    household = sum(individuals$WGTP[!duplicated(individuals$SERIALNO)])
  )
  person <- lapply(names(vars_list)[1:2], function(n) {
    l <- vars_list[[n]]
    count <- as.numeric(source[, l, drop = TRUE])
    count[!count] <- 1
    count <- count / sum(count) * totals$person
    s <- data.frame(level = l, count = count)
    colnames(s)[1] <- n
    s
  })
  household <- unique(individuals$building_type)
  household <- structure(numeric(length(household)), names = household)
  observed_types <- tapply(
    rowSums(target[, vars_units[-1]]), target$Unit_Type, sum
  )
  household[names(observed_types)] <- observed_types
  household <- list(data.frame(
    building_type = names(household), count = household
  ))
  household[[1]]$count[is.na(household[[1]]$count)] <- 1
  household[[2]] <- data.frame(
    status = c("RENTER", "OWNER"),
    count = colSums(target[, c("Renter_Occupied", "Owner_Occupied")])
  )
  household[[3]] <- unlist(source[, grep("^income_", colnames(source))])
  household[[3]] <- data.frame(
    income_cat = names(household[[3]]), count = household[[3]]
  )
  names(household) <- c("building_type", "status", "income_cat")
  household <- lapply(names(household), function(var) {
    l <- household[[var]]
    l <- l[l[[1]] %in% unique(individuals[, var]), ]
    l[[2]][!l[[2]]] <- 1
    l[[2]] <- l[[2]] / sum(l[[2]]) * totals$household
    l
  })
  m <- ml_problem(
    individuals,
    field_names = special_field_names("SERIALNO", "ID", count = "count"),
    group_controls = household,
    individual_controls = person,
    prior_weights = individuals$WGTP
  )
  fm <- tryCatch(
    suppressWarnings(ml_fit_dss(m, ginv = solve)),
    error = function(e) NULL
  )
  if (!isTRUE(fm$success)) {
    fm <- tryCatch(
      suppressWarnings(ml_fit_ipu(m)),
      error = function(e) NULL
    )
  }
  individuals$weight <- if (is.null(fm$weight)) {
    individuals$PWGTP
  } else {
    fm$weight
  }
  environment(make_estimates) <- environment()
  make_estimates()
}
estimate_grake <- apply_method(data, est_grake)

Comparisons

Tract Aggregates

Now we can compare all of the considered methods within each of the variables we used to inform the raking approaches:

error <- do.call(rbind, lapply(structure(pvars, names = pvars), function(var) {
  d <- cbind(
    Prop = estimate_redist[[var]],
    "Prop Adj" = estimate_redist_adj[[var]],
    Baseline = estimate_baseline$block_groups[[var]],
    Rake = estimate_rake$block_groups[[var]],
    "Rake 2 Step" = estimate_twostep$block_groups[[var]],
    "Rake N Step" = estimate_nstep$block_groups[[var]],
    "Rake General" = estimate_grake$block_groups[[var]]
  )
  colMeans(abs(d - block_groups[[var]]))
}))
error <- rbind(error, Average = colMeans(error))
kable(error, digits = 3)
|                | Prop    | Prop Adj | Baseline | Rake    | Rake 2 Step | Rake N Step | Rake General |
|:---------------|--------:|---------:|---------:|--------:|------------:|------------:|-------------:|
| sex_female     | 121.352 | 105.943  | 219.286  | 219.348 | 219.254     | 218.665     | 218.654      |
| sex_male       | 128.929 | 117.032  | 233.955  | 234.025 | 233.991     | 234.246     | 234.482      |
| race_black     | 61.684  | 61.584   | 72.059   | 71.940  | 72.473      | 72.777      | 71.546       |
| race_white     | 166.633 | 145.178  | 291.699  | 292.624 | 289.364     | 288.859     | 292.544      |
| race_other     | 76.493  | 75.017   | 91.940   | 91.577  | 91.380      | 92.491      | 90.631       |
| race_asian     | 55.791  | 56.351   | 75.142   | 75.073  | 74.722      | 74.077      | 75.122       |
| race_native    | 5.577   | 5.663    | 5.135    | 5.218   | 5.085       | 4.998       | 5.138        |
| income_lt50    | 38.820  | 40.608   | 51.486   | 51.392  | 51.004      | 51.024      | 51.648       |
| income_gt200   | 50.340  | 45.409   | 59.663   | 59.603  | 59.930      | 59.950      | 60.083       |
| income_100_200 | 47.578  | 46.870   | 92.734   | 92.493  | 91.819      | 91.752      | 92.352       |
| income_50_100  | 39.530  | 41.883   | 63.116   | 63.111  | 62.817      | 62.952      | 62.810       |
| Average        | 72.066  | 67.412   | 114.201  | 114.219 | 113.804     | 113.799     | 114.092      |

Full results are presented on a data site: redistribution_arlington

Alternate Geolevels

So far, we’ve used the same geolevel-based test case to assess our methods, but this choice may have an effect in itself. As an alternative, we might abandon canonical tracts in favor of randomly agglomerated block groups.

# randomly combine adjacent block groups into pairs (where possible),
# applied within each PUMA
apply_altgeo <- function(run = NULL, return_full = FALSE) {
  vdata <- lapply(
    structure(pumas_focal, names = pumas_focal),
    function(puma) {
      d <- data[[puma]]
      su <- substring(block_groups$GEOID, 1, 11) %in% d$source$id
      vtr <- make_parent_polygons(
        block_groups[su, ], "GEOID",
        n_as_min = TRUE, verbose = FALSE
      )
      names(vtr$new) <- names(vtr$map) <- paste0(
        puma, "_", seq_along(vtr$new)
      )
      d$geography <- vtr$new
      d$map <- vtr$map
      d$source <- redistribute(
        block_groups[su, c("GEOID", vars)], vtr$new, vtr$map,
        default_value = 0
      )
      d
    }
  )
  vsource <- do.call(rbind, lapply(vdata, "[[", "source"))
  vmap <- redistribute(
    vsource, parcels,
    source_id = "id", target_id = "OBJECTID", return_map = TRUE
  )
  vestimates <- list(
    Prop = list(block_groups = redist_downup(
      vsource, parcels, block_groups, vmap, map_bg_to_parcel
    )),
    "Prop Adj" = list(block_groups = redist_downup(
      vsource, parcels, block_groups, vmap, map_bg_to_parcel,
      weight = "Residents"
    )),
    Baseline = apply_method(vdata, est_baseline),
    Rake = apply_method(vdata, est_rake),
    "Rake 2 Step" = apply_method(vdata, est_twostep),
    "Rake N Step" = apply_method(vdata, est_nstep),
    "Rake General" = apply_method(vdata, est_grake)
  )
  verror <- do.call(rbind, lapply(
    structure(pvars, names = pvars),
    function(var) {
      colMeans(abs(as.data.frame(
        lapply(vestimates, function(d) d$block_groups[[var]]),
        check.names = FALSE
      ) - block_groups[[var]]))
    }
  ))
  verror <- rbind(verror, Average = colMeans(verror))
  if (return_full) {
    list(
      data = data, source = vsource, map = vmap,
      estimates = vestimates, error = verror
    )
  } else {
    verror
  }
}

estimates_altgeo <- apply_altgeo(return_full = TRUE)
kable(estimates_altgeo$error, digits = 3)
|                | Prop    | Prop Adj | Baseline | Rake    | Rake 2 Step | Rake N Step | Rake General |
|:---------------|--------:|---------:|---------:|--------:|------------:|------------:|-------------:|
| sex_female     | 215.937 | 200.870  | 216.167  | 216.578 | 216.313     | 215.513     | 216.289      |
| sex_male       | 236.555 | 219.472  | 241.522  | 241.517 | 241.489     | 241.820     | 241.303      |
| race_black     | 72.468  | 71.878   | 69.874   | 69.645  | 69.612      | 70.061      | 69.561       |
| race_white     | 312.564 | 288.576  | 300.305  | 303.951 | 301.975     | 301.493     | 305.405      |
| race_other     | 86.287  | 85.025   | 88.380   | 87.935  | 88.173      | 88.312      | 88.221       |
| race_asian     | 65.491  | 65.548   | 67.677   | 67.883  | 67.587      | 67.091      | 67.628       |
| race_native    | 4.689   | 4.794    | 4.534    | 4.734   | 4.475       | 4.441       | 4.386        |
| income_lt50    | 48.612  | 51.038   | 46.041   | 45.849  | 45.804      | 45.741      | 46.130       |
| income_gt200   | 63.435  | 59.583   | 55.054   | 55.228  | 55.047      | 54.905      | 54.798       |
| income_100_200 | 86.937  | 87.857   | 89.694   | 89.708  | 89.437      | 89.174      | 89.610       |
| income_50_100  | 62.409  | 64.883   | 60.513   | 60.631  | 60.241      | 60.246      | 60.339       |
| Average        | 114.126 | 109.048  | 112.705  | 113.060 | 112.741     | 112.618     | 113.061      |

We can also take a look at the virtual tracts that were created in this run:

Since these virtual tracts are randomly created, we might also look at results across a few runs for a sense of variation:

# since this can be quite long-running,
# we'll save and reload results
results_altgeo_file <- paste0(base_dir, "/results_altgeo.rds")
if (file.exists(results_altgeo_file)) {
  results_altgeo <- readRDS(results_altgeo_file)
} else {
  results_altgeo <- lapply(1:50, apply_altgeo)
  saveRDS(results_altgeo, results_altgeo_file)
}

kable(Reduce("+", results_altgeo) / length(results_altgeo), digits = 3)
|                | Prop    | Prop Adj | Baseline | Rake    | Rake 2 Step | Rake N Step | Rake General |
|:---------------|--------:|---------:|---------:|--------:|------------:|------------:|-------------:|
| sex_female     | 113.199 | 100.320  | 110.849  | 111.066 | 111.120     | 110.519     | 110.812      |
| sex_male       | 131.315 | 117.479  | 129.066  | 128.938 | 128.895     | 129.342     | 128.710      |
| race_black     | 61.055  | 61.697   | 57.506   | 57.528  | 57.178      | 57.315      | 57.711       |
| race_white     | 172.615 | 148.385  | 155.580  | 157.644 | 154.924     | 154.485     | 158.939      |
| race_other     | 74.940  | 74.262   | 75.171   | 74.803  | 74.689      | 74.914      | 75.175       |
| race_asian     | 49.191  | 50.147   | 49.037   | 49.377  | 49.214      | 48.922      | 49.299       |
| race_native    | 4.810   | 4.896    | 4.608    | 4.816   | 4.553       | 4.534       | 4.658        |
| income_lt50    | 36.104  | 38.230   | 32.539   | 32.607  | 32.378      | 32.395      | 33.134       |
| income_gt200   | 48.107  | 43.420   | 37.323   | 37.354  | 37.422      | 37.540      | 37.199       |
| income_100_200 | 47.601  | 48.775   | 46.230   | 46.253  | 45.911      | 45.816      | 45.995       |
| income_50_100  | 36.142  | 38.199   | 34.363   | 34.607  | 34.254      | 34.301      | 34.397       |
| Average        | 70.462  | 65.983   | 66.570   | 66.818  | 66.412      | 66.371      | 66.912       |

Simulated Populations

So far, we’ve used the same data at different initial geolevels as a way to introduce error while still having a known target to measure error against. This has the benefit of retaining some properties of the data that affect our methods in unknown ways, but the downside is that we cannot measure error very directly, and we are testing against estimates with their own error.

An alternative is to simulate the initial population, then draw samples from that population to form simulated versions of the data we have available (a sparse PUMS-like sample, and less sparse block group summaries). Then, instead of having to go down to parcels and back up to artificially unknown intermediate layers (block groups) to measure error, we can measure directly at the parcel level.

# simulating the whole population is pretty intensive,
# so we'll just do that once, then draw samples from it
population_file <- paste0(base_dir, "/simulated_population.rds")
if (file.exists(population_file)) {
  population <- readRDS(population_file)
} else {
  population <- generate_population(
    sum(parcels$Total), parcels, "Total", "OBJECTID"
  )
  population$households$SERIALNO <- population$households$household
  population$individuals$ID <- population$individuals$person

  # make categories

  ## household
  population$households$building_type <- c(
    "MULTI", "SFD", "SFA"
  )[population$households$building_type]
  population$households$status <- c("OWNER", "RENTER")[
    population$households$renting + 1
  ]
  population$households$Total <- structure(
    population$regions$capacity,
    names = population$regions$id
  )[population$households$region]
  population$households$head_income <- population$households$head_income /
    max(population$households$head_income) * max(pums$household$HINCP)
  population$households$income_cat <- as.character(cut(
    population$households$head_income,
    c(-Inf, 50, 100, 200, Inf) * 1e3,
    c("income_lt50", "income_50_100", "income_100_200", "income_gt200"),
    right = TRUE
  ))

  ## individual
  population$individuals$sex_cat <- c("sex_male", "sex_female")[
    population$individuals$sex + 1
  ]
  population$individuals$race_cat <- rep_len(
    grep("^race", pvars, value = TRUE), max(population$individuals$race) + 1
  )[population$individuals$race + 1]
  population$individuals$income <- population$individuals$income /
    max(population$individuals$income) * max(pums$household$HINCP)
  population$individuals$income_cat <- as.character(cut(
    population$individuals$income,
    c(-Inf, 50, 100, 200, Inf) * 1e3,
    c("income_lt50", "income_50_100", "income_100_200", "income_gt200"),
    right = TRUE
  ))
  combined <- cbind(
    population$households[population$individuals$household, ],
    population$individuals
  )
  population$summary <- do.call(rbind, lapply(
    split(combined, as.character(combined$region)),
    function(d) {
      household <- d[!duplicated(d$household), ]
      owner <- household$status == "OWNER"
      renter <- household$status == "RENTER"
      cbind(
        id = household$region[[1]],
        Total = household$Total[[1]],
        Owner_Occupied = sum(owner),
        Renter_Occupied = sum(renter),
        avg_hhsize = mean(household$size),
        avg_hhsize_own = if (sum(owner)) mean(household$size[owner]) else 0,
        avg_hhsize_rent = if (sum(renter)) mean(household$size[renter]) else 0,
        as.data.frame(as.list(unlist(lapply(
          names(vars_list),
          function(var) as.list(table(factor(d[[var]], vars_list[[var]])))
        ))))
      )
    }
  ))

  saveRDS(population, population_file, compress = "xz")
}

First, we can look at the block group summaries of the full simulated population:

The redistribution methods apply to census-like samples from a full population, so we’ll set up a function to both draw a sample and apply methods to it:

## for the initial sample, we'll aim for an even
## sampling from across block groups
sample_n <- ceiling(nrow(population$households) * .1 / nrow(block_groups))
map_parcel_to_bg <- structure(
  rep(names(map_bg_to_parcel), vapply(map_bg_to_parcel, length, 0)),
  names = names(unlist(unname(map_bg_to_parcel)))
)
draw_sample <- function(run = NULL, return_full = FALSE) {
  # draw households, and retrieve individuals within them
  sample_hh <- lapply(map_bg_to_parcel, function(bg) {
    names(if (length(bg) < sample_n) bg else sample(bg, sample_n))
  })
  full <- population$households[
    population$households$household %in% unlist(sample_hh, use.names = FALSE),
  ]
  full$WGTP <- 1
  full_individuals <- population$individuals[
    population$individuals$household %in% full$household,
  ]
  full_individuals$PWGTP <- 1
  individuals <- cbind(
    full[as.character(full_individuals$household), c("region", vars_house)],
    full_individuals[, vars_person]
  )
  individuals$SERIALNO <- as.character(individuals$SERIALNO)
  individuals$ID <- as.character(individuals$ID)
  individuals$geoid <- structure(
    rep(names(sample_hh), vapply(sample_hh, length, 0)),
    names = unlist(sample_hh, use.names = FALSE)
  )[individuals$SERIALNO]

  bg <- do.call(rbind, lapply(
    split(individuals, individuals$geoid),
    function(d) {
      hh <- d[!duplicated(d$SERIALNO), ]
      hh$size <- table(d$SERIALNO)[hh$SERIALNO]
      size <- tapply(hh$size, factor(hh$status, c("OWNER", "RENTER")), mean)
      size[is.na(size)] <- 0
      cbind(
        id = d$geoid[[1]],
        as.data.frame(unlist(lapply(
          names(vars_list), function(var) {
            as.list(vapply(
              unique(vars_list[[var]]),
              function(l) sum((d[[var]] == l) * d$PWGTP), 0
            ))
          }
        ), recursive = FALSE)),
        avg_hhsize = mean(hh$size),
        avg_hhsize_own = size[["OWNER"]],
        avg_hhsize_rent = size[["RENTER"]]
      )
    }
  ))

  ps <- cbind(
    population$regions,
    population$summary[population$regions$id, c("Total", vars_units[-1])]
  )
  ps$OBJECTID <- population$regions$id
  ps$geoid <- map_parcel_to_bg[population$regions$id]
  ps$Unit_Type <- c("MULTI", "SFD", "SFA")[ps$building_type]

  ps$PUMA <- structure(
    pums$crosswalk$PUMA5CE,
    names = pums$crosswalk$GEOID
  )[substring(ps$geoid, 1, 11)]
  ps <- ps[!is.na(ps$PUMA), ]
  ps$single_family_home <- ps$Unit_Type != "MULTI"
  ps$type <- do.call(
    paste0, ps[, c("PUMA", "single_family_home"), drop = TRUE]
  )

  vdata <- lapply(
    structure(pumas_focal, names = pumas_focal),
    function(puma) {
      ps <- ps[ps$PUMA == puma, ]
      su <- which(individuals$region %in% ps$id)
      list(
        map = map_bg_to_parcel,
        individuals = individuals[sample(su, floor(length(su) * .3)), ],
        parcels = ps,
        source = bg[bg$id %in% ps$geoid, ]
      )
    }
  )
  ps <- do.call(rbind, lapply(vdata, "[[", "parcels"))
  individuals <- do.call(rbind, lapply(vdata, "[[", "individuals"))
  hh <- individuals[!duplicated(individuals$SERIALNO), ]
  hh$single_family_home <- hh$building_type != "MULTI"
  hh$size <- table(individuals$SERIALNO)[as.character(hh$SERIALNO)]
  ps$Residents <- ps$Total * tapply(
    hh$size, hh$single_family_home, mean
  )[as.character(ps$single_family_home)]

  vestimates <- list(
    Prop = redistribute(
      bg, ps, map_bg_to_parcel,
      weight = "Total", source_id = "id", default_value = 0
    ),
    "Prop Adj" = redistribute(
      bg, ps, map_bg_to_parcel,
      weight = "Residents", source_id = "id", default_value = 0
    ),
    Baseline = apply_method(vdata, est_baseline, parcel_only = TRUE),
    Rake = apply_method(vdata, est_rake, parcel_only = TRUE),
    "Rake 2 Step" = apply_method(vdata, est_twostep, parcel_only = TRUE),
    "Rake N Step" = apply_method(vdata, est_nstep, parcel_only = TRUE),
    "Rake General" = apply_method(vdata, est_grake, parcel_only = TRUE)
  )
  actual <- population$summary[ps$id, ]
  verror <- do.call(rbind, lapply(
    structure(pvars, names = pvars),
    function(var) {
      colMeans(abs(as.data.frame(
        lapply(vestimates, function(d) d[[var]]),
        check.names = FALSE
      ) - actual[[var]]))
    }
  ))
  verror <- rbind(verror, Average = colMeans(verror))
  if (return_full) {
    list(
      data = data, source = bg, parcels = ps, map = map_bg_to_parcel,
      estimates = vestimates, error = verror
    )
  } else {
    verror
  }
}

Now we can look at results from a single sample. These are not comparable to the other results because (a) they are simulated independently from the real samples (so, for instance, the race category baselines are randomized), and (b) error is measured at the parcel level, rather than the block group level.

estimates_sim <- draw_sample(return_full = TRUE)
kable(estimates_sim$error, digits = 5)
|                | Prop    | Prop Adj | Baseline | Rake    | Rake 2 Step | Rake N Step | Rake General |
|:---------------|--------:|---------:|---------:|--------:|------------:|------------:|-------------:|
| sex_female     | 4.82773 | 4.82818  | 5.05596  | 5.05591 | 5.05543     | 5.05565     | 5.05508      |
| sex_male       | 4.84018 | 4.84059  | 5.07137  | 5.07147 | 5.07198     | 5.07178     | 5.07291      |
| race_black     | 3.51751 | 3.53062  | 3.69505  | 3.69516 | 3.69521     | 3.69553     | 3.69500      |
| race_white     | 0.30985 | 0.31093  | 0.32178  | 0.32178 | 0.32177     | 0.32177     | 0.32180      |
| race_other     | 1.75416 | 1.76040  | 1.82061  | 1.82068 | 1.82079     | 1.82087     | 1.82020      |
| race_asian     | 1.43407 | 1.43888  | 1.48769  | 1.48782 | 1.48750     | 1.48695     | 1.48766      |
| race_native    | 3.43651 | 3.44780  | 3.61038  | 3.61049 | 3.61078     | 3.61106     | 3.61119      |
| income_lt50    | 0.00000 | 0.00000  | 0.00000  | 0.00000 | 0.00000     | 0.00000     | 0.00000      |
| income_gt200   | 9.32742 | 9.32830  | 9.76866  | 9.76860 | 9.76812     | 9.76814     | 9.77088      |
| income_100_200 | 0.39683 | 0.39691  | 0.39670  | 0.39670 | 0.39670     | 0.39670     | 0.39670      |
| income_50_100  | 0.00035 | 0.00035  | 0.00035  | 0.00035 | 0.00035     | 0.00035     | 0.00035      |
| Average        | 2.71315 | 2.71663  | 2.83896  | 2.83900 | 2.83897     | 2.83898     | 2.83925      |