Built with R 4.3.1
Redistribution means rearranging values such that totals remain the same, and such that the original distribution can be recovered by re-aggregating from a perfectly contained disaggregate.
This is desirable if your goal is simply to display the same data in different forms.
If, however, your goal is to estimate values at scales you have no observations of, it may be better to combine observations from different scales and use them to produce new estimates.
Estimation opens up many more methods to consider, and validating their results is less straightforward, so this article will walk through a few estimation methods and ways to validate them.
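For a concrete sense of the distinction, proportional redistribution splits a total into contained parts that always re-aggregate back to that same total; a minimal sketch with made-up numbers:
# redistribute a source total of 100 into 3 contained targets by weight
total <- 100
weights <- c(a = 1, b = 3, c = 6)
parts <- total * weights / sum(weights)
parts
#>  a  b  c 
#> 10 30 60
# re-aggregating recovers the original total exactly
sum(parts)
#> [1] 100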
Data
We’ll focus on a single county (Arlington, VA), with census block groups and the Public Use Microdata Sample (PUMS) as our primary source data.
# directory to save data in
base_dir <- "../estimate_comparison"
dir.create(base_dir, FALSE)
# geographies
library(redistribute)
library(catchment)
geography_bg <- download_census_shapes(base_dir, "va", "bg", year = 2021)
geography_bg <- geography_bg[grep("^51013", geography_bg$GEOID), ]
geography_tr <- download_census_shapes(base_dir, "va", "tr", year = 2021)
geography_tr <- geography_tr[grep("^51013", geography_tr$GEOID), ]
# ACS data
## block groups
block_groups <- tidycensus::get_acs(
year = 2021,
state = "51",
county = "013",
geography = "block group",
output = "wide",
variables = c(
total = "B01001_001",
race_white = "B02001_002",
race_black = "B02001_003",
race_native = "B02001_004",
race_asian = "B02001_005",
sex_male = "B01001_002",
sex_female = "B01001_026",
tenure_total = "B25003_001",
tenure_owner = "B25003_002",
tenure_renter = "B25003_003",
income_total = "B19001_001",
income_lt10 = "B19001_002",
income_10_15 = "B19001_003",
income_15_20 = "B19001_004",
income_20_25 = "B19001_005",
income_25_30 = "B19001_006",
income_30_35 = "B19001_007",
income_35_40 = "B19001_008",
income_40_45 = "B19001_009",
income_45_50 = "B19001_010",
income_50_60 = "B19001_011",
income_60_75 = "B19001_012",
income_75_100 = "B19001_013",
income_100_125 = "B19001_014",
income_125_150 = "B19001_015",
income_150_200 = "B19001_016",
income_gt200 = "B19001_017",
avg_hhsize = "B25010_001",
avg_hhsize_own = "B25010_002",
avg_hhsize_rent = "B25010_003"
)
)[, -2]
#> Getting data from the 2017-2021 5-year ACS
colnames(block_groups)[2] <- "Total"
block_groups$race_other <- block_groups$Total - rowSums(
block_groups[, grep("^race_.*E$", colnames(block_groups))]
)
colnames(block_groups) <- sub("E$", "", colnames(block_groups))
### impute missing average household sizes
block_groups$avg_hhsize[is.na(block_groups$avg_hhsize)] <- mean(
block_groups$avg_hhsize,
na.rm = TRUE
)
su <- is.na(block_groups$avg_hhsize_own)
block_groups$avg_hhsize_own[su] <- rowMeans(
block_groups[su, c("avg_hhsize", "avg_hhsize_rent")],
na.rm = TRUE
)
block_groups[is.na(block_groups)] <- 0
### make larger income groups
income_vars <- grep("income", colnames(block_groups), value = TRUE)
block_groups$income_lt50 <- rowSums(
block_groups[, grep("_(?:lt1|[1-4][05]_).*\\d$", income_vars, value = TRUE, perl = TRUE)]
)
block_groups$income_50_100 <- rowSums(
block_groups[, grep("income_[5-7].*\\d$", income_vars, value = TRUE)]
)
block_groups$income_100_200 <- rowSums(
block_groups[, grep("_1\\d{2}.*\\d$", income_vars, value = TRUE)]
)
income_vars_select <- c(
"income_lt50", "income_50_100", "income_100_200", "income_gt200"
)
library(sf)
rownames(geography_bg) <- geography_bg$GEOID
st_geometry(block_groups) <- st_geometry(geography_bg[block_groups$GEOID, ])
# parcel data
parcel_file <- paste0(base_dir, "/parcels.rds")
if (!file.exists(parcel_file)) {
parcels <- st_read(paste0(
"https://arlgis.arlingtonva.us/arcgis/rest/",
"services/Open_Data/od_MHUD_Polygons/",
"FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=json"
))
saveRDS(parcels, parcel_file, compress = "xz")
}
parcels <- readRDS(parcel_file)
colnames(parcels)[colnames(parcels) == "Total_Units"] <- "Total"
parcels$OBJECTID <- as.character(parcels$OBJECTID)
parcels <- parcels[parcels$Year_Built <= 2021, ]
Ultimately, we’ll want to use these methods to get values for unobserved geographies, but for this comparison of methods, we’ll want a way to test their results against known values.
For this test case, we can simply start at a higher geolevel and make estimates at the block group level, where we have known values to compare against. To do that, we first need to aggregate our block group data up to tracts:
tracts <- redistribute(block_groups, geography_tr)
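Since redistribution preserves totals, we can sanity-check the aggregation:
# the county-wide total should be identical at both geolevels
all.equal(sum(tracts$Total), sum(block_groups$Total))
#> [1] TRUE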
Redistribution
As a baseline, the first method to consider is proportional redistribution of the block group summary estimates alone.
Direct
We could do this from tracts to block groups directly:
# going from tracts to block groups with no information
estimate_tr_to_bg <- redistribute(
tracts, block_groups,
source_id = "id"
)
# mean absolute error between total population estimates
mean(abs(estimate_tr_to_bg$Total - block_groups$Total))
#> [1] 319.8405
Here we can see the error introduced when we only have proportional information at the lower level; block groups that once varied within a tract now all have the same value:
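One way to quantify this collapse in variation is to compare the average within-tract standard deviation of the original and estimated totals:
# within-tract variation of block group totals, before and after
tr_id <- substring(block_groups$GEOID, 1, 11)
c(
  original = mean(tapply(block_groups$Total, tr_id, sd), na.rm = TRUE),
  estimate = mean(tapply(estimate_tr_to_bg$Total, tr_id, sd), na.rm = TRUE)
)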
Down and Up
Or we could go from tracts down to parcels, then back up to block groups:
# function to apply tract to parcel to block group redistribution
redist_downup <- function(
top, bottom, middle, map_tb = NULL, map_bm = NULL,
weight = "Total", source_id = "id",
target_id = "OBJECTID", target_id_mid = "GEOID") {
to_bottom <- redistribute(
top, bottom, map_tb,
weight = weight, source_id = source_id, target_id = target_id,
default_value = 0
)
redistribute(
to_bottom, middle, map_bm,
source_id = "id", target_id = target_id_mid,
default_value = 0
)
}
# pre-make maps
## from block groups to parcels
map_bg_to_parcel <- redistribute(
block_groups, parcels,
target_id = "OBJECTID", return_map = TRUE, overlaps = "first"
)
parcels$geoid <- structure(
rep(names(map_bg_to_parcel), vapply(map_bg_to_parcel, length, 0)),
names = names(unlist(unname(map_bg_to_parcel)))
)[parcels$OBJECTID]
block_groups <- block_groups[block_groups$GEOID %in% parcels$geoid, ]
## from tracts to parcels
map_tr_to_parcel <- split(unlist(unname(map_bg_to_parcel)), rep(
substring(names(map_bg_to_parcel), 1, 11), vapply(map_bg_to_parcel, length, 0)
))
# check that down from and up to the same level works
all.equal(
as.data.frame(redist_downup(
block_groups, parcels, block_groups, map_bg_to_parcel, map_bg_to_parcel,
source_id = "GEOID"
)[, -1]),
as.data.frame(block_groups[, -1])
)
#> [1] TRUE
# redistribute
estimate_redist <- redist_downup(
tracts, parcels, block_groups, map_tr_to_parcel, map_bg_to_parcel
)
# mean absolute error between total population estimates
mean(abs(estimate_redist$Total - block_groups$Total))
#> [1] 231.1852
Now there is more of the original variation between block groups:
Because the parcel totals are living units rather than individuals, we may be able to refine our redistribution based on some heuristic estimates of the number of people per living unit:
# 2.5 for multi-unit parcels, 5.5 for single-unit parcels
parcels$Residents <- parcels$Total * (2.5 + (parcels$Total == 1) * 3)
# now see if this improved our redistribution
estimate_redist_adj <- redist_downup(
tracts, parcels, block_groups, map_tr_to_parcel, map_bg_to_parcel,
weight = "Residents"
)
# mean absolute error between total population estimates
mean(abs(estimate_redist_adj$Total - block_groups$Total))
#> [1] 194.0749
For a better sense of what this resident adjustment is doing, we can look at the difference made in a single tract:
# identify the most different tract between variants
tract <- substring(block_groups$GEOID[
which.max(abs(estimate_redist_adj$Total - estimate_redist$Total))
], 1, 11)
su <- substring(estimate_redist$id, 1, 11) == tract
tsu <- tracts$id == tract
psu <- substring(parcels$geoid, 1, 11) == tract
# recalculate parcel-level values within that tract
differences <- list(
parcels = redistribute(
tracts$Total[tsu], parcels[psu, ],
target_id = "OBJECTID", weight = "Residents", return_geometry = FALSE
)[, 2] - redistribute(
tracts$Total[tsu], parcels[psu, ],
target_id = "OBJECTID", weight = "Total", return_geometry = FALSE
)[, 2],
block_group = estimate_redist_adj$Total[su] - estimate_redist$Total[su]
)
Now we can look at the difference in the contained parcels and block groups:
In this case, the difference mostly comes down to two buildings with many units that get less relative weight with the resident values.
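Assuming the redistributed values come back in target order (matching `parcels[psu, ]`), we can also list the most affected parcels directly:
# parcels with the largest differences between weighting schemes
psub <- st_drop_geometry(parcels[psu, c("OBJECTID", "Total", "Residents")])
psub$difference <- differences$parcels
head(psub[order(abs(psub$difference), decreasing = TRUE), ], 2)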
PUMS + Raking
So far, we’ve used some additional information (total units) at the parcel level to improve the distribution of values across block groups, relative to a uniform distribution.
We have some more information at the parcel level, such as unit type and counts of renter- versus owner-occupied units, but to make use of this information, we need some association between source values and those features. This is where the census microdata can come in: the Public Use Microdata Sample (PUMS) consists of individual-level observations that include both demographic and housing features.
pums <- download_census_pums(base_dir, "va", 2021, one_year = FALSE)
#> ℹ loading household sample: h.csv.xz
#> ℹ loading person sample: p.csv.xz
#> ℹ loading crosswalk 2010_Census_Tract_to_2010_PUMA.txt
# prepare IDs
pums$crosswalk$GEOID <- do.call(paste0, pums$crosswalk[, 1:3])
pums$person$ID <- do.call(paste0, pums$person[, c("SERIALNO", "SPORDER")])
# prepare variables
## survey categories
pums$person$sex_cat <- c("sex_male", "sex_female")[as.numeric(pums$person$SEX)]
pums$person$race_cat <- paste0("race_", c(
"white", "black", "native", rep("other", 2), "asian", rep("other", 3)
))[as.numeric(pums$person$RAC1P)]
pums$household$income_cat <- as.character(cut(
pums$household$HINCP, c(-Inf, 50, 100, 200, Inf) * 1e3, income_vars_select,
right = FALSE
))
## unit categories
pums$household <- pums$household[
!is.na(pums$household$BLD) & !is.na(pums$household$TEN),
]
pums$household$building_type <- "MULTI"
pums$household$building_type[pums$household$BLD == "02"] <- "SFD"
pums$household$building_type[pums$household$BLD == "03"] <- "SFA"
pums$household$status <- "RENTER"
pums$household$status[pums$household$TEN %in% 1:2] <- "OWNER"
The PUMS data are only located down to Public Use Microdata Areas (PUMAs), which are made up of tracts, so this is where we can start for a baseline.
We will use associations between demographic and housing features within each PUMA to get probabilities at the parcel level, then redistribute our source values to them like before.
The first step to this end is to prepare our data so it is easy to associate the individual PUMS variables with their per-level summaries at the tract level:
# define variables of interest
vars_house <- c(
ID = "SERIALNO", weight = "WGTP", type = "building_type",
status = "status", income = "income_cat"
)
vars_person <- c(
ID = "ID", weight = "PWGTP", sex_cat = "sex_cat", race_cat = "race_cat"
)
vars_units <- c(
type = "Unit_Type", renters = "Renter_Occupied", owners = "Owner_Occupied"
)
## get their levels
vars_list <- c(
lapply(vars_person[-(1:2)], function(var) unique(pums$person[[var]])),
income_cat = list(unique(pums$household$income_cat))
)
return_vars <- names(vars_list)
pvars <- unlist(vars_list, use.names = FALSE)
vars <- c(pvars, "avg_hhsize_own", "avg_hhsize_rent")
# prepare datasets split into PUMAs
pumas_focal <- unique(pums$crosswalk$PUMA5CE[pums$crosswalk$GEOID %in% tracts$id])
data <- lapply(structure(pumas_focal, names = pumas_focal), function(puma) {
households <- as.data.frame(pums$household[pums$household$PUMA == puma, vars_house])
rownames(households) <- households$SERIALNO
ids <- pums$crosswalk$GEOID[
pums$crosswalk$PUMA5CE == puma & pums$crosswalk$GEOID %in% tracts$id
]
person <- pums$person[pums$person$SERIALNO %in% households$SERIALNO, ]
list(
individuals = cbind(households[person$SERIALNO, ], person),
source = tracts[tracts$id %in% ids, c("id", vars), drop = TRUE],
parcels = parcels[
substring(parcels$geoid, 1, 11) %in% ids, c("geoid", "OBJECTID", vars_units),
drop = TRUE
]
)
})
# function to apply each method to the data
apply_method <- function(data, method, parcel_only = FALSE, rescale = TRUE) {
p <- do.call(rbind, lapply(data, function(set) {
set$source <- st_drop_geometry(set$source)
do.call(rbind, lapply(unique(set$source$id), function(id) {
source <- set$source[set$source$id == id, -1]
target <- set$parcels[if ("map" %in% names(set)) {
if ("geography" %in% names(set)) {
set$parcels$geoid %in% set$map[[id]]
} else {
set$parcels$geoid == id
}
} else {
substring(set$parcels$geoid, 1, 11) == id
}, ]
if (nrow(target)) {
est <- if (sum(source)) method(source, target[, -1], set$individuals) else NULL
if (length(est)) {
if (rescale) {
totals <- colSums(est)
totals[!totals] <- 1
est <- sweep(est, 2, totals, "/") * rep(
as.numeric(source[, colnames(est)]),
each = nrow(est)
)
}
su <- !pvars %in% colnames(est)
if (any(su)) est[, pvars[su]] <- 0
cbind(target[, 1:2], est[as.character(target$OBJECTID), pvars])
} else {
est <- target[, 1:2]
est[, pvars] <- 0
est
}
}
}))
}))
if (parcel_only) {
p
} else {
list(parcels = p, block_groups = redistribute(
p[, -1], block_groups, map_bg_to_parcel,
source_id = "OBJECTID", target_id = "GEOID", default_value = 0
))
}
}
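The `rescale` step in `apply_method` above is a column-wise proportional fit: each estimate column is scaled so that it sums to the corresponding source total. In isolation, the operation looks like this (a conceptual restatement with a hypothetical helper name, not a function used later):
# scale each column of parcel-level estimates to match source totals
rescale_to_source <- function(est, source_totals) {
  totals <- colSums(est)
  totals[!totals] <- 1 # avoid dividing by zero for empty categories
  sweep(est, 2, source_totals / totals, "*")
}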
# function to calculate estimates from weights
make_estimates <- function() {
do.call(rbind, lapply(unique(target$Unit_Type), function(type) {
d <- target[
target$Unit_Type == type, c("OBJECTID", "Renter_Occupied", "Owner_Occupied")
]
d$Renter_Occupied <- d$Renter_Occupied * source$avg_hhsize_rent
d$Owner_Occupied <- d$Owner_Occupied * source$avg_hhsize_own
nd <- nrow(d)
colnames(d)[-1] <- c("RENTER", "OWNER")
i <- individuals[
individuals$building_type == type, c("weight", "status", return_vars)
]
as.data.frame(Reduce("+", lapply(
(if (is.factor(i$status)) levels else unique)(i$status),
function(s) {
ii <- i[i$status == s, ]
weight_total <- sum(ii$weight)
do.call(cbind, lapply(return_vars, function(cat) {
props <- tapply(ii$weight, ii[[cat]], sum) / weight_total
props[vars_list[[cat]][!vars_list[[cat]] %in% names(props)]] <- 0
props[is.na(props)] <- 0
props <- props[vars_list[[cat]]]
matrix(
d[[s]] * rep(props, each = nd),
nd,
dimnames = list(d$OBJECTID, names(props))
)
}))
}
)))
}))
}
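Note that `make_estimates` takes no arguments: each method instead assigns its own calling environment to the function (`environment(make_estimates) <- environment()`), so names like `source`, `target`, and `individuals` are looked up among the method’s local variables. A minimal sketch of this pattern (with hypothetical `f`, `g`, and `x`):
# `f` has no arguments; it reads `x` from whatever environment it is bound to
f <- function() x * 2
g <- function() {
  x <- 21
  environment(f) <- environment() # rebind a local copy of f to g's frame
  f()
}
g()
#> [1] 42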
Now we can apply each method to this same set of data.
Before getting to the new methods, we can also update our heuristic adjustment to the proportional redistribution method with PUMS information:
# make single-family-home indicators
pums$household$single_family_home <- pums$household$building_type != "MULTI"
parcels$single_family_home <- parcels$Unit_Type != "MULTI"
# update resident estimates with average household size for
# single family versus other home types within the county
pums$household$size <- table(pums$person$SERIALNO)[pums$household$SERIALNO]
households <- pums$household[pums$household$PUMA %in% pumas_focal, ]
parcels$Residents <- parcels$Total * tapply(
households$size, households$single_family_home, mean
)[as.character(parcels$single_family_home)]
# recalculate the adjusted redistribution results
estimate_redist_adj <- redist_downup(
tracts, parcels, block_groups, map_tr_to_parcel, map_bg_to_parcel,
weight = "Residents"
)
mean(abs(estimate_redist_adj$Total - block_groups$Total))
#> [1] 201.2044
Initial Weights
As a baseline, we can calculate a set of proportions for each PUMA, then apply that to the parcels in each contained tract:
# function to apply the method:
# - source: the tract-level totals
# - target: the parcels
# - individuals: the individual PUM responses
est_baseline <- function(source, target, individuals) {
individuals$weight <- individuals$PWGTP
environment(make_estimates) <- environment()
make_estimates()
}
# apply across PUMAs and tracts within them
# and aggregate to block groups
estimate_baseline <- apply_method(data, est_baseline)
Because this method does not produce estimates that sum back to their source totals, we have to rescale them to be comparable. For illustration:
# adjust tract counts so they match
# unit totals * average household size across parcels
tr_adj <- st_drop_geometry(tracts[tracts$id %in% pums$crosswalk$GEOID, ])
pid <- substring(parcels$geoid, 1, 11)
parcels$renter_est <- parcels$Renter_Occupied * structure(
tracts$avg_hhsize_rent,
names = tracts$id
)[pid]
parcels$owner_est <- parcels$Owner_Occupied * structure(
tracts$avg_hhsize_own,
names = tracts$id
)[pid]
parcels$resident_est <- parcels$renter_est + parcels$owner_est
tr_adj$Total <- tapply(parcels$resident_est, pid, sum)[tr_adj$id]
for (g in vars_list) {
tr_adj[, g] <- tr_adj[, g] / rowSums(tr_adj[, g]) * tr_adj$Total
}
tr_adj[is.na(tr_adj)] <- 0
# make baseline parcel-level estimates without rescaling
data_adj <- data
data_adj[[1]]$source <- tr_adj[tr_adj$id %in% data[[1]]$source$id, c("id", vars)]
data_adj[[2]]$source <- tr_adj[tr_adj$id %in% data[[2]]$source$id, c("id", vars)]
parcel_adj <- apply_method(
data_adj, est_baseline,
rescale = FALSE, parcel_only = TRUE
)
# aggregate back up to tracts and measure per-category error
tr_est <- redistribute(
parcel_adj, tr_adj, map_tr_to_parcel,
source_id = "OBJECTID", target_id = "id", default_value = 0
)
data.frame(MAE = round(vapply(
pvars, function(var) mean(abs(tr_est[[var]] - tr_adj[[var]])), 0
), 5))
#> MAE
#> sex_female 304.75734
#> sex_male 304.75734
#> race_black 574.76484
#> race_white 1093.99980
#> race_other 698.47523
#> race_asian 471.04857
#> race_native 45.12699
#> income_lt50 840.36931
#> income_gt200 767.20051
#> income_100_200 904.24289
#> income_50_100 704.13201
## this method does recover the parcel-level totals and overall totals
g <- c("sex_female", "sex_male")
all(abs(rowSums(parcel_adj[, g]) - structure(
parcels$resident_est,
names = parcels$OBJECTID
)[parcel_adj$OBJECTID]) < 1e-12)
#> [1] TRUE
c(
original = sum(
rbind(data_adj[[1]]$source[, g], data_adj[[2]]$source[, g])
),
parcel = sum(parcel_adj[, g]),
tract = sum(tr_est[, g])
)
#> original parcel tract
#> 691326.4 691326.4 691326.4
# rescaling makes the totals line up
parcel_adj_rescaled <- apply_method(
data_adj, est_baseline,
parcel_only = TRUE
)
tr_est_rescaled <- redistribute(
parcel_adj_rescaled, tr_adj, map_tr_to_parcel,
source_id = "OBJECTID", target_id = "id", default_value = 0
)
data.frame(MAE = round(vapply(
pvars, function(var) mean(abs(tr_est_rescaled[[var]] - tr_adj[[var]])), 0
), 5))
#> MAE
#> sex_female 0
#> sex_male 0
#> race_black 0
#> race_white 0
#> race_other 0
#> race_asian 0
#> race_native 0
#> income_lt50 0
#> income_gt200 0
#> income_100_200 0
#> income_50_100 0
# just like the redistribution methods
tr_redist <- redist_downup(
tr_adj, parcels, tr_adj, map_tr_to_parcel, map_tr_to_parcel
)
data.frame(MAE = round(vapply(
pvars, function(var) mean(abs(tr_redist[[var]] - tr_adj[[var]])), 0
), 5))
#> MAE
#> sex_female 0
#> sex_male 0
#> race_black 0
#> race_white 0
#> race_other 0
#> race_asian 0
#> race_native 0
#> income_lt50 0
#> income_gt200 0
#> income_100_200 0
#> income_50_100 0
Raking
In the baseline, we’re using PUMS weights to calculate proportions, but those weights are meant to bring totals in line with PUMA-level estimates. Since we have lower-level information in the form of tract-level totals, we can adjust the weights to match each smaller-area summary:
library(anesrake)
rake_prep <- function() {
eval(expression({
for (var in names(totals)) {
individuals[[var]] <- as.factor(individuals[[var]])
l <- levels(individuals[[var]])
totals[[var]] <- totals[[var]][l]
su <- is.na(totals[[var]]) | totals[[var]] == 0
if (sum(su)) {
individuals <- individuals[!individuals[[var]] %in% l[su], ]
individuals[[var]] <- droplevels(individuals[[var]])
totals[[var]] <- totals[[var]][!su]
}
total <- sum(totals[[var]])
if (total) totals[[var]] <- totals[[var]] / total
}
totals <- Filter(length, totals)
}), parent.frame())
}
est_rake <- function(source, target, individuals) {
totals <- lapply(
structure(names(vars_list)[1:2], names = names(vars_list)[1:2]),
function(n) unlist(source[, vars_list[[n]]])
)
rake_prep()
individuals$status <- as.factor(individuals$status)
individuals$building_type <- as.factor(individuals$building_type)
individuals$income_cat <- as.factor(individuals$income_cat)
capture.output(individuals$weight <- tryCatch(
anesrake(
totals, individuals, individuals$ID, individuals$PWGTP,
pctlim = .001
)$weightvec[as.character(individuals$ID)],
error = function(e) {
warning(e$message)
individuals$PWGTP
}
))
environment(make_estimates) <- environment()
make_estimates()
}
estimate_rake <- apply_method(data, est_rake)
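For intuition about what `anesrake` is doing here: raking (iterative proportional fitting) repeatedly scales the weights so that each weighted margin matches its target in turn, cycling through the margins until they converge. A toy version of the core loop (a sketch only; `anesrake` adds caps, convergence checks, and variable selection):
# repeatedly scale weights so each margin matches its target proportions
rake_toy <- function(data, targets, weights, iterations = 50) {
  for (i in seq_len(iterations)) {
    for (var in names(targets)) {
      current <- tapply(weights, data[[var]], sum) / sum(weights)
      weights <- weights * (targets[[var]] / current)[data[[var]]]
    }
  }
  weights
}
toy <- data.frame(
  sex = c("m", "m", "f", "f"),
  tenure = c("own", "rent", "own", "rent")
)
w <- rake_toy(
  toy,
  list(sex = c(f = .6, m = .4), tenure = c(own = .7, rent = .3)),
  rep(1, 4)
)
round(tapply(w, toy$sex, sum) / sum(w), 2)
#>   f   m 
#> 0.6 0.4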
Two Step Raking
In the initial raking approach, we only considered the person-level variables, but we also have household-level information. To account for this as well, we can first adjust the household weights, then use those adjusted weights to initialize the person-level adjustment (each person’s starting weight is their original weight, scaled by their household’s adjustment factor):
est_twostep <- function(source, target, individuals) {
all <- individuals
individuals <- individuals[!duplicated(individuals$SERIALNO), ]
# step 1: initial weights at household level
totals <- list(
income_cat = unlist(source[, grep("^income_", colnames(source))]),
status = structure(
colSums(target[, vars_units[-1]]),
names = c("RENTER", "OWNER")
),
building_type = tapply(rowSums(target[, vars_units[-1]]), target$Unit_Type, sum)
)
rake_prep()
capture.output(initial <- tryCatch(
anesrake(
totals, individuals, individuals$SERIALNO, individuals$WGTP,
pctlim = .001
)$weightvec,
error = function(e) {
warning(e$message)
structure(individuals$WGTP, names = individuals$SERIALNO)
}
))
all$weight <- initial[as.character(all$SERIALNO)]
all <- all[!is.na(all$weight), ]
individuals <- all
# step 2: adjust at person level
totals <- lapply(
structure(names(vars_list)[1:2], names = names(vars_list)[1:2]),
function(n) unlist(source[, vars_list[[n]]])
)
rake_prep()
initial <- individuals$weight
capture.output(individuals$weight <- tryCatch(
anesrake(
totals, individuals, individuals$ID,
individuals$PWGTP * initial / individuals$WGTP,
pctlim = .001
)$weightvec[as.character(individuals$ID)],
error = function(e) {
warning(e$message)
individuals$PWGTP * initial / individuals$WGTP
}
))
environment(make_estimates) <- environment()
make_estimates()
}
estimate_twostep <- apply_method(data, est_twostep)
N Step Raking
We can take the two-step method further by repeating the process to try to better fit both the household- and individual-level data, stopping once the household margins are fit well enough or stop improving:
est_nstep <- function(source, target, individuals) {
individuals$weight <- individuals$PWGTP
all <- individuals
iter <- function() {
individuals <- individuals[!duplicated(individuals$SERIALNO), ]
# step 1: initial weights at household level
totals <- list(
income_cat = unlist(source[, grep("^income_", colnames(source))]),
status = structure(
colSums(target[, vars_units[-1]]),
names = c("RENTER", "OWNER")
),
building_type = tapply(rowSums(target[, vars_units[-1]]), target$Unit_Type, sum)
)
rake_prep()
household_totals <- totals
capture.output(initial <- tryCatch(
anesrake(
totals, individuals, individuals$SERIALNO,
individuals$weight / individuals$PWGTP * individuals$WGTP,
pctlim = .001
)$weightvec,
error = function(e) {
warning(e$message)
structure(
individuals$weight / individuals$PWGTP * individuals$WGTP,
names = individuals$SERIALNO
)
}
))
all$weight <- initial[as.character(all$SERIALNO)]
all <- all[!is.na(all$weight), ]
individuals <- all
# step 2: adjust at person level
totals <- lapply(
structure(names(vars_list)[1:2], names = names(vars_list)[1:2]),
function(n) unlist(source[, vars_list[[n]]])
)
rake_prep()
initial <- individuals$weight
capture.output(individuals$weight <- tryCatch(
anesrake(
totals, individuals, individuals$ID,
individuals$PWGTP * initial / individuals$WGTP,
pctlim = .001
)$weightvec[as.character(individuals$ID)],
error = function(e) {
warning(e$message)
individuals$PWGTP * initial / individuals$WGTP
}
))
hh <- individuals[!duplicated(individuals$SERIALNO), ]
hh$weight[hh$weight == 0] <- 1
weight <- hh$PWGTP / hh$weight * hh$WGTP
list(
error = mean(vapply(names(household_totals), function(v) {
mean((household_totals[[v]] - tapply(weight, hh[[v]], sum) / sum(weight))^2)
}, 0), na.rm = TRUE),
data = individuals
)
}
previous_error <- 0
for (i in 1:20) {
step <- iter()
individuals <- step$data
if (step$error < .001 || abs(previous_error - step$error) < .0005) break
previous_error <- step$error
}
environment(make_estimates) <- environment()
make_estimates()
}
estimate_nstep <- apply_method(data, est_nstep)
Generalized Raking
The two-step approach still mainly focuses on aligning the person-level weights: the household totals match only after the first step, and are not considered in the second step. As an alternative, we might try to fit the household- and person-level totals at the same time with a generalized raking approach:
library(mlfit)
est_grake <- function(source, target, individuals) {
totals <- list(
person = sum(individuals$PWGTP),
household = sum(individuals$WGTP[!duplicated(individuals$SERIALNO)])
)
person <- lapply(names(vars_list)[1:2], function(n) {
l <- vars_list[[n]]
count <- as.numeric(source[, l, drop = TRUE])
count[!count] <- 1
count <- count / sum(count) * totals$person
s <- data.frame(level = l, count = count)
colnames(s)[1] <- n
s
})
household <- unique(individuals$building_type)
household <- structure(numeric(length(household)), names = household)
observed_types <- tapply(
rowSums(target[, vars_units[-1]]), target$Unit_Type, sum
)
household[names(observed_types)] <- observed_types
household <- list(data.frame(
building_type = names(household), count = household
))
household[[1]]$count[is.na(household[[1]]$count)] <- 1
household[[2]] <- data.frame(
status = c("RENTER", "OWNER"),
count = colSums(target[, c("Renter_Occupied", "Owner_Occupied")])
)
household[[3]] <- unlist(source[, grep("^income_", colnames(source))])
household[[3]] <- data.frame(
income_cat = names(household[[3]]), count = household[[3]]
)
names(household) <- c("building_type", "status", "income_cat")
household <- lapply(names(household), function(var) {
l <- household[[var]]
l <- l[l[[1]] %in% unique(individuals[, var]), ]
l[[2]][!l[[2]]] <- 1
l[[2]] <- l[[2]] / sum(l[[2]]) * totals$household
l
})
m <- ml_problem(
individuals,
field_names = special_field_names("SERIALNO", "ID", count = "count"),
group_controls = household,
individual_controls = person,
prior_weights = individuals$WGTP
)
fm <- tryCatch(
suppressWarnings(ml_fit_dss(m, ginv = solve)),
error = function(e) NULL
)
if (!isTRUE(fm$success)) {
fm <- tryCatch(
suppressWarnings(ml_fit_ipu(m)),
error = function(e) NULL
)
}
individuals$weight <- if (is.null(fm$weight)) {
individuals$PWGTP
} else {
fm$weight
}
environment(make_estimates) <- environment()
make_estimates()
}
estimate_grake <- apply_method(data, est_grake)
Comparisons
Tract Aggregates
Now we can compare all of the considered methods within each of the variables we used to inform the raking approaches:
error <- do.call(rbind, lapply(structure(pvars, names = pvars), function(var) {
d <- cbind(
Prop = estimate_redist[[var]],
"Prop Adj" = estimate_redist_adj[[var]],
Baseline = estimate_baseline$block_groups[[var]],
Rake = estimate_rake$block_groups[[var]],
"Rake 2 Step" = estimate_twostep$block_groups[[var]],
"Rake N Step" = estimate_nstep$block_groups[[var]],
"Rake General" = estimate_grake$block_groups[[var]]
)
colMeans(abs(d - block_groups[[var]]))
}))
error <- rbind(error, Average = colMeans(error))
kable(error, digits = 3)
| | Prop | Prop Adj | Baseline | Rake | Rake 2 Step | Rake N Step | Rake General |
|---|---|---|---|---|---|---|---|
| sex_female | 121.352 | 105.943 | 219.286 | 219.348 | 219.254 | 218.665 | 218.654 |
| sex_male | 128.929 | 117.032 | 233.955 | 234.025 | 233.991 | 234.246 | 234.482 |
| race_black | 61.684 | 61.584 | 72.059 | 71.940 | 72.473 | 72.777 | 71.546 |
| race_white | 166.633 | 145.178 | 291.699 | 292.624 | 289.364 | 288.859 | 292.544 |
| race_other | 76.493 | 75.017 | 91.940 | 91.577 | 91.380 | 92.491 | 90.631 |
| race_asian | 55.791 | 56.351 | 75.142 | 75.073 | 74.722 | 74.077 | 75.122 |
| race_native | 5.577 | 5.663 | 5.135 | 5.218 | 5.085 | 4.998 | 5.138 |
| income_lt50 | 38.820 | 40.608 | 51.486 | 51.392 | 51.004 | 51.024 | 51.648 |
| income_gt200 | 50.340 | 45.409 | 59.663 | 59.603 | 59.930 | 59.950 | 60.083 |
| income_100_200 | 47.578 | 46.870 | 92.734 | 92.493 | 91.819 | 91.752 | 92.352 |
| income_50_100 | 39.530 | 41.883 | 63.116 | 63.111 | 62.817 | 62.952 | 62.810 |
| Average | 72.066 | 67.412 | 114.201 | 114.219 | 113.804 | 113.799 | 114.092 |
Full results are presented on a data site: redistribution_arlington
Alternate Geolevels
So far, we’ve used the same tract-based test case to assess our methods, but this particular geolevel may have an effect in itself. As an alternative, we might abandon canonical tracts in favor of virtual tracts made of randomly combined adjacent block groups.
# randomly combine adjacent block groups into pairs (where possible)
# apply within each PUMA
apply_altgeo <- function(run = NULL, return_full = FALSE) {
vdata <- lapply(
structure(pumas_focal, names = pumas_focal),
function(puma) {
d <- data[[puma]]
su <- substring(block_groups$GEOID, 1, 11) %in% d$source$id
vtr <- make_parent_polygons(
block_groups[su, ], "GEOID",
n_as_min = TRUE, verbose = FALSE
)
names(vtr$new) <- names(vtr$map) <- paste0(
puma, "_", seq_along(vtr$new)
)
d$geography <- vtr$new
d$map <- vtr$map
d$source <- redistribute(
block_groups[su, c("GEOID", vars)], vtr$new, vtr$map,
default_value = 0
)
d
}
)
vsource <- do.call(rbind, lapply(vdata, "[[", "source"))
vmap <- redistribute(
vsource, parcels,
source_id = "id", target_id = "OBJECTID", return_map = TRUE
)
vestimates <- list(
Prop = list(block_groups = redist_downup(
vsource, parcels, block_groups, vmap, map_bg_to_parcel
)),
"Prop Adj" = list(block_groups = redist_downup(
vsource, parcels, block_groups, vmap, map_bg_to_parcel,
weight = "Residents"
)),
Baseline = apply_method(vdata, est_baseline),
Rake = apply_method(vdata, est_rake),
"Rake 2 Step" = apply_method(vdata, est_twostep),
"Rake N Step" = apply_method(vdata, est_nstep),
"Rake General" = apply_method(vdata, est_grake)
)
verror <- do.call(rbind, lapply(
structure(pvars, names = pvars),
function(var) {
colMeans(abs(as.data.frame(
lapply(vestimates, function(d) d$block_groups[[var]]),
check.names = FALSE
) - block_groups[[var]]))
}
))
verror <- rbind(verror, Average = colMeans(verror))
if (return_full) {
list(
data = data, source = vsource, map = vmap,
estimates = vestimates, error = verror
)
} else {
verror
}
}
estimates_altgeo <- apply_altgeo(return_full = TRUE)
kable(estimates_altgeo$error, digits = 3)
| | Prop | Prop Adj | Baseline | Rake | Rake 2 Step | Rake N Step | Rake General |
|---|---|---|---|---|---|---|---|
| sex_female | 215.937 | 200.870 | 216.167 | 216.578 | 216.313 | 215.513 | 216.289 |
| sex_male | 236.555 | 219.472 | 241.522 | 241.517 | 241.489 | 241.820 | 241.303 |
| race_black | 72.468 | 71.878 | 69.874 | 69.645 | 69.612 | 70.061 | 69.561 |
| race_white | 312.564 | 288.576 | 300.305 | 303.951 | 301.975 | 301.493 | 305.405 |
| race_other | 86.287 | 85.025 | 88.380 | 87.935 | 88.173 | 88.312 | 88.221 |
| race_asian | 65.491 | 65.548 | 67.677 | 67.883 | 67.587 | 67.091 | 67.628 |
| race_native | 4.689 | 4.794 | 4.534 | 4.734 | 4.475 | 4.441 | 4.386 |
| income_lt50 | 48.612 | 51.038 | 46.041 | 45.849 | 45.804 | 45.741 | 46.130 |
| income_gt200 | 63.435 | 59.583 | 55.054 | 55.228 | 55.047 | 54.905 | 54.798 |
| income_100_200 | 86.937 | 87.857 | 89.694 | 89.708 | 89.437 | 89.174 | 89.610 |
| income_50_100 | 62.409 | 64.883 | 60.513 | 60.631 | 60.241 | 60.246 | 60.339 |
| Average | 114.126 | 109.048 | 112.705 | 113.060 | 112.741 | 112.618 | 113.061 |
We can also take a look at the virtual tracts that were created in this run:
Since these virtual tracts are randomly created, we might also look at results across a few runs for a sense of variation:
# since this can be quite long-running,
# we'll save and reload results
results_altgeo_file <- paste0(base_dir, "/results_altgeo.rds")
if (file.exists(results_altgeo_file)) {
results_altgeo <- readRDS(results_altgeo_file)
} else {
results_altgeo <- lapply(1:50, apply_altgeo)
saveRDS(results_altgeo, results_altgeo_file)
}
kable(Reduce("+", results_altgeo) / length(results_altgeo), digits = 3)
| | Prop | Prop Adj | Baseline | Rake | Rake 2 Step | Rake N Step | Rake General |
|---|---|---|---|---|---|---|---|
| sex_female | 113.199 | 100.320 | 110.849 | 111.066 | 111.120 | 110.519 | 110.812 |
| sex_male | 131.315 | 117.479 | 129.066 | 128.938 | 128.895 | 129.342 | 128.710 |
| race_black | 61.055 | 61.697 | 57.506 | 57.528 | 57.178 | 57.315 | 57.711 |
| race_white | 172.615 | 148.385 | 155.580 | 157.644 | 154.924 | 154.485 | 158.939 |
| race_other | 74.940 | 74.262 | 75.171 | 74.803 | 74.689 | 74.914 | 75.175 |
| race_asian | 49.191 | 50.147 | 49.037 | 49.377 | 49.214 | 48.922 | 49.299 |
| race_native | 4.810 | 4.896 | 4.608 | 4.816 | 4.553 | 4.534 | 4.658 |
| income_lt50 | 36.104 | 38.230 | 32.539 | 32.607 | 32.378 | 32.395 | 33.134 |
| income_gt200 | 48.107 | 43.420 | 37.323 | 37.354 | 37.422 | 37.540 | 37.199 |
| income_100_200 | 47.601 | 48.775 | 46.230 | 46.253 | 45.911 | 45.816 | 45.995 |
| income_50_100 | 36.142 | 38.199 | 34.363 | 34.607 | 34.254 | 34.301 | 34.397 |
| Average | 70.462 | 65.983 | 66.570 | 66.818 | 66.412 | 66.371 | 66.912 |
Simulated Populations
So far, we’ve used the same data at different initial geolevels as a way to introduce error while still having a known target to measure error from. This has the benefit of retaining properties of the real data that may affect our methods in unknown ways, but the downside is that we cannot measure error very directly, and we are testing against estimates that carry their own error.
An alternative is to simulate the initial population, then draw samples from it to form simulated versions of the data we have available (a sparse PUMS-like sample, and less sparse block group summaries). Then, instead of having to go down to parcels and back up through artificially unknown intermediate layers (block groups) to measure error, we can measure it directly at the parcel level.
# simulating the whole population is pretty intensive,
# so we'll just do that once, then draw samples from it
population_file <- paste0(base_dir, "/simulated_population.rds")
if (file.exists(population_file)) {
population <- readRDS(population_file)
} else {
population <- generate_population(
sum(parcels$Total), parcels, "Total", "OBJECTID"
)
population$households$SERIALNO <- population$households$household
population$individuals$ID <- population$individuals$person
# make categories
## household
population$households$building_type <- c(
"MULTI", "SFD", "SFA"
)[population$households$building_type]
population$households$status <- c("OWNER", "RENTER")[
population$households$renting + 1
]
population$households$Total <- structure(
population$regions$capacity,
names = population$regions$id
)[population$households$region]
population$households$head_income <- population$households$head_income /
max(population$households$head_income) * max(pums$household$HINCP)
population$households$income_cat <- as.character(cut(
population$households$head_income,
c(-Inf, 50, 100, 200, Inf) * 1e3,
c("income_lt50", "income_50_100", "income_100_200", "income_gt200"),
right = TRUE
))
## individual
population$individuals$sex_cat <- c("sex_male", "sex_female")[
population$individuals$sex + 1
]
population$individuals$race_cat <- rep_len(
grep("^race", pvars, value = TRUE), max(population$individuals$race) + 1
)[population$individuals$race + 1]
population$individuals$income <- population$individuals$income /
max(population$individuals$income) * max(pums$household$HINCP)
population$individuals$income_cat <- as.character(cut(
population$individuals$income,
c(-Inf, 50, 100, 200, Inf) * 1e3,
c("income_lt50", "income_50_100", "income_100_200", "income_gt200"),
right = TRUE
))
combined <- cbind(
population$households[population$individuals$household, ],
population$individuals
)
population$summary <- do.call(rbind, lapply(
split(combined, as.character(combined$region)),
function(d) {
household <- d[!duplicated(d$household), ]
owner <- household$status == "OWNER"
renter <- household$status == "RENTER"
cbind(
id = household$region[[1]],
Total = household$Total[[1]],
Owner_Occupied = sum(owner),
Renter_Occupied = sum(renter),
avg_hhsize = mean(household$size),
avg_hhsize_own = if (sum(owner)) mean(household$size[owner]) else 0,
avg_hhsize_rent = if (sum(renter)) mean(household$size[renter]) else 0,
as.data.frame(as.list(unlist(lapply(
names(vars_list),
function(var) as.list(table(factor(d[[var]], vars_list[[var]])))
))))
)
}
))
saveRDS(population, population_file, compress = "xz")
}
First, we can look at the block group summaries of the full simulated population:
The redistribution methods apply to census-like samples drawn from a full population, so we’ll set up a function that both draws a sample and applies each method to it:
## for the initial sample, we'll aim for an even
## sampling from across block groups
sample_n <- ceiling(nrow(population$households) * .1 / nrow(block_groups))
map_parcel_to_bg <- structure(
rep(names(map_bg_to_parcel), vapply(map_bg_to_parcel, length, 0)),
names = names(unlist(unname(map_bg_to_parcel)))
)
draw_sample <- function(run = NULL, return_full = FALSE) {
# draw households, and retrieve individuals within them
sample_hh <- lapply(map_bg_to_parcel, function(bg) {
names(if (length(bg) < sample_n) bg else sample(bg, sample_n))
})
full <- population$households[
population$households$household %in% unlist(sample_hh, use.names = FALSE),
]
full$WGTP <- 1
full_individuals <- population$individuals[
population$individuals$household %in% full$household,
]
full_individuals$PWGTP <- 1
individuals <- cbind(
full[as.character(full_individuals$household), c("region", vars_house)],
full_individuals[, vars_person]
)
individuals$SERIALNO <- as.character(individuals$SERIALNO)
individuals$ID <- as.character(individuals$ID)
individuals$geoid <- structure(
rep(names(sample_hh), vapply(sample_hh, length, 0)),
names = unlist(sample_hh, use.names = FALSE)
)[individuals$SERIALNO]
bg <- do.call(rbind, lapply(
split(individuals, individuals$geoid),
function(d) {
hh <- d[!duplicated(d$SERIALNO), ]
hh$size <- table(d$SERIALNO)[hh$SERIALNO]
size <- tapply(hh$size, factor(hh$status, c("OWNER", "RENTER")), mean)
size[is.na(size)] <- 0
cbind(
id = d$geoid[[1]],
as.data.frame(unlist(lapply(
names(vars_list), function(var) {
as.list(vapply(
unique(vars_list[[var]]),
function(l) sum((d[[var]] == l) * d$PWGTP), 0
))
}
), recursive = FALSE)),
avg_hhsize = mean(hh$size),
avg_hhsize_own = size[["OWNER"]],
avg_hhsize_rent = size[["RENTER"]]
)
}
))
ps <- cbind(
population$regions,
population$summary[population$regions$id, c("Total", vars_units[-1])]
)
ps$OBJECTID <- population$regions$id
ps$geoid <- map_parcel_to_bg[population$regions$id]
ps$Unit_Type <- c("MULTI", "SFD", "SFA")[ps$building_type]
ps$PUMA <- structure(
pums$crosswalk$PUMA5CE,
names = pums$crosswalk$GEOID
)[substring(ps$geoid, 1, 11)]
ps <- ps[!is.na(ps$PUMA), ]
ps$single_family_home <- ps$Unit_Type != "MULTI"
ps$type <- do.call(
paste0, ps[, c("PUMA", "single_family_home"), drop = TRUE]
)
vdata <- lapply(
structure(pumas_focal, names = pumas_focal),
function(puma) {
ps <- ps[ps$PUMA == puma, ]
su <- which(individuals$region %in% ps$id)
list(
map = map_bg_to_parcel,
individuals = individuals[sample(su, floor(length(su) * .3)), ],
parcels = ps,
source = bg[bg$id %in% ps$geoid, ]
)
}
)
ps <- do.call(rbind, lapply(vdata, "[[", "parcels"))
individuals <- do.call(rbind, lapply(vdata, "[[", "individuals"))
hh <- individuals[!duplicated(individuals$SERIALNO), ]
hh$single_family_home <- hh$building_type != "MULTI"
hh$size <- table(individuals$SERIALNO)[as.character(hh$SERIALNO)]
ps$Residents <- ps$Total * tapply(
hh$size, hh$single_family_home, mean
)[as.character(ps$single_family_home)]
vestimates <- list(
Prop = redistribute(
bg, ps, map_bg_to_parcel,
weight = "Total", source_id = "id", default_value = 0
),
"Prop Adj" = redistribute(
bg, ps, map_bg_to_parcel,
weight = "Residents", source_id = "id", default_value = 0
),
Baseline = apply_method(vdata, est_baseline, parcel_only = TRUE),
Rake = apply_method(vdata, est_rake, parcel_only = TRUE),
"Rake 2 Step" = apply_method(vdata, est_twostep, parcel_only = TRUE),
"Rake N Step" = apply_method(vdata, est_nstep, parcel_only = TRUE),
"Rake General" = apply_method(vdata, est_grake, parcel_only = TRUE)
)
actual <- population$summary[ps$id, ]
verror <- do.call(rbind, lapply(
structure(pvars, names = pvars),
function(var) {
colMeans(abs(as.data.frame(
lapply(vestimates, function(d) d[[var]]),
check.names = FALSE
) - actual[[var]]))
}
))
verror <- rbind(verror, Average = colMeans(verror))
if (return_full) {
list(
data = data, source = bg, parcels = ps, map = map_bg_to_parcel,
estimates = vestimates, error = verror
)
} else {
verror
}
}
Now we can look at results from a single sample. These are not comparable to the other results because (a) they are simulated independently from the real samples (so, for instance, the race category baselines are randomized), and (b) error is measured at the parcel level, rather than the block group level.
estimates_sim <- draw_sample(return_full = TRUE)
kable(estimates_sim$error, digits = 5)
| | Prop | Prop Adj | Baseline | Rake | Rake 2 Step | Rake N Step | Rake General |
|---|---|---|---|---|---|---|---|
| sex_female | 4.82773 | 4.82818 | 5.05596 | 5.05591 | 5.05543 | 5.05565 | 5.05508 |
| sex_male | 4.84018 | 4.84059 | 5.07137 | 5.07147 | 5.07198 | 5.07178 | 5.07291 |
| race_black | 3.51751 | 3.53062 | 3.69505 | 3.69516 | 3.69521 | 3.69553 | 3.69500 |
| race_white | 0.30985 | 0.31093 | 0.32178 | 0.32178 | 0.32177 | 0.32177 | 0.32180 |
| race_other | 1.75416 | 1.76040 | 1.82061 | 1.82068 | 1.82079 | 1.82087 | 1.82020 |
| race_asian | 1.43407 | 1.43888 | 1.48769 | 1.48782 | 1.48750 | 1.48695 | 1.48766 |
| race_native | 3.43651 | 3.44780 | 3.61038 | 3.61049 | 3.61078 | 3.61106 | 3.61119 |
| income_lt50 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 |
| income_gt200 | 9.32742 | 9.32830 | 9.76866 | 9.76860 | 9.76812 | 9.76814 | 9.77088 |
| income_100_200 | 0.39683 | 0.39691 | 0.39670 | 0.39670 | 0.39670 | 0.39670 | 0.39670 |
| income_50_100 | 0.00035 | 0.00035 | 0.00035 | 0.00035 | 0.00035 | 0.00035 | 0.00035 |
| Average | 2.71315 | 2.71663 | 2.83896 | 2.83900 | 2.83897 | 2.83898 | 2.83925 |