


This article compares different means of estimating zipcode-level household values from a block group-level source in Fairfax, Virginia.

# directory to save data in
base_dir <- "../household_estimates"
dir.create(base_dir, showWarnings = FALSE)

Direct

We can start by directly redistributing block group values to zipcodes based on their geometries.

First, we need to get our source block group data (American Community Survey 5-year estimates, 2017-2021):

counties <- c("059", "600")
variables <- c(
  total = "B25006_001",
  race_white = "B25006_002",
  race_black = "B25006_003",
  race_native = "B25006_004",
  race_asian = "B25006_005",
  income_total = "B19001_001",
  income_lt10 = "B19001_002",
  income_10_15 = "B19001_003",
  income_15_20 = "B19001_004",
  income_20_25 = "B19001_005",
  income_25_30 = "B19001_006",
  income_30_35 = "B19001_007",
  income_35_40 = "B19001_008",
  income_40_45 = "B19001_009",
  income_45_50 = "B19001_010",
  income_50_60 = "B19001_011",
  income_60_75 = "B19001_012",
  income_75_100 = "B19001_013",
  income_100_125 = "B19001_014",
  income_125_150 = "B19001_015",
  income_150_200 = "B19001_016",
  income_gt200 = "B19001_017"
)
library(sf) # for st_transform and sf subsetting below
block_groups <- st_transform(tidycensus::get_acs(
  "block group", variables,
  year = 2021, output = "wide", cache_table = TRUE,
  state = "51", county = counties, geometry = TRUE
)[, -2], "WGS84")
#> Getting data from the 2017-2021 5-year ACS
colnames(block_groups)[2] <- "Total"
block_groups$race_other <- block_groups$Total - rowSums(
  block_groups[, grep("^race_.*E$", colnames(block_groups)), drop = TRUE]
)
colnames(block_groups) <- sub("E$", "", colnames(block_groups))

# make larger income groups
income_vars <- grep("income", colnames(block_groups), value = TRUE)
## under $50k: income_lt10 through income_45_50
block_groups$income_lt50 <- rowSums(block_groups[, grep(
  "_(?:lt1|[1-4][05]_).*\\d$", income_vars,
  value = TRUE, perl = TRUE
), drop = TRUE])
## $50k to $100k: income_50_60 through income_75_100
block_groups$income_50_100 <- rowSums(block_groups[, grep(
  "income_[5-7].*\\d$", income_vars,
  value = TRUE
), drop = TRUE])
## $100k to $200k: income_100_125 through income_150_200
block_groups$income_100_200 <- rowSums(block_groups[, grep(
  "_1\\d{2}.*\\d$", income_vars,
  value = TRUE
), drop = TRUE])
income_vars_select <- c(
  "income_lt50", "income_50_100", "income_100_200", "income_gt200"
)
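
As a quick sanity check (our own addition, not part of the original workflow), the four collapsed groups should sum back to the reported income totals:

# the collapsed groups should recover the original totals (returns TRUE)
all.equal(
  unname(rowSums(block_groups[, income_vars_select, drop = TRUE])),
  block_groups$income_total
)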

And our ultimate target zipcode geometries:

zipcode_file <- paste0(base_dir, "/zipcode_va_fairfax.rds")
if (!file.exists(zipcode_file)) {
  zipcodes <- st_read(paste0(
    "https://www.fairfaxcounty.gov/euclid/rest/services/IPLS/IPLSMap",
    "/MapServer/3/query?where=1=1&outFields=*&outSR=4326&f=geojson"
  ))
  saveRDS(
    st_transform(zipcodes, "WGS84"), zipcode_file,
    compress = "xz"
  )
}
zipcodes <- readRDS(zipcode_file)

Now we can use the redistribute function to make zipcode-level estimates. We first build a reusable source-to-target map, then apply it:

library(redistribute)
map_bg_to_zipcodes <- redistribute(
  block_groups, zipcodes,
  target_id = "ZIPCODE", return_map = TRUE
)
estimates_direct <- redistribute(
  block_groups, zipcodes, map_bg_to_zipcodes,
  target_id = "ZIPCODE"
)
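
As the later parcel code relies on, the returned map is a list keyed by source block group GEOID, with each element a named vector whose names are the overlapped target IDs. We can peek at its structure:

# inspect the first two map entries; element names are zipcode IDs
str(map_bg_to_zipcodes[1:2])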

Since total household estimates are included in the zipcode data (though from 2022), we can see how close our estimated totals come to them:

# Mean Absolute Error
mean(abs(estimates_direct$Total - zipcodes$HOUSEHOLDS))
#> [1] 836.1899

# Pearson's r
cor(estimates_direct$Total, zipcodes$HOUSEHOLDS)
#> [1] 0.9811777
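
Since we'll repeat this pair of checks for each method, a small convenience wrapper (our own, not part of redistribute) can bundle them; we keep the explicit calls below so each step stays visible:

# bundle the two accuracy checks used throughout: mean absolute
# error and Pearson's r against the zipcode household counts
accuracy_check <- function(est) {
  c(
    mae = mean(abs(est - zipcodes$HOUSEHOLDS)),
    r = cor(est, zipcodes$HOUSEHOLDS)
  )
}
accuracy_check(estimates_direct$Total)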

We could also use the zipcode-level household counts as weights to improve our redistribution, splitting block group values among overlapping zipcodes in proportion to households rather than geometric overlap alone:

estimates_direct_weighted <- redistribute(
  block_groups, zipcodes, map_bg_to_zipcodes,
  target_id = "ZIPCODE", weight = "HOUSEHOLDS"
)
mean(abs(estimates_direct_weighted$Total - zipcodes$HOUSEHOLDS))
#> [1] 376.5102
cor(estimates_direct_weighted$Total, zipcodes$HOUSEHOLDS)
#> [1] 0.9871246

Parcels

To potentially improve these estimates, we can use parcel-level housing unit information to disaggregate from block groups down to parcels, then aggregate back up to zipcodes.

First, we need to get the parcel data with housing unit counts:

parcel_file <- paste0(base_dir, "/parcel_va_fairfax.rds")
if (!file.exists(parcel_file)) {
  parcels <- st_read(paste0(
    "https://opendata.arcgis.com/api/v3/datasets/",
    "4f00b13df5a24cc19068bf356d3d1c45_1/downloads/data",
    "?format=geojson&spatialRefId=4326"
  ))
  saveRDS(
    st_transform(parcels, "WGS84"), parcel_file,
    compress = "xz"
  )
}
parcels <- readRDS(parcel_file)

# rename the unit-count column, and collapse unit types to match the
# PUMS-based categories defined later: single-family detached (SFD),
# single-family attached (SFA), and multi-unit (MULTI)
colnames(parcels)[colnames(parcels) == "CURRE_UNIT"] <- "Total"
parcels$Unit_Type <- "MULTI"
parcels$Unit_Type[parcels$HOUSI_UNIT_TYPE == "SF"] <- "SFD"
parcels$Unit_Type[parcels$HOUSI_UNIT_TYPE %in% c("TH", "MP", "DX")] <- "SFA"

# keep unique parcels built by 2021, to align with the ACS period
parcels$PIN <- as.character(parcels$PIN)
parcels <- parcels[parcels$YEAR_BUILT <= 2021 & !duplicated(parcels$PIN), ]

Then, we can disaggregate down from block groups:

# since not all block groups contain parcels, we'll need to fill some in
map_bg_to_parcel <- redistribute(
  block_groups, parcels,
  target_id = "PIN", return_map = TRUE, fill_targets = TRUE
)
rownames(parcels) <- parcels$PIN
# align parcel rows with the map's target IDs; "filled_*" entries are
# placeholders for block groups that contain no parcels
mapped_pins <- names(unlist(unname(map_bg_to_parcel)))
parcels <- parcels[mapped_pins, ]
filled <- which(is.na(parcels$PIN))
filled_ids <- grep("^filled_", mapped_pins, value = TRUE)
# give each placeholder a single unit, a default type, and the geometry
# of its block group
parcels$Total[filled] <- 1
parcels$PIN[filled] <- filled_ids
parcels$Unit_Type[filled] <- "SFD"
parcels$geometry[filled] <- structure(
  st_geometry(block_groups),
  names = block_groups$GEOID
)[sub("^filled_", "", filled_ids)]

# record each parcel's source block group GEOID
parcels$geoid <- structure(
  rep(names(map_bg_to_parcel), vapply(map_bg_to_parcel, length, 0)),
  names = names(unlist(unname(map_bg_to_parcel)))
)[parcels$PIN]

estimates_parcels <- redistribute(
  block_groups, parcels, map_bg_to_parcel,
  target_id = "PIN", weight = "Total", default_value = 0
)

Since household estimates are also available at the parcel level, we can first see how close we got to those:

# download parcel-level household data
parcel_hh_file <- paste0(base_dir, "/parcel_hh_va_fairfax.rds")
if (!file.exists(parcel_hh_file)) {
  parcels_hh <- st_read(paste0(
    "https://opendata.arcgis.com/api/v3/datasets/",
    "6b11da4a036049b89e656db6fe834621_0/downloads/data",
    "?format=geojson&spatialRefId=4326"
  ))
  saveRDS(
    st_transform(parcels_hh, "WGS84"), parcel_hh_file,
    compress = "xz"
  )
}
parcels_hh <- readRDS(parcel_hh_file)
rownames(parcels_hh) <- parcels_hh$PIN
parcels_hh <- parcels_hh[parcels$PIN, ]

mean(abs(estimates_parcels$Total - parcels_hh$CURRE_HHLDS), na.rm = TRUE)
#> [1] 0.1749806
cor(estimates_parcels$Total, parcels_hh$CURRE_HHLDS, use = "complete.obs")
#> [1] 0.9648648

The total number of households in the parcel-level data is slightly higher than in the block group data, potentially in part due to the difference in vintage:

totals <- c(
  "Block Groups" = sum(block_groups$Total),
  Parcels = sum(parcels_hh$CURRE_HHLDS, na.rm = TRUE)
)
data.frame(Households = c(totals, Difference = totals[[2]] - totals[[1]]))
#>              Households
#> Block Groups 417763.000
#> Parcels      418679.413
#> Difference      916.413

Now we can aggregate from the parcel level up to zipcodes:

map_parcel_to_zip <- redistribute(
  parcels, zipcodes,
  source_id = "PIN", target_id = "ZIPCODE", return_map = TRUE
)
estimates_downup <- redistribute(
  estimates_parcels, zipcodes, map_parcel_to_zip,
  source_id = "id", target_id = "ZIPCODE", default_value = 0
)

mean(abs(estimates_downup$Total - zipcodes$HOUSEHOLDS))
#> [1] 322.4999
cor(estimates_downup$Total, zipcodes$HOUSEHOLDS)
#> [1] 0.9899511

Since the parcel-level household estimates differ very little from the unit counts we used as weights, the choice between them makes little difference in this case:

mean(abs(parcels$Total - parcels_hh$CURRE_HHLDS), na.rm = TRUE)
#> [1] 0.03345603
cor(parcels$Total, parcels_hh$CURRE_HHLDS, use = "complete.obs")
#> [1] 0.9759002

We do not know how the household estimates were made for these parcels and zipcodes, but it is notable that the aggregated parcel estimates line up almost perfectly with the zipcode estimates:

estimates_downup_original <- redistribute(
  parcels_hh, zipcodes, map_parcel_to_zip,
  source_id = "PIN", target_id = "ZIPCODE", default_value = 0
)

mean(abs(estimates_downup_original$CURRE_HHLDS - zipcodes$HOUSEHOLDS))
#> [1] 1.115248
cor(estimates_downup_original$CURRE_HHLDS, zipcodes$HOUSEHOLDS)
#> [1] 0.9999999

Parcels + PUMS

Another approach is to derive associations between our variables of interest and housing variables from microdata, then use those associations to make a different set of parcel-based estimates.

First, we need to download and prepare the microdata sample:

pums <- download_census_pums(
  base_dir, "va", 2021,
  one_year = FALSE, geoids = paste0("51", counties)
)
#>  loading household sample: h.csv.xz
#>  loading person sample: p.csv.xz
#>  loading crosswalk 2010_Census_Tract_to_2010_PUMA.txt
#>  subsetting to 9 of 982 PUM areas
# combine state, county, and tract codes into full tract GEOIDs
pums$crosswalk$GEOID <- do.call(paste0, pums$crosswalk[, 1:3])
households <- pums$household

# prepare variables

## survey categories (collapse householder race codes to four groups)
households$race_cat <- paste0("race_", c(
  "white", "black", "native", rep("other", 2), "asian", rep("other", 3)
))[as.numeric(households$HHLDRRAC1P)]

households$income_cat <- as.character(cut(
  households$HINCP, c(-Inf, 50, 100, 200, Inf) * 1e3, income_vars_select,
  right = FALSE
))

## unit categories (BLD: 02 = single-family detached, 03 = attached)
households <- households[!is.na(households$BLD) & !is.na(households$TEN), ]
households$building_type <- "MULTI"
households$building_type[households$BLD == "02"] <- "SFD"
households$building_type[households$BLD == "03"] <- "SFA"

Then we’ll set some things up to calculate estimates from the microdata:

# define variables of interest
vars_house <- c(
  ID = "SERIALNO", weight = "WGTP", type = "building_type",
  income_cat = "income_cat", race_cat = "race_cat"
)
vars_units <- c(type = "Unit_Type")

## get their levels
return_vars <- c("income_cat", "race_cat")
vars_list <- lapply(vars_house[return_vars], function(var) unique(households[[var]]))
vars <- unlist(vars_list, use.names = FALSE)

# prepare datasets split into PUMAs
pumas_focal <- unique(households$PUMA)
data <- lapply(structure(pumas_focal, names = pumas_focal), function(puma) {
  ids <- pums$crosswalk$GEOID[pums$crosswalk$PUMA5CE == puma]
  list(
    households = as.data.frame(households[households$PUMA == puma, vars_house]),
    source = block_groups[
      substring(block_groups$GEOID, 1, 11) %in% ids, c("GEOID", vars),
      drop = TRUE
    ],
    parcels = parcels[
      substring(parcels$geoid, 1, 11) %in% ids, c("geoid", "PIN", "Total", vars_units),
      drop = TRUE
    ],
    map = map_bg_to_parcel
  )
})
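
Each element of data bundles one PUMA's microdata households, source block groups, parcels, and the block group-to-parcel map; we can check the structure of the first element:

# top-level structure of one PUMA's prepared data
str(data[[1]], max.level = 1)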

# function to apply an estimation method within each block group's parcels,
# optionally rescale the estimates to the block group's totals, and
# aggregate the parcel-level results up to zipcodes
apply_method <- function(data, method, rescale = FALSE) {
  p <- do.call(rbind, lapply(data, function(set) {
    set$source <- st_drop_geometry(set$source)
    do.call(rbind, lapply(unique(set$source$GEOID), function(id) {
      source <- set$source[set$source$GEOID == id, -1]
      target <- set$parcels[if ("geography" %in% names(set)) {
        set$parcels$geoid %in% set$map[[id]]
      } else {
        set$parcels$geoid == id
      }, ]
      if (nrow(target)) {
        est <- if (sum(source)) method(source, target[, -1], set$households) else NULL
        if (length(est)) {
          if (rescale) {
            totals <- colSums(est)
            totals[!totals] <- 1
            est <- sweep(est, 2, totals, "/") * rep(
              as.numeric(source[, colnames(est)]),
              each = nrow(est)
            )
          }
          su <- !vars %in% colnames(est)
          if (any(su)) est[, vars[su]] <- 0
          cbind(target[, 1:2], est[as.character(target$PIN), vars])
        } else {
          est <- target[, 1:2]
          est[, vars] <- 0
          est
        }
      }
    }))
  }))
  redistribute(
    p[, -1], zipcodes, map_parcel_to_zip,
    source_id = "PIN", target_id = "ZIPCODE", default_value = 0
  )
}

# function to calculate estimates from weights: spreads each parcel's unit
# count across categories using weighted category proportions among
# same-type microdata households
make_estimates <- function() {
  do.call(rbind, lapply(unique(target$Unit_Type), function(type) {
    d <- target[target$Unit_Type == type, c("PIN", "Total")]
    nd <- nrow(d)
    i <- households[
      households$building_type == type, c("weight", return_vars)
    ]
    weight_total <- sum(i$weight)
    as.data.frame(do.call(cbind, lapply(return_vars, function(cat) {
      props <- tapply(i$weight, i[[cat]], sum) / weight_total
      props[vars_list[[cat]][!vars_list[[cat]] %in% names(props)]] <- 0
      props[is.na(props)] <- 0
      props <- props[vars_list[[cat]]]
      matrix(
        d$Total * rep(props, each = nd),
        nd,
        dimnames = list(d$PIN, names(props))
      )
    })))
  }))
}
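
For intuition, here is a toy version of that allocation with made-up shares: a parcel's unit count is multiplied by the weighted category shares for its unit type.

# toy illustration (hypothetical shares): a 10-unit parcel of one type
# gets its units spread as 10 * shares, i.e. 6, 3, and 1 households
shares <- c(income_lt50 = .6, income_50_100 = .3, income_gt200 = .1)
10 * shares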

Now we can make zipcode-level estimates from parcel-level estimates that are based on PUMS associations between target variables and housing-related variables.

method_pums <- function(source, target, households) {
  households$weight <- households$WGTP
  environment(make_estimates) <- environment()
  make_estimates()
}
estimates_pums <- apply_method(data, method_pums)

Here we’re not estimating total households directly, but we can calculate totals from each set of variables to measure against the provided zipcode-level counts:

estimates_pums$Total <- rowSums(estimates_pums[
  , grep("^race_", colnames(estimates_pums)),
  drop = TRUE
])

mean(abs(estimates_pums$Total - zipcodes$HOUSEHOLDS))
#> [1] 552.8591
cor(estimates_pums$Total, zipcodes$HOUSEHOLDS)
#> [1] 0.9678753

Parcels + Raked PUMS

In the previous example, we did not use the block group-level ACS summaries when making estimates; the PUMS proportions were applied uniformly within each PUMA.

We might incorporate those summaries by adjusting the initial PUMS weights to fit the totals within each block group:

library(anesrake)
# align category totals with available factor levels, dropping empty levels
# and normalizing to proportions; evaluated in the caller's environment so
# it can modify `households` and `totals` in place
rake_prep <- function() {
  eval(expression({
    for (var in names(totals)) {
      households[[var]] <- as.factor(households[[var]])
      l <- levels(households[[var]])
      totals[[var]] <- totals[[var]][l]
      su <- is.na(totals[[var]]) | totals[[var]] == 0
      if (sum(su)) {
        households <- households[!households[[var]] %in% l[su], ]
        households[[var]] <- droplevels(households[[var]])
        totals[[var]] <- totals[[var]][!su]
      }
      total <- sum(totals[[var]])
      if (total) totals[[var]] <- totals[[var]] / total
    }
    totals <- Filter(length, totals)
  }), parent.frame())
}
method_rake <- function(source, target, households) {
  totals <- lapply(
    structure(names(vars_list)[1:2], names = names(vars_list)[1:2]),
    function(n) unlist(source[, vars_list[[n]]])
  )
  rake_prep()
  households$building_type <- as.factor(households$building_type)
  households$income_cat <- as.factor(households$income_cat)
  capture.output(households$weight <- tryCatch(
    anesrake(
      totals, households, households$SERIALNO, households$WGTP,
      pctlim = .001
    )$weightvec[as.character(households$SERIALNO)],
    error = function(e) {
      warning(e$message)
      households$WGTP
    }
  ))
  environment(make_estimates) <- environment()
  make_estimates()
}
estimates_rake <- apply_method(data, method_rake)

Totals with the adjusted weights will be the same as before, since the category proportions used in make_estimates sum to 1 within each unit type, so the implied totals depend only on the parcel unit counts:

estimates_rake$Total <- rowSums(estimates_rake[
  , grep("^race_", colnames(estimates_rake)),
  drop = TRUE
])
all.equal(estimates_rake$Total, estimates_pums$Total)
#> [1] TRUE

But the spread across categories will be a little different:

knitr::kable(data.frame(
  mae = colMeans(abs(
    estimates_pums[, vars, drop = TRUE] -
      estimates_rake[, vars, drop = TRUE]
  )),
  cor = diag(cor(
    estimates_pums[, vars, drop = TRUE],
    estimates_rake[, vars, drop = TRUE]
  ))
), digits = 4)
                     mae    cor
income_100_200  158.2663 0.9913
income_gt200    298.3226 0.9809
income_50_100   146.4135 0.9812
income_lt50     170.6139 0.9650
race_white      263.3993 0.9910
race_asian      178.5484 0.9530
race_black      145.5130 0.9499
race_other      103.7728 0.9554
race_native       5.2856 0.7985

These differences shrink considerably when the estimates are rescaled, as rescaling to block group totals suppresses much of the adjustment that raking makes.
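
For instance, we could rerun both methods with rescaling enabled and repeat the comparison (a sketch; outputs not shown here):

# rescale each block group's estimates to its source totals, then compare
estimates_pums_rescaled <- apply_method(data, method_pums, rescale = TRUE)
estimates_rake_rescaled <- apply_method(data, method_rake, rescale = TRUE)
colMeans(abs(
  estimates_pums_rescaled[, vars, drop = TRUE] -
    estimates_rake_rescaled[, vars, drop = TRUE]
))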

Comparison

We’ll start by looking at the largest category variable (race: white) to compare the methods.

We can first check the totals between methods:

data <- cbind(
  Direct = estimates_direct$race_white,
  DirectHH = estimates_direct_weighted$race_white,
  DownUp = estimates_downup$race_white,
  Pums = estimates_pums$race_white,
  Rake = estimates_rake$race_white
)
data.frame(
  "Race: White" = c(Original = sum(block_groups$race_white), colSums(data)),
  check.names = FALSE
)
#>          Race: White
#> Original    261026.0
#> Direct      261026.0
#> DirectHH    261026.0
#> DownUp      261026.0
#> Pums        237758.9
#> Rake        240187.5

The direct and down-up methods recover the original totals (the down-up method only because the filled parcels fully cover the block groups), while the PUMS-based estimates are not guaranteed to recover category-level totals, since they come from weighted microdata shares applied to parcel unit counts.

We can also look at these estimates on a map:

library(leaflet)
mapdata <- cbind(rmapshaper::ms_simplify(zipcodes[, 2], keep_shapes = TRUE), data)
all_values <- unlist(data)
pal <- colorNumeric(scico::scico(255, palette = "lajolla"), all_values)
leaflet(
  mapdata,
  options = leafletOptions(attributionControl = FALSE)
) |>
  addProviderTiles("CartoDB.Positron") |>
  addScaleBar("bottomleft") |>
  addControl("Race: White", "topright") |>
  addLayersControl(
    position = "topleft",
    baseGroups = c(
      "Direct", "Direct Weighted", "DownUp", "Pums", "Rake"
    )
  ) |>
  addLegend("bottomright", pal, all_values, opacity = 1) |>
  addPolygons(
    fillColor = ~ pal(Direct), fillOpacity = 1, weight = 1,
    color = "#000", highlightOptions = highlightOptions(color = "#fff"),
    group = "Direct", label = ~ paste(ZIPCODE, "White:", Direct)
  ) |>
  addPolygons(
    fillColor = ~ pal(DirectHH), fillOpacity = 1, weight = 1,
    color = "#000", highlightOptions = highlightOptions(color = "#fff"),
    group = "Direct Weighted", label = ~ paste(ZIPCODE, "White:", DirectHH)
  ) |>
  addPolygons(
    fillColor = ~ pal(DownUp), fillOpacity = 1, weight = 1,
    color = "#000", highlightOptions = highlightOptions(color = "#fff"),
    group = "DownUp", label = ~ paste(ZIPCODE, "White:", DownUp)
  ) |>
  hideGroup("DownUp") |>
  addPolygons(
    fillColor = ~ pal(Pums), fillOpacity = 1, weight = 1,
    color = "#000", highlightOptions = highlightOptions(color = "#fff"),
    group = "Pums", label = ~ paste(ZIPCODE, "White:", Pums)
  ) |>
  hideGroup("Pums") |>
  addPolygons(
    fillColor = ~ pal(Rake), fillOpacity = 1, weight = 1,
    color = "#000", highlightOptions = highlightOptions(color = "#fff"),
    group = "Rake", label = ~ paste(ZIPCODE, "White:", Rake)
  ) |>
  hideGroup("Rake")

And look at the correlations between them:

knitr::kable(cor(data), digits = 5)
           Direct DirectHH  DownUp    Pums    Rake
Direct    1.00000  0.99251 0.99078 0.94702 0.94990
DirectHH  0.99251  1.00000 0.99867 0.95922 0.96039
DownUp    0.99078  0.99867 1.00000 0.96154 0.96446
Pums      0.94702  0.95922 0.96154 1.00000 0.99104
Rake      0.94990  0.96039 0.96446 0.99104 1.00000

Finally, we might take a look at the smallest category variable (race: native):

data_native <- cbind(
  Direct = estimates_direct$race_native,
  DirectHH = estimates_direct_weighted$race_native,
  DownUp = estimates_downup$race_native,
  Pums = estimates_pums$race_native,
  Rake = estimates_rake$race_native
)
data.frame(
  "Race: Native" = c(Original = sum(block_groups$race_native), colSums(data_native)),
  check.names = FALSE
)
#>          Race: Native
#> Original    1132.0000
#> Direct      1132.0000
#> DirectHH    1132.0000
#> DownUp      1132.0000
#> Pums         761.7542
#> Rake         354.9479
knitr::kable(cor(data_native), digits = 5)
           Direct DirectHH  DownUp    Pums    Rake
Direct    1.00000  0.96945 0.96333 0.60671 0.72698
DirectHH  0.96945  1.00000 0.99801 0.64103 0.74029
DownUp    0.96333  0.99801 1.00000 0.64300 0.73745
Pums      0.60671  0.64103 0.64300 1.00000 0.79848
Rake      0.72698  0.74029 0.73745 0.79848 1.00000