Built with R 4.3.1
This article compares different means of estimating zipcode-level household values from a block group-level source in Fairfax, Virginia.
# directory to save data in
base_dir <- "../household_estimates"
dir.create(base_dir, showWarnings = FALSE)
Direct
We can start by directly redistributing block group values to zipcodes based on their geometries.
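For intuition, geometric redistribution amounts to splitting each source value across targets in proportion to overlapping area. Here is a minimal sketch of that idea with toy polygons (an illustration only, not the package's internals):
library(sf)
# one source rectangle holding 100 households, covered by two unit squares
source <- st_sf(value = 100, geometry = st_sfc(st_polygon(list(
rbind(c(0, 0), c(2, 0), c(2, 1), c(0, 1), c(0, 0))
))))
targets <- st_sf(id = c("a", "b"), geometry = st_sfc(
st_polygon(list(rbind(c(0, 0), c(1, 0), c(1, 1), c(0, 1), c(0, 0)))),
st_polygon(list(rbind(c(1, 0), c(2, 0), c(2, 1), c(1, 1), c(1, 0))))
))
# share of the source's area falling within each target
weights <- st_area(st_intersection(source, targets)) / st_area(source)
# each target covers half of the source, so each gets half of its value
data.frame(id = targets$id, value = source$value * as.numeric(weights))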
First, we need to get our source block group data (American Community Survey 5-year estimates, 2017-2021):
counties <- c("059", "600")
variables <- c(
total = "B25006_001",
race_white = "B25006_002",
race_black = "B25006_003",
race_native = "B25006_004",
race_asian = "B25006_005",
income_total = "B19001_001",
income_lt10 = "B19001_002",
income_10_15 = "B19001_003",
income_15_20 = "B19001_004",
income_20_25 = "B19001_005",
income_25_30 = "B19001_006",
income_30_35 = "B19001_007",
income_35_40 = "B19001_008",
income_40_45 = "B19001_009",
income_45_50 = "B19001_010",
income_50_60 = "B19001_011",
income_60_75 = "B19001_012",
income_75_100 = "B19001_013",
income_100_125 = "B19001_014",
income_125_150 = "B19001_015",
income_150_200 = "B19001_016",
income_gt200 = "B19001_017"
)
library(sf)
block_groups <- st_transform(tidycensus::get_acs(
"block group", variables,
year = 2021, output = "wide", cache_table = TRUE,
state = "51", county = counties, geometry = TRUE
)[, -2], "WGS84")
#> Getting data from the 2017-2021 5-year ACS
colnames(block_groups)[2] <- "Total"
block_groups$race_other <- block_groups$Total - rowSums(
block_groups[, grep("^race_.*E$", colnames(block_groups)), drop = TRUE]
)
colnames(block_groups) <- sub("E$", "", colnames(block_groups))
# make larger income groups
income_vars <- grep("income", colnames(block_groups), value = TRUE)
block_groups$income_lt50 <- rowSums(block_groups[, grep(
"_(?:lt1|[1-4][05]_).*\\d$", income_vars,
value = TRUE, perl = TRUE
), drop = TRUE])
block_groups$income_50_100 <- rowSums(block_groups[, grep(
"income_[5-7].*\\d$", income_vars,
value = TRUE
), drop = TRUE])
block_groups$income_100_200 <- rowSums(block_groups[, grep(
"_1\\d{2}.*\\d$", income_vars,
value = TRUE
), drop = TRUE])
income_vars_select <- c(
"income_lt50", "income_50_100", "income_100_200", "income_gt200"
)
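To verify that the patterns pick up the intended columns, we can list the original bins feeding each larger group:
# original income bins feeding each larger group
list(
lt50 = grep("_(?:lt1|[1-4][05]_).*\\d$", income_vars, value = TRUE, perl = TRUE),
`50_100` = grep("income_[5-7].*\\d$", income_vars, value = TRUE),
`100_200` = grep("_1\\d{2}.*\\d$", income_vars, value = TRUE)
)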
And our ultimate target zipcode geometries:
zipcode_file <- paste0(base_dir, "/zipcode_va_fairfax.rds")
if (!file.exists(zipcode_file)) {
zipcodes <- st_read(paste0(
"https://www.fairfaxcounty.gov/euclid/rest/services/IPLS/IPLSMap",
"/MapServer/3/query?where=1=1&outFields=*&outSR=4326&f=geojson"
))
saveRDS(
st_transform(zipcodes, "WGS84"), zipcode_file,
compress = "xz"
)
}
zipcodes <- readRDS(zipcode_file)
Now we can use the redistribute function to make zipcode-level estimates:
library(redistribute)
map_bg_to_zipcodes <- redistribute(
block_groups, zipcodes,
target_id = "ZIPCODE", return_map = TRUE
)
estimates_direct <- redistribute(
block_groups, zipcodes, map_bg_to_zipcodes,
target_id = "ZIPCODE"
)
Since total household estimates are included in the zipcode data (though from 2022), we can see how close our estimated totals come to them:
# Mean Absolute Error
mean(abs(estimates_direct$Total - zipcodes$HOUSEHOLDS))
#> [1] 836.1899
# Pearson's r
cor(estimates_direct$Total, zipcodes$HOUSEHOLDS)
#> [1] 0.9811777
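As an aside, since we will repeat these two checks throughout, a small helper (a hypothetical convenience, not part of the package) could reduce repetition:
# mean absolute error and Pearson's r against a reference vector
compare_estimates <- function(estimate, reference) c(
mae = mean(abs(estimate - reference), na.rm = TRUE),
r = cor(estimate, reference, use = "complete.obs")
)
compare_estimates(estimates_direct$Total, zipcodes$HOUSEHOLDS)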
We could also use these zipcode-level estimates as weights to improve our redistribution:
estimates_direct_weighted <- redistribute(
block_groups, zipcodes, map_bg_to_zipcodes,
target_id = "ZIPCODE", weight = "HOUSEHOLDS"
)
mean(abs(estimates_direct_weighted$Total - zipcodes$HOUSEHOLDS))
#> [1] 376.5102
cor(estimates_direct_weighted$Total, zipcodes$HOUSEHOLDS)
#> [1] 0.9871246
Parcels
To potentially improve these estimates, we can use parcel-level housing unit information to disaggregate from block groups down to parcels, then aggregate back up to zipcodes.
First, we need to get the parcel data with housing unit counts:
parcel_file <- paste0(base_dir, "/parcel_va_fairfax.rds")
if (!file.exists(parcel_file)) {
parcels <- st_read(paste0(
"https://opendata.arcgis.com/api/v3/datasets/",
"4f00b13df5a24cc19068bf356d3d1c45_1/downloads/data",
"?format=geojson&spatialRefId=4326"
))
saveRDS(
st_transform(parcels, "WGS84"), parcel_file,
compress = "xz"
)
}
parcels <- readRDS(parcel_file)
colnames(parcels)[colnames(parcels) == "CURRE_UNIT"] <- "Total"
parcels$Unit_Type <- "MULTI"
parcels$Unit_Type[parcels$HOUSI_UNIT_TYPE == "SF"] <- "SFD"
parcels$Unit_Type[parcels$HOUSI_UNIT_TYPE %in% c("TH", "MP", "DX")] <- "SFA"
parcels$PIN <- as.character(parcels$PIN)
parcels <- parcels[parcels$YEAR_BUILT <= 2021 & !duplicated(parcels$PIN), ]
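Before disaggregating, we can glance at the distribution of the recoded unit types, as a sanity check on the mapping above:
table(parcels$Unit_Type, useNA = "ifany")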
Then, we can disaggregate down from block groups:
# since not all block groups contain parcels, we'll need to fill some in
map_bg_to_parcel <- redistribute(
block_groups, parcels,
target_id = "PIN", return_map = TRUE, fill_targets = TRUE
)
rownames(parcels) <- parcels$PIN
mapped_pins <- names(unlist(unname(map_bg_to_parcel)))
parcels <- parcels[mapped_pins, ]
filled <- which(is.na(parcels$PIN))
filled_ids <- grep("^filled_", mapped_pins, value = TRUE)
parcels$Total[filled] <- 1
parcels$PIN[filled] <- filled_ids
parcels$Unit_Type[filled] <- "SFD"
parcels$geometry[filled] <- structure(
st_geometry(block_groups),
names = block_groups$GEOID
)[sub("^filled_", "", filled_ids)]
parcels$geoid <- structure(
rep(names(map_bg_to_parcel), vapply(map_bg_to_parcel, length, 0)),
names = names(unlist(unname(map_bg_to_parcel)))
)[parcels$PIN]
estimates_parcels <- redistribute(
block_groups, parcels, map_bg_to_parcel,
target_id = "PIN", weight = "Total", default_value = 0
)
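Because the disaggregation spreads each block group's values across its parcels, the parcel-level estimates should sum back to the block group totals; a quick conservation check:
# disaggregation should preserve the overall total
all.equal(sum(estimates_parcels$Total), sum(block_groups$Total))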
Since household estimates are also available at the parcel level, we can first see how close we got to those:
# download parcel-level household data
parcel_hh_file <- paste0(base_dir, "/parcel_hh_va_fairfax.rds")
if (!file.exists(parcel_hh_file)) {
parcels_hh <- st_read(paste0(
"https://opendata.arcgis.com/api/v3/datasets/",
"6b11da4a036049b89e656db6fe834621_0/downloads/data",
"?format=geojson&spatialRefId=4326"
))
saveRDS(
st_transform(parcels_hh, "WGS84"), parcel_hh_file,
compress = "xz"
)
}
parcels_hh <- readRDS(parcel_hh_file)
rownames(parcels_hh) <- parcels_hh$PIN
parcels_hh <- parcels_hh[parcels$PIN, ]
mean(abs(estimates_parcels$Total - parcels_hh$CURRE_HHLDS), na.rm = TRUE)
#> [1] 0.1749806
cor(estimates_parcels$Total, parcels_hh$CURRE_HHLDS, use = "complete.obs")
#> [1] 0.9648648
The total number of households estimated at the parcel level is slightly higher than the block group total, potentially in part due to the difference in reference periods:
totals <- c(
"Block Groups" = sum(block_groups$Total),
Parcels = sum(parcels_hh$CURRE_HHLDS, na.rm = TRUE)
)
data.frame(Households = c(totals, Difference = totals[[2]] - totals[[1]]))
#> Households
#> Block Groups 417763.000
#> Parcels 418679.413
#> Difference 916.413
Now we can aggregate from the parcel-level to zipcodes:
map_parcel_to_zip <- redistribute(
parcels, zipcodes,
source_id = "PIN", target_id = "ZIPCODE", return_map = TRUE
)
estimates_downup <- redistribute(
estimates_parcels, zipcodes, map_parcel_to_zip,
source_id = "id", target_id = "ZIPCODE", default_value = 0
)
mean(abs(estimates_downup$Total - zipcodes$HOUSEHOLDS))
#> [1] 322.4999
cor(estimates_downup$Total, zipcodes$HOUSEHOLDS)
#> [1] 0.9899511
Since the parcel-level household estimates differ very little from the unit counts we used as weights, the choice between them makes little difference in this case:
mean(abs(parcels$Total - parcels_hh$CURRE_HHLDS), na.rm = TRUE)
#> [1] 0.03345603
cor(parcels$Total, parcels_hh$CURRE_HHLDS, use = "complete.obs")
#> [1] 0.9759002
Without knowing anything about how the household estimates were made for these parcels and zipcodes, it is interesting to note that the aggregated parcel estimates line up nearly perfectly with the zipcode estimates:
estimates_downup_original <- redistribute(
parcels_hh, zipcodes, map_parcel_to_zip,
source_id = "PIN", target_id = "ZIPCODE", default_value = 0
)
mean(abs(estimates_downup_original$CURRE_HHLDS - zipcodes$HOUSEHOLDS))
#> [1] 1.115248
cor(estimates_downup_original$CURRE_HHLDS, zipcodes$HOUSEHOLDS)
#> [1] 0.9999999
Parcels + PUMS
Another thing we might try is to use associations between our variables of interest and housing variables, derived from microdata, to make different parcel-based estimates.
First, we need to download and prepare the microdata sample:
pums <- download_census_pums(
base_dir, "va", 2021,
one_year = FALSE, geoids = paste0("51", counties)
)
#> ℹ loading household sample: h.csv.xz
#> ℹ loading person sample: p.csv.xz
#> ℹ loading crosswalk 2010_Census_Tract_to_2010_PUMA.txt
#> ℹ subsetting to 9 of 982 PUM areas
pums$crosswalk$GEOID <- do.call(paste0, pums$crosswalk[, 1:3])
households <- pums$household
# prepare variables
## survey categories
households$race_cat <- paste0("race_", c(
"white", "black", "native", rep("other", 2), "asian", rep("other", 3)
))[as.numeric(households$HHLDRRAC1P)]
households$income_cat <- as.character(cut(
households$HINCP, c(-Inf, 50, 100, 200, Inf) * 1e3, income_vars_select,
right = FALSE
))
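# note: right = FALSE means each bin includes its lower bound,
# so an income of exactly 50,000 falls in income_50_100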
## unit categories
households <- households[!is.na(households$BLD) & !is.na(households$TEN), ]
households$building_type <- "MULTI"
households$building_type[households$BLD == "02"] <- "SFD"
households$building_type[households$BLD == "03"] <- "SFA"
Then we’ll set some things up to calculate estimates from microdata:
# define variables of interest
vars_house <- c(
ID = "SERIALNO", weight = "WGTP", type = "building_type",
income_cat = "income_cat", race_cat = "race_cat"
)
vars_units <- c(type = "Unit_Type")
## get their levels
return_vars <- c("income_cat", "race_cat")
vars_list <- lapply(vars_house[return_vars], function(var) unique(households[[var]]))
vars <- unlist(vars_list, use.names = FALSE)
# prepare datasets split into PUMAs
pumas_focal <- unique(households$PUMA)
data <- lapply(structure(pumas_focal, names = pumas_focal), function(puma) {
ids <- pums$crosswalk$GEOID[pums$crosswalk$PUMA5CE == puma]
list(
households = as.data.frame(households[households$PUMA == puma, vars_house]),
source = block_groups[
substring(block_groups$GEOID, 1, 11) %in% ids, c("GEOID", vars),
drop = TRUE
],
parcels = parcels[
substring(parcels$geoid, 1, 11) %in% ids, c("geoid", "PIN", "Total", vars_units),
drop = TRUE
],
map = map_bg_to_parcel
)
})
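As a quick structural check, each element of the split holds one PUMA's household sample:
vapply(data, function(set) nrow(set$households), 0)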
# function to apply each method to the data
apply_method <- function(data, method, rescale = FALSE) {
p <- do.call(rbind, lapply(data, function(set) {
set$source <- st_drop_geometry(set$source)
do.call(rbind, lapply(unique(set$source$GEOID), function(id) {
source <- set$source[set$source$GEOID == id, -1]
target <- set$parcels[if ("geography" %in% names(set)) {
set$parcels$geoid %in% set$map[[id]]
} else {
set$parcels$geoid == id
}, ]
if (nrow(target)) {
est <- if (sum(source)) method(source, target[, -1], set$households) else NULL
if (length(est)) {
if (rescale) {
totals <- colSums(est)
totals[!totals] <- 1
est <- sweep(est, 2, totals, "/") * rep(
as.numeric(source[, colnames(est)]),
each = nrow(est)
)
}
su <- !vars %in% colnames(est)
if (any(su)) est[, vars[su]] <- 0
cbind(target[, 1:2], est[as.character(target$PIN), vars])
} else {
est <- target[, 1:2]
est[, vars] <- 0
est
}
}
}))
}))
redistribute(
p[, -1], zipcodes, map_parcel_to_zip,
source_id = "PIN", target_id = "ZIPCODE", default_value = 0
)
}
# function to calculate estimates from weights
make_estimates <- function() {
do.call(rbind, lapply(unique(target$Unit_Type), function(type) {
d <- target[target$Unit_Type == type, c("PIN", "Total")]
nd <- nrow(d)
i <- households[
households$building_type == type, c("weight", return_vars)
]
weight_total <- sum(i$weight)
as.data.frame(do.call(cbind, lapply(return_vars, function(cat) {
props <- tapply(i$weight, i[[cat]], sum) / weight_total
props[vars_list[[cat]][!vars_list[[cat]] %in% names(props)]] <- 0
props[is.na(props)] <- 0
props <- props[vars_list[[cat]]]
matrix(
d$Total * rep(props, each = nd),
nd,
dimnames = list(d$PIN, names(props))
)
})))
}))
}
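To make the allocation step concrete, here is a toy version with made-up numbers: a parcel's unit count is spread across categories according to the weighted category shares among microdata households of the same building type.
# hypothetical weighted shares of income categories among SFD households
props <- c(income_lt50 = .6, income_50_100 = .3, income_100_200 = .1)
units <- 10 # units on a hypothetical SFD parcel
units * props # expected households in each category on that parcel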
Now we can make zipcode-level estimates from parcel-level estimates that are based on PUMS associations between target variables and housing-related variables.
method_pums <- function(source, target, households) {
households$weight <- households$WGTP
environment(make_estimates) <- environment()
make_estimates()
}
estimates_pums <- apply_method(data, method_pums)
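As an aside, the rescale argument of our apply_method helper would instead scale each block group's estimates back to its ACS values; we keep the unrescaled version here to see how far the microdata associations alone get us:
# rescaled variant: enforces the source (ACS) category totals per block group
estimates_pums_rescaled <- apply_method(data, method_pums, rescale = TRUE)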
Here we’re not estimating total households directly, but since the race categories partition all households, we can sum them to get totals to measure against the provided zipcode-level estimates:
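# sum the exhaustive race categories (same calculation as for the raked estimates below)
estimates_pums$Total <- rowSums(estimates_pums[
, grep("^race_", colnames(estimates_pums)),
drop = TRUE
])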
Parcels + Raked PUMS
In the previous example, we did not use the ACS summary information to make estimates.
We might incorporate those summaries by adjusting the initial PUMS weights to fit the totals within each block group:
library(anesrake)
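# rake_prep adjusts `totals` and `households` in the caller's frame
# (the expression is evaluated in parent.frame(), so the caller's copies change)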
rake_prep <- function() {
eval(expression({
for (var in names(totals)) {
households[[var]] <- as.factor(households[[var]])
l <- levels(households[[var]])
totals[[var]] <- totals[[var]][l]
su <- is.na(totals[[var]]) | totals[[var]] == 0
if (sum(su)) {
households <- households[!households[[var]] %in% l[su], ]
households[[var]] <- droplevels(households[[var]])
totals[[var]] <- totals[[var]][!su]
}
total <- sum(totals[[var]])
if (total) totals[[var]] <- totals[[var]] / total
}
totals <- Filter(length, totals)
}), parent.frame())
}
method_rake <- function(source, target, households) {
totals <- lapply(
structure(names(vars_list)[1:2], names = names(vars_list)[1:2]),
function(n) unlist(source[, vars_list[[n]]])
)
rake_prep()
households$building_type <- as.factor(households$building_type)
households$income_cat <- as.factor(households$income_cat)
capture.output(households$weight <- tryCatch(
anesrake(
totals, households, households$SERIALNO, households$WGTP,
pctlim = .001
)$weightvec[as.character(households$SERIALNO)],
error = function(e) {
warning(e$message)
households$WGTP
}
))
environment(make_estimates) <- environment()
make_estimates()
}
estimates_rake <- apply_method(data, method_rake)
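For intuition, raking (iterative proportional fitting) repeatedly rescales the weights so that each variable's weighted distribution matches its target in turn; a minimal standalone sketch with toy data (anesrake adds convergence checks and weight caps on top of this idea):
# four cells crossing two binary variables, starting from equal weights
w <- rep(1, 4)
A <- c("1", "1", "2", "2")
B <- c("1", "2", "1", "2")
target_A <- c(`1` = .7, `2` = .3)
target_B <- c(`1` = .4, `2` = .6)
for (i in 1:50) {
w <- w * (target_A * sum(w) / tapply(w, A, sum))[A]
w <- w * (target_B * sum(w) / tapply(w, B, sum))[B]
}
# weighted marginals now match the targets
round(rbind(A = tapply(w, A, sum), B = tapply(w, B, sum)) / sum(w), 3)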
Totals with the adjusted weights will be the same as before, since each parcel's full unit count is spread across category proportions that sum to 1:
estimates_rake$Total <- rowSums(estimates_rake[
, grep("^race_", colnames(estimates_rake)),
drop = TRUE
])
all.equal(estimates_rake$Total, estimates_pums$Total)
#> [1] TRUE
But the spread across categories will be a little different:
kable(data.frame(
mae = colMeans(abs(
estimates_pums[, vars, drop = TRUE] -
estimates_rake[, vars, drop = TRUE]
)),
cor = diag(cor(
estimates_pums[, vars, drop = TRUE],
estimates_rake[, vars, drop = TRUE]
))
), digits = 4)
| | mae | cor |
|---|---|---|
| income_100_200 | 158.2663 | 0.9913 |
| income_gt200 | 298.3226 | 0.9809 |
| income_50_100 | 146.4135 | 0.9812 |
| income_lt50 | 170.6139 | 0.9650 |
| race_white | 263.3993 | 0.9910 |
| race_asian | 178.5484 | 0.9530 |
| race_black | 145.5130 | 0.9499 |
| race_other | 103.7728 | 0.9554 |
| race_native | 5.2856 | 0.7985 |
These differences are much smaller when the estimates are rescaled, since rescaling to the block group totals suppresses much of the difference the raking makes.
Comparison
We’ll start by looking at the largest category variable (race: white) to compare the methods.
We can first check the totals between methods:
data <- cbind(
Direct = estimates_direct$race_white,
DirectHH = estimates_direct_weighted$race_white,
DownUp = estimates_downup$race_white,
Pums = estimates_pums$race_white,
Rake = estimates_rake$race_white
)
data.frame(
"Race: White" = c(Original = sum(block_groups$race_white), colSums(data)),
check.names = FALSE
)
#> Race: White
#> Original 261026.0
#> Direct 261026.0
#> DirectHH 261026.0
#> DownUp 261026.0
#> Pums 237758.9
#> Rake 240187.5
The direct and down-up methods recover the original totals here, while the PUMS-based estimates do not: the down-up route depends on parcels covering the block groups (which the filled-in targets ensure), whereas the PUMS-based methods allocate parcel unit counts according to microdata proportions, with no guarantee of recovering category-level totals.
We can also look at these estimates on a map:
library(leaflet)
mapdata <- cbind(rmapshaper::ms_simplify(zipcodes[, 2], keep_shapes = TRUE), data)
all_values <- unlist(data)
pal <- colorNumeric(scico::scico(255, palette = "lajolla"), all_values)
leaflet(
mapdata,
options = leafletOptions(attributionControl = FALSE)
) |>
addProviderTiles("CartoDB.Positron") |>
addScaleBar("bottomleft") |>
addControl("Race: White", "topright") |>
addLayersControl(
position = "topleft",
baseGroups = c(
"Direct", "Direct Weighted", "DownUp", "Pums", "Rake"
)
) |>
addLegend("bottomright", pal, all_values, opacity = 1) |>
addPolygons(
fillColor = ~ pal(Direct), fillOpacity = 1, weight = 1,
color = "#000", highlightOptions = highlightOptions(color = "#fff"),
group = "Direct", label = ~ paste(ZIPCODE, "White:", Direct)
) |>
addPolygons(
fillColor = ~ pal(DirectHH), fillOpacity = 1, weight = 1,
color = "#000", highlightOptions = highlightOptions(color = "#fff"),
group = "Direct Weighted", label = ~ paste(ZIPCODE, "White:", DirectHH)
) |>
addPolygons(
fillColor = ~ pal(DownUp), fillOpacity = 1, weight = 1,
color = "#000", highlightOptions = highlightOptions(color = "#fff"),
group = "DownUp", label = ~ paste(ZIPCODE, "White:", DownUp)
) |>
hideGroup("DownUp") |>
addPolygons(
fillColor = ~ pal(Pums), fillOpacity = 1, weight = 1,
color = "#000", highlightOptions = highlightOptions(color = "#fff"),
group = "Pums", label = ~ paste(ZIPCODE, "White:", Pums)
) |>
hideGroup("Pums") |>
addPolygons(
fillColor = ~ pal(Rake), fillOpacity = 1, weight = 1,
color = "#000", highlightOptions = highlightOptions(color = "#fff"),
group = "Rake", label = ~ paste(ZIPCODE, "White:", Rake)
) |>
hideGroup("Rake")
And look at the correlations between them:
kable(cor(data), digits = 5)
| | Direct | DirectHH | DownUp | Pums | Rake |
|---|---|---|---|---|---|
| Direct | 1.00000 | 0.99251 | 0.99078 | 0.94702 | 0.94990 |
| DirectHH | 0.99251 | 1.00000 | 0.99867 | 0.95922 | 0.96039 |
| DownUp | 0.99078 | 0.99867 | 1.00000 | 0.96154 | 0.96446 |
| Pums | 0.94702 | 0.95922 | 0.96154 | 1.00000 | 0.99104 |
| Rake | 0.94990 | 0.96039 | 0.96446 | 0.99104 | 1.00000 |
Finally, we might take a look at the smallest category variable (race: native):
data_native <- cbind(
Direct = estimates_direct$race_native,
DirectHH = estimates_direct_weighted$race_native,
DownUp = estimates_downup$race_native,
Pums = estimates_pums$race_native,
Rake = estimates_rake$race_native
)
data.frame(
"Race: Native" = c(Original = sum(block_groups$race_native), colSums(data_native)),
check.names = FALSE
)
#> Race: Native
#> Original 1132.0000
#> Direct 1132.0000
#> DirectHH 1132.0000
#> DownUp 1132.0000
#> Pums 761.7542
#> Rake 354.9479
kable(cor(data_native), digits = 5)
| | Direct | DirectHH | DownUp | Pums | Rake |
|---|---|---|---|---|---|
| Direct | 1.00000 | 0.96945 | 0.96333 | 0.60671 | 0.72698 |
| DirectHH | 0.96945 | 1.00000 | 0.99801 | 0.64103 | 0.74029 |
| DownUp | 0.96333 | 0.99801 | 1.00000 | 0.64300 | 0.73745 |
| Pums | 0.60671 | 0.64103 | 0.64300 | 1.00000 | 0.79848 |
| Rake | 0.72698 | 0.74029 | 0.73745 | 0.79848 | 1.00000 |