Built with R 4.3.1
Redistribution means rearranging values such that totals remain the same, and such that the original distribution can be recovered by re-aggregating from a perfectly contained disaggregate.
This is desirable if your goal is simply to display the same data in different forms.
If, however, your goal is to estimate values at scales you have no observations of, it may be better to combine observations from different scales and use them to produce new estimates.
Estimation opens up many more methods to consider, and validating their results is less straightforward, so this article will walk through a few estimation methods and ways to validate them.
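For a concrete sense of the distinction, proportional redistribution splits a total into contained parts that always re-aggregate back to that same total; a minimal sketch with made-up numbers:
# redistribute a source total of 100 into 3 contained targets by weight
total <- 100
weights <- c(a = 1, b = 3, c = 6)
parts <- total * weights / sum(weights)
parts
#>  a  b  c 
#> 10 30 60
# re-aggregating recovers the original total exactly
sum(parts)
#> [1] 100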
Data
We’ll focus on a single county (Arlington, VA), with census block groups and the Public Use Microdata Sample (PUMS) as our primary source data.
# directory to save data in
base_dir <- "../estimate_comparison"
dir.create(base_dir, FALSE)
# geographies
library(redistribute)
library(catchment)
geography_bg <- download_census_shapes(base_dir, "va", "bg", year = 2021)
geography_bg <- geography_bg[grep("^51013", geography_bg$GEOID), ]
geography_tr <- download_census_shapes(base_dir, "va", "tr", year = 2021)
geography_tr <- geography_tr[grep("^51013", geography_tr$GEOID), ]
# ACS data
## block groups
block_groups <- tidycensus::get_acs(
year = 2021,
state = "51",
county = "013",
geography = "block group",
output = "wide",
variables = c(
total = "B01001_001",
race_white = "B02001_002",
race_black = "B02001_003",
race_native = "B02001_004",
race_asian = "B02001_005",
sex_male = "B01001_002",
sex_female = "B01001_026",
tenure_total = "B25003_001",
tenure_owner = "B25003_002",
tenure_renter = "B25003_003",
income_total = "B19001_001",
income_lt10 = "B19001_002",
income_10_15 = "B19001_003",
income_15_20 = "B19001_004",
income_20_25 = "B19001_005",
income_25_30 = "B19001_006",
income_30_35 = "B19001_007",
income_35_40 = "B19001_008",
income_40_45 = "B19001_009",
income_45_50 = "B19001_010",
income_50_60 = "B19001_011",
income_60_75 = "B19001_012",
income_75_100 = "B19001_013",
income_100_125 = "B19001_014",
income_125_150 = "B19001_015",
income_150_200 = "B19001_016",
income_gt200 = "B19001_017",
avg_hhsize = "B25010_001",
avg_hhsize_own = "B25010_002",
avg_hhsize_rent = "B25010_003"
)
)[, -2]
#> Getting data from the 2017-2021 5-year ACS
colnames(block_groups)[2] <- "Total"
block_groups$race_other <- block_groups$Total - rowSums(
block_groups[, grep("^race_.*E$", colnames(block_groups))]
)
colnames(block_groups) <- sub("E$", "", colnames(block_groups))
### impute missing average household sizes
block_groups$avg_hhsize[is.na(block_groups$avg_hhsize)] <- mean(
block_groups$avg_hhsize,
na.rm = TRUE
)
su <- is.na(block_groups$avg_hhsize_own)
block_groups$avg_hhsize_own[su] <- rowMeans(
block_groups[su, c("avg_hhsize", "avg_hhsize_rent")],
na.rm = TRUE
)
block_groups[is.na(block_groups)] <- 0
### make larger income groups
income_vars <- grep("income", colnames(block_groups), value = TRUE)
block_groups$income_lt50 <- rowSums(
block_groups[, grep("_(?:lt1|[1-4][05]_).*\\d$", income_vars, value = TRUE, perl = TRUE)]
)
block_groups$income_50_100 <- rowSums(
block_groups[, grep("income_[5-7].*\\d$", income_vars, value = TRUE)]
)
block_groups$income_100_200 <- rowSums(
block_groups[, grep("_1\\d{2}.*\\d$", income_vars, value = TRUE)]
)
income_vars_select <- c(
"income_lt50", "income_50_100", "income_100_200", "income_gt200"
)
library(sf)
rownames(geography_bg) <- geography_bg$GEOID
st_geometry(block_groups) <- st_geometry(geography_bg[block_groups$GEOID, ])
# parcel data
parcel_file <- paste0(base_dir, "/parcels.rds")
if (!file.exists(parcel_file)) {
parcels <- st_read(paste0(
"https://arlgis.arlingtonva.us/arcgis/rest/",
"services/Open_Data/od_MHUD_Polygons/",
"FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=json"
))
saveRDS(parcels, parcel_file, compress = "xz")
}
parcels <- readRDS(parcel_file)
colnames(parcels)[colnames(parcels) == "Total_Units"] <- "Total"
parcels$OBJECTID <- as.character(parcels$OBJECTID)
parcels <- parcels[parcels$Year_Built <= 2021, ]
Ultimately, we’ll want to use these methods to get values for unobserved geographies, but for this comparison of methods, we’ll want a way to test their results against known values.
For this test case, we can simply start at a higher geolevel and make estimates at the block group level, where we have known values to compare against. To do that, we first need to aggregate our block group data up to tracts:
tracts <- redistribute(block_groups, geography_tr)
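Since redistribution preserves totals, we can sanity-check the aggregation:
# the county-wide total should be identical at both geolevels
all.equal(sum(tracts$Total), sum(block_groups$Total))
#> [1] TRUE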
Redistribution
As a baseline, the first method to consider is proportional redistribution of the block group summary estimates alone.
Direct
We could do this from tracts to block groups directly:
# going from tracts to block groups with no information
estimate_tr_to_bg <- redistribute(
tracts, block_groups,
source_id = "id"
)
# mean absolute error between total population estimates
mean(abs(estimate_tr_to_bg$Total - block_groups$Total))
#> [1] 319.8405
Here we can see the error introduced when we only have proportional information at the lower level; block groups that once varied within a tract now all have the same value:
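One way to quantify this collapse in variation is to compare the average within-tract standard deviation of the original and estimated totals:
# within-tract variation of block group totals, before and after
tr_id <- substring(block_groups$GEOID, 1, 11)
c(
  original = mean(tapply(block_groups$Total, tr_id, sd), na.rm = TRUE),
  estimate = mean(tapply(estimate_tr_to_bg$Total, tr_id, sd), na.rm = TRUE)
)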
Down and Up
Or we could go from tracts down to parcels, then back up to block groups:
# function to apply tract to parcel to block group redistribution
redist_downup <- function(
top, bottom, middle, map_tb = NULL, map_bm = NULL,
weight = "Total", source_id = "id",
target_id = "OBJECTID", target_id_mid = "GEOID") {
to_bottom <- redistribute(
top, bottom, map_tb,
weight = weight, source_id = source_id, target_id = target_id,
default_value = 0
)
redistribute(
to_bottom, middle, map_bm,
source_id = "id", target_id = target_id_mid,
default_value = 0
)
}
# pre-make maps
## from block groups to parcels
map_bg_to_parcel <- redistribute(
block_groups, parcels,
target_id = "OBJECTID", return_map = TRUE, overlaps = "first"
)
parcels$geoid <- structure(
rep(names(map_bg_to_parcel), vapply(map_bg_to_parcel, length, 0)),
names = names(unlist(unname(map_bg_to_parcel)))
)[parcels$OBJECTID]
block_groups <- block_groups[block_groups$GEOID %in% parcels$geoid, ]
## from tracts to parcels
map_tr_to_parcel <- split(unlist(unname(map_bg_to_parcel)), rep(
substring(names(map_bg_to_parcel), 1, 11), vapply(map_bg_to_parcel, length, 0)
))
# check that down from and up to the same level works
all.equal(
as.data.frame(redist_downup(
block_groups, parcels, block_groups, map_bg_to_parcel, map_bg_to_parcel,
source_id = "GEOID"
)[, -1]),
as.data.frame(block_groups[, -1])
)
#> [1] TRUE
# redistribute
estimate_redist <- redist_downup(
tracts, parcels, block_groups, map_tr_to_parcel, map_bg_to_parcel
)
# mean absolute error between total population estimates
mean(abs(estimate_redist$Total - block_groups$Total))
#> [1] 231.1852
Now there is more of the original variation between block groups:
Because the parcel totals are living units rather than individuals, we may be able to refine our redistribution based on some heuristic estimates of the number of people per living unit:
# 2.5 for multi-unit parcels, 5.5 for single-unit parcels
parcels$Residents <- parcels$Total * (2.5 + (parcels$Total == 1) * 3)
# now see if this improved our redistribution
estimate_redist_adj <- redist_downup(
tracts, parcels, block_groups, map_tr_to_parcel, map_bg_to_parcel,
weight = "Residents"
)
# mean absolute error between total population estimates
mean(abs(estimate_redist_adj$Total - block_groups$Total))
#> [1] 194.0749
For a better sense of what this resident adjustment is doing, we can look at the difference made in a single tract:
# identify the most different tract between variants
tract <- substring(block_groups$GEOID[
which.max(abs(estimate_redist_adj$Total - estimate_redist$Total))
], 1, 11)
su <- substring(estimate_redist$id, 1, 11) == tract
tsu <- tracts$id == tract
psu <- substring(parcels$geoid, 1, 11) == tract
# recalculate parcel-level values within that tract
differences <- list(
parcels = redistribute(
tracts$Total[tsu], parcels[psu, ],
target_id = "OBJECTID", weight = "Residents", return_geometry = FALSE
)[, 2] - redistribute(
tracts$Total[tsu], parcels[psu, ],
target_id = "OBJECTID", weight = "Total", return_geometry = FALSE
)[, 2],
block_group = estimate_redist_adj$Total[su] - estimate_redist$Total[su]
)
Now we can look at the difference in the contained parcels and block groups:
In this case, the difference mostly comes down to two buildings with many units that get less relative weight with the resident values.
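Assuming the redistributed values come back in target order (matching `parcels[psu, ]`), we can also list the most affected parcels directly:
# parcels with the largest differences between weighting schemes
psub <- st_drop_geometry(parcels[psu, c("OBJECTID", "Total", "Residents")])
psub$difference <- differences$parcels
head(psub[order(abs(psub$difference), decreasing = TRUE), ], 2)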
PUMS + Raking
So far, we’ve used some additional information (total units) at the parcel level to improve the distribution of values across block groups, relative to a uniform distribution.
We have some more information at the parcel level, such as unit type and counts of renter- versus owner-occupied units, but to make use of this information, we need some association between source values and those features. This is where the census microdata can come in: the Public Use Microdata Sample (PUMS) consists of individual-level observations that include both demographic and housing features.
pums <- download_census_pums(base_dir, "va", 2021, one_year = FALSE)
#> ℹ loading household sample: h.csv.xz
#> ℹ loading person sample: p.csv.xz
#> ℹ loading crosswalk 2010_Census_Tract_to_2010_PUMA.txt
# prepare IDs
pums$crosswalk$GEOID <- do.call(paste0, pums$crosswalk[, 1:3])
pums$person$ID <- do.call(paste0, pums$person[, c("SERIALNO", "SPORDER")])
# prepare variables
## survey categories
pums$person$sex_cat <- c("sex_male", "sex_female")[as.numeric(pums$person$SEX)]
pums$person$race_cat <- paste0("race_", c(
"white", "black", "native", rep("other", 2), "asian", rep("other", 3)
))[as.numeric(pums$person$RAC1P)]
pums$household$income_cat <- as.character(cut(
pums$household$HINCP, c(-Inf, 50, 100, 200, Inf) * 1e3, income_vars_select,
right = FALSE
))
## unit categories
pums$household <- pums$household[
!is.na(pums$household$BLD) & !is.na(pums$household$TEN),
]
pums$household$building_type <- "MULTI"
pums$household$building_type[pums$household$BLD == "02"] <- "SFD"
pums$household$building_type[pums$household$BLD == "03"] <- "SFA"
pums$household$status <- "RENTER"
pums$household$status[pums$household$TEN %in% 1:2] <- "OWNER"
The PUMS data are only located down to Public Use Microdata Areas (PUMAs), which are made up of tracts, so this is where we can start for a baseline.
We will use associations between demographic and housing features within each PUMA to get probabilities at the parcel level, then redistribute our source values to them like before.
The first step to this end is to prepare our data so it is easy to associate the individual PUMS variables with their per-level summaries at the tract level:
# define variables of interest
vars_house <- c(
ID = "SERIALNO", weight = "WGTP", type = "building_type",
status = "status", income = "income_cat"
)
vars_person <- c(
ID = "ID", weight = "PWGTP", sex_cat = "sex_cat", race_cat = "race_cat"
)
vars_units <- c(
type = "Unit_Type", renters = "Renter_Occupied", owners = "Owner_Occupied"
)
## get their levels
vars_list <- c(
lapply(vars_person[-(1:2)], function(var) unique(pums$person[[var]])),
income_cat = list(unique(pums$household$income_cat))
)
return_vars <- names(vars_list)
pvars <- unlist(vars_list, use.names = FALSE)
vars <- c(pvars, "avg_hhsize_own", "avg_hhsize_rent")
# prepare datasets split into PUMAs
pumas_focal <- unique(pums$crosswalk$PUMA5CE[pums$crosswalk$GEOID %in% tracts$id])
data <- lapply(structure(pumas_focal, names = pumas_focal), function(puma) {
households <- as.data.frame(pums$household[pums$household$PUMA == puma, vars_house])
rownames(households) <- households$SERIALNO
ids <- pums$crosswalk$GEOID[
pums$crosswalk$PUMA5CE == puma & pums$crosswalk$GEOID %in% tracts$id
]
person <- pums$person[pums$person$SERIALNO %in% households$SERIALNO, ]
list(
individuals = cbind(households[person$SERIALNO, ], person),
source = tracts[tracts$id %in% ids, c("id", vars), drop = TRUE],
parcels = parcels[
substring(parcels$geoid, 1, 11) %in% ids, c("geoid", "OBJECTID", vars_units),
drop = TRUE
]
)
})
# function to apply each method to the data
apply_method <- function(data, method, parcel_only = FALSE, rescale = TRUE) {
p <- do.call(rbind, lapply(data, function(set) {
set$source <- st_drop_geometry(set$source)
do.call(rbind, lapply(unique(set$source$id), function(id) {
source <- set$source[set$source$id == id, -1]
target <- set$parcels[if ("map" %in% names(set)) {
if ("geography" %in% names(set)) {
set$parcels$geoid %in% set$map[[id]]
} else {
set$parcels$geoid == id
}
} else {
substring(set$parcels$geoid, 1, 11) == id
}, ]
if (nrow(target)) {
est <- if (sum(source)) method(source, target[, -1], set$individuals) else NULL
if (length(est)) {
if (rescale) {
totals <- colSums(est)
totals[!totals] <- 1
est <- sweep(est, 2, totals, "/") * rep(
as.numeric(source[, colnames(est)]),
each = nrow(est)
)
}
su <- !pvars %in% colnames(est)
if (any(su)) est[, pvars[su]] <- 0
cbind(target[, 1:2], est[as.character(target$OBJECTID), pvars])
} else {
est <- target[, 1:2]
est[, pvars] <- 0
est
}
}
}))
}))
if (parcel_only) {
p
} else {
list(parcels = p, block_groups = redistribute(
p[, -1], block_groups, map_bg_to_parcel,
source_id = "OBJECTID", target_id = "GEOID", default_value = 0
))
}
}
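The `rescale` step in `apply_method` above is a column-wise proportional fit: each estimate column is scaled so that it sums to the corresponding source total. In isolation, the operation looks like this (a conceptual restatement with a hypothetical helper name, not a function used later):
# scale each column of parcel-level estimates to match source totals
rescale_to_source <- function(est, source_totals) {
  totals <- colSums(est)
  totals[!totals] <- 1 # avoid dividing by zero for empty categories
  sweep(est, 2, source_totals / totals, "*")
}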
# function to calculate estimates from weights
make_estimates <- function() {
do.call(rbind, lapply(unique(target$Unit_Type), function(type) {
d <- target[
target$Unit_Type == type, c("OBJECTID", "Renter_Occupied", "Owner_Occupied")
]
d$Renter_Occupied <- d$Renter_Occupied * source$avg_hhsize_rent
d$Owner_Occupied <- d$Owner_Occupied * source$avg_hhsize_own
nd <- nrow(d)
colnames(d)[-1] <- c("RENTER", "OWNER")
i <- individuals[
individuals$building_type == type, c("weight", "status", return_vars)
]
as.data.frame(Reduce("+", lapply(
(if (is.factor(i$status)) levels else unique)(i$status),
function(s) {
ii <- i[i$status == s, ]
weight_total <- sum(ii$weight)
do.call(cbind, lapply(return_vars, function(cat) {
props <- tapply(ii$weight, ii[[cat]], sum) / weight_total
props[vars_list[[cat]][!vars_list[[cat]] %in% names(props)]] <- 0
props[is.na(props)] <- 0
props <- props[vars_list[[cat]]]
matrix(
d[[s]] * rep(props, each = nd),
nd,
dimnames = list(d$OBJECTID, names(props))
)
}))
}
)))
}))
}
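Note that `make_estimates` takes no arguments: each method instead assigns its own calling environment to the function (`environment(make_estimates) <- environment()`), so names like `source`, `target`, and `individuals` are looked up among the method’s local variables. A minimal sketch of this pattern (with hypothetical `f`, `g`, and `x`):
# `f` has no arguments; it reads `x` from whatever environment it is bound to
f <- function() x * 2
g <- function() {
  x <- 21
  environment(f) <- environment() # rebind a local copy of f to g's frame
  f()
}
g()
#> [1] 42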
Now we can apply each method to this same set of data.
Before getting to the new methods, we can also update our heuristic adjustment to the proportional redistribution method with PUMS information:
# make single-family-home indicators
pums$household$single_family_home <- pums$household$building_type != "MULTI"
parcels$single_family_home <- parcels$Unit_Type != "MULTI"
# update resident estimates with average household size for
# single family versus other home types within the county
pums$household$size <- table(pums$person$SERIALNO)[pums$household$SERIALNO]
households <- pums$household[pums$household$PUMA %in% pumas_focal, ]
parcels$Residents <- parcels$Total * tapply(
households$size, households$single_family_home, mean
)[as.character(parcels$single_family_home)]
# recalculate the adjusted redistribution results
estimate_redist_adj <- redist_downup(
tracts, parcels, block_groups, map_tr_to_parcel, map_bg_to_parcel,
weight = "Residents"
)
mean(abs(estimate_redist_adj$Total - block_groups$Total))
#> [1] 201.2044
Initial Weights
As a baseline, we can calculate a set of proportions for each PUMA, then apply that to the parcels in each contained tract:
# function to apply the method:
# - source: the tract-level totals
# - target: the parcels
# - individuals: the individual PUM responses
est_baseline <- function(source, target, individuals) {
individuals$weight <- individuals$PWGTP
environment(make_estimates) <- environment()
make_estimates()
}
# apply across PUMAs and tracts within them
# and aggregate to block groups
estimate_baseline <- apply_method(data, est_baseline)
Because this method does not produce estimates that sum back to their source totals, we have to rescale them to be comparable. For illustration:
# adjust tract counts so they match
# unit totals * average household size across parcels
tr_adj <- st_drop_geometry(tracts[tracts$id %in% pums$crosswalk$GEOID, ])
pid <- substring(parcels$geoid, 1, 11)
parcels$renter_est <- parcels$Renter_Occupied * structure(
tracts$avg_hhsize_rent,
names = tracts$id
)[pid]
parcels$owner_est <- parcels$Owner_Occupied * structure(
tracts$avg_hhsize_own,
names = tracts$id
)[pid]
parcels$resident_est <- parcels$renter_est + parcels$owner_est
tr_adj$Total <- tapply(parcels$resident_est, pid, sum)[tr_adj$id]
for (g in vars_list) {
tr_adj[, g] <- tr_adj[, g] / rowSums(tr_adj[, g]) * tr_adj$Total
}
tr_adj[is.na(tr_adj)] <- 0
# make baseline parcel-level estimates without rescaling
data_adj <- data
data_adj[[1]]$source <- tr_adj[tr_adj$id %in% data[[1]]$source$id, c("id", vars)]
data_adj[[2]]$source <- tr_adj[tr_adj$id %in% data[[2]]$source$id, c("id", vars)]
parcel_adj <- apply_method(
data_adj, est_baseline,
rescale = FALSE, parcel_only = TRUE
)
# aggregate back up to tracts and measure per-category error
tr_est <- redistribute(
parcel_adj, tr_adj, map_tr_to_parcel,
source_id = "OBJECTID", target_id = "id", default_value = 0
)
data.frame(MAE = round(vapply(
pvars, function(var) mean(abs(tr_est[[var]] - tr_adj[[var]])), 0
), 5))
#> MAE
#> sex_female 304.75734
#> sex_male 304.75734
#> race_black 574.76484
#> race_white 1093.99980
#> race_other 698.47523
#> race_asian 471.04857
#> race_native 45.12699
#> income_lt50 840.36931
#> income_gt200 767.20051
#> income_100_200 904.24289
#> income_50_100 704.13201
## this method does recover the parcel-level totals and overall totals
g <- c("sex_female", "sex_male")
all(abs(rowSums(parcel_adj[, g]) - structure(
parcels$resident_est,
names = parcels$OBJECTID
)[parcel_adj$OBJECTID]) < 1e-12)
#> [1] TRUE
c(
original = sum(
rbind(data_adj[[1]]$source[, g], data_adj[[2]]$source[, g])
),
parcel = sum(parcel_adj[, g]),
tract = sum(tr_est[, g])
)
#> original parcel tract
#> 691326.4 691326.4 691326.4
# rescaling makes the totals line up
parcel_adj_rescaled <- apply_method(
data_adj, est_baseline,
parcel_only = TRUE
)
tr_est_rescaled <- redistribute(
parcel_adj_rescaled, tr_adj, map_tr_to_parcel,
source_id = "OBJECTID", target_id = "id", default_value = 0
)
data.frame(MAE = round(vapply(
pvars, function(var) mean(abs(tr_est_rescaled[[var]] - tr_adj[[var]])), 0
), 5))
#> MAE
#> sex_female 0
#> sex_male 0
#> race_black 0
#> race_white 0
#> race_other 0
#> race_asian 0
#> race_native 0
#> income_lt50 0
#> income_gt200 0
#> income_100_200 0
#> income_50_100 0
# just like the redistribution methods
tr_redist <- redist_downup(
tr_adj, parcels, tr_adj, map_tr_to_parcel, map_tr_to_parcel
)
data.frame(MAE = round(vapply(
pvars, function(var) mean(abs(tr_redist[[var]] - tr_adj[[var]])), 0
), 5))
#> MAE
#> sex_female 0
#> sex_male 0
#> race_black 0
#> race_white 0
#> race_other 0
#> race_asian 0
#> race_native 0
#> income_lt50 0
#> income_gt200 0
#> income_100_200 0
#> income_50_100 0
Raking
In the baseline, we’re using PUMS weights to calculate proportions, but those weights are meant to bring totals in line with PUMA-level estimates. Since we have lower-level information in the form of tract-level totals, we can adjust the weights to match each smaller-area summary:
library(anesrake)
rake_prep <- function() {
eval(expression({
for (var in names(totals)) {
individuals[[var]] <- as.factor(individuals[[var]])
l <- levels(individuals[[var]])
totals[[var]] <- totals[[var]][l]
su <- is.na(totals[[var]]) | totals[[var]] == 0
if (sum(su)) {
individuals <- individuals[!individuals[[var]] %in% l[su], ]
individuals[[var]] <- droplevels(individuals[[var]])
totals[[var]] <- totals[[var]][!su]
}
total <- sum(totals[[var]])
if (total) totals[[var]] <- totals[[var]] / total
}
totals <- Filter(length, totals)
}), parent.frame())
}
est_rake <- function(source, target, individuals) {
totals <- lapply(
structure(names(vars_list)[1:2], names = names(vars_list)[1:2]),
function(n) unlist(source[, vars_list[[n]]])
)
rake_prep()
individuals$status <- as.factor(individuals$status)
individuals$building_type <- as.factor(individuals$building_type)
individuals$income_cat <- as.factor(individuals$income_cat)
capture.output(individuals$weight <- tryCatch(
anesrake(
totals, individuals, individuals$ID, individuals$PWGTP,
pctlim = .001
)$weightvec[as.character(individuals$ID)],
error = function(e) {
warning(e$message)
individuals$PWGTP
}
))
environment(make_estimates) <- environment()
make_estimates()
}
estimate_rake <- apply_method(data, est_rake)
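For intuition about what `anesrake` is doing here: raking (iterative proportional fitting) repeatedly scales the weights so that each weighted margin matches its target in turn, cycling through the margins until they converge. A toy version of the core loop (a sketch only; `anesrake` adds caps, convergence checks, and variable selection):
# repeatedly scale weights so each margin matches its target proportions
rake_toy <- function(data, targets, weights, iterations = 50) {
  for (i in seq_len(iterations)) {
    for (var in names(targets)) {
      current <- tapply(weights, data[[var]], sum) / sum(weights)
      weights <- weights * (targets[[var]] / current)[data[[var]]]
    }
  }
  weights
}
toy <- data.frame(
  sex = c("m", "m", "f", "f"),
  tenure = c("own", "rent", "own", "rent")
)
w <- rake_toy(
  toy,
  list(sex = c(f = .6, m = .4), tenure = c(own = .7, rent = .3)),
  rep(1, 4)
)
round(tapply(w, toy$sex, sum) / sum(w), 2)
#>   f   m 
#> 0.6 0.4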
Two Step Raking
In the initial raking approach, we only considered the person-level variables, but we also have household-level information. To account for this as well, we can first adjust the household weights, then use those adjusted weights to initialize the person-level adjustment (each person’s starting weight is their original weight, scaled by their household’s adjustment factor):
est_twostep <- function(source, target, individuals) {
all <- individuals
individuals <- individuals[!duplicated(individuals$SERIALNO), ]
# step 1: initial weights at household level
totals <- list(
income_cat = unlist(source[, grep("^income_", colnames(source))]),
status = structure(
colSums(target[, vars_units[-1]]),
names = c("RENTER", "OWNER")
),
building_type = tapply(rowSums(target[, vars_units[-1]]), target$Unit_Type, sum)
)
rake_prep()
capture.output(initial <- tryCatch(
anesrake(
totals, individuals, individuals$SERIALNO, individuals$WGTP,
pctlim = .001
)$weightvec,
error = function(e) {
warning(e$message)
structure(individuals$WGTP, names = individuals$SERIALNO)
}
))
all$weight <- initial[as.character(all$SERIALNO)]
all <- all[!is.na(all$weight), ]
individuals <- all
# step 2: adjust at person level
totals <- lapply(
structure(names(vars_list)[1:2], names = names(vars_list)[1:2]),
function(n) unlist(source[, vars_list[[n]]])
)
rake_prep()
initial <- individuals$weight
capture.output(individuals$weight <- tryCatch(
anesrake(
totals, individuals, individuals$ID,
individuals$PWGTP * initial / individuals$WGTP,
pctlim = .001
)$weightvec[as.character(individuals$ID)],
error = function(e) {
warning(e$message)
individuals$PWGTP * initial / individuals$WGTP
}
))
environment(make_estimates) <- environment()
make_estimates()
}
estimate_twostep <- apply_method(data, est_twostep)
N Step Raking
We can take the two-step method further by repeating the process to try to better fit both the household- and individual-level data, stopping once the household margins are fit well enough or stop improving:
est_nstep <- function(source, target, individuals) {
individuals$weight <- individuals$PWGTP
all <- individuals
iter <- function() {
individuals <- individuals[!duplicated(individuals$SERIALNO), ]
# step 1: initial weights at household level
totals <- list(
income_cat = unlist(source[, grep("^income_", colnames(source))]),
status = structure(
colSums(target[, vars_units[-1]]),
names = c("RENTER", "OWNER")
),
building_type = tapply(rowSums(target[, vars_units[-1]]), target$Unit_Type, sum)
)
rake_prep()
household_totals <- totals
capture.output(initial <- tryCatch(
anesrake(
totals, individuals, individuals$SERIALNO,
individuals$weight / individuals$PWGTP * individuals$WGTP,
pctlim = .001
)$weightvec,
error = function(e) {
warning(e$message)
structure(
individuals$weight / individuals$PWGTP * individuals$WGTP,
names = individuals$SERIALNO
)
}
))
all$weight <- initial[as.character(all$SERIALNO)]
all <- all[!is.na(all$weight), ]
individuals <- all
# step 2: adjust at person level
totals <- lapply(
structure(names(vars_list)[1:2], names = names(vars_list)[1:2]),
function(n) unlist(source[, vars_list[[n]]])
)
rake_prep()
initial <- individuals$weight
capture.output(individuals$weight <- tryCatch(
anesrake(
totals, individuals, individuals$ID,
individuals$PWGTP * initial / individuals$WGTP,
pctlim = .001
)$weightvec[as.character(individuals$ID)],
error = function(e) {
warning(e$message)
individuals$PWGTP * initial / individuals$WGTP
}
))
hh <- individuals[!duplicated(individuals$SERIALNO), ]
hh$weight[hh$weight == 0] <- 1
weight <- hh$PWGTP / hh$weight * hh$WGTP
list(
error = mean(vapply(names(household_totals), function(v) {
mean((household_totals[[v]] - tapply(weight, hh[[v]], sum) / sum(weight))^2)
}, 0), na.rm = TRUE),
data = individuals
)
}
previous_error <- 0
for (i in 1:20) {
step <- iter()
individuals <- step$data
if (step$error < .001 || abs(previous_error - step$error) < .0005) break
previous_error <- step$error
}
environment(make_estimates) <- environment()
make_estimates()
}
estimate_nstep <- apply_method(data, est_nstep)
Generalized Raking
The two-step approach still mainly focuses on aligning the person-level weights: the household totals match only after the first step, and are not considered in the second step. As an alternative, we might try to fit the household- and person-level totals at the same time with a generalized raking approach:
library(mlfit)
est_grake <- function(source, target, individuals) {
totals <- list(
person = sum(individuals$PWGTP),
household = sum(individuals$WGTP[!duplicated(individuals$SERIALNO)])
)
person <- lapply(names(vars_list)[1:2], function(n) {
l <- vars_list[[n]]
count <- as.numeric(source[, l, drop = TRUE])
count[!count] <- 1
count <- count / sum(count) * totals$person
s <- data.frame(level = l, count = count)
colnames(s)[1] <- n
s
})
household <- unique(individuals$building_type)
household <- structure(numeric(length(household)), names = household)
observed_types <- tapply(
rowSums(target[, vars_units[-1]]), target$Unit_Type, sum
)
household[names(observed_types)] <- observed_types
household <- list(data.frame(
building_type = names(household), count = household
))
household[[1]]$count[is.na(household[[1]]$count)] <- 1
household[[2]] <- data.frame(
status = c("RENTER", "OWNER"),
count = colSums(target[, c("Renter_Occupied", "Owner_Occupied")])
)
household[[3]] <- unlist(source[, grep("^income_", colnames(source))])
household[[3]] <- data.frame(
income_cat = names(household[[3]]), count = household[[3]]
)
names(household) <- c("building_type", "status", "income_cat")
household <- lapply(names(household), function(var) {
l <- household[[var]]
l <- l[l[[1]] %in% unique(individuals[, var]), ]
l[[2]][!l[[2]]] <- 1
l[[2]] <- l[[2]] / sum(l[[2]]) * totals$household
l
})
m <- ml_problem(
individuals,
field_names = special_field_names("SERIALNO", "ID", count = "count"),
group_controls = household,
individual_controls = person,
prior_weights = individuals$WGTP
)
fm <- tryCatch(
suppressWarnings(ml_fit_dss(m, ginv = solve)),
error = function(e) NULL
)
if (!isTRUE(fm$success)) {
fm <- tryCatch(
suppressWarnings(ml_fit_ipu(m)),
error = function(e) NULL
)
}
individuals$weight <- if (is.null(fm$weight)) {
individuals$PWGTP
} else {
fm$weight
}
environment(make_estimates) <- environment()
make_estimates()
}
estimate_grake <- apply_method(data, est_grake)
Comparisons
Tract Aggregates
Now we can compare all of the considered methods within each of the variables we used to inform the raking approaches:
error <- do.call(rbind, lapply(structure(pvars, names = pvars), function(var) {
d <- cbind(
Prop = estimate_redist[[var]],
"Prop Adj" = estimate_redist_adj[[var]],
Baseline = estimate_baseline$block_groups[[var]],
Rake = estimate_rake$block_groups[[var]],
"Rake 2 Step" = estimate_twostep$block_groups[[var]],
"Rake N Step" = estimate_nstep$block_groups[[var]],
"Rake General" = estimate_grake$block_groups[[var]]
)
colMeans(abs(d - block_groups[[var]]))
}))
error <- rbind(error, Average = colMeans(error))
kable(error, digits = 3)
| | Prop | Prop Adj | Baseline | Rake | Rake 2 Step | Rake N Step | Rake General |
|---|---|---|---|---|---|---|---|
| sex_female | 121.352 | 105.943 | 219.286 | 219.348 | 219.254 | 218.665 | 218.654 |
| sex_male | 128.929 | 117.032 | 233.955 | 234.025 | 233.991 | 234.246 | 234.482 |
| race_black | 61.684 | 61.584 | 72.059 | 71.940 | 72.473 | 72.777 | 71.546 |
| race_white | 166.633 | 145.178 | 291.699 | 292.624 | 289.364 | 288.859 | 292.544 |
| race_other | 76.493 | 75.017 | 91.940 | 91.577 | 91.380 | 92.491 | 90.631 |
| race_asian | 55.791 | 56.351 | 75.142 | 75.073 | 74.722 | 74.077 | 75.122 |
| race_native | 5.577 | 5.663 | 5.135 | 5.218 | 5.085 | 4.998 | 5.138 |
| income_lt50 | 38.820 | 40.608 | 51.486 | 51.392 | 51.004 | 51.024 | 51.648 |
| income_gt200 | 50.340 | 45.409 | 59.663 | 59.603 | 59.930 | 59.950 | 60.083 |
| income_100_200 | 47.578 | 46.870 | 92.734 | 92.493 | 91.819 | 91.752 | 92.352 |
| income_50_100 | 39.530 | 41.883 | 63.116 | 63.111 | 62.817 | 62.952 | 62.810 |
| Average | 72.066 | 67.412 | 114.201 | 114.219 | 113.804 | 113.799 | 114.092 |
Full results are presented on a data site: redistribution_arlington
Alternate Geolevels
So far, we’ve used the same tract-based test case to assess our methods, but this particular geolevel may have an effect in itself. As an alternative, we might abandon canonical tracts in favor of virtual tracts made of randomly combined adjacent block groups.
# randomly combine adjacent block groups into pairs (where possible)
# apply within each PUMA
apply_altgeo <- function(run = NULL, return_full = FALSE) {
vdata <- lapply(
structure(pumas_focal, names = pumas_focal),
function(puma) {
d <- data[[puma]]
su <- substring(block_groups$GEOID, 1, 11) %in% d$source$id
vtr <- make_parent_polygons(
block_groups[su, ], "GEOID",
n_as_min = TRUE, verbose = FALSE
)
names(vtr$new) <- names(vtr$map) <- paste0(
puma, "_", seq_along(vtr$new)
)
d$geography <- vtr$new
d$map <- vtr$map
d$source <- redistribute(
block_groups[su, c("GEOID", vars)], vtr$new, vtr$map,
default_value = 0
)
d
}
)
vsource <- do.call(rbind, lapply(vdata, "[[", "source"))
vmap <- redistribute(
vsource, parcels,
source_id = "id", target_id = "OBJECTID", return_map = TRUE
)
vestimates <- list(
Prop = list(block_groups = redist_downup(
vsource, parcels, block_groups, vmap, map_bg_to_parcel
)),
"Prop Adj" = list(block_groups = redist_downup(
vsource, parcels, block_groups, vmap, map_bg_to_parcel,
weight = "Residents"
)),
Baseline = apply_method(vdata, est_baseline),
Rake = apply_method(vdata, est_rake),
"Rake 2 Step" = apply_method(vdata, est_twostep),
"Rake N Step" = apply_method(vdata, est_nstep),
"Rake General" = apply_method(vdata, est_grake)
)
verror <- do.call(rbind, lapply(
structure(pvars, names = pvars),
function(var) {
colMeans(abs(as.data.frame(
lapply(vestimates, function(d) d$block_groups[[var]]),
check.names = FALSE
) - block_groups[[var]]))
}
))
verror <- rbind(verror, Average = colMeans(verror))
if (return_full) {
list(
data = data, source = vsource, map = vmap,
estimates = vestimates, error = verror
)
} else {
verror
}
}
estimates_altgeo <- apply_altgeo(return_full = TRUE)
kable(estimates_altgeo$error, digits = 3)
| | Prop | Prop Adj | Baseline | Rake | Rake 2 Step | Rake N Step | Rake General |
|---|---|---|---|---|---|---|---|
| sex_female | 215.937 | 200.870 | 216.167 | 216.578 | 216.313 | 215.513 | 216.289 |
| sex_male | 236.555 | 219.472 | 241.522 | 241.517 | 241.489 | 241.820 | 241.303 |
| race_black | 72.468 | 71.878 | 69.874 | 69.645 | 69.612 | 70.061 | 69.561 |
| race_white | 312.564 | 288.576 | 300.305 | 303.951 | 301.975 | 301.493 | 305.405 |
| race_other | 86.287 | 85.025 | 88.380 | 87.935 | 88.173 | 88.312 | 88.221 |
| race_asian | 65.491 | 65.548 | 67.677 | 67.883 | 67.587 | 67.091 | 67.628 |
| race_native | 4.689 | 4.794 | 4.534 | 4.734 | 4.475 | 4.441 | 4.386 |
| income_lt50 | 48.612 | 51.038 | 46.041 | 45.849 | 45.804 | 45.741 | 46.130 |
| income_gt200 | 63.435 | 59.583 | 55.054 | 55.228 | 55.047 | 54.905 | 54.798 |
| income_100_200 | 86.937 | 87.857 | 89.694 | 89.708 | 89.437 | 89.174 | 89.610 |
| income_50_100 | 62.409 | 64.883 | 60.513 | 60.631 | 60.241 | 60.246 | 60.339 |
| Average | 114.126 | 109.048 | 112.705 | 113.060 | 112.741 | 112.618 | 113.061 |
We can also take a look at the virtual tracts that were created in this run:
Since these virtual tracts are randomly created, we might also look at results across a few runs for a sense of variation:
# since this can be quite long-running,
# we'll save and reload results
results_altgeo_file <- paste0(base_dir, "/results_altgeo.rds")
if (file.exists(results_altgeo_file)) {
results_altgeo <- readRDS(results_altgeo_file)
} else {
results_altgeo <- lapply(1:50, apply_altgeo)
saveRDS(results_altgeo, results_altgeo_file)
}
kable(Reduce("+", results_altgeo) / length(results_altgeo), digits = 3)
| | Prop | Prop Adj | Baseline | Rake | Rake 2 Step | Rake N Step | Rake General |
|---|---|---|---|---|---|---|---|
| sex_female | 113.199 | 100.320 | 110.849 | 111.066 | 111.120 | 110.519 | 110.812 |
| sex_male | 131.315 | 117.479 | 129.066 | 128.938 | 128.895 | 129.342 | 128.710 |
| race_black | 61.055 | 61.697 | 57.506 | 57.528 | 57.178 | 57.315 | 57.711 |
| race_white | 172.615 | 148.385 | 155.580 | 157.644 | 154.924 | 154.485 | 158.939 |
| race_other | 74.940 | 74.262 | 75.171 | 74.803 | 74.689 | 74.914 | 75.175 |
| race_asian | 49.191 | 50.147 | 49.037 | 49.377 | 49.214 | 48.922 | 49.299 |
| race_native | 4.810 | 4.896 | 4.608 | 4.816 | 4.553 | 4.534 | 4.658 |
| income_lt50 | 36.104 | 38.230 | 32.539 | 32.607 | 32.378 | 32.395 | 33.134 |
| income_gt200 | 48.107 | 43.420 | 37.323 | 37.354 | 37.422 | 37.540 | 37.199 |
| income_100_200 | 47.601 | 48.775 | 46.230 | 46.253 | 45.911 | 45.816 | 45.995 |
| income_50_100 | 36.142 | 38.199 | 34.363 | 34.607 | 34.254 | 34.301 | 34.397 |
| Average | 70.462 | 65.983 | 66.570 | 66.818 | 66.412 | 66.371 | 66.912 |
Simulated Populations
So far, we’ve used the same data at different initial geolevels as a way to introduce error while still having a known target to measure error from. This has the benefit of retaining properties of the real data that may affect our methods in unknown ways, but the downside is that we cannot measure error very directly, and we are testing against estimates that carry their own error.
An alternative is to simulate the initial population, then draw samples from it to form simulated versions of the data we have available (a sparse PUMS-like sample, and less sparse block group summaries). Then, instead of having to go down to parcels and back up through artificially unknown intermediate layers (block groups) to measure error, we can measure it directly at the parcel level.
# simulating the whole population is pretty intensive,
# so we'll just do that once, then draw samples from it
population_file <- paste0(base_dir, "/simulated_population.rds")
if (file.exists(population_file)) {
population <- readRDS(population_file)
} else {
population <- generate_population(
sum(parcels$Total), parcels, "Total", "OBJECTID"
)
population$households$SERIALNO <- population$households$household
population$individuals$ID <- population$individuals$person
# make categories
## household
population$households$building_type <- c(
"MULTI", "SFD", "SFA"
)[population$households$building_type]
population$households$status <- c("OWNER", "RENTER")[
population$households$renting + 1
]
population$households$Total <- structure(
population$regions$capacity,
names = population$regions$id
)[population$households$region]
population$households$head_income <- population$households$head_income /
max(population$households$head_income) * max(pums$household$HINCP)
population$households$income_cat <- as.character(cut(
population$households$head_income,
c(-Inf, 50, 100, 200, Inf) * 1e3,
c("income_lt50", "income_50_100", "income_100_200", "income_gt200"),
right = TRUE
))
## individual
population$individuals$sex_cat <- c("sex_male", "sex_female")[
population$individuals$sex + 1
]
population$individuals$race_cat <- rep_len(
grep("^race", pvars, value = TRUE), max(population$individuals$race) + 1
)[population$individuals$race + 1]
population$individuals$income <- population$individuals$income /
max(population$individuals$income) * max(pums$household$HINCP)
population$individuals$income_cat <- as.character(cut(
population$individuals$income,
c(-Inf, 50, 100, 200, Inf) * 1e3,
c("income_lt50", "income_50_100", "income_100_200", "income_gt200"),
right = TRUE
))
combined <- cbind(
population$households[population$individuals$household, ],
population$individuals
)
population$summary <- do.call(rbind, lapply(
split(combined, as.character(combined$region)),
function(d) {
household <- d[!duplicated(d$household), ]
owner <- household$status == "OWNER"
renter <- household$status == "RENTER"
cbind(
id = household$region[[1]],
Total = household$Total[[1]],
Owner_Occupied = sum(owner),
Renter_Occupied = sum(renter),
avg_hhsize = mean(household$size),
avg_hhsize_own = if (sum(owner)) mean(household$size[owner]) else 0,
avg_hhsize_rent = if (sum(renter)) mean(household$size[renter]) else 0,
as.data.frame(as.list(unlist(lapply(
names(vars_list),
function(var) as.list(table(factor(d[[var]], vars_list[[var]])))
))))
)
}
))
saveRDS(population, population_file, compress = "xz")
}
First, we can look at the block group summaries of the full simulated population:
The redistribution methods apply to census-like samples drawn from a full population, so we’ll set up a function that both draws a sample and applies each method to it:
## for the initial sample, we'll aim for an even
## sampling from across block groups
sample_n <- ceiling(nrow(population$households) * .1 / nrow(block_groups))
map_parcel_to_bg <- structure(
rep(names(map_bg_to_parcel), vapply(map_bg_to_parcel, length, 0)),
names = names(unlist(unname(map_bg_to_parcel)))
)
draw_sample <- function(run = NULL, return_full = FALSE) {
# draw households, and retrieve individuals within them
sample_hh <- lapply(map_bg_to_parcel, function(bg) {
names(if (length(bg) < sample_n) bg else sample(bg, sample_n))
})
full <- population$households[
population$households$household %in% unlist(sample_hh, use.names = FALSE),
]
full$WGTP <- 1
full_individuals <- population$individuals[
population$individuals$household %in% full$household,
]
full_individuals$PWGTP <- 1
individuals <- cbind(
full[as.character(full_individuals$household), c("region", vars_house)],
full_individuals[, vars_person]
)
individuals$SERIALNO <- as.character(individuals$SERIALNO)
individuals$ID <- as.character(individuals$ID)
individuals$geoid <- structure(
rep(names(sample_hh), vapply(sample_hh, length, 0)),
names = unlist(sample_hh, use.names = FALSE)
)[individuals$SERIALNO]
bg <- do.call(rbind, lapply(
split(individuals, individuals$geoid),
function(d) {
hh <- d[!duplicated(d$SERIALNO), ]
hh$size <- table(d$SERIALNO)[hh$SERIALNO]
size <- tapply(hh$size, factor(hh$status, c("OWNER", "RENTER")), mean)
size[is.na(size)] <- 0
cbind(
id = d$geoid[[1]],
as.data.frame(unlist(lapply(
names(vars_list), function(var) {
as.list(vapply(
unique(vars_list[[var]]),
function(l) sum((d[[var]] == l) * d$PWGTP), 0
))
}
), recursive = FALSE)),
avg_hhsize = mean(hh$size),
avg_hhsize_own = size[["OWNER"]],
avg_hhsize_rent = size[["RENTER"]]
)
}
))
ps <- cbind(
population$regions,
population$summary[population$regions$id, c("Total", vars_units[-1])]
)
ps$OBJECTID <- population$regions$id
ps$geoid <- map_parcel_to_bg[population$regions$id]
ps$Unit_Type <- c("MULTI", "SFD", "SFA")[ps$building_type]
ps$PUMA <- structure(
pums$crosswalk$PUMA5CE,
names = pums$crosswalk$GEOID
)[substring(ps$geoid, 1, 11)]
ps <- ps[!is.na(ps$PUMA), ]
ps$single_family_home <- ps$Unit_Type != "MULTI"
ps$type <- do.call(
paste0, ps[, c("PUMA", "single_family_home"), drop = TRUE]
)
vdata <- lapply(
structure(pumas_focal, names = pumas_focal),
function(puma) {
ps <- ps[ps$PUMA == puma, ]
su <- which(individuals$region %in% ps$id)
list(
map = map_bg_to_parcel,
individuals = individuals[sample(su, floor(length(su) * .3)), ],
parcels = ps,
source = bg[bg$id %in% ps$geoid, ]
)
}
)
ps <- do.call(rbind, lapply(vdata, "[[", "parcels"))
individuals <- do.call(rbind, lapply(vdata, "[[", "individuals"))
hh <- individuals[!duplicated(individuals$SERIALNO), ]
hh$single_family_home <- hh$building_type != "MULTI"
hh$size <- table(individuals$SERIALNO)[as.character(hh$SERIALNO)]
ps$Residents <- ps$Total * tapply(
hh$size, hh$single_family_home, mean
)[as.character(ps$single_family_home)]
vestimates <- list(
Prop = redistribute(
bg, ps, map_bg_to_parcel,
weight = "Total", source_id = "id", default_value = 0
),
"Prop Adj" = redistribute(
bg, ps, map_bg_to_parcel,
weight = "Residents", source_id = "id", default_value = 0
),
Baseline = apply_method(vdata, est_baseline, parcel_only = TRUE),
Rake = apply_method(vdata, est_rake, parcel_only = TRUE),
"Rake 2 Step" = apply_method(vdata, est_twostep, parcel_only = TRUE),
"Rake N Step" = apply_method(vdata, est_nstep, parcel_only = TRUE),
"Rake General" = apply_method(vdata, est_grake, parcel_only = TRUE)
)
actual <- population$summary[ps$id, ]
verror <- do.call(rbind, lapply(
structure(pvars, names = pvars),
function(var) {
colMeans(abs(as.data.frame(
lapply(vestimates, function(d) d[[var]]),
check.names = FALSE
) - actual[[var]]))
}
))
verror <- rbind(verror, Average = colMeans(verror))
if (return_full) {
list(
data = data, source = bg, parcels = ps, map = map_bg_to_parcel,
estimates = vestimates, error = verror
)
} else {
verror
}
}
Now we can look at results from a single sample. These are not comparable to the other results because (a) they are simulated independently from the real samples (so, for instance, the race category baselines are randomized), and (b) error is measured at the parcel level, rather than the block group level.
estimates_sim <- draw_sample(return_full = TRUE)
kable(estimates_sim$error, digits = 5)
| | Prop | Prop Adj | Baseline | Rake | Rake 2 Step | Rake N Step | Rake General |
|---|---|---|---|---|---|---|---|
| sex_female | 4.82773 | 4.82818 | 5.05596 | 5.05591 | 5.05543 | 5.05565 | 5.05508 |
| sex_male | 4.84018 | 4.84059 | 5.07137 | 5.07147 | 5.07198 | 5.07178 | 5.07291 |
| race_black | 3.51751 | 3.53062 | 3.69505 | 3.69516 | 3.69521 | 3.69553 | 3.69500 |
| race_white | 0.30985 | 0.31093 | 0.32178 | 0.32178 | 0.32177 | 0.32177 | 0.32180 |
| race_other | 1.75416 | 1.76040 | 1.82061 | 1.82068 | 1.82079 | 1.82087 | 1.82020 |
| race_asian | 1.43407 | 1.43888 | 1.48769 | 1.48782 | 1.48750 | 1.48695 | 1.48766 |
| race_native | 3.43651 | 3.44780 | 3.61038 | 3.61049 | 3.61078 | 3.61106 | 3.61119 |
| income_lt50 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 |
| income_gt200 | 9.32742 | 9.32830 | 9.76866 | 9.76860 | 9.76812 | 9.76814 | 9.77088 |
| income_100_200 | 0.39683 | 0.39691 | 0.39670 | 0.39670 | 0.39670 | 0.39670 | 0.39670 |
| income_50_100 | 0.00035 | 0.00035 | 0.00035 | 0.00035 | 0.00035 | 0.00035 | 0.00035 |
| Average | 2.71315 | 2.71663 | 2.83896 | 2.83900 | 2.83897 | 2.83898 | 2.83925 |