1 |
#' Generate a Population |
|
2 |
#' |
|
3 |
#' Simulate a population of individuals within households, with complex relationships between |
|
4 |
#' demographic and location features. |
|
5 |
#' |
|
6 |
#' The population is generated in two steps: |
|
7 |
#' |
|
8 |
#' \strong{First}, households are generated and placed within \code{regions}. Placement within regions |
|
9 |
#' is determined by total distance from one or more regions selected as attraction loci. If |
|
10 |
#' coordinates are not provided, these are first randomly generated. |
|
11 |
#' |
|
12 |
#' After households are placed, household income (of the first member) is generated |
|
13 |
#' based on cost loci, which are then used to generate building types (where types are increasingly |
|
14 |
#' associated with income) and then household size (based on size loci, income, and building type). |
|
15 |
#' Renting status is then generated based on income and building type: |
|
16 |
#' 60\% chance if income is under the mean income, and 40\% otherwise, multiplied by .8 |
|
17 |
#' if the building type is of a selected renting type, or .5 otherwise. |
|
18 |
#' |
|
19 |
#' \strong{Second}, individuals are generated for each household. To generate an individual, |
|
20 |
#' first, neighbors are searched for, based on \code{n_neighbors} and \code{neighbor_range}. |
|
21 |
#' Any neighbors are summarized: average age and income, and tabulated race. |
|
22 |
#' |
|
23 |
#' These then affect the first member of the household: age is first drawn from a Beta |
|
24 |
#' distribution (with shapes of \code{1} and \code{2} if renting or \code{1.5} otherwise, |
|
25 |
#' multiplied by \code{80}) and added to \code{18}, then adjusted toward a random value centered |
|
26 |
#' on the average neighbor age (floored Gaussian with a standard deviation of \code{1}), and race |
|
27 |
#' is sampled (that with the highest result of a Binomial draw with \code{n_races} trials |
|
28 |
#' proportion of \code{neighbors * base rate} chance of success for each race group). |
|
29 |
#' |
|
30 |
#' Neighbors also affect the income of the second member of the household if the first |
|
31 |
#' member's income is under the neighbor mean income (or under \code{40,000} given no neighbors); |
|
32 |
#' in this case, the second member's income is drawn from a Gaussian distribution centered on |
|
33 |
#' the first member's income, with a standard deviation of \code{10,000}. |
|
34 |
#' |
|
35 |
#' The second member's age is based on that of the first member; a floored Gaussian centered on |
|
36 |
#' the first member's age, with a standard deviation of \code{15} if the first member's age |
|
37 |
#' is over \code{40}, or \code{5} otherwise, trimmed to be between \code{18} and \code{90}. |
|
38 |
#' |
|
39 |
#' The second member's race has a 70\% chance to be that of the first member, |
|
40 |
#' and a 30\% chance to be selected like the first member's. |
|
41 |
#' |
|
42 |
#' Members after the second have no income, have age randomly selected from a |
|
43 |
#' uniform distribution between \code{0} and the first member's age minus \code{15} |
|
44 |
#' (which is then rounded up), and have race determined by either the first or second |
|
45 |
#' member (50\% chance). |
|
46 |
#' |
|
47 |
#' Sex has a 50\% chance to be \code{0} or \code{1} for all but the second member; their sex |
|
48 |
#' has a 10\% chance to be the same as the first member's, and 90\% chance to be opposite. |
|
49 |
#' |
|
50 |
#' @param N Number of initial individuals to generate. Final number of individuals will be larger. |
|
51 |
#' @param regions A vector of region IDs, a matrix of coordinates, or an \code{sf} object with |
|
52 |
#' geometries from which coordinates can be derived. If not specified |
|
53 |
#' (and \code{capacities} is not specified), regions similar to housing units (with a mix of |
|
54 |
#' single and multi-family locations) will be generated. |
|
55 |
#' @param capacities A vector with the maximum number of households for each entry in \code{regions}. |
|
56 |
#' @param region_ids A vector of unique IDs for each \code{regions}, or a column name in |
|
57 |
#' \code{regions} containing IDs. |
|
58 |
#' @param attraction_loci Number of locations selected to be centers of attractiveness, |
|
59 |
#' which influence where households are located. |
|
60 |
#' @param random_regions A number between \code{0} and \code{1}, which determines the proportion of |
|
61 |
#' people who are randomly relocated, in the case that there is more capacity than households. |
|
62 |
#' @param cost_loci Number of locations selected to be centers of cost, which influences |
|
63 |
#' the initial income associated with households. |
|
64 |
#' @param size_loci Number of locations selected to be centers of size, which influence |
|
65 |
#' household sizes. |
|
66 |
#' @param similarity_metric Name of a metric to use to calculate nearness between |
|
67 |
#' neighbors; see \code{\link[lingmatch]{lma_simets}}. |
|
68 |
#' @param n_neighbors Number of neighbors used to influence each new household's |
|
69 |
#' initial age and race. |
|
70 |
#' @param neighbor_range Minimum similarity between people to be considered |
|
71 |
#' neighbors, between \code{0} and \code{1} (where \code{0} means unrestricted, |
|
72 |
#' and \code{1} means same region only). |
|
73 |
#' @param n_races Number of different race groups to sample from. |
|
74 |
#' @param n_building_types Number of different building types to sample from. |
|
75 |
#' @param verbose Logical; if \code{TRUE}, will show status messages. |
|
76 |
#' @examples |
|
77 |
#' generate_population(2) |
|
78 |
#' @returns A list with entries for \code{params} (with initial settings), and three |

79 |
#' \code{data.frames} (\code{regions}, \code{households}, and \code{individuals}): |
|
80 |
#' \describe{ |
|
81 |
#' \item{\code{households}}{ |
|
82 |
#' \tabular{ll}{ |
|
83 |
#' \code{household} \tab Household ID. \cr |
|
84 |
#' \code{region} \tab Region ID. \cr |
|
85 |
#' \code{head_income} \tab Income of the first household member. \cr |
|
86 |
#' \code{size} \tab Number of individuals in the household. \cr |
|
87 |
#' \code{building_type} \tab Categorical indicator of building type. \cr |
|
88 |
#' \code{renting} \tab Binary indicator of renting status. \cr |
|
89 |
#' } |
|
90 |
#' } |
|
91 |
#' \item{\code{individuals}}{ |
|
92 |
#' \tabular{ll}{ |
|
93 |
#' \code{household} \tab Household ID. \cr |
|
94 |
#' \code{person} \tab Person ID. \cr |
|
95 |
#' \code{neighbors} \tab Number of neighbors who bore on variables. \cr |
|
96 |
#' \code{age} \tab Age in years. \cr |
|
97 |
#' \code{sex} \tab Categorical indicator of sex. \cr |
|
98 |
#' \code{race} \tab Categorical indicator of race. \cr |
|
99 |
#' \code{income} \tab Income of individual. \cr |
|
100 |
#' } |
|
101 |
#' } |
|
102 |
#' } |
|
103 |
#' @export |
|
104 | ||
105 |
generate_population <- function(N = 1000, regions = NULL, capacities = NULL, region_ids = NULL,
                                attraction_loci = 3, random_regions = .1, cost_loci = 2, size_loci = 5,
                                similarity_metric = "euclidean", n_neighbors = 50,
                                neighbor_range = .5, n_races = 6, n_building_types = 3, verbose = FALSE) {
  # Track whether regions/capacities were user-supplied, so defaults can be generated.
  gen_regions <- missing(regions)
  gen_capacities <- missing(capacities)
  if (gen_regions) {
    if (gen_capacities) {
      # No regions or capacities: one region per initial household, mostly single-unit,
      # with ~10% of regions upgraded to multi-unit (5-300 household) capacity.
      if (verbose) cli_alert_info("regions: sequence along {.arg N}, with housing unit capacities")
      regions <- seq_len(N)
      capacities <- rep(1, N)
      n_multi <- round(N * .1)
      capacities[sample.int(N, n_multi)] <- sample(5:300, n_multi, TRUE)
    } else {
      if (verbose) {
        cli_alert_info(
          "regions: sequence along specified {.arg capacities} with specified capacities"
        )
      }
      regions <- seq_along(capacities)
    }
  }
  if (!missing(region_ids)) {
    if (length(region_ids) == 1 && region_ids %in% colnames(regions)) {
      # region_ids names a column of regions: extract it and drop it from the coordinates.
      su <- colnames(regions) != region_ids
      region_ids <- regions[[region_ids]]
      regions <- regions[, su, drop = FALSE]
    } else {
      nr <- if (length(dim(regions)) == 2) nrow(regions) else length(regions)
      # BUGFIX: compare the number of IDs to the number of regions (the original compared
      # the ID vector itself to `nr`, which errors for length > 1 ID vectors).
      if (length(region_ids) != nr) cli_abort("{.arg region_ids} are not the same length as {.arg regions}")
    }
  }
  if (length(dim(regions)) == 2) {
    # regions provided as a matrix-like object of coordinates.
    if (length(capacities) == 1 && is.character(capacities) && capacities %in% colnames(regions)) {
      # capacities names a column of regions: extract it and drop it from the coordinates.
      if (verbose) {
        cli_alert_info(
          "regions: specified coordinates, with capacities from {.field {capacities}} column"
        )
      }
      su <- colnames(regions) != capacities
      capacities <- regions[[capacities]]
      regions <- regions[, su, drop = FALSE]
    } else if (verbose) {
      cli_alert_info(paste(
        "regions: specified coordinates, with",
        if (gen_capacities) "{.field N / nrow(regions)}" else "specified", "capacities"
      ))
    }
    # Region IDs: explicit, else row names, else a generated "r1", "r2", ... sequence.
    rids <- if (length(region_ids)) {
      region_ids
    } else if (is.null(rownames(regions))) {
      paste0("r", seq_len(nrow(regions)))
    } else {
      rownames(regions)
    }
  } else {
    # regions provided as a plain vector (IDs or a single count); coordinates must be generated.
    rids <- if (length(regions) == 1 && is.numeric(regions)) {
      if (verbose) {
        cli_alert_info(paste(
          "regions: sequence of {.field N}, with",
          if (gen_capacities) "{.field N / nrow(regions)}" else "specified", "capacities"
        ))
      }
      if (length(region_ids)) region_ids else paste0("r", seq_len(regions))
    } else {
      if (verbose && !gen_regions) {
        cli_alert_info(paste(
          "regions:", if (gen_capacities) {
            "specified region, with {.field N / length(regions)}"
          } else {
            "specified regions and"
          }, "capacities"
        ))
      }
      if (length(region_ids)) region_ids else regions
    }
    # Random X/Y coordinates for each region (Gaussian around 1e5, rounded up).
    regions <- matrix(
      ceiling(rnorm(length(rids) * 2, 1e5, 1e4)),
      ncol = 2, dimnames = list(rids, c("X", "Y"))
    )
  }
  nr <- length(rids)
  # Default capacity is an even split of N across regions; recycle to one value per region.
  capacities <- rep_len(if (!is.numeric(capacities)) N / nr else capacities, nr)

  if (verbose) cli_alert_info("preparing {.field {N}} households")
  household <- seq_len(N)

  # select attraction loci
  if (!is.numeric(attraction_loci) || attraction_loci < 1) {
    attraction_loci <- 1
    if (verbose) cli_alert_warning("setting {.arg attraction_loci} to {.field 1}")
  } else if (attraction_loci > nr) {
    attraction_loci <- max(1, nr - 1)
    if (verbose) {
      cli_alert_warning(
        "too many attraction loci; setting {.arg attraction_loci} to {.field {attraction_loci}}"
      )
    }
  } else if (verbose) {
    cli_alert_info("attraction loci: {.field {attraction_loci}}")
  }
  selected_attraction_loci <- sample.int(nr, attraction_loci)
  if (verbose) cli_alert_info("calculating region similarities")
  if (nr == 1) {
    # Degenerate single-region case: a 1x1 similarity of 1.
    space <- Matrix(1, dimnames = list(rids, rids), sparse = TRUE)
  } else {
    if (inherits(regions, c("sfc", "sf"))) regions <- st_coordinates(st_centroid(regions))
    space <- lma_simets(regions, metric = similarity_metric, symmetrical = TRUE)
  }
  if (verbose) cli_alert_info("rescaling region similarities")
  # Rescale similarities to [0, 1] over the stored (non-zero) entries.
  min_space <- min(space@x)
  space@x <- (space@x - min_space) / (max(space@x) - min_space)
  space_cols <- integer(ncol(space))

  # select initial locations
  # Rank regions by (negated) total similarity to the attraction loci, so the most
  # attractive regions fill first; space_cols is reused as a selector vector throughout.
  space_cols[selected_attraction_loci] <- -1L
  ro <- order(space %*% space_cols)
  space_cols[selected_attraction_loci] <- 0L
  capacities <- ceiling(capacities)
  if (sum(capacities) < N) {
    if (verbose) cli_alert_info("{.arg N} is larger than total capacities; expanding to accommodate")
    capacities <- ceiling(capacities / sum(capacities) * N)
  }
  # Expand region indices by capacity, in attractiveness order; first N slots get households.
  rids_expanded <- rep(ro, capacities[ro])
  region <- rids_expanded[household]
  if (anyNA(region)) cli_abort("NAs in regions")

  if (N < length(rids_expanded) && is.numeric(random_regions)) {
    # redistribute a portion
    moves <- min(length(rids_expanded) - N, N * random_regions)
    if (moves > 0) {
      if (verbose) cli_alert_info("randomly placing {.field {moves}} households in remaining regions")
      region[sample(household, moves)] <- sample(rids_expanded[-household], moves)
    }
  }

  # select head-of-household income
  if (!is.numeric(cost_loci) || cost_loci < 1) {
    cost_loci <- 1
    if (verbose) cli_alert_warning("setting {.arg cost_loci} to {.field 1}")
  } else if (cost_loci > nr) {
    cost_loci <- max(1, nr - 1)
    if (verbose) cli_alert_warning("too many cost loci; setting {.arg cost_loci} to {.field {cost_loci}}")
  } else if (verbose) {
    cli_alert_info("cost loci: {.field {cost_loci}}")
  }
  selected_cost_loci <- sample.int(nr, cost_loci)
  # Regional cost scales with total similarity to the cost loci.
  space_cols[selected_cost_loci] <- 1L
  region_cost <- abs(rnorm(nr, as.numeric(space %*% space_cols) * 5e5, 1e5))
  space_cols[selected_cost_loci] <- 0L
  # Head income: noise around regional cost, skewed downward by a Beta(.1, 1) - .6 factor.
  head_income <- abs(as.integer(
    rnorm(N, region_cost[region], 2e3) + (rbeta(N, .1, 1) - .6) * region_cost[region]
  ))

  # selecting building type
  if (verbose) cli_alert_info("drawing building type and rental types")
  region_building_type <- rep(1L, nr)
  su <- capacities == 1
  if (n_building_types > 1) {
    if (any(su)) {
      # Single-capacity regions get a non-1 building type (sampled when more than 2 types).
      region_building_type[su] <- if (n_building_types == 2) {
        2L
      } else {
        sample.int(n_building_types - 1, sum(su), TRUE) + 1L
      }
    }
  }
  building_type <- region_building_type[region]

  # select household sizes
  if (!is.numeric(size_loci) || size_loci < 1) {
    size_loci <- 1
    if (verbose) cli_alert_warning("setting {.arg size_loci} to {.field 1}")
  } else if (size_loci > nr) {
    size_loci <- max(1, nr - 1)
    if (verbose) cli_alert_warning("too many size loci; setting {.arg size_loci} to {.field {size_loci}}")
  } else if (verbose) {
    cli_alert_info("size loci: {.field {size_loci}}")
  }
  selected_size_loci <- sample.int(nr, size_loci)
  space_cols[selected_size_loci] <- .1 / size_loci
  # Poisson rate combines single-unit status, nearness to size loci, relative income,
  # and building type; +1 so every household has at least one member.
  size_prob <- rnorm(
    N,
    (su * 3 + (space %*% space_cols) * size_loci)[region, ] +
      head_income / max(head_income) * 5 +
      1 / building_type * .5,
    .4
  )
  size <- rpois(N, abs(size_prob)) + 1L

  # selecting renting status
  # P(rent) = (.6 if below mean income else .4) * (.8 if building type 1 else .5)
  if (verbose) cli_alert_info("drawing renting status")
  renting <- rbinom(N, 1, ((head_income < mean(head_income)) * .2 + .4) *
    ((building_type == 1) * .3 + .5))

  # select race base rates
  if (verbose) cli_alert_info("drawing race baserates")
  race_rates <- rbeta(n_races, 1, 5)
  race_rates <- race_rates / (race_rates + max(race_rates))

  # making space sparse
  # Drop sub-threshold similarities so only qualifying neighbors remain in the matrix.
  if (verbose) cli_alert_info("defining region neighbors")
  space@x[space@x < neighbor_range] <- 0
  space <- drop0(space)

  # preparing individual data
  individuals <- data.frame()
  if (verbose) {
    cli_progress_step(
      "generating individuals...",
      msg_done = "generated {.field {nrow(individuals)}} individuals"
    )
  }
  individuals <- generate_individuals(
    region, head_income, size, renting, space, n_neighbors, race_rates
  )

  list(
    params = list(
      # NOTE(review): "races_rates" appears to be a typo of "race_rates", but the name
      # is kept as-is for backward compatibility with callers reading params$races_rates.
      neighbors = n_neighbors, range = neighbor_range, races_rates = race_rates,
      n_building_types = n_building_types
    ),
    regions = data.frame(
      id = rids, capacity = capacities, cost = region_cost,
      building_type = region_building_type, regions
    ),
    households = data.frame(
      household,
      region = rids[region], head_income, size, building_type, renting
    ),
    individuals = as.data.frame(individuals)
  )
}
1 |
#' Redistribute Data |
|
2 |
#' |
|
3 |
#' Distribute data from a source frame to a target frame. |
|
4 |
#' |
|
5 |
#' @param source A matrix-like object you want to distribute from; usually this will be |
|
6 |
#' the real or more complete dataset, and is often at a lower resolution / higher level. |
|
7 |
#' @param target A matrix-like object you want to distribute to: usually this will be |

8 |
#' the dataset you want but that isn't available, and is often at a higher resolution / lower level |
|
9 |
#' (for disaggregation). Can also be a single number, representing the number of initial |
|
10 |
#' characters of \code{source} IDs to derive target IDs from (useful for aggregating up nested groups). |
|
11 |
#' @param map A list with entries named with \code{source} IDs (or aligning with those IDs), |
|
12 |
#' containing vectors of associated \code{target} IDs (or indices of those IDs). Entries |
|
13 |
#' can also be numeric vectors with IDs as names, which will be used to weigh the relationship. |
|
14 |
#' If IDs are related by substrings (the first characters of \code{target} IDs are \code{source} IDs), |
|
15 |
#' then a map can be automatically generated from them. If \code{source} and \code{target} |
|
16 |
#' contain \code{sf} geometries, a map will be made with \code{\link[sf]{st_intersects}} |
|
17 |
#' (\code{st_intersects(source, target)}). If an intersects map is made, and \code{source} |
|
18 |
#' is being aggregated to \code{target}, and map entries contain multiple target IDs, |
|
19 |
#' those entries will be weighted by their proportion of overlap with the source area. |
|
20 |
#' @param source_id,target_id Name of a column in \code{source} / \code{target}, |
|
21 |
#' or a vector containing IDs. For \code{source}, this will default to the first column. For |
|
22 |
#' \code{target}, columns will be searched through for one that appears to relate to the |
|
23 |
#' source IDs, falling back to the first column. |
|
24 |
#' @param weight Name of a column, or a vector containing weights (or single value to apply to all cases), |
|
25 |
#' which apply to \code{target} when disaggregating, and \code{source} when aggregating. |
|
26 |
#' Defaults to unit weights (all weights are 1). |
|
27 |
#' @param source_variable,source_value If \code{source} is tall (with variables spread across |
|
28 |
#' rows rather than columns), specifies names of columns in \code{source} containing variable names |
|
29 |
#' and values for conversion. |
|
30 |
#' @param aggregate Logical; if specified, will determine whether to aggregate or disaggregate |
|
31 |
#' from \code{source} to \code{target}. Otherwise, this will be \code{TRUE} if there are more |
|
32 |
#' \code{source} observations than \code{target} observations. |
|
33 |
#' @param weight_agg_method Means of aggregating \code{weight}, in the case that target IDs contain duplicates. |
|
34 |
#' Options are \code{"sum"}, \code{"average"}, or \code{"auto"} (default; which will sum if \code{weight} |
|
35 |
#' is integer-like, and average otherwise). |
|
36 |
#' @param rescale Logical; if \code{FALSE}, will not adjust target values after redistribution such that they |
|
37 |
#' match source totals. |
|
38 |
#' @param drop_extra_sources Logical; if \code{TRUE}, will remove any source rows that are not mapped |
|
39 |
#' to any target rows. Useful when inputting a source with regions outside of the target area, |
|
40 |
#' especially when \code{rescale} is \code{TRUE}. |
|
41 |
#' @param default_value Value to set to any unmapped target ID. |
|
42 |
#' @param outFile Path to a CSV file in which to save results. |
|
43 |
#' @param overwrite Logical; if \code{TRUE}, will overwrite an existing \code{outFile}. |
|
44 |
#' @param make_intersect_map Logical; if \code{TRUE}, will opt to calculate an intersect-based map |
|
45 |
#' rather than an ID-based map, if both seem possible. If specified as \code{FALSE}, will |
|
46 |
#' never calculate an intersect-based map. |
|
47 |
#' @param fill_targets Logical; if \code{TRUE}, will make new \code{target} rows for any |
|
48 |
#' un-mapped \code{source} row. |
|
49 |
#' @param overlaps If specified and not \code{TRUE} or \code{"keep"} (default), will assign \code{target} |
|
50 |
#' entities that are mapped to multiple \code{source} entities to a single source entity. The value |
|
51 |
#' determines how entities with the same weight should be assigned, between \code{"first"}, |
|
52 |
#' \code{"last"}, and \code{"random"}. |
|
53 |
#' @param use_all Logical; if \code{TRUE} (default), will redistribute map weights so they sum to 1. |
|
54 |
#' Otherwise, entities may be partially weighted. |
|
55 |
#' @param return_geometry Logical; if \code{FALSE}, will not set the returned \code{data.frame}'s |
|
56 |
#' geometry to that of \code{target}, if it exists. |
|
57 |
#' @param return_map Logical; if \code{TRUE}, will only return the map, without performing the |
|
58 |
#' redistribution. Useful if you want to inspect an automatically created map, or use it in a later call. |
|
59 |
#' @param verbose Logical; if \code{TRUE}, will show status messages. |
|
60 |
#' @examples |
|
61 |
#' # minimal example |
|
62 |
#' source <- data.frame(a = 1, b = 2) |
|
63 |
#' target <- 1:5 |
|
64 |
#' (redistribute(source, target, verbose = TRUE)) |
|
65 |
#' |
|
66 |
#' # multi-entity example |
|
67 |
#' source <- data.frame(id = c("a", "b"), cat = c("aaa", "bbb"), num = c(1, 2)) |
|
68 |
#' target <- data.frame( |
|
69 |
#' id = sample(paste0(c("a", "b"), rep(1:5, 2))), |
|
70 |
#' population = sample.int(1e5, 10) |
|
71 |
#' ) |
|
72 |
#' (redistribute(source, target, verbose = TRUE)) |
|
73 |
#' @returns A \code{data.frame} with a row for each \code{target_ids} (identified by the first column, |
|
74 |
#' \code{id}), and a column for each variable from \code{source}. |
|
75 |
#' @importFrom cli cli_abort cli_bullets cli_alert_info cli_alert_warning cli_warn |
|
76 |
#' cli_progress_step cli_progress_update cli_progress_done |
|
77 |
#' @importFrom Rcpp sourceCpp |
|
78 |
#' @importFrom RcppParallel RcppParallelLibs |
|
79 |
#' @importFrom utils unzip |
|
80 |
#' @importFrom sf st_intersects st_intersection st_geometry st_geometry<- st_crs st_geometry_type |
|
81 |
#' st_coordinates st_centroid st_boundary st_cast st_polygon st_union st_transform st_buffer st_as_sf |
|
82 |
#' st_is_valid st_make_valid st_sfc st_point |
|
83 |
#' @importFrom s2 s2_area s2_is_valid |
|
84 |
#' @importFrom lingmatch lma_simets |
|
85 |
#' @importFrom jsonlite read_json write_json |
|
86 |
#' @importFrom curl curl_fetch_disk |
|
87 |
#' @importFrom vroom vroom vroom_write |
|
88 |
#' @importFrom stats rbeta rbinom rnorm rpois |
|
89 |
#' @importFrom Matrix Matrix drop0 |
|
90 |
#' @useDynLib redistribute, .registration = TRUE |
|
91 |
#' @export |
|
92 | ||
93 |
redistribute <- function(source, target = NULL, map = list(), source_id = "GEOID", target_id = source_id, |
|
94 |
weight = NULL, source_variable = NULL, source_value = NULL, aggregate = NULL, |
|
95 |
weight_agg_method = "auto", rescale = TRUE, drop_extra_sources = FALSE, |
|
96 |
default_value = NA, outFile = NULL, overwrite = FALSE, make_intersect_map = FALSE, |
|
97 |
fill_targets = FALSE, overlaps = "keep", use_all = TRUE, return_geometry = TRUE, |
|
98 |
return_map = FALSE, verbose = FALSE) { |
|
99 | 27x |
if (!overwrite && !is.null(outFile) && file.exists(outFile)) { |
100 | ! |
cli_abort("{.arg outFile} already exists; use {.code overwrite = TRUE} to overwrite it") |
101 |
} |
|
102 | 27x |
can_intersects <- missing(make_intersect_map) || make_intersect_map |
103 | 27x |
source_sf <- can_intersects && inherits(source, c("sfc", "sf")) |
104 | 27x |
target_sf <- can_intersects && inherits(target, c("sfc", "sf")) |
105 | 27x |
can_intersects <- source_sf && target_sf |
106 | 27x |
if (!can_intersects && make_intersect_map) { |
107 | 1x |
if (!source_sf && is.data.frame(source) && any(vapply(source, inherits, TRUE, "sfc"))) { |
108 | 1x |
cli_warn("{.arg source} is not an sf object, but contains a simple features column, so converting it") |
109 | 1x |
source <- st_as_sf(source) |
110 | 1x |
source_sf <- TRUE |
111 |
} |
|
112 | 1x |
if (!target_sf && is.data.frame(target) && any(vapply(target, inherits, TRUE, "sfc"))) { |
113 | ! |
cli_warn("{.arg target} is not an sf object, but contains a simple features column, so converting it") |
114 | ! |
target <- st_as_sf(target) |
115 | ! |
target_sf <- TRUE |
116 |
} |
|
117 | 1x |
can_intersects <- source_sf && target_sf |
118 | 1x |
if (!can_intersects) { |
119 | ! |
cli_abort( |
120 | ! |
"{.arg make_intersect_map} was set to {.field TRUE}, but {.arg source} and {.arg target} are not both sf objects" |
121 |
) |
|
122 |
} |
|
123 |
} |
|
124 | 27x |
intersect_map <- FALSE |
125 | ! |
if (length(dim(source)) != 2) source <- data.frame(source) |
126 | 1x |
if (is.null(colnames(source))) colnames(source) <- paste0("V", seq_len(ncol(source))) |
127 | 27x |
if (length(source_id) > 1) { |
128 | ! |
if (verbose) cli_alert_info("source IDs: {.arg source_id} vector") |
129 | ! |
sid <- source_id |
130 | 27x |
} else if (source_id %in% colnames(source)) { |
131 | 1x |
if (verbose) cli_alert_info("source IDs: {.field {source_id}} column of {.arg source}") |
132 | 16x |
sid <- source[, source_id, drop = TRUE] |
133 | 16x |
source <- source[, colnames(source) != source_id, drop = FALSE] |
134 | 11x |
} else if (nrow(source) == 1) { |
135 | ! |
if (verbose) cli_alert_info("source IDs: {.field 1}") |
136 | 2x |
sid <- if (missing(source_id)) "1" else source_id |
137 |
} else { |
|
138 | 9x |
if (source_sf && target_sf) { |
139 | ! |
if (verbose) cli_alert_info("source IDs: sequence, assuming map from geometries") |
140 | 3x |
intersect_map <- TRUE |
141 | 3x |
sid <- seq_len(nrow(source)) |
142 |
} else { |
|
143 | ! |
if (verbose) cli_alert_info("source IDs: {.field {colnames(source)[1]}} column of {.arg source}") |
144 | 6x |
sid <- source[, 1, drop = TRUE] |
145 | 6x |
source <- source[, -1, drop = FALSE] |
146 |
} |
|
147 |
} |
|
148 | 27x |
sid <- as.character(sid) |
149 | ! |
if (length(sid) != nrow(source)) cli_abort("{.arg source_id} is not the same length as {.field nrow(source)}") |
150 | 27x |
if (anyDuplicated(sid)) { |
151 | 1x |
if (is.null(source_variable)) { |
152 | 1x |
if ("variable" %in% colnames(source)) source_variable <- "variable" |
153 | ! |
if ("measure" %in% colnames(source)) source_variable <- "measure" |
154 |
} |
|
155 | 1x |
if (is.null(source_value)) { |
156 | 1x |
if ("value" %in% colnames(source)) source_value <- "value" |
157 |
} |
|
158 |
} |
|
159 | 27x |
if (!is.null(source_variable) || !is.null(source_value)) { |
160 | 1x |
source_numeric <- vapply(seq_len(ncol(source)), function(col) is.numeric(source[, col, drop = TRUE]), TRUE) |
161 | ! |
if (is.null(source_value) && any(source_numeric)) source_value <- colnames(source)[which(source_numeric)[1]] |
162 | 1x |
if (is.null(source_variable) && any(!source_numeric)) { |
163 | ! |
source_variable <- colnames(source)[ |
164 | ! |
which.min(vapply(which(!source_numeric), function(col) length(unique(source[, col])), 0)) |
165 |
] |
|
166 |
} |
|
167 | 1x |
if (verbose) { |
168 | ! |
cli_alert_info(paste( |
169 | ! |
"converting {.arg source} format, breaking", |
170 | ! |
if (length(source_value) == 1) "{.field {source_value}}" else "{.arg source_value}", |
171 | ! |
"values across", |
172 | ! |
if (length(source_variable) == 1) "{.field {source_variable}}" else "{.arg source_variable}", |
173 | ! |
"columns" |
174 |
)) |
|
175 |
} |
|
176 | 1x |
usid <- unique(sid) |
177 | 1x |
nr <- nrow(source) |
178 | 1x |
if (length(source_value) != nr) source_value <- source[, source_value, drop = TRUE] |
179 | 1x |
if (length(source_variable) != nr) source_variable <- source[, source_variable, drop = TRUE] |
180 | 1x |
ss <- split(data.frame(sid, source_value), source_variable) |
181 | 1x |
source <- do.call(cbind, lapply(ss, function(vd) { |
182 | 2x |
vd <- vd[!duplicated(vd[[1]]), ] |
183 | 2x |
rownames(vd) <- vd[, 1] |
184 | 2x |
vd[usid, 2, drop = TRUE] |
185 |
})) |
|
186 | 1x |
colnames(source) <- names(ss) |
187 | 1x |
source <- as.data.frame(source) |
188 | 1x |
sid <- usid |
189 |
} |
|
190 | ! |
if (length(dim(target)) == 2 && is.null(colnames(target))) colnames(target) <- paste0("V", seq_len(ncol(target))) |
191 | 27x |
if (length(target) == 1 && is.numeric(target)) { |
192 | ! |
if (verbose) cli_alert_info("target IDs: first {.field {target}} characters of {.arg source} IDs") |
193 | ! |
tid <- target <- unique(substring(sid, 1, target)) |
194 | 27x |
} else if (length(target_id) > 1) { |
195 | ! |
if (verbose) cli_alert_info("target IDs: {.arg target_id} vector") |
196 | ! |
tid <- target_id |
197 | 27x |
} else if (is.null(target)) { |
198 | ! |
if (verbose) cli_alert_info("target IDs: {.field 1}") |
199 | ! |
tid <- 1 |
200 | 27x |
} else if (!is.null(target_id) && length(dim(target)) == 2 && target_id %in% colnames(target)) { |
201 | 1x |
if (verbose) cli_alert_info("target IDs: {.field {target_id}} column of {.arg target}") |
202 | 16x |
tid <- target[, target_id, drop = TRUE] |
203 | 16x |
target <- target[, colnames(target) != target_id, drop = FALSE] |
204 |
} else { |
|
205 | 11x |
if (length(dim(target)) != 2) { |
206 | 2x |
if (is.null(weight) && (target_sf || is.numeric(target)) && !is.null(names(target))) { |
207 | ! |
if (target_sf) { |
208 | ! |
if (verbose) cli_alert_info("target IDs: names of {.arg target} vector") |
209 |
} else { |
|
210 | ! |
if (verbose) cli_alert_info("target IDs: names of {.arg target} vector; {.arg weight} set to {.arg target}") |
211 | ! |
weight <- unname(target) |
212 |
} |
|
213 | ! |
tid <- names(target) |
214 |
} else { |
|
215 | 2x |
if (target_sf) { |
216 | ! |
if (verbose) cli_alert_info("target IDs: sequence") |
217 | ! |
tid <- seq_along(target) |
218 |
} else { |
|
219 | ! |
if (verbose) cli_alert_info("target IDs: {.arg target} vector") |
220 | 2x |
tid <- target |
221 |
} |
|
222 |
} |
|
223 |
} else { |
|
224 | 9x |
if (can_intersects) { |
225 | ! |
if (verbose) cli_alert_info("target IDs: sequence, assuming map from geometries") |
226 | 3x |
intersect_map <- TRUE |
227 | 3x |
tid <- seq_len(nrow(target)) |
228 |
} else { |
|
229 | 6x |
idi <- 1 |
230 | 6x |
if (length(map)) { |
231 | ! |
tids <- unique(unlist(map, use.names = FALSE)) |
232 | ! |
for (i in seq_len(ncol(target))) { |
233 | ! |
if (any(target[, i] %in% tids)) { |
234 | ! |
idi <- i |
235 | ! |
break |
236 |
} |
|
237 |
} |
|
238 | 6x |
} else if (length(sid) > 1 && (sl <- nchar(sid[1])) > 1 && all(nchar(sid) == sl)) { |
239 | ! |
for (i in seq_len(ncol(target))) { |
240 | ! |
if (any(substr(target[, i, drop = TRUE], 1, sl) %in% sid)) { |
241 | ! |
idi <- i |
242 | ! |
break |
243 |
} |
|
244 |
} |
|
245 |
} |
|
246 | ! |
if (verbose) cli_alert_info("target IDs: {.field {colnames(target)[idi]}} column of {.arg target}") |
247 | 6x |
tid <- target[, idi, drop = TRUE] |
248 | 6x |
target <- target[, -idi, drop = FALSE] |
249 |
} |
|
250 |
} |
|
251 |
} |
|
252 | 27x |
tid <- as.character(tid) |
253 | 27x |
if (!is.logical(aggregate)) aggregate <- length(sid) > length(tid) |
254 | 5x |
if (make_intersect_map && source_sf && target_sf) intersect_map <- TRUE |
255 | 27x |
if (length(map)) { |
256 | 1x |
if (verbose) cli_alert_info("map: provided list") |
257 | 9x |
intersect_map <- FALSE |
258 | 9x |
if (is.null(names(map))) { |
259 | ! |
if (length(map) != length(sid)) cli_abort("{.arg map} has no names, and is not the same length as source IDs") |
260 | ! |
names(map) <- sid |
261 | 9x |
} else if (!any(sid %in% names(map))) { |
262 | ! |
if (!any(tid %in% names(map))) cli_abort("no source or target IDs were in map names") |
263 | 1x |
mw <- unlist(unname(map)) |
264 | ! |
if (is.null(names(mw))) mw <- structure(numeric(length(mw)) + 1, names = mw) |
265 | 1x |
onames <- names(mw) |
266 | 1x |
if (!any(sid %in% onames)) { |
267 | ! |
if (all(!grepl("[^0-9]", onames))) { |
268 | ! |
onames <- as.integer(onames) |
269 | ! |
if (!anyNA(onames) && min(onames) > 0 && max(onames) <= length(sid)) { |
270 | ! |
if (verbose) cli_alert_info("inverting map, with entry values mapped to source IDs") |
271 | ! |
names(mw) <- rep(names(map), vapply(map, length, 0)) |
272 | ! |
map <- split(mw, sid[onames]) |
273 |
} else { |
|
274 | ! |
cli_abort("map appeared to refer to target indices, but they are out of range") |
275 |
} |
|
276 |
} else { |
|
277 | ! |
cli_abort("no source IDs were present in the ID map") |
278 |
} |
|
279 |
} else { |
|
280 | ! |
if (verbose) cli_alert_info("inverting map") |
281 | 1x |
names(mw) <- rep(names(map), vapply(map, length, 0)) |
282 | 1x |
map <- split(mw, onames) |
283 |
} |
|
284 |
} |
|
285 |
} else { |
|
286 | 18x |
if (nrow(source) == 1) { |
287 | ! |
if (verbose) cli_alert_info("map: all target IDs for single source") |
288 | 2x |
intersect_map <- FALSE |
289 | 2x |
map[[sid]] <- tid |
290 | 16x |
} else if (is.null(target) || length(tid) == 1) { |
291 | ! |
if (verbose) cli_alert_info("map: all source IDs to a single target") |
292 | ! |
intersect_map <- FALSE |
293 | ! |
map <- as.list(structure(rep(tid, length(sid)), names = sid)) |
294 | 16x |
} else if (aggregate && !intersect_map && all(nchar(tid) == nchar(tid[1])) && |
295 | 16x |
nchar(sid[1]) > nchar(tid[1]) && any(substring(sid, 1, nchar(tid[1])) %in% tid)) { |
296 | ! |
if (verbose) cli_alert_info("map: first {.field {nchar(tid[1])}} character{?s} of source IDs") |
297 | 1x |
intersect_map <- FALSE |
298 | 1x |
map <- as.list(structure(substr(sid, 1, nchar(tid[1])), names = sid)) |
299 | 15x |
} else if (!intersect_map && all(nchar(sid) == nchar(sid[1])) && nchar(tid[1]) > nchar(sid[1]) && |
300 | 15x |
any(substring(tid, 1, nchar(sid[1])) %in% sid) && |
301 | 15x |
(!can_intersects || !all(substring(tid, 1, nchar(sid[1])) == substring(tid[1], 1, nchar(sid[1]))))) { |
302 | ! |
if (verbose) cli_alert_info("map: first {.field {nchar(sid[1])}} character{?s} of target IDs") |
303 | 8x |
map <- split(tid, substr(tid, 1, nchar(sid[1]))) |
304 | 7x |
} else if (can_intersects) { |
305 | 7x |
if (verbose) { |
306 | ! |
cli_alert_info("map: intersections between geometries") |
307 | ! |
cli_progress_step("initial mapping...", msg_done = "done initial mapping") |
308 |
} |
|
309 | 7x |
intersect_map <- TRUE |
310 | 7x |
op <- options(sf_use_s2 = FALSE) |
311 | 7x |
on.exit(options(sf_use_s2 = op[[1]])) |
312 | 7x |
if (st_crs(source) != st_crs(target)) { |
313 | 1x |
if (length(tid) > length(sid)) { |
314 | 1x |
source <- st_transform(source, st_crs(target)) |
315 |
} else { |
|
316 | ! |
target <- st_transform(target, st_crs(source)) |
317 |
} |
|
318 |
} |
|
319 | 7x |
su <- !st_is_valid(st_geometry(source)) |
320 | 7x |
if (any(su)) { |
321 | 1x |
st_geometry(source)[su] <- st_make_valid(st_geometry(source)[su]) |
322 | 1x |
su <- !st_is_valid(st_geometry(source)) |
323 | 1x |
if (any(su)) { |
324 | ! |
cli_warn("some {.arg source} geometries were not valid, so removed unrepairable ones") |
325 | ! |
source <- source[!su, ] |
326 | ! |
sid <- sid[!su] |
327 |
} else { |
|
328 | 1x |
cli_warn("some {.arg source} geometries were not valid, so repaired them") |
329 |
} |
|
330 |
} |
|
331 | 7x |
su <- !st_is_valid(st_geometry(target)) |
332 | 7x |
if (any(su)) { |
333 | 1x |
st_geometry(target)[su] <- st_make_valid(st_geometry(target)[su]) |
334 | 1x |
su <- !st_is_valid(st_geometry(target)) |
335 | 1x |
if (any(su)) { |
336 | ! |
cli_warn("some {.arg target} geometries were not valid, so removed unrepairable ones") |
337 | ! |
target <- target[!su, ] |
338 | ! |
tid <- tid[!su] |
339 |
} else { |
|
340 | 1x |
cli_warn("some {.arg target} geometries were not valid, so repaired them") |
341 |
} |
|
342 |
} |
|
343 | 7x |
map <- tryCatch(suppressMessages(st_intersects(st_geometry(source), st_geometry(target))), error = function(e) NULL) |
344 | 7x |
if (is.null(map)) { |
345 | ! |
cli_abort(c( |
346 | ! |
x = "{.fn st_intersects} failed between {.arg source} and {.arg target}", |
347 | ! |
i = "if you can do this or similar separetely, you can pass the result as {.arg map}" |
348 |
)) |
|
349 |
} |
|
350 | 7x |
names(map) <- sid |
351 |
} else { |
|
352 | ! |
cli_abort("no map was provided, and it could not be made from IDs") |
353 |
} |
|
354 |
} |
|
355 | 27x |
if (aggregate && length(map) == length(tid) && length(map) != length(sid)) { |
356 | ! |
ids <- if (is.null(names(map))) seq_along(map) else names(map) |
357 | ! |
names(map) <- NULL |
358 | ! |
child_counts <- vapply(map, length, 0) |
359 | ! |
map <- as.list(unlist(map)) |
360 | ! |
names(map) <- rep(ids, child_counts) |
361 |
} |
|
362 | 27x |
if ((is.logical(overlaps) && overlaps) || grepl("^[Kk]", overlaps)) { |
363 | 24x |
dodedupe <- FALSE |
364 |
} else { |
|
365 | 3x |
tiebreak <- c(f = "first", l = "last", r = "random")[tolower(substring(overlaps, 1, 1))] |
366 | 3x |
dodedupe <- !is.na(tiebreak) |
367 | 3x |
deduper <- switch(tiebreak, |
368 | 3x |
first = function(e, agg = TRUE) { |
369 | 5x |
su <- which(e == max(e)) |
370 | 5x |
if (length(su) > 1) su <- su[1] |
371 | 5x |
if (agg) e[su] else su |
372 |
}, |
|
373 | 3x |
last = function(e, agg = TRUE) { |
374 | 5x |
su <- which(e == max(e)) |
375 | 5x |
if (length(su) > 1) su <- su[length(su)] |
376 | 5x |
if (agg) e[su] else su |
377 |
}, |
|
378 | 3x |
random = function(e, agg = TRUE) { |
379 | 5x |
su <- which(e == max(e)) |
380 | 5x |
if (length(su) > 1) su <- sample(su, 1) |
381 | 5x |
if (agg) e[su] else su |
382 |
} |
|
383 |
) |
|
384 |
} |
|
385 | 27x |
any_partial <- FALSE |
386 | 27x |
map <- map[sid] |
387 | 27x |
mls <- vapply(map, length, 0) |
388 | 27x |
if (!fill_targets && drop_extra_sources) { |
389 | 1x |
su <- vapply(map, length, 0) != 0 |
390 | 1x |
if (!all(su)) { |
391 | ! |
if (verbose) cli_alert_info("removing {sum(!su)} {.arg source}{?s} with no mapped {.arg target}s") |
392 | 1x |
source <- source[su, , drop = FALSE] |
393 | 1x |
sid <- sid[su] |
394 | 1x |
map <- map[su] |
395 |
} |
|
396 |
} |
|
397 | 27x |
if (intersect_map) { |
398 | 7x |
source_geom <- st_geometry(source) |
399 | 7x |
names(source_geom) <- sid |
400 |
} |
|
401 | 27x |
polys <- intersect_map && any(grepl("POLY", st_geometry_type( |
402 | 27x |
if (aggregate) source_geom else st_geometry(target) |
403 | 27x |
), fixed = TRUE)) |
404 | 27x |
if (polys && any(mls > 1)) { |
405 | 7x |
if (verbose) { |
406 | ! |
oi <- 0 |
407 | ! |
cli_progress_step( |
408 | ! |
"calculating map intersections ({oi}/{length(map)})", |
409 | ! |
msg_done = "calculated map intersections ({length(map)})" |
410 |
) |
|
411 |
} |
|
412 | 7x |
map <- lapply(seq_along(map), function(i) { |
413 | 169x |
if (verbose) { |
414 | ! |
oi <<- i |
415 | ! |
cli_progress_update(.envir = parent.frame(2)) |
416 |
} |
|
417 | 169x |
e <- map[[i]] |
418 | 169x |
if (length(e)) { |
419 | 167x |
res <- if (is.null(names(e))) { |
420 | 167x |
w <- rep(1, length(e)) |
421 | 167x |
if (intersect_map && length(e) > 1) { |
422 | 86x |
reg <- st_geometry(target)[e] |
423 | 86x |
totals <- s2_area(reg) |
424 | 86x |
su <- totals > 0 |
425 | 86x |
if (any(su)) { |
426 | 86x |
part <- suppressMessages(st_intersection(reg[su], source_geom[sid[i]], model = "closed")) |
427 | 86x |
pv <- numeric(length(part)) |
428 | 86x |
tsu <- s2_is_valid(part) |
429 | 86x |
pv[tsu] <- s2_area(part[tsu]) |
430 | 86x |
w[su] <- pv / (if (aggregate) s2_area(source_geom[[sid[i]]]) else totals[su]) |
431 |
} |
|
432 |
} |
|
433 | 167x |
names(w) <- if (is.integer(e)) tid[e] else e |
434 | 167x |
w[w > 0] |
435 |
} else { |
|
436 | ! |
e |
437 |
} |
|
438 | 167x |
if (aggregate && dodedupe) { |
439 | ! |
if (length(res) > 1) { |
440 | ! |
deduper(res) |
441 |
} else { |
|
442 | ! |
res |
443 |
} |
|
444 |
} else { |
|
445 | 76x |
if (!aggregate && any(res < 1)) any_partial <<- TRUE |
446 | 167x |
res |
447 |
} |
|
448 |
} else { |
|
449 | 2x |
numeric() |
450 |
} |
|
451 |
}) |
|
452 | ! |
if (verbose) cli_progress_done() |
453 |
} else { |
|
454 | 20x |
map <- lapply(map, function(e) { |
455 | 275x |
if (is.null(names(e))) { |
456 | 45x |
w <- rep(1, length(e)) |
457 | 45x |
names(w) <- if (is.integer(e)) tid[e] else e |
458 | 45x |
e <- w |
459 |
} |
|
460 | 138x |
if (!aggregate && any(e < 1)) any_partial <<- TRUE |
461 | 275x |
e |
462 |
}) |
|
463 |
} |
|
464 | 27x |
targets <- unlist(unname(map)) |
465 | 19x |
if (!any_partial) any_partial <- anyDuplicated(names(targets)) |
466 | 27x |
if (dodedupe && any_partial) { |
467 | ! |
if (verbose) cli_progress_step("assigning each target to a single source") |
468 | 3x |
any_partial <- FALSE |
469 | 3x |
esids <- factor(rep(sid, vapply(map, length, 0)), unique(sid)) |
470 | 3x |
if (anyDuplicated(names(targets))) { |
471 | 3x |
for (ct in unique(names(targets))) { |
472 | 135x |
su <- which(names(targets) == ct) |
473 | 135x |
if (length(su) > 1) { |
474 | 15x |
sel <- deduper(targets[su], FALSE) |
475 | 15x |
targets[su[-sel]] <- 0 |
476 | 15x |
targets[su[sel]] <- 1 |
477 |
} |
|
478 |
} |
|
479 | 3x |
su <- which(targets != 0) |
480 | 3x |
map <- split(targets[su], esids[su]) |
481 |
} else { |
|
482 | ! |
targets[] <- 1L |
483 | ! |
map <- split(targets, esids) |
484 |
} |
|
485 | ! |
if (verbose) cli_progress_done() |
486 |
} else { |
|
487 | 24x |
names(map) <- sid |
488 |
} |
|
489 | 27x |
n_filled <- 0 |
490 | 27x |
if (fill_targets) { |
491 | 1x |
su <- vapply(map, length, 0) == 0 |
492 | 1x |
if (any(su)) { |
493 | 1x |
missed_sources <- names(which(su)) |
494 | 1x |
ftids <- paste0("filled_", missed_sources) |
495 | 1x |
n_filled <- length(ftids) |
496 | ! |
if (verbose) cli_alert_info("adding {n_filled} target ID{?s} to cover all source rows") |
497 | 1x |
tid <- c(tid, ftids) |
498 | 1x |
map[missed_sources] <- split(structure(rep(1, n_filled), names = ftids), missed_sources) |
499 |
} |
|
500 |
} |
|
501 | ! |
if (verbose && intersect_map) cli_progress_done() |
502 | 27x |
if (return_map) { |
503 | ! |
if (verbose) cli_alert_info("returning map") |
504 | 5x |
return(map) |
505 |
} |
|
506 | 22x |
mtid <- names(unlist(unname(map))) |
507 | 22x |
nout <- length(if (aggregate) sid else tid) |
508 | 22x |
w <- if (length(weight) > 1) { |
509 | ! |
if (verbose) cli_alert_info("weights: {.arg weight} vector") |
510 | 5x |
if (length(weight) != nout && (!n_filled || length(weight) + n_filled != nout)) { |
511 | ! |
cli_abort("{.arg weight} is not the same length as {.arg {if (aggregate) 'source' else 'target'}} IDs") |
512 |
} |
|
513 | 5x |
weight |
514 | 22x |
} else if (aggregate && !is.null(weight) && weight %in% colnames(source)) { |
515 | ! |
if (verbose) cli_alert_info("weights: {.field {weight}} column of {.arg source}") |
516 | ! |
w <- source[, weight, drop = TRUE] |
517 | ! |
source <- source[, colnames(weight) != weight, drop = FALSE] |
518 | ! |
w |
519 | 22x |
} else if (!is.null(weight) && weight %in% colnames(target)) { |
520 | 1x |
if (verbose) cli_alert_info("weights: {.field {weight}} column of {.arg target}") |
521 | 12x |
target[, weight, drop = TRUE] |
522 |
} else { |
|
523 | 5x |
weight <- if (!is.numeric(weight)) 1 else weight |
524 | ! |
if (verbose) cli_alert_info("weights: {.field {weight}}") |
525 | 5x |
rep(weight, nout) |
526 |
} |
|
527 | 22x |
w <- as.numeric(w) |
528 | 1x |
if (n_filled) w <- c(w, rep(1, n_filled)) |
529 | 22x |
if (anyNA(w)) { |
530 | ! |
cli_warn("weights contained NAs, which were set to 0") |
531 | ! |
w[is.na(w)] <- 0 |
532 |
} |
|
533 | 22x |
realign <- FALSE |
534 | 22x |
su <- tid %in% mtid |
535 | 22x |
if (!all(su)) { |
536 | ! |
if (verbose) { |
537 | ! |
cli_alert_info(paste( |
538 | ! |
"{.field {sum(!su)}} of {.field {length(su)}} target IDs", |
539 | ! |
"were dropped because they were not present in {.arg map}" |
540 |
)) |
|
541 |
} |
|
542 | ! |
realign <- TRUE |
543 | ! |
otid <- tid |
544 | ! |
tid <- tid[su] |
545 | ! |
if (!aggregate) w <- w[su] |
546 |
} |
|
547 | 22x |
if (!aggregate && anyDuplicated(tid)) { |
548 | ! |
if (!realign) { |
549 | ! |
realign <- TRUE |
550 | ! |
otid <- tid |
551 |
} |
|
552 | ! |
agger <- if (weight_agg_method == "auto") { |
553 | ! |
if (all(w == as.integer(w))) "summing" else "averaging" |
554 | ! |
} else if (grepl("^[Ss]", weight_agg_method)) "summing" else "averaging" |
555 | ! |
if (verbose) cli_alert_info("{.arg target_id} contains duplicates, so {agger} {.arg weight}s") |
556 | ! |
w <- tapply(w, tid, if (agger == "summing") sum else mean) |
557 | ! |
tid <- names(w) |
558 | ! |
w <- unname(w) |
559 |
} |
|
560 | 11x |
if (source_sf) st_geometry(source) <- NULL |
561 | 22x |
source_numeric <- vapply(seq_len(ncol(source)), function(col) is.numeric(source[, col, drop = TRUE]), TRUE) |
562 | 22x |
if (!all(source_numeric)) { |
563 | 4x |
non_numeric <- which(!source_numeric) |
564 | 4x |
level_map <- list() |
565 | 4x |
for (col in non_numeric) { |
566 | 4x |
x <- as.factor(source[, col, drop = TRUE]) |
567 | 4x |
source[, col] <- as.numeric(x) |
568 | 4x |
level_map[[as.character(col)]] <- list( |
569 | 4x |
index = col, |
570 | 4x |
levels = levels(x) |
571 |
) |
|
572 |
} |
|
573 |
} |
|
574 | 22x |
if (verbose) { |
575 | 1x |
cli_alert_info(paste( |
576 | 1x |
"redistributing {.field {length(source_numeric)}} variable{?s}", |
577 | 1x |
"from {.field {length(sid)}} source{?s} to {.field {length(tid)}} target{?s}:" |
578 |
)) |
|
579 | 1x |
var_groups <- split(colnames(source), !source_numeric) |
580 | 1x |
names(var_groups) <- c( |
581 | 1x |
"TRUE" = "(char; {sum(!source_numeric)})", "FALSE" = "(numb; {sum(source_numeric)})" |
582 | 1x |
)[names(var_groups)] |
583 | 1x |
cli_bullets(structure( |
584 | 1x |
vapply(names(var_groups), function(type) { |
585 | 1x |
paste(type, if (length(var_groups[[type]]) < 10) paste(var_groups[[type]], collapse = ", ")) |
586 |
}, ""), |
|
587 | 1x |
names = rep("*", length(var_groups)) |
588 |
)) |
|
589 | 1x |
type <- if (aggregate) "aggregating" else "disaggregating" |
590 | 1x |
cli_progress_step(paste0(type, "..."), msg_done = paste("done", type)) |
591 |
} |
|
592 | 22x |
method <- as.integer(source_numeric) |
593 | 9x |
if (any_partial) aggregate <- TRUE |
594 | 22x |
if (aggregate) { |
595 | 9x |
method <- method + 10 |
596 | 9x |
if (any_partial) method[method == 11] <- 12 |
597 |
} |
|
598 | 22x |
balance <- any_partial && (!is.logical(use_all) || use_all) |
599 | 22x |
res <- process_distribute(as.matrix(source), method, tid, w, map, aggregate, balance) |
600 | 22x |
res <- as.data.frame(res) |
601 | 22x |
colnames(res) <- colnames(source) |
602 | 1x |
if (verbose) cli_progress_done() |
603 | 22x |
if (!all(source_numeric)) { |
604 | ! |
if (verbose) cli_alert_info("re-converting categorical levels") |
605 | 4x |
for (l in level_map) { |
606 | 4x |
x <- res[, l$index] |
607 | 4x |
x[x == 0] <- NA |
608 | 4x |
res[, l$index] <- l$levels[x] |
609 |
} |
|
610 |
} |
|
611 | 22x |
if (rescale) { |
612 | 22x |
if (verbose) { |
613 | 1x |
status <- "checking totals" |
614 | 1x |
cli_progress_step("{status}") |
615 |
} |
|
616 | 22x |
source_totals <- vapply( |
617 | 22x |
seq_along(source_numeric), |
618 | 22x |
function(i) if (source_numeric[i]) sum(source[, i], na.rm = TRUE) else 0, |
619 | 22x |
0 |
620 |
) |
|
621 | 22x |
res_totals <- vapply( |
622 | 22x |
seq_along(source_numeric), |
623 | 22x |
function(i) if (source_numeric[i]) sum(res[, i], na.rm = TRUE) else 0, |
624 | 22x |
0 |
625 |
) |
|
626 | 22x |
su <- which(source_totals != res_totals) |
627 | 22x |
if (length(su)) { |
628 | 12x |
if (verbose) { |
629 | 1x |
status <- paste0("rescaling ", length(su), " variable", if (length(su) == 1) "" else "s") |
630 | 1x |
cli_progress_update() |
631 |
} |
|
632 | 12x |
res_totals[res_totals == 0] <- 1 |
633 | 12x |
for (i in su) res[, i] <- res[, i] / res_totals[i] * source_totals[i] |
634 | 12x |
if (verbose) { |
635 | 1x |
status <- sub("rescaling", "rescaled", status, fixed = TRUE) |
636 | 1x |
cli_progress_done() |
637 |
} |
|
638 | 10x |
} else if (verbose) { |
639 | ! |
status <- "totals are aligned" |
640 | ! |
cli_progress_done() |
641 |
} |
|
642 |
} |
|
643 | 22x |
res <- cbind(id = tid, res) |
644 | 22x |
if (realign) { |
645 | ! |
if (verbose) cli_alert_info("realigning with original target IDs") |
646 | ! |
rownames(res) <- tid |
647 | ! |
res <- res[otid, ] |
648 | ! |
res$id <- otid |
649 | ! |
if (!is.na(default_value)) res[!res$id %in% tid, -1] <- default_value |
650 | ! |
rownames(res) <- NULL |
651 |
} |
|
652 | 22x |
if (!is.null(outFile)) { |
653 | ! |
if (verbose) cli_progress_step("writing results to {.file {outFile}}") |
654 | ! |
dir.create(dirname(outFile), FALSE, TRUE) |
655 | ! |
if (!grepl("\\.\\w", outFile)) outFile <- paste0(outFile, ".csv") |
656 | ! |
vroom_write(res, outFile, ",") |
657 | ! |
if (verbose) cli_progress_done() |
658 |
} |
|
659 | 22x |
if (return_geometry && target_sf) { |
660 | 11x |
st_geometry(res) <- if (n_filled) { |
661 | ! |
c(st_geometry(target), if (source_sf) { |
662 | ! |
structure(source_geom, names = sid)[missed_sources] |
663 |
} else { |
|
664 | ! |
st_sfc(lapply(seq_len(n_filled), function(i) st_point()), crs = st_crs(target)) |
665 |
}) |
|
666 |
} else { |
|
667 | 11x |
st_geometry(target) |
668 |
} |
|
669 |
} |
|
670 | 22x |
invisible(res) |
671 |
} |
1 |
#' Make Higher-Order Polygons
#'
#' Randomly combine given polygons into contiguous, larger polygons.
#'
#' @param polygons An \code{sf} object with polygons to be reformed.
#' @param ids A vector of IDs the same length as \code{polygons}, or the column name
#' in \code{polygons} to use as IDs.
#' @param n Number of polygons to aim to combine for each new polygon.
#' @param strict_n Logical; if \code{TRUE} (default), \code{n} represents the number of
#' intersecting polygons to select, depending on availability (with a minimum of 2).
#' If \code{FALSE}, \code{n} represents the number of nearest polygons (by centroid)
#' to use when calculating a box to use when selecting polygons (with a minimum of 1).
#' @param n_as_min Logical; if \code{TRUE}, will merge any parents with fewer than \code{n}
#' children with a random neighbor. Otherwise (and by default), parents may have fewer than \code{n}
#' children. Applies if \code{strict_n} is \code{TRUE}.
#' @param buffer_dist Distance around each initial shape or set of shapes, used to define
#' neighboring shapes. Applies if \code{strict_n} is \code{TRUE}.
#' @param min_overlap Minimal area of overlap between potential neighboring shapes and
#' the buffered target shape, used to define neighboring shapes. Defaults to a small
#' fraction (0.35\%) of the smallest polygon's area. Applies if \code{strict_n} is \code{TRUE}.
#' @param verbose Logical; if \code{FALSE}, will not print status messages.
#' @returns A list with entries for \code{new} (an \code{sf} object containing the new polygons)
#' and \code{map} (a list mapping between the old and new polygon indices).
#' @export

make_parent_polygons <- function(
    polygons, ids = NULL, n = 2, strict_n = TRUE, n_as_min = FALSE, buffer_dist = 5e-5,
    min_overlap = NULL, verbose = TRUE) {
  if (!inherits(polygons, "sf")) cli_abort("{.arg polygons} must be an sf object")
  # allow `ids` to name a column of `polygons`
  if (length(ids) == 1 && ids %in% colnames(polygons)) ids <- polygons[[ids]]
  polygons <- st_geometry(polygons)
  polygons <- st_cast(st_boundary(polygons), "POLYGON", do_split = FALSE)
  # default `min_overlap` to a small fraction of the smallest polygon's area;
  # was `is.null(NULL)` (always TRUE), which clobbered any user-supplied value
  if (strict_n && is.null(min_overlap)) min_overlap <- min(s2_area(polygons)) * .0035
  n_poly <- length(polygons)
  if (!is.null(ids)) {
    if (length(ids) != n_poly) cli_abort("{.arg ids} do not align with {.arg polygons}")
    names(polygons) <- ids
  }
  # clamp `n` to the valid range; strict mode needs at least 2 children per parent
  n <- max(if (strict_n) 2 else 1, min(n, n_poly - 1))
  sels <- seq_len(n) + 1L
  centers <- st_coordinates(suppressWarnings(st_centroid(polygons)))
  rownames(centers) <- seq_len(nrow(centers))
  map <- new <- NULL
  accounted <- integer(n_poly)
  if (verbose) {
    cli_progress_step(
      "mapping child polygons ({.field {sum(accounted)}}/{.field {n_poly}})",
      msg_done = paste0(
        "created initial map from {.field {length(map)}} parent polygons to ",
        "{.field {n_poly}} child polygons"
      ),
      spinner = TRUE
    )
  }
  # repeatedly pick a random unassigned polygon and grow a parent around it
  for (i in seq_len(n_poly)) {
    if (verbose) cli_progress_update()
    available <- which(accounted == 0L)
    if (!length(available)) break
    start <- if (length(available) == 1) available else sample(available, 1)
    if (strict_n) {
      # neighbors are unassigned polygons overlapping the buffered seed by > min_overlap
      sg <- st_buffer(polygons[[start]], buffer_dist, 1)
      ag <- polygons[available]
      pre <- st_intersects(sg, ag)[[1]]
      if (length(pre)) {
        pre <- pre[vapply(ag[pre], function(t) {
          s2_area(st_intersection(t, sg, model = "closed"))
        }, 0) > min_overlap]
      }
      set <- unique(c(start, available[pre]))
      sn <- length(set)
      if (sn < n) {
        # not enough direct neighbors; expand outward from the growing set
        available <- available[!available %in% set]
        while (length(set) < n && length(available)) {
          sg <- st_buffer(st_boundary(polygons[set])[[1]], buffer_dist, 1)
          ag <- polygons[available]
          pre <- st_intersects(sg, ag)[[1]]
          if (length(pre)) {
            pre <- pre[vapply(ag[pre], function(t) {
              s2_area(st_intersection(t, sg, model = "closed"))
            }, 0) > min_overlap]
          }
          set <- unique(c(set, available[pre]))
          if (length(set) > n) {
            set <- set[seq_len(n)]
            break
          } else {
            su <- !available %in% set
            if (all(su)) {
              # no new neighbors were found; stop growing this parent
              break
            } else {
              available <- available[su]
            }
          }
        }
      } else {
        set <- set[seq_len(n)]
      }
    } else {
      # non-strict mode: select by a bounding box sized from the n nearest centroids
      nearest <- as.integer(names(sort(
        lma_simets(centers, centers[start, ], metric = "euclidean"), TRUE
      )[sels]))
      adj <- max(abs(centers[nearest, ] - rep(unname(centers[start, ]), each = n)))
      set <- unique(c(start, available[st_intersects(
        st_polygon(list(matrix(
          rep(centers[start, ], each = 5) - adj * c(1, -1, -1, 1, 1, 1, 1, -1, -1, 1), 5
        ))), polygons[available]
      )[[1]]]))
    }
    accounted[set] <- 1L
    map <- c(list(set), map)
  }
  # normalize to a parent -> children list, keeping each child's first assignment
  map <- data.frame(
    super = rep(seq_along(map), vapply(map, length, 0)),
    sub = unlist(map, use.names = FALSE)
  )
  map <- map[!duplicated(map$sub), ]
  map <- split(map$sub, map$super)
  if (n_as_min) {
    # merge any too-small parent into a random neighboring parent
    su <- which(vapply(map, length, 0) < n)
    if (length(su)) {
      i <- 0
      defunct <- NULL
      if (verbose) {
        cli_progress_step(
          "merging in {.field {length(defunct)}}/{.field {length(su)}} parent{?/s} with too few children",
          msg_done = "merged in {.field {length(defunct)}} of {.field {length(su)}} small parent{?/s}",
          spinner = TRUE
        )
      }
      tmap <- data.frame(
        super = rep(seq_along(map), vapply(map, length, 0)),
        sub = unlist(map, use.names = FALSE)
      )
      available <- seq_along(polygons)
      for (i in su) {
        available <- available[!available %in% map[[i]]]
        sg <- st_buffer(st_boundary(polygons[map[[i]]])[[1]], buffer_dist, 1)
        ag <- polygons[available]
        pre <- st_intersects(sg, ag)[[1]]
        if (length(pre)) {
          pre <- pre[vapply(ag[pre], function(t) {
            s2_area(st_intersection(t, sg, model = "closed"))
          }, 0) > min_overlap]
        }
        if (length(pre)) {
          # adopt this parent's children into the selected neighbor's parent
          ns <- available[if (length(pre) == 1) pre else sample(pre, 1)]
          np <- tmap[[1]][tmap[[2]] == ns]
          map[[np]] <- c(map[[np]], map[[i]])
          available <- available[!available %in% map[[np]]]
          defunct <- c(defunct, i)
        }
        if (verbose) cli_progress_update()
      }
      if (length(defunct)) map <- map[-defunct]
      if (verbose) cli_progress_done()
    }
    # re-normalize after merging
    map <- data.frame(
      super = rep(seq_along(map), vapply(map, length, 0)),
      sub = unlist(map, use.names = FALSE)
    )
    map <- map[!duplicated(map$sub), ]
    map <- split(map$sub, map$super)
  }
  if (!is.null(ids)) map <- lapply(map, function(l) ids[l])
  i <- 0
  if (verbose) {
    cli_progress_step(
      "creating new polygons ({.field {i}}/{.field {length(map)}})",
      msg_done = "created {.field {length(map)}} new polygons", spinner = TRUE
    )
  }
  # union each parent's children into a single new polygon
  new <- do.call(c, lapply(map, function(l) {
    if (verbose) {
      i <<- i + 1
      cli_progress_update(.envir = parent.frame(2))
    }
    st_union(polygons[l])
  }))
  list(new = new, map = map)
}
1 |
#' Download U.S. Census Microdata |
|
2 |
#' |
|
3 |
#' Download and load U.S. census American Community Survey (ACS) Public Use Microdata Samples (PUMS): |
|
4 |
#' \href{https://www.census.gov/programs-surveys/acs/microdata.html}{census.gov/programs-surveys/acs/microdata.html} |
|
5 |
#' |
|
6 |
#' |
|
7 |
#' @param dir Directory in which to save the file(s). |
|
8 |
#' @param state Postal or FIPS code of the state. |
|
9 |
#' @param year 4 digit year, between 2005 and the most recent year. |
|
10 |
#' @param level A character indicating whether to get the person- or household-level sample. Defaults to both. |
|
11 |
#' @param one_year Logical; if \code{FALSE}, will get the 5-year estimates rather than the 1-year file. If |
|
12 |
#' not specified, will fall back to the 5-year file if the 1-year file is not available. |
|
13 |
#' @param calculate_error Logical; if \code{TRUE}, will calculate standard errors for each variable using |
|
14 |
#' the Successive Difference Replicate Formula from the |
|
15 |
#' \href{https://www.census.gov/programs-surveys/acs/library/handbooks/pums.html}{census.gov/programs-surveys/acs/library/handbooks/pums.html}. |
|
16 |
#' @param crosswalk Logical; if \code{FALSE}, will not retrieve the PUMA relationship files for associating |
|
17 |
#' Census tracts with PUM areas. This will be treated as \code{TRUE} if |
|
18 |
#' \code{geoids} is specified. |
|
19 |
#' @param geoids A vector of county, tract, or block group GEOIDs within the specified state to |
|
20 |
#' select PUM areas by; defaults to all areas. If specified, \code{crosswalk} will be treated as \code{TRUE}. |
|
21 |
#' @param verbose Logical; if \code{FALSE}, will not print status messages. |
|
22 |
#' @examples |
|
23 |
#' \dontrun{ |
|
24 |
#' download_census_pums(".", "va") |
|
25 |
#' } |
|
26 |
#' @return A list with entries for \code{year} and \code{state} (as specified), |
|
27 |
#' \code{dictionary} (containing the data dictionary for that year), \code{household} and/or \code{person} |
|
28 |
#' (with survey data), and optionally \code{household_error} and/or \code{person_error} |
|
29 |
#' (if \code{calculate_error} is \code{TRUE}), and \code{crosswalk} (if \code{crosswalk} is \code{TRUE} or |
|
30 |
#' \code{geoids} is specified). |
|
31 |
#' @export |
|
32 | ||
33 |
download_census_pums <- function(dir, state, year = 2021, level = "both", one_year = TRUE, calculate_error = FALSE, |
|
34 |
crosswalk = TRUE, geoids = NULL, verbose = TRUE) { |
|
35 | 3x |
us_fips <- list( |
36 | 3x |
post = c( |
37 | 3x |
"us", "al", "ak", "az", "ar", "ca", "co", "ct", "de", "dc", "fl", "ga", "hi", "id", "il", "in", "ia", "ks", "ky", |
38 | 3x |
"la", "me", "md", "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", "nm", "ny", "nc", "nd", "oh", "ok", |
39 | 3x |
"or", "pa", "ri", "sc", "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy", "pr" |
40 |
), |
|
41 | 3x |
fips = c( |
42 | 3x |
"us", 1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, |
43 | 3x |
33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 72 |
44 |
) |
|
45 |
) |
|
46 | 3x |
state <- gsub("^0|[^a-z0-9]", "", tolower(state[[1]])) |
47 | 3x |
fid <- which(us_fips$fips == state) |
48 | 2x |
if (!length(fid) && nchar(state) == 2) fid <- which(us_fips$post == state) |
49 | ! |
if (!length(fid) || is.na(fid)) cli_abort("failed to recognize {.arg state}") |
50 | 3x |
state <- us_fips$post[fid] |
51 | 3x |
level <- tolower(substring(level, 1, 1)) |
52 | 3x |
if (!level %in% c("h", "p", "b")) { |
53 | ! |
cli_abort( |
54 | ! |
'{.arg level} should be one of {.field "houshold"}, {.field "person"}, or {.field "both"}.' |
55 |
) |
|
56 |
} |
|
57 | 3x |
summary <- if (one_year) "1-Year" else "5-Year" |
58 | 3x |
urls <- paste0( |
59 | 3x |
"https://www2.census.gov/programs-surveys/acs/data/pums/", year, |
60 | 3x |
"/", summary, "/csv_", c("h", "p"), state, ".zip" |
61 |
) |
|
62 | 3x |
if (!is.null(dir)) { |
63 | 3x |
dir <- tdir <- paste0(normalizePath(dir, "/", FALSE), "/") |
64 | ! |
if (!dir.exists(dir)) dir.create(dir, FALSE, TRUE) |
65 |
} else { |
|
66 | ! |
tdir <- paste0(tempdir(), "/") |
67 |
} |
|
68 | 3x |
folder <- paste0(tdir, "pums_", state, "/", year, "/", summary, "/") |
69 | 3x |
dir.create(folder, FALSE, TRUE) |
70 | 3x |
res <- list(year = year, state = state, dictionary = NULL, household = NULL, person = NULL) |
71 | 3x |
dict_json <- paste0(folder, paste0( |
72 | 3x |
"PUMS_Data_Dictionary_", if (one_year) year else paste0(year - 4, "-", year), ".json" |
73 |
)) |
|
74 | 3x |
if (file.exists(dict_json)) { |
75 | 1x |
res$dictionary <- read_json(dict_json, simplifyVector = TRUE) |
76 |
} else { |
|
77 | 2x |
dict_file <- sub(".json", ".csv", dict_json, fixed = TRUE) |
78 | 2x |
if (!file.exists(dict_file)) { |
79 | 2x |
url <- paste0("https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/", basename(dict_file)) |
80 | 2x |
status <- curl_fetch_disk(url, dict_file) |
81 | ! |
if (status$status_code != 200) cli_abort("failed to retrieve data dictionary from {.url {url}}") |
82 |
} |
|
83 | 2x |
dict <- readLines(dict_file, warn = FALSE) |
84 | 2x |
res$dictionary <- lapply( |
85 | 2x |
split(dict, sub(",.*$", "", sub("^[^,]+,", "", dict, perl = TRUE), perl = TRUE)), |
86 | 2x |
function(e) { |
87 | 1042x |
h <- suppressWarnings(scan(text = e[[1]], what = "", sep = ",", quiet = TRUE)) |
88 | 1042x |
map <- as.data.frame(do.call(rbind, lapply( |
89 | 1042x |
e[-1], function(l) suppressWarnings(scan(text = l, what = "", sep = ",", quiet = TRUE))[5:7] |
90 |
))) |
|
91 | 1042x |
colnames(map) <- c("from", "to", "value") |
92 | 1042x |
list( |
93 | 1042x |
description = h[5], |
94 | 1042x |
type = if (h[3] == "C") "character" else "integer", |
95 | 1042x |
length = as.numeric(h[4]), |
96 | 1042x |
map = map |
97 |
) |
|
98 |
} |
|
99 |
) |
|
100 | 2x |
write_json(res$dictionary, dict_json, auto_unbox = TRUE) |
101 |
} |
|
102 | 3x |
files <- paste0(folder, "/", c("h", "p"), ".csv.xz") |
103 | 3x |
if (level != "b") { |
104 | ! |
lid <- (level == "p") + 1 |
105 | ! |
urls <- urls[lid] |
106 | ! |
files <- files[lid] |
107 |
} |
|
108 | 3x |
can_change <- missing(one_year) |
109 | 3x |
for (i in seq_along(urls)) { |
110 | 6x |
url <- urls[i] |
111 | 6x |
clevel <- if (grepl("csv_h", url, fixed = TRUE)) "household" else "person" |
112 | 6x |
if (!file.exists(files[i])) { |
113 | 4x |
file_zip <- paste0(folder, basename(url)) |
114 | 4x |
if (!file.exists(file_zip)) { |
115 | 4x |
if (verbose) cli_alert_info("retrieving {clevel} sample: {.url {url}}") |
116 | 4x |
status <- curl_fetch_disk(url, file_zip) |
117 | 4x |
if (status$status_code != 200 && can_change) { |
118 | ! |
new_summary <- if (one_year) "5-Year" else "1-Year" |
119 | ! |
urls <- sub(summary, new_summary, urls, fixed = TRUE) |
120 | ! |
files <- sub(summary, new_summary, files, fixed = TRUE) |
121 | ! |
summary <- new_summary |
122 | ! |
url <- urls[i] |
123 | ! |
status <- curl_fetch_disk(url, file_zip) |
124 |
} |
|
125 | 4x |
can_change <- FALSE |
126 | 4x |
if (status$status_code != 200) { |
127 | ! |
unlink(file_zip) |
128 | ! |
cli_abort(paste0( |
129 | ! |
"failed to download {.url {url}}: ", strsplit(rawToChar(status$headers), "[\r\n]")[[1]][1] |
130 |
)) |
|
131 |
} |
|
132 |
} |
|
133 | 4x |
rfile <- paste0(folder, grep(".csv", unzip(file_zip, list = TRUE)$Name, fixed = TRUE, value = TRUE)) |
134 | 4x |
unzip(file_zip, exdir = folder) |
135 | 4x |
cols <- strsplit(readLines(rfile, 1), ",")[[1]] |
136 | 4x |
types <- paste(vapply(cols, function(n) { |
137 | 1056x |
type <- res$dictionary[[n]]$type |
138 | 1056x |
if (is.null(type)) if (grepl("WGTP", n, fixed = TRUE)) "i" else "c" else substring(type, 1, 1) |
139 | 4x |
}, ""), collapse = "") |
140 | 4x |
res[[clevel]] <- vroom(rfile, col_types = types) |
141 | 4x |
vroom_write(res[[clevel]], files[i], ",") |
142 | 4x |
on.exit(unlink(rfile), add = TRUE) |
143 |
} else { |
|
144 | 2x |
if (verbose) cli_alert_info("loading {clevel} sample: {.field {basename(files[i])}}") |
145 | 2x |
cols <- strsplit(readLines(files[i], 1), ",")[[1]] |
146 | 2x |
types <- paste(vapply(cols, function(n) { |
147 | 528x |
type <- res$dictionary[[n]]$type |
148 | 528x |
if (is.null(type)) if (grepl("WGTP", n, fixed = TRUE)) "i" else "c" else substring(type, 1, 1) |
149 | 2x |
}, ""), collapse = "") |
150 | 2x |
res[[clevel]] <- vroom(files[i], col_types = types) |
151 |
} |
|
152 | 6x |
weight <- if (clevel == "person") "PWGTP" else "WGTP" |
153 | 6x |
if (calculate_error && paste0(weight, "1") %in% cols) { |
154 | 2x |
if (verbose) cli_alert_info("calculating {clevel} errors") |
155 | 2x |
vars <- cols[!grepl("^(?:RT|SERIAL|DIV|PUMA|REG|ST|ADJ)|WGTP", cols) & cols %in% names(res$dictionary)] |
156 | 2x |
names(vars) <- vars |
157 | 2x |
wv <- res[[clevel]][, weight, drop = TRUE] |
158 | 2x |
w <- as.matrix(res[[clevel]][, grep("WGTP\\d+", cols)]) |
159 | 2x |
res[[paste0(clevel, "_error")]] <- as.data.frame(do.call(cbind, lapply(vars, function(name) { |
160 | 350x |
v <- res[[clevel]][, name, drop = TRUE] |
161 | 350x |
map <- res$dictionary[[name]] |
162 | 350x |
if (map$type == "integer") { |
163 | 45x |
sqrt(.05 * rowSums((w - v * wv)^2)) |
164 |
} else { |
|
165 | 305x |
levels <- map$map$from |
166 | 305x |
names(levels) <- paste0(name, "_", levels) |
167 | 305x |
vapply(levels, function(l) { |
168 | 4787x |
sqrt(.05 * rowSums((w - (v == l) * wv)^2)) |
169 | 305x |
}, numeric(length(v))) |
170 |
} |
|
171 |
}))) |
|
172 |
} |
|
173 |
} |
|
174 | 3x |
if (crosswalk || !is.null(geoids)) { |
175 | 3x |
url <- if (as.numeric(year) > if (one_year) 2019 else 2024) { |
176 | 3x |
"https://www2.census.gov/geo/docs/maps-data/data/rel2020/2020_Census_Tract_to_2020_PUMA.txt" |
177 |
} else { |
|
178 | ! |
"https://www2.census.gov/geo/docs/maps-data/data/rel/2010_Census_Tract_to_2010_PUMA.txt" |
179 |
} |
|
180 | 3x |
crosswalk_file <- paste0(folder, basename(url)) |
181 | 3x |
if (!file.exists(crosswalk_file)) { |
182 | 2x |
if (verbose) cli_alert_info("retrieving crosswalk {.url {url}}") |
183 | 2x |
status <- curl_fetch_disk(url, crosswalk_file) |
184 | ! |
if (status$status_code != 200) cli_warn("failed to retrieve crosswalk from {.url {url}}") |
185 |
} |
|
186 | 3x |
if (file.exists(crosswalk_file)) { |
187 | 3x |
if (verbose) cli_alert_info("loading crosswalk {.field {basename(crosswalk_file)}}") |
188 | 3x |
res$crosswalk <- vroom(crosswalk_file, col_types = "cccc") |
189 | 3x |
if (!is.null(geoids)) { |
190 | 2x |
geoids <- substring(geoids, 1, 11) |
191 | 2x |
su <- do.call(paste0, res$crosswalk[, 1:2]) %in% geoids | do.call(paste0, res$crosswalk[, 1:3]) %in% geoids |
192 | 2x |
if (any(su)) { |
193 | 2x |
pids <- unique(res$crosswalk$PUMA5CE[su]) |
194 | 2x |
if (verbose) { |
195 | 2x |
cli_alert_info( |
196 | 2x |
"subsetting to {length(pids)} of {length(unique(res$crosswalk$PUMA5CE))} PUM areas" |
197 |
) |
|
198 |
} |
|
199 | 2x |
for (set in c("household", "person")) { |
200 | 4x |
su <- res[[set]]$PUMA %in% pids |
201 | 4x |
res[[set]] <- res[[set]][su, ] |
202 | 4x |
set_error <- paste0(set, "_error") |
203 | ! |
if (set_error %in% names(res)) res[[set_error]] <- res[[set_error]][su, ] |
204 |
} |
|
205 |
} else { |
|
206 | ! |
cli_warn("none of the specified {.arg geoids} were found in the crosswalk") |
207 |
} |
|
208 |
} |
|
209 |
} |
|
210 |
} |
|
211 | 3x |
res |
212 |
} |
1 |
#' Method: Proportional, resident-estimate-adjusted parcel unit counts |
|
2 |
#' |
|
3 |
#' A wrapper around the \code{\link{redistribute}} function, that |
|
4 |
#' takes PUMS data as additional input, and uses it to calculate an adjusted weight for |
|
5 |
#' redistribution to parcel data. |
|
6 |
#' |
|
7 |
#' It is assumed that initial weights are unit counts, and these are to be adjusted |
|
8 |
#' based on PUMS household data. |
|
9 |
#' |
|
10 |
#' @param source,target Source and target of \code{\link{redistribute}}. |
|
11 |
#' @param households PUMS household data. Can be a list with \code{household} and \code{person} |
|
12 |
#' entries, like that returned from \code{\link{download_census_pums}}. |
|
13 |
#' @param target_total Column name in \code{target} (e.g., parcel data; or equivalent vector) |
|
14 |
#' containing the unit count |
|
15 |
#' @param target_indicator Column name of a logical variable in \code{target} (or equivalent vector) |
|
16 |
#' to use to assign adjustment values, where \code{TRUE} indicates a single family home. |
|
17 |
#' If the values of the column are characters, anything not matching \code{"MULTI"} will be \code{TRUE}. |
|
18 |
#' @param households_indicator Same as \code{target_indicator}, but in \code{households}, or |
|
19 |
#' a vector (or column name) of BLD codes, where either \code{"02"} or \code{"03"} would be \code{TRUE}. |
|
20 |
#' @param households_size A vector of household sizes, or a column in \code{households} containing |
|
21 |
#' such values. If this is not specified or found, \code{person} will be used to calculate |
|
22 |
#' household size, based on \code{households_id} and \code{person_household_id}. |
|
23 |
#' @param households_id A vector of household IDs aligning with \code{households}, |
|
24 |
#' or a column name in \code{households} containing such IDs. |
|
25 |
#' Only used to calculate \code{households_size} if necessary. |
|
26 |
#' @param person PUMS person data. Only used to calculate \code{households_size} if necessary. |
|
27 |
#' @param person_household_id A vector of household IDs aligning with \code{person}, |
|
28 |
#' or a column name in \code{person} containing such IDs. |
|
29 |
#' Only used to calculate \code{households_size} if necessary. |
|
30 |
#' @param ... Additional arguments to pass to \code{\link{redistribute}}. You will likely need |
|
31 |
#' to specify a \code{map}, and potentially \code{source_id} and/or \code{target_id}. |
|
32 |
#' @examples |
|
33 |
#' \dontrun{ |
|
34 |
#' if (require("tidycensus")) { |
|
35 |
#' # download source, target, and household data |
|
36 |
#' tracts <- tidycensus::get_acs( |
|
37 |
#' year = 2021, |
|
38 |
#' state = "51", |
|
39 |
#' county = "013", |
|
40 |
#' geography = "tract", |
|
41 |
#' output = "wide", |
|
42 |
#' variables = c(total = "B01001_001"), |
|
43 |
#' geometry = TRUE |
|
44 |
#' ) |
|
45 |
#' parcels <- sf::st_read(paste0( |
|
46 |
#' "https://arlgis.arlingtonva.us/arcgis/rest/", |
|
47 |
#' "services/Open_Data/od_MHUD_Polygons/", |
|
48 |
#' "FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=json" |
|
49 |
#' )) |
|
50 |
#' pums <- download_census_pums(tempdir(), "51", geoids = tracts$GEOID) |
|
51 |
#' |
|
52 |
#' # calculate map between source and target |
|
53 |
#' map_tr_to_parcel <- redistribute( |
|
54 |
#' tracts, parcels, |
|
55 |
#' target_id = "OBJECTID", return_map = TRUE |
|
56 |
#' ) |
|
57 |
#' |
|
58 |
#' # redistribute tract-level summaries to parcels, |
|
59 |
#' # using a resident-estimate-adjusted weight |
|
60 |
#' parcels_filled <- redistribute_parcel_pums_adj( |
|
61 |
#' tracts[, -2], parcels, pums, |
|
62 |
#' map = map_tr_to_parcel, target_id = "OBJECTID" |
|
63 |
#' ) |
|
64 |
#' |
|
65 |
#' # this can also be calculated with the underlying function |
|
66 |
#' |
|
67 |
#' ## calculate resident estimates |
|
68 |
#' household_size <- tapply( |
|
69 |
#' table(pums$person$SERIALNO)[pums$household$SERIALNO], |
|
70 |
#' pums$household$BLD %in% c("02", "03"), |
|
71 |
#' mean, |
|
72 |
#' na.rm = TRUE |
|
73 |
#' ) |
|
74 |
#' residents <- parcels$Total_Units * household_size[ |
|
75 |
#' as.character(parcels$Unit_Type != "MULTI") |
|
76 |
#' ] |
|
77 |
#' |
|
78 |
#' ## supply as weights |
|
79 |
#' parcels_filled_manual <- redistribute( |
|
80 |
#' tracts[, -2], parcels, map_tr_to_parcel, |
|
81 |
#' target_id = "OBJECTID", weight = residents |
|
82 |
#' ) |
|
83 |
#' |
|
84 |
#' # either way, you could now use the result to make estimates |
|
85 |
#' # at higher-level geographies by redistributing from the |
|
86 |
#' # parcel-level estimates to the new target |
|
87 |
#' } |
|
88 |
#' } |
|
89 |
#' @returns Result of \code{\link{redistribute}}. These are assumed to be parcel-level |
|
90 |
#' estimates, which could then be aggregated to make higher-level estimates. |
|
91 |
#' @export |
|
92 | ||
93 |
redistribute_parcel_pums_adj <- function(
    source, target, households, target_total = "Total_Units", target_indicator = "Unit_Type",
    households_size = "size", households_indicator = "BLD", households_id = "SERIALNO",
    person = NULL, person_household_id = "SERIALNO", ...) {
  # Accept a list like that returned from download_census_pums, with `household`
  # and (optionally) `person` entries.
  if (is.null(dim(households)) && !is.null(households$household)) {
    if (is.null(person)) person <- households$person
    households <- households$household
  }
  hn <- nrow(households)
  hcols <- colnames(households)
  tn <- nrow(target)
  tcols <- colnames(target)

  # resolve main vectors: each of target_total / target_indicator /
  # households_indicator may be a column name or an aligned vector
  if (is.character(target_total) && length(target_total) == 1) {
    if (!target_total %in% tcols) cli_abort("{.arg target_total} ({target_total}) is not in {.arg target}")
    target_total <- target[[target_total]]
  } else if (length(target_total) != tn) cli_abort("{.arg target_total} is not the same length as {.code nrow(target)}")
  if (is.character(target_indicator) && length(target_indicator) == 1) {
    if (!target_indicator %in% tcols) cli_abort("{.arg target_indicator} ({target_indicator}) is not in {.arg target}")
    target_indicator <- target[[target_indicator]]
  } else if (length(target_indicator) != tn) cli_abort("{.arg target_indicator} is not the same length as {.code nrow(target)}")
  if (is.character(households_indicator) && length(households_indicator) == 1) {
    if (!households_indicator %in% hcols) {
      cli_abort("{.arg households_indicator} ({households_indicator}) is not in {.arg households}")
    }
    households_indicator <- households[[households_indicator]]
  } else if (length(households_indicator) != hn) {
    cli_abort("{.arg households_indicator} is not the same length as {.code nrow(households)}")
  }

  # coerce indicators to logical (TRUE = single-family home): numeric 1 is TRUE;
  # character values not starting with "multi" (case-insensitive) are TRUE
  if (!is.logical(target_indicator)) {
    target_indicator <- if (is.numeric(target_indicator)) {
      target_indicator == 1
    } else {
      !grepl("^multi", target_indicator, TRUE)
    }
  }
  if (!is.logical(households_indicator)) {
    households_indicator <- if (is.numeric(households_indicator)) {
      households_indicator == 1
    } else {
      # BLD codes: "02" (single-family detached) / "03" (single-family attached)
      # NOTE(review): requires both codes to be present to treat the vector as BLD
      # codes, though the docs say "either" -- confirm intended
      if (all(c("02", "03") %in% households_indicator)) {
        grepl("^0[23]$", households_indicator)
      } else {
        !grepl("^multi", households_indicator, TRUE)
      }
    }
  }

  # resolve household size: a column name, a directly-supplied vector, or
  # (failing those) person counts tabulated by household ID
  size <- NULL
  if (length(households_size) != 1 && length(households_size) != hn) {
    cli_abort("{.arg households_size} is not the same length as {.code nrow(households)}")
  }
  if (length(households_id) != 1 && length(households_id) != hn) {
    cli_abort("{.arg households_id} is not the same length as {.code nrow(households)}")
  }
  if (is.character(households_size) && length(households_size) == 1) {
    if (households_size %in% hcols) {
      size <- households[[households_size]]
    } else {
      if (length(households_id) == 1 && households_id %in% hcols) households_id <- households[[households_id]]
      if (!is.null(person) && length(person_household_id) == 1 && person_household_id %in% colnames(person)) {
        person_household_id <- person[[person_household_id]]
      } else if (length(person_household_id) == 1) {
        cli_abort("{.arg person_household_id} ({person_household_id}) is not in {.arg person}")
      }
      if (!any(households_id %in% person_household_id)) {
        cli_abort("no {.arg households_id}s are present in {.arg person_household_id}")
      }
      # household size = number of person rows sharing the household's ID
      size <- table(person_household_id)[households_id]
    }
  } else if (length(households_size) == hn) {
    # households_size was supplied directly as a vector of sizes (previously this
    # documented input was ignored, and resolution always failed)
    size <- households_size
  }
  if (is.null(size)) cli_abort("failed to resolve {.arg households_size}")

  # resident estimate per parcel: unit count times the mean household size of the
  # matching (single- vs multi-family) household group
  residents <- target_total * tapply(
    size, households_indicator, mean,
    na.rm = TRUE
  )[as.character(target_indicator)]
  redistribute(source, target, weight = residents, ...)
}
1 |
// [[Rcpp::depends(RcppParallel)]] |
|
2 |
#include <Rcpp.h> |
|
3 |
#include <RcppParallel.h> |
|
4 |
using namespace std; |
|
5 |
using namespace Rcpp; |
|
6 |
using namespace RcppParallel; |
|
7 | ||
8 | 11980x |
const int select(const vector<int> &local, const NumericVector &base) { |
9 | 11980x |
const int n = local.size(); |
10 | 11980x |
int sel = 0; |
11 | 11980x |
double prob, sel_prob = 0; |
12 | 83808x |
for (int i = n; i--;) { |
13 | 71828x |
prob = R::rbinom(n, local[i] ? local[i] / n * base[i] : base[i]); |
14 | 71828x |
if (prob > sel_prob) { |
15 | 7851x |
sel = i; |
16 | 7851x |
sel_prob = prob; |
17 |
} |
|
18 |
} |
|
19 | 11980x |
return sel; |
20 |
} |
|
21 | ||
22 |
// Parallel worker that fills the flat `individuals` matrix (column-major, N rows x
// 7 columns: household, person, neighbors, age, sex, race, income) one household
// at a time. For each household it finds up to n_neighbors similar households via
// the sparse `space` similarity matrix, summarizes any already-generated neighbor
// members, then generates each member's demographics conditioned on those
// summaries and on household head values.
// NOTE(review): calls R's RNG (R::rnorm, R::rbeta, R::rbinom, R::runif) and
// checkUserInterrupt from worker threads, which the R API does not guarantee to be
// thread-safe under RcppParallel -- confirm intended.
struct Generate : public Worker {
  RVector<double> individuals;
  // per-household [start, end) row ranges into `individuals`
  const vector<int> start, end;
  const IntegerVector region, head_income, size, renting, space_i, space_p;
  // space_*: dgCMatrix slots (column pointers p, row indices i, values x) of
  // region-by-region similarities; race_rates: base rate per race group
  const NumericVector space_x, race_rates;
  const int N, n_neighbors, n_races = race_rates.length(), nh = region.length();
  Generate(
    NumericVector &individuals, const vector<int> &start, const vector<int> &end,
    const IntegerVector &region, const IntegerVector &head_income, const IntegerVector &size,
    const IntegerVector &renting, const S4 &space, const NumericVector &race_rates,
    const int &N, const double &n_neighbors
  ):
    individuals(individuals), start(start), end(end), region(region),
    head_income(head_income), size(size), renting(renting), space_i(space.slot("i")),
    space_p(space.slot("p")), space_x(space.slot("x")), race_rates(race_rates), N(N),
    n_neighbors(n_neighbors)
  {}
  // Process households with indices in [s, e).
  void operator()(size_t s, size_t e){
    int ck = 1e3;
    // neighbors holds summary stats (0: mean age, 1: count of sex==1, 2: modal
    // race, 3: mean income); race_table tabulates neighbor races
    // NOTE(review): neighbors is sized n_neighbors but indexed up to [3] -- this
    // reads out of bounds if n_neighbors < 4; confirm callers guarantee >= 4
    vector<int> neighbors(n_neighbors), race_table(n_races);
    for (; s < e; s++) {
      const int ci = s, r = region[s] - 1, inc = head_income[s], n = size[s],
        ro = renting[s], ie = end[s];
      // nhi/nhn: bounds of region r's column in the sparse similarity matrix
      int i = 0, nn = 0, nni, nhi = space_p[r], nhn = space_p[r + 1], h1, h2;
      double ns = 0;

      // summarize neighbors
      vector<int> neighbor_index;
      vector<double> neighbor_sims;
      // find the n_neighbors most similar other households: first fill the list,
      // then replace any entry with lower (jittered) similarity
      for (; i < nh; i++) {
        if (i != ci) {
          const int nr = region[i] - 1;
          ns = 0;
          // look up similarity between region r and candidate's region nr
          for (nni = nhi; nni < nhn; nni++) {
            if (space_i[nni] == nr) {
              ns = space_x[nni];
              break;
            }
          }
          if (ns) {
            if (nn < n_neighbors) {
              neighbor_index.push_back(i);
              neighbor_sims.push_back(ns);
              nn++;
            } else {
              // jitter to break ties, then displace the first weaker entry
              ns += R::rnorm(0, 1e-5);
              for (nni = nn; nni--;) {
                if (ns > neighbor_sims[nni]) {
                  neighbor_index[nni] = i;
                  neighbor_sims[nni] = ns;
                  break;
                }
              }
            }
          }
        }
      }
      if (nn) {
        int ni = n_neighbors;
        for (; ni--;) neighbors[ni] = 0;
        for (ni = n_races; ni--;) race_table[ni] = 0;
        // see if neighbors have been filled (household column nonzero), and
        // summarize over their first (up to 2) household members; nn is reset to
        // count summarized members
        for (ni = nn, nn = 0; ni--;) {
          nni = neighbor_index[ni], nhi = start[nni], nhn = min(nhi + 2, end[nni]);
          if (individuals[nhi]) {
            for (; nhi < nhn; nhi++) {
              neighbors[0] += individuals[nhi + 3 * N]; // sum age
              neighbors[1] += individuals[nhi + 4 * N]; // count of sex == 1
              race_table[individuals[nhi + 5 * N]]++;
              neighbors[3] += individuals[nhi + 6 * N]; // sum income
              nn++;
            }
          }
        }
        if (nn) {
          // finalize neighbor summaries: means for age/income, mode for race
          ni = n_races, i = 0;
          for (int m = -1; ni--;) {
            if (race_table[ni] > m) {
              i = ni;
              m = race_table[ni];
            }
          }
          neighbors[0] /= nn;
          neighbors[2] = i;
          neighbors[3] /= nn;
        }
      }

      // fill out household members; h1/h2 are the head's and partner's rows
      int hage = 0;
      for (i = start[s], h1 = i, h2 = i + 1; i < ie; i++) {
        // set household and individual ids (1-based)
        const int row = i;
        individuals[row] = s + 1;
        individuals[row + N] = row + 1;

        // set number of summarized neighbor members (col 3)
        individuals[row + 2 * N] = nn;

        // select age (col 4): head from a Beta draw (shifted toward the neighbor
        // mean when available); partner near the head's age; others below it
        if (row == h1) {
          hage = floor(R::rbeta(1, ro ? 2 : 1.5) * 80 + 18);
          if (nn) {
            hage = max(18, (int)ceil((hage + R::rnorm(neighbors[0], 5)) / 2));
          }
          individuals[row + 3 * N] = hage;
        } else if (row == h2) {
          individuals[row + 3 * N] = max(18, min((int)floor(R::rnorm(hage, hage > 40 ? 15 : 5)), 90));
        } else {
          individuals[row + 3 * N] = ceil(R::runif(0, 1) * (hage - 15));
        }

        // select sex (col 5): partner matches the head with 10% probability,
        // otherwise opposite; everyone else is a fair draw
        if (row == h2) {
          individuals[row + 4 * N] = R::rbinom(1, .1) ? individuals[row - 1 + 4 * N] : (int)!individuals[row - 1 + 4 * N];
        } else {
          individuals[row + 4 * N] = R::rbinom(1, .5);
        }

        // select race (col 6): head sampled from neighbor-adjusted base rates;
        // partner copies the head with 70% probability; others copy a parent
        if (row == h1) {
          individuals[row + 5 * N] = select(race_table, race_rates);
        } else if (row == h2) {
          individuals[row + 5 * N] = R::rbinom(1, .7) ? individuals[row - 1 + 5 * N] : select(race_table, race_rates);
        } else {
          individuals[row + 5 * N] = individuals[row - (1 + R::rbinom(1, .5)) + 5 * N];
        }

        // select income (col 7): head gets the pre-generated household income;
        // partner earns only if the head is below the neighbor mean (or 4e4
        // fallback) or the household has exactly 2 members; others get 0
        if (row == h1) {
          individuals[row + 6 * N] = inc;
        } else if (row == h2) {
          individuals[row + 6 * N] = inc < (nn ? neighbors[3] : 4e4) || n == 2 ? max(0, (int)floor(R::rnorm(inc, 2e4))) : 0;
        }
      }
      // periodically allow the user to interrupt long runs
      if (!--ck) {
        checkUserInterrupt();
        ck = 1e3;
      }
    }
  }
};
|
166 | ||
167 |
// [[Rcpp::export]] |
|
168 | 2x |
// Generate individuals for a set of households (see Generate for the per-member
// logic). Returns an n-individuals x 7 matrix with columns household, person,
// neighbors, age, sex, race, income.
NumericVector generate_individuals(
    const IntegerVector &region, const IntegerVector &head_income, const IntegerVector &size,
    const IntegerVector &renting, const S4 &space, const int &n_neighbors,
    const NumericVector &race_rates
) {
  const int n_households = size.length(), nvars = 7;

  // cumulative household sizes give each household its [start, end) row range
  vector<int> start(n_households), end(n_households);
  int total = 0;
  for (int h = 0; h < n_households; h++) {
    start[h] = total;
    total += size[h];
    end[h] = total;
  }

  // flat column-major storage, reshaped into a matrix below
  NumericVector individuals(total * nvars);
  Generate worker(
    individuals, start, end, region, head_income, size, renting, space, race_rates,
    total, n_neighbors
  );
  parallelFor(0, n_households, worker);

  individuals.attr("dim") = Dimension(total, nvars);
  individuals.attr("dimnames") = List::create(R_NilValue, CharacterVector::create(
    "household", "person", "neighbors", "age", "sex", "race", "income"
  ));
  return individuals;
}
1 |
// [[Rcpp::depends(RcppParallel)]] |
|
2 |
#include <Rcpp.h> |
|
3 |
#include <RcppParallel.h> |
|
4 |
#include <unordered_map> |
|
5 |
using namespace std; |
|
6 |
using namespace Rcpp; |
|
7 |
using namespace RcppParallel; |
|
8 | ||
9 | 2x |
const int aggregate_mode( |
10 |
const int &os, const RMatrix<double> &source, const int &tstart, const int &tend, |
|
11 |
const vector<int> &source_rows, const vector<double> &iw |
|
12 |
) { |
|
13 | 2x |
bool any = false; |
14 | 2x |
double max = 0; |
15 | 2x |
int max_cat = 0; |
16 | 2x |
unordered_map<int, double> acc; |
17 | 12x |
for (int i = tstart, r; i < tend; i++) { |
18 | 10x |
r = source_rows[i]; |
19 | 10x |
const int cat = source[r + os]; |
20 | 10x |
if (isfinite(cat)) { |
21 | 10x |
any = true; |
22 | 10x |
if (acc.find(cat) == acc.end()) { |
23 | 2x |
acc.insert({cat, iw[i]}); |
24 | 10x |
} else acc.at(cat) += iw[i]; |
25 | 10x |
const double count = acc.at(cat); |
26 | 10x |
if (count > max) { |
27 | 10x |
max = count; |
28 | 10x |
max_cat = cat; |
29 |
} |
|
30 |
} |
|
31 |
} |
|
32 | 2x |
return any ? max_cat : NA_INTEGER; |
33 |
} |
|
34 | ||
35 | ! |
const double aggregate_sum( |
36 |
const int &os, const RMatrix<double> &source, const int &tstart, const int &tend, |
|
37 |
const vector<int> &source_rows, const RVector<double> &w, const vector<double> &iw |
|
38 |
) { |
|
39 | ! |
bool any = false; |
40 | ! |
double sum = 0; |
41 | ! |
for (int i = tstart, r; i < tend; i++) { |
42 | ! |
r = source_rows[i]; |
43 | ! |
const double v = source[r + os]; |
44 | ! |
if (isfinite(v)) { |
45 | ! |
any = true; |
46 | ! |
sum += v * iw[i] * w[r]; |
47 |
} |
|
48 |
} |
|
49 | ! |
return any ? sum : NA_REAL; |
50 |
} |
|
51 | ||
52 | 49114x |
// Proportional aggregation: sum the target's mapped source values, each scaled by
// the intersection weight, the target's weight (w[s], indexed by target id s), and
// the reciprocal of the source row's total weight (totals[r]); NA_REAL when no
// mapped value is finite. `os` is the column offset into the flat source matrix.
// NOTE(review): rows whose totals[r] is 0 still set `any` but contribute nothing,
// yielding 0 rather than NA -- confirm intended.
const double proportional_aggregate(
    const int &os, const RMatrix<double> &source, const int &tstart, const int &tend,
    const vector<int> &source_rows, const RVector<double> &w, const vector<double> &iw,
    const vector<double> &totals, const size_t &s
) {
  bool any = false;
  double sum = 0;
  for (int i = tstart, r; i < tend; i++) {
    r = source_rows[i];
    const double v = source[r + os];
    if (isfinite(v)) {
      any = true;
      // guard against division by a zero source-weight total
      if (totals[r]) sum += v * iw[i] * w[s] / totals[r];
    }
  }
  return any ? sum : NA_REAL;
}
|
69 | ||
70 | 6x |
void proportional_categoric( |
71 |
const int &os, const int &s, const int &tstart, const int &tend, const vector<int> &target_rows, |
|
72 |
RVector<double> &v, const vector<double> &iw |
|
73 |
) { |
|
74 | 6x |
const int val = isfinite(s) ? s : NA_REAL; |
75 | 36x |
for (int i = tstart, r; i < tend; i++) { |
76 | 30x |
r = target_rows[i]; |
77 | 30x |
v[r + os] = val; |
78 |
} |
|
79 |
} |
|
80 | ||
81 | 54x |
void proportional_numeric( |
82 |
const int &os, const double &s, const int &tstart, const int &tend, const vector<int> &target_rows, |
|
83 |
const RVector<double> &w, const double &total, RVector<double> &v, const vector<double> &iw |
|
84 |
) { |
|
85 | 686x |
for (int i = tstart, r; i < tend; i++) { |
86 | 632x |
r = target_rows[i]; |
87 | 632x |
v[r + os] = isfinite(s) ? s * iw[i] * w[r] / total : NA_REAL; |
88 |
} |
|
89 |
} |
|
90 | ||
91 |
// Parallel worker that redistributes each variable (column) of `source` into `res`,
// either aggregating many source rows into each target (agg) or disaggregating
// each source row across its targets (!agg). Dispatch per column is driven by
// `method`: 10 = mode, 11 = weighted sum, 12 = proportional aggregate;
// 0 = categorical copy, 1 = proportional numeric spread.
struct Distribute : public Worker {
  RVector<double> res;
  const RMatrix<double> source;
  // start/end: per-unit [start, end] spans into `index` (-1 = no mapping);
  // index: mapped counterpart rows; index_weight: intersection weights
  const vector<int> start, end, index;
  const RVector<double> weight;
  const vector<double> index_weight, weight_totals;
  const RVector<int> method;
  const bool agg;
  // n: row count of the side being read; on: number of units iterated over
  const int ncol = method.length(), n = agg ? source.nrow() : weight.length(), on = start.size();
  Distribute(
    NumericVector &res, const NumericMatrix &source,
    const vector<int> &start, const vector<int> &end, const vector<int> &index,
    const NumericVector &weight, const vector<double> &index_weight, const vector<double> &weight_totals,
    const IntegerVector &method, const bool &agg
  ):
    res(res), source(source), start(start), end(end), index(index), weight(weight),
    index_weight(index_weight), weight_totals(weight_totals), method(method), agg(agg)
  {}
  // Process units (targets when agg, sources otherwise) with indices in [s, e).
  void operator()(size_t s, size_t e){
    int c = 0;
    for (; s < e; s++) {
      // se is exclusive (stored end is inclusive); ss == -1 marks "no mapping"
      const int ss = start[s], se = end[s] + 1;
      if (ss > -1) {
        if (agg) {
          for (c = 0; c < ncol; c++) {
            // os: flat column offset into source; s + c * on indexes res
            const int os = c * n;
            switch(method[c]) {
            case 10:
              res[s + c * on] = aggregate_mode(os, source, ss, se, index, index_weight);
              break;
            case 11:
              res[s + c * on] = aggregate_sum(os, source, ss, se, index, weight, index_weight);
              break;
            case 12:
              res[s + c * on] = proportional_aggregate(
                os, source, ss, se, index, weight, index_weight, weight_totals, s
              );
              break;
            }
          }
        } else {
          // disaggregation: spread source row s across its mapped target rows
          const double total = weight_totals[s];
          const RMatrix<double>::Row source_row = source.row(s);
          for (c = 0; c < ncol; c++) {
            const int os = c * n;
            switch(method[c]) {
            case 0:
              proportional_categoric(os, source_row[c], ss, se, index, res, index_weight);
              break;
            case 1:
              // skip rows with zero total weight to avoid division by zero
              if (total) {
                proportional_numeric(os, source_row[c], ss, se, index, weight, total, res, index_weight);
              }
              break;
            }
          }
        }
      }
    }
  }
};
|
152 | ||
153 |
// [[Rcpp::export]] |
|
154 | 22x |
// Redistribute source values to targets. `map` is a list (one entry per source
// row) of named numeric vectors: names are target IDs, values are intersection
// weights. Translates the map into flat index arrays, then runs the Distribute
// worker in parallel. When `agg`, many sources aggregate into each target; when
// `balance`, intersection weights are normalized to sum to 1 within each target.
// Returns a target_n x nvars matrix.
NumericVector process_distribute(
    const NumericMatrix &s, const IntegerVector &method, const CharacterVector &tid,
    const NumericVector &weight, const List &map, const bool &agg, const bool &balance
) {
  const int source_n = s.nrow(), nvars = s.ncol(), target_n = tid.length();
  NumericVector res(nvars * target_n);

  // translate map into per-unit spans: start/end index into `index` (mapped
  // counterpart rows) and `index_weight`; -1 sentinels mark unmapped units
  const int iter_max = agg ? target_n : source_n;
  vector<int> start(iter_max), end(iter_max), index;
  vector<double> weight_totals(source_n), index_weight;
  if (agg) {
    // invert the source -> target map to target -> sources
    unordered_map<String, IntegerVector> imap;
    unordered_map<String, NumericVector> wmap;
    unordered_map<String, double> tmap;
    int i, t;
    for (i = 0; i < target_n; i++) {
      IntegerVector indices;
      NumericVector indices_weight;
      const String id = tid[i];
      imap.insert({id, indices});
      wmap.insert({id, indices_weight});
      // tmap accumulates per-target weight totals when balancing; a fixed 1
      // (no-op divisor) otherwise
      tmap.insert({id, balance ? 0 : 1});
    }
    for (i = 0; i < source_n; i++) {
      weight_totals[i] = 0;
      const NumericVector tidws = map[i];
      const int n = tidws.length();
      if (n) {
        const CharacterVector tids = tidws.names();
        for (t = 0; t < n; t++) {
          const String id = tids[t];
          // ignore map entries whose target ID is not in tid
          if (imap.find(id) != imap.end()) {
            const double itw = tidws[t];
            imap.at(id).push_back(i);
            wmap.at(id).push_back(itw);
            if (balance) tmap.at(id) += itw;
          }
        }
      }
    }
    // flatten the inverted map into index/index_weight spans per target, and
    // accumulate each source row's total outgoing weight
    for (i = 0; i < target_n; i++) {
      const String id = tid[i];
      const IntegerVector indices = imap.at(id);
      const NumericVector indices_weight = wmap.at(id);
      const double weight_total = tmap.at(id);
      const int n = indices.length();
      bool any = false;
      start[i] = index.size();
      for (t = 0; t < n; t++) {
        const int si = indices[t];
        double wi = indices_weight[t];
        // normalize intersection weights within the target when balancing
        if (balance && weight_total != 1) wi /= weight_total;
        any = true;
        index.push_back(si);
        index_weight.push_back(wi);
        weight_totals[si] += wi * weight[i];
      }
      if (any) {
        end[i] = index.size() - 1; // inclusive end
      } else {
        start[i] = end[i] = -1;
      }
    }
  } else {
    // disaggregation: keep the map's source -> targets direction, resolving
    // target IDs to row positions
    for (int i = 0, t; i < source_n; i++) {
      weight_totals[i] = 0;
      const NumericVector tidws = map[i];
      if (tidws.length()) {
        const CharacterVector tids = tidws.names();
        const IntegerVector indices = match(tids, tid) - 1; // 0-based; -1 = no match
        const int n = indices.length();
        bool any = false;
        start[i] = index.size();
        for (t = 0; t < n; t++) {
          const int si = indices[t];
          if (si > -1 && si < target_n) {
            any = true;
            index.push_back(si);
            index_weight.push_back(tidws[t]);
            // total target weight mapped from this source row (disaggregation
            // divisor); NOTE(review): ignores intersection weights -- confirm
            weight_totals[i] += weight[si];
          }
        }
        if (any) {
          end[i] = index.size() - 1; // inclusive end
        } else {
          start[i] = end[i] = -1;
        }
      } else {
        start[i] = end[i] = -1;
      }
    }
  }

  // process in parallel over targets (agg) or sources (!agg)
  Distribute distributed(res, s, start, end, index, weight, index_weight, weight_totals, method, agg);
  checkUserInterrupt();
  parallelFor(0, iter_max, distributed);

  res.attr("dim") = Dimension(target_n, nvars);
  return res;
}