1 |
#' Generate a Population |
|
2 |
#' |
|
3 |
#' Simulate a population of individuals within households, with complex relationships between |
|
4 |
#' demographic and location features. |
|
5 |
#' |
|
6 |
#' The population is generated in two steps: |
|
7 |
#' |
|
8 |
#' \strong{First}, households are generated and placed within \code{regions}. Placement within regions |
|
9 |
#' is determined by total distance from one or more regions selected as attraction loci. If |
|
10 |
#' coordinates are not provided, these are first randomly generated. |
|
11 |
#' |
|
12 |
#' After households are placed, household income (of the first member) is generated |
|
13 |
#' based on cost loci, which are then used to generate building types (where types are increasingly |
|
14 |
#' associated with income) and then household size (based on size loci, income, and building type). |
|
15 |
#' Renting status is then generated based on income and building type: |
|
16 |
#' 60\% chance if income is under the mean income, and 40\% otherwise, multiplied by .8 |
|
17 |
#' if the building type is of a selected renting type, or .5 otherwise. |
|
18 |
#' |
|
19 |
#' \strong{Second}, individuals are generated for each household. To generate an individual, |
|
20 |
#' first, neighbors are searched for, based on \code{n_neighbors} and \code{neighbor_range}. |
|
21 |
#' Any neighbors are summarized: average age and income, and tabulated race. |
|
22 |
#' |
|
23 |
#' These then affect the first member of the household: age is first drawn from a Beta |
|
24 |
#' distribution (with shapes of \code{1} and \code{2} if renting or \code{1.5} otherwise, |
|
25 |
#' multiplied by \code{80}) and added to \code{18}, then adjusted toward a random value centered |
|
26 |
#' on the average neighbor age (floored Gaussian with a standard deviation of \code{1}), and race |
|
27 |
#' is sampled (that with the highest result of a Binomial draw with \code{n_races} trials |
|
28 |
#' proportion of \code{neighbors * base rate} chance of success for each race group). |
|
29 |
#' |
|
30 |
#' Neighbors also affect the income of the second member of the household if the first |
|
31 |
#' member's income is under the neighbor mean income (or under \code{40,000} given no neighbors); |
|
32 |
#' in this case, the second member's income is drawn from a Gaussian distribution centered on |
|
33 |
#' the first member's income, with a standard deviation of \code{10,000}. |
|
34 |
#' |
|
35 |
#' The second member's age is based on that of the first member; a floored Gaussian centered on |
|
36 |
#' the first member's age, with a standard deviation of \code{15} if the first member's age |
|
37 |
#' is over \code{40}, or \code{5} otherwise, trimmed to be between \code{18} and \code{90}. |
|
38 |
#' |
|
39 |
#' The second member's race has a 70\% chance to be that of the first member, |
|
40 |
#' and a 30\% chance to be selected like the first member's. |
|
41 |
#' |
|
42 |
#' Members after the second have no income, have age randomly selected from a |
|
43 |
#' uniform distribution between \code{0} and the first member's age minus \code{15} |
|
44 |
#' (which is then rounded up), and have race determined by either the first or second |
|
45 |
#' member (50\% chance). |
|
46 |
#' |
|
47 |
#' Sex has a 50\% chance to be \code{0} or \code{1} for all but the second member; their sex |
|
48 |
#' has a 10\% chance to be the same as the first member's, and 90\% chance to be opposite. |
|
49 |
#' |
|
50 |
#' @param N Number of initial individuals to generate. Final number of individuals will be larger. |
|
51 |
#' @param regions A vector of region IDs, a matrix of coordinates, or an \code{sf} object with |
|
52 |
#' geometries from which coordinates can be derived. If not specified |
|
53 |
#' (and \code{capacities} is not specified), regions similar to housing units (with a mix of |
|
54 |
#' single and multi-family locations) will be generated. |
|
55 |
#' @param capacities A vector with the maximum number of households for each entry in \code{regions}. |
|
56 |
#' @param region_ids A vector of unique IDs for each \code{regions}, or a column name in |
|
57 |
#' \code{regions} containing IDs. |
|
58 |
#' @param attraction_loci Number of locations selected to be centers of attractiveness, |
|
59 |
#' which influence where households are located. |
|
60 |
#' @param random_regions A number between \code{0} and \code{1}, which determines the proportion of |
|
61 |
#' people who are randomly relocated, in the case that there is more capacity than households. |
|
62 |
#' @param cost_loci Number of locations selected to be centers of cost, which influences |
|
63 |
#' the initial income associated with households. |
|
64 |
#' @param size_loci Number of locations selected to be centers of size, which influence |
|
65 |
#' household sizes. |
|
66 |
#' @param similarity_metric Name of a metric to use to calculate nearness between |
|
67 |
#' neighbors; see \code{\link[lingmatch]{lma_simets}}. |
|
68 |
#' @param n_neighbors Number of neighbors used to influence each new household's |
|
69 |
#' initial age and race. |
|
70 |
#' @param neighbor_range Minimum similarity between people to be considered |
|
71 |
#' neighbors, between \code{0} and \code{1} (where \code{0} means unrestricted, |
|
72 |
#' and \code{1} means same region only). |
|
73 |
#' @param n_races Number of different race groups to sample from. |
|
74 |
#' @param n_building_types Number of different building types to sample from. |
|
75 |
#' @param verbose Logical; if \code{TRUE}, will show status messages. |
|
76 |
#' @examples |
|
77 |
#' generate_population(2) |
|
78 |
#' @returns A list with entries for \code{params} (with initial settings), and three |

79 |
#' \code{data.frames} (\code{regions}, \code{households}, and \code{individuals}): |
|
80 |
#' \describe{ |
|
81 |
#' \item{\code{households}}{ |
|
82 |
#' \tabular{ll}{ |
|
83 |
#' \code{household} \tab Household ID. \cr |
|
84 |
#' \code{region} \tab Region ID. \cr |
|
85 |
#' \code{head_income} \tab Income of the first household member. \cr |
|
86 |
#' \code{size} \tab Number of individuals in the household. \cr |
|
87 |
#' \code{building_type} \tab Categorical indicator of building type. \cr |
|
88 |
#' \code{renting} \tab Binary indicator of renting status. \cr |
|
89 |
#' } |
|
90 |
#' } |
|
91 |
#' \item{\code{individuals}}{ |
|
92 |
#' \tabular{ll}{ |
|
93 |
#' \code{household} \tab Household ID. \cr |
|
94 |
#' \code{person} \tab Person ID. \cr |
|
95 |
#' \code{neighbors} \tab Number of neighbors who bore on variables. \cr |
|
96 |
#' \code{age} \tab Age in years. \cr |
|
97 |
#' \code{sex} \tab Categorical indicator of sex. \cr |
|
98 |
#' \code{race} \tab Categorical indicator of race. \cr |
|
99 |
#' \code{income} \tab Income of individual. \cr |
|
100 |
#' } |
|
101 |
#' } |
|
102 |
#' } |
|
103 |
#' @export |
|
104 | ||
105 |
generate_population <- function(N = 1000, regions = NULL, capacities = NULL, region_ids = NULL,
                                attraction_loci = 3, random_regions = .1, cost_loci = 2, size_loci = 5,
                                similarity_metric = "euclidean", n_neighbors = 50,
                                neighbor_range = .5, n_races = 6, n_building_types = 3, verbose = FALSE) {
  # Track whether regions/capacities were user-supplied, so defaults can be generated.
  gen_regions <- missing(regions)
  gen_capacities <- missing(capacities)
  if (gen_regions) {
    if (gen_capacities) {
      # No regions or capacities: one region per initial household, mostly single-unit,
      # with ~10% of regions upgraded to multi-unit (5-300 household) capacity.
      if (verbose) cli_alert_info("regions: sequence along {.arg N}, with housing unit capacities")
      regions <- seq_len(N)
      capacities <- rep(1, N)
      n_multi <- round(N * .1)
      capacities[sample.int(N, n_multi)] <- sample(5:300, n_multi, TRUE)
    } else {
      if (verbose) {
        cli_alert_info(
          "regions: sequence along specified {.arg capacities} with specified capacities"
        )
      }
      regions <- seq_along(capacities)
    }
  }
  if (!missing(region_ids)) {
    if (length(region_ids) == 1 && region_ids %in% colnames(regions)) {
      # region_ids names a column of regions: extract it and drop it from the coordinates.
      su <- colnames(regions) != region_ids
      region_ids <- regions[[region_ids]]
      regions <- regions[, su, drop = FALSE]
    } else {
      nr <- if (length(dim(regions)) == 2) nrow(regions) else length(regions)
      # BUGFIX: compare the number of IDs to the number of regions (the original compared
      # the ID vector itself to `nr`, which errors for length > 1 ID vectors).
      if (length(region_ids) != nr) cli_abort("{.arg region_ids} are not the same length as {.arg regions}")
    }
  }
  if (length(dim(regions)) == 2) {
    # regions provided as a matrix-like object of coordinates.
    if (length(capacities) == 1 && is.character(capacities) && capacities %in% colnames(regions)) {
      # capacities names a column of regions: extract it and drop it from the coordinates.
      if (verbose) {
        cli_alert_info(
          "regions: specified coordinates, with capacities from {.field {capacities}} column"
        )
      }
      su <- colnames(regions) != capacities
      capacities <- regions[[capacities]]
      regions <- regions[, su, drop = FALSE]
    } else if (verbose) {
      cli_alert_info(paste(
        "regions: specified coordinates, with",
        if (gen_capacities) "{.field N / nrow(regions)}" else "specified", "capacities"
      ))
    }
    # Region IDs: explicit, else row names, else a generated "r1", "r2", ... sequence.
    rids <- if (length(region_ids)) {
      region_ids
    } else if (is.null(rownames(regions))) {
      paste0("r", seq_len(nrow(regions)))
    } else {
      rownames(regions)
    }
  } else {
    # regions provided as a plain vector (IDs or a single count); coordinates must be generated.
    rids <- if (length(regions) == 1 && is.numeric(regions)) {
      if (verbose) {
        cli_alert_info(paste(
          "regions: sequence of {.field N}, with",
          if (gen_capacities) "{.field N / nrow(regions)}" else "specified", "capacities"
        ))
      }
      if (length(region_ids)) region_ids else paste0("r", seq_len(regions))
    } else {
      if (verbose && !gen_regions) {
        cli_alert_info(paste(
          "regions:", if (gen_capacities) {
            "specified region, with {.field N / length(regions)}"
          } else {
            "specified regions and"
          }, "capacities"
        ))
      }
      if (length(region_ids)) region_ids else regions
    }
    # Random X/Y coordinates for each region (Gaussian around 1e5, rounded up).
    regions <- matrix(
      ceiling(rnorm(length(rids) * 2, 1e5, 1e4)),
      ncol = 2, dimnames = list(rids, c("X", "Y"))
    )
  }
  nr <- length(rids)
  # Default capacity is an even split of N across regions; recycle to one value per region.
  capacities <- rep_len(if (!is.numeric(capacities)) N / nr else capacities, nr)

  if (verbose) cli_alert_info("preparing {.field {N}} households")
  household <- seq_len(N)

  # select attraction loci
  if (!is.numeric(attraction_loci) || attraction_loci < 1) {
    attraction_loci <- 1
    if (verbose) cli_alert_warning("setting {.arg attraction_loci} to {.field 1}")
  } else if (attraction_loci > nr) {
    attraction_loci <- max(1, nr - 1)
    if (verbose) {
      cli_alert_warning(
        "too many attraction loci; setting {.arg attraction_loci} to {.field {attraction_loci}}"
      )
    }
  } else if (verbose) {
    cli_alert_info("attraction loci: {.field {attraction_loci}}")
  }
  selected_attraction_loci <- sample.int(nr, attraction_loci)
  if (verbose) cli_alert_info("calculating region similarities")
  if (nr == 1) {
    # Degenerate single-region case: a 1x1 similarity of 1.
    space <- Matrix(1, dimnames = list(rids, rids), sparse = TRUE)
  } else {
    if (inherits(regions, c("sfc", "sf"))) regions <- st_coordinates(st_centroid(regions))
    space <- lma_simets(regions, metric = similarity_metric, symmetrical = TRUE)
  }
  if (verbose) cli_alert_info("rescaling region similarities")
  # Rescale similarities to [0, 1] over the stored (non-zero) entries.
  min_space <- min(space@x)
  space@x <- (space@x - min_space) / (max(space@x) - min_space)
  space_cols <- integer(ncol(space))

  # select initial locations
  # Rank regions by (negated) total similarity to the attraction loci, so the most
  # attractive regions fill first; space_cols is reused as a selector vector throughout.
  space_cols[selected_attraction_loci] <- -1L
  ro <- order(space %*% space_cols)
  space_cols[selected_attraction_loci] <- 0L
  capacities <- ceiling(capacities)
  if (sum(capacities) < N) {
    if (verbose) cli_alert_info("{.arg N} is larger than total capacities; expanding to accommodate")
    capacities <- ceiling(capacities / sum(capacities) * N)
  }
  # Expand region indices by capacity, in attractiveness order; first N slots get households.
  rids_expanded <- rep(ro, capacities[ro])
  region <- rids_expanded[household]
  if (anyNA(region)) cli_abort("NAs in regions")

  if (N < length(rids_expanded) && is.numeric(random_regions)) {
    # redistribute a portion
    moves <- min(length(rids_expanded) - N, N * random_regions)
    if (moves > 0) {
      if (verbose) cli_alert_info("randomly placing {.field {moves}} households in remaining regions")
      region[sample(household, moves)] <- sample(rids_expanded[-household], moves)
    }
  }

  # select head-of-household income
  if (!is.numeric(cost_loci) || cost_loci < 1) {
    cost_loci <- 1
    if (verbose) cli_alert_warning("setting {.arg cost_loci} to {.field 1}")
  } else if (cost_loci > nr) {
    cost_loci <- max(1, nr - 1)
    if (verbose) cli_alert_warning("too many cost loci; setting {.arg cost_loci} to {.field {cost_loci}}")
  } else if (verbose) {
    cli_alert_info("cost loci: {.field {cost_loci}}")
  }
  selected_cost_loci <- sample.int(nr, cost_loci)
  # Regional cost scales with total similarity to the cost loci.
  space_cols[selected_cost_loci] <- 1L
  region_cost <- abs(rnorm(nr, as.numeric(space %*% space_cols) * 5e5, 1e5))
  space_cols[selected_cost_loci] <- 0L
  # Head income: noise around regional cost, skewed downward by a Beta(.1, 1) - .6 factor.
  head_income <- abs(as.integer(
    rnorm(N, region_cost[region], 2e3) + (rbeta(N, .1, 1) - .6) * region_cost[region]
  ))

  # selecting building type
  if (verbose) cli_alert_info("drawing building type and rental types")
  region_building_type <- rep(1L, nr)
  su <- capacities == 1
  if (n_building_types > 1) {
    if (any(su)) {
      # Single-capacity regions get a non-1 building type (sampled when more than 2 types).
      region_building_type[su] <- if (n_building_types == 2) {
        2L
      } else {
        sample.int(n_building_types - 1, sum(su), TRUE) + 1L
      }
    }
  }
  building_type <- region_building_type[region]

  # select household sizes
  if (!is.numeric(size_loci) || size_loci < 1) {
    size_loci <- 1
    if (verbose) cli_alert_warning("setting {.arg size_loci} to {.field 1}")
  } else if (size_loci > nr) {
    size_loci <- max(1, nr - 1)
    if (verbose) cli_alert_warning("too many size loci; setting {.arg size_loci} to {.field {size_loci}}")
  } else if (verbose) {
    cli_alert_info("size loci: {.field {size_loci}}")
  }
  selected_size_loci <- sample.int(nr, size_loci)
  space_cols[selected_size_loci] <- .1 / size_loci
  # Poisson rate combines single-unit status, nearness to size loci, relative income,
  # and building type; +1 so every household has at least one member.
  size_prob <- rnorm(
    N,
    (su * 3 + (space %*% space_cols) * size_loci)[region, ] +
      head_income / max(head_income) * 5 +
      1 / building_type * .5,
    .4
  )
  size <- rpois(N, abs(size_prob)) + 1L

  # selecting renting status
  # P(rent) = (.6 if below mean income else .4) * (.8 if building type 1 else .5)
  if (verbose) cli_alert_info("drawing renting status")
  renting <- rbinom(N, 1, ((head_income < mean(head_income)) * .2 + .4) *
    ((building_type == 1) * .3 + .5))

  # select race base rates
  if (verbose) cli_alert_info("drawing race baserates")
  race_rates <- rbeta(n_races, 1, 5)
  race_rates <- race_rates / (race_rates + max(race_rates))

  # making space sparse
  # Drop sub-threshold similarities so only qualifying neighbors remain in the matrix.
  if (verbose) cli_alert_info("defining region neighbors")
  space@x[space@x < neighbor_range] <- 0
  space <- drop0(space)

  # preparing individual data
  individuals <- data.frame()
  if (verbose) {
    cli_progress_step(
      "generating individuals...",
      msg_done = "generated {.field {nrow(individuals)}} individuals"
    )
  }
  individuals <- generate_individuals(
    region, head_income, size, renting, space, n_neighbors, race_rates
  )

  list(
    params = list(
      # NOTE(review): "races_rates" appears to be a typo of "race_rates", but the name
      # is kept as-is for backward compatibility with callers reading params$races_rates.
      neighbors = n_neighbors, range = neighbor_range, races_rates = race_rates,
      n_building_types = n_building_types
    ),
    regions = data.frame(
      id = rids, capacity = capacities, cost = region_cost,
      building_type = region_building_type, regions
    ),
    households = data.frame(
      household,
      region = rids[region], head_income, size, building_type, renting
    ),
    individuals = as.data.frame(individuals)
  )
}
1 |
#' Redistribute Data |
|
2 |
#' |
|
3 |
#' Distribute data from a source frame to a target frame. |
|
4 |
#' |
|
5 |
#' @param source A matrix-like object you want to distribute from; usually this will be |
|
6 |
#' the real or more complete dataset, and is often at a lower resolution / higher level. |
|
7 |
#' @param target A matrix-like object you want to distribute to: usually this will be |

8 |
#' the dataset you want but that isn't available, and is often at a higher resolution / lower level |
|
9 |
#' (for disaggregation). Can also be a single number, representing the number of initial |
|
10 |
#' characters of \code{source} IDs to derive target IDs from (useful for aggregating up nested groups). |
|
11 |
#' @param map A list with entries named with \code{source} IDs (or aligning with those IDs), |
|
12 |
#' containing vectors of associated \code{target} IDs (or indices of those IDs). Entries |
|
13 |
#' can also be numeric vectors with IDs as names, which will be used to weigh the relationship. |
|
14 |
#' If IDs are related by substrings (the first characters of \code{target} IDs are \code{source} IDs), |
|
15 |
#' then a map can be automatically generated from them. If \code{source} and \code{target} |
|
16 |
#' contain \code{sf} geometries, a map will be made with \code{\link[sf]{st_intersects}} |
|
17 |
#' (\code{st_intersects(source, target)}). If an intersects map is made, and \code{source} |
|
18 |
#' is being aggregated to \code{target}, and map entries contain multiple target IDs, |
|
19 |
#' those entries will be weighted by their proportion of overlap with the source area. |
|
20 |
#' @param source_id,target_id Name of a column in \code{source} / \code{target}, |
|
21 |
#' or a vector containing IDs. For \code{source}, this will default to the first column. For |
|
22 |
#' \code{target}, columns will be searched through for one that appears to relate to the |
|
23 |
#' source IDs, falling back to the first column. |
|
24 |
#' @param weight Name of a column, or a vector containing weights (or single value to apply to all cases), |
|
25 |
#' which apply to \code{target} when disaggregating, and \code{source} when aggregating. |
|
26 |
#' Defaults to unit weights (all weights are 1). |
|
27 |
#' @param source_variable,source_value If \code{source} is tall (with variables spread across |
|
28 |
#' rows rather than columns), specifies names of columns in \code{source} containing variable names |
|
29 |
#' and values for conversion. |
|
30 |
#' @param aggregate Logical; if specified, will determine whether to aggregate or disaggregate |
|
31 |
#' from \code{source} to \code{target}. Otherwise, this will be \code{TRUE} if there are more |
|
32 |
#' \code{source} observations than \code{target} observations. |
|
33 |
#' @param weight_agg_method Means of aggregating \code{weight}, in the case that target IDs contain duplicates. |
|
34 |
#' Options are \code{"sum"}, \code{"average"}, or \code{"auto"} (default; which will sum if \code{weight} |
|
35 |
#' is integer-like, and average otherwise). |
|
36 |
#' @param rescale Logical; if \code{FALSE}, will not adjust target values after redistribution such that they |
|
37 |
#' match source totals. |
|
38 |
#' @param drop_extra_sources Logical; if \code{TRUE}, will remove any source rows that are not mapped |
|
39 |
#' to any target rows. Useful when inputting a source with regions outside of the target area, |
|
40 |
#' especially when \code{rescale} is \code{TRUE}. |
|
41 |
#' @param default_value Value to set to any unmapped target ID. |
|
42 |
#' @param outFile Path to a CSV file in which to save results. |
|
43 |
#' @param overwrite Logical; if \code{TRUE}, will overwrite an existing \code{outFile}. |
|
44 |
#' @param make_intersect_map Logical; if \code{TRUE}, will opt to calculate an intersect-based map |
|
45 |
#' rather than an ID-based map, if both seem possible. If specified as \code{FALSE}, will |
|
46 |
#' never calculate an intersect-based map. |
|
47 |
#' @param fill_targets Logical; if \code{TRUE}, will make new \code{target} rows for any |
|
48 |
#' un-mapped \code{source} row. |
|
49 |
#' @param overlaps If specified and not \code{TRUE} or \code{"keep"} (default), will assign \code{target} |
|
50 |
#' entities that are mapped to multiple \code{source} entities to a single source entity. The value |
|
51 |
#' determines how entities with the same weight should be assigned, between \code{"first"}, |
|
52 |
#' \code{"last"}, and \code{"random"}. |
|
53 |
#' @param use_all Logical; if \code{TRUE} (default), will redistribute map weights so they sum to 1. |
|
54 |
#' Otherwise, entities may be partially weighted. |
|
55 |
#' @param return_geometry Logical; if \code{FALSE}, will not set the returned \code{data.frame}'s |
|
56 |
#' geometry to that of \code{target}, if it exists. |
|
57 |
#' @param return_map Logical; if \code{TRUE}, will only return the map, without performing the |
|
58 |
#' redistribution. Useful if you want to inspect an automatically created map, or use it in a later call. |
|
59 |
#' @param verbose Logical; if \code{TRUE}, will show status messages. |
|
60 |
#' @examples |
|
61 |
#' # minimal example |
|
62 |
#' source <- data.frame(a = 1, b = 2) |
|
63 |
#' target <- 1:5 |
|
64 |
#' (redistribute(source, target, verbose = TRUE)) |
|
65 |
#' |
|
66 |
#' # multi-entity example |
|
67 |
#' source <- data.frame(id = c("a", "b"), cat = c("aaa", "bbb"), num = c(1, 2)) |
|
68 |
#' target <- data.frame( |
|
69 |
#' id = sample(paste0(c("a", "b"), rep(1:5, 2))), |
|
70 |
#' population = sample.int(1e5, 10) |
|
71 |
#' ) |
|
72 |
#' (redistribute(source, target, verbose = TRUE)) |
|
73 |
#' @returns A \code{data.frame} with a row for each \code{target_ids} (identified by the first column, |
|
74 |
#' \code{id}), and a column for each variable from \code{source}. |
|
75 |
#' @importFrom cli cli_abort cli_bullets cli_alert_info cli_alert_warning cli_warn |
|
76 |
#' cli_progress_step cli_progress_update cli_progress_done |
|
77 |
#' @importFrom Rcpp sourceCpp |
|
78 |
#' @importFrom RcppParallel RcppParallelLibs |
|
79 |
#' @importFrom utils unzip |
|
80 |
#' @importFrom sf st_intersects st_intersection st_geometry st_geometry<- st_crs st_geometry_type |
|
81 |
#' st_coordinates st_centroid st_boundary st_cast st_polygon st_union st_transform st_buffer st_as_sf |
|
82 |
#' st_is_valid st_make_valid st_sfc st_point |
|
83 |
#' @importFrom s2 s2_area s2_is_valid |
|
84 |
#' @importFrom lingmatch lma_simets |
|
85 |
#' @importFrom jsonlite read_json write_json |
|
86 |
#' @importFrom curl curl_fetch_disk |
|
87 |
#' @importFrom vroom vroom vroom_write |
|
88 |
#' @importFrom stats rbeta rbinom rnorm rpois |
|
89 |
#' @importFrom Matrix Matrix drop0 |
|
90 |
#' @useDynLib redistribute, .registration = TRUE |
|
91 |
#' @export |
|
92 | ||
93 |
redistribute <- function(source, target = NULL, map = list(), source_id = "GEOID", target_id = source_id, |
|
94 |
weight = NULL, source_variable = NULL, source_value = NULL, aggregate = NULL, |
|
95 |
weight_agg_method = "auto", rescale = TRUE, drop_extra_sources = FALSE, |
|
96 |
default_value = NA, outFile = NULL, overwrite = FALSE, make_intersect_map = FALSE, |
|
97 |
fill_targets = FALSE, overlaps = "keep", use_all = TRUE, return_geometry = TRUE, |
|
98 |
return_map = FALSE, verbose = FALSE) { |
|
99 | 27x |
if (!overwrite && !is.null(outFile) && file.exists(outFile)) { |
100 | ! |
cli_abort("{.arg outFile} already exists; use {.code overwrite = TRUE} to overwrite it") |
101 |
} |
|
102 | 27x |
can_intersects <- missing(make_intersect_map) || make_intersect_map |
103 | 27x |
source_sf <- can_intersects && inherits(source, c("sfc", "sf")) |
104 | 27x |
target_sf <- can_intersects && inherits(target, c("sfc", "sf")) |
105 | 27x |
can_intersects <- source_sf && target_sf |
106 | 27x |
if (!can_intersects && make_intersect_map) { |
107 | 1x |
if (!source_sf && is.data.frame(source) && any(vapply(source, inherits, TRUE, "sfc"))) { |
108 | 1x |
cli_warn("{.arg source} is not an sf object, but contains a simple features column, so converting it") |
109 | 1x |
source <- st_as_sf(source) |
110 | 1x |
source_sf <- TRUE |
111 |
} |
|
112 | 1x |
if (!target_sf && is.data.frame(target) && any(vapply(target, inherits, TRUE, "sfc"))) { |
113 | ! |
cli_warn("{.arg target} is not an sf object, but contains a simple features column, so converting it") |
114 | ! |
target <- st_as_sf(target) |
115 | ! |
target_sf <- TRUE |
116 |
} |
|
117 | 1x |
can_intersects <- source_sf && target_sf |
118 | 1x |
if (!can_intersects) { |
119 | ! |
cli_abort( |
120 | ! |
"{.arg make_intersect_map} was set to {.field TRUE}, but {.arg source} and {.arg target} are not both sf objects" |
121 |
) |
|
122 |
} |
|
123 |
} |
|
124 | 27x |
intersect_map <- FALSE |
125 | ! |
if (length(dim(source)) != 2) source <- data.frame(source) |
126 | 1x |
if (is.null(colnames(source))) colnames(source) <- paste0("V", seq_len(ncol(source))) |
127 | 27x |
if (length(source_id) > 1) { |
128 | ! |
if (verbose) cli_alert_info("source IDs: {.arg source_id} vector") |
129 | ! |
sid <- source_id |
130 | 27x |
} else if (source_id %in% colnames(source)) { |
131 | 1x |
if (verbose) cli_alert_info("source IDs: {.field {source_id}} column of {.arg source}") |
132 | 16x |
sid <- source[, source_id, drop = TRUE] |
133 | 16x |
source <- source[, colnames(source) != source_id, drop = FALSE] |
134 | 11x |
} else if (nrow(source) == 1) { |
135 | ! |
if (verbose) cli_alert_info("source IDs: {.field 1}") |
136 | 2x |
sid <- if (missing(source_id)) "1" else source_id |
137 |
} else { |
|
138 | 9x |
if (source_sf && target_sf) { |
139 | ! |
if (verbose) cli_alert_info("source IDs: sequence, assuming map from geometries") |
140 | 3x |
intersect_map <- TRUE |
141 | 3x |
sid <- seq_len(nrow(source)) |
142 |
} else { |
|
143 | ! |
if (verbose) cli_alert_info("source IDs: {.field {colnames(source)[1]}} column of {.arg source}") |
144 | 6x |
sid <- source[, 1, drop = TRUE] |
145 | 6x |
source <- source[, -1, drop = FALSE] |
146 |
} |
|
147 |
} |
|
148 | 27x |
sid <- as.character(sid) |
149 | ! |
if (length(sid) != nrow(source)) cli_abort("{.arg source_id} is not the same length as {.field nrow(source)}") |
150 | 27x |
if (anyDuplicated(sid)) { |
151 | 1x |
if (is.null(source_variable)) { |
152 | 1x |
if ("variable" %in% colnames(source)) source_variable <- "variable" |
153 | ! |
if ("measure" %in% colnames(source)) source_variable <- "measure" |
154 |
} |
|
155 | 1x |
if (is.null(source_value)) { |
156 | 1x |
if ("value" %in% colnames(source)) source_value <- "value" |
157 |
} |
|
158 |
} |
|
159 | 27x |
if (!is.null(source_variable) || !is.null(source_value)) { |
160 | 1x |
source_numeric <- vapply(seq_len(ncol(source)), function(col) is.numeric(source[, col, drop = TRUE]), TRUE) |
161 | ! |
if (is.null(source_value) && any(source_numeric)) source_value <- colnames(source)[which(source_numeric)[1]] |
162 | 1x |
if (is.null(source_variable) && any(!source_numeric)) { |
163 | ! |
source_variable <- colnames(source)[ |
164 | ! |
which.min(vapply(which(!source_numeric), function(col) length(unique(source[, col])), 0)) |
165 |
] |
|
166 |
} |
|
167 | 1x |
if (verbose) { |
168 | ! |
cli_alert_info(paste( |
169 | ! |
"converting {.arg source} format, breaking", |
170 | ! |
if (length(source_value) == 1) "{.field {source_value}}" else "{.arg source_value}", |
171 | ! |
"values across", |
172 | ! |
if (length(source_variable) == 1) "{.field {source_variable}}" else "{.arg source_variable}", |
173 | ! |
"columns" |
174 |
)) |
|
175 |
} |
|
176 | 1x |
usid <- unique(sid) |
177 | 1x |
nr <- nrow(source) |
178 | 1x |
if (length(source_value) != nr) source_value <- source[, source_value, drop = TRUE] |
179 | 1x |
if (length(source_variable) != nr) source_variable <- source[, source_variable, drop = TRUE] |
180 | 1x |
ss <- split(data.frame(sid, source_value), source_variable) |
181 | 1x |
source <- do.call(cbind, lapply(ss, function(vd) { |
182 | 2x |
vd <- vd[!duplicated(vd[[1]]), ] |
183 | 2x |
rownames(vd) <- vd[, 1] |
184 | 2x |
vd[usid, 2, drop = TRUE] |
185 |
})) |
|
186 | 1x |
colnames(source) <- names(ss) |
187 | 1x |
source <- as.data.frame(source) |
188 | 1x |
sid <- usid |
189 |
} |
|
190 | ! |
if (length(dim(target)) == 2 && is.null(colnames(target))) colnames(target) <- paste0("V", seq_len(ncol(target))) |
191 | 27x |
if (length(target) == 1 && is.numeric(target)) { |
192 | ! |
if (verbose) cli_alert_info("target IDs: first {.field {target}} characters of {.arg source} IDs") |
193 | ! |
tid <- target <- unique(substring(sid, 1, target)) |
194 | 27x |
} else if (length(target_id) > 1) { |
195 | ! |
if (verbose) cli_alert_info("target IDs: {.arg target_id} vector") |
196 | ! |
tid <- target_id |
197 | 27x |
} else if (is.null(target)) { |
198 | ! |
if (verbose) cli_alert_info("target IDs: {.field 1}") |
199 | ! |
tid <- 1 |
200 | 27x |
} else if (!is.null(target_id) && length(dim(target)) == 2 && target_id %in% colnames(target)) { |
201 | 1x |
if (verbose) cli_alert_info("target IDs: {.field {target_id}} column of {.arg target}") |
202 | 16x |
tid <- target[, target_id, drop = TRUE] |
203 | 16x |
target <- target[, colnames(target) != target_id, drop = FALSE] |
204 |
} else { |
|
205 | 11x |
if (length(dim(target)) != 2) { |
206 | 2x |
if (is.null(weight) && (target_sf || is.numeric(target)) && !is.null(names(target))) { |
207 | ! |
if (target_sf) { |
208 | ! |
if (verbose) cli_alert_info("target IDs: names of {.arg target} vector") |
209 |
} else { |
|
210 | ! |
if (verbose) cli_alert_info("target IDs: names of {.arg target} vector; {.arg weight} set to {.arg target}") |
211 | ! |
weight <- unname(target) |
212 |
} |
|
213 | ! |
tid <- names(target) |
214 |
} else { |
|
215 | 2x |
if (target_sf) { |
216 | ! |
if (verbose) cli_alert_info("target IDs: sequence") |
217 | ! |
tid <- seq_along(target) |
218 |
} else { |
|
219 | ! |
if (verbose) cli_alert_info("target IDs: {.arg target} vector") |
220 | 2x |
tid <- target |
221 |
} |
|
222 |
} |
|
223 |
} else { |
|
224 | 9x |
if (can_intersects) { |
225 | ! |
if (verbose) cli_alert_info("target IDs: sequence, assuming map from geometries") |
226 | 3x |
intersect_map <- TRUE |
227 | 3x |
tid <- seq_len(nrow(target)) |
228 |
} else { |
|
229 | 6x |
idi <- 1 |
230 | 6x |
if (length(map)) { |
231 | ! |
tids <- unique(unlist(map, use.names = FALSE)) |
232 | ! |
for (i in seq_len(ncol(target))) { |
233 | ! |
if (any(target[, i] %in% tids)) { |
234 | ! |
idi <- i |
235 | ! |
break |
236 |
} |
|
237 |
} |
|
238 | 6x |
} else if (length(sid) > 1 && (sl <- nchar(sid[1])) > 1 && all(nchar(sid) == sl)) { |
239 | ! |
for (i in seq_len(ncol(target))) { |
240 | ! |
if (any(substr(target[, i, drop = TRUE], 1, sl) %in% sid)) { |
241 | ! |
idi <- i |
242 | ! |
break |
243 |
} |
|
244 |
} |
|
245 |
} |
|
246 | ! |
if (verbose) cli_alert_info("target IDs: {.field {colnames(target)[idi]}} column of {.arg target}") |
247 | 6x |
tid <- target[, idi, drop = TRUE] |
248 | 6x |
target <- target[, -idi, drop = FALSE] |
249 |
} |
|
250 |
} |
|
251 |
} |
|
252 | 27x |
tid <- as.character(tid) |
253 | 27x |
if (!is.logical(aggregate)) aggregate <- length(sid) > length(tid) |
254 | 5x |
if (make_intersect_map && source_sf && target_sf) intersect_map <- TRUE |
255 | 27x |
if (length(map)) { |
256 | 1x |
if (verbose) cli_alert_info("map: provided list") |
257 | 9x |
intersect_map <- FALSE |
258 | 9x |
if (is.null(names(map))) { |
259 | ! |
if (length(map) != length(sid)) cli_abort("{.arg map} has no names, and is not the same length as source IDs") |
260 | ! |
names(map) <- sid |
261 | 9x |
} else if (!any(sid %in% names(map))) { |
262 | ! |
if (!any(tid %in% names(map))) cli_abort("no source or target IDs were in map names") |
263 | 1x |
mw <- unlist(unname(map)) |
264 | ! |
if (is.null(names(mw))) mw <- structure(numeric(length(mw)) + 1, names = mw) |
265 | 1x |
onames <- names(mw) |
266 | 1x |
if (!any(sid %in% onames)) { |
267 | ! |
if (all(!grepl("[^0-9]", onames))) { |
268 | ! |
onames <- as.integer(onames) |
269 | ! |
if (!anyNA(onames) && min(onames) > 0 && max(onames) <= length(sid)) { |
270 | ! |
if (verbose) cli_alert_info("inverting map, with entry values mapped to source IDs") |
271 | ! |
names(mw) <- rep(names(map), vapply(map, length, 0)) |
272 | ! |
map <- split(mw, sid[onames]) |
273 |
} else { |
|
274 | ! |
cli_abort("map appeared to refer to target indices, but they are out of range") |
275 |
} |
|
276 |
} else { |
|
277 | ! |
cli_abort("no source IDs were present in the ID map") |
278 |
} |
|
279 |
} else { |
|
280 | ! |
if (verbose) cli_alert_info("inverting map") |
281 | 1x |
names(mw) <- rep(names(map), vapply(map, length, 0)) |
282 | 1x |
map <- split(mw, onames) |
283 |
} |
|
284 |
} |
|
285 |
} else { |
|
286 | 18x |
if (nrow(source) == 1) { |
287 | ! |
if (verbose) cli_alert_info("map: all target IDs for single source") |
288 | 2x |
intersect_map <- FALSE |
289 | 2x |
map[[sid]] <- tid |
290 | 16x |
} else if (is.null(target) || length(tid) == 1) { |
291 | ! |
if (verbose) cli_alert_info("map: all source IDs to a single target") |
292 | ! |
intersect_map <- FALSE |
293 | ! |
map <- as.list(structure(rep(tid, length(sid)), names = sid)) |
294 | 16x |
} else if (aggregate && !intersect_map && all(nchar(tid) == nchar(tid[1])) && |
295 | 16x |
nchar(sid[1]) > nchar(tid[1]) && any(substring(sid, 1, nchar(tid[1])) %in% tid)) { |
296 | ! |
if (verbose) cli_alert_info("map: first {.field {nchar(tid[1])}} character{?s} of source IDs") |
297 | 1x |
intersect_map <- FALSE |
298 | 1x |
map <- as.list(structure(substr(sid, 1, nchar(tid[1])), names = sid)) |
299 | 15x |
} else if (!intersect_map && all(nchar(sid) == nchar(sid[1])) && nchar(tid[1]) > nchar(sid[1]) && |
300 | 15x |
any(substring(tid, 1, nchar(sid[1])) %in% sid) && |
301 | 15x |
(!can_intersects || !all(substring(tid, 1, nchar(sid[1])) == substring(tid[1], 1, nchar(sid[1]))))) { |
302 | ! |
if (verbose) cli_alert_info("map: first {.field {nchar(sid[1])}} character{?s} of target IDs") |
303 | 8x |
map <- split(tid, substr(tid, 1, nchar(sid[1]))) |
304 | 7x |
} else if (can_intersects) { |
305 | 7x |
if (verbose) { |
306 | ! |
cli_alert_info("map: intersections between geometries") |
307 | ! |
cli_progress_step("initial mapping...", msg_done = "done initial mapping") |
308 |
} |
|
309 | 7x |
intersect_map <- TRUE |
310 | 7x |
op <- options(sf_use_s2 = FALSE) |
311 | 7x |
on.exit(options(sf_use_s2 = op[[1]])) |
312 | 7x |
if (st_crs(source) != st_crs(target)) { |
313 | 1x |
if (length(tid) > length(sid)) { |
314 | 1x |
source <- st_transform(source, st_crs(target)) |
315 |
} else { |
|
316 | ! |
target <- st_transform(target, st_crs(source)) |
317 |
} |
|
318 |
} |
|
319 | 7x |
su <- !st_is_valid(st_geometry(source)) |
320 | 7x |
if (any(su)) { |
321 | 1x |
st_geometry(source)[su] <- st_make_valid(st_geometry(source)[su]) |
322 | 1x |
su <- !st_is_valid(st_geometry(source)) |
323 | 1x |
if (any(su)) { |
324 | ! |
cli_warn("some {.arg source} geometries were not valid, so removed unrepairable ones") |
325 | ! |
source <- source[!su, ] |
326 | ! |
sid <- sid[!su] |
327 |
} else { |
|
328 | 1x |
cli_warn("some {.arg source} geometries were not valid, so repaired them") |
329 |
} |
|
330 |
} |
|
331 | 7x |
su <- !st_is_valid(st_geometry(target)) |
332 | 7x |
if (any(su)) { |
333 | 1x |
st_geometry(target)[su] <- st_make_valid(st_geometry(target)[su]) |
334 | 1x |
su <- !st_is_valid(st_geometry(target)) |
335 | 1x |
if (any(su)) { |
336 | ! |
cli_warn("some {.arg target} geometries were not valid, so removed unrepairable ones") |
337 | ! |
target <- target[!su, ] |
338 | ! |
tid <- tid[!su] |
339 |
} else { |
|
340 | 1x |
cli_warn("some {.arg target} geometries were not valid, so repaired them") |
341 |
} |
|
342 |
} |
|
343 | 7x |
map <- tryCatch(suppressMessages(st_intersects(st_geometry(source), st_geometry(target))), error = function(e) NULL) |
344 | 7x |
if (is.null(map)) { |
345 | ! |
cli_abort(c( |
346 | ! |
x = "{.fn st_intersects} failed between {.arg source} and {.arg target}", |
347 | ! |
i = "if you can do this or similar separetely, you can pass the result as {.arg map}" |
348 |
)) |
|
349 |
} |
|
350 | 7x |
names(map) <- sid |
351 |
} else { |
|
352 | ! |
cli_abort("no map was provided, and it could not be made from IDs") |
353 |
} |
|
354 |
} |
|
355 | 27x |
if (aggregate && length(map) == length(tid) && length(map) != length(sid)) { |
356 | ! |
ids <- if (is.null(names(map))) seq_along(map) else names(map) |
357 | ! |
names(map) <- NULL |
358 | ! |
child_counts <- vapply(map, length, 0) |
359 | ! |
map <- as.list(unlist(map)) |
360 | ! |
names(map) <- rep(ids, child_counts) |
361 |
} |
|
362 | 27x |
if ((is.logical(overlaps) && overlaps) || grepl("^[Kk]", overlaps)) { |
363 | 24x |
dodedupe <- FALSE |
364 |
} else { |
|
365 | 3x |
tiebreak <- c(f = "first", l = "last", r = "random")[tolower(substring(overlaps, 1, 1))] |
366 | 3x |
dodedupe <- !is.na(tiebreak) |
367 | 3x |
deduper <- switch(tiebreak, |
368 | 3x |
first = function(e, agg = TRUE) { |
369 | 5x |
su <- which(e == max(e)) |
370 | 5x |
if (length(su) > 1) su <- su[1] |
371 | 5x |
if (agg) e[su] else su |
372 |
}, |
|
373 | 3x |
last = function(e, agg = TRUE) { |
374 | 5x |
su <- which(e == max(e)) |
375 | 5x |
if (length(su) > 1) su <- su[length(su)] |
376 | 5x |
if (agg) e[su] else su |
377 |
}, |
|
378 | 3x |
random = function(e, agg = TRUE) { |
379 | 5x |
su <- which(e == max(e)) |
380 | 5x |
if (length(su) > 1) su <- sample(su, 1) |
381 | 5x |
if (agg) e[su] else su |
382 |
} |
|
383 |
) |
|
384 |
} |
|
385 | 27x |
any_partial <- FALSE |
386 | 27x |
map <- map[sid] |
387 | 27x |
mls <- vapply(map, length, 0) |
388 | 27x |
if (!fill_targets && drop_extra_sources) { |
389 | 1x |
su <- vapply(map, length, 0) != 0 |
390 | 1x |
if (!all(su)) { |
391 | ! |
if (verbose) cli_alert_info("removing {sum(!su)} {.arg source}{?s} with no mapped {.arg target}s") |
392 | 1x |
source <- source[su, , drop = FALSE] |
393 | 1x |
sid <- sid[su] |
394 | 1x |
map <- map[su] |
395 |
} |
|
396 |
} |
|
397 | 27x |
if (intersect_map) { |
398 | 7x |
source_geom <- st_geometry(source) |
399 | 7x |
names(source_geom) <- sid |
400 |
} |
|
401 | 27x |
polys <- intersect_map && any(grepl("POLY", st_geometry_type( |
402 | 27x |
if (aggregate) source_geom else st_geometry(target) |
403 | 27x |
), fixed = TRUE)) |
404 | 27x |
if (polys && any(mls > 1)) { |
405 | 7x |
if (verbose) { |
406 | ! |
oi <- 0 |
407 | ! |
cli_progress_step( |
408 | ! |
"calculating map intersections ({oi}/{length(map)})", |
409 | ! |
msg_done = "calculated map intersections ({length(map)})" |
410 |
) |
|
411 |
} |
|
412 | 7x |
map <- lapply(seq_along(map), function(i) { |
413 | 169x |
if (verbose) { |
414 | ! |
oi <<- i |
415 | ! |
cli_progress_update(.envir = parent.frame(2)) |
416 |
} |
|
417 | 169x |
e <- map[[i]] |
418 | 169x |
if (length(e)) { |
419 | 167x |
res <- if (is.null(names(e))) { |
420 | 167x |
w <- rep(1, length(e)) |
421 | 167x |
if (intersect_map && length(e) > 1) { |
422 | 86x |
reg <- st_geometry(target)[e] |
423 | 86x |
totals <- s2_area(reg) |
424 | 86x |
su <- totals > 0 |
425 | 86x |
if (any(su)) { |
426 | 86x |
part <- suppressMessages(st_intersection(reg[su], source_geom[sid[i]], model = "closed")) |
427 | 86x |
pv <- numeric(length(part)) |
428 | 86x |
tsu <- s2_is_valid(part) |
429 | 86x |
pv[tsu] <- s2_area(part[tsu]) |
430 | 86x |
w[su] <- pv / (if (aggregate) s2_area(source_geom[[sid[i]]]) else totals[su]) |
431 |
} |
|
432 |
} |
|
433 | 167x |
names(w) <- if (is.integer(e)) tid[e] else e |
434 | 167x |
w[w > 0] |
435 |
} else { |
|
436 | ! |
e |
437 |
} |
|
438 | 167x |
if (aggregate && dodedupe) { |
439 | ! |
if (length(res) > 1) { |
440 | ! |
deduper(res) |
441 |
} else { |
|
442 | ! |
res |
443 |
} |
|
444 |
} else { |
|
445 | 76x |
if (!aggregate && any(res < 1)) any_partial <<- TRUE |
446 | 167x |
res |
447 |
} |
|
448 |
} else { |
|
449 | 2x |
numeric() |
450 |
} |
|
451 |
}) |
|
452 | ! |
if (verbose) cli_progress_done() |
453 |
} else { |
|
454 | 20x |
map <- lapply(map, function(e) { |
455 | 275x |
if (is.null(names(e))) { |
456 | 45x |
w <- rep(1, length(e)) |
457 | 45x |
names(w) <- if (is.integer(e)) tid[e] else e |
458 | 45x |
e <- w |
459 |
} |
|
460 | 138x |
if (!aggregate && any(e < 1)) any_partial <<- TRUE |
461 | 275x |
e |
462 |
}) |
|
463 |
} |
|
464 | 27x |
targets <- unlist(unname(map)) |
465 | 19x |
if (!any_partial) any_partial <- anyDuplicated(names(targets)) |
466 | 27x |
if (dodedupe && any_partial) { |
467 | ! |
if (verbose) cli_progress_step("assigning each target to a single source") |
468 | 3x |
any_partial <- FALSE |
469 | 3x |
esids <- factor(rep(sid, vapply(map, length, 0)), unique(sid)) |
470 | 3x |
if (anyDuplicated(names(targets))) { |
471 | 3x |
for (ct in unique(names(targets))) { |
472 | 135x |
su <- which(names(targets) == ct) |
473 | 135x |
if (length(su) > 1) { |
474 | 15x |
sel <- deduper(targets[su], FALSE) |
475 | 15x |
targets[su[-sel]] <- 0 |
476 | 15x |
targets[su[sel]] <- 1 |
477 |
} |
|
478 |
} |
|
479 | 3x |
su <- which(targets != 0) |
480 | 3x |
map <- split(targets[su], esids[su]) |
481 |
} else { |
|
482 | ! |
targets[] <- 1L |
483 | ! |
map <- split(targets, esids) |
484 |
} |
|
485 | ! |
if (verbose) cli_progress_done() |
486 |
} else { |
|
487 | 24x |
names(map) <- sid |
488 |
} |
|
489 | 27x |
n_filled <- 0 |
490 | 27x |
if (fill_targets) { |
491 | 1x |
su <- vapply(map, length, 0) == 0 |
492 | 1x |
if (any(su)) { |
493 | 1x |
missed_sources <- names(which(su)) |
494 | 1x |
ftids <- paste0("filled_", missed_sources) |
495 | 1x |
n_filled <- length(ftids) |
496 | ! |
if (verbose) cli_alert_info("adding {n_filled} target ID{?s} to cover all source rows") |
497 | 1x |
tid <- c(tid, ftids) |
498 | 1x |
map[missed_sources] <- split(structure(rep(1, n_filled), names = ftids), missed_sources) |
499 |
} |
|
500 |
} |
|
501 | ! |
if (verbose && intersect_map) cli_progress_done() |
502 | 27x |
if (return_map) { |
503 | ! |
if (verbose) cli_alert_info("returning map") |
504 | 5x |
return(map) |
505 |
} |
|
506 | 22x |
mtid <- names(unlist(unname(map))) |
507 | 22x |
nout <- length(if (aggregate) sid else tid) |
508 | 22x |
w <- if (length(weight) > 1) { |
509 | ! |
if (verbose) cli_alert_info("weights: {.arg weight} vector") |
510 | 5x |
if (length(weight) != nout && (!n_filled || length(weight) + n_filled != nout)) { |
511 | ! |
cli_abort("{.arg weight} is not the same length as {.arg {if (aggregate) 'source' else 'target'}} IDs") |
512 |
} |
|
513 | 5x |
weight |
514 | 22x |
} else if (aggregate && !is.null(weight) && weight %in% colnames(source)) { |
515 | ! |
if (verbose) cli_alert_info("weights: {.field {weight}} column of {.arg source}") |
516 | ! |
w <- source[, weight, drop = TRUE] |
517 | ! |
source <- source[, colnames(weight) != weight, drop = FALSE] |
518 | ! |
w |
519 | 22x |
} else if (!is.null(weight) && weight %in% colnames(target)) { |
520 | 1x |
if (verbose) cli_alert_info("weights: {.field {weight}} column of {.arg target}") |
521 | 12x |
target[, weight, drop = TRUE] |
522 |
} else { |
|
523 | 5x |
weight <- if (!is.numeric(weight)) 1 else weight |
524 | ! |
if (verbose) cli_alert_info("weights: {.field {weight}}") |
525 | 5x |
rep(weight, nout) |
526 |
} |
|
527 | 22x |
w <- as.numeric(w) |
528 | 1x |
if (n_filled) w <- c(w, rep(1, n_filled)) |
529 | 22x |
if (anyNA(w)) { |
530 | ! |
cli_warn("weights contained NAs, which were set to 0") |
531 | ! |
w[is.na(w)] <- 0 |
532 |
} |
|
533 | 22x |
realign <- FALSE |
534 | 22x |
su <- tid %in% mtid |
535 | 22x |
if (!all(su)) { |
536 | ! |
if (verbose) { |
537 | ! |
cli_alert_info(paste( |
538 | ! |
"{.field {sum(!su)}} of {.field {length(su)}} target IDs", |
539 | ! |
"were dropped because they were not present in {.arg map}" |
540 |
)) |
|
541 |
} |
|
542 | ! |
realign <- TRUE |
543 | ! |
otid <- tid |
544 | ! |
tid <- tid[su] |
545 | ! |
if (!aggregate) w <- w[su] |
546 |
} |
|
547 | 22x |
if (!aggregate && anyDuplicated(tid)) { |
548 | ! |
if (!realign) { |
549 | ! |
realign <- TRUE |
550 | ! |
otid <- tid |
551 |
} |
|
552 | ! |
agger <- if (weight_agg_method == "auto") { |
553 | ! |
if (all(w == as.integer(w))) "summing" else "averaging" |
554 | ! |
} else if (grepl("^[Ss]", weight_agg_method)) "summing" else "averaging" |
555 | ! |
if (verbose) cli_alert_info("{.arg target_id} contains duplicates, so {agger} {.arg weight}s") |
556 | ! |
w <- tapply(w, tid, if (agger == "summing") sum else mean) |
557 | ! |
tid <- names(w) |
558 | ! |
w <- unname(w) |
559 |
} |
|
560 | 11x |
if (source_sf) st_geometry(source) <- NULL |
561 | 22x |
source_numeric <- vapply(seq_len(ncol(source)), function(col) is.numeric(source[, col, drop = TRUE]), TRUE) |
562 | 22x |
if (!all(source_numeric)) { |
563 | 4x |
non_numeric <- which(!source_numeric) |
564 | 4x |
level_map <- list() |
565 | 4x |
for (col in non_numeric) { |
566 | 4x |
x <- as.factor(source[, col, drop = TRUE]) |
567 | 4x |
source[, col] <- as.numeric(x) |
568 | 4x |
level_map[[as.character(col)]] <- list( |
569 | 4x |
index = col, |
570 | 4x |
levels = levels(x) |
571 |
) |
|
572 |
} |
|
573 |
} |
|
574 | 22x |
if (verbose) { |
575 | 1x |
cli_alert_info(paste( |
576 | 1x |
"redistributing {.field {length(source_numeric)}} variable{?s}", |
577 | 1x |
"from {.field {length(sid)}} source{?s} to {.field {length(tid)}} target{?s}:" |
578 |
)) |
|
579 | 1x |
var_groups <- split(colnames(source), !source_numeric) |
580 | 1x |
names(var_groups) <- c( |
581 | 1x |
"TRUE" = "(char; {sum(!source_numeric)})", "FALSE" = "(numb; {sum(source_numeric)})" |
582 | 1x |
)[names(var_groups)] |
583 | 1x |
cli_bullets(structure( |
584 | 1x |
vapply(names(var_groups), function(type) { |
585 | 1x |
paste(type, if (length(var_groups[[type]]) < 10) paste(var_groups[[type]], collapse = ", ")) |
586 |
}, ""), |
|
587 | 1x |
names = rep("*", length(var_groups)) |
588 |
)) |
|
589 | 1x |
type <- if (aggregate) "aggregating" else "disaggregating" |
590 | 1x |
cli_progress_step(paste0(type, "..."), msg_done = paste("done", type)) |
591 |
} |
|
592 | 22x |
method <- as.integer(source_numeric) |
593 | 9x |
if (any_partial) aggregate <- TRUE |
594 | 22x |
if (aggregate) { |
595 | 9x |
method <- method + 10 |
596 | 9x |
if (any_partial) method[method == 11] <- 12 |
597 |
} |
|
598 | 22x |
balance <- any_partial && (!is.logical(use_all) || use_all) |
599 | 22x |
res <- process_distribute(as.matrix(source), method, tid, w, map, aggregate, balance) |
600 | 22x |
res <- as.data.frame(res) |
601 | 22x |
colnames(res) <- colnames(source) |
602 | 1x |
if (verbose) cli_progress_done() |
603 | 22x |
if (!all(source_numeric)) { |
604 | ! |
if (verbose) cli_alert_info("re-converting categorical levels") |
605 | 4x |
for (l in level_map) { |
606 | 4x |
x <- res[, l$index] |
607 | 4x |
x[x == 0] <- NA |
608 | 4x |
res[, l$index] <- l$levels[x] |
609 |
} |
|
610 |
} |
|
611 | 22x |
if (rescale) { |
612 | 22x |
if (verbose) { |
613 | 1x |
status <- "checking totals" |
614 | 1x |
cli_progress_step("{status}") |
615 |
} |
|
616 | 22x |
source_totals <- vapply( |
617 | 22x |
seq_along(source_numeric), |
618 | 22x |
function(i) if (source_numeric[i]) sum(source[, i], na.rm = TRUE) else 0, |
619 | 22x |
0 |
620 |
) |
|
621 | 22x |
res_totals <- vapply( |
622 | 22x |
seq_along(source_numeric), |
623 | 22x |
function(i) if (source_numeric[i]) sum(res[, i], na.rm = TRUE) else 0, |
624 | 22x |
0 |
625 |
) |
|
626 | 22x |
su <- which(source_totals != res_totals) |
627 | 22x |
if (length(su)) { |
628 | 12x |
if (verbose) { |
629 | 1x |
status <- paste0("rescaling ", length(su), " variable", if (length(su) == 1) "" else "s") |
630 | 1x |
cli_progress_update() |
631 |
} |
|
632 | 12x |
res_totals[res_totals == 0] <- 1 |
633 | 12x |
for (i in su) res[, i] <- res[, i] / res_totals[i] * source_totals[i] |
634 | 12x |
if (verbose) { |
635 | 1x |
status <- sub("rescaling", "rescaled", status, fixed = TRUE) |
636 | 1x |
cli_progress_done() |
637 |
} |
|
638 | 10x |
} else if (verbose) { |
639 | ! |
status <- "totals are aligned" |
640 | ! |
cli_progress_done() |
641 |
} |
|
642 |
} |
|
643 | 22x |
res <- cbind(id = tid, res) |
644 | 22x |
if (realign) { |
645 | ! |
if (verbose) cli_alert_info("realigning with original target IDs") |
646 | ! |
rownames(res) <- tid |
647 | ! |
res <- res[otid, ] |
648 | ! |
res$id <- otid |
649 | ! |
if (!is.na(default_value)) res[!res$id %in% tid, -1] <- default_value |
650 | ! |
rownames(res) <- NULL |
651 |
} |
|
652 | 22x |
if (!is.null(outFile)) { |
653 | ! |
if (verbose) cli_progress_step("writing results to {.file {outFile}}") |
654 | ! |
dir.create(dirname(outFile), FALSE, TRUE) |
655 | ! |
if (!grepl("\\.\\w", outFile)) outFile <- paste0(outFile, ".csv") |
656 | ! |
vroom_write(res, outFile, ",") |
657 | ! |
if (verbose) cli_progress_done() |
658 |
} |
|
659 | 22x |
if (return_geometry && target_sf) { |
660 | 11x |
st_geometry(res) <- if (n_filled) { |
661 | ! |
c(st_geometry(target), if (source_sf) { |
662 | ! |
structure(source_geom, names = sid)[missed_sources] |
663 |
} else { |
|
664 | ! |
st_sfc(lapply(seq_len(n_filled), function(i) st_point()), crs = st_crs(target)) |
665 |
}) |
|
666 |
} else { |
|
667 | 11x |
st_geometry(target) |
668 |
} |
|
669 |
} |
|
670 | 22x |
invisible(res) |
671 |
} |
1 |
#' Make Higher-Order Polygons
#'
#' Randomly combine given polygons into contiguous, larger polygons.
#'
#' @param polygons An \code{sf} object with polygons to be reformed.
#' @param ids A vector of IDs the same length as \code{polygons}, or the column name
#' in \code{polygons} to use as IDs.
#' @param n Number of polygons to aim to combine for each new polygon.
#' @param strict_n Logical; if \code{TRUE} (default), \code{n} represents the number of
#' intersecting polygons to select, depending on availability (with a minimum of 2).
#' If \code{FALSE}, \code{n} represents the number of nearest polygons (by centroid)
#' to use when calculating a box to use when selecting polygons (with a minimum of 1).
#' @param n_as_min Logical; if \code{TRUE}, will merge any parents with fewer than \code{n}
#' children with a random neighbor. Otherwise (and by default), parents may have fewer than \code{n}
#' children. Applies if \code{strict_n} is \code{TRUE}.
#' @param buffer_dist Distance around each initial shape or set of shapes, used to define
#' neighboring shapes. Applies if \code{strict_n} is \code{TRUE}.
#' @param min_overlap Minimal area of overlap between potential neighboring shapes and
#' the buffered target shape, used to define neighboring shapes. Defaults to a small
#' fraction (0.35\%) of the smallest polygon's area. Applies if \code{strict_n} is \code{TRUE}.
#' @param verbose Logical; if \code{FALSE}, will not print status messages.
#' @returns A list with entries for \code{new} (an \code{sf} object containing the new polygons)
#' and \code{map} (a list mapping between the old and new polygon indices).
#' @export

make_parent_polygons <- function(
    polygons, ids = NULL, n = 2, strict_n = TRUE, n_as_min = FALSE, buffer_dist = 5e-5,
    min_overlap = NULL, verbose = TRUE) {
  if (!inherits(polygons, "sf")) cli_abort("{.arg polygons} must be an sf object")
  # allow `ids` to name a column of `polygons`
  if (length(ids) == 1 && ids %in% colnames(polygons)) ids <- polygons[[ids]]
  polygons <- st_geometry(polygons)
  polygons <- st_cast(st_boundary(polygons), "POLYGON", do_split = FALSE)
  # default `min_overlap` to a small fraction of the smallest polygon's area;
  # was `is.null(NULL)` (always TRUE), which clobbered any user-supplied value
  if (strict_n && is.null(min_overlap)) min_overlap <- min(s2_area(polygons)) * .0035
  n_poly <- length(polygons)
  if (!is.null(ids)) {
    if (length(ids) != n_poly) cli_abort("{.arg ids} do not align with {.arg polygons}")
    names(polygons) <- ids
  }
  # clamp `n` to the valid range; strict mode needs at least 2 children per parent
  n <- max(if (strict_n) 2 else 1, min(n, n_poly - 1))
  sels <- seq_len(n) + 1L
  centers <- st_coordinates(suppressWarnings(st_centroid(polygons)))
  rownames(centers) <- seq_len(nrow(centers))
  map <- new <- NULL
  accounted <- integer(n_poly)
  if (verbose) {
    cli_progress_step(
      "mapping child polygons ({.field {sum(accounted)}}/{.field {n_poly}})",
      msg_done = paste0(
        "created initial map from {.field {length(map)}} parent polygons to ",
        "{.field {n_poly}} child polygons"
      ),
      spinner = TRUE
    )
  }
  # repeatedly pick a random unassigned polygon and grow a parent around it
  for (i in seq_len(n_poly)) {
    if (verbose) cli_progress_update()
    available <- which(accounted == 0L)
    if (!length(available)) break
    start <- if (length(available) == 1) available else sample(available, 1)
    if (strict_n) {
      # neighbors are unassigned polygons overlapping the buffered seed by > min_overlap
      sg <- st_buffer(polygons[[start]], buffer_dist, 1)
      ag <- polygons[available]
      pre <- st_intersects(sg, ag)[[1]]
      if (length(pre)) {
        pre <- pre[vapply(ag[pre], function(t) {
          s2_area(st_intersection(t, sg, model = "closed"))
        }, 0) > min_overlap]
      }
      set <- unique(c(start, available[pre]))
      sn <- length(set)
      if (sn < n) {
        # not enough direct neighbors; expand outward from the growing set
        available <- available[!available %in% set]
        while (length(set) < n && length(available)) {
          sg <- st_buffer(st_boundary(polygons[set])[[1]], buffer_dist, 1)
          ag <- polygons[available]
          pre <- st_intersects(sg, ag)[[1]]
          if (length(pre)) {
            pre <- pre[vapply(ag[pre], function(t) {
              s2_area(st_intersection(t, sg, model = "closed"))
            }, 0) > min_overlap]
          }
          set <- unique(c(set, available[pre]))
          if (length(set) > n) {
            set <- set[seq_len(n)]
            break
          } else {
            su <- !available %in% set
            if (all(su)) {
              # no new neighbors were found; stop growing this parent
              break
            } else {
              available <- available[su]
            }
          }
        }
      } else {
        set <- set[seq_len(n)]
      }
    } else {
      # non-strict mode: select by a bounding box sized from the n nearest centroids
      nearest <- as.integer(names(sort(
        lma_simets(centers, centers[start, ], metric = "euclidean"), TRUE
      )[sels]))
      adj <- max(abs(centers[nearest, ] - rep(unname(centers[start, ]), each = n)))
      set <- unique(c(start, available[st_intersects(
        st_polygon(list(matrix(
          rep(centers[start, ], each = 5) - adj * c(1, -1, -1, 1, 1, 1, 1, -1, -1, 1), 5
        ))), polygons[available]
      )[[1]]]))
    }
    accounted[set] <- 1L
    map <- c(list(set), map)
  }
  # normalize to a parent -> children list, keeping each child's first assignment
  map <- data.frame(
    super = rep(seq_along(map), vapply(map, length, 0)),
    sub = unlist(map, use.names = FALSE)
  )
  map <- map[!duplicated(map$sub), ]
  map <- split(map$sub, map$super)
  if (n_as_min) {
    # merge any too-small parent into a random neighboring parent
    su <- which(vapply(map, length, 0) < n)
    if (length(su)) {
      i <- 0
      defunct <- NULL
      if (verbose) {
        cli_progress_step(
          "merging in {.field {length(defunct)}}/{.field {length(su)}} parent{?/s} with too few children",
          msg_done = "merged in {.field {length(defunct)}} of {.field {length(su)}} small parent{?/s}",
          spinner = TRUE
        )
      }
      tmap <- data.frame(
        super = rep(seq_along(map), vapply(map, length, 0)),
        sub = unlist(map, use.names = FALSE)
      )
      available <- seq_along(polygons)
      for (i in su) {
        available <- available[!available %in% map[[i]]]
        sg <- st_buffer(st_boundary(polygons[map[[i]]])[[1]], buffer_dist, 1)
        ag <- polygons[available]
        pre <- st_intersects(sg, ag)[[1]]
        if (length(pre)) {
          pre <- pre[vapply(ag[pre], function(t) {
            s2_area(st_intersection(t, sg, model = "closed"))
          }, 0) > min_overlap]
        }
        if (length(pre)) {
          # adopt this parent's children into the selected neighbor's parent
          ns <- available[if (length(pre) == 1) pre else sample(pre, 1)]
          np <- tmap[[1]][tmap[[2]] == ns]
          map[[np]] <- c(map[[np]], map[[i]])
          available <- available[!available %in% map[[np]]]
          defunct <- c(defunct, i)
        }
        if (verbose) cli_progress_update()
      }
      if (length(defunct)) map <- map[-defunct]
      if (verbose) cli_progress_done()
    }
    # re-normalize after merging
    map <- data.frame(
      super = rep(seq_along(map), vapply(map, length, 0)),
      sub = unlist(map, use.names = FALSE)
    )
    map <- map[!duplicated(map$sub), ]
    map <- split(map$sub, map$super)
  }
  if (!is.null(ids)) map <- lapply(map, function(l) ids[l])
  i <- 0
  if (verbose) {
    cli_progress_step(
      "creating new polygons ({.field {i}}/{.field {length(map)}})",
      msg_done = "created {.field {length(map)}} new polygons", spinner = TRUE
    )
  }
  # union each parent's children into a single new polygon
  new <- do.call(c, lapply(map, function(l) {
    if (verbose) {
      i <<- i + 1
      cli_progress_update(.envir = parent.frame(2))
    }
    st_union(polygons[l])
  }))
  list(new = new, map = map)
}
1 |
#' Download U.S. Census Microdata |
|
2 |
#' |
|
3 |
#' Download and load U.S. census American Community Survey (ACS) Public Use Microdata Samples (PUMS): |
|
4 |
#' \href{https://www.census.gov/programs-surveys/acs/microdata.html}{census.gov/programs-surveys/acs/microdata.html} |
|
5 |
#' |
|
6 |
#' |
|
7 |
#' @param dir Directory in which to save the file(s). |
|
8 |
#' @param state Postal or FIPS code of the state. |
|
9 |
#' @param year 4 digit year, between 2005 and the most recent year. |
|
10 |
#' @param level A character indicating whether to get the person- or household-level sample. Defaults to both. |
|
11 |
#' @param one_year Logical; if \code{FALSE}, will get the 5-year estimates rather than the 1-year file. If |
|
12 |
#' not specified, will fall back to the 5-year file if the 1-year file is not available. |
|
13 |
#' @param calculate_error Logical; if \code{TRUE}, will calculate standard errors for each variable using |
|
14 |
#' the Successive Difference Replicate Formula from the |
|
15 |
#' \href{https://www.census.gov/programs-surveys/acs/library/handbooks/pums.html}{census.gov/programs-surveys/acs/library/handbooks/pums.html}. |
|
16 |
#' @param crosswalk Logical; if \code{FALSE}, will not retrieve the PUMA relationship files for associating |
|
17 |
#' Census tracts with PUM areas. This will be treated as \code{TRUE} if |
|
18 |
#' \code{geoids} is specified. |
|
19 |
#' @param geoids A vector of county, tract, or block group GEOIDs within the specified state to |
|
20 |
#' select PUM areas by; defaults to all areas. If specified, \code{crosswalk} will be treated as \code{TRUE}. |
|
21 |
#' @param verbose Logical; if \code{FALSE}, will not print status messages. |
|
22 |
#' @examples |
|
23 |
#' \dontrun{ |
|
24 |
#' download_census_pums(".", "va") |
|
25 |
#' } |
|
26 |
#' @return A list with entries for \code{year} and \code{state} (as specified), |
|
27 |
#' \code{dictionary} (containing the data dictionary for that year), \code{household} and/or \code{person} |
|
28 |
#' (with survey data), and optionally \code{household_error} and/or \code{person_error} |
|
29 |
#' (if \code{calculate_error} is \code{TRUE}), and \code{crosswalk} (if \code{crosswalk} is \code{TRUE} or |
|
30 |
#' \code{geoids} is specified). |
|
31 |
#' @export |
|
32 | ||
33 |
download_census_pums <- function(dir, state, year = 2021, level = "both", one_year = TRUE, calculate_error = FALSE, |
|
34 |
crosswalk = TRUE, geoids = NULL, verbose = TRUE) { |
|
35 | 3x |
us_fips <- list( |
36 | 3x |
post = c( |
37 | 3x |
"us", "al", "ak", "az", "ar", "ca", "co", "ct", "de", "dc", "fl", "ga", "hi", "id", "il", "in", "ia", "ks", "ky", |
38 | 3x |
"la", "me", "md", "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", "nm", "ny", "nc", "nd", "oh", "ok", |
39 | 3x |
"or", "pa", "ri", "sc", "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy", "pr" |
40 |
), |
|
41 | 3x |
fips = c( |
42 | 3x |
"us", 1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, |
43 | 3x |
33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 72 |
44 |
) |
|
45 |
) |
|
46 | 3x |
state <- gsub("^0|[^a-z0-9]", "", tolower(state[[1]])) |
47 | 3x |
fid <- which(us_fips$fips == state) |
48 | 2x |
if (!length(fid) && nchar(state) == 2) fid <- which(us_fips$post == state) |
49 | ! |
if (!length(fid) || is.na(fid)) cli_abort("failed to recognize {.arg state}") |
50 | 3x |
state <- us_fips$post[fid] |
51 | 3x |
level <- tolower(substring(level, 1, 1)) |
52 | 3x |
if (!level %in% c("h", "p", "b")) { |
53 | ! |
cli_abort( |
54 | ! |
'{.arg level} should be one of {.field "houshold"}, {.field "person"}, or {.field "both"}.' |
55 |
) |
|
56 |
} |
|
57 | 3x |
summary <- if (one_year) "1-Year" else "5-Year" |
58 | 3x |
urls <- paste0( |
59 | 3x |
"https://www2.census.gov/programs-surveys/acs/data/pums/", year, |
60 | 3x |
"/", summary, "/csv_", c("h", "p"), state, ".zip" |
61 |
) |
|
62 | 3x |
if (!is.null(dir)) { |
63 | 3x |
dir <- tdir <- paste0(normalizePath(dir, "/", FALSE), "/") |
64 | ! |
if (!dir.exists(dir)) dir.create(dir, FALSE, TRUE) |
65 |
} else { |
|
66 | ! |
tdir <- paste0(tempdir(), "/") |
67 |
} |
|
68 | 3x |
folder <- paste0(tdir, "pums_", state, "/", year, "/", summary, "/") |
69 | 3x |
dir.create(folder, FALSE, TRUE) |
70 | 3x |
res <- list(year = year, state = state, dictionary = NULL, household = NULL, person = NULL) |
71 | 3x |
dict_json <- paste0(folder, paste0( |
72 | 3x |
"PUMS_Data_Dictionary_", if (one_year) year else paste0(year - 4, "-", year), ".json" |
73 |
)) |
|
74 | 3x |
if (file.exists(dict_json)) { |
75 | 1x |
res$dictionary <- read_json(dict_json, simplifyVector = TRUE) |
76 |
} else { |
|
77 | 2x |
dict_file <- sub(".json", ".csv", dict_json, fixed = TRUE) |
78 | 2x |
if (!file.exists(dict_file)) { |
79 | 2x |
url <- paste0("https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/", basename(dict_file)) |
80 | 2x |
status <- curl_fetch_disk(url, dict_file) |
81 | ! |
if (status$status_code != 200) cli_abort("failed to retrieve data dictionary from {.url {url}}") |
82 |
} |
|
83 | 2x |
dict <- readLines(dict_file, warn = FALSE) |
84 | 2x |
res$dictionary <- lapply( |
85 | 2x |
split(dict, sub(",.*$", "", sub("^[^,]+,", "", dict, perl = TRUE), perl = TRUE)), |
86 | 2x |
function(e) { |
87 | 1042x |
h <- suppressWarnings(scan(text = e[[1]], what = "", sep = ",", quiet = TRUE)) |
88 | 1042x |
map <- as.data.frame(do.call(rbind, lapply( |
89 | 1042x |
e[-1], function(l) suppressWarnings(scan(text = l, what = "", sep = ",", quiet = TRUE))[5:7] |
90 |
))) |
|
91 | 1042x |
colnames(map) <- c("from", "to", "value") |
92 | 1042x |
list( |
93 | 1042x |
description = h[5], |
94 | 1042x |
type = if (h[3] == "C") "character" else "integer", |
95 | 1042x |
length = as.numeric(h[4]), |
96 | 1042x |
map = map |
97 |
) |
|
98 |
} |
|
99 |
) |
|
100 | 2x |
write_json(res$dictionary, dict_json, auto_unbox = TRUE) |
101 |
} |
|
102 | 3x |
files <- paste0(folder, "/", c("h", "p"), ".csv.xz") |
103 | 3x |
if (level != "b") { |
104 | ! |
lid <- (level == "p") + 1 |
105 | ! |
urls <- urls[lid] |
106 | ! |
files <- files[lid] |
107 |
} |
|
108 | 3x |
can_change <- missing(one_year) |
109 | 3x |
for (i in seq_along(urls)) { |
110 | 6x |
url <- urls[i] |
111 | 6x |
clevel <- if (grepl("csv_h", url, fixed = TRUE)) "household" else "person" |
112 | 6x |
if (!file.exists(files[i])) { |
113 | 4x |
file_zip <- paste0(folder, basename(url)) |
114 | 4x |
if (!file.exists(file_zip)) { |
115 | 4x |
if (verbose) cli_alert_info("retrieving {clevel} sample: {.url {url}}") |
116 | 4x |
status <- curl_fetch_disk(url, file_zip) |
117 | 4x |
if (status$status_code != 200 && can_change) { |
118 | ! |
new_summary <- if (one_year) "5-Year" else "1-Year" |
119 | ! |
urls <- sub(summary, new_summary, urls, fixed = TRUE) |
120 | ! |
files <- sub(summary, new_summary, files, fixed = TRUE) |
121 | ! |
summary <- new_summary |
122 | ! |
url <- urls[i] |
123 | ! |
status <- curl_fetch_disk(url, file_zip) |
124 |
} |
|
125 | 4x |
can_change <- FALSE |
126 | 4x |
if (status$status_code != 200) { |
127 | ! |
unlink(file_zip) |
128 | ! |
cli_abort(paste0( |
129 | ! |
"failed to download {.url {url}}: ", strsplit(rawToChar(status$headers), "[\r\n]")[[1]][1] |
130 |
)) |
|
131 |
} |
|
132 |
} |
|
133 | 4x |
rfile <- paste0(folder, grep(".csv", unzip(file_zip, list = TRUE)$Name, fixed = TRUE, value = TRUE)) |
134 | 4x |
unzip(file_zip, exdir = folder) |
135 | 4x |
cols <- strsplit(readLines(rfile, 1), ",")[[1]] |
136 | 4x |
types <- paste(vapply(cols, function(n) { |
137 | 1056x |
type <- res$dictionary[[n]]$type |
138 | 1056x |
if (is.null(type)) if (grepl("WGTP", n, fixed = TRUE)) "i" else "c" else substring(type, 1, 1) |
139 | 4x |
}, ""), collapse = "") |
140 | 4x |
res[[clevel]] <- vroom(rfile, col_types = types) |
141 | 4x |
vroom_write(res[[clevel]], files[i], ",") |
142 | 4x |
on.exit(unlink(rfile), add = TRUE) |
143 |
} else { |
|
144 | 2x |
if (verbose) cli_alert_info("loading {clevel} sample: {.field {basename(files[i])}}") |
145 | 2x |
cols <- strsplit(readLines(files[i], 1), ",")[[1]] |
146 | 2x |
types <- paste(vapply(cols, function(n) { |
147 | 528x |
type <- res$dictionary[[n]]$type |
148 | 528x |
if (is.null(type)) if (grepl("WGTP", n, fixed = TRUE)) "i" else "c" else substring(type, 1, 1) |
149 | 2x |
}, ""), collapse = "") |
150 | 2x |
res[[clevel]] <- vroom(files[i], col_types = types) |
151 |
} |
|
152 | 6x |
weight <- if (clevel == "person") "PWGTP" else "WGTP" |
153 | 6x |
if (calculate_error && paste0(weight, "1") %in% cols) { |
154 | 2x |
if (verbose) cli_alert_info("calculating {clevel} errors") |
155 | 2x |
vars <- cols[!grepl("^(?:RT|SERIAL|DIV|PUMA|REG|ST|ADJ)|WGTP", cols) & cols %in% names(res$dictionary)] |
156 | 2x |
names(vars) <- vars |
157 | 2x |
wv <- res[[clevel]][, weight, drop = TRUE] |
158 | 2x |
w <- as.matrix(res[[clevel]][, grep("WGTP\\d+", cols)]) |
159 | 2x |
res[[paste0(clevel, "_error")]] <- as.data.frame(do.call(cbind, lapply(vars, function(name) { |
160 | 350x |
v <- res[[clevel]][, name, drop = TRUE] |
161 | 350x |
map <- res$dictionary[[name]] |
162 | 350x |
if (map$type == "integer") { |
163 | 45x |
sqrt(.05 * rowSums((w - v * wv)^2)) |
164 |
} else { |
|
165 | 305x |
levels <- map$map$from |
166 | 305x |
names(levels) <- paste0(name, "_", levels) |
167 | 305x |
vapply(levels, function(l) { |
168 | 4787x |
sqrt(.05 * rowSums((w - (v == l) * wv)^2)) |
169 | 305x |
}, numeric(length(v))) |
170 |
} |
|
171 |
}))) |
|
172 |
} |
|
173 |
} |
|
174 | 3x |
if (crosswalk || !is.null(geoids)) { |
175 | 3x |
url <- if (as.numeric(year) > if (one_year) 2019 else 2024) { |
176 | 3x |
"https://www2.census.gov/geo/docs/maps-data/data/rel2020/2020_Census_Tract_to_2020_PUMA.txt" |
177 |
} else { |
|
178 | ! |
"https://www2.census.gov/geo/docs/maps-data/data/rel/2010_Census_Tract_to_2010_PUMA.txt" |
179 |
} |
|
180 | 3x |
crosswalk_file <- paste0(folder, basename(url)) |
181 | 3x |
if (!file.exists(crosswalk_file)) { |
182 | 2x |
if (verbose) cli_alert_info("retrieving crosswalk {.url {url}}") |
183 | 2x |
status <- curl_fetch_disk(url, crosswalk_file) |
184 | ! |
if (status$status_code != 200) cli_warn("failed to retrieve crosswalk from {.url {url}}") |
185 |
} |
|
186 | 3x |
if (file.exists(crosswalk_file)) { |
187 | 3x |
if (verbose) cli_alert_info("loading crosswalk {.field {basename(crosswalk_file)}}") |
188 | 3x |
res$crosswalk <- vroom(crosswalk_file, col_types = "cccc") |
189 | 3x |
if (!is.null(geoids)) { |
190 | 2x |
geoids <- substring(geoids, 1, 11) |
191 | 2x |
su <- do.call(paste0, res$crosswalk[, 1:2]) %in% geoids | do.call(paste0, res$crosswalk[, 1:3]) %in% geoids |
192 | 2x |
if (any(su)) { |
193 | 2x |
pids <- unique(res$crosswalk$PUMA5CE[su]) |
194 | 2x |
if (verbose) { |
195 | 2x |
cli_alert_info( |
196 | 2x |
"subsetting to {length(pids)} of {length(unique(res$crosswalk$PUMA5CE))} PUM areas" |
197 |
) |
|
198 |
} |
|
199 | 2x |
for (set in c("household", "person")) { |
200 | 4x |
su <- res[[set]]$PUMA %in% pids |
201 | 4x |
res[[set]] <- res[[set]][su, ] |
202 | 4x |
set_error <- paste0(set, "_error") |
203 | ! |
if (set_error %in% names(res)) res[[set_error]] <- res[[set_error]][su, ] |
204 |
} |
|
205 |
} else { |
|
206 | ! |
cli_warn("none of the specified {.arg geoids} were found in the crosswalk") |
207 |
} |
|
208 |
} |
|
209 |
} |
|
210 |
} |
|
211 | 3x |
res |
212 |
} |
1 |
#' Method: Proportional, resident-estimate-adjusted parcel unit counts |
|
2 |
#' |
|
3 |
#' A wrapper around the \code{\link{redistribute}} function, that |
|
4 |
#' takes PUMS data as additional input, and uses it to calculate an adjusted weight for |
|
5 |
#' redistribution to parcel data. |
|
6 |
#' |
|
7 |
#' It is assumed that initial weights are unit counts, and these are to be adjusted |
|
8 |
#' based on PUMS household data. |
|
9 |
#' |
|
10 |
#' @param source,target Source and target of \code{\link{redistribute}}. |
|
11 |
#' @param households PUMS household data. Can be a list with \code{household} and \code{person} |
|
12 |
#' entries, like that returned from \code{\link{download_census_pums}}. |
|
13 |
#' @param target_total Column name in \code{target} (e.g., parcel data; or equivalent vector) |
|
14 |
#' containing the unit count |
|
15 |
#' @param target_indicator Column name of a logical variable in \code{target} (or equivalent vector) |
|
16 |
#' to use to assign adjustment values, where \code{TRUE} indicates a single family home. |
|
17 |
#' If the values of the column are characters, anything not matching \code{"MULTI"} will be \code{TRUE}. |
|
18 |
#' @param households_indicator Same as \code{target_indicator}, but in \code{households}, or |
|
19 |
#' a vector (or column name) of BLD codes, where either \code{"02"} or \code{"03"} would be \code{TRUE}. |
|
20 |
#' @param households_size A vector of household sizes, or a column in \code{households} containing |
|
21 |
#' such values. If this is not specified or found, \code{person} will be used to calculate |
|
22 |
#' household size, based on \code{households_id} and \code{person_household_id}. |
|
23 |
#' @param households_id A vector of household IDs aligning with \code{households}, |
|
24 |
#' or a column name in \code{households} containing such IDs. |
|
25 |
#' Only used to calculate \code{households_size} if necessary. |
|
26 |
#' @param person PUMS person data. Only used to calculate \code{households_size} if necessary. |
|
27 |
#' @param person_household_id A vector of household IDs aligning with \code{person}, |
|
28 |
#' or a column name in \code{person} containing such IDs. |
|
29 |
#' Only used to calculate \code{households_size} if necessary. |
|
30 |
#' @param ... Additional arguments to pass to \code{\link{redistribute}}. You will likely need |
|
31 |
#' to specify a \code{map}, and potentially \code{source_id} and/or \code{target_id}. |
|
32 |
#' @examples |
|
33 |
#' \dontrun{ |
|
34 |
#' if (require("tidycensus")) { |
|
35 |
#' # download source, target, and household data |
|
36 |
#' tracts <- tidycensus::get_acs( |
|
37 |
#' year = 2021, |
|
38 |
#' state = "51", |
|
39 |
#' county = "013", |
|
40 |
#' geography = "tract", |
|
41 |
#' output = "wide", |
|
42 |
#' variables = c(total = "B01001_001"), |
|
43 |
#' geometry = TRUE |
|
44 |
#' ) |
|
45 |
#' parcels <- sf::st_read(paste0( |
|
46 |
#' "https://arlgis.arlingtonva.us/arcgis/rest/", |
|
47 |
#' "services/Open_Data/od_MHUD_Polygons/", |
|
48 |
#' "FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=json" |
|
49 |
#' )) |
|
50 |
#' pums <- download_census_pums(tempdir(), "51", geoids = tracts$GEOID) |
|
51 |
#' |
|
52 |
#' # calculate map between source and target |
|
53 |
#' map_tr_to_parcel <- redistribute( |
|
54 |
#' tracts, parcels, |
|
55 |
#' target_id = "OBJECTID", return_map = TRUE |
|
56 |
#' ) |
|
57 |
#' |
|
58 |
#' # redistribute tract-level summaries to parcels, |
|
59 |
#' # using a resident-estimate-adjusted weight |
|
60 |
#' parcels_filled <- redistribute_parcel_pums_adj( |
|
61 |
#' tracts[, -2], parcels, pums, |
|
62 |
#' map = map_tr_to_parcel, target_id = "OBJECTID" |
|
63 |
#' ) |
|
64 |
#' |
|
65 |
#' # this can also be calculated with the underlying function |
|
66 |
#' |
|
67 |
#' ## calculate resident estimates |
|
68 |
#' household_size <- tapply( |
|
69 |
#' table(pums$person$SERIALNO)[pums$household$SERIALNO], |
|
70 |
#' pums$household$BLD %in% c("02", "03"), |
|
71 |
#' mean, |
|
72 |
#' na.rm = TRUE |
|
73 |
#' ) |
|
74 |
#' residents <- parcels$Total_Units * household_size[ |
|
75 |
#' as.character(parcels$Unit_Type != "MULTI") |
|
76 |
#' ] |
|
77 |
#' |
|
78 |
#' ## supply as weights |
|
79 |
#' parcels_filled_manual <- redistribute( |
|
80 |
#' tracts[, -2], parcels, map_tr_to_parcel, |
|
81 |
#' target_id = "OBJECTID", weight = residents |
|
82 |
#' ) |
|
83 |
#' |
|
84 |
#' # either way, you could now use the result to make estimates |
|
85 |
#' # at higher-level geographies by redistributing from the |
|
86 |
#' # parcel-level estimates to the new target |
|
87 |
#' } |
|
88 |
#' } |
|
89 |
#' @returns Result of \code{\link{redistribute}}. These are assumed to be parcel-level |
|
90 |
#' estimates, which could then be aggregated to make higher-level estimates. |
|
91 |
#' @export |
|
92 | ||
93 |
redistribute_parcel_pums_adj <- function(
    source, target, households, target_total = "Total_Units", target_indicator = "Unit_Type",
    households_size = "size", households_indicator = "BLD", households_id = "SERIALNO",
    person = NULL, person_household_id = "SERIALNO", ...) {
  # Accept a list like that returned from download_census_pums, with `household`
  # and (optionally) `person` entries.
  if (is.null(dim(households)) && !is.null(households$household)) {
    if (is.null(person)) person <- households$person
    households <- households$household
  }
  hn <- nrow(households)
  hcols <- colnames(households)
  tn <- nrow(target)
  tcols <- colnames(target)

  # resolve main vectors: each of target_total / target_indicator /
  # households_indicator may be a column name or an aligned vector
  if (is.character(target_total) && length(target_total) == 1) {
    if (!target_total %in% tcols) cli_abort("{.arg target_total} ({target_total}) is not in {.arg target}")
    target_total <- target[[target_total]]
  } else if (length(target_total) != tn) cli_abort("{.arg target_total} is not the same length as {.code nrow(target)}")
  if (is.character(target_indicator) && length(target_indicator) == 1) {
    if (!target_indicator %in% tcols) cli_abort("{.arg target_indicator} ({target_indicator}) is not in {.arg target}")
    target_indicator <- target[[target_indicator]]
  } else if (length(target_indicator) != tn) cli_abort("{.arg target_indicator} is not the same length as {.code nrow(target)}")
  if (is.character(households_indicator) && length(households_indicator) == 1) {
    if (!households_indicator %in% hcols) {
      cli_abort("{.arg households_indicator} ({households_indicator}) is not in {.arg households}")
    }
    households_indicator <- households[[households_indicator]]
  } else if (length(households_indicator) != hn) {
    cli_abort("{.arg households_indicator} is not the same length as {.code nrow(households)}")
  }

  # coerce indicators to logical (TRUE = single-family home): numeric 1 is TRUE;
  # character values not starting with "multi" (case-insensitive) are TRUE
  if (!is.logical(target_indicator)) {
    target_indicator <- if (is.numeric(target_indicator)) {
      target_indicator == 1
    } else {
      !grepl("^multi", target_indicator, TRUE)
    }
  }
  if (!is.logical(households_indicator)) {
    households_indicator <- if (is.numeric(households_indicator)) {
      households_indicator == 1
    } else {
      # BLD codes: "02" (single-family detached) / "03" (single-family attached)
      # NOTE(review): requires both codes to be present to treat the vector as BLD
      # codes, though the docs say "either" -- confirm intended
      if (all(c("02", "03") %in% households_indicator)) {
        grepl("^0[23]$", households_indicator)
      } else {
        !grepl("^multi", households_indicator, TRUE)
      }
    }
  }

  # resolve household size: a column name, a directly-supplied vector, or
  # (failing those) person counts tabulated by household ID
  size <- NULL
  if (length(households_size) != 1 && length(households_size) != hn) {
    cli_abort("{.arg households_size} is not the same length as {.code nrow(households)}")
  }
  if (length(households_id) != 1 && length(households_id) != hn) {
    cli_abort("{.arg households_id} is not the same length as {.code nrow(households)}")
  }
  if (is.character(households_size) && length(households_size) == 1) {
    if (households_size %in% hcols) {
      size <- households[[households_size]]
    } else {
      if (length(households_id) == 1 && households_id %in% hcols) households_id <- households[[households_id]]
      if (!is.null(person) && length(person_household_id) == 1 && person_household_id %in% colnames(person)) {
        person_household_id <- person[[person_household_id]]
      } else if (length(person_household_id) == 1) {
        cli_abort("{.arg person_household_id} ({person_household_id}) is not in {.arg person}")
      }
      if (!any(households_id %in% person_household_id)) {
        cli_abort("no {.arg households_id}s are present in {.arg person_household_id}")
      }
      # household size = number of person rows sharing the household's ID
      size <- table(person_household_id)[households_id]
    }
  } else if (length(households_size) == hn) {
    # households_size was supplied directly as a vector of sizes (previously this
    # documented input was ignored, and resolution always failed)
    size <- households_size
  }
  if (is.null(size)) cli_abort("failed to resolve {.arg households_size}")

  # resident estimate per parcel: unit count times the mean household size of the
  # matching (single- vs multi-family) household group
  residents <- target_total * tapply(
    size, households_indicator, mean,
    na.rm = TRUE
  )[as.character(target_indicator)]
  redistribute(source, target, weight = residents, ...)
}
1 |
// [[Rcpp::depends(RcppParallel)]] |
|
2 |
#include <Rcpp.h> |
|
3 |
#include <RcppParallel.h> |
|
4 |
using namespace std; |
|
5 |
using namespace Rcpp; |
|
6 |
using namespace RcppParallel; |
|
7 | ||
8 | 11980x |
const int select(const vector<int> &local, const NumericVector &base) { |
9 | 11980x |
const int n = local.size(); |
10 | 11980x |
int sel = 0; |
11 | 11980x |
double prob, sel_prob = 0; |
12 | 83808x |
for (int i = n; i--;) { |
13 | 71828x |
prob = R::rbinom(n, local[i] ? local[i] / n * base[i] : base[i]); |
14 | 71828x |
if (prob > sel_prob) { |
15 | 7851x |
sel = i; |
16 | 7851x |
sel_prob = prob; |
17 |
} |
|
18 |
} |
|
19 | 11980x |
return sel; |
20 |
} |
|
21 | ||
22 |
// Parallel worker that fills the flat `individuals` matrix (column-major, N rows x
// 7 columns: household, person, neighbors, age, sex, race, income) one household
// at a time. For each household it finds up to n_neighbors similar households via
// the sparse `space` similarity matrix, summarizes any already-generated neighbor
// members, then generates each member's demographics conditioned on those
// summaries and on household head values.
// NOTE(review): calls R's RNG (R::rnorm, R::rbeta, R::rbinom, R::runif) and
// checkUserInterrupt from worker threads, which the R API does not guarantee to be
// thread-safe under RcppParallel -- confirm intended.
struct Generate : public Worker {
  RVector<double> individuals;
  // per-household [start, end) row ranges into `individuals`
  const vector<int> start, end;
  const IntegerVector region, head_income, size, renting, space_i, space_p;
  // space_*: dgCMatrix slots (column pointers p, row indices i, values x) of
  // region-by-region similarities; race_rates: base rate per race group
  const NumericVector space_x, race_rates;
  const int N, n_neighbors, n_races = race_rates.length(), nh = region.length();
  Generate(
    NumericVector &individuals, const vector<int> &start, const vector<int> &end,
    const IntegerVector &region, const IntegerVector &head_income, const IntegerVector &size,
    const IntegerVector &renting, const S4 &space, const NumericVector &race_rates,
    const int &N, const double &n_neighbors
  ):
    individuals(individuals), start(start), end(end), region(region),
    head_income(head_income), size(size), renting(renting), space_i(space.slot("i")),
    space_p(space.slot("p")), space_x(space.slot("x")), race_rates(race_rates), N(N),
    n_neighbors(n_neighbors)
  {}
  // Process households with indices in [s, e).
  void operator()(size_t s, size_t e){
    int ck = 1e3;
    // neighbors holds summary stats (0: mean age, 1: count of sex==1, 2: modal
    // race, 3: mean income); race_table tabulates neighbor races
    // NOTE(review): neighbors is sized n_neighbors but indexed up to [3] -- this
    // reads out of bounds if n_neighbors < 4; confirm callers guarantee >= 4
    vector<int> neighbors(n_neighbors), race_table(n_races);
    for (; s < e; s++) {
      const int ci = s, r = region[s] - 1, inc = head_income[s], n = size[s],
        ro = renting[s], ie = end[s];
      // nhi/nhn: bounds of region r's column in the sparse similarity matrix
      int i = 0, nn = 0, nni, nhi = space_p[r], nhn = space_p[r + 1], h1, h2;
      double ns = 0;

      // summarize neighbors
      vector<int> neighbor_index;
      vector<double> neighbor_sims;
      // find the n_neighbors most similar other households: first fill the list,
      // then replace any entry with lower (jittered) similarity
      for (; i < nh; i++) {
        if (i != ci) {
          const int nr = region[i] - 1;
          ns = 0;
          // look up similarity between region r and candidate's region nr
          for (nni = nhi; nni < nhn; nni++) {
            if (space_i[nni] == nr) {
              ns = space_x[nni];
              break;
            }
          }
          if (ns) {
            if (nn < n_neighbors) {
              neighbor_index.push_back(i);
              neighbor_sims.push_back(ns);
              nn++;
            } else {
              // jitter to break ties, then displace the first weaker entry
              ns += R::rnorm(0, 1e-5);
              for (nni = nn; nni--;) {
                if (ns > neighbor_sims[nni]) {
                  neighbor_index[nni] = i;
                  neighbor_sims[nni] = ns;
                  break;
                }
              }
            }
          }
        }
      }
      if (nn) {
        int ni = n_neighbors;
        for (; ni--;) neighbors[ni] = 0;
        for (ni = n_races; ni--;) race_table[ni] = 0;
        // see if neighbors have been filled (household column nonzero), and
        // summarize over their first (up to 2) household members; nn is reset to
        // count summarized members
        for (ni = nn, nn = 0; ni--;) {
          nni = neighbor_index[ni], nhi = start[nni], nhn = min(nhi + 2, end[nni]);
          if (individuals[nhi]) {
            for (; nhi < nhn; nhi++) {
              neighbors[0] += individuals[nhi + 3 * N]; // sum age
              neighbors[1] += individuals[nhi + 4 * N]; // count of sex == 1
              race_table[individuals[nhi + 5 * N]]++;
              neighbors[3] += individuals[nhi + 6 * N]; // sum income
              nn++;
            }
          }
        }
        if (nn) {
          // finalize neighbor summaries: means for age/income, mode for race
          ni = n_races, i = 0;
          for (int m = -1; ni--;) {
            if (race_table[ni] > m) {
              i = ni;
              m = race_table[ni];
            }
          }
          neighbors[0] /= nn;
          neighbors[2] = i;
          neighbors[3] /= nn;
        }
      }

      // fill out household members; h1/h2 are the head's and partner's rows
      int hage = 0;
      for (i = start[s], h1 = i, h2 = i + 1; i < ie; i++) {
        // set household and individual ids (1-based)
        const int row = i;
        individuals[row] = s + 1;
        individuals[row + N] = row + 1;

        // set number of summarized neighbor members (col 3)
        individuals[row + 2 * N] = nn;

        // select age (col 4): head from a Beta draw (shifted toward the neighbor
        // mean when available); partner near the head's age; others below it
        if (row == h1) {
          hage = floor(R::rbeta(1, ro ? 2 : 1.5) * 80 + 18);
          if (nn) {
            hage = max(18, (int)ceil((hage + R::rnorm(neighbors[0], 5)) / 2));
          }
          individuals[row + 3 * N] = hage;
        } else if (row == h2) {
          individuals[row + 3 * N] = max(18, min((int)floor(R::rnorm(hage, hage > 40 ? 15 : 5)), 90));
        } else {
          individuals[row + 3 * N] = ceil(R::runif(0, 1) * (hage - 15));
        }

        // select sex (col 5): partner matches the head with 10% probability,
        // otherwise opposite; everyone else is a fair draw
        if (row == h2) {
          individuals[row + 4 * N] = R::rbinom(1, .1) ? individuals[row - 1 + 4 * N] : (int)!individuals[row - 1 + 4 * N];
        } else {
          individuals[row + 4 * N] = R::rbinom(1, .5);
        }

        // select race (col 6): head sampled from neighbor-adjusted base rates;
        // partner copies the head with 70% probability; others copy a parent
        if (row == h1) {
          individuals[row + 5 * N] = select(race_table, race_rates);
        } else if (row == h2) {
          individuals[row + 5 * N] = R::rbinom(1, .7) ? individuals[row - 1 + 5 * N] : select(race_table, race_rates);
        } else {
          individuals[row + 5 * N] = individuals[row - (1 + R::rbinom(1, .5)) + 5 * N];
        }

        // select income (col 7): head gets the pre-generated household income;
        // partner earns only if the head is below the neighbor mean (or 4e4
        // fallback) or the household has exactly 2 members; others get 0
        if (row == h1) {
          individuals[row + 6 * N] = inc;
        } else if (row == h2) {
          individuals[row + 6 * N] = inc < (nn ? neighbors[3] : 4e4) || n == 2 ? max(0, (int)floor(R::rnorm(inc, 2e4))) : 0;
        }
      }
      // periodically allow the user to interrupt long runs
      if (!--ck) {
        checkUserInterrupt();
        ck = 1e3;
      }
    }
  }
};
|
166 | ||
167 |
// [[Rcpp::export]] |
|
168 | 2x |
// Generate individuals for a set of households (see Generate for the per-member
// logic). Returns an n-individuals x 7 matrix with columns household, person,
// neighbors, age, sex, race, income.
NumericVector generate_individuals(
    const IntegerVector &region, const IntegerVector &head_income, const IntegerVector &size,
    const IntegerVector &renting, const S4 &space, const int &n_neighbors,
    const NumericVector &race_rates
) {
  const int n_households = size.length(), nvars = 7;

  // cumulative household sizes give each household its [start, end) row range
  vector<int> start(n_households), end(n_households);
  int total = 0;
  for (int h = 0; h < n_households; h++) {
    start[h] = total;
    total += size[h];
    end[h] = total;
  }

  // flat column-major storage, reshaped into a matrix below
  NumericVector individuals(total * nvars);
  Generate worker(
    individuals, start, end, region, head_income, size, renting, space, race_rates,
    total, n_neighbors
  );
  parallelFor(0, n_households, worker);

  individuals.attr("dim") = Dimension(total, nvars);
  individuals.attr("dimnames") = List::create(R_NilValue, CharacterVector::create(
    "household", "person", "neighbors", "age", "sex", "race", "income"
  ));
  return individuals;
}
1 |
// [[Rcpp::depends(RcppParallel)]] |
|
2 |
#include <Rcpp.h> |
|
3 |
#include <RcppParallel.h> |
|
4 |
#include <unordered_map> |
|
5 |
using namespace std; |
|
6 |
using namespace Rcpp; |
|
7 |
using namespace RcppParallel; |
|
8 | ||
9 | 2x |
const int aggregate_mode( |
10 |
const int &os, const RMatrix<double> &source, const int &tstart, const int &tend, |
|
11 |
const vector<int> &source_rows, const vector<double> &iw |
|
12 |
) { |
|
13 | 2x |
bool any = false; |
14 | 2x |
double max = 0; |
15 | 2x |
int max_cat = 0; |
16 | 2x |
unordered_map<int, double> acc; |
17 | 12x |
for (int i = tstart, r; i < tend; i++) { |
18 | 10x |
r = source_rows[i]; |
19 | 10x |
const int cat = source[r + os]; |
20 | 10x |
if (isfinite(cat)) { |
21 | 10x |
any = true; |
22 | 10x |
if (acc.find(cat) == acc.end()) { |
23 | 2x |
acc.insert({cat, iw[i]}); |
24 | 10x |
} else acc.at(cat) += iw[i]; |
25 | 10x |
const double count = acc.at(cat); |
26 | 10x |
if (count > max) { |
27 | 10x |
max = count; |
28 | 10x |
max_cat = cat; |
29 |
} |
|
30 |
} |
|
31 |
} |
|
32 | 2x |
return any ? max_cat : NA_INTEGER; |
33 |
} |
|
34 | ||
35 | ! |
const double aggregate_sum( |
36 |
const int &os, const RMatrix<double> &source, const int &tstart, const int &tend, |
|
37 |
const vector<int> &source_rows, const RVector<double> &w, const vector<double> &iw |
|
38 |
) { |
|
39 | ! |
bool any = false; |
40 | ! |
double sum = 0; |
41 | ! |
for (int i = tstart, r; i < tend; i++) { |
42 | ! |
r = source_rows[i]; |
43 | ! |
const double v = source[r + os]; |
44 | ! |
if (isfinite(v)) { |
45 | ! |
any = true; |
46 | ! |
sum += v * iw[i] * w[r]; |
47 |
} |
|
48 |
} |
|
49 | ! |
return any ? sum : NA_REAL; |
50 |
} |
|
51 | ||
52 | 49114x |
// Proportional aggregation: sum the target's mapped source values, each scaled by
// the intersection weight, the target's weight (w[s], indexed by target id s), and
// the reciprocal of the source row's total weight (totals[r]); NA_REAL when no
// mapped value is finite. `os` is the column offset into the flat source matrix.
// NOTE(review): rows whose totals[r] is 0 still set `any` but contribute nothing,
// yielding 0 rather than NA -- confirm intended.
const double proportional_aggregate(
    const int &os, const RMatrix<double> &source, const int &tstart, const int &tend,
    const vector<int> &source_rows, const RVector<double> &w, const vector<double> &iw,
    const vector<double> &totals, const size_t &s
) {
  bool any = false;
  double sum = 0;
  for (int i = tstart, r; i < tend; i++) {
    r = source_rows[i];
    const double v = source[r + os];
    if (isfinite(v)) {
      any = true;
      // guard against division by a zero source-weight total
      if (totals[r]) sum += v * iw[i] * w[s] / totals[r];
    }
  }
  return any ? sum : NA_REAL;
}
|
69 | ||
70 | 6x |
void proportional_categoric( |
71 |
const int &os, const int &s, const int &tstart, const int &tend, const vector<int> &target_rows, |
|
72 |
RVector<double> &v, const vector<double> &iw |
|
73 |
) { |
|
74 | 6x |
const int val = isfinite(s) ? s : NA_REAL; |
75 | 36x |
for (int i = tstart, r; i < tend; i++) { |
76 | 30x |
r = target_rows[i]; |
77 | 30x |
v[r + os] = val; |
78 |
} |
|
79 |
} |
|
80 | ||
81 | 54x |
void proportional_numeric( |
82 |
const int &os, const double &s, const int &tstart, const int &tend, const vector<int> &target_rows, |
|
83 |
const RVector<double> &w, const double &total, RVector<double> &v, const vector<double> &iw |
|
84 |
) { |
|
85 | 686x |
for (int i = tstart, r; i < tend; i++) { |
86 | 632x |
r = target_rows[i]; |
87 | 632x |
v[r + os] = isfinite(s) ? s * iw[i] * w[r] / total : NA_REAL; |
88 |
} |
|
89 |
} |
|
90 | ||
91 |
// Parallel worker that redistributes each variable (column) of `source` into `res`,
// either aggregating many source rows into each target (agg) or disaggregating
// each source row across its targets (!agg). Dispatch per column is driven by
// `method`: 10 = mode, 11 = weighted sum, 12 = proportional aggregate;
// 0 = categorical copy, 1 = proportional numeric spread.
struct Distribute : public Worker {
  RVector<double> res;
  const RMatrix<double> source;
  // start/end: per-unit [start, end] spans into `index` (-1 = no mapping);
  // index: mapped counterpart rows; index_weight: intersection weights
  const vector<int> start, end, index;
  const RVector<double> weight;
  const vector<double> index_weight, weight_totals;
  const RVector<int> method;
  const bool agg;
  // n: row count of the side being read; on: number of units iterated over
  const int ncol = method.length(), n = agg ? source.nrow() : weight.length(), on = start.size();
  Distribute(
    NumericVector &res, const NumericMatrix &source,
    const vector<int> &start, const vector<int> &end, const vector<int> &index,
    const NumericVector &weight, const vector<double> &index_weight, const vector<double> &weight_totals,
    const IntegerVector &method, const bool &agg
  ):
    res(res), source(source), start(start), end(end), index(index), weight(weight),
    index_weight(index_weight), weight_totals(weight_totals), method(method), agg(agg)
  {}
  // Process units (targets when agg, sources otherwise) with indices in [s, e).
  void operator()(size_t s, size_t e){
    int c = 0;
    for (; s < e; s++) {
      // se is exclusive (stored end is inclusive); ss == -1 marks "no mapping"
      const int ss = start[s], se = end[s] + 1;
      if (ss > -1) {
        if (agg) {
          for (c = 0; c < ncol; c++) {
            // os: flat column offset into source; s + c * on indexes res
            const int os = c * n;
            switch(method[c]) {
            case 10:
              res[s + c * on] = aggregate_mode(os, source, ss, se, index, index_weight);
              break;
            case 11:
              res[s + c * on] = aggregate_sum(os, source, ss, se, index, weight, index_weight);
              break;
            case 12:
              res[s + c * on] = proportional_aggregate(
                os, source, ss, se, index, weight, index_weight, weight_totals, s
              );
              break;
            }
          }
        } else {
          // disaggregation: spread source row s across its mapped target rows
          const double total = weight_totals[s];
          const RMatrix<double>::Row source_row = source.row(s);
          for (c = 0; c < ncol; c++) {
            const int os = c * n;
            switch(method[c]) {
            case 0:
              proportional_categoric(os, source_row[c], ss, se, index, res, index_weight);
              break;
            case 1:
              // skip rows with zero total weight to avoid division by zero
              if (total) {
                proportional_numeric(os, source_row[c], ss, se, index, weight, total, res, index_weight);
              }
              break;
            }
          }
        }
      }
    }
  }
};
|
152 | ||
153 |
// [[Rcpp::export]] |
|
154 | 22x |
// Redistribute source values to targets. `map` is a list (one entry per source
// row) of named numeric vectors: names are target IDs, values are intersection
// weights. Translates the map into flat index arrays, then runs the Distribute
// worker in parallel. When `agg`, many sources aggregate into each target; when
// `balance`, intersection weights are normalized to sum to 1 within each target.
// Returns a target_n x nvars matrix.
NumericVector process_distribute(
    const NumericMatrix &s, const IntegerVector &method, const CharacterVector &tid,
    const NumericVector &weight, const List &map, const bool &agg, const bool &balance
) {
  const int source_n = s.nrow(), nvars = s.ncol(), target_n = tid.length();
  NumericVector res(nvars * target_n);

  // translate map into per-unit spans: start/end index into `index` (mapped
  // counterpart rows) and `index_weight`; -1 sentinels mark unmapped units
  const int iter_max = agg ? target_n : source_n;
  vector<int> start(iter_max), end(iter_max), index;
  vector<double> weight_totals(source_n), index_weight;
  if (agg) {
    // invert the source -> target map to target -> sources
    unordered_map<String, IntegerVector> imap;
    unordered_map<String, NumericVector> wmap;
    unordered_map<String, double> tmap;
    int i, t;
    for (i = 0; i < target_n; i++) {
      IntegerVector indices;
      NumericVector indices_weight;
      const String id = tid[i];
      imap.insert({id, indices});
      wmap.insert({id, indices_weight});
      // tmap accumulates per-target weight totals when balancing; a fixed 1
      // (no-op divisor) otherwise
      tmap.insert({id, balance ? 0 : 1});
    }
    for (i = 0; i < source_n; i++) {
      weight_totals[i] = 0;
      const NumericVector tidws = map[i];
      const int n = tidws.length();
      if (n) {
        const CharacterVector tids = tidws.names();
        for (t = 0; t < n; t++) {
          const String id = tids[t];
          // ignore map entries whose target ID is not in tid
          if (imap.find(id) != imap.end()) {
            const double itw = tidws[t];
            imap.at(id).push_back(i);
            wmap.at(id).push_back(itw);
            if (balance) tmap.at(id) += itw;
          }
        }
      }
    }
    // flatten the inverted map into index/index_weight spans per target, and
    // accumulate each source row's total outgoing weight
    for (i = 0; i < target_n; i++) {
      const String id = tid[i];
      const IntegerVector indices = imap.at(id);
      const NumericVector indices_weight = wmap.at(id);
      const double weight_total = tmap.at(id);
      const int n = indices.length();
      bool any = false;
      start[i] = index.size();
      for (t = 0; t < n; t++) {
        const int si = indices[t];
        double wi = indices_weight[t];
        // normalize intersection weights within the target when balancing
        if (balance && weight_total != 1) wi /= weight_total;
        any = true;
        index.push_back(si);
        index_weight.push_back(wi);
        weight_totals[si] += wi * weight[i];
      }
      if (any) {
        end[i] = index.size() - 1; // inclusive end
      } else {
        start[i] = end[i] = -1;
      }
    }
  } else {
    // disaggregation: keep the map's source -> targets direction, resolving
    // target IDs to row positions
    for (int i = 0, t; i < source_n; i++) {
      weight_totals[i] = 0;
      const NumericVector tidws = map[i];
      if (tidws.length()) {
        const CharacterVector tids = tidws.names();
        const IntegerVector indices = match(tids, tid) - 1; // 0-based; -1 = no match
        const int n = indices.length();
        bool any = false;
        start[i] = index.size();
        for (t = 0; t < n; t++) {
          const int si = indices[t];
          if (si > -1 && si < target_n) {
            any = true;
            index.push_back(si);
            index_weight.push_back(tidws[t]);
            // total target weight mapped from this source row (disaggregation
            // divisor); NOTE(review): ignores intersection weights -- confirm
            weight_totals[i] += weight[si];
          }
        }
        if (any) {
          end[i] = index.size() - 1; // inclusive end
        } else {
          start[i] = end[i] = -1;
        }
      } else {
        start[i] = end[i] = -1;
      }
    }
  }

  // process in parallel over targets (agg) or sources (!agg)
  Distribute distributed(res, s, start, end, index, weight, index_weight, weight_totals, method, agg);
  checkUserInterrupt();
  parallelFor(0, iter_max, distributed);

  res.attr("dim") = Dimension(target_n, nvars);
  return res;
}