1 |
#' Search PatentsView |
|
2 |
#' |
|
3 |
#' Submits a query to one of the \href{https://patentsview.org}{PatentsView} endpoints. |
|
4 |
#' |
|
5 |
#' @param query A query string or list; for example \code{list(patent_number = 7861317)} or |
|
6 |
#' \code{'{"_text_all": {"patent_abstract": ["photographic", "noodle"]}}'}. See |
|
7 |
#' \href{https://patentsview.org/apis/api-query-language}{API Query Language}. |
|
8 |
#' @param fields A vector of fields to return; if \code{NULL} (default), requests all fields |
|
9 |
#' available for the given endpoint. |
|
10 |
#' @param sort A list or list of lists specifying how to sort results (e.g., \code{list(patent_date = "asc")}). |
|
11 |
#' @param outDir Directory in which to save results; defaults to a temporary directory. |
|
12 |
#' @param include_related Logical; if \code{TRUE}, includes subentities related to entities matching the search criteria. |
|
13 |
#' @param start Result of the search to start from. |
|
14 |
#' @param limit Maximum number of results to return; sets page size, up to 10,000 per page |
|
15 |
#' (1,000 if \code{legacy} is \code{FALSE}). |
|
16 |
#' @param endpoint Name of the API endpoint to search in, as listed on |
|
17 |
#' \href{https://patentsview.org/apis/api-endpoints}{PatentsView}. |
|
18 |
#' @param post Logical; if \code{FALSE}, will make a GET rather than POST request. |
|
19 |
#' @param retry Maximum number of times to retry a request, in the event of a rate limit. |
|
20 |
#' @param verbose Logical; if \code{FALSE}, does not print status messages. |
|
21 |
#' @param legacy Logical; if \code{FALSE}, will use the beta API, which requires a key. |
|
22 |
#' @param key Beta API key (\href{https://patentsview.org/apis/keyrequest}{Request a key}); |
|
23 |
#' defaults to the \code{PATENTSVIEW_KEY} environment variable. |
|
24 |
#' @param cores Number of CPU cores to use when retrieving multiple pages of results. |
|
25 |
#' @param overwrite Logical; if \code{TRUE}, overwrites an existing result. |
|
26 |
#' @return A \code{data.frame} of results, if any were found. This may contain list entries |
|
27 |
#' of \code{data.frame} with varying dimensions, depending on the \code{endpoint} and \code{fields}. |
|
28 |
#' @examples |
|
29 |
#' \dontrun{ |
|
30 |
#' # search for patents with a meteorology classification: |
|
31 |
#' # https://www.uspto.gov/web/patents/classification/cpc/html/cpc-G01W.html |
|
32 |
#' results <- uspto_search_patentsview(list(cpc_group_id = "G01W")) |
|
33 |
#' |
|
34 |
#' # search by date and keyword: |
|
35 |
#' results <- uspto_search_patentsview(list( |
|
36 |
#' "_and" = list( |
|
37 |
#' list(patent_date = "2002-01-08"), |
|
38 |
#' list("_text_any" = list(patent_abstract = "motorcycle")) |
|
39 |
#' ) |
|
40 |
#' )) |
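#' |
#' # a sketch of sorting and limiting results (sort and limit as documented above): |
#' results <- uspto_search_patentsview( |
#'   list(cpc_group_id = "G01W"), |
#'   sort = list(patent_date = "desc"), limit = 100 |
#' ) |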
|
41 |
#' } |
|
42 |
#' @export |
|
43 | ||
44 |
uspto_search_patentsview <- function(query, fields = NULL, sort = NULL, outDir = tempdir(), include_related = FALSE, start = 1, |
|
45 |
limit = 1e4, endpoint = "patent", post = TRUE, retry = 10, verbose = FALSE, legacy = TRUE, |
|
46 |
key = Sys.getenv("PATENTSVIEW_KEY"), cores = detectCores() - 1, overwrite = FALSE) { |
|
47 | ! |
if (key == "") legacy <- TRUE |
48 | 16x |
if (legacy) { |
49 | 4x |
page_limit <- 1e4 |
50 | 4x |
if (is.null(fields)) { |
51 | 4x |
all_fields <- GET(paste0( |
52 | 4x |
"https://raw.githubusercontent.com/PatentsView/PatentsView-API/master/querymodule/tests/testspec/", |
53 | 4x |
endpoint, "_master_field_list.json" |
54 |
)) |
|
55 | ! |
if (all_fields$status_code != 200) stop(endpoint, " is not a recognized endpoint", call. = FALSE) |
56 | 4x |
all_fields <- fromJSON(rawToChar(all_fields$content))$fields |
57 |
} |
|
58 | 4x |
if (!grepl("s$", endpoint)) endpoint <- paste0(endpoint, "s") |
59 | 4x |
url <- paste0("https://api.patentsview.org/", endpoint, "/query") |
60 | 4x |
per_page <- min(page_limit, limit) |
61 | 4x |
start <- ceiling(start / per_page) |
62 | 4x |
options <- list(page = start, per_page = per_page, matched_subentities_only = include_related) |
63 |
} else { |
|
64 | 12x |
page_limit <- 1e3 |
65 | 12x |
url <- paste0("https://search.patentsview.org/api/v1/", endpoint, "/") |
66 | 12x |
all_fields <- switch(endpoint, |
67 | 12x |
patent = c( |
68 | 12x |
"patent_number", "patent_title", "patent_date", "patent_type", "patent_country", "patent_year", |
69 | 12x |
"patent_abstract", "patent_kind", "patent_num_foreign_documents_cited", "patent_num_us_applications_cited", |
70 | 12x |
"patent_us_patents_cited", "patent_num_total_documents_cited", "patent_num_times_cited_by_us_patents", |
71 | 12x |
"patent_earliest_application_date", "patent_patent_processing_days", "patent_uspc_current_mainclass_average_patent_processing_days", |
72 | 12x |
"patent_cpc_current_group_average_patent_processing_days", "patent_term_extension", "patent_detail_desc_length", |
73 | 12x |
"assignees_at_grant", "inventors_at_grant", "cpc_current" |
74 |
), |
|
75 | 12x |
inventor = c( |
76 | 12x |
"inventor_id", "name_first", "name_last", "lastknown_city", "lastknown_state", "lastknown_country", |
77 | 12x |
"first_seen_date", "last_seen_date", "num_assignees", "num_patents", "years_active", "inventor_years" |
78 |
), |
|
79 | 12x |
assignee = c( |
80 | 12x |
"assignee_id", "first_seen_date", "last_seen_date", "lastknown_city", "lastknown_country", "lastknown_latitude", |
81 | 12x |
"lastknown_longitude", "lastknown_state", "name_first", "name_last", "num_inventors", "num_patents", "organization", |
82 | 12x |
"type", "years_active", "assignee_years" |
83 |
), |
|
84 | 12x |
cpc_subsection = c( |
85 | 12x |
"cpc_section_id", "cpc_subsection_id", "cpc_subsection_title", "cpc_subsection_num_patents", "cpc_subsection_num_assignees", |
86 | 12x |
"cpc_subsection_num_inventors", "cpc_subsection_first_seen_date", "cpc_subsection_last_seen_date", "cpc_subsection_years_active" |
87 |
), |
|
88 | 12x |
cpc_group = c( |
89 | 12x |
"cpc_group_id", "cpc_group_title", "cpc_group_num_patents", "cpc_group_num_assignees", "cpc_group_num_inventors", |
90 | 12x |
"cpc_group_first_seen_date", "cpc_group_last_seen_date", "cpc_group_years_active" |
91 |
), |
|
92 | 12x |
cpc_subgroup = c("cpc_subgroup_id", "cpc_subgroup_title"), |
93 | 12x |
uspc_mainclass = c( |
94 | 12x |
"uspc_mainclass_id", "uspc_mainclass_title", "uspc_mainclass_num_patents", "uspc_mainclass_num_assignees", |
95 | 12x |
"uspc_mainclass_num_inventors", "uspc_mainclass_first_seen_date", "uspc_mainclass_last_seen_date", "uspc_mainclass_years_active" |
96 |
), |
|
97 | 12x |
uspc_subclass = c("uspc_subclass_id", "uspc_subclass_title"), |
98 | 12x |
nber_category = c("nber_category_id", "nber_category_title"), |
99 | 12x |
nber_subcategory = c( |
100 | 12x |
"nber_subcategory_id", "nber_subcategory_title", "nber_subcategory_num_patents", |
101 | 12x |
"nber_subcategory_num_assignees", "nber_subcategory_num_inventors", "nber_subcategory_first_seen_date", |
102 | 12x |
"nber_subcategory_last_seen_date", "nber_subcategory_years_active" |
103 |
), |
|
104 | 12x |
patent_citation = c("patent_number", "cited_patent_number", "citation_category", "citation_sequence", "citation_date"), |
105 | 12x |
application_citation = c( |
106 | 12x |
"patent_number", "cited_application_number", "citation_category", "citation_kind", "citation_sequence", "citation_date" |
107 |
), |
|
108 | 12x |
NULL |
109 |
) |
|
110 | ! |
if (is.null(all_fields)) stop(endpoint, " is not a recognized endpoint", call. = FALSE) |
111 | 12x |
per_page <- min(page_limit, limit) |
112 | 12x |
start <- ceiling(start / per_page) |
113 | 12x |
options <- list(size = per_page, matched_subentities_only = include_related) |
114 |
} |
|
115 | ! |
if (is.character(query)) query <- jsonlite::fromJSON(query) |
116 | 16x |
fs <- if (is.null(fields)) all_fields else fields |
117 | 16x |
rf <- if (post) { |
118 | 14x |
function(page, attempt = 1) { |
119 | 13x |
if (is.null(options$page)) { |
120 | 11x |
options$offset <- page * 1e3 |
121 |
} else { |
|
122 | 2x |
options$page <- page + 1 |
123 |
} |
|
124 | 13x |
body <- list( |
125 | 13x |
q = query, |
126 | 13x |
f = fs, |
127 | 13x |
s = sort, |
128 | 13x |
o = options |
129 |
) |
|
130 | 13x |
req <- httr::POST( |
131 | 13x |
url, |
132 | 13x |
httr::add_headers("X-Api-Key" = key, "Content-Type" = "application/json", Accept = "application/json"), |
133 | 13x |
body = body, encode = "json" |
134 |
) |
|
135 | 13x |
if (req$status_code == 429 && attempt < retry) { |
136 | ! |
wait <- as.integer(req$headers[["retry-after"]]) |
137 | ! |
if (length(wait)) Sys.sleep(wait) |
138 | ! |
return(rf(page, attempt + 1)) |
139 |
} |
|
140 | 13x |
if (req$status_code == 200) { |
141 | 13x |
jsonlite::fromJSON(rawToChar(req$content)) |
142 |
} else { |
|
143 | ! |
req |
144 |
} |
|
145 |
} |
|
146 |
} else { |
|
147 | 2x |
function(page, attempt = 1) { |
148 | 2x |
if (is.null(options$page)) { |
149 | 1x |
options$offset <- page * 1e3 |
150 |
} else { |
|
151 | 1x |
options$page <- page + 1 |
152 |
} |
|
153 | 2x |
body <- paste0( |
154 | 2x |
"?q=", jsonlite::toJSON(query, auto_unbox = TRUE), |
155 | 2x |
paste0('&f=["', paste(fs, collapse = '","'), '"]'), |
156 | 2x |
if (length(sort)) paste0("&s=", jsonlite::toJSON(sort, auto_unbox = TRUE)), |
157 | 2x |
paste0("&o=", jsonlite::toJSON(options, auto_unbox = TRUE)) |
158 |
) |
|
159 | 2x |
req <- httr::GET( |
160 | 2x |
URLencode(paste0(url, body)), |
161 | 2x |
httr::add_headers("X-Api-Key" = key, "Content-Type" = "application/json", Accept = "application/json") |
162 |
) |
|
163 | 2x |
if (req$status_code == 429 && attempt < retry) { |
164 | ! |
wait <- as.integer(req$headers[["retry-after"]]) |
165 | ! |
if (length(wait)) Sys.sleep(wait) |
166 | ! |
return(rf(page, attempt + 1)) |
167 |
} |
|
168 | 2x |
if (req$status_code == 200) { |
169 | 2x |
jsonlite::fromJSON(rawToChar(req$content)) |
170 |
} else { |
|
171 | ! |
req |
172 |
} |
|
173 |
} |
|
174 |
} |
|
175 | 16x |
dir.create(outDir, FALSE, TRUE) |
176 | 16x |
cache <- paste0(normalizePath(outDir, "/"), "/", digest::digest(list(url, query, fs, sort, options)), ".json") |
177 | 16x |
if (!overwrite && file.exists(cache)) { |
178 | 1x |
if (verbose) message("reading existing results") |
179 | 1x |
return(jsonlite::read_json(cache, simplifyVector = TRUE)) |
180 |
} |
|
181 | 15x |
res <- rf(0) |
182 | 15x |
if (!is.null(res$status_code)) { |
183 | ! |
stop( |
184 | ! |
"request failed: ", res$status_code, "; ", if (res$status_code == 400) { |
185 | ! |
paste0( |
186 | ! |
res$headers$`x-status-reason`, |
187 | ! |
if (!is.null(res$headers$`x-status-reason-code`)) paste0(" (", res$headers$`x-status-reason-code`, ")") |
188 |
) |
|
189 |
} else { |
|
190 | ! |
jsonlite::toJSON(jsonlite::fromJSON(rawToChar(res$content)), auto_unbox = TRUE, pretty = TRUE) |
191 |
}, |
|
192 | ! |
call. = FALSE |
193 |
) |
|
194 |
} |
|
195 | 15x |
if (!is.null(res$error) && res$error) { |
196 | ! |
stop( |
197 | ! |
"request failed: ", jsonlite::toJSON(res, auto_unbox = TRUE, pretty = TRUE), |
198 | ! |
call. = FALSE |
199 |
) |
|
200 |
} |
|
201 | 15x |
total_name <- grep("total_", names(res), fixed = TRUE, value = TRUE) |
202 | 15x |
rn <- names(res)[!names(res) %in% c(total_name, "count", "total_hits", "error")] |
203 | 15x |
total <- res[[total_name]] |
204 | ! |
if (is.null(total) || total == 0) stop("no results found", call. = FALSE) |
205 | 15x |
limit <- min(limit, total) - res$count |
206 | 15x |
if (limit > 0) { |
207 | ! |
pages <- ceiling(limit / page_limit) |
208 | ! |
cores <- min(cores, pages) |
209 | ! |
res2 <- if (cores > 1) { |
210 | ! |
env <- new.env(parent = globalenv()) |
211 | ! |
environment(rf) <- env |
212 | ! |
cl <- makeCluster(cores) |
213 | ! |
on.exit(stopCluster(cl)) |
214 | ! |
for (o in c("cl", "start", "pages", "key", "rf", "url", "query", "fs", "sort", "options", "retry")) env[[o]] <- get(o) |
215 | ! |
clusterExport(cl, ls(envir = env), environment()) |
216 | ! |
eval(expression(parallel::parLapply(cl, seq(start, pages), rf)), envir = env) |
217 |
} else { |
|
218 | ! |
lapply(seq(start, pages), rf) |
219 |
} |
|
220 | ! |
res[[rn]] <- do.call(rbind, c(list(res[[rn]]), lapply(res2, "[[", rn))) |
221 |
} |
|
222 | ! |
if (verbose) message("found ", total, ", returning ", nrow(res[[rn]])) |
223 | 15x |
jsonlite::write_json(res[[rn]], cache, auto_unbox = TRUE) |
224 | 15x |
res[[rn]] |
225 |
} |
1 |
#' Extract Timings from Examination Events |
|
2 |
#' |
|
3 |
#' Extract timings between select event codes within an examination timeline. |
|
4 |
#' @param record A matrix-like object with dates in the first column (recognizable by |
|
5 |
#' \code{\link{as.Date}}) and event codes (recognizable by the patterns in \code{spans}) in the second. |
|
6 |
#' @param spans A list with entries defining each event window. Each entry should be a list with |
|
7 |
#' entries for \code{start}, \code{end}, and \code{before}, each containing regular expression |
|
8 |
#' strings identifying event codes. |
|
9 |
#' @returns A list with a \code{data.frame} for each entry in \code{spans}. |
|
10 |
#' @examples |
|
11 |
#' extract_event_timing(data.frame( |
|
12 |
#' date = c("2020-01-15", "2020-01-20"), |
|
13 |
#' event = c("FWDX", "CTNF") |
|
14 |
#' )) |
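#' |
#' # a sketch of a custom span definition, reusing event codes from the defaults: |
#' extract_event_timing( |
#'   data.frame(date = c("2020-01-15", "2020-01-20"), event = c("FWDX", "CTNF")), |
#'   spans = list("Examiner to Action" = list(start = "^FWDX$", end = "^CT", before = "^DOCK$")) |
#' ) |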
|
15 |
#' @export |
|
16 | ||
17 |
extract_event_timing <- function(record, spans = list( |
|
18 |
"Initial Classification" = list(start = "^PGPC$", end = "^(?:OI|PG$)", before = "^DOCK$"), |
|
19 |
"Art Unit to Examiner" = list(start = "^DOCK$", end = "^FWDX$", before = "^(?:CTFR|EX\\.R|N/=\\.)"), |
|
20 |
"Examiner to Action" = list(start = "^FWDX$", end = "^(?:CT|EX\\.R|N/=\\.)", before = "^(?:DOCK$|A)") |
|
21 |
)) { |
|
22 | 1x |
if (is.list(record) && !is.data.frame(record)) { |
23 | ! |
if ("prosecutionHistoryDataBag" %in% names(record)) { |
24 | ! |
record <- record$prosecutionHistoryDataBag$prosecutionHistoryData |
25 | ! |
} else if ("prosecutionHistoryDataBag" %in% names(record[[1]])) { |
26 | ! |
timings <- lapply(record, function(r) extract_event_timing(r$prosecutionHistoryDataBag$prosecutionHistoryData)) |
27 | ! |
names(timings) <- vapply(record, function(r) r$patentCaseMetadata$applicationNumberText$value, "") |
28 | ! |
return(timings) |
29 |
} |
|
30 | ! |
record <- as.data.frame(lapply( |
31 | ! |
as.data.frame(do.call(rbind, record)), unlist, |
32 | ! |
recursive = FALSE, use.names = FALSE |
33 |
)) |
|
34 |
} |
|
35 | ! |
if (ncol(record) < 2) stop("record is not in the expected format", call. = FALSE) |
36 | 1x |
record[, 1] <- as.Date(record[, 1], tryFormats = "%Y-%m-%d", optional = TRUE) |
37 | ! |
if (anyNA(record[, 1])) record <- record[!is.na(record[, 1]), ] |
38 | 1x |
record <- record[order(record[, 1]), ] |
39 | 1x |
dates <- record[, 1] |
40 | 1x |
events <- record[, 2] |
41 | 1x |
n <- length(events) |
42 | 1x |
lapply(spans, function(s) { |
43 | 3x |
pos <- lapply(s, grep, events) |
44 | 3x |
if (length(pos$start)) { |
45 | 1x |
res <- list() |
46 | 1x |
lastEnd <- 0 |
47 | 1x |
current <- 0 |
48 | 1x |
for (i in seq_along(pos$start)) { |
49 | 1x |
p <- pos$start[i] |
50 | 1x |
b <- pos$before[pos$before > p] |
51 | 1x |
b <- if (length(b)) b[1] else Inf |
52 | 1x |
e <- pos$end[pos$end > current & pos$end > p & pos$end < b] |
53 | 1x |
if (length(e)) { |
54 | 1x |
current <- e[1] |
55 | 1x |
res[[i]] <- data.frame( |
56 | 1x |
start = n - p + 1, startDate = dates[p], startEvent = events[p], |
57 | 1x |
end = n - current + 1, endDate = dates[current], endEvent = events[current], |
58 | 1x |
days = as.numeric(dates[current] - dates[p]) |
59 |
) |
|
60 |
} |
|
61 |
} |
|
62 | 1x |
do.call(rbind, res) |
63 |
} |
|
64 |
}) |
|
65 |
} |
1 |
#' Search for Patents or Patent Applications (legacy) |
|
2 |
#' |
|
3 |
#' Searches the Application (AppFT) or Patent (PatFT) |
|
4 |
#' \href{https://appft.uspto.gov}{Full-Text Databases}, and returns |
|
5 |
#' document/patent numbers and titles. |
|
6 |
#' |
|
7 |
#' @param query A query string. This can be a set of terms (e.g., \code{"term1 OR term2"}) or |
|
8 |
#' an exact phrase (e.g., \code{'"a phrase"'}). To search within a particular |
|
9 |
#' \href{https://appft.uspto.gov/netahtml/PTO/help/helpflds.html}{field}, prefix with the field code |
|
10 |
#' and a forward slash (e.g., \code{"CPCL/G01W"}, which searches for "G01W" in the CPCL field). |
|
11 |
#' @param applications Logical; specifies whether to search in the application database |
|
12 |
#' (AppFT; \code{TRUE}; default) or the patent database (PatFT; \code{FALSE}). |
|
13 |
#' @param limit A limit to the number of results retrieved, in steps of 50. By default, will |
|
14 |
#' try to retrieve all results. |
|
15 |
#' @param retries Number of retries for a given results page. |
|
16 |
#' @param outdir Path to a directory in which to save results. Only writes if specified. |
|
17 |
#' @param overwrite Logical; if \code{TRUE}, will overwrite caches results. |
|
18 |
#' @param cores Number of CPU cores to use when reading in multiple pages. |
|
19 |
#' @param verbose Logical; if \code{FALSE}, does not print status messages. |
|
20 |
#' @return A \code{data.frame} with columns for \code{index} (index of the result), |
|
21 |
#' \code{id} (patent number or application document number), and \code{title} (title of patent or application). |
|
22 |
#' @seealso For the newer search mechanism, see \code{\link{uspto_search}}. |
|
23 |
#' @examples |
|
24 |
#' \dontrun{ |
|
25 |
#' # search for applications with a meteorology classification: |
|
26 |
#' # https://www.uspto.gov/web/patents/classification/cpc/html/cpc-G01W.html |
|
27 |
#' results <- search_fulltext("CPCL/G01W") |
|
28 |
#' |
|
29 |
#' # search for patents by date and title keyword |
|
30 |
#' results <- search_fulltext("isd/1/8/2002 and motorcycle", FALSE) |
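#' |
#' # limit retrieval to the first page of 50 results: |
#' results <- search_fulltext("CPCL/G01W", limit = 50) |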
|
31 |
#' } |
|
32 |
#' @export |
|
33 | ||
34 |
search_fulltext <- function(query, applications = TRUE, limit = NULL, retries = 20, outdir = NULL, |
|
35 |
overwrite = FALSE, cores = parallel::detectCores() - 2, verbose = TRUE) { |
|
36 | 2x |
base_url <- paste0( |
37 | 2x |
"https://", if (applications) "app" else "pat", |
38 | 2x |
"ft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=/netahtml/PTO/search-adv.html&r=0&f=S&l=50&d=", |
39 | 2x |
if (applications) "PG01" else "PTXT", "&p=" |
40 |
) |
|
41 | 2x |
query <- URLencode(gsub("\\s+", "+", query, perl = TRUE)) |
42 | 2x |
outFile <- normalizePath(paste0(outdir, "/", digest::digest(query), ".csv"), "/", FALSE) |
43 | ! |
if (overwrite) unlink(outFile) |
44 | 2x |
if (file.exists(outFile)) { |
45 | ! |
if (verbose) message("reading existing file: ", outFile) |
46 | ! |
read.csv(outFile) |
47 |
} else { |
|
48 | 2x |
retrieve_page <- function(input, attempt = retries, parse = TRUE) { |
49 | 3x |
results <- tryCatch( |
50 |
{ |
|
51 | 3x |
con <- curl::curl(paste0(base_url, input, "&Query=", query)) |
52 | 3x |
on.exit(close(con)) |
53 | 3x |
readLines(con, warn = FALSE) |
54 |
}, |
|
55 | 3x |
error = function(e) e$message |
56 |
) |
|
57 | 3x |
if (length(results) < 100) { |
58 | ! |
if (attempt > 0) { |
59 | ! |
Sys.sleep(.5) |
60 | ! |
retrieve_page(input, attempt - 1, parse) |
61 |
} else { |
|
62 | ! |
NULL |
63 |
} |
|
64 |
} else { |
|
65 | 2x |
if (parse) parse_page(results) else results |
66 |
} |
|
67 |
} |
|
68 | 2x |
parse_page <- function(results) { |
69 | 3x |
rows <- c(grep("^<TR><TD", results, perl = TRUE), length(results)) |
70 | 3x |
res <- as.data.frame(do.call(rbind, lapply(seq_len(length(rows) - 1), function(i) { |
71 | 109x |
row <- results[seq(rows[i], rows[i + 1])] |
72 | 109x |
split_row <- strsplit(c(row[1:2], paste(row[-(1:2)], collapse = " ")), "[><]") |
73 | 109x |
c(split_row[[1]][5], split_row[[2]][5], split_row[[3]][if (split_row[[3]][5] == "") 11 else 5]) |
74 | 3x |
})), make.names = FALSE, optional = TRUE) |
75 | 3x |
colnames(res) <- c("index", "id", "name") |
76 | 3x |
res$index <- as.integer(res$index) |
77 | 3x |
res$name <- sub("\\s+$", "", gsub("\\s+", " ", res$name, perl = TRUE), perl = TRUE) |
78 | 3x |
res |
79 |
} |
|
80 | 2x |
initial <- retrieve_page(1, retries, FALSE) |
81 | ! |
if (length(initial) < 30) stop("Failed to retrieve initial page: ", initial, call. = FALSE) |
82 | 2x |
report <- grep("Results of Search", initial[1:100], fixed = TRUE) |
83 | ! |
if (!length(report)) stop("Response is in an unexpected format; check ", paste0(base_url, 1, "&Query=", query), call. = FALSE) |
84 | 2x |
n <- as.numeric(regmatches(initial[report], regexec("(\\d+)\\s\\w+\\.$", initial[report]))[[1]][2]) |
85 | ! |
if (verbose) message("found ", n, " results", if (is.numeric(limit)) paste("; retrieving", limit)) |
86 | 1x |
if (is.numeric(limit)) n <- min(limit, n) |
87 | 2x |
pages <- ceiling(n / 50) |
88 | 2x |
res <- if (pages > 1) { |
89 | 1x |
res <- if (cores > 1) { |
90 | 1x |
cl <- parallel::makeCluster(max(1, min(pages, cores))) |
91 | 1x |
parallel::clusterExport(cl, c("base_url", "query", "retrieve_page", "parse_page"), environment()) |
92 | 1x |
on.exit(parallel::stopCluster(cl), TRUE) |
93 | 1x |
parallel::parLapply(cl, seq(2, pages), retrieve_page) |
94 |
} else { |
|
95 | ! |
lapply(seq(2, pages), retrieve_page) |
96 |
} |
|
97 | 1x |
if (verbose) { |
98 | ! |
missed <- vapply(res, length, 0) == 0 |
99 | ! |
if (any(missed)) warning("missed pages: ", paste(which(missed) + 1, collapse = ", ")) |
100 |
} |
|
101 | 1x |
rbind(parse_page(initial), do.call(rbind, res)) |
102 |
} else { |
|
103 | 1x |
parse_page(initial) |
104 |
} |
|
105 | 2x |
if (!is.null(outdir)) { |
106 | 2x |
dir.create(outdir, FALSE, TRUE) |
107 | 2x |
write.csv(res, outFile, row.names = FALSE) |
108 |
} |
|
109 | 2x |
res |
110 |
} |
|
111 |
} |
1 |
#' Download Patent Examination Data |
|
2 |
#' |
|
3 |
#' Download a bundled query response from the \href{https://ped.uspto.gov/peds}{Patent Examination Data System}. |
|
4 |
#' |
|
5 |
#' @param searchText Text to search for; \code{"*:*"} for none (default); see |
|
6 |
#' the FAQ on the \href{https://ped.uspto.gov/peds}{PEDS site}. |
|
7 |
#' @param filters A list of characters representing filter conditions in the form of \code{"field:value"} |
|
8 |
#' (e.g., \code{list('appStatus:"Patented Case"')}). |
|
9 |
#' @param outFile Name of the final JSON file, or directory in which to save files with parameter-hash names. |
|
10 |
#' @param start Initial record. |
|
11 |
#' @param minMatch Minimum number of terms required to match. |
|
12 |
#' @param overwrite Logical; overwrite any previous queries with the same body hash. |
|
13 |
#' @param waits Number of times to check for a completed bundle before giving up. |
|
14 |
#' @param wait Number of seconds to wait between status checks (the duration of each of the \code{waits}). |
|
15 |
#' @param endpoint PEDS API endpoint. |
|
16 |
#' @param compress Logical; if \code{FALSE}, will not xz-compress the \code{outFile}. |
|
17 |
#' @param load Logical; if \code{FALSE}, will not return the downloaded content, or load in existing content. |
|
18 |
#' @param verbose Logical; if \code{FALSE}, will not print status messages. |
|
19 |
#' @return A list with an entry for each year in the set of patent metadata. |
|
20 |
#' @examples |
|
21 |
#' \dontrun{ |
|
22 |
#' # like case 3 in the API Documentation > API Tutorial on https://ped.uspto.gov/peds |
|
23 |
#' download_peds( |
|
24 |
#' "firstNamedApplicant:(Google)", |
|
25 |
#' list('appStatus:"Patented Case"', "appFilingDate:[2013-01-01T00:00:00Z TO 2013-03-31T23:59:59Z]") |
|
26 |
#' ) |
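#' |
#' # a sketch of saving to a named, uncompressed file (the path is hypothetical): |
#' download_peds("applId:10612573", outFile = "peds_10612573.json", compress = FALSE) |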
|
27 |
#' } |
|
28 |
#' @export |
|
29 | ||
30 |
download_peds <- function(searchText = "*:*", filters = list("*:*"), outFile = NULL, start = 0, minMatch = "100%", overwrite = FALSE, |
|
31 |
waits = 50, wait = 10, endpoint = "https://ped.uspto.gov/api/queries/", compress = TRUE, load = TRUE, verbose = TRUE) { |
|
32 |
# https://ped.uspto.gov/api/search-params |
|
33 | 2x |
body <- list( |
34 | 2x |
searchText = searchText, |
35 | 2x |
fq = filters, |
36 | 2x |
fl = "*", |
37 | 2x |
mm = minMatch, |
38 | 2x |
df = "patentTitle", |
39 | 2x |
qf = paste( |
40 | 2x |
"appEarlyPubNumber applId appLocation appType appStatus_txt appConfrNumber appCustNumber", |
41 | 2x |
"appGrpArtNumber appCls appSubCls appEntityStatus_txt patentNumber patentTitle primaryInventor", |
42 | 2x |
"firstNamedApplicant appExamName appExamPrefrdName appAttrDockNumber appPCTNumber", |
43 | 2x |
"appIntlPubNumber wipoEarlyPubNumber pctAppType firstInventorFile appClsSubCls rankAndInventorsList" |
44 |
), |
|
45 | 2x |
facet = FALSE, |
46 | 2x |
sort = "applId asc", |
47 | 2x |
start = start |
48 |
) |
|
49 | 2x |
hash <- digest::digest(body) |
50 | 2x |
final <- normalizePath(paste0(if (is.null(outFile) || dir.exists(outFile)) { |
51 | 1x |
paste0(if (is.null(outFile)) tempdir() else outFile, "/", hash) |
52 |
} else { |
|
53 | 1x |
sub("\\.json.*$", "", outFile) |
54 | 2x |
}, ".json", if (compress) ".xz"), "/", FALSE) |
55 | 2x |
if (!overwrite && file.exists(final)) { |
56 | ! |
if (load) { |
57 | ! |
if (verbose) message("Reading in existing results") |
58 | ! |
return(invisible(jsonlite::read_json(final, simplifyDataFrame = FALSE))) |
59 |
} else { |
|
60 | ! |
if (verbose) message("Results already exist") |
61 | ! |
return() |
62 |
} |
|
63 |
} |
|
64 | 2x |
bundle <- paste0(normalizePath(tempdir(), "/"), "/", hash, ".zip") |
65 | 2x |
exdir <- sub(".zip", "", bundle, fixed = TRUE) |
66 | 2x |
output <- list(ID = "", content = list()) |
67 | 2x |
if (load) on.exit(return(invisible(output$content))) |
68 | 2x |
if (overwrite || (!file.exists(bundle) && !dir.exists(exdir))) { |
69 | 2x |
query <- httr::POST( |
70 | 2x |
endpoint, httr::add_headers("Content-Type" = "application/json", Accept = "application/json"), |
71 | 2x |
body = body, encode = "json" |
72 |
) |
|
73 | 2x |
if (200 != httr::status_code(query)) { |
74 | ! |
error <- httr::http_error(query) |
75 | ! |
errorContent <- httr::content(query) |
76 | ! |
stop("query failed:\n", if (length(errorContent)) errorContent else if (is.logical(error)) httr::status_code(query) else error, call. = FALSE) |
77 |
} |
|
78 | 2x |
res <- httr::content(query) |
79 | 2x |
output$ID <- ID <- res$queryId |
80 | ! |
if (0 == res$queryResults$searchResponse$response$numFound) stop("No records found for ", ID, call. = FALSE) |
81 | 2x |
httr::PUT(paste0(endpoint, ID, "/package?format=JSON")) |
82 | ! |
if (verbose) message("Package requested for ", res$queryResults$searchResponse$response$numFound, " record(s): ", ID) |
83 | 2x |
retries <- 1 |
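# poll the query's job status until the bundle is COMPLETED, up to 'waits' times |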
84 | 2x |
while (jsonlite::read_json(paste0(endpoint, ID))$jobStatus != "COMPLETED") { |
85 | ! |
if (retries > waits) stop("hit wait limit without seeing COMPLETED status for ", ID) |
86 | 3x |
retries <- retries + 1 |
87 | ! |
if (verbose) cat(paste0("\rwaiting for bundle (try: ", retries, ") ... ")) |
88 | 3x |
Sys.sleep(wait) |
89 |
} |
|
90 | ! |
if (verbose) cat("complete\n") |
91 | 2x |
url <- paste0(endpoint, ID, "/download?format=JSON") |
92 | 2x |
download.file(url, bundle, mode = "wb") |
93 |
} |
|
94 | 2x |
if (file.exists(bundle)) { |
95 | ! |
if (verbose) message("Unpacking ", bundle) |
96 | 2x |
system2("unzip", c("-d", shQuote(exdir), shQuote(bundle)), stdout = TRUE) |
97 | 2x |
unlink(bundle) |
98 |
} |
|
99 | 2x |
output$content <- unlist(lapply(list.files(exdir, full.names = TRUE), function(f) { |
100 | 2x |
res <- tryCatch(jsonlite::read_json(f, simplifyDataFrame = FALSE)$PatentData, error = function(e) NULL) |
101 | 2x |
if (is.null(res)) { |
102 | ! |
con <- gzfile(f) |
103 | ! |
on.exit(close(con)) |
104 | ! |
res <- tryCatch(fromJSON(paste( |
105 | ! |
c(scan(con, "", quote = "", na.strings = "", quiet = TRUE), "]}"), |
106 | ! |
collapse = " " |
107 | ! |
), simplifyDataFrame = FALSE)$PatentData, error = function(e) NULL) |
108 |
} |
|
109 | 2x |
res |
110 | 2x |
}), recursive = FALSE, use.names = FALSE) |
111 | ! |
if (verbose) message("writing final results: ", normalizePath(final, "/", FALSE)) |
112 | 2x |
dir.create(dirname(final), FALSE, TRUE) |
113 | 2x |
if (compress) { |
114 | 2x |
final <- xzfile(final) |
115 | 2x |
on.exit(close(final), add = TRUE) |
116 |
} |
|
117 | 2x |
jsonlite::write_json(output$content, final, auto_unbox = TRUE) |
118 |
} |
1 |
#' Download PatentsView Bulk |
|
2 |
#' |
|
3 |
#' Download one of the bulk tables from \href{https://patentsview.org}{PatentsView}, and optionally |
|
4 |
#' make an Arrow dataset from it. |
|
5 |
#' |
|
6 |
#' @param table Name of the table to download, as listed on the |
|
7 |
#' \href{https://patentsview.org/download/data-download-tables}{Data Downloads page}. |
|
8 |
#' @param dir Directory in which to save original tables and the dataset if \code{make_db} is \code{TRUE}. |
|
9 |
#' @param pregrant Logical; if \code{TRUE}, will download the pre-grant version of \code{table}. |
|
10 |
#' @param partition A vector of column names to be used as partition keys, if \code{make_db} is \code{TRUE}. |
|
11 |
#' Can be a named list of functions, which will be used to create new columns, then partition by those columns |
|
12 |
#' (e.g., \code{list(series_code = function(d) substr(d$patent_id, 1, 2))}). If not specified, will choose |
|
13 |
#' a column with the number of unique values closest to 20. |
|
14 |
#' @param make_db Logical; if \code{TRUE}, will make an Arrow dataset out of the downloaded table. |
|
15 |
#' @param format Format of the dataset, if \code{make_db} is \code{TRUE}. |
|
16 |
#' @param return_table Logical; if \code{FALSE}, returns the path to the file, rather than the read-in table. |
|
17 |
#' @param overwrite Logical; if \code{TRUE}, overwrites any existing files (raw and prepared). |
|
18 |
#' @return The original table (if \code{return_table} is \code{TRUE}; as a \code{tibble}), an opened dataset |
|
19 |
#' (if the path to the dataset exists), or the path to the downloaded file. |
|
20 |
#' @examples |
|
21 |
#' \dontrun{ |
|
22 |
#' # download the application table of granted patents |
|
23 |
#' download_patentsview_bulk(".", "application", partition = "series_code") |
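#' |
#' # the same partition derived with a function, as in the partition argument docs: |
#' download_patentsview_bulk("application", ".", |
#'   partition = list(series_code = function(d) substr(d$patent_id, 1, 2)) |
#' ) |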
|
24 |
#' } |
|
25 |
#' @export |
|
26 | ||
27 |
download_patentsview_bulk <- function(table, dir = tempdir(), pregrant = FALSE, partition = NULL, make_db = FALSE, |
|
28 |
format = "parquet", return_table = TRUE, overwrite = FALSE) { |
|
29 | 7x |
url <- paste0( |
30 | 7x |
"https://s3.amazonaws.com/data.patentsview.org/", |
31 | 7x |
if (pregrant) "pregrant_publications" else "download", |
32 | 7x |
"/", table, ".tsv.zip" |
33 |
) |
|
34 | 7x |
dir.create(dir, FALSE, TRUE) |
35 | 7x |
dir <- paste0(normalizePath(dir, "/"), "/") |
36 | 7x |
file_zip <- paste0(dir, table, ".tsv.zip") |
37 | 7x |
file <- paste0(dir, table, if (pregrant) "_pg" else "_g", ".tsv") |
38 | 7x |
dbd <- paste0(dir, table, "_db") |
39 | ! |
if (overwrite) unlink(c(file_zip, file, dbd), TRUE, TRUE) |
40 | 7x |
if (!file.exists(file)) { |
41 | 3x |
if (!file.exists(file_zip)) download.file(url, file_zip) |
42 | 3x |
unzip(file_zip, exdir = dir) |
43 | 3x |
file.rename(sub(".zip", "", file_zip, fixed = TRUE), file) |
44 | 3x |
unlink(file_zip) |
45 |
} |
|
46 | 7x |
raw <- NULL |
47 | 1x |
if (missing(make_db) && (!missing(partition) || !missing(format))) make_db <- TRUE |
48 | 2x |
if (missing(return_table) && make_db) return_table <- FALSE |
49 | 7x |
if (make_db) { |
50 | 4x |
raw <- vroom::vroom(file, delim = "\t", show_col_types = FALSE) |
51 | 4x |
if (is.null(partition)) { |
52 | 3x |
partition <- names(which.min(abs(20 - vapply(raw, function(x) length(unique(x)), 0)))) |
53 | 1x |
} else if (is.list(partition)) { |
54 | 1x |
for (n in names(partition)) { |
55 | 1x |
raw[[n]] <- partition[[n]](raw) |
56 |
} |
|
57 | 1x |
partition <- names(partition) |
58 |
} |
|
59 | 4x |
arrow::write_dataset(raw, dbd, partitioning = partition, format = format) |
60 |
} |
|
61 | 7x |
if (!return_table && dir.exists(dbd)) { |
62 | 4x |
arrow::open_dataset(dbd, format = format) |
63 |
} else { |
|
64 | 3x |
if (return_table) { |
65 | 2x |
if (is.null(raw)) { |
66 | 1x |
vroom::vroom(file, delim = "\t", show_col_types = FALSE) |
67 |
} else { |
|
68 | 1x |
raw |
69 |
} |
|
70 |
} else { |
|
71 | 1x |
file |
72 |
} |
|
73 |
} |
|
74 |
} |
1 |
#' Get Examiner Name from Office Actions |
|
2 |
#' |
|
3 |
#' Extract the examiner's name from office action body text. |
|
4 |
#' @param office_action A list of office actions, as retrieved with \code{\link{download_office_actions}}; |
|
5 |
#' each entry should be a list with entries for \code{bodyText} (the office action text) and |
|
6 |
#' \code{examinerEmployeeNumber}. |
|
7 |
#' @param collapse Logical; if \code{FALSE}, will return a row for every entered office action. |
|
8 |
#' @return A character vector with the Examiner's employee number and two possible names: one |
|
9 |
#' extracted from the language (\code{intext}) and another from the sign-off (\code{signed}). The name |
|
10 |
#' will be \code{NA} if there is no examiner (such as if a non-examiner submitted the action), or |
|
11 |
#' if the format is not as expected. If \code{office_action} is a list of multiple office actions, |
|
12 |
#' these will make up rows of a \code{data.frame}, which will be as many as the length of |
|
13 |
#' \code{office_action} if \code{collapse} is \code{FALSE}, or the length of unique IDs. |
|
14 |
#' @examples |
|
15 |
#' \dontrun{ |
|
16 |
#' # retrieve the office actions associated with a particular application |
|
17 |
#' office_actions <- download_office_actions("patentApplicationNumber:13877637") |
|
18 |
#' |
|
19 |
#' # get the name of the examiner associated with the first action: |
|
20 |
#' extract_examiner_name(office_actions[[1]]) |
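#' |
#' # keep a row for every office action, rather than collapsing by examiner: |
#' extract_examiner_name(office_actions, collapse = FALSE) |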
|
21 |
#' } |
|
22 |
#' @export |
|
23 | ||
24 |
extract_examiner_name <- function(office_action, collapse = TRUE) { |
|
25 | 4x |
if (length(office_action) > 1 && "bodyText" %in% names(office_action[[1]])) { |
26 | 1x |
names <- do.call(rbind, lapply(office_action, extract_examiner_name)) |
27 | 1x |
if (collapse) { |
28 | 1x |
nas <- is.na(names[, 1]) |
29 | 1x |
if (any(nas)) names[nas, 1] <- paste0("NA", seq_len(sum(nas))) |
30 | 1x |
names <- do.call(rbind, lapply(split(as.data.frame(names), names[, 1]), function(e) { |
31 | 2x |
if (nrow(e) < 2) { |
32 | 2x |
e[1, ] |
33 |
} else { |
|
34 | ! |
intext <- e[, 2] |
35 | ! |
intext <- intext[!is.na(intext)] |
36 | ! |
signed <- e[, 3] |
37 | ! |
signed <- signed[!is.na(signed)] |
38 | ! |
data.frame( |
39 | ! |
id = e[1, "id"], |
40 | ! |
intext = if (length(intext)) intext[which.max(nchar(intext))] else NA, |
41 | ! |
signed = if (length(signed)) signed[which.max(nchar(signed))] else NA |
42 |
) |
|
43 |
} |
|
44 |
})) |
|
45 |
} |
|
46 | 1x |
return(names) |
47 |
} |
|
48 | 2x |
if (!is.list(office_action)) office_action <- list(bodyText = office_action) |
49 | ! |
if (is.list(office_action$bodyText)) office_action$bodyText <- unlist(office_action$bodyText, FALSE, FALSE) |
50 | 3x |
body <- sub("\\s+$", "", office_action$bodyText, perl = TRUE) |
51 | ! |
if (!length(body)) stop("office_action is not in the expected format", call. = FALSE) |
52 | 3x |
intext <- regmatches(body, regexec( |
53 | 3x |
"directed to (?:primary )?(?:examiner )?([A-Z][^/\\n\\t]+?) who(?: may be reached|se (?:telephone )?number)", |
54 | 3x |
body, |
55 | 3x |
perl = TRUE |
56 |
)) |
|
57 | 3x |
intext <- if (length(intext) && length(intext[[1]])) { |
58 | 2x |
intext <- gsub("(?:Primary\\s|Petitions\\s)?Examiner\\s|,$", "", intext[[1]][2], perl = TRUE) |
59 |
} else { |
|
60 | 1x |
NULL |
61 |
} |
|
62 | 3x |
l <- nchar(body) |
63 | 3x |
foot <- paste0(sub("\\s{10,}", " ", if (l > 1000) substring(body, l - 2000) else body, perl = TRUE), "XXEE") |
64 | 3x |
parts <- strsplit( |
65 | 3x |
gsub("\\s+[\\w.]+@uspto\\.gov|number='\\d+'|[A-Z][a-z]+ \\d+, \\d{4}", "", foot, perl = TRUE), |
66 | 3x |
paste0( |
67 | 3x |
"(?:[^\\w\\s]\\W*|[\\d/\\t\\n-]+)(?:", |
68 | 3x |
"Examiner|Primary\\s+Examiner|Petitions\\s+Examiner", |
69 | 3x |
")\\s?(?:[A-Z]{2}|Art|Unit|[\\d\\n,]+|XX)" |
70 |
), |
|
71 | 3x |
perl = TRUE |
72 | 3x |
)[[1]] |
73 | 3x |
signed <- if (length(parts) > 1) { |
74 | 1x |
parts <- strsplit(parts[1], "[\\n/]+", perl = TRUE)[[1]] |
75 | 1x |
sub("_x000d_.*|\\s+$", "", gsub("\\s+", " ", parts[length(parts)], perl = TRUE)) |
76 |
} else { |
|
77 | 2x |
NULL |
78 |
} |
|
79 | 3x |
c( |
80 | 3x |
id = if (is.null(office_action$examinerEmployeeNumber)) NA else trimws(office_action$examinerEmployeeNumber[[1]]), |
81 | 3x |
intext = if (length(intext) && intext != "") intext else NA, |
82 | 3x |
signed = if (length(signed) && signed != "") signed else NA |
83 |
) |
|
84 |
} |
1 |
#' Search for Patents or Patent Applications |
|
2 |
#' |
|
3 |
#' Searches the U.S. Pre-Grant Publications (US-PGPUB), Patents (USPAT), and/or |
|
4 |
#' Optical Character Recognition (USOCR) databases. See the \href{https://ppubs.uspto.gov/pubwebapp/static/pages/landing.html}{web app} |
|
5 |
#' for an interactive search. |
|
6 |
#' |
|
7 |
#' @param query A query string; for example \code{"noodle"}, \code{"photographic AND noodle"}, |
|
8 |
#' \code{"wet adj4 silicon"}, or \code{"G01W.CPCL."}. See the Patent Public Search |
|
9 |
#' \href{https://ppubs.uspto.gov/pubwebapp/static/pages/searchable-indexes.html}{training materials}. |
|
10 |
#' @param databases A character vector specifying which databases to search in; a selection from |
|
11 |
#' \code{c("US-PGPUB", "USPAT", "USOCR")}. |
|
12 |
#' @param outFile Name of a \code{.csv} file to save results to. If this file exists, it will |
|
13 |
#' be loaded instead of searching. |
|
14 |
#' @param start Result of the search to start from. |
|
15 |
#' @param limit Maximum number of results to return; defaults to all results. |
|
16 |
#' @param sort How to sort results; defaults to publication date (\code{"date_publ desc"}). |
|
17 |
#' @param english_only Logical; if \code{FALSE}, will return patent and/or applications of any language. |
|
18 |
#' @param spellCheck Logical; if \code{TRUE}, will spellcheck \code{query} terms. |
|
19 |
#' @param plurals Logical; if \code{TRUE}, will look for plural and singular forms of \code{query} terms. |
|
20 |
#' @param britishEquivalents Logical; if \code{TRUE}, will look for British and American forms of \code{query} terms. |
|
21 |
#' @param verbose Logical; if \code{FALSE}, does not print status messages. |
|
22 |
#' @return A \code{data.frame} of results, if any were found; otherwise \code{NULL}. |
|
23 |
#' @examples |
|
24 |
#' \dontrun{ |
|
25 |
#' # search for applications with a meteorology classification: |
|
26 |
#' # https://www.uspto.gov/web/patents/classification/cpc/html/cpc-G01W.html |
|
27 |
#' results <- uspto_search("G01W.CPCL.", "US-PGPUB") |
|
28 |
#' |
|
29 |
#' # search by date and keyword: |
|
30 |
#' results <- uspto_search("20020801.pd. AND motorcycle") |
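#' |
#' # proximity search (see the query argument) in the patents database only: |
#' results <- uspto_search("wet adj4 silicon", "USPAT") |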
|
31 |
#' } |
|
32 |
#' @export |
|
33 | ||
34 |
uspto_search <- function(query, databases = c("US-PGPUB", "USPAT", "USOCR"), outFile = NULL, start = 0, limit = FALSE, sort = "date_publ desc", |
|
35 |
english_only = TRUE, spellCheck = FALSE, plurals = TRUE, britishEquivalents = TRUE, verbose = FALSE) { |
|
36 | 2x |
if (!is.null(outFile)) { |
37 | 2x |
outFile <- paste0(sub("\\.csv.*$", "", outFile), ".csv") |
38 | 2x |
if (file.exists(outFile)) { |
39 | ! |
if (verbose) message("loading existing results") |
40 | 1x |
return(read.csv(outFile)) |
41 |
} else { |
|
42 | 1x |
dir.create(dirname(outFile), FALSE) |
43 |
} |
|
44 |
} |
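# map entered database names to canonical names by their final letter (B/T/R) |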
|
45 | 1x |
databases <- unname(c(b = "US-PGPUB", t = "USPAT", r = "USOCR")[unique(tolower(substring(databases, nchar(databases))))]) |
46 | ! |
if (!length(databases)) stop("no recognized databases entered", call. = FALSE) |
47 | 1x |
session_request <- POST( |
48 | 1x |
"https://ppubs.uspto.gov/dirsearch-public/users/me/session", |
49 | 1x |
add_headers("Content-Type" = "application/json"), |
50 | 1x |
body = "-1" |
51 |
) |
|
52 | 1x |
session <- content(session_request) |
53 | ! |
if (is.null(session$userSessionId)) stop("failed to start a session: ", session, call. = FALSE) |
54 | 1x |
query_object <- list( |
55 | 1x |
britishEquivalents = britishEquivalents, |
56 | 1x |
caseId = session$userCase$caseId, |
57 | 1x |
databaseFilters = lapply(databases, function(s) list(databaseName = s, countryCodes = list())), |
58 | 1x |
highlights = "0", |
59 | 1x |
hl_snippets = "2", |
60 | 1x |
ignorePersist = TRUE, |
61 | 1x |
op = "OR", |
62 | 1x |
plurals = plurals, |
63 | 1x |
q = query, |
64 | 1x |
qt = "brs", |
65 | 1x |
queryName = query, |
66 | 1x |
searchType = 1, |
67 | 1x |
spellCheck = spellCheck, |
68 | 1x |
userEnteredQuery = query, |
69 | 1x |
viewName = "tile" |
70 |
) |
|
71 | 1x |
count_request <- POST( |
72 | 1x |
"https://ppubs.uspto.gov/dirsearch-public/searches/counts", |
73 | 1x |
add_headers("Content-Type" = "application/json"), |
74 | 1x |
body = toJSON(query_object, auto_unbox = TRUE) |
91 |
) |
|
92 | 1x |
count <- content(count_request) |
93 | 1x |
limit <- if (is.numeric(limit)) limit else count$numResults |
94 | ! |
if (is.null(count$numResults)) stop("failed to execute query: ", count, call. = FALSE) |
95 | ! |
if (verbose) message("matches: ", count$numResults, "; retrieving ", start, " through ", limit) |
96 | 1x |
if (count$numResults != 0) { |
97 | 1x |
result_request <- POST( |
98 | 1x |
"https://ppubs.uspto.gov/dirsearch-public/searches/searchWithBeFamily", |
99 | 1x |
add_headers("Content-Type" = "application/json"), |
100 | 1x |
body = toJSON(list( |
101 | 1x |
start = start, |
102 | 1x |
pageCount = limit, |
103 | 1x |
sort = sort, |
104 | 1x |
docFamilyFiltering = "familyIdFiltering", |
105 | 1x |
searchType = 1, |
106 | 1x |
familyIdEnglishOnly = english_only, |
107 | 1x |
familyIdFirstPreferred = "USPAT", |
108 | 1x |
familyIdSecondPreferred = "US-PGPUB", |
109 | 1x |
familyIdThirdPreferred = "FPRS", |
110 | 1x |
showDocPerFamilyPref = "showEnglish", |
111 | 1x |
queryId = 0, |
112 | 1x |
tagDocSearch = FALSE, |
113 | 1x |
query = query_object |
114 | 1x |
), auto_unbox = TRUE) |
115 |
) |
|
116 | ! |
if (status_code(result_request) != 200) stop("failed to retrieve results: ", status_code(result_request), call. = FALSE) |
117 | 1x |
result <- content(result_request) |
118 | ! |
if (is.null(result$patents)) stop("failed to retrieve results: ", result, call. = FALSE) |
119 | 1x |
res <- do.call(rbind, lapply(result$patents, function(d) { |
120 | 17x |
if (is.null(d$collapsed)) d$collapsed <- TRUE |
121 | 17x |
if (is.null(d$lastChild)) d$lastChild <- TRUE |
122 | 17x |
if (is.null(d$familyCount)) d$familyCount <- 1 |
123 | 17x |
lapply(d, function(e) if (!length(e)) NA else if (is.list(e)) paste0(e, collapse = "; ") else e) |
124 |
})) |
|
125 | 1x |
res <- as.data.frame(lapply(as.data.frame(res), unlist)) |
126 | 1x |
if (!is.null(outFile)) { |
127 | ! |
if (verbose) message("saving results: ", outFile) |
128 | 1x |
write.csv(res, outFile, row.names = FALSE) |
129 |
} |
|
130 | 1x |
res |
131 |
} |
|
132 |
} |
1 |
#' Get Inventor Information |
|
2 |
#' |
|
3 |
#' Extract inventor information from patent/application documents or examination records. |
|
4 |
#' |
|
5 |
#' @param input A \code{data.frame} of documents (as returned from \code{\link{uspto_download}}) |
|
6 |
#' or a list of examination records (as returned from \code{\link{download_peds}}). |
|
7 |
#' @returns A \code{data.frame} with inventor information. |
|
8 |
#' @examples |
|
9 |
#' \dontrun{ |
|
10 |
#' # download an application |
|
11 |
#' application <- uspto_download("US-20040216465-A1") |
|
12 |
#' |
|
13 |
#' # get inventor information from it |
|
14 |
#' extract_inventors(application) |
|
15 |
#' |
|
16 |
#' # get the same from examination record |
|
17 |
#' extract_inventors(download_peds("applId:10612573")) |
|
18 |
#' } |
|
19 |
#' @export |
|
20 | ||
21 |
extract_inventors <- function(input) { |
|
22 | 2x |
if (is.data.frame(input)) { |
23 | 1x |
do.call(rbind, unname(lapply(split( |
24 | 1x |
input[, c("guid", "applicationNumber", "inventorsName", "inventorCountry", "inventorState", "inventorCity")], |
25 | 1x |
unlist(input$guid, FALSE, FALSE) |
26 | 1x |
), function(d) { |
27 | 1x |
res <- if (grepl(";", d$inventorState, fixed = TRUE)) { |
28 | 1x |
i <- lapply(d[, -c(1:2)], function(e) strsplit(e[[1]], "; ", fixed = TRUE)[[1]]) |
29 | 1x |
data.frame( |
30 | 1x |
as.list(unlist(d[, 1:2])), |
31 | 1x |
do.call(rbind, lapply(strsplit(i$inventorsName, ",?\\s+"), function(n) { |
32 | 2x |
data.frame( |
33 | 2x |
firstName = n[2], middleName = if (length(n) > 2) paste(n[-(1:2)], collapse = " ") else NA, lastName = n[1] |
34 |
) |
|
35 |
})), |
|
36 | 1x |
i[-1] |
37 |
) |
|
38 |
} else { |
|
39 | ! |
d |
40 |
} |
|
41 | 1x |
rownames(res) <- NULL |
42 | 1x |
res |
43 |
}))) |
|
44 |
} else { |
|
45 | 1x |
ex <- function(r) { |
46 | 1x |
if (length(r) == 1) r <- r[[1]] |
47 | 1x |
party <- r$patentCaseMetadata$partyBag$applicantBagOrInventorBagOrOwnerBag |
48 | 1x |
app <- list( |
49 | 1x |
guid = sub("-?(\\d+)-?", "-\\1-", r$patentCaseMetadata$patentPublicationIdentification$publicationNumber, perl = TRUE), |
50 | 1x |
applicationNumber = sub("(\\d{2})/?", "\\1/", r$patentCaseMetadata$applicationNumberText$value, perl = TRUE) |
51 |
) |
|
52 | ! |
if (!length(app$guid)) app$guid <- "" |
53 | 1x |
if (!is.null(party)) { |
54 | 1x |
i <- which(vapply(party, function(p) if (length(p)) names(p) else "", "") == "inventorOrDeceasedInventor") |
55 | 1x |
if (length(i)) { |
56 | 1x |
do.call(rbind, lapply(party[[i]]$inventorOrDeceasedInventor, function(g) { |
57 | 2x |
do.call(rbind, lapply(g$contactOrPublicationContact, function(p) { |
58 | 2x |
data.frame( |
59 | 2x |
app, |
60 | 2x |
p$name$personNameOrOrganizationNameOrEntityName[[1]]$personStructuredName, |
61 | 2x |
inventorCountry = p$countryCode, inventorState = p$geographicRegionName$value, inventorCity = p$cityName |
62 |
) |
|
63 |
})) |
|
64 |
})) |
|
65 |
} |
|
66 |
} |
|
67 |
} |
|
68 | ! |
if ("patentCaseMetaData" %in% names(input[[1]])) do.call(rbind, lapply(input, ex)) else ex(input) |
69 |
} |
|
70 |
} |
1 |
#' Get Patent Classification Information |
|
2 |
#' |
|
3 |
#' Retrieve U.S. patent class definitions and subclasses from |
|
4 |
#' \href{https://www.uspto.gov/web/patents/classification}{USPTO Classification Resources}. |
|
5 |
#' |
|
6 |
#' @param code A vector of USPC classification symbols (e.g., \code{D14}). |
|
7 |
#' @param dir Directory in which to save results. |
|
8 |
#' @returns A list with an entry for \code{class} and \code{description} for the entered code, and \code{subclasses} |
|
9 |
#' containing a \code{data.frame} with a \code{subclass} (the subclass code) and \code{description} column. |
|
10 |
#' @examples |
|
11 |
#' \dontrun{ |
|
12 |
#' classifications <- get_class_info(c(428, 429)) |
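#' |
#' # a single design class, cached to a named directory (the path is hypothetical): |
#' d14 <- get_class_info("D14", dir = "classifications") |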
|
13 |
#' } |
|
14 |
#' @export |
|
15 | ||
16 |
get_class_info <- function(code, dir = tempdir()) { |
|
17 | 4x |
if (length(code) > 1) { |
18 | 1x |
return(Filter(length, lapply(structure(code, names = code), function(cc) { |
19 | 2x |
tryCatch(get_class_info(cc, dir), error = function(e) { |
20 | ! |
warning("failed to retrieve code ", cc, call. = FALSE) |
21 | ! |
NULL |
22 |
}) |
|
23 |
}))) |
|
24 |
} |
|
25 | 3x |
code <- toupper(code) |
26 | 3x |
dir.create(dir, FALSE, TRUE) |
27 | 3x |
dir <- normalizePath(dir, "/") |
28 | 3x |
output <- paste0(dir, "/uspc", code, ".json") |
29 | 3x |
if (file.exists(output)) { |
30 | 1x |
read_json(output, simplifyVector = TRUE) |
31 |
} else { |
|
32 | 2x |
req <- GET(paste0("https://www.uspto.gov/web/patents/classification/uspc", code, "/sched", code, ".htm")) |
33 | 2x |
if (req$status_code == 200) { |
34 | 2x |
res <- strsplit(rawToChar(req$content), "\n+")[[1]] |
35 | 2x |
res <- res[seq( |
36 | 2x |
grep("<table summary", res, fixed = TRUE)[[1]], |
37 | 2x |
grep("Start global footer", res, fixed = TRUE) |
38 |
)] |
|
39 | 2x |
title <- regmatches(res[1], gregexec(">([^<]+)</", res[1]))[[1]][2, 2] |
40 | 2x |
rows <- grep("<tr", res, fixed = TRUE) |
41 | 2x |
subclasses <- as.data.frame(do.call(rbind, lapply(strsplit(res[rows[-(1:2)]], "</td>", fixed = TRUE), function(r) { |
42 | 606x |
if (length(r) > 5) { |
43 | 592x |
sm <- regexec("', '([0-9.]+)'", r[2]) |
44 | 592x |
dm <- regexec("> *([A-Za-z][^>]+)<", r[5]) |
45 | 592x |
if (length(sm[[1]]) == 2 && length(dm[[1]]) == 2) { |
46 | 537x |
c( |
47 | 537x |
subclass = regmatches(r[2], sm)[[1]][2], |
48 | 537x |
description = sub(" *$", "", regmatches(r[5], dm)[[1]][2]) |
49 |
) |
|
50 |
} |
|
51 |
} |
|
52 |
}))) |
|
53 | 2x |
colnames(subclasses) <- c("subclass", "description") |
54 | 2x |
subclasses$subclass <- as.numeric(subclasses$subclass) |
55 | 2x |
res <- list(class = code, description = title, subclasses = subclasses) |
56 | 2x |
write_json(res, output, auto_unbox = TRUE) |
57 | 2x |
res |
58 |
} else { |
|
59 | ! |
stop("failed to retrieve definition of code ", code, call. = FALSE) |
60 |
} |
|
61 |
} |
|
62 |
} |
1 |
#' Download Patents or Patent Applications |
|
2 |
#' |
|
3 |
#' Downloads from the U.S. Pre-Grant Publications (US-PGPUB), Patents (USPAT), and/or |
|
4 |
#' Optical Character Recognition (USOCR) databases. See the \href{https://ppubs.uspto.gov/pubwebapp/static/pages/landing.html}{web app} |
|
5 |
#' to search for and view documents from the same source. |
|
6 |
#' |
|
7 |
#' @param guid Vector of document numbers (e.g., \code{US-20220230294-A1}). Can also be a \code{data.frame} of results, |
|
8 |
#' as returned from \code{\link{uspto_search}}; must have a \code{guid} column containing document numbers, |
|
9 |
#' and a \code{type} column with the source database abbreviation (e.g., \code{US-PGPUB}). |
|
10 |
#' @param outDir Path to a directory in which to save each individual document. If not specified, these are |
|
11 |
#' saved to a temporary directory. |
|
12 |
#' @param type A vector the same length as \code{guid} (repeated as necessary), indicating the source database of |
|
13 |
#' the \code{guid}. Defaults to \code{US-PGPUB} for every ID. |
|
14 |
#' @param cores Number of CPU cores to split requests across. |
|
15 |
#' @param compress Logical; if \code{FALSE}, will not xz-compress each file. |
|
16 |
#' @param load Logical; if \code{FALSE}, will not return content, or load existing files. |
|
17 |
#' @param verbose Logical; if \code{FALSE}, does not print status messages. |
|
18 |
#' @return A \code{data.frame} with a row for each document. |
|
19 |
#' @seealso You can more efficiently download granted patents using \code{\link{download_patents}}. |
|
20 |
#' @examples |
|
21 |
#' \dontrun{ |
|
22 |
#' # start with a search |
|
23 |
#' results <- uspto_search("G01W.CPCL.", "US-PGPUB") |
|
24 |
#' |
|
25 |
#' # then download those results |
|
26 |
#' applications <- uspto_download(results) |
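#' |
#' # or download specific documents by ID: |
#' application <- uspto_download("US-20220230294-A1") |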
|
27 |
#' } |
|
28 |
#' @export |
|
29 | ||
30 |
uspto_download <- function(guid, outDir = tempdir(), type = "US-PGPUB", cores = detectCores() - 2, compress = TRUE, load = TRUE, verbose = FALSE) { |
|
31 | 1x |
if (missing(type) && is.list(guid) && !is.null(guid$type)) type <- guid$type |
32 | 1x |
if (!is.character(guid)) guid <- guid$guid |
33 | ! |
if (!length(guid)) stop("unrecognized guid input; should be a data.frame with a guid column, or a character vector", call. = FALSE) |
34 | 3x |
invalid <- !grepl("[A-Z]{2}-\\d+-[A-Z]\\d+", guid, perl = TRUE) |
35 | 3x |
if (any(invalid)) { |
36 | ! |
if (any(!grepl("-", guid[invalid], fixed = TRUE))) guid[invalid] <- sub("(\\d+)", "-\\1-", guid[invalid], perl = TRUE) |
37 | ! |
invalid <- !grepl("[A-Z]{2}-\\d+-[A-Z]\\d+", guid, perl = TRUE) |
38 | ! |
if (any(invalid)) stop("invalid ID format: ", if (all(invalid)) "all" else guid[invalid]) |
39 |
} |
|
40 | 3x |
type <- rep_len(type, length(guid)) |
41 | 3x |
outDir <- paste0(normalizePath(outDir, "/", FALSE), "/") |
42 | 3x |
dir.create(outDir, FALSE, TRUE) |
43 | 3x |
urls <- paste0("https://ppubs.uspto.gov/dirsearch-public/patents/", guid, "/highlight?queryId=0&source=", type) |
44 | 1x |
if (verbose) message("retrieving ", length(urls), " documents; saving in ", outDir) |
45 | 3x |
retrieve_document <- function(url, out = outDir, comp = compress) { |
46 | 11x |
file <- paste0(out, gsub("^.*patents/|/highlight.*$", "", url), ".json", if (comp) ".xz") |
47 | 11x |
doc <- NULL |
48 | 11x |
if (file.exists(file)) { |
49 | 5x |
if (load) { |
50 | ! |
if (verbose) message("loading existing document: ", file) |
51 | 5x |
doc <- as.data.frame(read_json(file)[[1]]) |
52 | 5x |
doc$score <- as.numeric(doc$score) |
53 |
} else { |
|
54 | ! |
message("document exists; not loading") |
55 |
} |
|
56 |
} else { |
|
57 | 5x |
if (verbose) message("retrieving document: ", url) |
58 | 6x |
req <- GET(url) |
59 | 6x |
if (req$status_code == 200) { |
60 | 6x |
res <- content(req) |
61 | 6x |
doc <- as.data.frame(lapply(res, function(e) if (is.null(e)) "N/A" else if (is.list(e)) paste0(e, collapse = "; ") else e)) |
62 | 6x |
html <- grep("Html$", names(doc), TRUE, value = TRUE) |
63 | 6x |
for (e in html) { |
64 | 72x |
doc[[e]] <- gsub('<figref idref="([^"]+)">FIG\\. (\\d+)</figref>', "FIGREF FIGTYPE\\1 FIGID\\2", |
65 | 72x |
gsub("\\[(\\d+)\\]", "SECID SEC\\1", |
66 | 72x |
doc[[e]], |
67 | 72x |
perl = TRUE |
68 |
), |
|
69 | 72x |
perl = TRUE |
70 |
) |
|
71 |
} |
|
72 | 6x |
if (comp) { |
73 | 6x |
file <- xzfile(file) |
74 | 6x |
on.exit(close(file)) |
75 |
} |
|
76 | 6x |
write_json(doc, file, auto_unbox = TRUE) |
77 |
} |
|
78 |
} |
|
79 | 11x |
if (load) doc |
80 |
} |
|
81 | 3x |
res <- if (cores > 1) { |
82 | 3x |
cl <- parallel::makeCluster(max(1, min(length(urls), cores))) |
83 | 3x |
on.exit(parallel::stopCluster(cl), TRUE) |
84 | 3x |
parallel::parLapply(cl, urls, retrieve_document) |
85 |
} else { |
|
86 | ! |
lapply(urls, retrieve_document) |
87 |
} |
|
88 | 3x |
if (load) invisible(do.call(rbind, res)) |
89 |
} |
1 |
#' Retrieve Full Patent Text |
|
2 |
#' |
|
3 |
#' Search for and return a set of full patent texts. |
|
4 |
#' |
|
5 |
#' @param ids A vector of application numbers, patent numbers, or document IDs (e.g., \code{"US9748552"}). |
|
6 |
#' @param outFile Path to a \code{.csv} file to write results to. |
|
7 |
#' @param ids_type Specifies what \code{ids} are; one of \code{application} (default), \code{patent}, |
|
8 |
#' \code{publication} (for publication document IDs), or \code{grant} (for grant document IDs). |
|
9 |
#' @param query Search query; either a term/phrase (\code{searchText}) or set of Boolean criteria (\code{criteriaSearchText}). |
|
10 |
#' @param inventionTitle Invention title, as it appears on the first page of the specification. |
|
11 |
#' @param inventionSubjectMatterCategory Patent category (e.g., \code{DESIGN}). |
|
12 |
#' @param assigneeEntityName Name of the patent assignee. |
|
13 |
#' @param fromDate,toDate,filingDateFromDate,filingDateToDate Publication/grant and/or filing date ranges |
|
14 |
#' in \code{YYYY-MM-DD} format (e.g., \code{2019-01-20}). |
|
15 |
#' @param inventorNameText,claimText,abstractText,descriptionText Text to search for in specific portions; |
|
16 |
#' \code{query} will search in all portions if it is a term or phrase. |
|
17 |
#' @param sortField Field by which to sort results; defaults to date. |
|
18 |
#' @param ascending Logical; if \code{TRUE}, will sort in ascending order. |
|
19 |
#' @param grant Logical; if \code{TRUE}, searches for grants rather than publications. |
|
20 |
#' @param limit Maximum number of results to return, in steps of 100. |
|
21 |
#' @param series Series code to prepend to \code{ids} if they are application or patent numbers and not already included. |
|
22 |
#' @param cores Number of CPU cores to use when retrieving multiple pages of results. |
|
23 |
#' @param verbose Logical; if \code{TRUE}, will print status messages. |
|
24 |
#' @return A \code{data.frame} of results. |
|
25 |
#' @examples |
|
26 |
#' \dontrun{ |
|
27 |
#' # retrieve a specific set of patents: |
|
28 |
#' patents <- download_patents(c(16978600, 17087826, 16041592, 16727940, 17070594)) |
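#' |
#' # a sketch of a grant search by title keyword (parameters as documented above): |
#' grants <- download_patents(inventionTitle = "noodle", grant = TRUE, limit = 100) |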
|
29 |
#' } |
|
30 |
#' @export |
|
31 | ||
32 |
download_patents <- function(ids = NULL, outFile = NULL, ids_type = "application", query = NULL,
                             inventionTitle = NULL, inventionSubjectMatterCategory = NULL, assigneeEntityName = NULL,
                             fromDate = NULL, toDate = NULL, filingDateFromDate = NULL,
                             filingDateToDate = NULL, inventorNameText = NULL, claimText = NULL,
                             abstractText = NULL, descriptionText = NULL, sortField = NULL, ascending = FALSE,
                             grant = FALSE, limit = FALSE, series = "US", cores = detectCores() - 2, verbose = FALSE) {
  if (!is.null(outFile) && file.exists(outFile)) {
    if (verbose) message("reading in existing file: ", outFile)
    res <- read.csv(outFile)
  } else {
    args <- list(
      inventionSubjectMatterCategory = inventionSubjectMatterCategory,
      filingDateFromDate = filingDateFromDate,
      filingDateToDate = filingDateToDate,
      inventionTitle = inventionTitle,
      assigneeEntityName = assigneeEntityName,
      inventorNameText = inventorNameText,
      claimText = claimText,
      abstractText = abstractText,
      descriptionText = descriptionText,
      sortField = sortField,
      sortOrder = if (ascending) "asc" else "desc"
    )
    if (!is.null(ids)) {
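      # ids_type is matched by its first two letters to the corresponding API field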
      id_field <- c(
        ap = "patentApplicationNumber", pa = "patentNumber",
        do = paste0(if (grant) "grant" else "publication", "DocumentIdentifier"),
        pu = "publicationDocumentIdentifier", gr = "grantDocumentIdentifier"
      )[tolower(substring(ids_type, 1, 2))]
      if (!grant && grepl("grant", id_field, fixed = TRUE)) grant <- TRUE
      if (is.na(id_field)) stop("ids_type not recognized", call. = FALSE)
      args[id_field] <- format_numbers(ids, if (grepl("Document", id_field, fixed = TRUE)) "" else series)
    }
    if (!is.null(query)) {
      isCriteria <- grepl(":", query, fixed = TRUE)
      args[if (isCriteria) "criteriaSearchText" else "searchText"] <- query
      if (isCriteria) args$largTextSearchFlag <- "Y"
    }
    type <- if (grant) "grant" else "publication"
    if (!is.null(toDate)) args[paste0(type, "ToDate")] <- toDate
    if (!is.null(fromDate)) args[paste0(type, "FromDate")] <- fromDate
    if (is.numeric(limit) && limit < 100) args$rows <- limit
    args <- Filter(length, args)
    req <- GET(paste0(
      "https://developer.uspto.gov/ibd-api/v1/application/", if (grant) "grants" else "publications",
      "?", paste(paste0(names(args), "=", unlist(args, use.names = FALSE)), collapse = "&")
    ))
    res <- tryCatch(content(req), error = function(e) http_status(req))
    if (length(res) && !is.null(res$error)) stop("request failed: ", res$status, ": ", res$error, call. = FALSE)
    retrieved <- length(res$results)
    if (!retrieved) stop("no results were found", call. = FALSE)
    n <- if (is.numeric(limit)) min(res$recordTotalQuantity, limit) else res$recordTotalQuantity
    if (n > retrieved) {
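      # the API serves results in pages of 100; request the remaining pages by start offset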
      pages <- ceiling(n / 100) - 1
      base_url <- paste0(
        "https://developer.uspto.gov/ibd-api/v1/application/", if (grant) "grants" else "publications",
        "?", paste(paste0(names(args), "=", unlist(args, use.names = FALSE)), collapse = "&"), "&start="
      )
      urls <- vapply(seq_len(pages), function(i) paste0(base_url, i * 100), "")
      get_page <- function(url) {
        res <- tryCatch(content(GET(url))$results, error = function(e) NULL)
        if (is.null(res)) warning("failed to retrieve ", url)
        res
      }
      res2 <- if (cores > 1) {
        cl <- parallel::makeCluster(max(1, min(pages, cores)))
        on.exit(parallel::stopCluster(cl), TRUE)
        parallel::parLapply(cl, urls, get_page)
      } else {
        lapply(urls, get_page)
      }
      if (length(res2)) {
        res$results <- c(res$results, unlist(res2, recursive = FALSE))
      } else if (verbose) warning("failed to retrieve more than the initial results", call. = FALSE)
    }
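    # flatten the result list into a data.frame, collapsing list entries to space-separated strings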
    res <- as.data.frame(do.call(cbind, lapply(as.data.frame(do.call(rbind, lapply(res$results, function(d) {
      lapply(d, function(e) if (is.null(e)) NA else if (is.list(e)) paste(e, collapse = " ") else e)
    }))), unlist)))
    if (verbose) message("found ", nrow(res))
    if (!is.null(outFile)) {
      if (verbose) message("writing results to ", outFile)
      dir.create(dirname(outFile), FALSE, TRUE)
      write.csv(res, outFile, row.names = FALSE)
    }
  }
  res
}

format_numbers <- function(numbers, series) {
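  # strips "/" and "," separators and prepends the series code when missing;
  # e.g., "16/978,600" becomes "US16978600", while "US9748552" is unchanged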
  paste(sub("^[A-Z]{2}([A-Z]{2})", "\\1", paste0(series, gsub("[/,]+", "", numbers)), FALSE, perl = TRUE), collapse = ",")
}

#' Make Document-Class Matrix
#'
#' Create a document-class matrix from a PatentsView bulk table.
#'
#' @param table A matrix-like object with columns specified by \code{class_id}, \code{doc_id}, and \code{rank}.
#' Alternatively, the path to a tab-separated file containing such a matrix, an \code{Arrow} dataset, or the name
#' of a table, to be passed to \code{\link{download_patentsview_bulk}}.
#' @param outFile Path to an \code{.rds} file to save the results to.
#' @param class_id,doc_id,rank Column names of the class, document ID, and class rank to be pulled from \code{table}.
#' @param type Table name, used to set a default \code{class_id}.
#' @param ... Additional arguments to be passed to \code{\link{download_patentsview_bulk}},
#' if \code{table} is a table name.
#' @param sparse Logical; if \code{FALSE}, returns a regular, dense matrix.
#' @param overwrite Logical; if \code{TRUE}, overwrites an existing \code{outFile} rather than loading it.
#' @return A sparse matrix (or regular matrix if \code{sparse} is \code{FALSE}) with documents
#' in rows, classes in columns, and the class rank (sequence + 1) as values.
#' @examples
#' table <- data.frame(
#'   patent_id = c("a", "a", "b"),
#'   class = c(1, 3, 2),
#'   sequence = c(1, 0, 0)
#' )
#' patentsview_class_matrix(table, class_id = "class")
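#' # a 2 x 3 sparse matrix, with sequence + 1 as values:
#' #   1 2 3
#' # a 2 . 1
#' # b . 1 .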
#'
#' \dontrun{
#'
#' # get a matrix of WIPO class assignments
#' wipo_fields <- patentsview_class_matrix("wipo")
#'
#' # get a subset without creating the full matrix
#' wipo <- download_patentsview_bulk("wipo", make_db = TRUE)
#' wipo_fields_sub <- patentsview_class_matrix(dplyr::compute(dplyr::filter(
#'   wipo, patent_id %in% c("10000002", "10000015", "10000017")
#' )))
#' }
#' @export
patentsview_class_matrix <- function(table, outFile = NULL, class_id = NULL, doc_id = "patent_id", rank = "sequence",
                                     type = NULL, ..., sparse = TRUE, overwrite = FALSE) {
  m <- NULL
  if (!is.null(outFile)) {
    outFile <- sub("\\.[A-Za-z]{3}.*", ".rds", outFile)
    if (!overwrite && file.exists(outFile)) {
      m <- readRDS(outFile)
    }
  }
  if (is.null(m)) {
    if (is.character(table) && length(table) == 1) {
      type <- if (is.null(type)) basename(table) else type
      table <- if (file.exists(table)) {
        arrow::read_delim_arrow(table, delim = "\t", escape_backslash = TRUE)
      } else {
        download_patentsview_bulk(table, ..., make_db = TRUE, return_table = FALSE)
      }
    }
    if (!is.null(type)) {
      type <- substr(type, 1, 3)
      if (is.null(class_id)) {
        class_id <- switch(type,
          wip = "field_id",
          cpc = "group_id",
          ipc = "section",
          nbe = "subcategory_id",
          usp = "mainclass_id"
        )
      }
    } else if (is.null(class_id)) {
      class_id <- c("field_id", "group_id", "section", "subcategory_id", "mainclass_id")
      class_id <- class_id[class_id %in% names(table)]
      if (length(class_id) > 1) class_id <- class_id[[1]]
    }
    if (!length(class_id)) stop("specify a class_id or type from which it can be inferred", call. = FALSE)
    cols <- c(class_id, doc_id)
    if (!all(cols %in% names(table))) {
      stop("unrecognized column reference: ", paste(cols[!cols %in% names(table)], collapse = ", "))
    }
    if (!is.data.frame(table) && (!is.null(table$NewScan) || !is.null(table$GetColumnByName))) {
      if (!is.null(table$NewScan)) table <- table$NewScan()$Filter(arrow::Expression$scalar(TRUE))$Finish()$ToTable()
      classes <- sort(unique(table$GetColumnByName(class_id)))$as_vector()
      docs <- sort(unique(table$GetColumnByName(doc_id)))$as_vector()
      table <- as.data.frame(table)
    } else {
      if (inherits(table, "arrow_dplyr_query")) {
        stop(
          "enter a completed query (e.g., using dplyr::compute), or a data.frame",
          call. = FALSE
        )
      }
      classes <- sort(unique(table[[class_id]]))
      docs <- sort(unique(table[[doc_id]]))
    }
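    # store rank + 1 so that first-ranked classes (sequence 0) are kept as nonzero entries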
    m <- Matrix::sparseMatrix(
      as.integer(factor(table[[doc_id]], docs)), as.integer(factor(table[[class_id]], classes)),
      x = if (rank %in% names(table)) table[[rank]] + 1 else 1,
      dims = c(length(docs), length(classes)), dimnames = list(docs, classes)
    )
    if (!is.null(outFile)) saveRDS(m, outFile, compress = "xz")
  }
  if (sparse) m else as.matrix(m)
}

#' Download Office Actions
#'
#' Download a set of Office actions based on a query to the
#' \href{https://developer.uspto.gov/api-catalog/uspto-office-action-text-retrieval-api}{Office Action Text Retrieval API}.
#' @param criteria Search criteria, in the form of \code{"field:value"} (e.g., \code{"examinerEmployeeNumber:80488"}).
#' See available \href{https://developer.uspto.gov/ds-api/oa_actions/v1/fields}{fields}.
#' @param outFile Path to the resulting JSON file; defaults to a file named for the query and date
#' in a temporary directory.
#' @param start Initial record; useful if making requests in chunks.
#' @param limit Maximum number of office actions to return; under 1,000, or in steps of 1,000; defaults to all found.
#' @param overwrite Logical; if \code{TRUE}, will overwrite an existing file with the same \code{outFile}.
#' @param endpoint API endpoint.
#' @param compress Logical; if \code{FALSE}, will not write a compressed file.
#' @param cores Number of CPU cores to split calls across if necessary.
#' @param verbose Logical; if \code{FALSE}, will not print status messages.
#' @return A list of office actions.
#' @examples
#' \dontrun{
#' # retrieve the office actions associated with a particular application
#' office_actions <- download_office_actions("patentApplicationNumber:13877637")
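#'
#' # illustrative: up to 100 of a given examiner's office actions
#' examiner_actions <- download_office_actions("examinerEmployeeNumber:80488", limit = 100)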
#' }
#' @export
download_office_actions <- function(criteria = "*:*", outFile = NULL, start = 0, limit = FALSE, overwrite = FALSE,
                                    endpoint = "https://developer.uspto.gov/ds-api/oa_actions/v1/records",
                                    compress = TRUE, cores = detectCores() - 2, verbose = TRUE) {
  final <- normalizePath(paste0(if (is.null(outFile)) {
    paste0(tempdir(), "/", if (!missing(criteria)) paste0(sub(":", "_", criteria, fixed = TRUE), "_"), Sys.Date())
  } else {
    sub("\\.json.*$", "", outFile)
  }, ".json", if (compress) ".xz"), "/", FALSE)
  if (!overwrite && file.exists(final)) {
    if (verbose) message("Reading in existing results")
    return(invisible(read_json(final)))
  }
  query <- httr::POST(
    endpoint, httr::add_headers("Content-Type" = "application/x-www-form-urlencoded", Accept = "application/json"),
    body = paste0("criteria=", criteria, "&start=", start, "&rows=", if (is.numeric(limit)) min(1000, limit) else 1000)
  )
  if (200 != httr::status_code(query)) {
    error <- httr::http_error(query)
    stop("query failed:\n", if (is.logical(error)) httr::content(query) else error, call. = FALSE)
  }
  res <- httr::content(query)$response
  limit <- if (is.numeric(limit)) min(limit, res$numFound) else res$numFound
  if (verbose) {
    message(
      "Found ", res$numFound, " office actions for ",
      if (nchar(criteria) > 104) paste(substring(criteria, 1, 100), "...") else criteria
    )
  }
  if (length(res$docs) < limit) {
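    # each request returns at most 1,000 records; retrieve the rest in pages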
    retrieved <- length(res$docs)
    pages <- ceiling((limit - retrieved) / 1000)
    get_page <- function(i) {
      query <- httr::POST(
        endpoint, httr::add_headers("Content-Type" = "application/x-www-form-urlencoded", Accept = "application/json"),
        body = paste0("criteria=", criteria, "&start=", start + retrieved + (i - 1) * 1000, "&rows=1000")
      )
      if (200 == httr::status_code(query)) {
        httr::content(query)$response$docs
      }
    }
    res2 <- if (cores > 1) {
      cl <- parallel::makeCluster(max(1, min(pages, cores)))
      parallel::clusterExport(cl, "criteria", environment())
      on.exit(parallel::stopCluster(cl), TRUE)
      parallel::parLapply(cl, seq_len(pages), get_page)
    } else {
      lapply(seq_len(pages), get_page)
    }
    res$docs <- c(res$docs, unlist(res2, recursive = FALSE))
  }
  dir.create(dirname(final), FALSE, TRUE)
  if (compress) {
    con <- xzfile(final)
    on.exit(close(con), add = TRUE)
  }
  write_json(res$docs, if (compress) con else final, auto_unbox = TRUE)
  invisible(res$docs)
}