1 |
#' Search PatentsView |
|
2 |
#' |
|
3 |
#' Submits a query to one of the \href{https://patentsview.org}{PatentsView} endpoints. |
|
4 |
#' |
|
5 |
#' @param query A query string or list; for example \code{list(patent_number = 7861317)} or |
|
6 |
#' \code{'{"_text_all": {"patent_abstract": ["photographic", "noodle"]}}'}. See |
|
7 |
#' \href{https://patentsview.org/apis/api-query-language}{API Query Language}. |
|
8 |
#' @param fields A vector of fields to return; if \code{NULL} (default), requests all fields |
|
9 |
#' available for the given endpoint. |
|
10 |
#' @param sort A list or list of lists specifying how to sort results (e.g., \code{list(patent_date = "asc")}). |
|
11 |
#' @param outDir Directory in which to save results; defaults to a temporary directory. |
|
12 |
#' @param include_related Logical; if \code{TRUE}, includes subentities related to entities matching the search criteria. |
|
13 |
#' @param start Result of the search to start from. |
|
14 |
#' @param limit Maximum number of results to return; sets page size, up to 10,000 per page |
|
15 |
#' (1,000 if \code{legacy} is \code{FALSE}). |
|
16 |
#' @param endpoint Name of the API endpoint to search in, as listed on |
|
17 |
#' \href{https://patentsview.org/apis/api-endpoints}{PatentsView}. |
|
18 |
#' @param post Logical; if \code{FALSE}, will make a GET rather than POST request. |
|
19 |
#' @param retry Maximum number of times to retry a request, in the event of a rate limit. |
|
20 |
#' @param verbose Logical; if \code{FALSE}, does not print status messages. |
|
21 |
#' @param legacy Logical; if \code{FALSE}, will use the beta API, which requires a key. |
|
22 |
#' @param key Beta API key (\href{https://patentsview.org/apis/keyrequest}{Request a key}); |
|
23 |
#' defaults to the \code{PATENTSVIEW_KEY} environment variable. |
|
24 |
#' @param cores Number of CPU cores to use when retrieving multiple pages of results. |
|
25 |
#' @param overwrite Logical; if \code{TRUE}, overwrites an existing result. |
|
26 |
#' @return A \code{data.frame} of results, if any were found. This may contain list entries |
|
27 |
#' of \code{data.frame} with varying dimensions, depending on the \code{endpoint} and \code{fields}. |
|
28 |
#' @examples |
|
29 |
#' \dontrun{ |
|
30 |
#' # search for patents with a meteorology classification: |
|
31 |
#' # https://www.uspto.gov/web/patents/classification/cpc/html/cpc-G01W.html |
|
32 |
#' results <- uspto_search_patentsview(list(cpc_group_id = "G01W")) |
|
33 |
#' |
|
34 |
#' # search by date and keyword: |
|
35 |
#' results <- uspto_search_patentsview(list( |
|
36 |
#' "_and" = list( |
|
37 |
#' list(patent_date = "2002-01-08"), |
|
38 |
#' list("_text_any" = list(patent_abstract = "motorcycle")) |
|
39 |
#' ) |
|
40 |
#' )) |
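#' |
#' # a sketch of sorting and limiting results (sort and limit as documented above): |
#' results <- uspto_search_patentsview( |
#'   list(cpc_group_id = "G01W"), |
#'   sort = list(patent_date = "desc"), limit = 100 |
#' ) |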
|
41 |
#' } |
|
42 |
#' @export |
|
43 | ||
44 |
uspto_search_patentsview <- function(query, fields = NULL, sort = NULL, outDir = tempdir(), include_related = FALSE, start = 1, |
|
45 |
limit = 1e4, endpoint = "patent", post = TRUE, retry = 10, verbose = FALSE, legacy = TRUE, |
|
46 |
key = Sys.getenv("PATENTSVIEW_KEY"), cores = detectCores() - 1, overwrite = FALSE) { |
|
47 | ! |
if (key == "") legacy <- TRUE |
48 | 16x |
if (legacy) { |
49 | 4x |
page_limit <- 1e4 |
50 | 4x |
if (is.null(fields)) { |
51 | 4x |
all_fields <- GET(paste0( |
52 | 4x |
"https://raw.githubusercontent.com/PatentsView/PatentsView-API/master/querymodule/tests/testspec/", |
53 | 4x |
endpoint, "_master_field_list.json" |
54 |
)) |
|
55 | ! |
if (all_fields$status_code != 200) stop(endpoint, " is not a recognized endpoint", call. = FALSE) |
56 | 4x |
all_fields <- fromJSON(rawToChar(all_fields$content))$fields |
57 |
} |
|
58 | 4x |
if (!grepl("s$", endpoint)) endpoint <- paste0(endpoint, "s") |
59 | 4x |
url <- paste0("https://api.patentsview.org/", endpoint, "/query") |
60 | 4x |
per_page <- min(page_limit, limit) |
61 | 4x |
start <- ceiling(start / per_page) |
62 | 4x |
options <- list(page = start, per_page = per_page, matched_subentities_only = include_related) |
63 |
} else { |
|
64 | 12x |
page_limit <- 1e3 |
65 | 12x |
url <- paste0("https://search.patentsview.org/api/v1/", endpoint, "/") |
66 | 12x |
all_fields <- switch(endpoint, |
67 | 12x |
patent = c( |
68 | 12x |
"patent_number", "patent_title", "patent_date", "patent_type", "patent_country", "patent_year", |
69 | 12x |
"patent_abstract", "patent_kind", "patent_num_foreign_documents_cited", "patent_num_us_applications_cited", |
70 | 12x |
"patent_us_patents_cited", "patent_num_total_documents_cited", "patent_num_times_cited_by_us_patents", |
71 | 12x |
"patent_earliest_application_date", "patent_patent_processing_days", "patent_uspc_current_mainclass_average_patent_processing_days", |
72 | 12x |
"patent_cpc_current_group_average_patent_processing_days", "patent_term_extension", "patent_detail_desc_length", |
73 | 12x |
"assignees_at_grant", "inventors_at_grant", "cpc_current" |
74 |
), |
|
75 | 12x |
inventor = c( |
76 | 12x |
"inventor_id", "name_first", "name_last", "lastknown_city", "lastknown_state", "lastknown_country", |
77 | 12x |
"first_seen_date", "last_seen_date", "num_assignees", "num_patents", "years_active", "inventor_years" |
78 |
), |
|
79 | 12x |
assignee = c( |
80 | 12x |
"assignee_id", "first_seen_date", "last_seen_date", "lastknown_city", "lastknown_country", "lastknown_latitude", |
81 | 12x |
"lastknown_longitude", "lastknown_state", "name_first", "name_last", "num_inventors", "num_patents", "organization", |
82 | 12x |
"type", "years_active", "assignee_years" |
83 |
), |
|
84 | 12x |
cpc_subsection = c( |
85 | 12x |
"cpc_section_id", "cpc_subsection_id", "cpc_subsection_title", "cpc_subsection_num_patents", "cpc_subsection_num_assignees", |
86 | 12x |
"cpc_subsection_num_inventors", "cpc_subsection_first_seen_date", "cpc_subsection_last_seen_date", "cpc_subsection_years_active" |
87 |
), |
|
88 | 12x |
cpc_group = c( |
89 | 12x |
"cpc_group_id", "cpc_group_title", "cpc_group_num_patents", "cpc_group_num_assignees", "cpc_group_num_inventors", |
90 | 12x |
"cpc_group_first_seen_date", "cpc_group_last_seen_date", "cpc_group_years_active" |
91 |
), |
|
92 | 12x |
cpc_subgroup = c("cpc_subgroup_id", "cpc_subgroup_title"), |
93 | 12x |
uspc_mainclass = c( |
94 | 12x |
"uspc_mainclass_id", "uspc_mainclass_title", "uspc_mainclass_num_patents", "uspc_mainclass_num_assignees", |
95 | 12x |
"uspc_mainclass_num_inventors", "uspc_mainclass_first_seen_date", "uspc_mainclass_last_seen_date", "uspc_mainclass_years_active" |
96 |
), |
|
97 | 12x |
uspc_subclass = c("uspc_subclass_id", "uspc_subclass_title"), |
98 | 12x |
nber_category = c("nber_category_id", "nber_category_title"), |
99 | 12x |
nber_subcategory = c( |
100 | 12x |
"nber_subcategory_id", "nber_subcategory_title", "nber_subcategory_num_patents", |
101 | 12x |
"nber_subcategory_num_assignees", "nber_subcategory_num_inventors", "nber_subcategory_first_seen_date", |
102 | 12x |
"nber_subcategory_last_seen_date", "nber_subcategory_years_active" |
103 |
), |
|
104 | 12x |
patent_citation = c("patent_number", "cited_patent_number", "citation_category", "citation_sequence", "citation_date"), |
105 | 12x |
application_citation = c( |
106 | 12x |
"patent_number", "cited_application_number", "citation_category", "citation_kind", "citation_sequence", "citation_date" |
107 |
), |
|
108 | 12x |
NULL |
109 |
) |
|
110 | ! |
if (is.null(all_fields)) stop(endpoint, " is not a recognized endpoint", call. = FALSE) |
111 | 12x |
per_page <- min(page_limit, limit) |
112 | 12x |
start <- ceiling(start / per_page) |
113 | 12x |
options <- list(size = per_page, matched_subentities_only = include_related) |
114 |
} |
|
115 | ! |
if (is.character(query)) query <- jsonlite::fromJSON(query) |
116 | 16x |
fs <- if (is.null(fields)) all_fields else fields |
117 | 16x |
rf <- if (post) { |
118 | 14x |
function(page, attempt = 1) { |
119 | 13x |
if (is.null(options$page)) { |
120 | 11x |
options$offset <- page * 1e3 |
121 |
} else { |
|
122 | 2x |
options$page <- page + 1 |
123 |
} |
|
124 | 13x |
body <- list( |
125 | 13x |
q = query, |
126 | 13x |
f = fs, |
127 | 13x |
s = sort, |
128 | 13x |
o = options |
129 |
) |
|
130 | 13x |
req <- httr::POST( |
131 | 13x |
url, |
132 | 13x |
httr::add_headers("X-Api-Key" = key, "Content-Type" = "application/json", Accept = "application/json"), |
133 | 13x |
body = body, encode = "json" |
134 |
) |
|
135 | 13x |
if (req$status_code == 429 && attempt < retry) { |
136 | ! |
wait <- as.integer(req$headers[["retry-after"]]) |
137 | ! |
if (length(wait)) Sys.sleep(wait) |
138 | ! |
return(rf(page, attempt + 1)) |
139 |
} |
|
140 | 13x |
if (req$status_code == 200) { |
141 | 13x |
jsonlite::fromJSON(rawToChar(req$content)) |
142 |
} else { |
|
143 | ! |
req |
144 |
} |
|
145 |
} |
|
146 |
} else { |
|
147 | 2x |
function(page, attempt = 1) { |
148 | 2x |
if (is.null(options$page)) { |
149 | 1x |
options$offset <- page * 1e3 |
150 |
} else { |
|
151 | 1x |
options$page <- page + 1 |
152 |
} |
|
153 | 2x |
body <- paste0( |
154 | 2x |
"?q=", jsonlite::toJSON(query, auto_unbox = TRUE), |
155 | 2x |
paste0('&f=["', paste(fs, collapse = '","'), '"]'), |
156 | 2x |
if (length(sort)) paste0("&s=", jsonlite::toJSON(sort, auto_unbox = TRUE)), |
157 | 2x |
paste0("&o=", jsonlite::toJSON(options, auto_unbox = TRUE)) |
158 |
) |
|
159 | 2x |
req <- httr::GET( |
160 | 2x |
URLencode(paste0(url, body)), |
161 | 2x |
httr::add_headers("X-Api-Key" = key, "Content-Type" = "application/json", Accept = "application/json") |
162 |
) |
|
163 | 2x |
if (req$status_code == 429 && attempt < retry) { |
164 | ! |
wait <- as.integer(req$headers[["retry-after"]]) |
165 | ! |
if (length(wait)) Sys.sleep(wait) |
166 | ! |
return(rf(page, attempt + 1)) |
167 |
} |
|
168 | 2x |
if (req$status_code == 200) { |
169 | 2x |
jsonlite::fromJSON(rawToChar(req$content)) |
170 |
} else { |
|
171 | ! |
req |
172 |
} |
|
173 |
} |
|
174 |
} |
|
175 | 16x |
dir.create(outDir, FALSE, TRUE) |
176 | 16x |
cache <- paste0(normalizePath(outDir, "/"), "/", digest::digest(list(url, query, fs, sort, options)), ".json") |
177 | 16x |
if (!overwrite && file.exists(cache)) { |
178 | 1x |
if (verbose) message("reading existing results") |
179 | 1x |
return(jsonlite::read_json(cache, simplifyVector = TRUE)) |
180 |
} |
|
181 | 15x |
res <- rf(0) |
182 | 15x |
if (!is.null(res$status_code)) { |
183 | ! |
stop( |
184 | ! |
"request failed: ", res$status_code, "; ", if (res$status_code == 400) { |
185 | ! |
paste0( |
186 | ! |
res$headers$`x-status-reason`, |
187 | ! |
if (!is.null(res$headers$`x-status-reason-code`)) paste0(" (", res$headers$`x-status-reason-code`, ")") |
188 |
) |
|
189 |
} else { |
|
190 | ! |
jsonlite::toJSON(jsonlite::fromJSON(rawToChar(res$content)), auto_unbox = TRUE, pretty = TRUE) |
191 |
}, |
|
192 | ! |
call. = FALSE |
193 |
) |
|
194 |
} |
|
195 | 15x |
if (!is.null(res$error) && res$error) { |
196 | ! |
stop( |
197 | ! |
"request failed: ", jsonlite::toJSON(res, auto_unbox = TRUE, pretty = TRUE), |
198 | ! |
call. = FALSE |
199 |
) |
|
200 |
} |
|
201 | 15x |
total_name <- grep("total_", names(res), fixed = TRUE, value = TRUE) |
202 | 15x |
rn <- names(res)[!names(res) %in% c(total_name, "count", "total_hits", "error")] |
203 | 15x |
total <- res[[total_name]] |
204 | ! |
if (is.null(total) || total == 0) stop("no results found", call. = FALSE) |
205 | 15x |
limit <- min(limit, total) - res$count |
206 | 15x |
if (limit > 0) { |
207 | ! |
pages <- ceiling(limit / page_limit) |
208 | ! |
cores <- min(cores, pages) |
209 | ! |
res2 <- if (cores > 1) { |
210 | ! |
env <- new.env(parent = globalenv()) |
211 | ! |
environment(rf) <- env |
212 | ! |
cl <- makeCluster(cores) |
213 | ! |
on.exit(stopCluster(cl)) |
214 | ! |
for (o in c("cl", "start", "pages", "key", "rf", "url", "query", "fs", "sort", "options", "retry")) env[[o]] <- get(o) |
215 | ! |
clusterExport(cl, ls(envir = env), environment()) |
216 | ! |
eval(expression(parallel::parLapply(cl, seq(start, pages), rf)), envir = env) |
217 |
} else { |
|
218 | ! |
lapply(seq(start, pages), rf) |
219 |
} |
|
220 | ! |
res[[rn]] <- do.call(rbind, c(list(res[[rn]]), lapply(res2, "[[", rn))) |
221 |
} |
|
222 | ! |
if (verbose) message("found ", total, ", returning ", nrow(res[[rn]])) |
223 | 15x |
jsonlite::write_json(res[[rn]], cache, auto_unbox = TRUE) |
224 | 15x |
res[[rn]] |
225 |
} |
1 |
#' Extract Timings from Examination Events |
|
2 |
#' |
|
3 |
#' Extract timings between select event codes within an examination timeline. |
|
4 |
#' @param record A matrix-like object with dates in the first column (recognizable by |
|
5 |
#' \code{\link{as.Date}}) and event codes (recognizable by the patterns in \code{spans}) in the second. |
|
6 |
#' @param spans A list with entries defining each event window. Each entry should be a list with |
|
7 |
#' entries for \code{start}, \code{end}, and \code{before}, each containing regular expression |
|
8 |
#' strings identifying event codes. |
|
9 |
#' @returns A list with a \code{data.frame} for each entry in \code{spans}. |
|
10 |
#' @examples |
|
11 |
#' extract_event_timing(data.frame( |
|
12 |
#' date = c("2020-01-15", "2020-01-20"), |
|
13 |
#' event = c("FWDX", "CTNF") |
|
14 |
#' )) |
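#' |
#' # a sketch of a custom span definition, reusing event codes from the defaults: |
#' extract_event_timing( |
#'   data.frame(date = c("2020-01-15", "2020-01-20"), event = c("FWDX", "CTNF")), |
#'   spans = list("Examiner to Action" = list(start = "^FWDX$", end = "^CT", before = "^DOCK$")) |
#' ) |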
|
15 |
#' @export |
|
16 | ||
17 |
extract_event_timing <- function(record, spans = list( |
|
18 |
"Initial Classification" = list(start = "^PGPC$", end = "^(?:OI|PG$)", before = "^DOCK$"), |
|
19 |
"Art Unit to Examiner" = list(start = "^DOCK$", end = "^FWDX$", before = "^(?:CTFR|EX\\.R|N/=\\.)"), |
|
20 |
"Examiner to Action" = list(start = "^FWDX$", end = "^(?:CT|EX\\.R|N/=\\.)", before = "^(?:DOCK$|A)") |
|
21 |
)) { |
|
22 | 1x |
if (is.list(record) && !is.data.frame(record)) { |
23 | ! |
if ("prosecutionHistoryDataBag" %in% names(record)) { |
24 | ! |
record <- record$prosecutionHistoryDataBag$prosecutionHistoryData |
25 | ! |
} else if ("prosecutionHistoryDataBag" %in% names(record[[1]])) { |
26 | ! |
timings <- lapply(record, function(r) extract_event_timing(r$prosecutionHistoryDataBag$prosecutionHistoryData)) |
27 | ! |
names(timings) <- vapply(record, function(r) r$patentCaseMetadata$applicationNumberText$value, "") |
28 | ! |
return(timings) |
29 |
} |
|
30 | ! |
record <- as.data.frame(lapply( |
31 | ! |
as.data.frame(do.call(rbind, record)), unlist, |
32 | ! |
recursive = FALSE, use.names = FALSE |
33 |
)) |
|
34 |
} |
|
35 | ! |
if (ncol(record) < 2) stop("record is not in the expected format", call. = FALSE) |
36 | 1x |
record[, 1] <- as.Date(record[, 1], tryFormats = "%Y-%m-%d", optional = TRUE) |
37 | ! |
if (anyNA(record[, 1])) record <- record[!is.na(record[, 1]), ] |
38 | 1x |
record <- record[order(record[, 1]), ] |
39 | 1x |
dates <- record[, 1] |
40 | 1x |
events <- record[, 2] |
41 | 1x |
n <- length(events) |
42 | 1x |
lapply(spans, function(s) { |
43 | 3x |
pos <- lapply(s, grep, events) |
44 | 3x |
if (length(pos$start)) { |
45 | 1x |
res <- list() |
46 | 1x |
lastEnd <- 0 |
47 | 1x |
current <- 0 |
48 | 1x |
for (i in seq_along(pos$start)) { |
49 | 1x |
p <- pos$start[i] |
50 | 1x |
b <- pos$before[pos$before > p] |
51 | 1x |
b <- if (length(b)) b[1] else Inf |
52 | 1x |
e <- pos$end[pos$end > current & pos$end > p & pos$end < b] |
53 | 1x |
if (length(e)) { |
54 | 1x |
current <- e[1] |
55 | 1x |
res[[i]] <- data.frame( |
56 | 1x |
start = n - p + 1, startDate = dates[p], startEvent = events[p], |
57 | 1x |
end = n - current + 1, endDate = dates[current], endEvent = events[current], |
58 | 1x |
days = as.numeric(dates[current] - dates[p]) |
59 |
) |
|
60 |
} |
|
61 |
} |
|
62 | 1x |
do.call(rbind, res) |
63 |
} |
|
64 |
}) |
|
65 |
} |
1 |
#' Search for Patents or Patent Applications (legacy) |
|
2 |
#' |
|
3 |
#' Searches the Application (AppFT) or Patent (PatFT) |
|
4 |
#' \href{https://appft.uspto.gov}{Full-Text Databases}, and returns |
|
5 |
#' document/patent numbers and titles. |
|
6 |
#' |
|
7 |
#' @param query A query string. This can be a set of terms (e.g., \code{"term1 OR term2"}) or |
|
8 |
#' an exact phrase (e.g., \code{'"a phrase"'}). To search within a particular |
|
9 |
#' \href{https://appft.uspto.gov/netahtml/PTO/help/helpflds.html}{field}, prefix with the field code |
|
10 |
#' and a forward slash (e.g., \code{"CPCL/G01W"}, which searches for "G01W" in the CPCL field). |
|
11 |
#' @param applications Logical; specifies whether to search in the application database |
|
12 |
#' (AppFT; \code{TRUE}; default) or the patent database (PatFT; \code{FALSE}). |
|
13 |
#' @param limit A limit to the number of results retrieved, in steps of 50. By default, will |
|
14 |
#' try to retrieve all results. |
|
15 |
#' @param retries Number of retries for a given results page. |
|
16 |
#' @param outdir Path to a directory in which to save results. Only writes if specified. |
|
17 |
#' @param overwrite Logical; if \code{TRUE}, will overwrite caches results. |
|
18 |
#' @param cores Number of CPU cores to use when reading in multiple pages. |
|
19 |
#' @param verbose Logical; if \code{FALSE}, does not print status messages. |
|
20 |
#' @return A \code{data.frame} with columns for \code{index} (index of the result), |
|
21 |
#' \code{id} (patent number or application document number), and \code{title} (title of patent or application). |
|
22 |
#' @seealso For the newer search mechanism, see \code{\link{uspto_search}}. |
|
23 |
#' @examples |
|
24 |
#' \dontrun{ |
|
25 |
#' # search for applications with a meteorology classification: |
|
26 |
#' # https://www.uspto.gov/web/patents/classification/cpc/html/cpc-G01W.html |
|
27 |
#' results <- search_fulltext("CPCL/G01W") |
|
28 |
#' |
|
29 |
#' # search for patents by date and title keyword |
|
30 |
#' results <- search_fulltext("isd/1/8/2002 and motorcycle", FALSE) |
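#' |
#' # limit retrieval to the first page of 50 results: |
#' results <- search_fulltext("CPCL/G01W", limit = 50) |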
|
31 |
#' } |
|
32 |
#' @export |
|
33 | ||
34 |
search_fulltext <- function(query, applications = TRUE, limit = NULL, retries = 20, outdir = NULL, |
|
35 |
overwrite = FALSE, cores = parallel::detectCores() - 2, verbose = TRUE) { |
|
36 | 2x |
base_url <- paste0( |
37 | 2x |
"https://", if (applications) "app" else "pat", |
38 | 2x |
"ft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=/netahtml/PTO/search-adv.html&r=0&f=S&l=50&d=", |
39 | 2x |
if (applications) "PG01" else "PTXT", "&p=" |
40 |
) |
|
41 | 2x |
query <- URLencode(gsub("\\s+", "+", query, perl = TRUE)) |
42 | 2x |
outFile <- normalizePath(paste0(outdir, "/", digest::digest(query), ".csv"), "/", FALSE) |
43 | ! |
if (overwrite) unlink(outFile) |
44 | 2x |
if (file.exists(outFile)) { |
45 | ! |
if (verbose) message("reading existing file: ", outFile) |
46 | ! |
read.csv(outFile) |
47 |
} else { |
|
48 | 2x |
retrieve_page <- function(input, attempt = retries, parse = TRUE) { |
49 | 3x |
results <- tryCatch( |
50 |
{ |
|
51 | 3x |
con <- curl::curl(paste0(base_url, input, "&Query=", query)) |
52 | 3x |
on.exit(close(con)) |
53 | 3x |
readLines(con, warn = FALSE) |
54 |
}, |
|
55 | 3x |
error = function(e) e$message |
56 |
) |
|
57 | 3x |
if (length(results) < 100) { |
58 | ! |
if (attempt > 0) { |
59 | ! |
Sys.sleep(.5) |
60 | ! |
retrieve_page(input, attempt - 1, parse) |
61 |
} else { |
|
62 | ! |
NULL |
63 |
} |
|
64 |
} else { |
|
65 | 2x |
if (parse) parse_page(results) else results |
66 |
} |
|
67 |
} |
|
68 | 2x |
parse_page <- function(results) { |
69 | 3x |
rows <- c(grep("^<TR><TD", results, perl = TRUE), length(results)) |
70 | 3x |
res <- as.data.frame(do.call(rbind, lapply(seq_len(length(rows) - 1), function(i) { |
71 | 109x |
row <- results[seq(rows[i], rows[i + 1])] |
72 | 109x |
split_row <- strsplit(c(row[1:2], paste(row[-(1:2)], collapse = " ")), "[><]") |
73 | 109x |
c(split_row[[1]][5], split_row[[2]][5], split_row[[3]][if (split_row[[3]][5] == "") 11 else 5]) |
74 | 3x |
})), make.names = FALSE, optional = TRUE) |
75 | 3x |
colnames(res) <- c("index", "id", "name") |
76 | 3x |
res$index <- as.integer(res$index) |
77 | 3x |
res$name <- sub("\\s+$", "", gsub("\\s+", " ", res$name, perl = TRUE), perl = TRUE) |
78 | 3x |
res |
79 |
} |
|
80 | 2x |
initial <- retrieve_page(1, retries, FALSE) |
81 | ! |
if (length(initial) < 30) stop("Failed to retrieve initial page: ", initial, call. = FALSE) |
82 | 2x |
report <- grep("Results of Search", initial[1:100], fixed = TRUE) |
83 | ! |
if (!length(report)) stop("Response is in an unexpected format; check ", paste0(base_url, 1, "&Query=", query), call. = FALSE) |
84 | 2x |
n <- as.numeric(regmatches(initial[report], regexec("(\\d+)\\s\\w+\\.$", initial[report]))[[1]][2]) |
85 | ! |
if (verbose) message("found ", n, " results", if (is.numeric(limit)) paste("; retrieving", limit)) |
86 | 1x |
if (is.numeric(limit)) n <- min(limit, n) |
87 | 2x |
pages <- ceiling(n / 50) |
88 | 2x |
res <- if (pages > 1) { |
89 | 1x |
res <- if (cores > 1) { |
90 | 1x |
cl <- parallel::makeCluster(max(1, min(pages, cores))) |
91 | 1x |
parallel::clusterExport(cl, c("base_url", "query", "retrieve_page", "parse_page"), environment()) |
92 | 1x |
on.exit(parallel::stopCluster(cl), TRUE) |
93 | 1x |
parallel::parLapply(cl, seq(2, pages), retrieve_page) |
94 |
} else { |
|
95 | ! |
lapply(seq(2, pages), retrieve_page) |
96 |
} |
|
97 | 1x |
if (verbose) { |
98 | ! |
missed <- vapply(res, length, 0) == 0 |
99 | ! |
if (any(missed)) warning("missed pages: ", paste(which(missed) + 1, collapse = ", ")) |
100 |
} |
|
101 | 1x |
rbind(parse_page(initial), do.call(rbind, res)) |
102 |
} else { |
|
103 | 1x |
parse_page(initial) |
104 |
} |
|
105 | 2x |
if (!is.null(outdir)) { |
106 | 2x |
dir.create(outdir, FALSE, TRUE) |
107 | 2x |
write.csv(res, outFile, row.names = FALSE) |
108 |
} |
|
109 | 2x |
res |
110 |
} |
|
111 |
} |
1 |
#' Download Patent Examination Data |
|
2 |
#' |
|
3 |
#' Download a bundled query response from the \href{https://ped.uspto.gov/peds}{Patent Examination Data System}. |
|
4 |
#' |
|
5 |
#' @param searchText Text to search for; \code{"*:*"} for none (default); see |
|
6 |
#' the FAQ on the \href{https://ped.uspto.gov/peds}{PEDS site}. |
|
7 |
#' @param filters A list of characters representing filter conditions in the form of \code{"field:value"} |
|
8 |
#' (e.g., \code{list('appStatus:"Patented Case"')}). |
|
9 |
#' @param outFile Name of the final JSON file, or directory in which to save files with parameter-hash names. |
|
10 |
#' @param start Initial record. |
|
11 |
#' @param minMatch Minimum number of terms required to match. |
|
12 |
#' @param overwrite Logical; overwrite any previous queries with the same body hash. |
|
13 |
#' @param waits Number of times to check for a completed bundle before giving up. |
|
14 |
#' @param wait Number of seconds to wait between status checks (the duration of each of the \code{waits}). |
|
15 |
#' @param endpoint PEDS API endpoint. |
|
16 |
#' @param compress Logical; if \code{FALSE}, will not xz-compress the \code{outFile}. |
|
17 |
#' @param load Logical; if \code{FALSE}, will not return the downloaded content, or load in existing content. |
|
18 |
#' @param verbose Logical; if \code{FALSE}, will not print status messages. |
|
19 |
#' @return A list with an entry for each year in the set of patent metadata. |
|
20 |
#' @examples |
|
21 |
#' \dontrun{ |
|
22 |
#' # like case 3 in the API Documentation > API Tutorial on https://ped.uspto.gov/peds |
|
23 |
#' download_peds( |
|
24 |
#' "firstNamedApplicant:(Google)", |
|
25 |
#' list('appStatus:"Patented Case"', "appFilingDate:[2013-01-01T00:00:00Z TO 2013-03-31T23:59:59Z]") |
|
26 |
#' ) |
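#' |
#' # a sketch of saving to a named, uncompressed file (the path is hypothetical): |
#' download_peds("applId:10612573", outFile = "peds_10612573.json", compress = FALSE) |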
|
27 |
#' } |
|
28 |
#' @export |
|
29 | ||
30 |
download_peds <- function(searchText = "*:*", filters = list("*:*"), outFile = NULL, start = 0, minMatch = "100%", overwrite = FALSE, |
|
31 |
waits = 50, wait = 10, endpoint = "https://ped.uspto.gov/api/queries/", compress = TRUE, load = TRUE, verbose = TRUE) { |
|
32 |
# https://ped.uspto.gov/api/search-params |
|
33 | 2x |
body <- list( |
34 | 2x |
searchText = searchText, |
35 | 2x |
fq = filters, |
36 | 2x |
fl = "*", |
37 | 2x |
mm = minMatch, |
38 | 2x |
df = "patentTitle", |
39 | 2x |
qf = paste( |
40 | 2x |
"appEarlyPubNumber applId appLocation appType appStatus_txt appConfrNumber appCustNumber", |
41 | 2x |
"appGrpArtNumber appCls appSubCls appEntityStatus_txt patentNumber patentTitle primaryInventor", |
42 | 2x |
"firstNamedApplicant appExamName appExamPrefrdName appAttrDockNumber appPCTNumber", |
43 | 2x |
"appIntlPubNumber wipoEarlyPubNumber pctAppType firstInventorFile appClsSubCls rankAndInventorsList" |
44 |
), |
|
45 | 2x |
facet = FALSE, |
46 | 2x |
sort = "applId asc", |
47 | 2x |
start = start |
48 |
) |
|
49 | 2x |
hash <- digest::digest(body) |
50 | 2x |
final <- normalizePath(paste0(if (is.null(outFile) || dir.exists(outFile)) { |
51 | 1x |
paste0(if (is.null(outFile)) tempdir() else outFile, "/", hash) |
52 |
} else { |
|
53 | 1x |
sub("\\.json.*$", "", outFile) |
54 | 2x |
}, ".json", if (compress) ".xz"), "/", FALSE) |
55 | 2x |
if (!overwrite && file.exists(final)) { |
56 | ! |
if (load) { |
57 | ! |
if (verbose) message("Reading in existing results") |
58 | ! |
return(invisible(jsonlite::read_json(final, simplifyDataFrame = FALSE))) |
59 |
} else { |
|
60 | ! |
if (verbose) message("Results already exist") |
61 | ! |
return() |
62 |
} |
|
63 |
} |
|
64 | 2x |
bundle <- paste0(normalizePath(tempdir(), "/"), "/", hash, ".zip") |
65 | 2x |
exdir <- sub(".zip", "", bundle, fixed = TRUE) |
66 | 2x |
output <- list(ID = "", content = list()) |
67 | 2x |
if (load) on.exit(return(invisible(output$content))) |
68 | 2x |
if (overwrite || (!file.exists(bundle) && !dir.exists(exdir))) { |
69 | 2x |
query <- httr::POST( |
70 | 2x |
endpoint, httr::add_headers("Content-Type" = "application/json", Accept = "application/json"), |
71 | 2x |
body = body, encode = "json" |
72 |
) |
|
73 | 2x |
if (200 != httr::status_code(query)) { |
74 | ! |
error <- httr::http_error(query) |
75 | ! |
errorContent <- httr::content(query) |
76 | ! |
stop("query failed:\n", if (length(errorContent)) errorContent else if (is.logical(error)) httr::status_code(query) else error, call. = FALSE) |
77 |
} |
|
78 | 2x |
res <- httr::content(query) |
79 | 2x |
output$ID <- ID <- res$queryId |
80 | ! |
if (0 == res$queryResults$searchResponse$response$numFound) stop("No records found for ", ID, call. = FALSE) |
81 | 2x |
httr::PUT(paste0(endpoint, ID, "/package?format=JSON")) |
82 | ! |
if (verbose) message("Package requested for ", res$queryResults$searchResponse$response$numFound, " record(s): ", ID) |
83 | 2x |
retries <- 1 |
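# poll the query's job status until the bundle is COMPLETED, up to 'waits' times |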
84 | 2x |
while (jsonlite::read_json(paste0(endpoint, ID))$jobStatus != "COMPLETED") { |
85 | ! |
if (retries > waits) stop("hit wait limit without seeing COMPLETED status for ", ID) |
86 | 3x |
retries <- retries + 1 |
87 | ! |
if (verbose) cat(paste0("\rwaiting for bundle (try: ", retries, ") ... ")) |
88 | 3x |
Sys.sleep(wait) |
89 |
} |
|
90 | ! |
if (verbose) cat("complete\n") |
91 | 2x |
url <- paste0(endpoint, ID, "/download?format=JSON") |
92 | 2x |
download.file(url, bundle, mode = "wb") |
93 |
} |
|
94 | 2x |
if (file.exists(bundle)) { |
95 | ! |
if (verbose) message("Unpacking ", bundle) |
96 | 2x |
system2("unzip", c("-d", shQuote(exdir), shQuote(bundle)), stdout = TRUE) |
97 | 2x |
unlink(bundle) |
98 |
} |
|
99 | 2x |
output$content <- unlist(lapply(list.files(exdir, full.names = TRUE), function(f) { |
100 | 2x |
res <- tryCatch(jsonlite::read_json(f, simplifyDataFrame = FALSE)$PatentData, error = function(e) NULL) |
101 | 2x |
if (is.null(res)) { |
102 | ! |
con <- gzfile(f) |
103 | ! |
on.exit(close(con)) |
104 | ! |
res <- tryCatch(fromJSON(paste( |
105 | ! |
c(scan(con, "", quote = "", na.strings = "", quiet = TRUE), "]}"), |
106 | ! |
collapse = " " |
107 | ! |
), simplifyDataFrame = FALSE)$PatentData, error = function(e) NULL) |
108 |
} |
|
109 | 2x |
res |
110 | 2x |
}), recursive = FALSE, use.names = FALSE) |
111 | ! |
if (verbose) message("writing final results: ", normalizePath(final, "/", FALSE)) |
112 | 2x |
dir.create(dirname(final), FALSE, TRUE) |
113 | 2x |
if (compress) { |
114 | 2x |
final <- xzfile(final) |
115 | 2x |
on.exit(close(final), add = TRUE) |
116 |
} |
|
117 | 2x |
jsonlite::write_json(output$content, final, auto_unbox = TRUE) |
118 |
} |
1 |
#' Download PatentsView Bulk |
|
2 |
#' |
|
3 |
#' Download one of the bulk tables from \href{https://patentsview.org}{PatentsView}, and optionally |
|
4 |
#' make an Arrow dataset from it. |
|
5 |
#' |
|
6 |
#' @param table Name of the table to download, as listed on the |
|
7 |
#' \href{https://patentsview.org/download/data-download-tables}{Data Downloads page}. |
|
8 |
#' @param dir Directory in which to save original tables and the dataset if \code{make_db} is \code{TRUE}. |
|
9 |
#' @param pregrant Logical; if \code{TRUE}, will download the pre-grant version of \code{table}. |
|
10 |
#' @param partition A vector of column names to be used as partition keys, if \code{make_db} is \code{TRUE}. |
|
11 |
#' Can be a named list of functions, which will be used to create new columns, then partition by those columns |
|
12 |
#' (e.g., \code{list(series_code = function(d) substr(d$patent_id, 1, 2))}). If not specified, will choose |
|
13 |
#' a column with the number of unique values closest to 20. |
|
14 |
#' @param make_db Logical; if \code{TRUE}, will make an Arrow dataset out of the downloaded table. |
|
15 |
#' @param format Format of the dataset, if \code{make_db} is \code{TRUE}. |
|
16 |
#' @param return_table Logical; if \code{FALSE}, returns the path to the file, rather than the read-in table. |
|
17 |
#' @param overwrite Logical; if \code{TRUE}, overwrites any existing files (raw and prepared). |
|
18 |
#' @return The original table (if \code{return_table} is \code{TRUE}; as a \code{tibble}), an opened dataset |
|
19 |
#' (if the path to the dataset exists), or the path to the downloaded file. |
|
20 |
#' @examples |
|
21 |
#' \dontrun{ |
|
22 |
#' # download the application table of granted patents |
|
23 |
#' download_patentsview_bulk(".", "application", partition = "series_code") |
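#' |
#' # the same partition derived with a function, as in the partition argument docs: |
#' download_patentsview_bulk("application", ".", |
#'   partition = list(series_code = function(d) substr(d$patent_id, 1, 2)) |
#' ) |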
|
24 |
#' } |
|
25 |
#' @export |
|
26 | ||
27 |
download_patentsview_bulk <- function(table, dir = tempdir(), pregrant = FALSE, partition = NULL, make_db = FALSE, |
|
28 |
format = "parquet", return_table = TRUE, overwrite = FALSE) { |
|
29 | 7x |
url <- paste0( |
30 | 7x |
"https://s3.amazonaws.com/data.patentsview.org/", |
31 | 7x |
if (pregrant) "pregrant_publications" else "download", |
32 | 7x |
"/", table, ".tsv.zip" |
33 |
) |
|
34 | 7x |
dir.create(dir, FALSE, TRUE) |
35 | 7x |
dir <- paste0(normalizePath(dir, "/"), "/") |
36 | 7x |
file_zip <- paste0(dir, table, ".tsv.zip") |
37 | 7x |
file <- paste0(dir, table, if (pregrant) "_pg" else "_g", ".tsv") |
38 | 7x |
dbd <- paste0(dir, table, "_db") |
39 | ! |
if (overwrite) unlink(c(file_zip, file, dbd), TRUE, TRUE) |
40 | 7x |
if (!file.exists(file)) { |
41 | 3x |
if (!file.exists(file_zip)) download.file(url, file_zip) |
42 | 3x |
unzip(file_zip, exdir = dir) |
43 | 3x |
file.rename(sub(".zip", "", file_zip, fixed = TRUE), file) |
44 | 3x |
unlink(file_zip) |
45 |
} |
|
46 | 7x |
raw <- NULL |
47 | 1x |
if (missing(make_db) && (!missing(partition) || !missing(format))) make_db <- TRUE |
48 | 2x |
if (missing(return_table) && make_db) return_table <- FALSE |
49 | 7x |
if (make_db) { |
50 | 4x |
raw <- vroom::vroom(file, delim = "\t", show_col_types = FALSE) |
51 | 4x |
if (is.null(partition)) { |
52 | 3x |
partition <- names(which.min(abs(20 - vapply(raw, function(x) length(unique(x)), 0)))) |
53 | 1x |
} else if (is.list(partition)) { |
54 | 1x |
for (n in names(partition)) { |
55 | 1x |
raw[[n]] <- partition[[n]](raw) |
56 |
} |
|
57 | 1x |
partition <- names(partition) |
58 |
} |
|
59 | 4x |
arrow::write_dataset(raw, dbd, partitioning = partition, format = format) |
60 |
} |
|
61 | 7x |
if (!return_table && dir.exists(dbd)) { |
62 | 4x |
arrow::open_dataset(dbd, format = format) |
63 |
} else { |
|
64 | 3x |
if (return_table) { |
65 | 2x |
if (is.null(raw)) { |
66 | 1x |
vroom::vroom(file, delim = "\t", show_col_types = FALSE) |
67 |
} else { |
|
68 | 1x |
raw |
69 |
} |
|
70 |
} else { |
|
71 | 1x |
file |
72 |
} |
|
73 |
} |
|
74 |
} |
1 |
#' Get Examiner Name from Office Actions |
|
2 |
#' |
|
3 |
#' Extract the examiner's name from office action body text. |
|
4 |
#' @param office_action A list of office actions, as retrieved with \code{\link{download_office_actions}}; |
|
5 |
#' each entry should be a list with entries for \code{bodyText} (the office action text) and |
|
6 |
#' \code{examinerEmployeeNumber}. |
|
7 |
#' @param collapse Logical; if \code{FALSE}, will return a row for every entered office action. |
|
8 |
#' @return A character vector with the Examiner's employee number and two possible names: one |
|
9 |
#' extracted from the language (\code{intext}) and another from the sign-off (\code{signed}). The name |
|
10 |
#' will be \code{NA} if there is no examiner (such as if a non-examiner submitted the action), or |
|
11 |
#' if the format is not as expected. If \code{office_action} is a list of multiple office actions, |
|
12 |
#' these will make up rows of a \code{data.frame}, which will be as many as the length of |
|
13 |
#' \code{office_action} if \code{collapse} is \code{FALSE}, or the length of unique IDs. |
|
14 |
#' @examples |
|
15 |
#' \dontrun{ |
|
16 |
#' # retrieve the office actions associated with a particular application |
|
17 |
#' office_actions <- download_office_actions("patentApplicationNumber:13877637") |
|
18 |
#' |
|
19 |
#' # get the name of the examiner associated with the first action: |
|
20 |
#' extract_examiner_name(office_actions[[1]]) |
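#' |
#' # keep a row for every office action, rather than collapsing by examiner: |
#' extract_examiner_name(office_actions, collapse = FALSE) |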
|
21 |
#' } |
|
22 |
#' @export |
|
23 | ||
24 |
extract_examiner_name <- function(office_action, collapse = TRUE) { |
|
25 | 4x |
if (length(office_action) > 1 && "bodyText" %in% names(office_action[[1]])) { |
26 | 1x |
names <- do.call(rbind, lapply(office_action, extract_examiner_name)) |
27 | 1x |
if (collapse) { |
28 | 1x |
nas <- is.na(names[, 1]) |
29 | 1x |
if (any(nas)) names[nas, 1] <- paste0("NA", seq_len(sum(nas))) |
30 | 1x |
names <- do.call(rbind, lapply(split(as.data.frame(names), names[, 1]), function(e) { |
31 | 2x |
if (nrow(e) < 2) { |
32 | 2x |
e[1, ] |
33 |
} else { |
|
34 | ! |
intext <- e[, 2] |
35 | ! |
intext <- intext[!is.na(intext)] |
36 | ! |
signed <- e[, 3] |
37 | ! |
signed <- signed[!is.na(signed)] |
38 | ! |
data.frame( |
39 | ! |
id = e[1, "id"], |
40 | ! |
intext = if (length(intext)) intext[which.max(nchar(intext))] else NA, |
41 | ! |
signed = if (length(signed)) signed[which.max(nchar(signed))] else NA |
42 |
) |
|
43 |
} |
|
44 |
})) |
|
45 |
} |
|
46 | 1x |
return(names) |
47 |
} |
|
48 | 2x |
if (!is.list(office_action)) office_action <- list(bodyText = office_action) |
49 | ! |
if (is.list(office_action$bodyText)) office_action$bodyText <- unlist(office_action$bodyText, FALSE, FALSE) |
50 | 3x |
body <- sub("\\s+$", "", office_action$bodyText, perl = TRUE) |
51 | ! |
if (!length(body)) stop("office_action is not in the expected format", call. = FALSE) |
52 | 3x |
intext <- regmatches(body, regexec( |
53 | 3x |
"directed to (?:primary )?(?:examiner )?([A-Z][^/\\n\\t]+?) who(?: may be reached|se (?:telephone )?number)", |
54 | 3x |
body, |
55 | 3x |
perl = TRUE |
56 |
)) |
|
57 | 3x |
intext <- if (length(intext) && length(intext[[1]])) { |
58 | 2x |
intext <- gsub("(?:Primary\\s|Petitions\\s)?Examiner\\s|,$", "", intext[[1]][2], perl = TRUE) |
59 |
} else { |
|
60 | 1x |
NULL |
61 |
} |
|
62 | 3x |
l <- nchar(body) |
63 | 3x |
foot <- paste0(sub("\\s{10,}", " ", if (l > 1000) substring(body, l - 2000) else body, perl = TRUE), "XXEE") |
64 | 3x |
parts <- strsplit( |
65 | 3x |
gsub("\\s+[\\w.]+@uspto\\.gov|number='\\d+'|[A-Z][a-z]+ \\d+, \\d{4}", "", foot, perl = TRUE), |
66 | 3x |
paste0( |
67 | 3x |
"(?:[^\\w\\s]\\W*|[\\d/\\t\\n-]+)(?:", |
68 | 3x |
"Examiner|Primary\\s+Examiner|Petitions\\s+Examiner", |
69 | 3x |
")\\s?(?:[A-Z]{2}|Art|Unit|[\\d\\n,]+|XX)" |
70 |
), |
|
71 | 3x |
perl = TRUE |
72 | 3x |
)[[1]] |
73 | 3x |
signed <- if (length(parts) > 1) { |
74 | 1x |
parts <- strsplit(parts[1], "[\\n/]+", perl = TRUE)[[1]] |
75 | 1x |
sub("_x000d_.*|\\s+$", "", gsub("\\s+", " ", parts[length(parts)], perl = TRUE)) |
76 |
} else { |
|
77 | 2x |
NULL |
78 |
} |
|
79 | 3x |
c( |
80 | 3x |
id = if (is.null(office_action$examinerEmployeeNumber)) NA else trimws(office_action$examinerEmployeeNumber[[1]]), |
81 | 3x |
intext = if (length(intext) && intext != "") intext else NA, |
82 | 3x |
signed = if (length(signed) && signed != "") signed else NA |
83 |
) |
|
84 |
} |
1 |
#' Search for Patents or Patent Applications |
|
2 |
#' |
|
3 |
#' Searches the U.S. Pre-Grant Publications (US-PGPUB), Patents (USPAT), and/or |
|
4 |
#' Optical Character Recognition (USOCR) databases. See the \href{https://ppubs.uspto.gov/pubwebapp/static/pages/landing.html}{web app} |
|
5 |
#' for an interactive search. |
|
6 |
#' |
|
7 |
#' @param query A query string; for example \code{"noodle"}, \code{"photographic AND noodle"}, |
|
8 |
#' \code{"wet adj4 silicon"}, or \code{"G01W.CPCL."}. See the Patent Public Search |
|
9 |
#' \href{https://ppubs.uspto.gov/pubwebapp/static/pages/searchable-indexes.html}{training materials}. |
|
10 |
#' @param databases A character vector specifying which databases to search in; a selection from |
|
11 |
#' \code{c("US-PGPUB", "USPAT", "USOCR")}. |
|
12 |
#' @param outFile Name of a \code{.csv} file to save results to. If this file exists, it will |
|
13 |
#' be loaded instead of searching. |
|
14 |
#' @param start Result of the search to start from. |
|
15 |
#' @param limit Maximum number of results to return; defaults to all results. |
|
16 |
#' @param sort How to sort results; defaults to publication date (\code{"date_publ desc"}). |
|
17 |
#' @param english_only Logical; if \code{FALSE}, will return patent and/or applications of any language. |
|
18 |
#' @param spellCheck Logical; if \code{TRUE}, will spellcheck \code{query} terms. |
|
19 |
#' @param plurals Logical; if \code{TRUE}, will look for plural and singular forms of \code{query} terms. |
|
20 |
#' @param britishEquivalents Logical; if \code{TRUE}, will look for British and American forms of \code{query} terms. |
|
21 |
#' @param verbose Logical; if \code{FALSE}, does not print status messages. |
|
22 |
#' @return A \code{data.frame} of results, if any were found; otherwise \code{NULL}. |
|
23 |
#' @examples |
|
24 |
#' \dontrun{ |
|
25 |
#' # search for applications with a meteorology classification: |
|
26 |
#' # https://www.uspto.gov/web/patents/classification/cpc/html/cpc-G01W.html |
|
27 |
#' results <- uspto_search("G01W.CPCL.", "US-PGPUB") |
|
28 |
#' |
|
29 |
#' # search by date and keyword: |
|
30 |
#' results <- uspto_search("20020801.pd. AND motorcycle") |
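#' |
#' # proximity search (see the query argument) in the patents database only: |
#' results <- uspto_search("wet adj4 silicon", "USPAT") |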
|
31 |
#' } |
|
32 |
#' @export |
|
33 | ||
34 |
uspto_search <- function(query, databases = c("US-PGPUB", "USPAT", "USOCR"), outFile = NULL, start = 0, limit = FALSE, sort = "date_publ desc", |
|
35 |
english_only = TRUE, spellCheck = FALSE, plurals = TRUE, britishEquivalents = TRUE, verbose = FALSE) { |
|
36 | 2x |
if (!is.null(outFile)) { |
37 | 2x |
outFile <- paste0(sub("\\.csv.*$", "", outFile), ".csv") |
38 | 2x |
if (file.exists(outFile)) { |
39 | ! |
if (verbose) message("loading existing results") |
40 | 1x |
return(read.csv(outFile)) |
41 |
} else { |
|
42 | 1x |
dir.create(dirname(outFile), FALSE) |
43 |
} |
|
44 |
} |
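# map entered database names to canonical names by their final letter (B/T/R) |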
|
45 | 1x |
databases <- unname(c(b = "US-PGPUB", t = "USPAT", r = "USOCR")[unique(tolower(substring(databases, nchar(databases))))]) |
46 | ! |
if (!length(databases)) stop("no recognized databases entered", call. = FALSE) |
47 | 1x |
session_request <- POST( |
48 | 1x |
"https://ppubs.uspto.gov/dirsearch-public/users/me/session", |
49 | 1x |
add_headers("Content-Type" = "application/json"), |
50 | 1x |
body = "-1" |
51 |
) |
|
52 | 1x |
session <- content(session_request) |
53 | ! |
if (is.null(session$userSessionId)) stop("failed to start a session: ", session, call. = FALSE) |
54 | 1x |
query_object <- list( |
55 | 1x |
britishEquivalents = britishEquivalents, |
56 | 1x |
caseId = session$userCase$caseId, |
57 | 1x |
databaseFilters = lapply(databases, function(s) list(databaseName = s, countryCodes = list())), |
58 | 1x |
highlights = "0", |
59 | 1x |
hl_snippets = "2", |
60 | 1x |
ignorePersist = TRUE, |
61 | 1x |
op = "OR", |
62 | 1x |
plurals = plurals, |
63 | 1x |
q = query, |
64 | 1x |
qt = "brs", |
65 | 1x |
queryName = query, |
66 | 1x |
searchType = 1, |
67 | 1x |
spellCheck = spellCheck, |
68 | 1x |
userEnteredQuery = query, |
69 | 1x |
viewName = "tile" |
70 |
) |
|
71 | 1x |
count_request <- POST( |
72 | 1x |
"https://ppubs.uspto.gov/dirsearch-public/searches/counts", |
73 | 1x |
add_headers("Content-Type" = "application/json"), |
74 | 1x |
body = toJSON(query_object, auto_unbox = TRUE) |
91 |
) |
|
92 | 1x |
count <- content(count_request) |
93 | 1x |
limit <- if (is.numeric(limit)) limit else count$numResults |
94 | ! |
if (is.null(count$numResults)) stop("failed to execute query: ", count, call. = FALSE) |
95 | ! |
if (verbose) message("matches: ", count$numResults, "; retrieving ", start, " through ", limit) |
96 | 1x |
if (count$numResults != 0) { |
97 | 1x |
result_request <- POST( |
98 | 1x |
"https://ppubs.uspto.gov/dirsearch-public/searches/searchWithBeFamily", |
99 | 1x |
add_headers("Content-Type" = "application/json"), |
100 | 1x |
body = toJSON(list( |
101 | 1x |
start = start, |
102 | 1x |
pageCount = limit, |
103 | 1x |
sort = sort, |
104 | 1x |
docFamilyFiltering = "familyIdFiltering", |
105 | 1x |
searchType = 1, |
106 | 1x |
familyIdEnglishOnly = english_only, |
107 | 1x |
familyIdFirstPreferred = "USPAT", |
108 | 1x |
familyIdSecondPreferred = "US-PGPUB", |
109 | 1x |
familyIdThirdPreferred = "FPRS", |
110 | 1x |
showDocPerFamilyPref = "showEnglish", |
111 | 1x |
queryId = 0, |
112 | 1x |
tagDocSearch = FALSE, |
113 | 1x |
query = query_object |
114 | 1x |
), auto_unbox = TRUE) |
115 |
) |
|
116 | ! |
if (status_code(result_request) != 200) stop("failed to retrieve results: ", status_code(result_request), call. = FALSE) |
117 | 1x |
result <- content(result_request) |
118 | ! |
if (is.null(result$patents)) stop("failed to retrieve results: ", result, call. = FALSE) |
119 | 1x |
res <- do.call(rbind, lapply(result$patents, function(d) { |
120 | 17x |
if (is.null(d$collapsed)) d$collapsed <- TRUE |
121 | 17x |
if (is.null(d$lastChild)) d$lastChild <- TRUE |
122 | 17x |
if (is.null(d$familyCount)) d$familyCount <- 1 |
123 | 17x |
lapply(d, function(e) if (!length(e)) NA else if (is.list(e)) paste0(e, collapse = "; ") else e) |
124 |
})) |
|
125 | 1x |
res <- as.data.frame(lapply(as.data.frame(res), unlist)) |
126 | 1x |
if (!is.null(outFile)) { |
127 | ! |
if (verbose) message("saving results: ", outFile) |
128 | 1x |
write.csv(res, outFile, row.names = FALSE) |
129 |
} |
|
130 | 1x |
res |
131 |
} |
|
132 |
} |
1 |
#' Get Inventor Information |
|
2 |
#' |
|
3 |
#' Extract inventor information from patent/application documents or examination records. |
|
4 |
#' |
|
5 |
#' @param input A \code{data.frame} of documents (as returned from \code{\link{uspto_download}}) |
|
6 |
#' or a list of examination records (as returned from \code{\link{download_peds}}). |
|
7 |
#' @returns A \code{data.frame} with inventor information. |
|
8 |
#' @examples |
|
9 |
#' \dontrun{ |
|
10 |
#' # download an application |
|
11 |
#' application <- uspto_download("US-20040216465-A1") |
|
12 |
#' |
|
13 |
#' # get inventor information from it |
|
14 |
#' extract_inventors(application) |
|
15 |
#' |
|
16 |
#' # get the same from examination record |
|
17 |
#' extract_inventors(download_peds("applId:10612573")) |
|
18 |
#' } |
|
19 |
#' @export |
|
20 | ||
21 |
extract_inventors <- function(input) { |
|
22 | 2x |
if (is.data.frame(input)) { |
23 | 1x |
do.call(rbind, unname(lapply(split( |
24 | 1x |
input[, c("guid", "applicationNumber", "inventorsName", "inventorCountry", "inventorState", "inventorCity")], |
25 | 1x |
unlist(input$guid, FALSE, FALSE) |
26 | 1x |
), function(d) { |
27 | 1x |
res <- if (grepl(";", d$inventorState, fixed = TRUE)) { |
28 | 1x |
i <- lapply(d[, -c(1:2)], function(e) strsplit(e[[1]], "; ", fixed = TRUE)[[1]]) |
29 | 1x |
data.frame( |
30 | 1x |
as.list(unlist(d[, 1:2])), |
31 | 1x |
do.call(rbind, lapply(strsplit(i$inventorsName, ",?\\s+"), function(n) { |
32 | 2x |
data.frame( |
33 | 2x |
firstName = n[2], middleName = if (length(n) > 2) paste(n[-(1:2)], collapse = " ") else NA, lastName = n[1] |
34 |
) |
|
35 |
})), |
|
36 | 1x |
i[-1] |
37 |
) |
|
38 |
} else { |
|
39 | ! |
d |
40 |
} |
|
41 | 1x |
rownames(res) <- NULL |
42 | 1x |
res |
43 |
}))) |
|
44 |
} else { |
|
45 | 1x |
ex <- function(r) { |
46 | 1x |
if (length(r) == 1) r <- r[[1]] |
47 | 1x |
party <- r$patentCaseMetadata$partyBag$applicantBagOrInventorBagOrOwnerBag |
48 | 1x |
app <- list( |
49 | 1x |
guid = sub("-?(\\d+)-?", "-\\1-", r$patentCaseMetadata$patentPublicationIdentification$publicationNumber, perl = TRUE), |
50 | 1x |
applicationNumber = sub("(\\d{2})/?", "\\1/", r$patentCaseMetadata$applicationNumberText$value, perl = TRUE) |
51 |
) |
|
52 | ! |
if (!length(app$guid)) app$guid <- "" |
53 | 1x |
if (!is.null(party)) { |
54 | 1x |
i <- which(vapply(party, function(p) if (length(p)) names(p) else "", "") == "inventorOrDeceasedInventor") |
55 | 1x |
if (length(i)) { |
56 | 1x |
do.call(rbind, lapply(party[[i]]$inventorOrDeceasedInventor, function(g) { |
57 | 2x |
do.call(rbind, lapply(g$contactOrPublicationContact, function(p) { |
58 | 2x |
data.frame( |
59 | 2x |
app, |
60 | 2x |
p$name$personNameOrOrganizationNameOrEntityName[[1]]$personStructuredName, |
61 | 2x |
inventorCountry = p$countryCode, inventorState = p$geographicRegionName$value, inventorCity = p$cityName |
62 |
) |
|
63 |
})) |
|
64 |
})) |
|
65 |
} |
|
66 |
} |
|
67 |
} |
|
68 | ! |
if ("patentCaseMetaData" %in% names(input[[1]])) do.call(rbind, lapply(input, ex)) else ex(input) |
69 |
} |
|
70 |
} |
1 |
#' Get Patent Classification Information |
|
2 |
#' |
|
3 |
#' Retrieve U.S. patent class definitions and subclasses from |
|
4 |
#' \href{https://www.uspto.gov/web/patents/classification}{USPTO Classification Resources}. |
|
5 |
#' |
|
6 |
#' @param code A vector of USPC classification symbols (e.g., \code{D14}). |
|
7 |
#' @param dir Directory in which to save results. |
|
8 |
#' @returns A list with an entry for \code{class} and \code{description} for the entered code, and \code{subclasses} |
|
9 |
#' containing a \code{data.frame} with a \code{subclass} (the subclass code) and \code{description} column. |
|
10 |
#' @examples |
|
11 |
#' \dontrun{ |
|
12 |
#' classifications <- get_class_info(c(428, 429)) |
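#' |
#' # a single design class, cached to a named directory (the path is hypothetical): |
#' d14 <- get_class_info("D14", dir = "classifications") |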
|
13 |
#' } |
|
14 |
#' @export |
|
15 | ||
16 |
get_class_info <- function(code, dir = tempdir()) { |
|
17 | 4x |
if (length(code) > 1) { |
18 | 1x |
return(Filter(length, lapply(structure(code, names = code), function(cc) { |
19 | 2x |
tryCatch(get_class_info(cc, dir), error = function(e) { |
20 | ! |
warning("failed to retrieve code ", cc, call. = FALSE) |
21 | ! |
NULL |
22 |
}) |
|
23 |
}))) |
|
24 |
} |
|
25 | 3x |
code <- toupper(code) |
26 | 3x |
dir.create(dir, FALSE, TRUE) |
27 | 3x |
dir <- normalizePath(dir, "/") |
28 | 3x |
output <- paste0(dir, "/uspc", code, ".json") |
29 | 3x |
if (file.exists(output)) { |
30 | 1x |
read_json(output, simplifyVector = TRUE) |
31 |
} else { |
|
32 | 2x |
req <- GET(paste0("https://www.uspto.gov/web/patents/classification/uspc", code, "/sched", code, ".htm")) |
33 | 2x |
if (req$status_code == 200) { |
34 | 2x |
res <- strsplit(rawToChar(req$content), "\n+")[[1]] |
35 | 2x |
res <- res[seq( |
36 | 2x |
grep("<table summary", res, fixed = TRUE)[[1]], |
37 | 2x |
grep("Start global footer", res, fixed = TRUE) |
38 |
)] |
|
39 | 2x |
title <- regmatches(res[1], gregexec(">([^<]+)</", res[1]))[[1]][2, 2] |
40 | 2x |
rows <- grep("<tr", res, fixed = TRUE) |
41 | 2x |
subclasses <- as.data.frame(do.call(rbind, lapply(strsplit(res[rows[-(1:2)]], "</td>", fixed = TRUE), function(r) { |
42 | 606x |
if (length(r) > 5) { |
43 | 592x |
sm <- regexec("', '([0-9.]+)'", r[2]) |
44 | 592x |
dm <- regexec("> *([A-Za-z][^>]+)<", r[5]) |
45 | 592x |
if (length(sm[[1]]) == 2 && length(dm[[1]]) == 2) { |
46 | 537x |
c( |
47 | 537x |
subclass = regmatches(r[2], sm)[[1]][2], |
48 | 537x |
description = sub(" *$", "", regmatches(r[5], dm)[[1]][2]) |
49 |
) |
|
50 |
} |
|
51 |
} |
|
52 |
}))) |
|
53 | 2x |
colnames(subclasses) <- c("subclass", "description") |
54 | 2x |
subclasses$subclass <- as.numeric(subclasses$subclass) |
55 | 2x |
res <- list(class = code, description = title, subclasses = subclasses) |
56 | 2x |
write_json(res, output, auto_unbox = TRUE) |
57 | 2x |
res |
58 |
} else { |
|
59 | ! |
stop("failed to retrieve definition of code ", code, call. = FALSE) |
60 |
} |
|
61 |
} |
|
62 |
} |
1 |
#' Download Patents or Patent Applications |
|
2 |
#' |
|
3 |
#' Downloads from the U.S. Pre-Grant Publications (US-PGPUB), Patents (USPAT), and/or |
|
4 |
#' Optical Character Recognition (USOCR) databases. See the \href{https://ppubs.uspto.gov/pubwebapp/static/pages/landing.html}{web app} |
|
5 |
#' to search for and view documents from the same source. |
|
6 |
#' |
|
7 |
#' @param guid Vector of document numbers (e.g., \code{US-20220230294-A1}). Can also be a \code{data.frame} of results, |
|
8 |
#' as returned from \code{\link{uspto_search}}; must have a \code{guid} column containing document numbers, |
|
9 |
#' and a \code{type} column with the source database abbreviation (e.g., \code{US-PGPUB}). |
|
10 |
#' @param outDir Path to a directory in which to save each individual document. If not specified, these are |
|
11 |
#' saved to a temporary directory. |
|
12 |
#' @param type A vector the same length as \code{guid} (repeated as necessary), indicating the source database of |
|
13 |
#' the \code{guid}. Defaults to \code{US-PGPUB} for every ID. |
|
14 |
#' @param cores Number of CPU cores to split requests across. |
|
15 |
#' @param compress Logical; if \code{FALSE}, will not xz-compress each file. |
|
16 |
#' @param load Logical; if \code{FALSE}, will not return content, or load existing files. |
|
17 |
#' @param verbose Logical; if \code{FALSE}, does not print status messages. |
|
18 |
#' @return A \code{data.frame} with a row for each document. |
|
19 |
#' @seealso You can more efficiently download granted patents using \code{\link{download_patents}}. |
|
20 |
#' @examples |
|
21 |
#' \dontrun{ |
|
22 |
#' # start with a search |
|
23 |
#' results <- uspto_search("G01W.CPCL.", "US-PGPUB") |
|
24 |
#' |
|
25 |
#' # then download those results |
|
26 |
#' applications <- uspto_download(results) |
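#' |
#' # or download specific documents by ID: |
#' application <- uspto_download("US-20220230294-A1") |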
|
27 |
#' } |
|
28 |
#' @export |
|
29 | ||
30 |
uspto_download <- function(guid, outDir = tempdir(), type = "US-PGPUB", cores = detectCores() - 2, compress = TRUE, load = TRUE, verbose = FALSE) { |
|
31 | 1x |
if (missing(type) && is.list(guid) && !is.null(guid$type)) type <- guid$type |
32 | 1x |
if (!is.character(guid)) guid <- guid$guid |
33 | ! |
if (!length(guid)) stop("unrecognized guid input; should be a data.frame with a guid column, or a character vector", call. = FALSE) |
34 | 3x |
invalid <- !grepl("[A-Z]{2}-\\d+-[A-Z]\\d+", guid, perl = TRUE) |
35 | 3x |
if (any(invalid)) { |
36 | ! |
if (any(!grepl("-", guid[invalid], fixed = TRUE))) guid[invalid] <- sub("(\\d+)", "-\\1-", guid[invalid], perl = TRUE) |
37 | ! |
invalid <- !grepl("[A-Z]{2}-\\d+-[A-Z]\\d+", guid, perl = TRUE) |
38 | ! |
if (any(invalid)) stop("invalid ID format: ", if (all(invalid)) "all" else guid[invalid]) |
39 |
} |
|
40 | 3x |
type <- rep_len(type, length(guid)) |
41 | 3x |
outDir <- paste0(normalizePath(outDir, "/", FALSE), "/") |
42 | 3x |
dir.create(outDir, FALSE, TRUE) |
43 | 3x |
urls <- paste0("https://ppubs.uspto.gov/dirsearch-public/patents/", guid, "/highlight?queryId=0&source=", type) |
44 | 1x |
if (verbose) message("retrieving ", length(urls), " documents; saving in ", outDir) |
45 | 3x |
retrieve_document <- function(url, out = outDir, comp = compress) { |
46 | 11x |
file <- paste0(out, gsub("^.*patents/|/highlight.*$", "", url), ".json", if (comp) ".xz") |
47 | 11x |
doc <- NULL |
48 | 11x |
if (file.exists(file)) { |
49 | 5x |
if (load) { |
50 | ! |
if (verbose) message("loading existing document: ", file) |
51 | 5x |
doc <- as.data.frame(read_json(file)[[1]]) |
52 | 5x |
doc$score <- as.numeric(doc$score) |
53 |
} else { |
|
54 | ! |
message("document exists; not loading") |
55 |
} |
|
56 |
} else { |
|
57 | 5x |
if (verbose) message("retrieving document: ", url) |
58 | 6x |
req <- GET(url) |
59 | 6x |
if (req$status_code == 200) { |
60 | 6x |
res <- content(req) |
61 | 6x |
doc <- as.data.frame(lapply(res, function(e) if (is.null(e)) "N/A" else if (is.list(e)) paste0(e, collapse = "; ") else e)) |
62 | 6x |
html <- grep("Html$", names(doc), TRUE, value = TRUE) |
63 | 6x |
for (e in html) { |
64 | 72x |
doc[[e]] <- gsub('<figref idref="([^"]+)">FIG\\. (\\d+)</figref>', "FIGREF FIGTYPE\\1 FIGID\\2", |
65 | 72x |
gsub("\\[(\\d+)\\]", "SECID SEC\\1", |
66 | 72x |
doc[[e]], |
67 | 72x |
perl = TRUE |
68 |
), |
|
69 | 72x |
perl = TRUE |
70 |
) |
|
71 |
} |
|
72 | 6x |
if (comp) { |
73 | 6x |
file <- xzfile(file) |
74 | 6x |
on.exit(close(file)) |
75 |
} |
|
76 | 6x |
write_json(doc, file, auto_unbox = TRUE) |
77 |
} |
|
78 |
} |
|
79 | 11x |
if (load) doc |
80 |
} |
|
81 | 3x |
res <- if (cores > 1) { |
82 | 3x |
cl <- parallel::makeCluster(max(1, min(length(urls), cores))) |
83 | 3x |
on.exit(parallel::stopCluster(cl), TRUE) |
84 | 3x |
parallel::parLapply(cl, urls, retrieve_document) |
85 |
} else { |
|
86 | ! |
lapply(urls, retrieve_document) |
87 |
} |
|
88 | 3x |
if (load) invisible(do.call(rbind, res)) |
89 |
} |
1 |
#' Retrieve Full Patent Text |
|
2 |
#' |
|
3 |
#' Search for and return a set of full patent texts. |
|
4 |
#' |
|
5 |
#' @param ids A vector of application numbers, patent numbers, or document IDs (e.g., \code{"US9748552"}). |
|
6 |
#' @param outFile Path to a \code{.csv} file to write results to. |
|
7 |
#' @param ids_type Specifies what \code{ids} are; one of \code{application} (default), \code{patent}, |
|
8 |
#' \code{publication} (for publication document IDs), or \code{grant} (for grant document IDs). |
|
9 |
#' @param query Search query; either a term/phrase (\code{searchText}) or set of Boolean criteria (\code{criteriaSearchText}). |
|
10 |
#' @param inventionTitle Invention title, as it appears on the first page of the specification. |
|
11 |
#' @param inventionSubjectMatterCategory Patent category (e.g., \code{DESIGN}). |
|
12 |
#' @param assigneeEntityName Name of the patent assignee. |
|
13 |
#' @param fromDate,toDate,filingDateFromDate,filingDateToDate Publication/grant and/or filing date ranges |
|
14 |
#' in \code{YYYY-MM-DD} format (e.g., \code{2019-01-20}). |
|
15 |
#' @param inventorNameText,claimText,abstractText,descriptionText Text to search for in specific portions; |
|
16 |
#' \code{query} will search in all portions if it is a term or phrase. |
|
17 |
#' @param sortField Field by which to sort results; defaults to date. |
|
18 |
#' @param ascending Logical; if \code{TRUE}, will sort in ascending order. |
|
19 |
#' @param grant Logical; if \code{TRUE}, searches for grants rather than publications. |
|
20 |
#' @param limit Maximum number of results to return, in steps of 100. |
|
21 |
#' @param series Series code to prepend to \code{ids} if they are application or patent numbers and not already included. |
|
22 |
#' @param cores Number of CPU cores to use when retrieving multiple pages of results. |
|
23 |
#' @param verbose Logical; if \code{TRUE}, will print status messages. |
|
24 |
#' @return A \code{data.frame} of results. |
|
25 |
#' @examples |
|
26 |
#' \dontrun{ |
|
27 |
#' # retrieve a specific set of patents: |
|
28 |
#' patents <- download_patents(c(16978600, 17087826, 16041592, 16727940, 17070594)) |
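#' |
#' # a sketch of a grant search by title keyword (parameters as documented above): |
#' grants <- download_patents(inventionTitle = "noodle", grant = TRUE, limit = 100) |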
|
29 |
#' } |
|
30 |
#' @export |
|
31 | ||
32 |
download_patents <- function(ids = NULL, outFile = NULL, ids_type = "application", query = NULL,
                             inventionTitle = NULL, inventionSubjectMatterCategory = NULL, assigneeEntityName = NULL,
                             fromDate = NULL, toDate = NULL, filingDateFromDate = NULL,
                             filingDateToDate = NULL, inventorNameText = NULL, claimText = NULL,
                             abstractText = NULL, descriptionText = NULL, sortField = NULL, ascending = FALSE,
                             grant = FALSE, limit = FALSE, series = "US", cores = detectCores() - 2, verbose = FALSE) {
  if (!is.null(outFile) && file.exists(outFile)) {
    if (verbose) message("reading in existing file: ", outFile)
    res <- read.csv(outFile)
  } else {
    args <- list(
      inventionSubjectMatterCategory = inventionSubjectMatterCategory,
      filingDateFromDate = filingDateFromDate,
      filingDateToDate = filingDateToDate,
      inventionTitle = inventionTitle,
      assigneeEntityName = assigneeEntityName,
      inventorNameText = inventorNameText,
      claimText = claimText,
      abstractText = abstractText,
      descriptionText = descriptionText,
      sortField = sortField,
      sortOrder = if (ascending) "asc" else "desc"
    )
    if (!is.null(ids)) {
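      # ids_type is matched by its first two letters to the corresponding API field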
      id_field <- c(
        ap = "patentApplicationNumber", pa = "patentNumber",
        do = paste0(if (grant) "grant" else "publication", "DocumentIdentifier"),
        pu = "publicationDocumentIdentifier", gr = "grantDocumentIdentifier"
      )[tolower(substring(ids_type, 1, 2))]
      if (!grant && grepl("grant", id_field, fixed = TRUE)) grant <- TRUE
      if (is.na(id_field)) stop("ids_type not recognized", call. = FALSE)
      args[id_field] <- format_numbers(ids, if (grepl("Document", id_field, fixed = TRUE)) "" else series)
    }
    if (!is.null(query)) {
      isCriteria <- grepl(":", query, fixed = TRUE)
      args[if (isCriteria) "criteriaSearchText" else "searchText"] <- query
      if (isCriteria) args$largTextSearchFlag <- "Y"
    }
    type <- if (grant) "grant" else "publication"
    if (!is.null(toDate)) args[paste0(type, "ToDate")] <- toDate
    if (!is.null(fromDate)) args[paste0(type, "FromDate")] <- fromDate
    if (is.numeric(limit) && limit < 100) args$rows <- limit
    args <- Filter(length, args)
    req <- GET(paste0(
      "https://developer.uspto.gov/ibd-api/v1/application/", if (grant) "grants" else "publications",
      "?", paste(paste0(names(args), "=", unlist(args, use.names = FALSE)), collapse = "&")
    ))
    res <- tryCatch(content(req), error = function(e) http_status(req))
    if (length(res) && !is.null(res$error)) stop("request failed: ", res$status, ": ", res$error, call. = FALSE)
    retrieved <- length(res$results)
    if (!retrieved) stop("no results were found", call. = FALSE)
    n <- if (is.numeric(limit)) min(res$recordTotalQuantity, limit) else res$recordTotalQuantity
    if (n > retrieved) {
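      # the API serves results in pages of 100; request the remaining pages by start offset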
      pages <- ceiling(n / 100) - 1
      base_url <- paste0(
        "https://developer.uspto.gov/ibd-api/v1/application/", if (grant) "grants" else "publications",
        "?", paste(paste0(names(args), "=", unlist(args, use.names = FALSE)), collapse = "&"), "&start="
      )
      urls <- vapply(seq_len(pages), function(i) paste0(base_url, i * 100), "")
      get_page <- function(url) {
        res <- tryCatch(content(GET(url))$results, error = function(e) NULL)
        if (is.null(res)) warning("failed to retrieve ", url)
        res
      }
      res2 <- if (cores > 1) {
        cl <- parallel::makeCluster(max(1, min(pages, cores)))
        on.exit(parallel::stopCluster(cl), TRUE)
        parallel::parLapply(cl, urls, get_page)
      } else {
        lapply(urls, get_page)
      }
      if (length(res2)) {
        res$results <- c(res$results, unlist(res2, recursive = FALSE))
      } else if (verbose) warning("failed to retrieve more than the initial results", call. = FALSE)
    }
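    # flatten the result list into a data.frame, collapsing list entries to space-separated strings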
    res <- as.data.frame(do.call(cbind, lapply(as.data.frame(do.call(rbind, lapply(res$results, function(d) {
      lapply(d, function(e) if (is.null(e)) NA else if (is.list(e)) paste(e, collapse = " ") else e)
    }))), unlist)))
    if (verbose) message("found ", nrow(res))
    if (!is.null(outFile)) {
      if (verbose) message("writing results to ", outFile)
      dir.create(dirname(outFile), FALSE, TRUE)
      write.csv(res, outFile, row.names = FALSE)
    }
  }
  res
}

format_numbers <- function(numbers, series) {
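  # strips "/" and "," separators and prepends the series code when missing;
  # e.g., "16/978,600" becomes "US16978600", while "US9748552" is unchanged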
  paste(sub("^[A-Z]{2}([A-Z]{2})", "\\1", paste0(series, gsub("[/,]+", "", numbers)), FALSE, perl = TRUE), collapse = ",")
}

#' Make Document-Class Matrix
#'
#' Create a document-class matrix from a PatentsView bulk table.
#'
#' @param table A matrix-like object with columns specified by \code{class_id}, \code{doc_id}, and \code{rank}.
#' Alternatively, the path to a tab-separated file containing such a matrix, an \code{Arrow} dataset, or the name
#' of a table, to be passed to \code{\link{download_patentsview_bulk}}.
#' @param outFile Path to an \code{.rds} file to save the results to.
#' @param class_id,doc_id,rank Column names of the class, document ID, and class rank to be pulled from \code{table}.
#' @param type Table name, used to set a default \code{class_id}.
#' @param ... Additional arguments to be passed to \code{\link{download_patentsview_bulk}},
#' if \code{table} is a table name.
#' @param sparse Logical; if \code{FALSE}, returns a regular, dense matrix.
#' @param overwrite Logical; if \code{TRUE}, overwrites an existing \code{outFile} rather than loading it.
#' @return A sparse matrix (or regular matrix if \code{sparse} is \code{FALSE}) with documents
#' in rows, classes in columns, and the class rank (sequence + 1) as values.
#' @examples
#' table <- data.frame(
#'   patent_id = c("a", "a", "b"),
#'   class = c(1, 3, 2),
#'   sequence = c(1, 0, 0)
#' )
#' patentsview_class_matrix(table, class_id = "class")
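#' # a 2 x 3 sparse matrix, with sequence + 1 as values:
#' #   1 2 3
#' # a 2 . 1
#' # b . 1 .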
#'
#' \dontrun{
#'
#' # get a matrix of WIPO class assignments
#' wipo_fields <- patentsview_class_matrix("wipo")
#'
#' # get a subset without creating the full matrix
#' wipo <- download_patentsview_bulk("wipo", make_db = TRUE)
#' wipo_fields_sub <- patentsview_class_matrix(dplyr::compute(dplyr::filter(
#'   wipo, patent_id %in% c("10000002", "10000015", "10000017")
#' )))
#' }
#' @export
patentsview_class_matrix <- function(table, outFile = NULL, class_id = NULL, doc_id = "patent_id", rank = "sequence",
                                     type = NULL, ..., sparse = TRUE, overwrite = FALSE) {
  m <- NULL
  if (!is.null(outFile)) {
    outFile <- sub("\\.[A-Za-z]{3}.*", ".rds", outFile)
    if (!overwrite && file.exists(outFile)) {
      m <- readRDS(outFile)
    }
  }
  if (is.null(m)) {
    if (is.character(table) && length(table) == 1) {
      type <- if (is.null(type)) basename(table) else type
      table <- if (file.exists(table)) {
        arrow::read_delim_arrow(table, delim = "\t", escape_backslash = TRUE)
      } else {
        download_patentsview_bulk(table, ..., make_db = TRUE, return_table = FALSE)
      }
    }
    if (!is.null(type)) {
      type <- substr(type, 1, 3)
      if (is.null(class_id)) {
        class_id <- switch(type,
          wip = "field_id",
          cpc = "group_id",
          ipc = "section",
          nbe = "subcategory_id",
          usp = "mainclass_id"
        )
      }
    } else if (is.null(class_id)) {
      class_id <- c("field_id", "group_id", "section", "subcategory_id", "mainclass_id")
      class_id <- class_id[class_id %in% names(table)]
      if (length(class_id) > 1) class_id <- class_id[[1]]
    }
    if (!length(class_id)) stop("specify a class_id or type from which it can be inferred", call. = FALSE)
    cols <- c(class_id, doc_id)
    if (!all(cols %in% names(table))) {
      stop("unrecognized column reference: ", paste(cols[!cols %in% names(table)], collapse = ", "))
    }
    if (!is.data.frame(table) && (!is.null(table$NewScan) || !is.null(table$GetColumnByName))) {
      if (!is.null(table$NewScan)) table <- table$NewScan()$Filter(arrow::Expression$scalar(TRUE))$Finish()$ToTable()
      classes <- sort(unique(table$GetColumnByName(class_id)))$as_vector()
      docs <- sort(unique(table$GetColumnByName(doc_id)))$as_vector()
      table <- as.data.frame(table)
    } else {
      if (inherits(table, "arrow_dplyr_query")) {
        stop(
          "enter a completed query (e.g., using dplyr::compute), or a data.frame",
          call. = FALSE
        )
      }
      classes <- sort(unique(table[[class_id]]))
      docs <- sort(unique(table[[doc_id]]))
    }
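    # store rank + 1 so that first-ranked classes (sequence 0) are kept as nonzero entries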
    m <- Matrix::sparseMatrix(
      as.integer(factor(table[[doc_id]], docs)), as.integer(factor(table[[class_id]], classes)),
      x = if (rank %in% names(table)) table[[rank]] + 1 else 1,
      dims = c(length(docs), length(classes)), dimnames = list(docs, classes)
    )
    if (!is.null(outFile)) saveRDS(m, outFile, compress = "xz")
  }
  if (sparse) m else as.matrix(m)
}

#' Download Office Actions
#'
#' Download a set of Office actions based on a query to the
#' \href{https://developer.uspto.gov/api-catalog/uspto-office-action-text-retrieval-api}{Office Action Text Retrieval API}.
#' @param criteria Search criteria, in the form of \code{"field:value"} (e.g., \code{"examinerEmployeeNumber:80488"}).
#' See available \href{https://developer.uspto.gov/ds-api/oa_actions/v1/fields}{fields}.
#' @param outFile Path to the resulting JSON file; defaults to a file named for the query and date
#' in a temporary directory.
#' @param start Initial record; useful if making requests in chunks.
#' @param limit Maximum number of office actions to return; under 1,000, or in steps of 1,000; defaults to all found.
#' @param overwrite Logical; if \code{TRUE}, will overwrite an existing file with the same \code{outFile}.
#' @param endpoint API endpoint.
#' @param compress Logical; if \code{FALSE}, will not write a compressed file.
#' @param cores Number of CPU cores to split calls across if necessary.
#' @param verbose Logical; if \code{FALSE}, will not print status messages.
#' @return A list of office actions.
#' @examples
#' \dontrun{
#' # retrieve the office actions associated with a particular application
#' office_actions <- download_office_actions("patentApplicationNumber:13877637")
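#'
#' # illustrative: up to 100 of a given examiner's office actions
#' examiner_actions <- download_office_actions("examinerEmployeeNumber:80488", limit = 100)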
#' }
#' @export
download_office_actions <- function(criteria = "*:*", outFile = NULL, start = 0, limit = FALSE, overwrite = FALSE,
                                    endpoint = "https://developer.uspto.gov/ds-api/oa_actions/v1/records",
                                    compress = TRUE, cores = detectCores() - 2, verbose = TRUE) {
  final <- normalizePath(paste0(if (is.null(outFile)) {
    paste0(tempdir(), "/", if (!missing(criteria)) paste0(sub(":", "_", criteria, fixed = TRUE), "_"), Sys.Date())
  } else {
    sub("\\.json.*$", "", outFile)
  }, ".json", if (compress) ".xz"), "/", FALSE)
  if (!overwrite && file.exists(final)) {
    if (verbose) message("Reading in existing results")
    return(invisible(read_json(final)))
  }
  query <- httr::POST(
    endpoint, httr::add_headers("Content-Type" = "application/x-www-form-urlencoded", Accept = "application/json"),
    body = paste0("criteria=", criteria, "&start=", start, "&rows=", if (is.numeric(limit)) min(1000, limit) else 1000)
  )
  if (200 != httr::status_code(query)) {
    error <- httr::http_error(query)
    stop("query failed:\n", if (is.logical(error)) httr::content(query) else error, call. = FALSE)
  }
  res <- httr::content(query)$response
  limit <- if (is.numeric(limit)) min(limit, res$numFound) else res$numFound
  if (verbose) {
    message(
      "Found ", res$numFound, " office actions for ",
      if (nchar(criteria) > 104) paste(substring(criteria, 1, 100), "...") else criteria
    )
  }
  if (length(res$docs) < limit) {
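    # each request returns at most 1,000 records; retrieve the rest in pages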
    retrieved <- length(res$docs)
    pages <- ceiling((limit - retrieved) / 1000)
    get_page <- function(i) {
      query <- httr::POST(
        endpoint, httr::add_headers("Content-Type" = "application/x-www-form-urlencoded", Accept = "application/json"),
        body = paste0("criteria=", criteria, "&start=", start + retrieved + (i - 1) * 1000, "&rows=1000")
      )
      if (200 == httr::status_code(query)) {
        httr::content(query)$response$docs
      }
    }
    res2 <- if (cores > 1) {
      cl <- parallel::makeCluster(max(1, min(pages, cores)))
      parallel::clusterExport(cl, "criteria", environment())
      on.exit(parallel::stopCluster(cl), TRUE)
      parallel::parLapply(cl, seq_len(pages), get_page)
    } else {
      lapply(seq_len(pages), get_page)
    }
    res$docs <- c(res$docs, unlist(res2, recursive = FALSE))
  }
  dir.create(dirname(final), FALSE, TRUE)
  if (compress) {
    con <- xzfile(final)
    on.exit(close(con), add = TRUE)
  }
  write_json(res$docs, if (compress) con else final, auto_unbox = TRUE)
  invisible(res$docs)
}