#' Search for Companies (Legacy)
#'
#' Search for companies based on Standard Industrial Classification.
#' @param SIC Standard Industrial Classification code (e.g., \code{2080}).
#' @param start Number of results to skip.
#' @param limit Maximum number of results to return, in steps of 100.
#' @param user User agent string (e.g., \code{"Company Name AdminContact@example.com"});
#' defaults to the \code{SEC_USER} environment variable, which can be set with
#' \code{\link{Sys.setenv}} (e.g., \code{Sys.setenv(SEC_USER = "...")}).
#' @param verbose Logical; if \code{FALSE}, will not show status updates.
#' @seealso For more complete results, see \code{\link{sec_search}}.
#' @examples
#' \dontrun{
#'
#' # get a list of beverage companies
#' sec_search_companies(2080)
#' }
#' @returns A \code{data.frame} of results, with columns for \code{CIK} (Central Index Key),
#' \code{Company}, and \code{Location}. If no results table is found, the
#' \code{data.frame} has no rows.
#' @export
sec_search_companies <- function(SIC, start = 0, limit = Inf, user = Sys.getenv("SEC_USER"), verbose = TRUE) {
  base_url <- "https://www.sec.gov/cgi-bin/browse-edgar?SIC="
  page_size <- 100
  user_agent <- httr::user_agent(user)

  # Extract the first HTML table from a response and keep the text pieces at
  # positions 3, 6, and 9 of each row (after splitting on tags) as CIK,
  # Company, and Location. Returns NULL when there is no table or no data rows.
  extract_table <- function(req) {
    d <- rawToChar(req$content)
    tab <- regmatches(d, regexec("<table.*</table>", d))[[1]]
    if (!length(tab)) {
      return(NULL)
    }
    # the first two pieces are the text before the first row and the header row
    rows <- strsplit(tab, "<tr[^>]*>")[[1]][-(1:2)]
    if (!length(rows)) {
      return(NULL)
    }
    p <- as.data.frame(do.call(rbind, lapply(strsplit(rows, "\\s*<[^>]+>"), "[", c(3, 6, 9))))
    colnames(p) <- c("CIK", "Company", "Location")
    p
  }

  # Request one page of results; offsets are formatted without scientific
  # notation so large values (e.g., 1e5) are still valid in the URL.
  fetch_page <- function(offset, count) {
    httr::GET(
      paste0(
        base_url, SIC,
        "&start=", format(offset, scientific = FALSE, trim = TRUE),
        "&count=", format(count, scientific = FALSE, trim = TRUE)
      ),
      user_agent
    )
  }

  req <- fetch_page(start, min(page_size, limit))
  if (req$status_code != 200) {
    stop("request failed: ", rawToChar(req$content), call. = FALSE)
  }
  res <- extract_table(req)
  if (is.null(res)) {
    # no results: return an empty frame with the documented columns
    return(data.frame(CIK = character(), Company = character(), Location = character()))
  }

  # only page further if the first page was full
  if (nrow(res) >= page_size) {
    while (nrow(res) < limit) {
      # continue from where the previous pages ended, respecting `start`
      req <- fetch_page(start + nrow(res), page_size)
      if (req$status_code != 200) {
        if (verbose) cli_alert_warning("page {nrow(res) / 100 + 1} failed: {rawToChar(req$content)}")
        break
      }
      page <- extract_table(req)
      if (is.null(page)) break
      res <- rbind(res, page)
    }
  }
  res
}
#' Download Company Information
#'
#' Retrieve information about companies.
#'
#' @param ciks A vector of Central Index Keys, or a \code{data.frame} with a \code{ciks}
#' column.
#' @param type Type of information to collect, between \code{submissions}, which includes things
#' like Standard Industrial Classification, ticker, and previous names, as well as information
#' about recent filings, and \code{facts}, including financial data (XBRL company facts).
#' Anything starting with \code{"s"} is treated as \code{submissions}; anything else as \code{facts}.
#' @param out Path to a directory in which to save the info files. Each file will be saved
#' within a subdirectory: \code{{out}/{cik}/{type}.json}.
#' @param load Logical; if \code{TRUE}, will read in the downloaded info file.
#' @param overwrite Logical; if \code{TRUE}, download the info file even if it already exists.
#' @param user User agent string (e.g., \code{"Company Name AdminContact@example.com"});
#' defaults to the \code{SEC_USER} environment variable, which can be set with
#' \code{\link{Sys.setenv}} (e.g., \code{Sys.setenv(SEC_USER = "...")}).
#' @param verbose Logical; if \code{FALSE}, will not show status updates. If not specified
#' and only one CIK is requested, status updates are not shown.
#' @seealso Search for filings with \code{\link{sec_search}}.
#' @examples
#' \dontrun{
#'
#' # retrieve information about Pepsico
#' pepsi <- sec_companies("0000077476")
#' pepsi$`0000077476`[1:10]
#'
#' # retrieve financial information about Pepsico
#' pepsi_facts <- sec_companies("0000077476", "facts")
#' pepsi_facts$`0000077476`$facts$`us-gaap`$Revenues
#' }
#' @returns An invisible list with named entries for each successful CIK if \code{load} is \code{TRUE},
#' or a vector of HTTP status codes otherwise.
#' @export
sec_companies <- function(ciks, type = "submissions", out = tempdir(), load = TRUE, overwrite = FALSE,
                          user = Sys.getenv("SEC_USER"), verbose = TRUE) {
  if (is.data.frame(ciks)) ciks <- ciks$ciks
  ciks <- unique(ciks)
  # Zero-pad to 10 characters. Numeric input needs an explicit integer format;
  # otherwise values with 7 or more digits are written in scientific notation.
  ciks <- if (is.numeric(ciks)) {
    formatC(ciks, width = 10, flag = "0", format = "d")
  } else {
    formatC(ciks, width = 10, flag = "0")
  }
  dir.create(out, FALSE, TRUE)
  out <- paste0(normalizePath(out, "/"), "/")
  type <- if (grepl("^s", tolower(type))) "submissions" else "facts"
  base_url <- paste0(
    "https://data.sec.gov/",
    if (type == "submissions") "submissions" else "api/xbrl/companyfacts",
    "/CIK"
  )
  user_agent <- httr::user_agent(user)
  n <- length(ciks)
  # status codes default to 200 so files that are already cached count as successful
  reqs <- structure(rep(200, n), names = ciks)
  res <- list()
  # stay quiet for a single CIK unless the caller explicitly asked for updates
  if (missing(verbose) && n == 1) verbose <- FALSE
  if (verbose) {
    cli_progress_step(
      "0 / {n} downloaded",
      msg_done = "finished; {sum(reqs == 200)} of {n} successful", spinner = TRUE
    )
  }
  for (i in seq_len(n)) {
    dir <- paste0(out, ciks[[i]], "/")
    dir.create(dir, FALSE, TRUE)
    file <- paste0(dir, type, ".json")
    if (overwrite || !file.exists(file)) {
      # write_disk refuses to replace an existing file unless told to, so
      # overwrite must be enabled here for `overwrite = TRUE` to work
      req <- httr::GET(
        paste0(base_url, ciks[[i]], ".json"),
        httr::write_disk(file, overwrite = TRUE),
        user_agent
      )
      # a failed request leaves the error body on disk; remove it
      if (req$status_code != 200) unlink(file)
      # pad each request to about a tenth of a second to throttle the request rate
      Sys.sleep(max(.001, .1 - req$times[["total"]]))
      reqs[i] <- req$status_code
    }
    if (load && file.exists(file)) {
      res[[ciks[[i]]]] <- jsonlite::read_json(file, simplifyVector = TRUE)
    }
    if (verbose) cli_progress_update()
  }
  invisible(if (load) res else reqs)
}
#' Search for Filings
#'
#' Search for forms by keywords, Central Index Keys, and/or Standard Industrial Classification
#' codes, filed after 2001.
#'
#' @param query A character with words or phrases to search for within the form
#' (e.g., \code{grape "fruit juice"}).
#' @param entity A character with a name, ticker, or Central Index Key.
#' @param CIKs A vector of Central Index Keys (e.g., \code{c(0000317540, 0000077476)}).
#' @param SICs A vector of Standard Industrial Classification codes (e.g., \code{2080}).
#' @param forms A vector of forms to include (e.g., \code{"10-K"}).
#' @param date_to Latest date to include results from (e.g., \code{"2019-01-01"}); defaults to current.
#' @param date_from Earliest date to include results from; defaults to the earliest available:
#' \code{"2001-01-01"}.
#' @param outFile Path of a file to write results to.
#' @param overwrite Logical; if \code{FALSE}, will read in an existing results file rather than
#' executing the search.
#' @param start Number of results to skip.
#' @param limit Maximum number of results to return, in steps of 100.
#' @param verbose Logical; if \code{FALSE}, will not display status messages.
#' @seealso To retrieve a simpler list of companies, see \code{\link{sec_search_companies}}.
#' @examples
#' \dontrun{
#'
#' # search for 10-K filings by beverage companies
#' results <- sec_search(SICs = 2080, forms = "10-K", limit = 100)
#' }
#' @returns A \code{data.frame} of results.
#' @export
sec_search <- function(query = NULL, entity = NULL, CIKs = NULL, SICs = NULL, forms = NULL, date_to = NULL,
                       date_from = "2001-01-01", outFile = NULL, overwrite = TRUE, start = 0, limit = Inf,
                       verbose = TRUE) {
  # reuse previously saved results when asked not to overwrite them
  if (!overwrite && !is.null(outFile) && file.exists(outFile)) {
    return(read.csv(outFile))
  }
  if (!is.null(query) && !is.character(query)) {
    cli_abort("query must be a character; use other arguments to enter numeric CIKs or SICs")
  }
  search_url <- "https://efts.sec.gov/LATEST/search-index"
  # Zero-pad CIKs to 10 characters. Numeric input needs an explicit integer
  # format; otherwise values with 7 or more digits use scientific notation.
  padded_ciks <- if (is.numeric(CIKs)) {
    formatC(CIKs, width = 10, flag = "0", format = "d")
  } else {
    formatC(CIKs, width = 10, flag = "0")
  }
  # unspecified (zero-length) entries are dropped from the request body
  body <- Filter(length, list(
    dateRange = "custom",
    q = query,
    entityName = as.character(entity),
    ciks = as.list(padded_ciks),
    sics = as.list(as.character(SICs)),
    # the range starts at the earliest date and ends at the latest one
    startdt = as.character(date_from),
    enddt = as.character(if (is.null(date_to)) Sys.Date() else date_to),
    forms = as.list(toupper(forms)),
    from = start
  ))
  req <- httr::POST(search_url, body = body, encode = "json")
  raw_content <- rawToChar(req$content)
  # error responses are not guaranteed to be JSON, so parse defensively
  res <- tryCatch(jsonlite::fromJSON(raw_content), error = function(e) NULL)
  if (is.list(res) && !is.null(res$errorMessage)) {
    cli_abort("request failed\n message: {res$errorType} {res$errorMessage}")
  }
  if (req$status_code != 200 || !is.list(res)) {
    cli_abort("request failed ({req$status_code}): {raw_content}")
  }
  if (res$hits$total$value == 0) cli_abort("no results found")
  if (verbose) {
    cli_alert_info(paste0(
      "found {res$hits$total$value} records", if (res$hits$total$value > limit) "; returning {limit}"
    ))
  }

  # Convert one page of hits into a character matrix. A hit whose first field
  # holds several values is expanded into several rows (shorter fields are
  # recycled by cbind). Returns NULL for an empty page.
  extract_hits <- function(res) {
    d <- res$hits$hits
    if (NROW(d) == 0) {
      return(NULL)
    }
    d[["_source"]]$id <- d[["_id"]]
    d <- d[["_source"]]
    d$items <- NULL
    do.call(rbind, lapply(seq_len(nrow(d)), function(r) {
      if (length(d[r, 1][[1]]) == 1) {
        do.call(cbind, lapply(d[r, ], as.character))
      } else {
        # replace empty entries with "" so every field has something to bind
        row <- lapply(d[r, ], function(x) lapply(x, function(v) if (length(v)) v else "")[[1]])
        if (length(row$biz_states) != length(row$biz_locations)) {
          # fall back to the text after the last ", " in each location
          row$biz_states <- sub("^.*, ", "", row$biz_locations)
        }
        do.call(cbind, row)
      }
    }))
  }

  hits <- extract_hits(res)
  if (is.null(hits)) cli_abort("no results found")
  # Track the number of hits received separately from the number of rows,
  # since a single hit can expand into several rows.
  fetched <- NROW(res$hits$hits)
  limit <- min(limit, res$hits$total$value - start)
  while (fetched < limit) {
    body$from <- start + fetched
    req <- httr::POST(search_url, body = body, encode = "json")
    if (req$status_code != 200) {
      if (verbose) cli_alert_warning("page {fetched / 100 + 1} failed: {rawToChar(req$content)}")
      break
    }
    res <- jsonlite::fromJSON(rawToChar(req$content))
    n_new <- NROW(res$hits$hits)
    if (n_new == 0) break
    hits <- rbind(hits, extract_hits(res))
    fetched <- fetched + n_new
  }
  # empty list entries are rendered as "character(0)" by as.character
  hits[hits == "character(0)"] <- ""
  hits <- as.data.frame(hits)
  if (!is.null(outFile)) write.csv(hits, outFile, row.names = FALSE)
  hits
}
#' Download Filings
#'
#' Download SEC filings based on Central Index Key (CIK) and document ID (as returned
#' from \code{\link{sec_search}}).
#'
#' @param ciks A vector of Central Index Keys, or a \code{data.frame} with a \code{ciks}
#' column (and an \code{id} column, which is used when \code{ids} is not specified).
#' A single CIK is used for all \code{ids}; otherwise, \code{ciks} and \code{ids} are
#' matched by position.
#' @param out Path to a directory in which to save filings. Each filing will be saved
#' within a subdirectory: \code{{out}/{cik}/{SEC Accession Number}}.
#' @param ids A vector of document IDs. These are combinations of SEC Accession Numbers
#' and file names, separated by a colon (e.g., \code{"0001564590-22-032043:gs-424b2.htm"}).
#' If the file name is not included, this will default to the complete submission text file
#' (e.g., \code{"0001564590-22-032043.txt"}).
#' @param complete Logical; if \code{TRUE}, will download the complete submission text file
#' rather than any other file specified after a colon in \code{ids}.
#' @param user User agent string (e.g., \code{"Company Name AdminContact@example.com"});
#' defaults to the \code{SEC_USER} environment variable, which can be set with
#' \code{\link{Sys.setenv}} (e.g., \code{Sys.setenv(SEC_USER = "...")}).
#' @param verbose Logical; if \code{FALSE}, will not show status updates.
#' @seealso Search for filings with \code{\link{sec_search}}.
#' @examples
#' \dontrun{
#'
#' # search for filings based on SIC
#' forms <- sec_search(SICs = 2080, forms = "10-K", limit = 100)
#'
#' # download some of those filings
#' sec_download(forms[1:3, ], out = tempdir())
#' }
#' @returns An invisible vector of HTTP status codes, one per unique CIK and ID pair,
#' named by CIK.
#' @export
sec_download <- function(ciks, out, ids = NULL, complete = FALSE, user = Sys.getenv("SEC_USER"), verbose = TRUE) {
  if (missing(out)) cli_abort("{.arg out} must be specified")
  if (is.data.frame(ciks)) {
    if (is.null(ids)) ids <- ciks$id
    ciks <- ciks$ciks
  }
  if (is.null(ids)) cli_abort("{.arg ids} must be specified if {.arg ciks} is not a data.frame containing it")
  ids <- as.character(ids)
  # Zero-pad to 10 characters. Numeric input needs an explicit integer format;
  # otherwise values with 7 or more digits are written in scientific notation.
  ciks <- if (is.numeric(ciks)) {
    formatC(ciks, width = 10, flag = "0", format = "d")
  } else {
    formatC(ciks, width = 10, flag = "0")
  }
  # CIKs and IDs are paired by position, so they are only de-duplicated together;
  # de-duplicating CIKs alone would misalign them with their IDs.
  if (length(ciks) == 1) ciks <- rep(ciks, length(ids))
  if (length(ciks) != length(ids)) {
    cli_abort("{.arg ciks} and {.arg ids} must be the same length")
  }
  keep <- !duplicated(paste(ciks, ids))
  ciks <- ciks[keep]
  ids <- ids[keep]

  dir.create(out, FALSE, TRUE)
  out <- paste0(normalizePath(out, "/"), "/")
  base_url <- "https://www.sec.gov/Archives/edgar/data/"
  user_agent <- httr::user_agent(user)
  n <- length(ciks)
  # status codes default to 200 so files that are already present count as successful
  reqs <- structure(rep(200, n), names = ciks)
  if (verbose) {
    cli_progress_step(
      "0 / {n} downloaded",
      msg_done = "finished; {sum(reqs == 200)} of {n} successful", spinner = TRUE
    )
  }
  for (i in seq_len(n)) {
    # id[1] is the accession number, id[2] the file name (if any)
    id <- strsplit(ids[[i]], ":", fixed = TRUE)[[1]]
    if (complete || length(id) < 2) {
      id[2] <- id[1]
      if (!grepl("-", id[2])) {
        # restore the dashed form of an 18-digit accession number: 10-2-6
        id[2] <- paste0(
          substr(id[2], 1, 10), "-", substr(id[2], 11, 12), "-", substr(id[2], 13, 18)
        )
      }
      id[2] <- paste0(id[2], ".txt")
    }
    dir <- paste0(out, ciks[[i]], "/", id[1], "/")
    dir.create(dir, FALSE, TRUE)
    file <- paste0(dir, id[2])
    if (!file.exists(file)) {
      # the archive path uses the accession number without dashes
      req <- httr::GET(paste0(
        base_url, ciks[[i]], "/", gsub("-", "", id[1], fixed = TRUE), "/", id[2]
      ), httr::write_disk(file), user_agent)
      # a failed request leaves the error body on disk; remove it
      if (req$status_code != 200) unlink(file)
      # pad each request to about a tenth of a second to throttle the request rate
      Sys.sleep(max(.001, .1 - req$times[["total"]]))
      reqs[i] <- req$status_code
    }
    if (verbose) cli_progress_update()
  }
  invisible(reqs)
}
# Package load hook: when the SEC_USER environment variable is unset or empty,
# fill it with a placeholder user agent made of "Name", the current date, and
# a placeholder contact address. An existing value is left untouched.
.onLoad <- function(lib, pkg) {
  if (!nzchar(Sys.getenv("SEC_USER"))) {
    placeholder <- paste("Name", Sys.Date(), "contact@example.com")
    Sys.setenv(SEC_USER = placeholder)
  }
}
|
8 | ||
9 |
#' @importFrom httr GET user_agent write_disk POST |
|
10 |
#' @importFrom jsonlite fromJSON |
|
11 |
#' @importFrom cli cli_abort cli_progress_step cli_progress_update cli_alert_info cli_alert_warning |
|
12 |
#' @importFrom utils read.csv write.csv |
|
13 |
#' @keywords internal |
|
14 |
"_PACKAGE" |
|
15 | ||
16 |
## usethis namespace: start |
|
17 |
## usethis namespace: end |
|
18 |
NULL |