example-patentsview_inventors.Rmd
Built with R 4.2.1
This example explores the PatentsView bulk tables, with a focus on assigned inventor sex.
Before starting, we’ll need to load the package, and point to a directory where we’d like things saved:
# install if needed: remotes::install_guthub("uva-bi-sdad/uspto")
library(uspto)
outDir <- "../patentsview/"
The first step toward answering this question is to assign a sex to inventors based on their given name. The inventors table includes USPTO’s assignment, which we’ll start with:
# you may need to increase your download timeout, depending on your connection
options(timeout = 300)
inventors <- as.data.frame(download_patentsview_bulk("inventor", outDir))
Now, we can add a few of our own prediction methods, including one based on the included USPTO flags:
# associate a sex with each unique given name
inventor_sex_file <- paste0(outDir, "inventor_sex.csv.xz")
if (file.exists(inventor_sex_file)) {
inventor_sex <- vroom::vroom(inventor_sex_file, show_col_types = FALSE)
} else {
# add a standardized version of given names
inventors$name_first[is.na(inventors$name_first)] <- ""
inventors$given <- sub(
"^(.)", "\\U\\1", gsub("^([a-z-]{,6}[.-])+ | +.*", "", tolower(inventors$name_first)),
perl = TRUE
)
inventors$name_last[is.na(inventors$name_last)] <- ""
inventors$family <- sub(
"^(.)", "\\U\\1", gsub("^([a-z-]{,6}[.-])+ | +.*", "", tolower(inventors$name_last)),
perl = TRUE
)
# reverse male flag
inventors$pred_fem_patentsview <- inventors$male_flag
inventors$pred_fem_patentsview[is.na(inventors$pred_fem_patentsview)] <- .5
inventors$pred_fem_patentsview <- 1 - inventors$pred_fem_patentsview
## install if needed: remotes::install_github("miserman/lusilab")
library(lusilab)
inventor_sex <- cbind(
inventors[, c("id", "given", "family", "pred_fem_patentsview")], predict_demographics(
inventors$given, inventors$name_last,
dir = paste0(dirname(outDir), "/names")
)[, -(1:3)]
)
## make aggregate predictor
count_vars <- grep("count_", colnames(inventor_sex), fixed = TRUE, value = TRUE)
inventor_sex$prob_fem <- rowSums(
inventor_sex[, count_vars] * inventor_sex[, sub("count", "prob_fem", count_vars, fixed = TRUE)]
) / rowSums(inventor_sex[, count_vars])
vroom::vroom_write(inventor_sex, inventor_sex_file, ",")
}
# get categorical predictions
prob_cols <- grep("^prob_", colnames(inventor_sex), value = TRUE)
inventor_sex_preds <- as.data.frame(inventor_sex[, prob_cols])
dimnames(inventor_sex_preds) <- list(
inventor_sex$id,
sub("prob", "pred", prob_cols, fixed = TRUE)
)
inventor_sex_preds[inventor_sex_preds > .5] <- 1
inventor_sex_preds[inventor_sex_preds < .5] <- 0
First, we can compare these different methods with the USPTO assignments:
kable(
data.frame(
"Proportion Sexed" = colMeans(inventor_sex_preds != .5),
"Accuracy of Sexed to PatentsView" = colMeans(vapply(
inventor_sex_preds, function(x) x != .5 & x == inventor_sex$pred_fem_patentsview,
logical(nrow(inventor_sex))
)),
check.names = FALSE
),
caption = "Sex Predition Method Comparisons"
)
Proportion Sexed | Accuracy of Sexed to PatentsView | |
---|---|---|
pred_fem_wgnd | 0.8899481 | 0.7273651 |
pred_fem_fb | 0.8968863 | 0.7242883 |
pred_fem_fb_scraped | 0.7293734 | 0.6086984 |
pred_fem_skydeck | 0.7906620 | 0.6558273 |
pred_fem_usssa | 0.7724898 | 0.6429201 |
pred_fem_uspto | 0.9939588 | 0.7795239 |
pred_fem | 0.9955821 | 0.7712082 |
Now we can move forward with the method with the aggregate prediction:
# add assigned sex to inventors data
inventors$pred_fem <- inventor_sex_preds$pred_fem
## focus on just sexed inventors
inventors <- inventors[inventors$pred_fem != .5, ]
rownames(inventors) <- inventors$id
# add patent IDs
patent_inventor <- download_patentsview_bulk("patent_inventor", outDir)
patent_inventor <- patent_inventor[patent_inventor$inventor_id %in% inventors$id, ]
inventors <- cbind(inventors[patent_inventor$inventor_id, -1], patent_inventor)
# count inventors predicted to be female in each patent
female_inventors <- tapply(inventors$pred_fem == 1, inventors$patent_id, sum)
Now we can add patent category information, to get breakdowns of classes by inventor sex. There are multiple classification schemes, but we can start with the World Intellectual Property Organization (WIPO) technology fields for a high-level overview:
library(Matrix)
# this makes a patent x WIPO category matrix
categories_wipo <- patentsview_class_matrix("wipo", paste0(outDir, "wipo_matrix.rds"), dir = outDir)
dim(categories_wipo)
#> [1] 7221215 35
# which we can join to the inventors matrix
inventors_wipo <- female_inventors[names(female_inventors) %in% rownames(categories_wipo)]
inventors_wipo <- cbind(as.numeric(inventors_wipo), categories_wipo[names(inventors_wipo), ])
# and get category breakdowns by inventor sex
wipo_summary <- data.frame(
Any_Female = colSums(inventors_wipo[inventors_wipo[, 1] != 0, -1] != 0),
No_Female = colSums(inventors_wipo[inventors_wipo[, 1] == 0, -1] != 0)
)
wipo_summary <- cbind(wipo_summary, sweep(wipo_summary, 2, colSums(wipo_summary), "/") * 100)
colnames(wipo_summary)[3:4] <- c("Any_Female_Percent", "No_Female_Percent")
wipo_summary$Difference <- wipo_summary$Any_Female_Percent - wipo_summary$No_Female_Percent
wipo_summary <- wipo_summary[order(-wipo_summary$Difference), ]
# add category titles
wipo_field <- as.data.frame(download_patentsview_bulk("wipo_field", outDir))
rownames(wipo_field) <- wipo_field$id
wipo_summary$Title <- wipo_field[rownames(wipo_summary), "field_title"]
kable(
wipo_summary,
digits = 3,
col.names = gsub("_", " ", colnames(wipo_summary), fixed = TRUE),
caption = "World Intellectual Property Organization Categories"
)
Any Female | No Female | Any Female Percent | No Female Percent | Difference | Title | |
---|---|---|---|---|---|---|
16 | 140288 | 168928 | 6.430 | 1.995 | 4.435 | Pharmaceuticals |
15 | 102368 | 112843 | 4.692 | 1.333 | 3.359 | Biotechnology |
14 | 113777 | 204459 | 5.215 | 2.415 | 2.800 | Organic fine chemistry |
4 | 142830 | 439789 | 6.547 | 5.194 | 1.353 | Digital communication |
6 | 223799 | 765056 | 10.258 | 9.035 | 1.223 | Computer technology |
19 | 70943 | 176861 | 3.252 | 2.089 | 1.163 | Basic materials chemistry |
11 | 31762 | 50883 | 1.456 | 0.601 | 0.855 | Analysis of biological materials |
17 | 53230 | 143628 | 2.440 | 1.696 | 0.744 | Macromolecular chemistry, polymers |
8 | 110959 | 370133 | 5.086 | 4.371 | 0.715 | Semiconductors |
7 | 46905 | 123348 | 2.150 | 1.457 | 0.693 | IT methods for management |
18 | 19609 | 47593 | 0.899 | 0.562 | 0.337 | Food chemistry |
22 | 17513 | 44769 | 0.803 | 0.529 | 0.274 | Micro-structural and nano-technology |
13 | 103601 | 383823 | 4.749 | 4.533 | 0.216 | Medical technology |
20 | 36649 | 136649 | 1.680 | 1.614 | 0.066 | Materials, metallurgy |
21 | 37255 | 140355 | 1.708 | 1.658 | 0.050 | Surface technology, coating |
34 | 37871 | 153868 | 1.736 | 1.817 | -0.081 | Other consumer goods |
3 | 75840 | 312227 | 3.476 | 3.687 | -0.211 | Telecommunications |
23 | 49807 | 217028 | 2.283 | 2.563 | -0.280 | Chemical engineering |
28 | 34725 | 158782 | 1.592 | 1.875 | -0.284 | Textile and paper machines |
24 | 20131 | 103924 | 0.923 | 1.227 | -0.305 | Environmental technology |
9 | 86916 | 369601 | 3.984 | 4.365 | -0.381 | Optics |
12 | 41088 | 196484 | 1.883 | 2.320 | -0.437 | Control |
30 | 14141 | 106691 | 0.648 | 1.260 | -0.612 | Thermal processes and apparatus |
5 | 26046 | 155256 | 1.194 | 1.834 | -0.640 | Basic communication processes |
2 | 99922 | 447367 | 4.580 | 5.283 | -0.703 | Audio-visual technology |
33 | 34932 | 195347 | 1.601 | 2.307 | -0.706 | Furniture, games |
29 | 47057 | 279402 | 2.157 | 3.300 | -1.143 | Other special machines |
10 | 83073 | 425977 | 3.808 | 5.031 | -1.223 | Measurement |
25 | 25881 | 207495 | 1.186 | 2.450 | -1.264 | Handling |
26 | 22856 | 199093 | 1.048 | 2.351 | -1.304 | Machine tools |
27 | 29373 | 232992 | 1.346 | 2.752 | -1.405 | Engines, pumps, turbines |
1 | 99479 | 519907 | 4.560 | 6.140 | -1.580 | Electrical machinery, apparatus, energy |
35 | 25264 | 233882 | 1.158 | 2.762 | -1.604 | Civil engineering |
31 | 27215 | 269472 | 1.247 | 3.182 | -1.935 | Mechanical elements |
32 | 48656 | 373769 | 2.230 | 4.414 | -2.184 | Transport |
For a more refined classification, we might look at the United States Patent Classification (USPC), which is most closely related to examination process:
# get the patent x USPC category matrix
categories_uspc <- patentsview_class_matrix(
"uspc_current", paste0(outDir, "uspc_current_matrix.rds"),
dir = outDir
)
dim(categories_uspc)
#> [1] 6597925 475
# join to the inventors matrix
inventors_uspc <- female_inventors[names(female_inventors) %in% rownames(categories_uspc)]
inventors_uspc <- cbind(as.numeric(inventors_uspc), categories_uspc[names(inventors_uspc), ])
# and get category breakdowns by inventor sex
uspc_summary <- data.frame(
Any_Female = colSums(inventors_uspc[inventors_uspc[, 1] != 0, -1] != 0),
No_Female = colSums(inventors_uspc[inventors_uspc[, 1] == 0, -1] != 0)
)
uspc_summary <- cbind(uspc_summary, sweep(uspc_summary, 2, colSums(uspc_summary), "/") * 100)
colnames(uspc_summary)[3:4] <- c("Any_Female_Percent", "No_Female_Percent")
uspc_summary$Difference <- uspc_summary$Any_Female_Percent - uspc_summary$No_Female_Percent
uspc_summary <- uspc_summary[order(-uspc_summary$Difference), ]
# add category titles
uspc_field <- as.data.frame(download_patentsview_bulk("mainclass_current", outDir))
rownames(uspc_field) <- uspc_field$id
uspc_summary$Title <- uspc_field[rownames(uspc_summary), "title"]
kable(
uspc_summary[c(1:20, 1:20 + nrow(uspc_summary) - 20), ],
digits = 3,
col.names = gsub("_", " ", colnames(uspc_summary), fixed = TRUE),
caption = paste(
"U.S. Patent Classification categories with the highest and lowest",
"proportion of any female inventor"
)
)
Any Female | No Female | Any Female Percent | No Female Percent | Difference | Title | |
---|---|---|---|---|---|---|
No longer published | 224151 | 677667 | 12.453 | 7.938 | 4.515 | NO LONGER PUBLISHED |
514 | 58210 | 92627 | 3.234 | 1.085 | 2.149 | DRUG, BIO-AFFECTING AND BODY TREATING COMPOSITIONS |
435 | 53861 | 77304 | 2.992 | 0.906 | 2.087 | CHEMISTRY: MOLECULAR BIOLOGY AND MICROBIOLOGY |
424 | 40283 | 61337 | 2.238 | 0.719 | 1.520 | DRUG, BIO-AFFECTING AND BODY TREATING COMPOSITIONS |
536 | 25823 | 32451 | 1.435 | 0.380 | 1.055 | ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES |
530 | 24279 | 29769 | 1.349 | 0.349 | 1.000 | CHEMISTRY: NATURAL RESINS OR DERIVATIVES; PEPTIDES OR PROTEINS; LIGNINS OR REACTION PRODUCTS THEREOF |
546 | 16501 | 29030 | 0.917 | 0.340 | 0.577 | ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES |
544 | 15184 | 26435 | 0.844 | 0.310 | 0.534 | ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES |
548 | 14286 | 28499 | 0.794 | 0.334 | 0.460 | ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES |
438 | 30399 | 111712 | 1.689 | 1.309 | 0.380 | SEMICONDUCTOR DEVICE MANUFACTURING: PROCESS |
D14 | 17197 | 51719 | 0.955 | 0.606 | 0.350 | RECORDING, COMMUNICATION, OR INFORMATION RETRIEVAL EQUIPMENT |
436 | 10907 | 25378 | 0.606 | 0.297 | 0.309 | CHEMISTRY: ANALYTICAL AND IMMUNOLOGICAL TESTING |
707 | 16543 | 52594 | 0.919 | 0.616 | 0.303 | DATA PROCESSING: DATABASE AND FILE MANAGEMENT OR DATA STRUCTURES |
370 | 27087 | 103467 | 1.505 | 1.212 | 0.293 | MULTIPLEX COMMUNICATIONS |
257 | 47154 | 199440 | 2.620 | 2.336 | 0.284 | ACTIVE SOLID-STATE DEVICES (E.G., TRANSISTORS, SOLID-STATE DIODES) |
705 | 13156 | 40236 | 0.731 | 0.471 | 0.260 | DATA PROCESSING: FINANCIAL, BUSINESS PRACTICE, MANAGEMENT, OR COST/PRICE DETERMINATION |
428 | 26798 | 106553 | 1.489 | 1.248 | 0.241 | STOCK MATERIAL OR MISCELLANEOUS ARTICLES |
430 | 14084 | 46833 | 0.782 | 0.549 | 0.234 | RADIATION IMAGERY CHEMISTRY: PROCESS, COMPOSITION, OR PRODUCT THEREOF |
709 | 19121 | 70876 | 1.062 | 0.830 | 0.232 | ELECTRICAL COMPUTERS AND DIGITAL PROCESSING SYSTEMS: MULTICOMPUTER DATA TRANSFERRING |
510 | 6612 | 12180 | 0.367 | 0.143 | 0.225 | CLEANING COMPOSITIONS FOR SOLID SURFACES, AUXILIARY COMPOSITIONS THEREFOR, OR PROCESSES OF PREPARING THE COMPOSITIONS |
310 | 4287 | 37460 | 0.238 | 0.439 | -0.201 | ELECTRICAL GENERATOR OR MOTOR STRUCTURE |
G9B | 10744 | 68623 | 0.597 | 0.804 | -0.207 | INFORMATION STORAGE BASED ON RELATIVE MOVEMENT BETWEEN RECORD CARRIER AND TRANSDUCER |
180 | 3177 | 33643 | 0.177 | 0.394 | -0.218 | MOTOR VEHICLES |
414 | 1944 | 28781 | 0.108 | 0.337 | -0.229 | MATERIAL OR ARTICLE HANDLING |
60 | 3918 | 39833 | 0.218 | 0.467 | -0.249 | POWER PLANTS |
348 | 14224 | 88743 | 0.790 | 1.040 | -0.249 | TELEVISION |
324 | 7288 | 56738 | 0.405 | 0.665 | -0.260 | ELECTRICITY: MEASURING AND TESTING |
327 | 5469 | 48145 | 0.304 | 0.564 | -0.260 | MISCELLANEOUS ACTIVE ELECTRICAL NONLINEAR DEVICES, CIRCUITS, AND SYSTEMS |
248 | 4322 | 42844 | 0.240 | 0.502 | -0.262 | SUPPORTS |
250 | 9920 | 70470 | 0.551 | 0.825 | -0.274 | RADIANT ENERGY |
74 | 2464 | 35477 | 0.137 | 0.416 | -0.279 | MACHINE ELEMENT OR MECHANISM |
361 | 9646 | 70289 | 0.536 | 0.823 | -0.287 | ELECTRICITY: ELECTRICAL SYSTEMS AND DEVICES |
280 | 4845 | 48938 | 0.269 | 0.573 | -0.304 | LAND VEHICLES |
340 | 9825 | 74449 | 0.546 | 0.872 | -0.326 | COMMUNICATIONS: ELECTRICAL |
52 | 3101 | 42632 | 0.172 | 0.499 | -0.327 | STATIC STRUCTURES (E.G., BUILDINGS) |
439 | 6112 | 58424 | 0.340 | 0.684 | -0.345 | ELECTRICAL CONNECTORS |
137 | 3075 | 44134 | 0.171 | 0.517 | -0.346 | FLUID HANDLING |
29 | 8097 | 70267 | 0.450 | 0.823 | -0.373 | METAL WORKING |
123 | 5260 | 56999 | 0.292 | 0.668 | -0.375 | INTERNAL-COMBUSTION ENGINES |
73 | 9268 | 78942 | 0.515 | 0.925 | -0.410 | MEASURING AND TESTING |
To get at this question, we’ll need to associate a date with each patent number:
# start with all patent information
patents <- download_patentsview_bulk("patent", outDir)
# associate a date with each patent number
all_dates <- structure(patents$date, names = patents$number)
# get years, and focus on only those found in the USPC set
uspc_year <- substr(all_dates[names(all_dates) %in% rownames(inventors_uspc)], 1, 4)
Now we could look at trends in differences in the percent of female versus male assigned inventors:
# use this set of years to break down the overall summaries
uspc_yearly_summaries <- do.call(rbind, lapply(sort(unique(uspc_year)), function(year) {
d <- inventors_uspc[names(which(uspc_year == year)), ]
r <- data.frame(
Any_Female = colSums(d[d[, 1] != 0, -1] != 0, na.rm = TRUE),
No_Female = colSums(d[d[, 1] == 0, -1] != 0, na.rm = TRUE)
)
r <- sweep(r, 2, colSums(r), "/") * 100
r$Difference <- r$Any_Female - r$No_Female
c(Year = as.numeric(year), t(r[3])[1, ])
}))
uspc_yearly_summaries[is.na(uspc_yearly_summaries)] <- 0
uspc_yearly_summaries <- uspc_yearly_summaries[, colSums(uspc_yearly_summaries) != 0]
# plot categories with the most positive and negative trends
library(splot)
trends <- sort(cor(uspc_yearly_summaries[, -1], uspc_yearly_summaries[, 1])[, 1], TRUE)
splot(
uspc_yearly_summaries[, names(trends[c(1:5, 1:4 + length(trends) - 4)])] ~ uspc_yearly_summaries[, 1],
lines = "spline", levels = list(mv = uspc_field[names(trends[c(1:5, 1:4 + length(trends) - 4)]), "title"]),
title = "Inventor Distribution by Assigned Sex Per USPC Class Over Time",
laby = "Percent Female-Assigned Inventors - Male-Assigned Inventors",
labx = "Year", leg.title = "U.S. Patent Classification", myl = c(-.6, .7)
)
This highlights categories that are trending toward more similar allocations of inventors by sex, and suggests there are no categories showing a strong trend toward increasing difference in inventor allocation by sex.
We might also look at categories that show some trend in the percent of female-assigned inventors alone – where a greater or lesser percentage of the female-assigned inventor workforce has been allocated:
uspc_yearly_female_summaries <- do.call(rbind, lapply(sort(unique(uspc_year)), function(year) {
d <- inventors_uspc[names(which(uspc_year == year)), ]
r <- colSums(d[d[, 1] != 0, -1] != 0, na.rm = TRUE)
r <- r / sum(r) * 100
c(Year = as.numeric(year), r)
}))
uspc_yearly_female_summaries <- uspc_yearly_female_summaries[, colSums(uspc_yearly_female_summaries) != 0]
trends_female <- sort(cor(uspc_yearly_female_summaries[, -1], uspc_yearly_female_summaries[, 1])[, 1], TRUE)
splot(
uspc_yearly_female_summaries[, names(trends_female[c(1:7, 1:2 + length(trends_female) - 6)])] ~
uspc_yearly_female_summaries[, 1],
lines = "spline", levels = list(
mv = uspc_field[names(trends_female[c(1:7, 1:2 + length(trends_female) - 6)]), "title"]
),
labels.trim = FALSE,
title = "Share of Female-Assigned Inventors Per USPC Class Over Time", lpos = "topleft",
laby = "Percent Female-Assigned Inventors", labx = "Year", leg.title = "U.S. Patent Classification"
)
Cooperative Patent Classifications (CPC) might give an even more refined look, and could be compared across patent offices:
# get the patent x CPC category matrix
categories_cpc <- patentsview_class_matrix(
"cpc_current", paste0(outDir, "cpc_current_matrix.rds"),
dir = outDir
)
dim(categories_cpc)
#> [1] 7228678 670
# join to the inventors matrix
inventors_cpc <- female_inventors[names(female_inventors) %in% rownames(categories_cpc)]
inventors_cpc <- cbind(as.numeric(inventors_cpc), categories_cpc[names(inventors_cpc), ])
# and get category breakdowns by inventor sex
cpc_summary <- data.frame(
Any_Female = colSums(inventors_cpc[inventors_cpc[, 1] != 0, -1] != 0),
No_Female = colSums(inventors_cpc[inventors_cpc[, 1] == 0, -1] != 0)
)
cpc_summary <- cbind(cpc_summary, sweep(cpc_summary, 2, colSums(cpc_summary), "/") * 100)
colnames(cpc_summary)[3:4] <- c("Any_Female_Percent", "No_Female_Percent")
cpc_summary$Difference <- cpc_summary$Any_Female_Percent - cpc_summary$No_Female_Percent
cpc_summary <- cpc_summary[order(-cpc_summary$Difference), ]
# add category titles
cpc_group <- as.data.frame(download_patentsview_bulk("cpc_group", outDir))
rownames(cpc_group) <- cpc_group$id
cpc_summary <- cpc_summary[rownames(cpc_summary) %in% cpc_group$id, ]
cpc_summary$Title <- cpc_group[rownames(cpc_summary), "title"]
cpc_selection <- names(which(rowSums(cpc_summary[, 1:2]) > 5))
cpc_selection <- cpc_selection[c(1:20, 1:20 + length(cpc_selection) - 20)]
kable(
cpc_summary[cpc_selection, ],
digits = 3,
col.names = gsub("_", " ", colnames(cpc_summary), fixed = TRUE),
caption = paste(
"Cooperative Patent Classifications: 20 categories with the highest and lowest",
"proportion of any female inventor with at least 5 associated patents"
)
)
Any Female | No Female | Any Female Percent | No Female Percent | Difference | Title | |
---|---|---|---|---|---|---|
A61K | 118059 | 143894 | 3.895 | 1.244 | 2.651 | PREPARATIONS FOR MEDICAL, DENTAL, OR TOILET PURPOSES |
A61P | 104944 | 111091 | 3.462 | 0.961 | 2.502 | SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS |
C07K | 53879 | 52116 | 1.778 | 0.451 | 1.327 | PEPTIDES |
C12N | 52539 | 52564 | 1.733 | 0.455 | 1.279 | MICROORGANISMS OR ENZYMES; COMPOSITIONS THEREOF; PROPAGATING, PRESERVING, OR MAINTAINING MICROORGANISMS; MUTATION OR GENETIC ENGINEERING; CULTURE MEDIA |
C07D | 62193 | 97354 | 2.052 | 0.842 | 1.210 | HETEROCYCLIC COMPOUNDS |
G06F | 159496 | 525401 | 5.262 | 4.544 | 0.719 | ELECTRIC DIGITAL DATA PROCESSING |
H04W | 66143 | 169788 | 2.182 | 1.468 | 0.714 | WIRELESS COMMUNICATION NETWORKS |
H04L | 109031 | 341426 | 3.597 | 2.953 | 0.645 | TRANSMISSION OF DIGITAL INFORMATION, e.g. TELEGRAPHIC COMMUNICATION |
G01N | 57456 | 148061 | 1.896 | 1.280 | 0.615 | INVESTIGATING OR ANALYSING MATERIALS BY DETERMINING THEIR CHEMICAL OR PHYSICAL PROPERTIES |
C12Q | 22569 | 27393 | 0.745 | 0.237 | 0.508 | MEASURING OR TESTING PROCESSES INVOLVING ENZYMES, NUCLEIC ACIDS OR MICROORGANISMS ; COMPOSITIONS OR TEST PAPERS THEREFOR; PROCESSES OF PREPARING SUCH COMPOSITIONS; CONDITION-RESPONSIVE CONTROL IN MICROBIOLOGICAL OR ENZYMOLOGICAL PROCESSES |
G06Q | 48204 | 127337 | 1.590 | 1.101 | 0.489 | DATA PROCESSING SYSTEMS OR METHODS, SPECIALLY ADAPTED FOR ADMINISTRATIVE, COMMERCIAL, FINANCIAL, MANAGERIAL, SUPERVISORY OR FORECASTING PURPOSES; SYSTEMS OR METHODS SPECIALLY ADAPTED FOR ADMINISTRATIVE, COMMERCIAL, FINANCIAL, MANAGERIAL, SUPERVISORY OR FORECASTING PURPOSES, NOT OTHERWISE PROVIDED FOR |
H01L | 113814 | 381962 | 3.755 | 3.303 | 0.452 | SEMICONDUCTOR DEVICES; ELECTRIC SOLID STATE DEVICES NOT OTHERWISE PROVIDED FOR |
A61Q | 16866 | 18698 | 0.556 | 0.162 | 0.395 | SPECIFIC USE OF COSMETICS OR SIMILAR TOILET PREPARATIONS |
C07C | 33187 | 82127 | 1.095 | 0.710 | 0.385 | ACYCLIC OR CARBOCYCLIC COMPOUNDS |
C12P | 12297 | 13758 | 0.406 | 0.119 | 0.287 | FERMENTATION OR ENZYME-USING PROCESSES TO SYNTHESISE A DESIRED CHEMICAL COMPOUND OR COMPOSITION OR TO SEPARATE OPTICAL ISOMERS FROM A RACEMIC MIXTURE |
C12Y | 9799 | 8108 | 0.323 | 0.070 | 0.253 | ENZYMES |
C08L | 20786 | 51386 | 0.686 | 0.444 | 0.241 | COMPOSITIONS OF MACROMOLECULAR COMPOUNDS |
C09D | 16359 | 35799 | 0.540 | 0.310 | 0.230 | COATING COMPOSITIONS, e.g. PAINTS, VARNISHES OR LACQUERS; FILLING PASTES; CHEMICAL PAINT OR INK REMOVERS; INKS; CORRECTING FLUIDS; WOODSTAINS; PASTES OR SOLIDS FOR COLOURING OR PRINTING; USE OF MATERIALS THEREFOR |
C09K | 15928 | 35505 | 0.526 | 0.307 | 0.218 | MATERIALS FOR MISCELLANEOUS APPLICATIONS, NOT PROVIDED FOR ELSEWHERE |
C08F | 17480 | 41598 | 0.577 | 0.360 | 0.217 | MACROMOLECULAR COMPOUNDS OBTAINED BY REACTIONS ONLY INVOLVING CARBON-TO-CARBON UNSATURATED BONDS |
F02B | 3796 | 35699 | 0.125 | 0.309 | -0.183 | INTERNAL-COMBUSTION PISTON ENGINES; COMBUSTION ENGINES IN GENERAL |
H01H | 4635 | 38928 | 0.153 | 0.337 | -0.184 | ELECTRIC SWITCHES; RELAYS; SELECTORS; EMERGENCY PROTECTIVE DEVICES |
F16D | 2968 | 33174 | 0.098 | 0.287 | -0.189 | COUPLINGS FOR TRANSMITTING ROTATION; CLUTCHES; BRAKES |
B65G | 2849 | 33446 | 0.094 | 0.289 | -0.195 | TRANSPORT OR STORAGE DEVICES, e.g. CONVEYORS FOR LOADING OR TIPPING, SHOP CONVEYOR SYSTEMS OR PNEUMATIC TUBE CONVEYORS |
B62D | 4831 | 41973 | 0.159 | 0.363 | -0.204 | MOTOR VEHICLES; TRAILERS |
H03K | 8291 | 55429 | 0.274 | 0.479 | -0.206 | PULSE TECHNIQUE |
F16L | 3595 | 37625 | 0.119 | 0.325 | -0.207 | PIPES; JOINTS OR FITTINGS FOR PIPES; SUPPORTS FOR PIPES, CABLES OR PROTECTIVE TUBING; MEANS FOR THERMAL INSULATION IN GENERAL |
G01R | 14938 | 81869 | 0.493 | 0.708 | -0.215 | MEASURING ELECTRIC VARIABLES; MEASURING MAGNETIC VARIABLES |
B65H | 4546 | 42323 | 0.150 | 0.366 | -0.216 | HANDLING THIN OR FILAMENTARY MATERIAL, e.g. SHEETS, WEBS, CABLES |
A63B | 6350 | 50705 | 0.210 | 0.438 | -0.229 | APPARATUS FOR PHYSICAL TRAINING, GYMNASTICS, SWIMMING, CLIMBING, OR FENCING; BALL GAMES; TRAINING EQUIPMENT |
B65D | 11747 | 71448 | 0.388 | 0.618 | -0.230 | CONTAINERS FOR STORAGE OR TRANSPORT OF ARTICLES OR MATERIALS, e.g. BAGS, BARRELS, BOTTLES, BOXES, CANS, CARTONS, CRATES, DRUMS, JARS, TANKS, HOPPERS, FORWARDING CONTAINERS; ACCESSORIES, CLOSURES, OR FITTINGS THEREFOR; PACKAGING ELEMENTS; PACKAGES |
G11B | 20710 | 106241 | 0.683 | 0.919 | -0.235 | INFORMATION STORAGE BASED ON RELATIVE MOVEMENT BETWEEN RECORD CARRIER AND TRANSDUCER |
B60R | 9077 | 62621 | 0.299 | 0.542 | -0.242 | VEHICLES, VEHICLE FITTINGS, OR VEHICLE PARTS, NOT OTHERWISE PROVIDED FOR |
E21B | 8401 | 62739 | 0.277 | 0.543 | -0.265 | EARTH DRILLING, e.g. DEEP DRILLING ; OBTAINING OIL, GAS, WATER, SOLUBLE OR MELTABLE MATERIALS OR A SLURRY OF MINERALS FROM WELLS |
F16H | 5390 | 51508 | 0.178 | 0.445 | -0.268 | GEARING |
Y02T | 17156 | 97400 | 0.566 | 0.842 | -0.276 | CLIMATE CHANGE MITIGATION TECHNOLOGIES RELATED TO TRANSPORTATION |
B29C | 15116 | 90347 | 0.499 | 0.781 | -0.283 | SHAPING OR JOINING OF PLASTICS; SHAPING OF MATERIAL IN A PLASTIC STATE, NOT OTHERWISE PROVIDED FOR; AFTER-TREATMENT OF THE SHAPED PRODUCTS, e.g. REPAIRING |
H01R | 9747 | 75973 | 0.322 | 0.657 | -0.335 | ELECTRICALLY-CONDUCTIVE CONNECTIONS; STRUCTURAL ASSOCIATIONS OF A PLURALITY OF MUTUALLY-INSULATED ELECTRICAL CONNECTING ELEMENTS; COUPLING DEVICES; CURRENT COLLECTORS |
Y10S | 75281 | 353260 | 2.484 | 3.055 | -0.571 | TECHNICAL SUBJECTS COVERED BY FORMER USPC CROSS-REFERENCE ART COLLECTIONS [XRACs] AND DIGESTS |
Y10T | 83598 | 503119 | 2.758 | 4.351 | -1.593 | TECHNICAL SUBJECTS COVERED BY FORMER US CLASSIFICATION |
The locations table associates inventor IDs with location IDs:
locations <- as.data.frame(download_patentsview_bulk("location", outDir))
locations$state_fips[!is.na(locations$state_fips)] <- formatC(
locations$state_fips[!is.na(locations$state_fips)],
width = 2, flag = 0, format = "d"
)
locations$county_fips[!is.na(locations$county_fips)] <- formatC(
locations$county_fips[!is.na(locations$county_fips)],
width = 5, flag = 0, format = "d"
)
# we can align this with our inventors data for inventor sex
rownames(locations) <- locations$id
located_inventors <- inventors[!is.na(inventors$location_id), ]
locations <- locations[located_inventors$location_id, ]
Inventor location is recorded as part of each patent, which means inventors may have multiple locations over time. For an initial look, we can focus only on each inventors most recent location:
# add date information to inventor data
inventors$date <- structure(patents$date, names = patents$number)[inventors$patent_id]
# sort inventors by date, then add location information
inventors_last_seen <- inventors[order(inventors$date, decreasing = TRUE), ]
inventors_last_seen <- inventors_last_seen[!duplicated(inventors_last_seen$inventor_id), ]
inventors_last_seen <- inventors_last_seen[inventors_last_seen$location_id %in% locations$id, ]
inventors_last_seen <- cbind(inventors_last_seen, locations[inventors_last_seen$location_id, ])
Now we can look at high-level summaries of locations, like we did with patent classes:
# top countries
breakdown_countries <- as.data.frame(t(vapply(
split(inventors_last_seen$pred_fem, inventors_last_seen$country),
function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
c(0, 0)
)))
breakdown_countries <- cbind(
breakdown_countries,
sweep(breakdown_countries, 2, colSums(breakdown_countries), "/") * 100
)
colnames(breakdown_countries)[3:4] <- c("Female_Percent", "Male_Percent")
breakdown_countries$Difference <- breakdown_countries$Female_Percent - breakdown_countries$Male_Percent
breakdown_countries <- breakdown_countries[order(-breakdown_countries$Difference), ]
kable(
breakdown_countries[rowSums(breakdown_countries[, 1:2]) > 1e4, ],
col.names = gsub("_", " ", colnames(breakdown_countries), fixed = TRUE),
caption = "Countries with at least 10,000 associated inventors"
)
Female | Male | Female Percent | Male Percent | Difference | |
---|---|---|---|---|---|
CN | 38044 | 104951 | 6.7701299 | 3.0053345 | 3.7647953 |
FR | 24075 | 106275 | 4.2842728 | 3.0432481 | 1.2410248 |
KR | 23911 | 122321 | 4.2550882 | 3.5027348 | 0.7523533 |
IT | 9958 | 44167 | 1.7720785 | 1.2647484 | 0.5073301 |
US | 269384 | 1658264 | 47.9382994 | 47.4853794 | 0.4529200 |
ES | 4650 | 17826 | 0.8274920 | 0.5104581 | 0.3170338 |
FI | 4357 | 16529 | 0.7753511 | 0.4733178 | 0.3020333 |
IL | 7116 | 33690 | 1.2663296 | 0.9647333 | 0.3015964 |
IN | 9325 | 49048 | 1.6594328 | 1.4045188 | 0.2549140 |
SU | 3268 | 12008 | 0.5815578 | 0.3438562 | 0.2377015 |
SG | 2692 | 10755 | 0.4790556 | 0.3079758 | 0.1710797 |
BE | 3709 | 17464 | 0.6600361 | 0.5000921 | 0.1599440 |
DK | 2682 | 14017 | 0.4772760 | 0.4013852 | 0.0758908 |
RU | 2211 | 11556 | 0.3934591 | 0.3309130 | 0.0625461 |
SE | 5775 | 34979 | 1.0276916 | 1.0016445 | 0.0260471 |
NO | 1348 | 9725 | 0.2398837 | 0.2784812 | -0.0385975 |
AU | 3865 | 26212 | 0.6877971 | 0.7505963 | -0.0627992 |
NL | 6026 | 41085 | 1.0723584 | 1.1764935 | -0.1041351 |
CH | 4763 | 37774 | 0.8476009 | 1.0816810 | -0.2340801 |
AT | 1551 | 17833 | 0.2760086 | 0.5106586 | -0.2346500 |
CA | 13261 | 92134 | 2.3598647 | 2.6383121 | -0.2784474 |
GB | 13156 | 98987 | 2.3411794 | 2.8345518 | -0.4933724 |
TW | 9993 | 93957 | 1.7783069 | 2.6905148 | -0.9122079 |
DE | 26375 | 265556 | 4.6935699 | 7.6043546 | -2.9107846 |
JP | 52435 | 464053 | 9.3310840 | 13.2884346 | -3.9573506 |
# top states
breakdown_states <- as.data.frame(t(vapply(
split(inventors_last_seen$pred_fem, inventors_last_seen$state_fips),
function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
c(0, 0)
)))
breakdown_states <- cbind(
breakdown_states,
sweep(breakdown_states, 2, colSums(breakdown_states), "/") * 100
)
colnames(breakdown_states)[3:4] <- c("Female_Percent", "Male_Percent")
breakdown_states$Difference <- breakdown_states$Female_Percent - breakdown_states$Male_Percent
breakdown_states <- breakdown_states[order(-breakdown_states$Difference), ]
## install if needed: remotes::install_github("uva-bi-sdad/catchment")
library(catchment)
states <- download_census_shapes(paste0(dirname(outDir), "/maps"))
state_names <- structure(states$NAME, names = states$STATEFP)
breakdown_states <- breakdown_states[rownames(breakdown_states) %in% names(state_names), ]
rownames(breakdown_states) <- state_names[rownames(breakdown_states)]
kable(
breakdown_states[rowSums(breakdown_states[, 1:2]) > 1e4, ],
col.names = gsub("_", " ", colnames(breakdown_states), fixed = TRUE),
caption = "States with at least 10,000 associated inventors"
)
Female | Male | Female Percent | Male Percent | Difference | |
---|---|---|---|---|---|
California | 59595 | 337658 | 22.1259802 | 20.3653186 | 1.7606616 |
New York | 18892 | 97296 | 7.0140787 | 5.8682573 | 1.1458213 |
New Jersey | 11716 | 58560 | 4.3498277 | 3.5319556 | 0.8178722 |
Massachusetts | 13899 | 73660 | 5.1603154 | 4.4426887 | 0.7176268 |
Maryland | 5991 | 29161 | 2.2242931 | 1.7588005 | 0.4654926 |
Georgia | 5596 | 29640 | 2.0776405 | 1.7876906 | 0.2899499 |
Florida | 9745 | 56698 | 3.6180498 | 3.4196519 | 0.1983978 |
North Carolina | 6806 | 39094 | 2.5268801 | 2.3578940 | 0.1689861 |
Virginia | 4951 | 29177 | 1.8381698 | 1.7597655 | 0.0784043 |
Missouri | 3441 | 20193 | 1.2775484 | 1.2179095 | 0.0596389 |
Louisiana | 1326 | 9313 | 0.4923072 | 0.5616992 | -0.0693919 |
Washington | 10677 | 66907 | 3.9640757 | 4.0353919 | -0.0713163 |
Alabama | 1352 | 9553 | 0.5019603 | 0.5761744 | -0.0742141 |
Tennessee | 2616 | 17403 | 0.9712487 | 1.0496350 | -0.0783863 |
South Carolina | 2002 | 13700 | 0.7432874 | 0.8262943 | -0.0830069 |
Kentucky | 1422 | 10146 | 0.5279494 | 0.6119403 | -0.0839909 |
Kansas | 1473 | 10490 | 0.5468843 | 0.6326881 | -0.0858038 |
Minnesota | 7133 | 45613 | 2.6482862 | 2.7510773 | -0.1027911 |
Oregon | 3915 | 26031 | 1.4535315 | 1.5700194 | -0.1164878 |
Colorado | 5377 | 35085 | 1.9963318 | 2.1160974 | -0.1197655 |
Arizona | 4355 | 28950 | 1.6168914 | 1.7460743 | -0.1291829 |
Oklahoma | 1343 | 11238 | 0.4986189 | 0.6778025 | -0.1791837 |
Illinois | 11426 | 73643 | 4.2421587 | 4.4416633 | -0.1995046 |
New Hampshire | 1292 | 11438 | 0.4796840 | 0.6898652 | -0.2101813 |
Connecticut | 4479 | 31161 | 1.6629292 | 1.8794274 | -0.2164982 |
Iowa | 1564 | 13703 | 0.5806701 | 0.8264752 | -0.2458051 |
Utah | 2113 | 17121 | 0.7844986 | 1.0326266 | -0.2481279 |
Pennsylvania | 9865 | 65445 | 3.6626025 | 3.9472137 | -0.2846112 |
Indiana | 4086 | 29885 | 1.5170191 | 1.8024674 | -0.2854483 |
Wisconsin | 4793 | 35458 | 1.7795087 | 2.1385943 | -0.3590855 |
Ohio | 9223 | 64548 | 3.4242456 | 3.8931125 | -0.4688669 |
Texas | 16033 | 109365 | 5.9526108 | 6.5961803 | -0.6435696 |
Michigan | 9543 | 77209 | 3.5430528 | 4.6567411 | -1.1136883 |
# map of counties
library(leaflet)
library(sf)
counties <- st_transform(download_census_shapes(
paste0(dirname(outDir), "/maps"),
entity = "county"
), "WGS84")
counties$NAME <- paste0(counties$NAME, ", ", state_names[counties$STATEFP])
breakdown_counties <- as.data.frame(t(vapply(
split(inventors_last_seen$pred_fem, inventors_last_seen$county_fips),
function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
c(0, 0)
)))
breakdown_counties <- cbind(
breakdown_counties,
sweep(breakdown_counties, 2, colSums(breakdown_counties), "/") * 100
)
colnames(breakdown_counties)[3:4] <- c("Female_Percent", "Male_Percent")
breakdown_counties$Difference <- breakdown_counties$Female_Percent - breakdown_counties$Male_Percent
breakdown_counties$Total <- rowSums(breakdown_counties[, 1:2])
breakdown_counties$Capped_Total <- breakdown_counties$Total
breakdown_counties$Capped_Total[breakdown_counties$Capped_Total > 5e4] <- 1e5
breakdown_counties <- breakdown_counties[counties$GEOID, ]
pal <- colorNumeric(
scico::scico(255, direction = -1, palette = "vik"),
rep(max(abs(breakdown_counties$Difference), na.rm = TRUE), 2) * c(-1, 1)
)
pal_total <- colorNumeric(
scico::scico(255, direction = -1, palette = "lajolla"), breakdown_counties$Capped_Total
)
leaflet(counties[, 1], options = leafletOptions(attributionControl = FALSE)) |>
addProviderTiles("CartoDB.Positron") |>
setView(-95.5810546875, 39.5040407055842, 4) |>
addControl("Percent Female-Assigned Inventors - Percent Male-Assigned Inventors", "topright") |>
addLayersControl(position = "topleft", overlayGroups = c("Total", "Difference")) |>
addLegend(
"bottomright", pal_total, breakdown_counties$Capped_Total,
opacity = 1,
title = "Totals", group = "Total"
) |>
addPolygons(
fillColor = pal_total(breakdown_counties$Capped_Total),
fillOpacity = .8, weight = 1, color = "#000", highlightOptions = highlightOptions(color = "#fff"),
group = "Total", label = paste0("County: ", counties$NAME, "; Total: ", breakdown_counties$Total)
) |>
hideGroup("Total") |>
addLegend(
"bottomright", pal, breakdown_counties$Difference,
opacity = 1,
title = "Percent Difference", group = "Difference"
) |>
addPolygons(
fillColor = pal(breakdown_counties$Difference), fillOpacity = .8, weight = 1, color = "#000",
highlightOptions = highlightOptions(color = "#fff"),
group = "Difference", label = paste0(
"County: ", counties$NAME,
"; Female: ", breakdown_counties$Female,
"; Male: ", breakdown_counties$Male,
"; Percent Difference (Female - Male): ", round(breakdown_counties$Difference, 3)
)
)