Built with R 4.2.1


This example explores the PatentsView bulk tables, with a focus on assigned inventor sex.

Setup

Before starting, we’ll need to load the package, and point to a directory where we’d like things saved:

# install if needed: remotes::install_guthub("uva-bi-sdad/uspto")
library(uspto)
outDir <- "../patentsview/"

Which patent categories have most and fewest female inventors?

The first step toward answering this question is to assign a sex to inventors based on their given name. The inventors table includes USPTO’s assignment, which we’ll start with:

# you may need to increase your download timeout, depending on your connection
options(timeout = 300)

inventors <- as.data.frame(download_patentsview_bulk("inventor", outDir))

Now, we can add a few of our own prediction methods, including one based on the included USPTO flags:

# associate a sex with each unique given name
inventor_sex_file <- paste0(outDir, "inventor_sex.csv.xz")
if (file.exists(inventor_sex_file)) {
  inventor_sex <- vroom::vroom(inventor_sex_file, show_col_types = FALSE)
} else {
  # add a standardized version of given names
  inventors$name_first[is.na(inventors$name_first)] <- ""
  inventors$given <- sub(
    "^(.)", "\\U\\1", gsub("^([a-z-]{,6}[.-])+ | +.*", "", tolower(inventors$name_first)),
    perl = TRUE
  )
  inventors$name_last[is.na(inventors$name_last)] <- ""
  inventors$family <- sub(
    "^(.)", "\\U\\1", gsub("^([a-z-]{,6}[.-])+ | +.*", "", tolower(inventors$name_last)),
    perl = TRUE
  )

  # reverse male flag
  inventors$pred_fem_patentsview <- inventors$male_flag
  inventors$pred_fem_patentsview[is.na(inventors$pred_fem_patentsview)] <- .5
  inventors$pred_fem_patentsview <- 1 - inventors$pred_fem_patentsview

  ## install if needed: remotes::install_github("miserman/lusilab")
  library(lusilab)
  inventor_sex <- cbind(
    inventors[, c("id", "given", "family", "pred_fem_patentsview")], predict_demographics(
      inventors$given, inventors$name_last,
      dir = paste0(dirname(outDir), "/names")
    )[, -(1:3)]
  )

  ## make aggregate predictor
  count_vars <- grep("count_", colnames(inventor_sex), fixed = TRUE, value = TRUE)
  inventor_sex$prob_fem <- rowSums(
    inventor_sex[, count_vars] * inventor_sex[, sub("count", "prob_fem", count_vars, fixed = TRUE)]
  ) / rowSums(inventor_sex[, count_vars])

  vroom::vroom_write(inventor_sex, inventor_sex_file, ",")
}

# get categorical predictions
prob_cols <- grep("^prob_", colnames(inventor_sex), value = TRUE)
inventor_sex_preds <- as.data.frame(inventor_sex[, prob_cols])
dimnames(inventor_sex_preds) <- list(
  inventor_sex$id,
  sub("prob", "pred", prob_cols, fixed = TRUE)
)
inventor_sex_preds[inventor_sex_preds > .5] <- 1
inventor_sex_preds[inventor_sex_preds < .5] <- 0

First, we can compare these different methods with the USPTO assignments:

kable(
  data.frame(
    "Proportion Sexed" = colMeans(inventor_sex_preds != .5),
    "Accuracy of Sexed to PatentsView" = colMeans(vapply(
      inventor_sex_preds, function(x) x != .5 & x == inventor_sex$pred_fem_patentsview,
      logical(nrow(inventor_sex))
    )),
    check.names = FALSE
  ),
  caption = "Sex Predition Method Comparisons"
)
Sex Predition Method Comparisons
Proportion Sexed Accuracy of Sexed to PatentsView
pred_fem_wgnd 0.8899481 0.7273651
pred_fem_fb 0.8968863 0.7242883
pred_fem_fb_scraped 0.7293734 0.6086984
pred_fem_skydeck 0.7906620 0.6558273
pred_fem_usssa 0.7724898 0.6429201
pred_fem_uspto 0.9939588 0.7795239
pred_fem 0.9955821 0.7712082

Now we can move forward with the method with the aggregate prediction:

# add assigned sex to inventors data
inventors$pred_fem <- inventor_sex_preds$pred_fem

## focus on just sexed inventors
inventors <- inventors[inventors$pred_fem != .5, ]
rownames(inventors) <- inventors$id

# add patent IDs
patent_inventor <- download_patentsview_bulk("patent_inventor", outDir)
patent_inventor <- patent_inventor[patent_inventor$inventor_id %in% inventors$id, ]
inventors <- cbind(inventors[patent_inventor$inventor_id, -1], patent_inventor)

# count inventors predicted to be female in each patent
female_inventors <- tapply(inventors$pred_fem == 1, inventors$patent_id, sum)

World Intellectual Property Organization

Now we can add patent category information, to get breakdowns of classes by inventor sex. There are multiple classification schemes, but we can start with the World Intellectual Property Organization (WIPO) technology fields for a high-level overview:

library(Matrix)

# this makes a patent x WIPO category matrix
categories_wipo <- patentsview_class_matrix("wipo", paste0(outDir, "wipo_matrix.rds"), dir = outDir)
dim(categories_wipo)
#> [1] 7221215      35

# which we can join to the inventors matrix
inventors_wipo <- female_inventors[names(female_inventors) %in% rownames(categories_wipo)]
inventors_wipo <- cbind(as.numeric(inventors_wipo), categories_wipo[names(inventors_wipo), ])

# and get category breakdowns by inventor sex
wipo_summary <- data.frame(
  Any_Female = colSums(inventors_wipo[inventors_wipo[, 1] != 0, -1] != 0),
  No_Female = colSums(inventors_wipo[inventors_wipo[, 1] == 0, -1] != 0)
)
wipo_summary <- cbind(wipo_summary, sweep(wipo_summary, 2, colSums(wipo_summary), "/") * 100)
colnames(wipo_summary)[3:4] <- c("Any_Female_Percent", "No_Female_Percent")
wipo_summary$Difference <- wipo_summary$Any_Female_Percent - wipo_summary$No_Female_Percent
wipo_summary <- wipo_summary[order(-wipo_summary$Difference), ]

# add category titles
wipo_field <- as.data.frame(download_patentsview_bulk("wipo_field", outDir))
rownames(wipo_field) <- wipo_field$id
wipo_summary$Title <- wipo_field[rownames(wipo_summary), "field_title"]

kable(
  wipo_summary,
  digits = 3,
  col.names = gsub("_", " ", colnames(wipo_summary), fixed = TRUE),
  caption = "World Intellectual Property Organization Categories"
)
World Intellectual Property Organization Categories
Any Female No Female Any Female Percent No Female Percent Difference Title
16 140288 168928 6.430 1.995 4.435 Pharmaceuticals
15 102368 112843 4.692 1.333 3.359 Biotechnology
14 113777 204459 5.215 2.415 2.800 Organic fine chemistry
4 142830 439789 6.547 5.194 1.353 Digital communication
6 223799 765056 10.258 9.035 1.223 Computer technology
19 70943 176861 3.252 2.089 1.163 Basic materials chemistry
11 31762 50883 1.456 0.601 0.855 Analysis of biological materials
17 53230 143628 2.440 1.696 0.744 Macromolecular chemistry, polymers
8 110959 370133 5.086 4.371 0.715 Semiconductors
7 46905 123348 2.150 1.457 0.693 IT methods for management
18 19609 47593 0.899 0.562 0.337 Food chemistry
22 17513 44769 0.803 0.529 0.274 Micro-structural and nano-technology
13 103601 383823 4.749 4.533 0.216 Medical technology
20 36649 136649 1.680 1.614 0.066 Materials, metallurgy
21 37255 140355 1.708 1.658 0.050 Surface technology, coating
34 37871 153868 1.736 1.817 -0.081 Other consumer goods
3 75840 312227 3.476 3.687 -0.211 Telecommunications
23 49807 217028 2.283 2.563 -0.280 Chemical engineering
28 34725 158782 1.592 1.875 -0.284 Textile and paper machines
24 20131 103924 0.923 1.227 -0.305 Environmental technology
9 86916 369601 3.984 4.365 -0.381 Optics
12 41088 196484 1.883 2.320 -0.437 Control
30 14141 106691 0.648 1.260 -0.612 Thermal processes and apparatus
5 26046 155256 1.194 1.834 -0.640 Basic communication processes
2 99922 447367 4.580 5.283 -0.703 Audio-visual technology
33 34932 195347 1.601 2.307 -0.706 Furniture, games
29 47057 279402 2.157 3.300 -1.143 Other special machines
10 83073 425977 3.808 5.031 -1.223 Measurement
25 25881 207495 1.186 2.450 -1.264 Handling
26 22856 199093 1.048 2.351 -1.304 Machine tools
27 29373 232992 1.346 2.752 -1.405 Engines, pumps, turbines
1 99479 519907 4.560 6.140 -1.580 Electrical machinery, apparatus, energy
35 25264 233882 1.158 2.762 -1.604 Civil engineering
31 27215 269472 1.247 3.182 -1.935 Mechanical elements
32 48656 373769 2.230 4.414 -2.184 Transport

United States Patent Classification

For a more refined classification, we might look at the United States Patent Classification (USPC), which is most closely related to examination process:

# get the patent x USPC category matrix
categories_uspc <- patentsview_class_matrix(
  "uspc_current", paste0(outDir, "uspc_current_matrix.rds"),
  dir = outDir
)
dim(categories_uspc)
#> [1] 6597925     475

# join to the inventors matrix
inventors_uspc <- female_inventors[names(female_inventors) %in% rownames(categories_uspc)]
inventors_uspc <- cbind(as.numeric(inventors_uspc), categories_uspc[names(inventors_uspc), ])

# and get category breakdowns by inventor sex
uspc_summary <- data.frame(
  Any_Female = colSums(inventors_uspc[inventors_uspc[, 1] != 0, -1] != 0),
  No_Female = colSums(inventors_uspc[inventors_uspc[, 1] == 0, -1] != 0)
)
uspc_summary <- cbind(uspc_summary, sweep(uspc_summary, 2, colSums(uspc_summary), "/") * 100)
colnames(uspc_summary)[3:4] <- c("Any_Female_Percent", "No_Female_Percent")
uspc_summary$Difference <- uspc_summary$Any_Female_Percent - uspc_summary$No_Female_Percent
uspc_summary <- uspc_summary[order(-uspc_summary$Difference), ]

# add category titles
uspc_field <- as.data.frame(download_patentsview_bulk("mainclass_current", outDir))
rownames(uspc_field) <- uspc_field$id
uspc_summary$Title <- uspc_field[rownames(uspc_summary), "title"]

kable(
  uspc_summary[c(1:20, 1:20 + nrow(uspc_summary) - 20), ],
  digits = 3,
  col.names = gsub("_", " ", colnames(uspc_summary), fixed = TRUE),
  caption = paste(
    "U.S. Patent Classification categories with the highest and lowest",
    "proportion of any female inventor"
  )
)
U.S. Patent Classification categories with the highest and lowest proportion of any female inventor
Any Female No Female Any Female Percent No Female Percent Difference Title
No longer published 224151 677667 12.453 7.938 4.515 NO LONGER PUBLISHED
514 58210 92627 3.234 1.085 2.149 DRUG, BIO-AFFECTING AND BODY TREATING COMPOSITIONS
435 53861 77304 2.992 0.906 2.087 CHEMISTRY: MOLECULAR BIOLOGY AND MICROBIOLOGY
424 40283 61337 2.238 0.719 1.520 DRUG, BIO-AFFECTING AND BODY TREATING COMPOSITIONS
536 25823 32451 1.435 0.380 1.055 ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES
530 24279 29769 1.349 0.349 1.000 CHEMISTRY: NATURAL RESINS OR DERIVATIVES; PEPTIDES OR PROTEINS; LIGNINS OR REACTION PRODUCTS THEREOF
546 16501 29030 0.917 0.340 0.577 ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES
544 15184 26435 0.844 0.310 0.534 ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES
548 14286 28499 0.794 0.334 0.460 ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES
438 30399 111712 1.689 1.309 0.380 SEMICONDUCTOR DEVICE MANUFACTURING: PROCESS
D14 17197 51719 0.955 0.606 0.350 RECORDING, COMMUNICATION, OR INFORMATION RETRIEVAL EQUIPMENT
436 10907 25378 0.606 0.297 0.309 CHEMISTRY: ANALYTICAL AND IMMUNOLOGICAL TESTING
707 16543 52594 0.919 0.616 0.303 DATA PROCESSING: DATABASE AND FILE MANAGEMENT OR DATA STRUCTURES
370 27087 103467 1.505 1.212 0.293 MULTIPLEX COMMUNICATIONS
257 47154 199440 2.620 2.336 0.284 ACTIVE SOLID-STATE DEVICES (E.G., TRANSISTORS, SOLID-STATE DIODES)
705 13156 40236 0.731 0.471 0.260 DATA PROCESSING: FINANCIAL, BUSINESS PRACTICE, MANAGEMENT, OR COST/PRICE DETERMINATION
428 26798 106553 1.489 1.248 0.241 STOCK MATERIAL OR MISCELLANEOUS ARTICLES
430 14084 46833 0.782 0.549 0.234 RADIATION IMAGERY CHEMISTRY: PROCESS, COMPOSITION, OR PRODUCT THEREOF
709 19121 70876 1.062 0.830 0.232 ELECTRICAL COMPUTERS AND DIGITAL PROCESSING SYSTEMS: MULTICOMPUTER DATA TRANSFERRING
510 6612 12180 0.367 0.143 0.225 CLEANING COMPOSITIONS FOR SOLID SURFACES, AUXILIARY COMPOSITIONS THEREFOR, OR PROCESSES OF PREPARING THE COMPOSITIONS
310 4287 37460 0.238 0.439 -0.201 ELECTRICAL GENERATOR OR MOTOR STRUCTURE
G9B 10744 68623 0.597 0.804 -0.207 INFORMATION STORAGE BASED ON RELATIVE MOVEMENT BETWEEN RECORD CARRIER AND TRANSDUCER
180 3177 33643 0.177 0.394 -0.218 MOTOR VEHICLES
414 1944 28781 0.108 0.337 -0.229 MATERIAL OR ARTICLE HANDLING
60 3918 39833 0.218 0.467 -0.249 POWER PLANTS
348 14224 88743 0.790 1.040 -0.249 TELEVISION
324 7288 56738 0.405 0.665 -0.260 ELECTRICITY: MEASURING AND TESTING
327 5469 48145 0.304 0.564 -0.260 MISCELLANEOUS ACTIVE ELECTRICAL NONLINEAR DEVICES, CIRCUITS, AND SYSTEMS
248 4322 42844 0.240 0.502 -0.262 SUPPORTS
250 9920 70470 0.551 0.825 -0.274 RADIANT ENERGY
74 2464 35477 0.137 0.416 -0.279 MACHINE ELEMENT OR MECHANISM
361 9646 70289 0.536 0.823 -0.287 ELECTRICITY: ELECTRICAL SYSTEMS AND DEVICES
280 4845 48938 0.269 0.573 -0.304 LAND VEHICLES
340 9825 74449 0.546 0.872 -0.326 COMMUNICATIONS: ELECTRICAL
52 3101 42632 0.172 0.499 -0.327 STATIC STRUCTURES (E.G., BUILDINGS)
439 6112 58424 0.340 0.684 -0.345 ELECTRICAL CONNECTORS
137 3075 44134 0.171 0.517 -0.346 FLUID HANDLING
29 8097 70267 0.450 0.823 -0.373 METAL WORKING
123 5260 56999 0.292 0.668 -0.375 INTERNAL-COMBUSTION ENGINES
73 9268 78942 0.515 0.925 -0.410 MEASURING AND TESTING

How have category breakdowns changed over time?

To get at this question, we’ll need to associate a date with each patent number:

# start with all patent information
patents <- download_patentsview_bulk("patent", outDir)

# associate a date with each patent number
all_dates <- structure(patents$date, names = patents$number)

# get years, and focus on only those found in the USPC set
uspc_year <- substr(all_dates[names(all_dates) %in% rownames(inventors_uspc)], 1, 4)

Now we could look at trends in differences in the percent of female versus male assigned inventors:

# use this set of years to break down the overall summaries
uspc_yearly_summaries <- do.call(rbind, lapply(sort(unique(uspc_year)), function(year) {
  d <- inventors_uspc[names(which(uspc_year == year)), ]
  r <- data.frame(
    Any_Female = colSums(d[d[, 1] != 0, -1] != 0, na.rm = TRUE),
    No_Female = colSums(d[d[, 1] == 0, -1] != 0, na.rm = TRUE)
  )
  r <- sweep(r, 2, colSums(r), "/") * 100
  r$Difference <- r$Any_Female - r$No_Female
  c(Year = as.numeric(year), t(r[3])[1, ])
}))
uspc_yearly_summaries[is.na(uspc_yearly_summaries)] <- 0
uspc_yearly_summaries <- uspc_yearly_summaries[, colSums(uspc_yearly_summaries) != 0]

# plot categories with the most positive and negative trends
library(splot)
trends <- sort(cor(uspc_yearly_summaries[, -1], uspc_yearly_summaries[, 1])[, 1], TRUE)
splot(
  uspc_yearly_summaries[, names(trends[c(1:5, 1:4 + length(trends) - 4)])] ~ uspc_yearly_summaries[, 1],
  lines = "spline", levels = list(mv = uspc_field[names(trends[c(1:5, 1:4 + length(trends) - 4)]), "title"]),
  title = "Inventor Distribution by Assigned Sex Per USPC Class Over Time",
  laby = "Percent Female-Assigned Inventors - Male-Assigned Inventors",
  labx = "Year", leg.title = "U.S. Patent Classification", myl = c(-.6, .7)
)

This highlights categories that are trending toward more similar allocations of inventors by sex, and suggests there are no categories showing a strong trend toward increasing difference in inventor allocation by sex.

We might also look at categories that show some trend in the percent of female-assigned inventors alone – where a greater or lesser percentage of the female-assigned inventor workforce has been allocated:

uspc_yearly_female_summaries <- do.call(rbind, lapply(sort(unique(uspc_year)), function(year) {
  d <- inventors_uspc[names(which(uspc_year == year)), ]
  r <- colSums(d[d[, 1] != 0, -1] != 0, na.rm = TRUE)
  r <- r / sum(r) * 100
  c(Year = as.numeric(year), r)
}))
uspc_yearly_female_summaries <- uspc_yearly_female_summaries[, colSums(uspc_yearly_female_summaries) != 0]

trends_female <- sort(cor(uspc_yearly_female_summaries[, -1], uspc_yearly_female_summaries[, 1])[, 1], TRUE)
splot(
  uspc_yearly_female_summaries[, names(trends_female[c(1:7, 1:2 + length(trends_female) - 6)])] ~
    uspc_yearly_female_summaries[, 1],
  lines = "spline", levels = list(
    mv = uspc_field[names(trends_female[c(1:7, 1:2 + length(trends_female) - 6)]), "title"]
  ),
  labels.trim = FALSE,
  title = "Share of Female-Assigned Inventors Per USPC Class Over Time", lpos = "topleft",
  laby = "Percent Female-Assigned Inventors", labx = "Year", leg.title = "U.S. Patent Classification"
)

Cooperative Patent Classifications

Cooperative Patent Classifications (CPC) might give an even more refined look, and could be compared across patent offices:

# get the patent x CPC category matrix
categories_cpc <- patentsview_class_matrix(
  "cpc_current", paste0(outDir, "cpc_current_matrix.rds"),
  dir = outDir
)
dim(categories_cpc)
#> [1] 7228678     670

# join to the inventors matrix
inventors_cpc <- female_inventors[names(female_inventors) %in% rownames(categories_cpc)]
inventors_cpc <- cbind(as.numeric(inventors_cpc), categories_cpc[names(inventors_cpc), ])

# and get category breakdowns by inventor sex
cpc_summary <- data.frame(
  Any_Female = colSums(inventors_cpc[inventors_cpc[, 1] != 0, -1] != 0),
  No_Female = colSums(inventors_cpc[inventors_cpc[, 1] == 0, -1] != 0)
)
cpc_summary <- cbind(cpc_summary, sweep(cpc_summary, 2, colSums(cpc_summary), "/") * 100)
colnames(cpc_summary)[3:4] <- c("Any_Female_Percent", "No_Female_Percent")
cpc_summary$Difference <- cpc_summary$Any_Female_Percent - cpc_summary$No_Female_Percent
cpc_summary <- cpc_summary[order(-cpc_summary$Difference), ]

# add category titles
cpc_group <- as.data.frame(download_patentsview_bulk("cpc_group", outDir))
rownames(cpc_group) <- cpc_group$id
cpc_summary <- cpc_summary[rownames(cpc_summary) %in% cpc_group$id, ]
cpc_summary$Title <- cpc_group[rownames(cpc_summary), "title"]

cpc_selection <- names(which(rowSums(cpc_summary[, 1:2]) > 5))
cpc_selection <- cpc_selection[c(1:20, 1:20 + length(cpc_selection) - 20)]
kable(
  cpc_summary[cpc_selection, ],
  digits = 3,
  col.names = gsub("_", " ", colnames(cpc_summary), fixed = TRUE),
  caption = paste(
    "Cooperative Patent Classifications: 20 categories with the highest and lowest",
    "proportion of any female inventor with at least 5 associated patents"
  )
)
Cooperative Patent Classifications: 20 categories with the highest and lowest proportion of any female inventor with at least 5 associated patents
Any Female No Female Any Female Percent No Female Percent Difference Title
A61K 118059 143894 3.895 1.244 2.651 PREPARATIONS FOR MEDICAL, DENTAL, OR TOILET PURPOSES
A61P 104944 111091 3.462 0.961 2.502 SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS
C07K 53879 52116 1.778 0.451 1.327 PEPTIDES
C12N 52539 52564 1.733 0.455 1.279 MICROORGANISMS OR ENZYMES; COMPOSITIONS THEREOF; PROPAGATING, PRESERVING, OR MAINTAINING MICROORGANISMS; MUTATION OR GENETIC ENGINEERING; CULTURE MEDIA
C07D 62193 97354 2.052 0.842 1.210 HETEROCYCLIC COMPOUNDS
G06F 159496 525401 5.262 4.544 0.719 ELECTRIC DIGITAL DATA PROCESSING
H04W 66143 169788 2.182 1.468 0.714 WIRELESS COMMUNICATION NETWORKS
H04L 109031 341426 3.597 2.953 0.645 TRANSMISSION OF DIGITAL INFORMATION, e.g. TELEGRAPHIC COMMUNICATION
G01N 57456 148061 1.896 1.280 0.615 INVESTIGATING OR ANALYSING MATERIALS BY DETERMINING THEIR CHEMICAL OR PHYSICAL PROPERTIES
C12Q 22569 27393 0.745 0.237 0.508 MEASURING OR TESTING PROCESSES INVOLVING ENZYMES, NUCLEIC ACIDS OR MICROORGANISMS ; COMPOSITIONS OR TEST PAPERS THEREFOR; PROCESSES OF PREPARING SUCH COMPOSITIONS; CONDITION-RESPONSIVE CONTROL IN MICROBIOLOGICAL OR ENZYMOLOGICAL PROCESSES
G06Q 48204 127337 1.590 1.101 0.489 DATA PROCESSING SYSTEMS OR METHODS, SPECIALLY ADAPTED FOR ADMINISTRATIVE, COMMERCIAL, FINANCIAL, MANAGERIAL, SUPERVISORY OR FORECASTING PURPOSES; SYSTEMS OR METHODS SPECIALLY ADAPTED FOR ADMINISTRATIVE, COMMERCIAL, FINANCIAL, MANAGERIAL, SUPERVISORY OR FORECASTING PURPOSES, NOT OTHERWISE PROVIDED FOR
H01L 113814 381962 3.755 3.303 0.452 SEMICONDUCTOR DEVICES; ELECTRIC SOLID STATE DEVICES NOT OTHERWISE PROVIDED FOR
A61Q 16866 18698 0.556 0.162 0.395 SPECIFIC USE OF COSMETICS OR SIMILAR TOILET PREPARATIONS
C07C 33187 82127 1.095 0.710 0.385 ACYCLIC OR CARBOCYCLIC COMPOUNDS
C12P 12297 13758 0.406 0.119 0.287 FERMENTATION OR ENZYME-USING PROCESSES TO SYNTHESISE A DESIRED CHEMICAL COMPOUND OR COMPOSITION OR TO SEPARATE OPTICAL ISOMERS FROM A RACEMIC MIXTURE
C12Y 9799 8108 0.323 0.070 0.253 ENZYMES
C08L 20786 51386 0.686 0.444 0.241 COMPOSITIONS OF MACROMOLECULAR COMPOUNDS
C09D 16359 35799 0.540 0.310 0.230 COATING COMPOSITIONS, e.g. PAINTS, VARNISHES OR LACQUERS; FILLING PASTES; CHEMICAL PAINT OR INK REMOVERS; INKS; CORRECTING FLUIDS; WOODSTAINS; PASTES OR SOLIDS FOR COLOURING OR PRINTING; USE OF MATERIALS THEREFOR
C09K 15928 35505 0.526 0.307 0.218 MATERIALS FOR MISCELLANEOUS APPLICATIONS, NOT PROVIDED FOR ELSEWHERE
C08F 17480 41598 0.577 0.360 0.217 MACROMOLECULAR COMPOUNDS OBTAINED BY REACTIONS ONLY INVOLVING CARBON-TO-CARBON UNSATURATED BONDS
F02B 3796 35699 0.125 0.309 -0.183 INTERNAL-COMBUSTION PISTON ENGINES; COMBUSTION ENGINES IN GENERAL
H01H 4635 38928 0.153 0.337 -0.184 ELECTRIC SWITCHES; RELAYS; SELECTORS; EMERGENCY PROTECTIVE DEVICES
F16D 2968 33174 0.098 0.287 -0.189 COUPLINGS FOR TRANSMITTING ROTATION; CLUTCHES; BRAKES
B65G 2849 33446 0.094 0.289 -0.195 TRANSPORT OR STORAGE DEVICES, e.g. CONVEYORS FOR LOADING OR TIPPING, SHOP CONVEYOR SYSTEMS OR PNEUMATIC TUBE CONVEYORS
B62D 4831 41973 0.159 0.363 -0.204 MOTOR VEHICLES; TRAILERS
H03K 8291 55429 0.274 0.479 -0.206 PULSE TECHNIQUE
F16L 3595 37625 0.119 0.325 -0.207 PIPES; JOINTS OR FITTINGS FOR PIPES; SUPPORTS FOR PIPES, CABLES OR PROTECTIVE TUBING; MEANS FOR THERMAL INSULATION IN GENERAL
G01R 14938 81869 0.493 0.708 -0.215 MEASURING ELECTRIC VARIABLES; MEASURING MAGNETIC VARIABLES
B65H 4546 42323 0.150 0.366 -0.216 HANDLING THIN OR FILAMENTARY MATERIAL, e.g. SHEETS, WEBS, CABLES
A63B 6350 50705 0.210 0.438 -0.229 APPARATUS FOR PHYSICAL TRAINING, GYMNASTICS, SWIMMING, CLIMBING, OR FENCING; BALL GAMES; TRAINING EQUIPMENT
B65D 11747 71448 0.388 0.618 -0.230 CONTAINERS FOR STORAGE OR TRANSPORT OF ARTICLES OR MATERIALS, e.g. BAGS, BARRELS, BOTTLES, BOXES, CANS, CARTONS, CRATES, DRUMS, JARS, TANKS, HOPPERS, FORWARDING CONTAINERS; ACCESSORIES, CLOSURES, OR FITTINGS THEREFOR; PACKAGING ELEMENTS; PACKAGES
G11B 20710 106241 0.683 0.919 -0.235 INFORMATION STORAGE BASED ON RELATIVE MOVEMENT BETWEEN RECORD CARRIER AND TRANSDUCER
B60R 9077 62621 0.299 0.542 -0.242 VEHICLES, VEHICLE FITTINGS, OR VEHICLE PARTS, NOT OTHERWISE PROVIDED FOR
E21B 8401 62739 0.277 0.543 -0.265 EARTH DRILLING, e.g. DEEP DRILLING ; OBTAINING OIL, GAS, WATER, SOLUBLE OR MELTABLE MATERIALS OR A SLURRY OF MINERALS FROM WELLS
F16H 5390 51508 0.178 0.445 -0.268 GEARING
Y02T 17156 97400 0.566 0.842 -0.276 CLIMATE CHANGE MITIGATION TECHNOLOGIES RELATED TO TRANSPORTATION
B29C 15116 90347 0.499 0.781 -0.283 SHAPING OR JOINING OF PLASTICS; SHAPING OF MATERIAL IN A PLASTIC STATE, NOT OTHERWISE PROVIDED FOR; AFTER-TREATMENT OF THE SHAPED PRODUCTS, e.g. REPAIRING
H01R 9747 75973 0.322 0.657 -0.335 ELECTRICALLY-CONDUCTIVE CONNECTIONS; STRUCTURAL ASSOCIATIONS OF A PLURALITY OF MUTUALLY-INSULATED ELECTRICAL CONNECTING ELEMENTS; COUPLING DEVICES; CURRENT COLLECTORS
Y10S 75281 353260 2.484 3.055 -0.571 TECHNICAL SUBJECTS COVERED BY FORMER USPC CROSS-REFERENCE ART COLLECTIONS [XRACs] AND DIGESTS
Y10T 83598 503119 2.758 4.351 -1.593 TECHNICAL SUBJECTS COVERED BY FORMER US CLASSIFICATION

Where are there most and fewest female inventors?

The locations table associates inventor IDs with location IDs:

locations <- as.data.frame(download_patentsview_bulk("location", outDir))
locations$state_fips[!is.na(locations$state_fips)] <- formatC(
  locations$state_fips[!is.na(locations$state_fips)],
  width = 2, flag = 0, format = "d"
)
locations$county_fips[!is.na(locations$county_fips)] <- formatC(
  locations$county_fips[!is.na(locations$county_fips)],
  width = 5, flag = 0, format = "d"
)

# we can align this with our inventors data for inventor sex
rownames(locations) <- locations$id
located_inventors <- inventors[!is.na(inventors$location_id), ]
locations <- locations[located_inventors$location_id, ]

Inventor location is recorded as part of each patent, which means inventors may have multiple locations over time. For an initial look, we can focus only on each inventors most recent location:

# add date information to inventor data
inventors$date <- structure(patents$date, names = patents$number)[inventors$patent_id]

# sort inventors by date, then add location information
inventors_last_seen <- inventors[order(inventors$date, decreasing = TRUE), ]
inventors_last_seen <- inventors_last_seen[!duplicated(inventors_last_seen$inventor_id), ]
inventors_last_seen <- inventors_last_seen[inventors_last_seen$location_id %in% locations$id, ]
inventors_last_seen <- cbind(inventors_last_seen, locations[inventors_last_seen$location_id, ])

Now we can look at high-level summaries of locations, like we did with patent classes:

# top countries
breakdown_countries <- as.data.frame(t(vapply(
  split(inventors_last_seen$pred_fem, inventors_last_seen$country),
  function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
  c(0, 0)
)))
breakdown_countries <- cbind(
  breakdown_countries,
  sweep(breakdown_countries, 2, colSums(breakdown_countries), "/") * 100
)
colnames(breakdown_countries)[3:4] <- c("Female_Percent", "Male_Percent")
breakdown_countries$Difference <- breakdown_countries$Female_Percent - breakdown_countries$Male_Percent
breakdown_countries <- breakdown_countries[order(-breakdown_countries$Difference), ]
kable(
  breakdown_countries[rowSums(breakdown_countries[, 1:2]) > 1e4, ],
  col.names = gsub("_", " ", colnames(breakdown_countries), fixed = TRUE),
  caption = "Countries with at least 10,000 associated inventors"
)
Countries with at least 10,000 associated inventors
Female Male Female Percent Male Percent Difference
CN 38044 104951 6.7701299 3.0053345 3.7647953
FR 24075 106275 4.2842728 3.0432481 1.2410248
KR 23911 122321 4.2550882 3.5027348 0.7523533
IT 9958 44167 1.7720785 1.2647484 0.5073301
US 269384 1658264 47.9382994 47.4853794 0.4529200
ES 4650 17826 0.8274920 0.5104581 0.3170338
FI 4357 16529 0.7753511 0.4733178 0.3020333
IL 7116 33690 1.2663296 0.9647333 0.3015964
IN 9325 49048 1.6594328 1.4045188 0.2549140
SU 3268 12008 0.5815578 0.3438562 0.2377015
SG 2692 10755 0.4790556 0.3079758 0.1710797
BE 3709 17464 0.6600361 0.5000921 0.1599440
DK 2682 14017 0.4772760 0.4013852 0.0758908
RU 2211 11556 0.3934591 0.3309130 0.0625461
SE 5775 34979 1.0276916 1.0016445 0.0260471
NO 1348 9725 0.2398837 0.2784812 -0.0385975
AU 3865 26212 0.6877971 0.7505963 -0.0627992
NL 6026 41085 1.0723584 1.1764935 -0.1041351
CH 4763 37774 0.8476009 1.0816810 -0.2340801
AT 1551 17833 0.2760086 0.5106586 -0.2346500
CA 13261 92134 2.3598647 2.6383121 -0.2784474
GB 13156 98987 2.3411794 2.8345518 -0.4933724
TW 9993 93957 1.7783069 2.6905148 -0.9122079
DE 26375 265556 4.6935699 7.6043546 -2.9107846
JP 52435 464053 9.3310840 13.2884346 -3.9573506
# top states
breakdown_states <- as.data.frame(t(vapply(
  split(inventors_last_seen$pred_fem, inventors_last_seen$state_fips),
  function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
  c(0, 0)
)))
breakdown_states <- cbind(
  breakdown_states,
  sweep(breakdown_states, 2, colSums(breakdown_states), "/") * 100
)
colnames(breakdown_states)[3:4] <- c("Female_Percent", "Male_Percent")
breakdown_states$Difference <- breakdown_states$Female_Percent - breakdown_states$Male_Percent
breakdown_states <- breakdown_states[order(-breakdown_states$Difference), ]
## install if needed: remotes::install_github("uva-bi-sdad/catchment")
library(catchment)
states <- download_census_shapes(paste0(dirname(outDir), "/maps"))
state_names <- structure(states$NAME, names = states$STATEFP)
breakdown_states <- breakdown_states[rownames(breakdown_states) %in% names(state_names), ]
rownames(breakdown_states) <- state_names[rownames(breakdown_states)]
kable(
  breakdown_states[rowSums(breakdown_states[, 1:2]) > 1e4, ],
  col.names = gsub("_", " ", colnames(breakdown_states), fixed = TRUE),
  caption = "States with at least 10,000 associated inventors"
)
States with at least 10,000 associated inventors
Female Male Female Percent Male Percent Difference
California 59595 337658 22.1259802 20.3653186 1.7606616
New York 18892 97296 7.0140787 5.8682573 1.1458213
New Jersey 11716 58560 4.3498277 3.5319556 0.8178722
Massachusetts 13899 73660 5.1603154 4.4426887 0.7176268
Maryland 5991 29161 2.2242931 1.7588005 0.4654926
Georgia 5596 29640 2.0776405 1.7876906 0.2899499
Florida 9745 56698 3.6180498 3.4196519 0.1983978
North Carolina 6806 39094 2.5268801 2.3578940 0.1689861
Virginia 4951 29177 1.8381698 1.7597655 0.0784043
Missouri 3441 20193 1.2775484 1.2179095 0.0596389
Louisiana 1326 9313 0.4923072 0.5616992 -0.0693919
Washington 10677 66907 3.9640757 4.0353919 -0.0713163
Alabama 1352 9553 0.5019603 0.5761744 -0.0742141
Tennessee 2616 17403 0.9712487 1.0496350 -0.0783863
South Carolina 2002 13700 0.7432874 0.8262943 -0.0830069
Kentucky 1422 10146 0.5279494 0.6119403 -0.0839909
Kansas 1473 10490 0.5468843 0.6326881 -0.0858038
Minnesota 7133 45613 2.6482862 2.7510773 -0.1027911
Oregon 3915 26031 1.4535315 1.5700194 -0.1164878
Colorado 5377 35085 1.9963318 2.1160974 -0.1197655
Arizona 4355 28950 1.6168914 1.7460743 -0.1291829
Oklahoma 1343 11238 0.4986189 0.6778025 -0.1791837
Illinois 11426 73643 4.2421587 4.4416633 -0.1995046
New Hampshire 1292 11438 0.4796840 0.6898652 -0.2101813
Connecticut 4479 31161 1.6629292 1.8794274 -0.2164982
Iowa 1564 13703 0.5806701 0.8264752 -0.2458051
Utah 2113 17121 0.7844986 1.0326266 -0.2481279
Pennsylvania 9865 65445 3.6626025 3.9472137 -0.2846112
Indiana 4086 29885 1.5170191 1.8024674 -0.2854483
Wisconsin 4793 35458 1.7795087 2.1385943 -0.3590855
Ohio 9223 64548 3.4242456 3.8931125 -0.4688669
Texas 16033 109365 5.9526108 6.5961803 -0.6435696
Michigan 9543 77209 3.5430528 4.6567411 -1.1136883
# map of counties
library(leaflet)
library(sf)
counties <- st_transform(download_census_shapes(
  paste0(dirname(outDir), "/maps"),
  entity = "county"
), "WGS84")
counties$NAME <- paste0(counties$NAME, ", ", state_names[counties$STATEFP])

breakdown_counties <- as.data.frame(t(vapply(
  split(inventors_last_seen$pred_fem, inventors_last_seen$county_fips),
  function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
  c(0, 0)
)))
breakdown_counties <- cbind(
  breakdown_counties,
  sweep(breakdown_counties, 2, colSums(breakdown_counties), "/") * 100
)
colnames(breakdown_counties)[3:4] <- c("Female_Percent", "Male_Percent")
breakdown_counties$Difference <- breakdown_counties$Female_Percent - breakdown_counties$Male_Percent
breakdown_counties$Total <- rowSums(breakdown_counties[, 1:2])
breakdown_counties$Capped_Total <- breakdown_counties$Total
breakdown_counties$Capped_Total[breakdown_counties$Capped_Total > 5e4] <- 1e5
breakdown_counties <- breakdown_counties[counties$GEOID, ]

pal <- colorNumeric(
  scico::scico(255, direction = -1, palette = "vik"),
  rep(max(abs(breakdown_counties$Difference), na.rm = TRUE), 2) * c(-1, 1)
)
pal_total <- colorNumeric(
  scico::scico(255, direction = -1, palette = "lajolla"), breakdown_counties$Capped_Total
)
leaflet(counties[, 1], options = leafletOptions(attributionControl = FALSE)) |>
  addProviderTiles("CartoDB.Positron") |>
  setView(-95.5810546875, 39.5040407055842, 4) |>
  addControl("Percent Female-Assigned Inventors - Percent Male-Assigned Inventors", "topright") |>
  addLayersControl(position = "topleft", overlayGroups = c("Total", "Difference")) |>
  addLegend(
    "bottomright", pal_total, breakdown_counties$Capped_Total,
    opacity = 1,
    title = "Totals", group = "Total"
  ) |>
  addPolygons(
    fillColor = pal_total(breakdown_counties$Capped_Total),
    fillOpacity = .8, weight = 1, color = "#000", highlightOptions = highlightOptions(color = "#fff"),
    group = "Total", label = paste0("County: ", counties$NAME, "; Total: ", breakdown_counties$Total)
  ) |>
  hideGroup("Total") |>
  addLegend(
    "bottomright", pal, breakdown_counties$Difference,
    opacity = 1,
    title = "Percent Difference", group = "Difference"
  ) |>
  addPolygons(
    fillColor = pal(breakdown_counties$Difference), fillOpacity = .8, weight = 1, color = "#000",
    highlightOptions = highlightOptions(color = "#fff"),
    group = "Difference", label = paste0(
      "County: ", counties$NAME,
      "; Female: ", breakdown_counties$Female,
      "; Male: ", breakdown_counties$Male,
      "; Percent Difference (Female - Male): ", round(breakdown_counties$Difference, 3)
    )
  )

How have inventors migrated?

Instead of collapsing inventors to a single location, we could also look for instances of inventors moving between states, and see which moves are most common:

# start with all of located inventors
inventor_states <- cbind(
  located_inventors[, c("inventor_id", "pred_fem")],
  date = all_dates[located_inventors$patent_id],
  state = state_names[locations$state_fips]
)
inventor_states <- inventor_states[!is.na(inventor_states$state), ]
inventor_states <- inventor_states[!is.na(inventor_states$date), ]

# select only those that are associated with multiple patents
inventor_states <- inventor_states[
  inventor_states$inventor_id %in% inventor_states$inventor_id[duplicated(inventor_states$inventor_id)],
]
inventor_states <- inventor_states[order(inventor_states$date, decreasing = TRUE), ]

## see how many unique inventors this leaves
length(unique(inventor_states$inventor_id))
#> [1] 1057115

# then look at each inventor to see if they have more than one state.
# if they do, record their transition; previous state -> new state
inventor_history <- lapply(split(inventor_states, inventor_states$inventor_id), function(d) {
  if (!all(d$state == d$state[[1]])) {
    res <- NULL
    states <- d$state
    for (i in seq_len(nrow(d) - 1)) {
      if (states[[i]] != states[[i + 1]]) res <- c(paste(states[[i + 1]], "->", states[[i]]), res)
    }
    res
  }
})
inventor_history <- Filter(length, inventor_history)

## see how many inventors have associated moves
length(inventor_history)
#> [1] 265374

inventor_moves <- unlist(inventor_history)
inventor_moves <- tapply(inventor_moves, inventor_moves, length)
inventor_transitions <- data.frame(
  do.call(rbind, strsplit(names(inventor_moves), " -> ", fixed = TRUE)),
  inventor_moves
)
colnames(inventor_transitions) <- c("from", "to", "count")

# use these transitions to make a origin x destination matrix
states <- sort(unique(c(inventor_transitions$from, inventor_transitions$to)))
migrations <- matrix(0, length(states), length(states), dimnames = list(states, states))
for (r in seq_len(nrow(inventor_transitions))) {
  move <- inventor_transitions[r, ]
  migrations[move$from, move$to] <- move$count
}
migrations <- migrations[rowSums(migrations) > 1e4, rowSums(migrations) > 1e4]
migrations <- migrations[order(-rowSums(migrations)), order(-rowSums(migrations))]

# and make a chord diagram out of the most frequently involved states
# colored by percent difference (Female - Male; higher female percent = bluer)
## devtools::install_github("mattflor/chorddiag")
library(chorddiag)
chorddiag(
  migrations,
  groupColors = scico::scico(nrow(migrations), palette = "vik")[
    order(order(-breakdown_states[rownames(migrations), "Difference"]))
  ], showTicks = FALSE, groupnamePadding = 5
)