Analyze Inventor Features Using PatentsView Data • uspto

Built with R 4.2.1

This example explores the PatentsView bulk tables, with a focus on assigned inventor sex.

Setup

Before starting, we’ll need to load the package, and point to a directory where we’d like things saved:

# install if needed: remotes::install_guthub("uva-bi-sdad/uspto")
library(uspto)
outDir <- "../patentsview/"

Which patent categories have most and fewest female inventors?

The first step toward answering this question is to assign a sex to inventors based on their given name. The inventors table includes USPTO’s assignment, which we’ll start with:

# you may need to increase your download timeout, depending on your connection
options(timeout = 300)

inventors <- as.data.frame(download_patentsview_bulk("inventor", outDir))

Now, we can add a few of our own prediction methods, including one based on the included USPTO flags:

# associate a sex with each unique given name
inventor_sex_file <- paste0(outDir, "inventor_sex.csv.xz")
if (file.exists(inventor_sex_file)) {
  inventor_sex <- vroom::vroom(inventor_sex_file, show_col_types = FALSE)
} else {
  # add a standardized version of given names
  inventors$name_first[is.na(inventors$name_first)] <- ""
  inventors$given <- sub(
    "^(.)", "\\U\\1", gsub("^([a-z-]{,6}[.-])+ | +.*", "", tolower(inventors$name_first)),
    perl = TRUE
  )
  inventors$name_last[is.na(inventors$name_last)] <- ""
  inventors$family <- sub(
    "^(.)", "\\U\\1", gsub("^([a-z-]{,6}[.-])+ | +.*", "", tolower(inventors$name_last)),
    perl = TRUE
  )

  # reverse male flag
  inventors$pred_fem_patentsview <- inventors$male_flag
  inventors$pred_fem_patentsview[is.na(inventors$pred_fem_patentsview)] <- .5
  inventors$pred_fem_patentsview <- 1 - inventors$pred_fem_patentsview

  ## install if needed: remotes::install_github("miserman/lusilab")
  library(lusilab)
  inventor_sex <- cbind(
    inventors[, c("id", "given", "family", "pred_fem_patentsview")], predict_demographics(
      inventors$given, inventors$name_last,
      dir = paste0(dirname(outDir), "/names")
    )[, -(1:3)]
  )

  ## make aggregate predictor
  count_vars <- grep("count_", colnames(inventor_sex), fixed = TRUE, value = TRUE)
  inventor_sex$prob_fem <- rowSums(
    inventor_sex[, count_vars] * inventor_sex[, sub("count", "prob_fem", count_vars, fixed = TRUE)]
  ) / rowSums(inventor_sex[, count_vars])

  vroom::vroom_write(inventor_sex, inventor_sex_file, ",")
}

# get categorical predictions
prob_cols <- grep("^prob_", colnames(inventor_sex), value = TRUE)
inventor_sex_preds <- as.data.frame(inventor_sex[, prob_cols])
dimnames(inventor_sex_preds) <- list(
  inventor_sex$id,
  sub("prob", "pred", prob_cols, fixed = TRUE)
)
inventor_sex_preds[inventor_sex_preds > .5] <- 1
inventor_sex_preds[inventor_sex_preds < .5] <- 0

First, we can compare these different methods with the USPTO assignments:

kable(
  data.frame(
    "Proportion Sexed" = colMeans(inventor_sex_preds != .5),
    "Accuracy of Sexed to PatentsView" = colMeans(vapply(
      inventor_sex_preds, function(x) x != .5 & x == inventor_sex$pred_fem_patentsview,
      logical(nrow(inventor_sex))
    )),
    check.names = FALSE
  ),
  caption = "Sex Predition Method Comparisons"
)

Sex Predition Method Comparisons
	Proportion Sexed	Accuracy of Sexed to PatentsView
pred_fem_wgnd	0.8899481	0.7273651
pred_fem_fb	0.8968863	0.7242883
pred_fem_fb_scraped	0.7293734	0.6086984
pred_fem_skydeck	0.7906620	0.6558273
pred_fem_usssa	0.7724898	0.6429201
pred_fem_uspto	0.9939588	0.7795239
pred_fem	0.9955821	0.7712082

Now we can move forward with the method with the aggregate prediction:

# add assigned sex to inventors data
inventors$pred_fem <- inventor_sex_preds$pred_fem

## focus on just sexed inventors
inventors <- inventors[inventors$pred_fem != .5, ]
rownames(inventors) <- inventors$id

# add patent IDs
patent_inventor <- download_patentsview_bulk("patent_inventor", outDir)
patent_inventor <- patent_inventor[patent_inventor$inventor_id %in% inventors$id, ]
inventors <- cbind(inventors[patent_inventor$inventor_id, -1], patent_inventor)

# count inventors predicted to be female in each patent
female_inventors <- tapply(inventors$pred_fem == 1, inventors$patent_id, sum)

World Intellectual Property Organization

Now we can add patent category information, to get breakdowns of classes by inventor sex. There are multiple classification schemes, but we can start with the World Intellectual Property Organization (WIPO) technology fields for a high-level overview:

library(Matrix)

# this makes a patent x WIPO category matrix
categories_wipo <- patentsview_class_matrix("wipo", paste0(outDir, "wipo_matrix.rds"), dir = outDir)
dim(categories_wipo)
#> [1] 7221215      35

# which we can join to the inventors matrix
inventors_wipo <- female_inventors[names(female_inventors) %in% rownames(categories_wipo)]
inventors_wipo <- cbind(as.numeric(inventors_wipo), categories_wipo[names(inventors_wipo), ])

# and get category breakdowns by inventor sex
wipo_summary <- data.frame(
  Any_Female = colSums(inventors_wipo[inventors_wipo[, 1] != 0, -1] != 0),
  No_Female = colSums(inventors_wipo[inventors_wipo[, 1] == 0, -1] != 0)
)
wipo_summary <- cbind(wipo_summary, sweep(wipo_summary, 2, colSums(wipo_summary), "/") * 100)
colnames(wipo_summary)[3:4] <- c("Any_Female_Percent", "No_Female_Percent")
wipo_summary$Difference <- wipo_summary$Any_Female_Percent - wipo_summary$No_Female_Percent
wipo_summary <- wipo_summary[order(-wipo_summary$Difference), ]

# add category titles
wipo_field <- as.data.frame(download_patentsview_bulk("wipo_field", outDir))
rownames(wipo_field) <- wipo_field$id
wipo_summary$Title <- wipo_field[rownames(wipo_summary), "field_title"]

kable(
  wipo_summary,
  digits = 3,
  col.names = gsub("_", " ", colnames(wipo_summary), fixed = TRUE),
  caption = "World Intellectual Property Organization Categories"
)

World Intellectual Property Organization Categories
	Any Female	No Female	Any Female Percent	No Female Percent	Difference	Title
16	140288	168928	6.430	1.995	4.435	Pharmaceuticals
15	102368	112843	4.692	1.333	3.359	Biotechnology
14	113777	204459	5.215	2.415	2.800	Organic fine chemistry
4	142830	439789	6.547	5.194	1.353	Digital communication
6	223799	765056	10.258	9.035	1.223	Computer technology
19	70943	176861	3.252	2.089	1.163	Basic materials chemistry
11	31762	50883	1.456	0.601	0.855	Analysis of biological materials
17	53230	143628	2.440	1.696	0.744	Macromolecular chemistry, polymers
8	110959	370133	5.086	4.371	0.715	Semiconductors
7	46905	123348	2.150	1.457	0.693	IT methods for management
18	19609	47593	0.899	0.562	0.337	Food chemistry
22	17513	44769	0.803	0.529	0.274	Micro-structural and nano-technology
13	103601	383823	4.749	4.533	0.216	Medical technology
20	36649	136649	1.680	1.614	0.066	Materials, metallurgy
21	37255	140355	1.708	1.658	0.050	Surface technology, coating
34	37871	153868	1.736	1.817	-0.081	Other consumer goods
3	75840	312227	3.476	3.687	-0.211	Telecommunications
23	49807	217028	2.283	2.563	-0.280	Chemical engineering
28	34725	158782	1.592	1.875	-0.284	Textile and paper machines
24	20131	103924	0.923	1.227	-0.305	Environmental technology
9	86916	369601	3.984	4.365	-0.381	Optics
12	41088	196484	1.883	2.320	-0.437	Control
30	14141	106691	0.648	1.260	-0.612	Thermal processes and apparatus
5	26046	155256	1.194	1.834	-0.640	Basic communication processes
2	99922	447367	4.580	5.283	-0.703	Audio-visual technology
33	34932	195347	1.601	2.307	-0.706	Furniture, games
29	47057	279402	2.157	3.300	-1.143	Other special machines
10	83073	425977	3.808	5.031	-1.223	Measurement
25	25881	207495	1.186	2.450	-1.264	Handling
26	22856	199093	1.048	2.351	-1.304	Machine tools
27	29373	232992	1.346	2.752	-1.405	Engines, pumps, turbines
1	99479	519907	4.560	6.140	-1.580	Electrical machinery, apparatus, energy
35	25264	233882	1.158	2.762	-1.604	Civil engineering
31	27215	269472	1.247	3.182	-1.935	Mechanical elements
32	48656	373769	2.230	4.414	-2.184	Transport

United States Patent Classification

For a more refined classification, we might look at the United States Patent Classification (USPC), which is most closely related to examination process:

# get the patent x USPC category matrix
categories_uspc <- patentsview_class_matrix(
  "uspc_current", paste0(outDir, "uspc_current_matrix.rds"),
  dir = outDir
)
dim(categories_uspc)
#> [1] 6597925     475

# join to the inventors matrix
inventors_uspc <- female_inventors[names(female_inventors) %in% rownames(categories_uspc)]
inventors_uspc <- cbind(as.numeric(inventors_uspc), categories_uspc[names(inventors_uspc), ])

# and get category breakdowns by inventor sex
uspc_summary <- data.frame(
  Any_Female = colSums(inventors_uspc[inventors_uspc[, 1] != 0, -1] != 0),
  No_Female = colSums(inventors_uspc[inventors_uspc[, 1] == 0, -1] != 0)
)
uspc_summary <- cbind(uspc_summary, sweep(uspc_summary, 2, colSums(uspc_summary), "/") * 100)
colnames(uspc_summary)[3:4] <- c("Any_Female_Percent", "No_Female_Percent")
uspc_summary$Difference <- uspc_summary$Any_Female_Percent - uspc_summary$No_Female_Percent
uspc_summary <- uspc_summary[order(-uspc_summary$Difference), ]

# add category titles
uspc_field <- as.data.frame(download_patentsview_bulk("mainclass_current", outDir))
rownames(uspc_field) <- uspc_field$id
uspc_summary$Title <- uspc_field[rownames(uspc_summary), "title"]

kable(
  uspc_summary[c(1:20, 1:20 + nrow(uspc_summary) - 20), ],
  digits = 3,
  col.names = gsub("_", " ", colnames(uspc_summary), fixed = TRUE),
  caption = paste(
    "U.S. Patent Classification categories with the highest and lowest",
    "proportion of any female inventor"
  )
)

U.S. Patent Classification categories with the highest and lowest proportion of any female inventor
	Any Female	No Female	Any Female Percent	No Female Percent	Difference	Title
No longer published	224151	677667	12.453	7.938	4.515	NO LONGER PUBLISHED
514	58210	92627	3.234	1.085	2.149	DRUG, BIO-AFFECTING AND BODY TREATING COMPOSITIONS
435	53861	77304	2.992	0.906	2.087	CHEMISTRY: MOLECULAR BIOLOGY AND MICROBIOLOGY
424	40283	61337	2.238	0.719	1.520	DRUG, BIO-AFFECTING AND BODY TREATING COMPOSITIONS
536	25823	32451	1.435	0.380	1.055	ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES
530	24279	29769	1.349	0.349	1.000	CHEMISTRY: NATURAL RESINS OR DERIVATIVES; PEPTIDES OR PROTEINS; LIGNINS OR REACTION PRODUCTS THEREOF
546	16501	29030	0.917	0.340	0.577	ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES
544	15184	26435	0.844	0.310	0.534	ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES
548	14286	28499	0.794	0.334	0.460	ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES
438	30399	111712	1.689	1.309	0.380	SEMICONDUCTOR DEVICE MANUFACTURING: PROCESS
D14	17197	51719	0.955	0.606	0.350	RECORDING, COMMUNICATION, OR INFORMATION RETRIEVAL EQUIPMENT
436	10907	25378	0.606	0.297	0.309	CHEMISTRY: ANALYTICAL AND IMMUNOLOGICAL TESTING
707	16543	52594	0.919	0.616	0.303	DATA PROCESSING: DATABASE AND FILE MANAGEMENT OR DATA STRUCTURES
370	27087	103467	1.505	1.212	0.293	MULTIPLEX COMMUNICATIONS
257	47154	199440	2.620	2.336	0.284	ACTIVE SOLID-STATE DEVICES (E.G., TRANSISTORS, SOLID-STATE DIODES)
705	13156	40236	0.731	0.471	0.260	DATA PROCESSING: FINANCIAL, BUSINESS PRACTICE, MANAGEMENT, OR COST/PRICE DETERMINATION
428	26798	106553	1.489	1.248	0.241	STOCK MATERIAL OR MISCELLANEOUS ARTICLES
430	14084	46833	0.782	0.549	0.234	RADIATION IMAGERY CHEMISTRY: PROCESS, COMPOSITION, OR PRODUCT THEREOF
709	19121	70876	1.062	0.830	0.232	ELECTRICAL COMPUTERS AND DIGITAL PROCESSING SYSTEMS: MULTICOMPUTER DATA TRANSFERRING
510	6612	12180	0.367	0.143	0.225	CLEANING COMPOSITIONS FOR SOLID SURFACES, AUXILIARY COMPOSITIONS THEREFOR, OR PROCESSES OF PREPARING THE COMPOSITIONS
310	4287	37460	0.238	0.439	-0.201	ELECTRICAL GENERATOR OR MOTOR STRUCTURE
G9B	10744	68623	0.597	0.804	-0.207	INFORMATION STORAGE BASED ON RELATIVE MOVEMENT BETWEEN RECORD CARRIER AND TRANSDUCER
180	3177	33643	0.177	0.394	-0.218	MOTOR VEHICLES
414	1944	28781	0.108	0.337	-0.229	MATERIAL OR ARTICLE HANDLING
60	3918	39833	0.218	0.467	-0.249	POWER PLANTS
348	14224	88743	0.790	1.040	-0.249	TELEVISION
324	7288	56738	0.405	0.665	-0.260	ELECTRICITY: MEASURING AND TESTING
327	5469	48145	0.304	0.564	-0.260	MISCELLANEOUS ACTIVE ELECTRICAL NONLINEAR DEVICES, CIRCUITS, AND SYSTEMS
248	4322	42844	0.240	0.502	-0.262	SUPPORTS
250	9920	70470	0.551	0.825	-0.274	RADIANT ENERGY
74	2464	35477	0.137	0.416	-0.279	MACHINE ELEMENT OR MECHANISM
361	9646	70289	0.536	0.823	-0.287	ELECTRICITY: ELECTRICAL SYSTEMS AND DEVICES
280	4845	48938	0.269	0.573	-0.304	LAND VEHICLES
340	9825	74449	0.546	0.872	-0.326	COMMUNICATIONS: ELECTRICAL
52	3101	42632	0.172	0.499	-0.327	STATIC STRUCTURES (E.G., BUILDINGS)
439	6112	58424	0.340	0.684	-0.345	ELECTRICAL CONNECTORS
137	3075	44134	0.171	0.517	-0.346	FLUID HANDLING
29	8097	70267	0.450	0.823	-0.373	METAL WORKING
123	5260	56999	0.292	0.668	-0.375	INTERNAL-COMBUSTION ENGINES
73	9268	78942	0.515	0.925	-0.410	MEASURING AND TESTING

How have category breakdowns changed over time?

To get at this question, we’ll need to associate a date with each patent number:

# start with all patent information
patents <- download_patentsview_bulk("patent", outDir)

# associate a date with each patent number
all_dates <- structure(patents$date, names = patents$number)

# get years, and focus on only those found in the USPC set
uspc_year <- substr(all_dates[names(all_dates) %in% rownames(inventors_uspc)], 1, 4)

Now we could look at trends in differences in the percent of female versus male assigned inventors:

# use this set of years to break down the overall summaries
uspc_yearly_summaries <- do.call(rbind, lapply(sort(unique(uspc_year)), function(year) {
  d <- inventors_uspc[names(which(uspc_year == year)), ]
  r <- data.frame(
    Any_Female = colSums(d[d[, 1] != 0, -1] != 0, na.rm = TRUE),
    No_Female = colSums(d[d[, 1] == 0, -1] != 0, na.rm = TRUE)
  )
  r <- sweep(r, 2, colSums(r), "/") * 100
  r$Difference <- r$Any_Female - r$No_Female
  c(Year = as.numeric(year), t(r[3])[1, ])
}))
uspc_yearly_summaries[is.na(uspc_yearly_summaries)] <- 0
uspc_yearly_summaries <- uspc_yearly_summaries[, colSums(uspc_yearly_summaries) != 0]

# plot categories with the most positive and negative trends
library(splot)
trends <- sort(cor(uspc_yearly_summaries[, -1], uspc_yearly_summaries[, 1])[, 1], TRUE)
splot(
  uspc_yearly_summaries[, names(trends[c(1:5, 1:4 + length(trends) - 4)])] ~ uspc_yearly_summaries[, 1],
  lines = "spline", levels = list(mv = uspc_field[names(trends[c(1:5, 1:4 + length(trends) - 4)]), "title"]),
  title = "Inventor Distribution by Assigned Sex Per USPC Class Over Time",
  laby = "Percent Female-Assigned Inventors - Male-Assigned Inventors",
  labx = "Year", leg.title = "U.S. Patent Classification", myl = c(-.6, .7)
)

This highlights categories that are trending toward more similar allocations of inventors by sex, and suggests there are no categories showing a strong trend toward increasing difference in inventor allocation by sex.

We might also look at categories that show some trend in the percent of female-assigned inventors alone – where a greater or lesser percentage of the female-assigned inventor workforce has been allocated:

uspc_yearly_female_summaries <- do.call(rbind, lapply(sort(unique(uspc_year)), function(year) {
  d <- inventors_uspc[names(which(uspc_year == year)), ]
  r <- colSums(d[d[, 1] != 0, -1] != 0, na.rm = TRUE)
  r <- r / sum(r) * 100
  c(Year = as.numeric(year), r)
}))
uspc_yearly_female_summaries <- uspc_yearly_female_summaries[, colSums(uspc_yearly_female_summaries) != 0]

trends_female <- sort(cor(uspc_yearly_female_summaries[, -1], uspc_yearly_female_summaries[, 1])[, 1], TRUE)
splot(
  uspc_yearly_female_summaries[, names(trends_female[c(1:7, 1:2 + length(trends_female) - 6)])] ~
    uspc_yearly_female_summaries[, 1],
  lines = "spline", levels = list(
    mv = uspc_field[names(trends_female[c(1:7, 1:2 + length(trends_female) - 6)]), "title"]
  ),
  labels.trim = FALSE,
  title = "Share of Female-Assigned Inventors Per USPC Class Over Time", lpos = "topleft",
  laby = "Percent Female-Assigned Inventors", labx = "Year", leg.title = "U.S. Patent Classification"
)

Cooperative Patent Classifications

Cooperative Patent Classifications (CPC) might give an even more refined look, and could be compared across patent offices:

# get the patent x CPC category matrix
categories_cpc <- patentsview_class_matrix(
  "cpc_current", paste0(outDir, "cpc_current_matrix.rds"),
  dir = outDir
)
dim(categories_cpc)
#> [1] 7228678     670

# join to the inventors matrix
inventors_cpc <- female_inventors[names(female_inventors) %in% rownames(categories_cpc)]
inventors_cpc <- cbind(as.numeric(inventors_cpc), categories_cpc[names(inventors_cpc), ])

# and get category breakdowns by inventor sex
cpc_summary <- data.frame(
  Any_Female = colSums(inventors_cpc[inventors_cpc[, 1] != 0, -1] != 0),
  No_Female = colSums(inventors_cpc[inventors_cpc[, 1] == 0, -1] != 0)
)
cpc_summary <- cbind(cpc_summary, sweep(cpc_summary, 2, colSums(cpc_summary), "/") * 100)
colnames(cpc_summary)[3:4] <- c("Any_Female_Percent", "No_Female_Percent")
cpc_summary$Difference <- cpc_summary$Any_Female_Percent - cpc_summary$No_Female_Percent
cpc_summary <- cpc_summary[order(-cpc_summary$Difference), ]

# add category titles
cpc_group <- as.data.frame(download_patentsview_bulk("cpc_group", outDir))
rownames(cpc_group) <- cpc_group$id
cpc_summary <- cpc_summary[rownames(cpc_summary) %in% cpc_group$id, ]
cpc_summary$Title <- cpc_group[rownames(cpc_summary), "title"]

cpc_selection <- names(which(rowSums(cpc_summary[, 1:2]) > 5))
cpc_selection <- cpc_selection[c(1:20, 1:20 + length(cpc_selection) - 20)]
kable(
  cpc_summary[cpc_selection, ],
  digits = 3,
  col.names = gsub("_", " ", colnames(cpc_summary), fixed = TRUE),
  caption = paste(
    "Cooperative Patent Classifications: 20 categories with the highest and lowest",
    "proportion of any female inventor with at least 5 associated patents"
  )
)

Cooperative Patent Classifications: 20 categories with the highest and lowest proportion of any female inventor with at least 5 associated patents
	Any Female	No Female	Any Female Percent	No Female Percent	Difference	Title
A61K	118059	143894	3.895	1.244	2.651	PREPARATIONS FOR MEDICAL, DENTAL, OR TOILET PURPOSES
A61P	104944	111091	3.462	0.961	2.502	SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS
C07K	53879	52116	1.778	0.451	1.327	PEPTIDES
C12N	52539	52564	1.733	0.455	1.279	MICROORGANISMS OR ENZYMES; COMPOSITIONS THEREOF; PROPAGATING, PRESERVING, OR MAINTAINING MICROORGANISMS; MUTATION OR GENETIC ENGINEERING; CULTURE MEDIA
C07D	62193	97354	2.052	0.842	1.210	HETEROCYCLIC COMPOUNDS
G06F	159496	525401	5.262	4.544	0.719	ELECTRIC DIGITAL DATA PROCESSING
H04W	66143	169788	2.182	1.468	0.714	WIRELESS COMMUNICATION NETWORKS
H04L	109031	341426	3.597	2.953	0.645	TRANSMISSION OF DIGITAL INFORMATION, e.g. TELEGRAPHIC COMMUNICATION
G01N	57456	148061	1.896	1.280	0.615	INVESTIGATING OR ANALYSING MATERIALS BY DETERMINING THEIR CHEMICAL OR PHYSICAL PROPERTIES
C12Q	22569	27393	0.745	0.237	0.508	MEASURING OR TESTING PROCESSES INVOLVING ENZYMES, NUCLEIC ACIDS OR MICROORGANISMS ; COMPOSITIONS OR TEST PAPERS THEREFOR; PROCESSES OF PREPARING SUCH COMPOSITIONS; CONDITION-RESPONSIVE CONTROL IN MICROBIOLOGICAL OR ENZYMOLOGICAL PROCESSES
G06Q	48204	127337	1.590	1.101	0.489	DATA PROCESSING SYSTEMS OR METHODS, SPECIALLY ADAPTED FOR ADMINISTRATIVE, COMMERCIAL, FINANCIAL, MANAGERIAL, SUPERVISORY OR FORECASTING PURPOSES; SYSTEMS OR METHODS SPECIALLY ADAPTED FOR ADMINISTRATIVE, COMMERCIAL, FINANCIAL, MANAGERIAL, SUPERVISORY OR FORECASTING PURPOSES, NOT OTHERWISE PROVIDED FOR
H01L	113814	381962	3.755	3.303	0.452	SEMICONDUCTOR DEVICES; ELECTRIC SOLID STATE DEVICES NOT OTHERWISE PROVIDED FOR
A61Q	16866	18698	0.556	0.162	0.395	SPECIFIC USE OF COSMETICS OR SIMILAR TOILET PREPARATIONS
C07C	33187	82127	1.095	0.710	0.385	ACYCLIC OR CARBOCYCLIC COMPOUNDS
C12P	12297	13758	0.406	0.119	0.287	FERMENTATION OR ENZYME-USING PROCESSES TO SYNTHESISE A DESIRED CHEMICAL COMPOUND OR COMPOSITION OR TO SEPARATE OPTICAL ISOMERS FROM A RACEMIC MIXTURE
C12Y	9799	8108	0.323	0.070	0.253	ENZYMES
C08L	20786	51386	0.686	0.444	0.241	COMPOSITIONS OF MACROMOLECULAR COMPOUNDS
C09D	16359	35799	0.540	0.310	0.230	COATING COMPOSITIONS, e.g. PAINTS, VARNISHES OR LACQUERS; FILLING PASTES; CHEMICAL PAINT OR INK REMOVERS; INKS; CORRECTING FLUIDS; WOODSTAINS; PASTES OR SOLIDS FOR COLOURING OR PRINTING; USE OF MATERIALS THEREFOR
C09K	15928	35505	0.526	0.307	0.218	MATERIALS FOR MISCELLANEOUS APPLICATIONS, NOT PROVIDED FOR ELSEWHERE
C08F	17480	41598	0.577	0.360	0.217	MACROMOLECULAR COMPOUNDS OBTAINED BY REACTIONS ONLY INVOLVING CARBON-TO-CARBON UNSATURATED BONDS
F02B	3796	35699	0.125	0.309	-0.183	INTERNAL-COMBUSTION PISTON ENGINES; COMBUSTION ENGINES IN GENERAL
H01H	4635	38928	0.153	0.337	-0.184	ELECTRIC SWITCHES; RELAYS; SELECTORS; EMERGENCY PROTECTIVE DEVICES
F16D	2968	33174	0.098	0.287	-0.189	COUPLINGS FOR TRANSMITTING ROTATION; CLUTCHES; BRAKES
B65G	2849	33446	0.094	0.289	-0.195	TRANSPORT OR STORAGE DEVICES, e.g. CONVEYORS FOR LOADING OR TIPPING, SHOP CONVEYOR SYSTEMS OR PNEUMATIC TUBE CONVEYORS
B62D	4831	41973	0.159	0.363	-0.204	MOTOR VEHICLES; TRAILERS
H03K	8291	55429	0.274	0.479	-0.206	PULSE TECHNIQUE
F16L	3595	37625	0.119	0.325	-0.207	PIPES; JOINTS OR FITTINGS FOR PIPES; SUPPORTS FOR PIPES, CABLES OR PROTECTIVE TUBING; MEANS FOR THERMAL INSULATION IN GENERAL
G01R	14938	81869	0.493	0.708	-0.215	MEASURING ELECTRIC VARIABLES; MEASURING MAGNETIC VARIABLES
B65H	4546	42323	0.150	0.366	-0.216	HANDLING THIN OR FILAMENTARY MATERIAL, e.g. SHEETS, WEBS, CABLES
A63B	6350	50705	0.210	0.438	-0.229	APPARATUS FOR PHYSICAL TRAINING, GYMNASTICS, SWIMMING, CLIMBING, OR FENCING; BALL GAMES; TRAINING EQUIPMENT
B65D	11747	71448	0.388	0.618	-0.230	CONTAINERS FOR STORAGE OR TRANSPORT OF ARTICLES OR MATERIALS, e.g. BAGS, BARRELS, BOTTLES, BOXES, CANS, CARTONS, CRATES, DRUMS, JARS, TANKS, HOPPERS, FORWARDING CONTAINERS; ACCESSORIES, CLOSURES, OR FITTINGS THEREFOR; PACKAGING ELEMENTS; PACKAGES
G11B	20710	106241	0.683	0.919	-0.235	INFORMATION STORAGE BASED ON RELATIVE MOVEMENT BETWEEN RECORD CARRIER AND TRANSDUCER
B60R	9077	62621	0.299	0.542	-0.242	VEHICLES, VEHICLE FITTINGS, OR VEHICLE PARTS, NOT OTHERWISE PROVIDED FOR
E21B	8401	62739	0.277	0.543	-0.265	EARTH DRILLING, e.g. DEEP DRILLING ; OBTAINING OIL, GAS, WATER, SOLUBLE OR MELTABLE MATERIALS OR A SLURRY OF MINERALS FROM WELLS
F16H	5390	51508	0.178	0.445	-0.268	GEARING
Y02T	17156	97400	0.566	0.842	-0.276	CLIMATE CHANGE MITIGATION TECHNOLOGIES RELATED TO TRANSPORTATION
B29C	15116	90347	0.499	0.781	-0.283	SHAPING OR JOINING OF PLASTICS; SHAPING OF MATERIAL IN A PLASTIC STATE, NOT OTHERWISE PROVIDED FOR; AFTER-TREATMENT OF THE SHAPED PRODUCTS, e.g. REPAIRING
H01R	9747	75973	0.322	0.657	-0.335	ELECTRICALLY-CONDUCTIVE CONNECTIONS; STRUCTURAL ASSOCIATIONS OF A PLURALITY OF MUTUALLY-INSULATED ELECTRICAL CONNECTING ELEMENTS; COUPLING DEVICES; CURRENT COLLECTORS
Y10S	75281	353260	2.484	3.055	-0.571	TECHNICAL SUBJECTS COVERED BY FORMER USPC CROSS-REFERENCE ART COLLECTIONS [XRACs] AND DIGESTS
Y10T	83598	503119	2.758	4.351	-1.593	TECHNICAL SUBJECTS COVERED BY FORMER US CLASSIFICATION

Where are there most and fewest female inventors?

The locations table associates inventor IDs with location IDs:

locations <- as.data.frame(download_patentsview_bulk("location", outDir))
locations$state_fips[!is.na(locations$state_fips)] <- formatC(
  locations$state_fips[!is.na(locations$state_fips)],
  width = 2, flag = 0, format = "d"
)
locations$county_fips[!is.na(locations$county_fips)] <- formatC(
  locations$county_fips[!is.na(locations$county_fips)],
  width = 5, flag = 0, format = "d"
)

# we can align this with our inventors data for inventor sex
rownames(locations) <- locations$id
located_inventors <- inventors[!is.na(inventors$location_id), ]
locations <- locations[located_inventors$location_id, ]

Inventor location is recorded as part of each patent, which means inventors may have multiple locations over time. For an initial look, we can focus only on each inventors most recent location:

# add date information to inventor data
inventors$date <- structure(patents$date, names = patents$number)[inventors$patent_id]

# sort inventors by date, then add location information
inventors_last_seen <- inventors[order(inventors$date, decreasing = TRUE), ]
inventors_last_seen <- inventors_last_seen[!duplicated(inventors_last_seen$inventor_id), ]
inventors_last_seen <- inventors_last_seen[inventors_last_seen$location_id %in% locations$id, ]
inventors_last_seen <- cbind(inventors_last_seen, locations[inventors_last_seen$location_id, ])

Now we can look at high-level summaries of locations, like we did with patent classes:

# top countries
breakdown_countries <- as.data.frame(t(vapply(
  split(inventors_last_seen$pred_fem, inventors_last_seen$country),
  function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
  c(0, 0)
)))
breakdown_countries <- cbind(
  breakdown_countries,
  sweep(breakdown_countries, 2, colSums(breakdown_countries), "/") * 100
)
colnames(breakdown_countries)[3:4] <- c("Female_Percent", "Male_Percent")
breakdown_countries$Difference <- breakdown_countries$Female_Percent - breakdown_countries$Male_Percent
breakdown_countries <- breakdown_countries[order(-breakdown_countries$Difference), ]
kable(
  breakdown_countries[rowSums(breakdown_countries[, 1:2]) > 1e4, ],
  col.names = gsub("_", " ", colnames(breakdown_countries), fixed = TRUE),
  caption = "Countries with at least 10,000 associated inventors"
)

Countries with at least 10,000 associated inventors
	Female	Male	Female Percent	Male Percent	Difference
CN	38044	104951	6.7701299	3.0053345	3.7647953
FR	24075	106275	4.2842728	3.0432481	1.2410248
KR	23911	122321	4.2550882	3.5027348	0.7523533
IT	9958	44167	1.7720785	1.2647484	0.5073301
US	269384	1658264	47.9382994	47.4853794	0.4529200
ES	4650	17826	0.8274920	0.5104581	0.3170338
FI	4357	16529	0.7753511	0.4733178	0.3020333
IL	7116	33690	1.2663296	0.9647333	0.3015964
IN	9325	49048	1.6594328	1.4045188	0.2549140
SU	3268	12008	0.5815578	0.3438562	0.2377015
SG	2692	10755	0.4790556	0.3079758	0.1710797
BE	3709	17464	0.6600361	0.5000921	0.1599440
DK	2682	14017	0.4772760	0.4013852	0.0758908
RU	2211	11556	0.3934591	0.3309130	0.0625461
SE	5775	34979	1.0276916	1.0016445	0.0260471
NO	1348	9725	0.2398837	0.2784812	-0.0385975
AU	3865	26212	0.6877971	0.7505963	-0.0627992
NL	6026	41085	1.0723584	1.1764935	-0.1041351
CH	4763	37774	0.8476009	1.0816810	-0.2340801
AT	1551	17833	0.2760086	0.5106586	-0.2346500
CA	13261	92134	2.3598647	2.6383121	-0.2784474
GB	13156	98987	2.3411794	2.8345518	-0.4933724
TW	9993	93957	1.7783069	2.6905148	-0.9122079
DE	26375	265556	4.6935699	7.6043546	-2.9107846
JP	52435	464053	9.3310840	13.2884346	-3.9573506

# top states
breakdown_states <- as.data.frame(t(vapply(
  split(inventors_last_seen$pred_fem, inventors_last_seen$state_fips),
  function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
  c(0, 0)
)))
breakdown_states <- cbind(
  breakdown_states,
  sweep(breakdown_states, 2, colSums(breakdown_states), "/") * 100
)
colnames(breakdown_states)[3:4] <- c("Female_Percent", "Male_Percent")
breakdown_states$Difference <- breakdown_states$Female_Percent - breakdown_states$Male_Percent
breakdown_states <- breakdown_states[order(-breakdown_states$Difference), ]
## install if needed: remotes::install_github("uva-bi-sdad/catchment")
library(catchment)
states <- download_census_shapes(paste0(dirname(outDir), "/maps"))
state_names <- structure(states$NAME, names = states$STATEFP)
breakdown_states <- breakdown_states[rownames(breakdown_states) %in% names(state_names), ]
rownames(breakdown_states) <- state_names[rownames(breakdown_states)]
kable(
  breakdown_states[rowSums(breakdown_states[, 1:2]) > 1e4, ],
  col.names = gsub("_", " ", colnames(breakdown_states), fixed = TRUE),
  caption = "States with at least 10,000 associated inventors"
)

States with at least 10,000 associated inventors
	Female	Male	Female Percent	Male Percent	Difference
California	59595	337658	22.1259802	20.3653186	1.7606616
New York	18892	97296	7.0140787	5.8682573	1.1458213
New Jersey	11716	58560	4.3498277	3.5319556	0.8178722
Massachusetts	13899	73660	5.1603154	4.4426887	0.7176268
Maryland	5991	29161	2.2242931	1.7588005	0.4654926
Georgia	5596	29640	2.0776405	1.7876906	0.2899499
Florida	9745	56698	3.6180498	3.4196519	0.1983978
North Carolina	6806	39094	2.5268801	2.3578940	0.1689861
Virginia	4951	29177	1.8381698	1.7597655	0.0784043
Missouri	3441	20193	1.2775484	1.2179095	0.0596389
Louisiana	1326	9313	0.4923072	0.5616992	-0.0693919
Washington	10677	66907	3.9640757	4.0353919	-0.0713163
Alabama	1352	9553	0.5019603	0.5761744	-0.0742141
Tennessee	2616	17403	0.9712487	1.0496350	-0.0783863
South Carolina	2002	13700	0.7432874	0.8262943	-0.0830069
Kentucky	1422	10146	0.5279494	0.6119403	-0.0839909
Kansas	1473	10490	0.5468843	0.6326881	-0.0858038
Minnesota	7133	45613	2.6482862	2.7510773	-0.1027911
Oregon	3915	26031	1.4535315	1.5700194	-0.1164878
Colorado	5377	35085	1.9963318	2.1160974	-0.1197655
Arizona	4355	28950	1.6168914	1.7460743	-0.1291829
Oklahoma	1343	11238	0.4986189	0.6778025	-0.1791837
Illinois	11426	73643	4.2421587	4.4416633	-0.1995046
New Hampshire	1292	11438	0.4796840	0.6898652	-0.2101813
Connecticut	4479	31161	1.6629292	1.8794274	-0.2164982
Iowa	1564	13703	0.5806701	0.8264752	-0.2458051
Utah	2113	17121	0.7844986	1.0326266	-0.2481279
Pennsylvania	9865	65445	3.6626025	3.9472137	-0.2846112
Indiana	4086	29885	1.5170191	1.8024674	-0.2854483
Wisconsin	4793	35458	1.7795087	2.1385943	-0.3590855
Ohio	9223	64548	3.4242456	3.8931125	-0.4688669
Texas	16033	109365	5.9526108	6.5961803	-0.6435696
Michigan	9543	77209	3.5430528	4.6567411	-1.1136883

# map of counties
library(leaflet)
library(sf)
counties <- st_transform(download_census_shapes(
  paste0(dirname(outDir), "/maps"),
  entity = "county"
), "WGS84")
counties$NAME <- paste0(counties$NAME, ", ", state_names[counties$STATEFP])

breakdown_counties <- as.data.frame(t(vapply(
  split(inventors_last_seen$pred_fem, inventors_last_seen$county_fips),
  function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
  c(0, 0)
)))
breakdown_counties <- cbind(
  breakdown_counties,
  sweep(breakdown_counties, 2, colSums(breakdown_counties), "/") * 100
)
colnames(breakdown_counties)[3:4] <- c("Female_Percent", "Male_Percent")
breakdown_counties$Difference <- breakdown_counties$Female_Percent - breakdown_counties$Male_Percent
breakdown_counties$Total <- rowSums(breakdown_counties[, 1:2])
breakdown_counties$Capped_Total <- breakdown_counties$Total
breakdown_counties$Capped_Total[breakdown_counties$Capped_Total > 5e4] <- 1e5
breakdown_counties <- breakdown_counties[counties$GEOID, ]

pal <- colorNumeric(
  scico::scico(255, direction = -1, palette = "vik"),
  rep(max(abs(breakdown_counties$Difference), na.rm = TRUE), 2) * c(-1, 1)
)
pal_total <- colorNumeric(
  scico::scico(255, direction = -1, palette = "lajolla"), breakdown_counties$Capped_Total
)
leaflet(counties[, 1], options = leafletOptions(attributionControl = FALSE)) |>
  addProviderTiles("CartoDB.Positron") |>
  setView(-95.5810546875, 39.5040407055842, 4) |>
  addControl("Percent Female-Assigned Inventors - Percent Male-Assigned Inventors", "topright") |>
  addLayersControl(position = "topleft", overlayGroups = c("Total", "Difference")) |>
  addLegend(
    "bottomright", pal_total, breakdown_counties$Capped_Total,
    opacity = 1,
    title = "Totals", group = "Total"
  ) |>
  addPolygons(
    fillColor = pal_total(breakdown_counties$Capped_Total),
    fillOpacity = .8, weight = 1, color = "#000", highlightOptions = highlightOptions(color = "#fff"),
    group = "Total", label = paste0("County: ", counties$NAME, "; Total: ", breakdown_counties$Total)
  ) |>
  hideGroup("Total") |>
  addLegend(
    "bottomright", pal, breakdown_counties$Difference,
    opacity = 1,
    title = "Percent Difference", group = "Difference"
  ) |>
  addPolygons(
    fillColor = pal(breakdown_counties$Difference), fillOpacity = .8, weight = 1, color = "#000",
    highlightOptions = highlightOptions(color = "#fff"),
    group = "Difference", label = paste0(
      "County: ", counties$NAME,
      "; Female: ", breakdown_counties$Female,
      "; Male: ", breakdown_counties$Male,
      "; Percent Difference (Female - Male): ", round(breakdown_counties$Difference, 3)
    )
  )

How have inventors migrated?

Instead of collapsing inventors to a single location, we could also look for instances of inventors moving between states, and see which moves are most common:

# start with all of located inventors
inventor_states <- cbind(
  located_inventors[, c("inventor_id", "pred_fem")],
  date = all_dates[located_inventors$patent_id],
  state = state_names[locations$state_fips]
)
inventor_states <- inventor_states[!is.na(inventor_states$state), ]
inventor_states <- inventor_states[!is.na(inventor_states$date), ]

# select only those that are associated with multiple patents
inventor_states <- inventor_states[
  inventor_states$inventor_id %in% inventor_states$inventor_id[duplicated(inventor_states$inventor_id)],
]
inventor_states <- inventor_states[order(inventor_states$date, decreasing = TRUE), ]

## see how many unique inventors this leaves
length(unique(inventor_states$inventor_id))
#> [1] 1057115

# then look at each inventor to see if they have more than one state.
# if they do, record their transition; previous state -> new state
inventor_history <- lapply(split(inventor_states, inventor_states$inventor_id), function(d) {
  if (!all(d$state == d$state[[1]])) {
    res <- NULL
    states <- d$state
    for (i in seq_len(nrow(d) - 1)) {
      if (states[[i]] != states[[i + 1]]) res <- c(paste(states[[i + 1]], "->", states[[i]]), res)
    }
    res
  }
})
inventor_history <- Filter(length, inventor_history)

## see how many inventors have associated moves
length(inventor_history)
#> [1] 265374

inventor_moves <- unlist(inventor_history)
inventor_moves <- tapply(inventor_moves, inventor_moves, length)
inventor_transitions <- data.frame(
  do.call(rbind, strsplit(names(inventor_moves), " -> ", fixed = TRUE)),
  inventor_moves
)
colnames(inventor_transitions) <- c("from", "to", "count")

# use these transitions to make a origin x destination matrix
states <- sort(unique(c(inventor_transitions$from, inventor_transitions$to)))
migrations <- matrix(0, length(states), length(states), dimnames = list(states, states))
for (r in seq_len(nrow(inventor_transitions))) {
  move <- inventor_transitions[r, ]
  migrations[move$from, move$to] <- move$count
}
migrations <- migrations[rowSums(migrations) > 1e4, rowSums(migrations) > 1e4]
migrations <- migrations[order(-rowSums(migrations)), order(-rowSums(migrations))]

# and make a chord diagram out of the most frequently involved states
# colored by percent difference (Female - Male; higher female percent = bluer)
## devtools::install_github("mattflor/chorddiag")
library(chorddiag)
chorddiag(
  migrations,
  groupColors = scico::scico(nrow(migrations), palette = "vik")[
    order(order(-breakdown_states[rownames(migrations), "Difference"]))
  ], showTicks = FALSE, groupnamePadding = 5
)