Built with R 4.2.1


In the previous example we assigned sex to inventors and took a high-level look at some differences between sex groups.

This example focuses on inventors that were active in the last year, and tries to take a deeper look at group differences.

library(uspto)
outDir <- "../patentsview/"

We’ll start by collecting a unified set of patents with inventors who were associated with a patent in 2021:

latest_inventors_file <- paste0(outDir, "latest_inventors.csv.xz")
if (file.exists(latest_inventors_file)) {
  latest_inventors <- vroom::vroom(latest_inventors_file, show_col_types = FALSE)
} else {
  # 1. identify patents granted in the most recent year
  patents <- download_patentsview_bulk("patent", outDir, make_db = TRUE)
  latest_patents <- as.character(dplyr::collect(dplyr::select(
    dplyr::filter(patents, date > as.Date("2020-12-31")), number
  ))$number)

  # 2. identify the inventors associated with those patents
  patent_inventors <- download_patentsview_bulk(
    "patent_inventor", outDir,
    make_db = TRUE,
    partition = list(series_code = function(d) substr(d$patent_id, 1, 2))
  )
  latest_inventors <- dplyr::collect(dplyr::select(dplyr::filter(
    patent_inventors, patent_id %in% latest_patents
  ), inventor_id))
  latest_inventors <- dplyr::collect(dplyr::filter(
    patent_inventors, inventor_id %in% latest_inventors$inventor_id
  ))
  latest_inventors$n_inventors <- as.numeric(tapply(
    latest_inventors$inventor_id, latest_inventors$patent_id, length
  )[latest_inventors$patent_id])

  # 3. combine patent and inventor data
  patents_target <- as.data.frame(dplyr::collect(dplyr::select(dplyr::filter(
    patents, as.character(number) %in% latest_inventors$patent_id
  ), number, type, date, num_claims, withdrawn, kind)))
  rownames(patents_target) <- patents_target$number
  latest_inventors <- cbind(latest_inventors, patents_target[latest_inventors$patent_id, -1])

  ## add inventor-level patenting summaries
  inventor_histories <- as.data.frame(do.call(rbind, lapply(
    split(latest_inventors$date, latest_inventors$inventor_id),
    function(d) {
      r <- as.character(range(d))
      if (r[2] >= "2020-12-31") {
        c(n_patents = length(d), first_date = r[1], last_date = r[2])
      }
    }
  )))
  inventor_histories$n_patents <- as.numeric(inventor_histories$n_patents)
  latest_inventors <- cbind(latest_inventors, inventor_histories[latest_inventors$inventor_id, ])

  # 4. add assignee data
  patent_assignees <- download_patentsview_bulk(
    "patent_assignee", outDir,
    make_db = TRUE,
    partition = list(series_code = function(d) substr(d$patent_id, 1, 2))
  )
  assignees <- dplyr::collect(dplyr::filter(
    patent_assignees, patent_id %in% latest_inventors$patent_id
  ))
  assignees <- download_patentsview_bulk("rawassignee", outDir, make_db = TRUE)
  assignees_target <- dplyr::collect(dplyr::filter(
    assignees,
    as.character(patent_id) %in% latest_inventors$patent_id & sequence == 0 & !is.na(organization)
  ))
  latest_inventors$assignee_id <- structure(
    assignees_target$assignee_id,
    names = assignees_target$patent_id
  )[latest_inventors$patent_id]
  latest_inventors$assignee_type <- structure(
    assignees_target$type,
    names = assignees_target$patent_id
  )[latest_inventors$patent_id]
  latest_inventors$assignee_organization <- structure(
    assignees_target$organization,
    names = assignees_target$patent_id
  )[latest_inventors$patent_id]

  # 5. add location information
  locations <- download_patentsview_bulk("location", outDir, make_db = TRUE)
  locations_target <- as.data.frame(dplyr::collect(dplyr::filter(
    locations, id %in% latest_inventors$location_id
  )))
  locations_target[is.na(locations_target$country), "country"] <- "US"
  locations_target <- locations_target[, !colnames(locations_target) %in% c("state", "county")]
  rownames(locations_target) <- locations_target$id
  latest_inventors <- latest_inventors[latest_inventors$location_id %in% locations_target$id, -4]
  latest_inventors <- cbind(latest_inventors, locations_target[latest_inventors$location_id, -1])
  latest_inventors$n_countries <- as.numeric(tapply(
    latest_inventors$country, latest_inventors$patent_id, function(cs) length(unique(cs))
  )[latest_inventors$patent_id])

  # 6. add sex predictions
  inventor_sex <- vroom::vroom(
    paste0(outDir, "inventor_sex.csv.xz"),
    show_col_types = FALSE
  )
  latest_inventors$prob_fem <- structure(
    inventor_sex$prob_fem,
    names = inventor_sex$id
  )[latest_inventors$inventor_id]
  latest_inventors$pred_fem_patentsview <- structure(
    inventor_sex$pred_fem_patentsview,
    names = inventor_sex$id
  )[latest_inventors$inventor_id]

  # 7. add WIPO comparison category
  wipo_fields <- download_patentsview_bulk("wipo_field", outDir)
  wipo_fields <- c(
    pharma = wipo_fields[wipo_fields$field_title == "Pharmaceuticals", "id", drop = TRUE],
    civil = wipo_fields[wipo_fields$field_title == "Civil engineering", "id", drop = TRUE]
  )
  wipo_matrix <- patentsview_class_matrix("wipo", paste0(outDir, "wipo_matrix.rds"), dir = outDir)
  wipo_matrix <- wipo_matrix[rownames(wipo_matrix) %in% latest_inventors$patent_id, wipo_fields]
  wipo_pharma <- names(which(wipo_matrix[, wipo_fields[["pharma"]]] != 0))
  wipo_civil <- names(which(wipo_matrix[, wipo_fields[["civil"]]] != 0))
  wipo_both <- intersect(wipo_pharma, wipo_civil)
  latest_inventors$wipo_comp <- NA_character_
  latest_inventors$wipo_comp[latest_inventors$patent_id %in% wipo_pharma] <- "pharma"
  latest_inventors$wipo_comp[latest_inventors$patent_id %in% wipo_civil] <- "civil"
  latest_inventors$wipo_comp[latest_inventors$patent_id %in% wipo_both] <- "both"

  vroom::vroom_write(latest_inventors, latest_inventors_file, ",")
}

And get a high-level feel for the set:

unique_inventors <- latest_inventors[!duplicated(latest_inventors$inventor_id), ]
nrow(unique_inventors)
#> [1] 617692

kable(rbind(
  Countries = round(summary(latest_inventors$n_countries), 3),
  Inventors = round(summary(latest_inventors$n_inventors), 3),
  "Inventor Patents" = round(summary(unique_inventors$n_patents), 3),
  "First Patent" = as.character(summary(as.Date(latest_inventors$first_date))),
  "Latest Patent" = as.character(summary(as.Date(latest_inventors$last_date)))
))
Min. 1st Qu. Median Mean 3rd Qu. Max.
Countries 1 1 1 1.085 1 9
Inventors 1 1 2 3.065 4 131
Inventor Patents 1 1 3 10.968 10 6027
First Patent 1976-01-06 1994-09-13 2004-05-04 2002-12-24 2013-02-05 2021-12-28
Latest Patent 2021-01-05 2021-07-06 2021-10-05 2021-09-07 2021-11-30 2021-12-28

Now we can get into exploring the relationships between location, category, and inventor sex.

How much do patent categories explain country proportion differences?

We can start by identifying the set of most represented countries, and looking at their inventor sex proportions:

breakdown_countries <- as.data.frame(t(vapply(
  split(latest_inventors$prob_fem, latest_inventors$country),
  function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
  c(0, 0)
)))
breakdown_countries <- breakdown_countries[rowSums(breakdown_countries) != 0, ]
breakdown_countries$Proportion_Female <- breakdown_countries$Female / rowSums(breakdown_countries)
breakdown_countries <- breakdown_countries[order(-breakdown_countries$Proportion_Female), ]
top_countries <- names(which(table(latest_inventors$country) > 1e5))
kable(
  breakdown_countries[rownames(breakdown_countries) %in% top_countries, ],
  col.names = gsub("_", " ", colnames(breakdown_countries), fixed = TRUE),
  caption = "Countries with at least 10,000 associated intentors"
)
Countries with at least 10,000 associated intentors
Female Male Proportion Female
CN 6200 43104 0.1257504
KR 17389 158068 0.0991069
CA 956 12645 0.0702889
US 18701 251712 0.0691572
GB 334 5022 0.0623600
TW 8690 184428 0.0449984
JP 1175 37826 0.0301274
DE 563 20006 0.0273713

The first step toward getting at how much categories account for country differences, we need to associate categories to our set of patents:

cpc_matrix_file <- paste0(outDir, "latest_cpc_matrix.rds")
if (file.exists(cpc_matrix_file)) {
  cpc_matrix <- readRDS(cpc_matrix_file)
} else {
  cpc_current <- download_patentsview_bulk("cpc_current", outDir, make_db = TRUE)
  cpc_matrix <- patentsview_class_matrix(dplyr::compute(dplyr::filter(
    cpc_current, as.character(patent_id) %in% latest_inventors$patent_id
  )))
  saveRDS(cpc_matrix, cpc_matrix_file)
}

Now, we might just look at sex proportions within classes and countries – if proportions are similar, it would indicate that differences in the distribution of classes might explain the difference in sex distributions:

cpc_matrix <- cpc_matrix[rownames(cpc_matrix) %in% latest_inventors$patent_id, ]
latest_inventors$pred_fem <- latest_inventors$pred_fem_patentsview == 1 |
  (latest_inventors$pred_fem_patentsview == .5 & latest_inventors$prob_fem > .5)
latest_inventors$pred_masc <- latest_inventors$pred_fem_patentsview == 0 |
  (latest_inventors$pred_fem_patentsview == .5 & latest_inventors$prob_fem < .5)
patent_any_fem <- tapply(
  latest_inventors$pred_fem,
  latest_inventors$patent_id, any
)[rownames(cpc_matrix)]

class_country_matrix <- vapply(
  split(latest_inventors[, c("patent_id", "pred_fem", "pred_masc")], latest_inventors$country),
  function(inventors) {
    inventors <- inventors[inventors$patent_id %in% rownames(cpc_matrix), ]
    m <- cpc_matrix[inventors$patent_id, , drop = FALSE] != 0
    unlist(list(
      female = colSums(m * inventors$pred_fem),
      male = colSums(m * inventors$pred_masc)
    ))
  },
  rep(colSums(cpc_matrix), 2)
)
class_country_matrix <- list(
  female = class_country_matrix[seq_len(nrow(class_country_matrix) / 2), ],
  male = class_country_matrix[seq_len(nrow(class_country_matrix) / 2) + nrow(class_country_matrix) / 2, ]
)
class_country_matrix <- lapply(class_country_matrix, function(m) {
  total <- colSums(m)
  total[total == 0] <- 1
  sweep(m, 2, total, "/") * 100
})

We can start by focusing on a set of classes where there are most female-assigned inventors in average across the top countries:

top_classes <- rownames(class_country_matrix$female)[
  order(-rowMeans(class_country_matrix$female[, top_countries]))[1:20]
]
ccm_top_country <- as.data.frame(do.call(rbind, lapply(class_country_matrix, function(m) {
  vapply(
    top_countries, function(country) cumsum(m[top_classes, country]), m[top_classes, 1]
  )
})))
cpc_group <- as.data.frame(download_patentsview_bulk("cpc_group", outDir))
rownames(cpc_group) <- cpc_group$id
kable(cpc_group[top_classes, ], row.names = FALSE, caption = "CPC Class Definitions")
CPC Class Definitions
id title
H01L SEMICONDUCTOR DEVICES; ELECTRIC SOLID STATE DEVICES NOT OTHERWISE PROVIDED FOR
G06F ELECTRIC DIGITAL DATA PROCESSING
H04L TRANSMISSION OF DIGITAL INFORMATION, e.g. TELEGRAPHIC COMMUNICATION
A61P SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS
A61K PREPARATIONS FOR MEDICAL, DENTAL, OR TOILET PURPOSES
H04W WIRELESS COMMUNICATION NETWORKS
C07D HETEROCYCLIC COMPOUNDS
H04N PICTORIAL COMMUNICATION, e.g. TELEVISION
G01N INVESTIGATING OR ANALYSING MATERIALS BY DETERMINING THEIR CHEMICAL OR PHYSICAL PROPERTIES
Y10T TECHNICAL SUBJECTS COVERED BY FORMER US CLASSIFICATION
Y02E REDUCTION OF GREENHOUSE GAS [GHG] EMISSIONS, RELATED TO ENERGY GENERATION, TRANSMISSION OR DISTRIBUTION
C12N MICROORGANISMS OR ENZYMES; COMPOSITIONS THEREOF; PROPAGATING, PRESERVING, OR MAINTAINING MICROORGANISMS; MUTATION OR GENETIC ENGINEERING; CULTURE MEDIA
G02B OPTICAL ELEMENTS, SYSTEMS OR APPARATUS
C07K PEPTIDES
H04B TRANSMISSION
A61B DIAGNOSIS; SURGERY; IDENTIFICATION
G02F OPTICAL DEVICES OR ARRANGEMENTS FOR THE CONTROL OF LIGHT BY MODIFICATION OF THE OPTICAL PROPERTIES OF THE MEDIA OF THE ELEMENTS INVOLVED THEREIN; NON-LINEAR OPTICS; FREQUENCY-CHANGING OF LIGHT; OPTICAL LOGIC ELEMENTS; OPTICAL ANALOGUE/DIGITAL CONVERTERS
G06T IMAGE DATA PROCESSING OR GENERATION, IN GENERAL
G06K GRAPHICAL DATA READING ; PRESENTATION OF DATA; RECORD CARRIERS; HANDLING RECORD CARRIERS
G09G ARRANGEMENTS OR CIRCUITS FOR CONTROL OF INDICATING DEVICES USING STATIC MEANS TO PRESENT VARIABLE INFORMATION
kable(cbind(
  "Assigned Sex" = rep(c("Female", "Male"), each = length(top_classes)),
  Class = rep(top_classes, 2), ccm_top_country
), digits = 2, row.names = FALSE, caption = "Cumulative Percent of Inventors by Class and Country")
Cumulative Percent of Inventors by Class and Country
Assigned Sex Class CA CN DE GB JP KR TW US
Female H01L 1.11 4.63 3.01 1.19 7.02 10.06 23.49 3.07
Female G06F 7.64 12.34 5.06 5.88 10.99 15.40 30.75 10.47
Female H04L 16.16 20.25 6.61 8.77 12.63 20.49 32.47 15.60
Female A61P 19.63 22.53 13.25 19.00 13.50 21.56 33.22 20.06
Female A61K 23.21 24.62 19.84 27.99 14.56 22.66 34.20 24.63
Female H04W 30.21 31.59 20.95 30.39 15.56 27.77 35.59 27.60
Female C07D 31.88 33.21 26.44 36.73 16.53 28.78 35.87 30.04
Female H04N 33.83 35.27 27.20 37.67 20.29 32.69 38.12 31.99
Female G01N 35.29 36.74 29.87 39.88 21.70 33.39 39.01 33.98
Female Y10T 36.68 37.46 31.35 40.96 24.32 34.70 41.12 35.73
Female Y02E 37.31 38.92 33.14 41.94 26.16 37.99 42.14 36.69
Female C12N 38.50 39.76 36.54 44.77 26.84 38.58 42.40 38.77
Female G02B 39.36 41.44 37.18 45.66 29.22 40.78 44.57 39.73
Female C07K 40.64 42.14 40.93 48.27 29.61 41.19 44.83 41.91
Female H04B 42.57 44.34 41.40 49.23 30.24 43.41 45.78 43.15
Female A61B 44.76 45.06 42.42 50.82 31.24 44.10 46.32 45.61
Female G02F 45.15 47.65 42.70 51.18 32.54 47.11 48.19 45.92
Female G06T 46.90 49.25 43.30 52.22 33.94 48.40 49.10 47.28
Female G06K 48.04 51.26 43.71 53.18 35.21 49.60 50.18 48.67
Female G09G 48.40 54.08 43.76 53.35 36.19 52.08 52.00 48.97
Male H01L 0.89 5.04 3.29 1.22 7.02 9.80 28.13 3.33
Male G06F 8.57 13.24 6.31 8.39 10.37 15.07 34.81 11.13
Male H04L 16.87 20.83 8.51 13.12 12.28 21.99 36.56 16.43
Male A61P 18.77 22.58 11.10 17.79 12.80 22.75 37.08 18.66
Male A61K 20.54 24.11 13.39 21.39 13.38 23.56 37.70 20.95
Male H04W 26.31 30.52 14.83 24.54 14.66 30.40 39.09 23.79
Male C07D 27.37 31.85 17.44 28.30 15.33 31.18 39.36 25.15
Male H04N 29.17 34.26 18.42 30.05 19.46 35.27 41.63 27.04
Male G01N 30.46 35.44 20.10 32.05 20.49 35.85 42.45 28.52
Male Y10T 32.18 36.24 22.93 33.98 23.66 37.09 44.26 30.90
Male Y02E 33.23 37.89 24.32 34.93 25.64 40.19 45.07 31.84
Male C12N 34.02 38.45 25.28 35.82 25.92 40.60 45.24 32.84
Male G02B 35.21 40.54 26.47 37.01 28.14 42.40 48.31 34.01
Male C07K 36.02 41.00 27.61 38.21 28.35 42.72 48.49 35.00
Male H04B 38.57 43.32 28.43 39.31 29.30 45.70 49.36 36.35
Male A61B 40.49 43.98 30.33 40.73 30.28 46.36 49.85 39.47
Male G02F 40.86 46.71 30.51 41.20 31.80 49.11 51.49 39.78
Male G06T 42.43 48.51 31.42 42.70 32.98 50.29 52.43 41.14
Male G06K 43.40 50.56 32.05 43.96 34.17 51.49 53.41 42.49
Male G09G 44.22 53.18 32.17 44.44 35.28 53.96 55.09 42.84

Looking at the last class in the table (G09G) gives a feel for how well represented each country is by this set of classes. For example, Japan (JP) is least well represented, with only 36% and 35% of Female- and Male-assigned inventors accounted for, whereas Taiwan (TW) is better represented, with 52% and 55% of Female- and Male-assigned inventors represented. Taiwan is, however, not represented by these classes as evenly, which can be seen in the large initial percent – 23% and 28% of Taiwan’s inventors are accounted for by just the semiconductor devices class (H01L).

Plotting out profiles might give a better feel for how similar countries are in terms of their distribution of inventors:

library(splot)
splot(
  (ccm_top_country[1:20, ] - ccm_top_country[21:40, ]) ~ top_classes,
  title = "Inventor Sex Ballance within Classes between Countries",
  laby = "Cumulative Percent of Inventor Sex Group (Female - Male)",
  labx = "CPC Class", sort = FALSE, line.type = "b", xlas = 2, mar = c(3, 2, 0, 0)
)

The magnitude on this plot indicates how skewed the distribution of female-assigned inventors is relative to that of male-assigned inventors within each country. The more relatively skewed this distribution is, the more differences in class frequency might explain differences in sex frequency. Here, the United Kingdom (GB) is a standout example where a particularly large percent of female- but not male- assigned inventors are accounted for by a few classes (here, A61P and A61K in particular). In contrast, China (CN) has well aligned distributions, despite arriving at a similar percent of the female-assigned inventor population to GB.

How much do patent categories explain organization proportion differences?

One approach to this question could be similar to what we did with countries – identify the top organizations, then compare the top categories between them:

We’ll start by counting up patents by each organization between inventor sex summaries:

# make a patent-level summary of inventor sex
latest_inventors$n_female_inventors <- tapply(
  latest_inventors$pred_fem, latest_inventors$patent_id, sum
)[latest_inventors$patent_id]
patents <- latest_inventors[!duplicated(latest_inventors$patent_id), ]

# extract all unique organizations
organizations <- as.data.frame(latest_inventors[
  !is.na(latest_inventors$assignee_id) & !duplicated(latest_inventors$assignee_id),
  c("assignee_id", "assignee_organization")
])
rownames(organizations) <- organizations$assignee_id

# break those down by patents
breakdown_organizations <- table(patents$assignee_id, patents$n_female_inventors != 0)
breakdown_organizations <- as.data.frame(cbind(
  breakdown_organizations,
  sweep(breakdown_organizations, 2, colSums(breakdown_organizations), "/") * 100
))
colnames(breakdown_organizations) <- c(
  "No Female", "Any Female", "No Female Percent", "Any Female Percent"
)
breakdown_organizations$Difference <- breakdown_organizations[, 4] - breakdown_organizations[, 3]
breakdown_organizations$Organization <- organizations[
  rownames(breakdown_organizations), "assignee_organization"
]
kable(
  rbind(
    breakdown_organizations[order(-breakdown_organizations$Difference)[1:20], ],
    breakdown_organizations[order(breakdown_organizations$Difference)[20:1], ]
  ),
  digits = 3, row.names = FALSE,
  caption = "20 Organizations with Biggest Positive and Negative Sex Percent Difference"
)
20 Organizations with Biggest Positive and Negative Sex Percent Difference
No Female Any Female No Female Percent Any Female Percent Difference Organization
49121 28938 1.694 5.656 3.961 Samsung Electronics Co., Ltd.
84239 22789 2.906 4.454 1.548 International Business Machines Corporation
19006 10700 0.656 2.091 1.436 LG ELECTRONICS INC.
14116 9125 0.487 1.783 1.297 TAIWAN SEMICONDUCTOR MANUFACTURING CO., LTD.
9321 6224 0.322 1.216 0.895 Samsung Display Co., Ltd.
8168 5954 0.282 1.164 0.882 HUAWEI TECHNOLOGIES CO., LTD.
4990 3767 0.172 0.736 0.564 BOE Technology Group Co., Ltd.
17649 5658 0.609 1.106 0.497 Apple Inc.
4553 3341 0.157 0.653 0.496 Electronics and Telecommunications Research Institute
3181 2908 0.110 0.568 0.459 LG Chem, Ltd.
17348 5255 0.598 1.027 0.429 QUALCOMM Incorporated
26283 6467 0.907 1.264 0.357 Intel Corporation
6220 2810 0.215 0.549 0.335 HYUNDAI MOTOR COMPANY
25497 5979 0.880 1.169 0.289 Microsoft Technology Licensing, LLC
4438 2261 0.153 0.442 0.289 LG DISPLAY CO., LTD.
2972 1739 0.103 0.340 0.237 Samsung Electro-Mechanics Co., Ltd.
2582 1528 0.089 0.299 0.210 Samsung SDI Co., Ltd.
1052 1070 0.036 0.209 0.173 Genentech, Inc.
2391 1298 0.082 0.254 0.171 UNITED MICROELECTRONICS CORP.
3240 1444 0.112 0.282 0.170 INDUSTRIAL TECHNOLOGY RESEARCH INSTITUTE
9018 616 0.311 0.120 -0.191 DENSO CORPORATION
8802 539 0.304 0.105 -0.198 Robert Bosch GmbH
9244 601 0.319 0.117 -0.201 Siemens Aktiengesellschaft
19597 2410 0.676 0.471 -0.205 General Electric Company
14237 1365 0.491 0.267 -0.224 Ricoh Co., Ltd.
10113 614 0.349 0.120 -0.229 PANASONIC CORPORATION
9347 478 0.322 0.093 -0.229 BROTHER KOGYO KABUSHIKI KAISHA
12376 959 0.427 0.187 -0.239 NEC CORPORATION
9079 354 0.313 0.069 -0.244 Mitsubishi Denki Kabushiki Kaisha
12869 970 0.444 0.190 -0.254 Semiconductor Energy Laboratory Co., Ltd.
12716 748 0.439 0.146 -0.292 Sharp Kabushiki Kaisha
13272 681 0.458 0.133 -0.325 Matsushita Electric Industrial Co., Ltd.
18786 1632 0.648 0.319 -0.329 Micron Technology, Inc.
19092 1447 0.659 0.283 -0.376 Fujitsu Ltd.
17111 1075 0.590 0.210 -0.380 Toyota Jidosha Kogyo Kabushiki Kaisha
17425 992 0.601 0.194 -0.407 Seiko Epson Corporation
24185 1840 0.834 0.360 -0.475 Hitachi, Ltd.
29263 2331 1.009 0.456 -0.554 Toshiba Seiki Kabushiki Kaisha
33246 2150 1.147 0.420 -0.727 Sony Corporation
46441 2900 1.602 0.567 -1.035 Canon Kabushiki Kaisha

Now we can calculate a class-organization matrix, and look at the same sort of percent differences:

top_organizations <- names(sort(-rowSums(breakdown_organizations[, 1:2]))[1:9])
su_top_organization <- latest_inventors$assignee_id %in% top_organizations
class_organization_matrix <- vapply(
  split(
    latest_inventors[su_top_organization, c("patent_id", "pred_fem", "pred_masc")],
    latest_inventors[su_top_organization, "assignee_id", drop = TRUE]
  ),
  function(inventors) {
    inventors <- inventors[inventors$patent_id %in% rownames(cpc_matrix), ]
    m <- cpc_matrix[inventors$patent_id, , drop = FALSE] != 0
    unlist(list(
      female = colSums(m * inventors$pred_fem), male = colSums(m * inventors$pred_masc)
    ))
  },
  rep(colSums(cpc_matrix), 2)
)
class_organization_matrix <- list(
  female = class_organization_matrix[seq_len(nrow(class_organization_matrix) / 2), ],
  male = class_organization_matrix[
    seq_len(nrow(class_organization_matrix) / 2) + nrow(class_organization_matrix) / 2,
  ]
)
class_organization_matrix <- lapply(class_organization_matrix, function(m) {
  total <- colSums(m)
  total[total == 0] <- 1
  sweep(m, 2, total, "/") * 100
})

top_classes_org <- rownames(class_organization_matrix$female)[
  order(-rowMeans(class_organization_matrix$female))[1:20]
]
com_top_org <- as.data.frame(do.call(rbind, lapply(class_organization_matrix, function(m) {
  vapply(top_organizations, function(id) cumsum(m[top_classes_org, id]), m[top_classes_org, 1])
})))
colnames(com_top_org) <- sub("International", "IBM", sub(
  "[^a-z].*$", "", organizations[colnames(com_top_org), "assignee_organization"], TRUE
), fixed = TRUE)
kable(cpc_group[top_classes_org, ], row.names = FALSE, caption = "CPC Class Definitions")
CPC Class Definitions
id title
G06F ELECTRIC DIGITAL DATA PROCESSING
H04L TRANSMISSION OF DIGITAL INFORMATION, e.g. TELEGRAPHIC COMMUNICATION
H01L SEMICONDUCTOR DEVICES; ELECTRIC SOLID STATE DEVICES NOT OTHERWISE PROVIDED FOR
H04N PICTORIAL COMMUNICATION, e.g. TELEVISION
H04W WIRELESS COMMUNICATION NETWORKS
G06T IMAGE DATA PROCESSING OR GENERATION, IN GENERAL
G06Q DATA PROCESSING SYSTEMS OR METHODS, SPECIALLY ADAPTED FOR ADMINISTRATIVE, COMMERCIAL, FINANCIAL, MANAGERIAL, SUPERVISORY OR FORECASTING PURPOSES; SYSTEMS OR METHODS SPECIALLY ADAPTED FOR ADMINISTRATIVE, COMMERCIAL, FINANCIAL, MANAGERIAL, SUPERVISORY OR FORECASTING PURPOSES, NOT OTHERWISE PROVIDED FOR
G06K GRAPHICAL DATA READING ; PRESENTATION OF DATA; RECORD CARRIERS; HANDLING RECORD CARRIERS
H04B TRANSMISSION
G02B OPTICAL ELEMENTS, SYSTEMS OR APPARATUS
G03G ELECTROGRAPHY; ELECTROPHOTOGRAPHY; MAGNETOGRAPHY
Y02D CLIMATE CHANGE MITIGATION TECHNOLOGIES IN INFORMATION AND COMMUNICATION TECHNOLOGIES [ICT], I.E. INFORMATION AND COMMUNICATION TECHNOLOGIES AIMING AT THE REDUCTION OF THEIR OWN ENERGY USE
G11C STATIC STORES
B41J TYPEWRITERS; SELECTIVE PRINTING MECHANISMS, ; e.g. INK-JET PRINTERS, THERMAL PRINTERS; , i.e. MECHANISMS PRINTING OTHERWISE THAN FROM A FORME; CORRECTION OF TYPOGRAPHICAL ERRORS
G06N COMPUTING ARRANGEMENTS BASED ON SPECIFIC COMPUTATIONAL MODELS
G11B INFORMATION STORAGE BASED ON RELATIVE MOVEMENT BETWEEN RECORD CARRIER AND TRANSDUCER
Y10S TECHNICAL SUBJECTS COVERED BY FORMER USPC CROSS-REFERENCE ART COLLECTIONS [XRACs] AND DIGESTS
H04M TELEPHONIC COMMUNICATION
A61B DIAGNOSIS; SURGERY; IDENTIFICATION
G09G ARRANGEMENTS OR CIRCUITS FOR CONTROL OF INDICATING DEVICES USING STATIC MEANS TO PRESENT VARIABLE INFORMATION
kable(
  cbind(
    "Assigned Sex" = rep(c("Female", "Male"), each = length(top_classes_org)),
    Class = rep(top_classes_org, 2), com_top_org
  ),
  digits = 2, row.names = FALSE,
  caption = "Cumulative Percent of Inventors by Class and Organization"
)
Cumulative Percent of Inventors by Class and Organization
Assigned Sex Class IBM Samsung Canon Sony Intel Toshiba Microsoft LG Hitachi
Female G06F 31.19 9.25 6.29 8.90 16.41 4.60 32.41 4.79 16.67
Female H04L 43.12 17.53 7.49 14.76 27.54 6.89 46.69 19.07 21.50
Female H01L 51.41 29.31 10.86 20.62 38.84 26.70 46.91 20.45 25.86
Female H04N 52.53 34.55 25.86 32.59 42.05 30.36 52.41 29.96 27.95
Female H04W 55.28 43.41 26.42 39.55 52.64 32.92 56.07 46.45 28.81
Female G06T 56.88 45.80 30.66 45.41 56.34 35.15 60.49 47.34 30.24
Female G06Q 63.44 46.37 31.06 46.69 57.14 36.13 68.47 47.88 31.43
Female G06K 65.93 48.05 34.73 49.87 58.98 37.40 72.61 48.77 32.40
Female H04B 66.23 51.73 34.97 53.57 63.40 38.80 72.95 54.14 32.64
Female G02B 66.55 53.90 38.27 57.64 64.28 40.71 74.36 55.01 33.53
Female G03G 66.55 54.43 51.69 57.66 64.28 42.33 74.36 55.01 33.69
Female Y02D 67.55 56.13 51.99 58.54 70.15 43.23 75.33 56.65 34.49
Female G11C 68.42 59.41 52.03 58.77 72.63 47.87 75.58 56.66 36.07
Female B41J 68.44 59.65 63.45 59.04 72.65 48.74 75.59 56.66 36.20
Female G06N 72.63 60.08 63.67 59.19 74.74 48.96 79.76 56.80 37.39
Female G11B 73.45 61.45 64.37 63.01 75.05 51.30 80.17 57.57 39.62
Female Y10S 75.00 62.35 65.81 64.10 75.88 52.81 81.57 58.22 42.91
Female H04M 76.11 64.58 66.23 65.39 76.45 53.08 83.89 60.72 43.24
Female A61B 76.76 65.65 68.80 66.14 76.81 57.30 84.32 60.85 43.98
Female G09G 77.01 67.49 69.46 69.00 77.83 57.67 85.36 61.84 45.50
Male G06F 30.46 9.08 4.28 8.12 23.30 5.15 35.83 3.05 11.43
Male H04L 41.06 17.20 5.66 14.18 34.56 7.50 50.86 25.26 14.86
Male H01L 53.20 29.42 10.82 21.15 44.52 22.77 51.15 26.13 21.87
Male H04N 54.34 35.23 25.66 36.40 46.98 27.06 55.87 33.00 24.71
Male H04W 56.29 43.75 26.38 41.33 54.61 28.16 59.54 56.24 25.55
Male G06T 57.57 46.10 29.71 45.47 58.79 30.20 64.47 56.82 27.11
Male G06Q 61.85 46.67 29.93 46.90 59.38 30.98 70.67 57.09 28.21
Male G06K 63.66 48.56 33.50 49.44 60.55 32.60 73.87 57.67 29.27
Male H04B 64.05 52.34 33.72 52.01 64.33 33.59 74.42 66.34 30.09
Male G02B 64.55 54.08 37.46 54.81 65.24 34.44 76.44 66.95 31.01
Male G03G 64.56 54.51 52.75 54.82 65.25 37.13 76.44 66.95 31.30
Male Y02D 65.57 56.16 53.05 55.48 71.76 37.65 77.88 68.58 31.94
Male G11C 67.18 59.86 53.14 56.80 75.20 45.22 78.07 68.59 34.70
Male B41J 67.22 60.13 63.56 57.06 75.22 46.52 78.08 68.59 35.14
Male G06N 70.22 60.55 63.71 57.41 76.67 46.73 81.11 68.69 35.54
Male G11B 72.01 61.45 64.67 63.35 76.90 51.50 81.67 69.21 39.90
Male Y10S 74.00 62.22 66.28 64.70 77.75 53.15 84.10 69.53 43.84
Male H04M 74.80 64.57 66.47 65.89 78.31 53.47 85.91 71.22 44.06
Male A61B 75.18 65.65 68.36 66.77 78.49 55.97 86.10 71.35 45.96
Male G09G 75.50 67.67 69.36 69.84 79.66 56.51 87.12 72.01 47.24
splot(
  (com_top_org[1:20, ] - com_top_org[21:40, ]) ~ top_classes_org,
  title = "Inventor Sex Ballance within Classes between Organizations",
  laby = "Cumulative Percent of Inventor Sex Group (Female - Male)",
  labx = "CPC Class", sort = FALSE, line.type = "b", xlas = 2, mar = c(3, 2, 0, 0)
)

How have inventors migrated over time?

We previously took a quick look at dominant migrations over the full PatentsView history, but in this more current subset, we might take a close look to see if there are coherent migration patterns over time.

As before, the first step is to extract migrations:

# remove records without a county or sex
inventor_counties <- latest_inventors[
  , c("inventor_id", "pred_fem", "pred_masc", "county_fips", "date", "patent_id", "assignee_id")
]
inventor_counties <- inventor_counties[
  !is.na(inventor_counties$county_fips) & (inventor_counties$pred_fem + inventor_counties$pred_masc) != 0,
]
nrow(inventor_counties)
#> [1] 2997399

# select only those that are associated with multiple patents
inventor_counties <- inventor_counties[
  inventor_counties$inventor_id %in% inventor_counties$inventor_id[duplicated(inventor_counties$inventor_id)],
  -3
]
inventor_counties <- inventor_counties[order(inventor_counties$date, decreasing = TRUE), ]
nrow(inventor_counties)
#> [1] 2901084

## see how many unique inventors this leaves
length(unique(inventor_counties$inventor_id))
#> [1] 195489

# then look at each inventor to see if they have more than one county
# if they do, record their transition; previous county -> new county
inventor_moves_file <- paste0(outDir, "inventor_moves.csv.xz")
if (file.exists(inventor_moves_file)) {
  inventor_moves <- vroom::vroom(inventor_moves_file, show_col_types = FALSE)
} else {
  inventor_moves <- do.call(rbind, lapply(split(
    as.data.frame(inventor_counties), inventor_counties$inventor_id
  ), function(d) {
    if (!all(d$county_fips == d$county_fips[[1]])) {
      res <- NULL
      county <- d$county_fips
      for (i in seq_len(nrow(d) - 1)) {
        if (county[[i]] != county[[i + 1]]) {
          res <- rbind(
            res,
            as.data.frame(cbind(
              d[i, c("inventor_id", "patent_id", "assignee_id", "date")],
              from = county[[i + 1]],
              from_patent = d[i + 1, "patent_id"],
              from_assignee = d[i + 1, "assignee_id"],
              to = county[[i]]
            )),
            make.row.names = FALSE
          )
        }
      }
      res
    }
  }))
  vroom::vroom_write(inventor_moves, inventor_moves_file, ",")
}
unique_inventors <- as.data.frame(latest_inventors[!duplicated(latest_inventors$inventor_id), ])
rownames(unique_inventors) <- unique_inventors$inventor_id
inventor_moves$female <- unique_inventors[inventor_moves$inventor_id, "pred_fem"]

## see how many inventors have associated moves
length(unique(inventor_moves$inventor_id))
#> [1] 80159

Most Recent Moves

First, we might look at origin and destination state of moves in the latest 3 years for each sex group:

moves <- inventor_moves[inventor_moves$date > "2018-12-31", c("from", "to", "female")]
moves$from <- substr(formatC(moves$from, width = 5, flag = 0, mode = "integer"), 1, 2)
moves$to <- substr(formatC(moves$to, width = 5, flag = 0, mode = "integer"), 1, 2)
moves <- do.call(paste, moves)
moves <- tapply(moves, moves, length)
moves <- data.frame(
  do.call(rbind, strsplit(names(moves), " ", fixed = TRUE)),
  moves
)
colnames(moves) <- c("from", "to", "female", "count")
moves$female <- as.logical(moves$female)

# use these transitions to make an origin x destination matrix
## devtools::install_github("uva-bi-sdad/catchment")
library(catchment)
states <- download_census_shapes(paste0(dirname(outDir), "/maps"))
state_names <- structure(states$NAME, names = states$STATEFP)
observed_states <- sort(unique(c(moves$from, moves$to)))
migrations_female <- migrations_male <- matrix(
  0, length(observed_states), length(observed_states),
  dimnames = list(observed_states, observed_states)
)
for (r in seq_len(nrow(moves))) {
  move <- moves[r, ]
  if (move$female) {
    migrations_female[move$from, move$to] <- move$count
  } else {
    migrations_male[move$from, move$to] <- move$count
  }
}
migrations <- (migrations_female / sum(migrations_female) * 100) -
  (migrations_male / sum(migrations_male) * 100)
su <- rowSums(abs(migrations)) > 1 & colnames(migrations) %in% names(state_names)
migrations <- migrations[su, su]
migrations <- migrations[order(-rowSums(migrations)), order(-rowSums(migrations))]
migrations_female <- migrations_female[rownames(migrations), colnames(migrations)]
migrations_male <- migrations_male[rownames(migrations), colnames(migrations)]
dimnames(migrations_female) <- dimnames(migrations_male) <- rep(
  list(unname(state_names[colnames(migrations)])), 2
)
migrations_female <- migrations_female / sum(migrations_female) * 100
migrations_male <- migrations_male / sum(migrations_male) * 100

# and make a chord diagram out of the most frequently involved states
## devtools::install_github("mattflor/chorddiag")
library(chorddiag)
palette <- scico::scico(nrow(migrations), palette = "vik")

Percent of female-assigned inventor moves between 2018 and 2021:

m <- migrations_female
diag(m) <- 0
chorddiag(m, groupColors = palette, showTicks = FALSE, groupnamePadding = 5)

Percent of male-assigned inventor moves between 2018 and 2021:

m <- migrations_male
diag(m) <- 0
chorddiag(m, groupColors = palette, showTicks = FALSE, groupnamePadding = 5)
migration_difference <- migrations_female - migrations_male
order <- names(sort(-apply(migration_difference, 1, median)))
migration_difference <- data.frame(
  Difference = as.numeric(migration_difference),
  From = factor(rep(rownames(migrations_female), each = nrow(migrations)), order, ordered = TRUE),
  To = factor(colnames(migrations_female), order, ordered = TRUE)
)
library(plotly)
config(plot_ly(
  migration_difference,
  x = ~To, y = ~From, z = ~Difference,
  type = "heatmap", colors = leaflet::colorNumeric(palette, c(0, 1.35), reverse = TRUE)
), displayModeBar = FALSE, showTips = FALSE, showAxisDragHandles = FALSE)

Organization Attractiveness

One way to interpret an inventor’s move from one organization to another is that the target organization is more attractive. From this, we might derive a measure of organization attractiveness, and we might see if different organizations are attractive to the different sex groups:

# we'll only be interested in moves with associated organizations, and with different
# to and from organizations:
organization_moves <- inventor_moves[!is.na(inventor_moves$assignee_id), ]
organization_moves <- organization_moves[!is.na(organization_moves$from_assignee), ]
organization_moves <- organization_moves[
  organization_moves$assignee_id != organization_moves$from_assignee,
]
data.frame(
  moves = nrow(organization_moves),
  inventors = length(unique(organization_moves$inventor_id))
)
#>    moves inventors
#> 1 164419     53493

# now we can just see how often each organization is the destination of a move:
organization_move_frequency <- table(organization_moves$assignee_id, organization_moves$female)
rownames(organization_move_frequency) <- organizations[
  rownames(organization_move_frequency), "assignee_organization"
]
organization_move_frequency <- as.data.frame(cbind(
  organization_move_frequency,
  sweep(organization_move_frequency, 2, colSums(organization_move_frequency), "/") * 100
))
colnames(organization_move_frequency) <- c("Male", "Female", "Male_Percent", "Female_Percent")
organization_move_frequency$Difference <- organization_move_frequency$Female_Percent -
  organization_move_frequency$Male_Percent
organization_move_frequency <- organization_move_frequency[
  order(-organization_move_frequency$Difference),
]
kable(organization_move_frequency[c(1:20, 1:20 + nrow(organization_move_frequency) - 20), ], digits = 3)
Male Female Male_Percent Female_Percent Difference
APPLIED MATERIALS, INC. 691 130 0.458 0.971 0.513
THE REGENTS OF THE UNIVERSITY OF CALIFORNIA 953 150 0.631 1.120 0.489
Microsoft Technology Licensing, LLC 2629 292 1.741 2.180 0.439
QUALCOMM Incorporated 1666 193 1.103 1.441 0.338
ORACLE INTERNATIONAL CORPORATION 530 91 0.351 0.679 0.329
Board of Regents, The University of Texas System 292 68 0.193 0.508 0.314
InterDigital CE Patent Holdings 106 50 0.070 0.373 0.303
Seagate Technology LLC 339 70 0.224 0.523 0.298
AT&T INTELLECTUAL PROPERTY I, L.P. 815 110 0.540 0.821 0.282
Facebook Technologies, LLC 791 106 0.524 0.791 0.268
Rigel Pharmaceuticals, Inc. 62 41 0.041 0.306 0.265
Marvell International Ltd. 217 53 0.144 0.396 0.252
AMGEN INC. 136 45 0.090 0.336 0.246
NOVARTIS AG 271 53 0.179 0.396 0.216
Florida State University Research Foundation, Inc. 274 53 0.181 0.396 0.214
Samsung Electronics Co., Ltd. 484 70 0.320 0.523 0.202
Bristol-Myers Squibb Company 180 42 0.119 0.314 0.194
Futurewei Technologies, Inc. 305 51 0.202 0.381 0.179
INNOPHASE INC. 5 24 0.003 0.179 0.176
General Electric Company 1280 136 0.848 1.015 0.168
Eastman Kodak Company 181 6 0.120 0.045 -0.075
Covidien LP 283 15 0.187 0.112 -0.075
salesforce.com, inc. 297 16 0.197 0.119 -0.077
Apple Inc. 1887 157 1.249 1.172 -0.077
Eaton Corporation 129 1 0.085 0.007 -0.078
Deere & Company 178 5 0.118 0.037 -0.081
United Technologies Corporation 170 4 0.113 0.030 -0.083
CFPH, LLC 131 0 0.087 0.000 -0.087
Caterpillar Inc. 286 12 0.189 0.090 -0.100
Raytheon Company 263 9 0.174 0.067 -0.107
IGT 214 4 0.142 0.030 -0.112
The United States of America as represented by the Secretary of the Navy 291 10 0.193 0.075 -0.118
Motorola, Inc. 423 21 0.280 0.157 -0.123
Micron Technology, Inc. 881 58 0.583 0.433 -0.150
Amazon Technologies, Inc. 1228 88 0.813 0.657 -0.156
Ford Global Technologies, LLC 599 32 0.397 0.239 -0.158
HONEYWELL INTERNATIONAL INC. 493 20 0.326 0.149 -0.177
The Boeing Company 708 39 0.469 0.291 -0.178
Dell Products L.P. 397 11 0.263 0.082 -0.181
Texas Instruments Incorporated 832 32 0.551 0.239 -0.312

We might also look at this within each of the latest years to see if there are trends:

organization_moves$year <- substr(organization_moves$date, 1, 4)
organization_moves$assignee_id <- as.factor(organization_moves$assignee_id)
organization_yearly_difference <- do.call(rbind, lapply(2011:2021, function(year) {
  d <- organization_moves[organization_moves$year == year, ]
  m <- table(d$assignee_id, d$female)
  m <- sweep(m, 2, colSums(m), "/") * 100
  m[, 2] - m[, 1]
}))
organization_yearly_difference <- organization_yearly_difference[
  , colSums(organization_yearly_difference != 0) == 11
]

# overall trends
organization_trends <- cor(
  organization_yearly_difference, seq_len(nrow(organization_yearly_difference))^3
)[, 1]
organizations_trending <- names(sort(abs(organization_trends), TRUE)[1:9])
splot(
  organization_yearly_difference[, organizations_trending] ~ as.character(2011:2021),
  levels = list(mv = organizations[organizations_trending, "assignee_organization"]),
  line.type = "b", title = "Organization Move Target Percent Difference over Time",
  laby = "Percent Difference (Female - Male)", labx = "Year", lpos = "topright",
  leg = "inside", myl = c(-.9, 1.7)
)


# recent trends
organization_trends <- cor(
  organization_yearly_difference[(1:4) + 7, ], (1:4)^3
)[, 1]
organizations_trending <- names(sort(abs(organization_trends), TRUE)[1:9])
splot(
  organization_yearly_difference[, organizations_trending] ~ as.character(2011:2021),
  levels = list(mv = organizations[organizations_trending, "assignee_organization"]),
  line.type = "b", title = "Organization Move Target Percent Difference over Time",
  laby = "Percent Difference (Female - Male)", labx = "Year", lpos = "bottomright",
  leg = "inside", myl = c(-.8, .6)
)

Of course, this is only looking at organizational moved that coincided with physical moves.