Fast gene set enrichment
Background
In contrast to a simple gene set enrichment with the hypergeometric
test, fast gene set enrichment (fgsea
)
does not only test for over- or under-representation of items (groups)
in a defined global set, but also takes a quantitative score (expression
strength, fold change, etc) into account.
This document walks through one example where the the Bioconductor
package fgsea
is used to determine gene enrichment for
pathways of the [subtiwiki pathway categories](http://www.subtiwiki.uni-goettingen.de/v3/category/ for
the bacterium Bacillus subtilis.
Libraries and test data
- Install
fgsea
package from Bioconductor
BiocManager::install("fgsea")
suppressPackageStartupMessages({
library(fgsea)
library(tidyverse)
})
- import category test data (from subtiwiki)
df_subtiwiki <- read_csv("../data/geneCategories-2022-09-02.csv",
name_repair = function(x) str_replace_all(x, " ", "_"),
show_col_types = FALSE)
head(df_subtiwiki)
fgsea
requires a gene expression matrix and
gene-pathway annotation as input
- can calculate arbitrarily small p-values using iterative random
sampling
- bring input into right format
df_pathway <- df_subtiwiki %>%
select(-gene_name, -category_id) %>%
distinct() %>%
rename(locus_tag = gene_locus) %>%
mutate(locus_tag = gsub("_", "", locus_tag))
df_expression <- df_pathway %>%
select(locus_tag) %>%
distinct() %>%
mutate(condition = "test", log2FC = rnorm(n(), 0, sd = 3))
list_fgsea_pathways <- df_pathway %>%
group_by(category) %>%
summarize(locus_tag = list(locus_tag)) %>%
deframe
head(list_fgsea_pathways, n = 3)
$`6S RNA`
[1] "BSUmiscRNA32" "BSUmiscRNA41"
$`A/P endonucleases`
[1] "BSU22340" "BSU25130" "BSU40880"
$`ABC transporters for the uptake of iron/ siderophores`
[1] "BSU01610" "BSU01620" "BSU01630" "BSU03800" "BSU03810" "BSU03820" "BSU03830" "BSU07490" "BSU07500" "BSU07510" "BSU07520"
[12] "BSU08440" "BSU08450" "BSU08460" "BSU10330" "BSU32940" "BSU33290" "BSU33300" "BSU33310" "BSU33320" "BSU39610"
Generalized function to run fgsea
- we can make a generalized function that can run
fgsea
on arbitrary data
- can run on many categories at once
- takes as input:
gene_table
: data frame with columns
condition
, log2FC
, locus_tag
conditions
: optional vector of conditions where tests
are run independently; requires a matching column in
gene_table
named condition
pathways
: named list of pathways containing vectors of
locus_tag
s
collapse
: logical that defines whether similar pathways
should be collapsed
fgsea_multiple <- function(
conditions, gene_table, pathways,
minSize = 5, maxSize = 200, collapse = TRUE) {
result <- lapply(conditions,
function(cond){
genes <- gene_table %>%
filter(condition == cond, !is.na(log2FC), !is.na(locus_tag)) %>%
select(locus_tag, log2FC) %>%
deframe
df_fgsea <- fgsea(
pathways, genes,
minSize = minSize, maxSize = maxSize
)
if (collapse) {
main_pws <- collapsePathways(
df_fgsea,
pathways,
genes
)$mainPathways
df_fgsea <- filter(df_fgsea, pathway %in% main_pws)
}
return(df_fgsea)
})
names(result) <- conditions
return(bind_rows(result, .id = "condition"))
}
Run test and inspect results
- set a seed to obtain same result every time
set.seed(123)
fgsea_result <- fgsea_multiple(
conditions = "test",
gene_table = df_expression,
pathways = list_fgsea_pathways
)
- expected result: adjusted p-values should not be significant (no
enrichment of a specific pathway)
fgsea_result %>%
arrange(padj) %>%
head(n = 5)
- what happens if one pathway has higher log2 FC?
- it suddenly becomes highly significant
df_expression <- df_expression %>%
mutate(log2FC = case_when(
locus_tag %in% list_fgsea_pathways[["Biosynthesis of peptidoglycan"]] ~ rnorm(n()) - 6,
.default = log2FC
))
set.seed(123)
fgsea_result <- fgsea_multiple(
conditions = "test",
gene_table = df_expression,
pathways = list_fgsea_pathways
)
fgsea_result %>%
arrange(padj) %>%
head(n = 5)
LS0tCnRpdGxlOiAiRmFzdCBnZW5lIHNldCBlbnJpY2htZW50IHdpdGggUiIKYXV0aG9yOiBNaWNoYWVsIEphaG4KZGF0ZTogImByIGZvcm1hdChTeXMudGltZSgpLCAnJWQgJUIsICVZJylgIgpvdXRwdXQ6CiAgaHRtbF9ub3RlYm9vazoKICAgIHRoZW1lOiBjb3NtbwogICAgdG9jOiBubwogICAgbnVtYmVyX3NlY3Rpb25zOiBubwogIGh0bWxfZG9jdW1lbnQ6CiAgICB0b2M6IG5vCiAgICBkZl9wcmludDogcGFnZWQKLS0tCgpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFKQpgYGAKCiMjIEZhc3QgZ2VuZSBzZXQgZW5yaWNobWVudAoKIyMjIEJhY2tncm91bmQKCkluIGNvbnRyYXN0IHRvIGEgc2ltcGxlIGdlbmUgc2V0IGVucmljaG1lbnQgd2l0aCB0aGUgaHlwZXJnZW9tZXRyaWMgdGVzdCwgZmFzdCBnZW5lIHNldCBlbnJpY2htZW50ICgqKmBmZ3NlYWAqKikgZG9lcyBub3Qgb25seSB0ZXN0IGZvciBvdmVyLSBvciB1bmRlci1yZXByZXNlbnRhdGlvbiBvZiBpdGVtcyAoZ3JvdXBzKSBpbiBhIGRlZmluZWQgZ2xvYmFsIHNldCwgYnV0IGFsc28gdGFrZXMgYSBxdWFudGl0YXRpdmUgc2NvcmUgKGV4cHJlc3Npb24gc3RyZW5ndGgsIGZvbGQgY2hhbmdlLCBldGMpIGludG8gYWNjb3VudC4KClRoaXMgZG9jdW1lbnQgd2Fsa3MgdGhyb3VnaCBvbmUgZXhhbXBsZSB3aGVyZSB0aGUgdGhlIEJpb2NvbmR1Y3RvciBwYWNrYWdlIGBmZ3NlYWAgaXMgdXNlZCB0byBkZXRlcm1pbmUgZ2VuZSBlbnJpY2htZW50IGZvciBwYXRod2F5cyBvZiB0aGUgW3N1YnRpd2lraSBwYXRod2F5IGNhdGVnb3JpZXNdKGh0dHA6Ly93d3cuc3VidGl3aWtpLnVuaS1nb2V0dGluZ2VuLmRlL3YzL2NhdGVnb3J5LyBmb3IgdGhlIGJhY3Rlcml1bSAqQmFjaWxsdXMgc3VidGlsaXMqLgoKIyMjIExpYnJhcmllcyBhbmQgdGVzdCBkYXRhCgotIEluc3RhbGwgYGZnc2VhYCBwYWNrYWdlIGZyb20gQmlvY29uZHVjdG9yCgpgYGB7ciwgZXZhbCA9IEZBTFNFfQpCaW9jTWFuYWdlcjo6aW5zdGFsbCgiZmdzZWEiKQpgYGAKCgotIGxvYWQgcmVxdWlyZWQgbGlicmFyaWVzCgpgYGB7cn0Kc3VwcHJlc3NQYWNrYWdlU3RhcnR1cE1lc3NhZ2VzKHsKICBsaWJyYXJ5KGZnc2VhKQogIGxpYnJhcnkodGlkeXZlcnNlKQp9KQpgYGAKCi0gaW1wb3J0IGNhdGVnb3J5IHRlc3QgZGF0YSAoZnJvbSBzdWJ0aXdpa2kpCgpgYGB7cn0KZGZfc3VidGl3aWtpIDwtIHJlYWRfY3N2KCIuLi9kYXRhL2dlbmVDYXRlZ29yaWVzLTIwMjItMDktMDIuY3N2IiwKICBuYW1lX3JlcGFpciA9IGZ1bmN0aW9uKHgpIHN0cl9yZXBsYWNlX2FsbCh4LCAiICIsICJfIiksCiAgc2hvd19jb2xfdHlwZXMgPSBGQUxTRSkKCmhlYWQoZGZfc3VidGl3aWtpKQpgYGAKCi0gYGZnc2VhYCByZXF1aXJlcyBhIGdlbmUgZXhwcmVzc2lvbiBtYXRyaXggYW5kIGdlbmUtcGF0aHdheSBhbm5vdGF0aW9uIGFzIGlucHV0Ci0gY2FuIGNhbGN1bGF0ZSBhcmJpdHJhcmlseSBzbWFsbCBwLXZhbHVlcyB1c2luZyBpdGVyYXRpdmUgcmFuZG9tIHNhbXBsaW5nCi0gYnJpbmcgaW5wdXQgaW50byByaWdodCBmb3JtYXQKCmBgYHtyfQpkZl9wYXRod2F5IDwtIGRmX3N1YnRpd2lraSAlPiUKICBzZWxlY3QoLWdlbmVfbmFtZSwgLWNhdGVnb3J5X2lkKSAlPiUKICBkaXN0aW5jdCgpICU+JQogIHJlbmFtZShsb2N1c190YWcgPSBnZW5lX2xvY3VzKSAlPiUKICBtdXRhdGUobG9jdXNfdGFnID0gZ3N1YigiXyIsICIiLCBsb2N1c190YWcpKQoKZGZfZXhwcmVzc2lvbiA8LSBkZl9wYXRod2F5ICU+JQogIHNlbGVjdChsb2N1c190YWcpICU+JQogIGRpc3RpbmN0KCkgJT4lCiAgbXV0YXRlKGNvbmRpdGlvbiA9ICJ0ZXN0IiwgbG9nMkZDID0gcm5vcm0obigpLCAwLCBzZCA9IDMpKQoKbGlzdF9mZ3NlYV9wYXRod2F5cyA8LSBkZl9wYXRod2F5ICU+JQogICAgZ3JvdXBfYnkoY2F0ZWdvcnkpICU+JQogICAgc3VtbWFyaXplKGxvY3VzX3RhZyA9IGxpc3QobG9jdXNfdGFnKSkgJT4lCiAgICBkZWZyYW1lCgpoZWFkKGxpc3RfZmdzZWFfcGF0aHdheXMsIG4gPSAzKQpgYGAKCiMjIyBHZW5lcmFsaXplZCBmdW5jdGlvbiB0byBydW4gYGZnc2VhYCAKCi0gd2UgY2FuIG1ha2UgYSBnZW5lcmFsaXplZCBmdW5jdGlvbiB0aGF0IGNhbiBydW4gYGZnc2VhYCBvbiBhcmJpdHJhcnkgZGF0YQotIGNhbiBydW4gb24gbWFueSBjYXRlZ29yaWVzIGF0IG9uY2UKLSB0YWtlcyBhcyBpbnB1dDoKICAtIGBnZW5lX3RhYmxlYDogZGF0YSBmcmFtZSB3aXRoIGNvbHVtbnMgYGNvbmRpdGlvbmAsIGBsb2cyRkNgLCBgbG9jdXNfdGFnYAogIC0gYGNvbmRpdGlvbnNgOiBvcHRpb25hbCB2ZWN0b3Igb2YgY29uZGl0aW9ucyB3aGVyZSB0ZXN0cyBhcmUgcnVuIGluZGVwZW5kZW50bHk7IHJlcXVpcmVzIGEgbWF0Y2hpbmcgY29sdW1uIGluIGBnZW5lX3RhYmxlYCBuYW1lZCBgY29uZGl0aW9uYAogIC0gYHBhdGh3YXlzYDogbmFtZWQgbGlzdCBvZiBwYXRod2F5cyBjb250YWluaW5nIHZlY3RvcnMgb2YgYGxvY3VzX3RhZ2BzCiAgLSBgY29sbGFwc2VgOiBsb2dpY2FsIHRoYXQgZGVmaW5lcyB3aGV0aGVyIHNpbWlsYXIgcGF0aHdheXMgc2hvdWxkIGJlIGNvbGxhcHNlZAogIAoKYGBge3J9CmZnc2VhX211bHRpcGxlIDwtIGZ1bmN0aW9uKAogICAgY29uZGl0aW9ucywgZ2VuZV90YWJsZSwgcGF0aHdheXMsCiAgICBtaW5TaXplID0gNSwgbWF4U2l6ZSA9IDIwMCwgY29sbGFwc2UgPSBUUlVFKSB7CiAgcmVzdWx0IDwtIGxhcHBseShjb25kaXRpb25zLAogICAgZnVuY3Rpb24oY29uZCl7CiAgICAgIGdlbmVzIDwtIGdlbmVfdGFibGUgJT4lCiAgICAgICAgZmlsdGVyKGNvbmRpdGlvbiA9PSBjb25kLCAhaXMubmEobG9nMkZDKSwgIWlzLm5hKGxvY3VzX3RhZykpICU+JQogICAgICAgIHNlbGVjdChsb2N1c190YWcsIGxvZzJGQykgJT4lCiAgICAgICAgZGVmcmFtZQogICAgICBkZl9mZ3NlYSA8LSBmZ3NlYSgKICAgICAgICBwYXRod2F5cywgZ2VuZXMsCiAgICAgICAgbWluU2l6ZSA9IG1pblNpemUsIG1heFNpemUgPSBtYXhTaXplCiAgICAgICkKICAgICAgaWYgKGNvbGxhcHNlKSB7CiAgICAgICAgICBtYWluX3B3cyA8LSBjb2xsYXBzZVBhdGh3YXlzKAogICAgICAgICAgZGZfZmdzZWEsCiAgICAgICAgICBwYXRod2F5cywKICAgICAgICAgIGdlbmVzCiAgICAgICAgKSRtYWluUGF0aHdheXMKICAgICAgICBkZl9mZ3NlYSA8LSBmaWx0ZXIoZGZfZmdzZWEsIHBhdGh3YXkgJWluJSBtYWluX3B3cykKICAgICAgfQogICAgICByZXR1cm4oZGZfZmdzZWEpCiAgfSkKICBuYW1lcyhyZXN1bHQpIDwtIGNvbmRpdGlvbnMKICByZXR1cm4oYmluZF9yb3dzKHJlc3VsdCwgLmlkID0gImNvbmRpdGlvbiIpKQp9CmBgYAoKCiMjIyBSdW4gdGVzdCBhbmQgaW5zcGVjdCByZXN1bHRzCgotIHNldCBhIHNlZWQgdG8gb2J0YWluIHNhbWUgcmVzdWx0IGV2ZXJ5IHRpbWUKCmBgYHtyfQpzZXQuc2VlZCgxMjMpCgpmZ3NlYV9yZXN1bHQgPC0gZmdzZWFfbXVsdGlwbGUoCiAgY29uZGl0aW9ucyA9ICJ0ZXN0IiwKICBnZW5lX3RhYmxlID0gZGZfZXhwcmVzc2lvbiwKICBwYXRod2F5cyA9IGxpc3RfZmdzZWFfcGF0aHdheXMKKQpgYGAKCi0gZXhwZWN0ZWQgcmVzdWx0OiBhZGp1c3RlZCBwLXZhbHVlcyBzaG91bGQgbm90IGJlIHNpZ25pZmljYW50IChubyBlbnJpY2htZW50IG9mIGEgc3BlY2lmaWMgcGF0aHdheSkKCmBgYHtyfQpmZ3NlYV9yZXN1bHQgJT4lCiAgYXJyYW5nZShwYWRqKSAlPiUKICBoZWFkKG4gPSA1KQpgYGAKCi0gd2hhdCBoYXBwZW5zIGlmIG9uZSBwYXRod2F5IGhhcyBoaWdoZXIgbG9nMiBGQz8KLSBpdCBzdWRkZW5seSBiZWNvbWVzIGhpZ2hseSBzaWduaWZpY2FudAoKYGBge3J9CmRmX2V4cHJlc3Npb24gPC0gZGZfZXhwcmVzc2lvbiAlPiUKICBtdXRhdGUobG9nMkZDID0gY2FzZV93aGVuKAogICAgbG9jdXNfdGFnICVpbiUgbGlzdF9mZ3NlYV9wYXRod2F5c1tbIkJpb3N5bnRoZXNpcyBvZiBwZXB0aWRvZ2x5Y2FuIl1dIH4gcm5vcm0obigpKSAtIDYsCiAgICAuZGVmYXVsdCA9IGxvZzJGQwogICkpCmBgYAoKCmBgYHtyfQpzZXQuc2VlZCgxMjMpCgpmZ3NlYV9yZXN1bHQgPC0gZmdzZWFfbXVsdGlwbGUoCiAgY29uZGl0aW9ucyA9ICJ0ZXN0IiwKICBnZW5lX3RhYmxlID0gZGZfZXhwcmVzc2lvbiwKICBwYXRod2F5cyA9IGxpc3RfZmdzZWFfcGF0aHdheXMKKQoKZmdzZWFfcmVzdWx0ICU+JQogIGFycmFuZ2UocGFkaikgJT4lCiAgaGVhZChuID0gNSkKYGBgCgo=