Background

Often in biology we encounter situations where one gene maps to different pathways, or different pathways to the same gene. I other words, gene-pathway relationships are member-category relationships that can have a large degree of redundancy or overlap. Some pathways/categories might be highly similar when comparing the list of their members. It is often useful to know the degree of overlap between pathways/categories, and optionally reduce the list of pathways to leave only one representative instead of multiple highly similar ones.

This document walks through one example where we determine overlap between pathways of the [subtiwiki pathway categories](http://www.subtiwiki.uni-goettingen.de/v3/category/ for the bacterium Bacillus subtilis.

If you find a built-in function in an R package that will do this automatically, please leave a message.

Libraries and test data

suppressPackageStartupMessages({
  library(tidyverse)
})
df_subtiwiki <- read_csv("../data/geneCategories-2022-09-02.csv",
  name_repair = function(x) str_replace_all(x, " ", "_"),
  show_col_types = FALSE)

head(df_subtiwiki)

Calculate overlap

mat_categories <- df_subtiwiki %>%
  mutate(category = str_sub(category, 1, 25)) %>%
  select(category, gene_locus) %>%
  distinct() %>% 
  mutate(dummy = 1) %>%
  pivot_wider(id_cols = category, names_from = gene_locus,
    values_from = dummy, values_fill = 0) %>%
  column_to_rownames("category") %>%
  as.matrix

mat_categories[1:5, 1:5]
                      BSU_00010 BSU_00020 BSU_00030 BSU_00040 BSU_00050
DNA replication               1         1         0         0         0
Essential genes               1         1         0         0         0
Mismatch repair (MMR)         0         1         0         0         0
Ribosome assembly             0         0         1         0         0
Cell shape                    0         0         1         0         0
hclust_cat <- dist(mat_categories[1:50, ], method = "binary") %>%
  hclust(method = "complete")

plot(hclust_cat, cex = 0.6, main = NA)

cutree(hclust_cat, h = 0.5) %>%
  {duplicated(.) | rev(duplicated(rev(.)))} %>%
  which %>% hclust_cat$labels[.]
[1] "DNA repair/ recombination" "Other proteins"           
dist(mat_categories[1:50, ], method = "binary") %>%
  as.matrix %>%
  heatmap

dist(mat_categories[1:50, ], method = "binary") %>%
  as.matrix %>%
  as.data.frame %>%
  mutate(across(everything(), ~ replace(.x, .x == 0, NA))) %>%
  filter(if_any(everything(), ~ .x < 0.5))
LS0tCnRpdGxlOiAiUXVhbnRpZml5aW5nIG92ZXJsYXAgaW4gdGhlIGNvbXBvc2l0aW9uIG9mIGNhdGVnb3JpZXMiCmF1dGhvcjogTWljaGFlbCBKYWhuCmRhdGU6ICJgciBmb3JtYXQoU3lzLnRpbWUoKSwgJyVkICVCLCAlWScpYCIKb3V0cHV0OgogIGh0bWxfbm90ZWJvb2s6CiAgICB0aGVtZTogY29zbW8KICAgIHRvYzogbm8KICAgIG51bWJlcl9zZWN0aW9uczogbm8KICBodG1sX2RvY3VtZW50OgogICAgdG9jOiBubwogICAgZGZfcHJpbnQ6IHBhZ2VkCi0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSkKYGBgCgojIyMgQmFja2dyb3VuZAoKT2Z0ZW4gaW4gYmlvbG9neSB3ZSBlbmNvdW50ZXIgc2l0dWF0aW9ucyB3aGVyZSBvbmUgZ2VuZSBtYXBzIHRvIGRpZmZlcmVudCBwYXRod2F5cywgb3IgZGlmZmVyZW50IHBhdGh3YXlzIHRvIHRoZSBzYW1lIGdlbmUuCkkgb3RoZXIgd29yZHMsIGdlbmUtcGF0aHdheSByZWxhdGlvbnNoaXBzIGFyZSBtZW1iZXItY2F0ZWdvcnkgcmVsYXRpb25zaGlwcyB0aGF0IGNhbiBoYXZlIGEgbGFyZ2UgZGVncmVlIG9mIHJlZHVuZGFuY3kgb3Igb3ZlcmxhcC4gU29tZSBwYXRod2F5cy9jYXRlZ29yaWVzIG1pZ2h0IGJlIGhpZ2hseSBzaW1pbGFyIHdoZW4gY29tcGFyaW5nIHRoZSBsaXN0IG9mIHRoZWlyIG1lbWJlcnMuIEl0IGlzIG9mdGVuIHVzZWZ1bCB0byBrbm93IHRoZSBkZWdyZWUgb2Ygb3ZlcmxhcCBiZXR3ZWVuIHBhdGh3YXlzL2NhdGVnb3JpZXMsIGFuZCBvcHRpb25hbGx5IHJlZHVjZSB0aGUgbGlzdCBvZiBwYXRod2F5cyB0byBsZWF2ZSBvbmx5IG9uZSByZXByZXNlbnRhdGl2ZSBpbnN0ZWFkIG9mIG11bHRpcGxlIGhpZ2hseSBzaW1pbGFyIG9uZXMuCgpUaGlzIGRvY3VtZW50IHdhbGtzIHRocm91Z2ggb25lIGV4YW1wbGUgd2hlcmUgd2UgZGV0ZXJtaW5lIG92ZXJsYXAgYmV0d2VlbiBwYXRod2F5cyBvZiB0aGUgW3N1YnRpd2lraSBwYXRod2F5IGNhdGVnb3JpZXNdKGh0dHA6Ly93d3cuc3VidGl3aWtpLnVuaS1nb2V0dGluZ2VuLmRlL3YzL2NhdGVnb3J5LyBmb3IgdGhlIGJhY3Rlcml1bSAqQmFjaWxsdXMgc3VidGlsaXMqLgoKSWYgeW91IGZpbmQgYSBidWlsdC1pbiBmdW5jdGlvbiBpbiBhbiBSIHBhY2thZ2UgdGhhdCB3aWxsIGRvIHRoaXMgYXV0b21hdGljYWxseSwgcGxlYXNlIFtsZWF2ZSBhIG1lc3NhZ2VdKGh0dHBzOi8vZ2l0aHViLmNvbS9NUFVTUC9iaW9pbmZvLWNvZGUtY2h1bmtzL2lzc3VlcykuCgojIyMgTGlicmFyaWVzIGFuZCB0ZXN0IGRhdGEKCi0gbG9hZCByZXF1aXJlZCBsaWJyYXJpZXMKCmBgYHtyfQpzdXBwcmVzc1BhY2thZ2VTdGFydHVwTWVzc2FnZXMoewogIGxpYnJhcnkodGlkeXZlcnNlKQp9KQpgYGAKCi0gaW1wb3J0IGNhdGVnb3J5IHRlc3QgZGF0YSAoZnJvbSBzdWJ0aXdpa2kpCgpgYGB7cn0KZGZfc3VidGl3aWtpIDwtIHJlYWRfY3N2KCIuLi9kYXRhL2dlbmVDYXRlZ29yaWVzLTIwMjItMDktMDIuY3N2IiwKICBuYW1lX3JlcGFpciA9IGZ1bmN0aW9uKHgpIHN0cl9yZXBsYWNlX2FsbCh4LCAiICIsICJfIiksCiAgc2hvd19jb2xfdHlwZXMgPSBGQUxTRSkKCmhlYWQoZGZfc3VidGl3aWtpKQpgYGAKCiMjIyBDYWxjdWxhdGUgb3ZlcmxhcAoKLSBpbiBvcmRlciB0byBlc3RpbWF0ZSBvdmVybGFwLCB0aGUgZGF0YSBpcyB0cmFuc2Zvcm1lZCBpbnRvIGEgMi1kaW1lbnNpb25hbCBtYXRyaXggd2l0aCBwYXRod2F5cyBvbiByb3dzLCBnZW5lcyBvbiBjb2x1bW5zCi0gdGhlIG1hdHJpeCBpcyBhIGJpbmFyeSBtYXRyaXggZmlsbGVkIHdpdGggbWVtYmVyc2hpcCByZWxhdGlvbnMgKDEgLSBtZW1iZXIsIDAgLSBubyBtZW1iZXIpCgpgYGB7cn0KbWF0X2NhdGVnb3JpZXMgPC0gZGZfc3VidGl3aWtpICU+JQogIG11dGF0ZShjYXRlZ29yeSA9IHN0cl9zdWIoY2F0ZWdvcnksIDEsIDI1KSkgJT4lCiAgc2VsZWN0KGNhdGVnb3J5LCBnZW5lX2xvY3VzKSAlPiUKICBkaXN0aW5jdCgpICU+JSAKICBtdXRhdGUoZHVtbXkgPSAxKSAlPiUKICBwaXZvdF93aWRlcihpZF9jb2xzID0gY2F0ZWdvcnksIG5hbWVzX2Zyb20gPSBnZW5lX2xvY3VzLAogICAgdmFsdWVzX2Zyb20gPSBkdW1teSwgdmFsdWVzX2ZpbGwgPSAwKSAlPiUKICBjb2x1bW5fdG9fcm93bmFtZXMoImNhdGVnb3J5IikgJT4lCiAgYXMubWF0cml4CgptYXRfY2F0ZWdvcmllc1sxOjUsIDE6NV0KYGBgCgotIHJlZHVjZSByZXN1bHQgdG8gNTAgZmlyc3QgY2F0ZWdvcmllcwotIHRoZSBiaW5hcnkgbWF0cml4ICBpcyBjbHVzdGVyZWQgYnkgc2ltaWxhcml0eSBhbmQgcGxvdHRlZCBhcyBkZW5kcm9ncmFtCgpgYGB7cn0KaGNsdXN0X2NhdCA8LSBkaXN0KG1hdF9jYXRlZ29yaWVzWzE6NTAsIF0sIG1ldGhvZCA9ICJiaW5hcnkiKSAlPiUKICBoY2x1c3QobWV0aG9kID0gImNvbXBsZXRlIikKCnBsb3QoaGNsdXN0X2NhdCwgY2V4ID0gMC42LCBtYWluID0gTkEpCmBgYAoKCi0gdGhlIGJpbmFyeSBkaXN0YW5jZSBtZXRyaWMgc2hvd3MgdGhhdCwgZm9yIGV4YW1wbGUsIHR3byBwYXRod2F5cyB3aXRoIGRpc3RhbmNlIDAuNSBoYXZlIDUwJSBvZiB0aGVpciBtZW1iZXIgZ2VuZXMgaW4gY29tbW9uCi0gd2UgY2FuIGZpbmQgdGhlc2UgYnkgImN1dHRpbmciIHRoZSBkZW5kcm9ncmFtIGF0IGhlaWdodCA9IDAuNQoKYGBge3J9CmN1dHJlZShoY2x1c3RfY2F0LCBoID0gMC41KSAlPiUKICB7ZHVwbGljYXRlZCguKSB8IHJldihkdXBsaWNhdGVkKHJldiguKSkpfSAlPiUKICB3aGljaCAlPiUgaGNsdXN0X2NhdCRsYWJlbHNbLl0KYGBgCgotIHdlIGNhbiBhbHNvIGNhbGN1bGF0ZSB0aGUgZGlzdGFuY2UgbWV0cmljIGRpcmVjdGx5IGFuZCBmaW5kIHNpbWlsYXIgcGF0aHdheXMgYnkgZmlsdGVyaW5nCi0gbm90ZTogdGhlIGhlYXRtYXAgc2hvd3Mgb25seSBldmVyeSBzZWNvbmQgbGFiZWwgZHVlIHRvIHNpemUgcmVzdHJpY3Rpb25zCgpgYGB7ciwgZmlnLndpZHRoID0gNS41LCBmaWcuaGVpZ2h0ID0gNS41fQpkaXN0KG1hdF9jYXRlZ29yaWVzWzE6NTAsIF0sIG1ldGhvZCA9ICJiaW5hcnkiKSAlPiUKICBhcy5tYXRyaXggJT4lCiAgaGVhdG1hcApgYGAKLSBmaWx0ZXIgY2F0ZWdvcmllcyAocm93cykgd2hlcmUgZGlzdCBtZXRyaWMgZmFsbHMgYmVsb3cgYSB0aHJlc2hvbGQgb2YgMC41Ci0gc2FtZSByZXN1bHQgYXMgZGVuZHJvZ3JhbQoKYGBge3J9CmRpc3QobWF0X2NhdGVnb3JpZXNbMTo1MCwgXSwgbWV0aG9kID0gImJpbmFyeSIpICU+JQogIGFzLm1hdHJpeCAlPiUKICBhcy5kYXRhLmZyYW1lICU+JQogIG11dGF0ZShhY3Jvc3MoZXZlcnl0aGluZygpLCB+IHJlcGxhY2UoLngsIC54ID09IDAsIE5BKSkpICU+JQogIGZpbHRlcihpZl9hbnkoZXZlcnl0aGluZygpLCB+IC54IDwgMC41KSkKYGBgCgoK