Background

The functions and code chunks collected here fullfill a very simple purpose: to plot read coverage or related information, as well as gene annotation as coverate tracks on one canvas. This is often very useful for sequencing data that is created from large throughput pipelines, and where it is not practical to import and plot coverage etc in a genome browser such as IGV.

Libraries and test data

Packages

  • ggcoverage can be installed with latest version from github
devtools::install_github("showteeth/ggcoverage")
  • load required libraries
suppressPackageStartupMessages({
  library(tidyverse)
  library(GenomicFeatures)
  library(GenomicRanges)
  library(GenomicAlignments)
  library(ggcoverage)
  library(ggrepel)
  library(cowplot)
})

Import utility functions

  • get_coverage calculates coverage per nt (similar to bedgraph files) from GAlignments or Granges
  • track_coverage_combined plots one figure with different tracks overlaid
  • track_coverage_separate plots one figure with different tracks stacked on each other
  • track_genomic_features plots genome features from a Granges or GrangesList
  • check_gff checks *.gff files for problematic tags and replaces those
source("../source/coverage_utils.R")

Import genome annotation

  • first import the *.gff file with gene annotation
  • has to be parsed by GenomicFeatures or ORFik packages to txdb
  • then CDS, transcripts etc. can be extracted as GRanges object
# file path
genome_gff <- "../data/spyogenes_genome.gff"

# check gff file
check_gff(genome_gff, "Name=[a-zA-Z0-9\\s\\-\\_]*,", "Name=[a-zA-Z0-9\\s\\-\\_]*")
found no errors, no new GFF file exported
# import genome annotation
txdb <- makeTxDbFromGFF(genome_gff, format = "gff3")
Warning in call_fun_in_txdbmaker("makeTxDbFromGFF", ...): makeTxDbFromGFF() has moved to the txdbmaker package. Please call
  txdbmaker::makeTxDbFromGFF() to get rid of this warning.
Import genomic features from the file as a GRanges object ...
OK
Prepare the 'metadata' data frame ... OK
Make the TxDb object ... 
Warning in makeTxDbFromGRanges(gr, metadata = metadata): The following transcripts were dropped because their exon ranks could
  not be inferred (either because the exons are not on the same
  chromosome/strand or because they are not separated by introns):
  gene-SPY_RS01020, gene-SPY_RS05570
OK
# extract all CDSs as GRanges object
list_cds <- transcripts(txdb)
head(list_cds)
GRanges object with 6 ranges and 2 metadata columns:
         seqnames    ranges strand |     tx_id     tx_name
            <Rle> <IRanges>  <Rle> | <integer> <character>
  [1] NC_002737.2  232-1587      + |         1 SPY_RS00005
  [2] NC_002737.2 1742-2878      + |         2 SPY_RS00010
  [3] NC_002737.2 2953-3150      + |         3 SPY_RS00015
  [4] NC_002737.2 3480-4595      + |         4 SPY_RS00020
  [5] NC_002737.2 4665-5234      + |         5 SPY_RS00025
  [6] NC_002737.2 5237-8740      + |         6 SPY_RS00030
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

Import *.bam files

  • next import example *.bam files with read data
  • one RNASeq data file, first 5000 nt of genome
  • one RiboSeq data file, first 5000 nt of genome
rnaseq <- readGAlignments("../data/spyogenes_rnaseq.bam")
riboseq <- readGAlignments("../data/spyogenes_riboseq.bam")

head(rnaseq)
GAlignments object with 6 alignments and 0 metadata columns:
         seqnames strand       cigar    qwidth     start       end     width
            <Rle>  <Rle> <character> <integer> <integer> <integer> <integer>
  [1] NC_002737.2      +         21M        21         1        21        21
  [2] NC_002737.2      +         21M        21         1        21        21
  [3] NC_002737.2      +         22M        22         2        23        22
  [4] NC_002737.2      +         22M        22         2        23        22
  [5] NC_002737.2      +         17M        17         4        20        17
  [6] NC_002737.2      +         19M        19         4        22        19
          njunc
      <integer>
  [1]         0
  [2]         0
  [3]         0
  [4]         0
  [5]         0
  [6]         0
  -------
  seqinfo: 1 sequence from an unspecified genome
  • if read data is paired-end instead of single-end, use readGAlignmentsPairs
  • note: does not work with this example data
rnaseq <- readGAlignmentPairs("../data/spyogenes_rnaseq.bam")
riboseq <- readGAlignmentPairs("../data/spyogenes_riboseq.bam")

Get coverage for *.bam files

  • specify the appropriate data source with format (one of “bam”, “BigWig”)
  • specify a name for the track with sample
  • coverage is calculated per strand
  • a new variable sample_strand combines the two
df_coverage <- bind_rows(
  get_coverage(rnaseq, format = "bam", sample = "RNA"),
  get_coverage(riboseq, format = "bam", sample = "RIBO")
)

# turn sample names into factor for same order of tracks
df_coverage <- df_coverage %>%
  mutate(sample_strand = factor(sample_strand, unique(sample_strand)))

# print
head(df_coverage)

Import *.bigwig files

  • import *.bigwig/*.bw as an alternative to bam
  • bigwig files are strand-specific binary (compressed) coverage files
  • therefore need two files per sample, and specify strand when importing
  • similar to *.bedgraph files
rnaseq_bw_for <- rtracklayer::import(con = "../data/spyogenes_rnaseq_forward.bw", format = "BigWig")
rnaseq_bw_rev <- rtracklayer::import(con = "../data/spyogenes_rnaseq_reverse.bw", format = "BigWig")

strand(rnaseq_bw_for) <- "+"
strand(rnaseq_bw_rev) <- "-"

head(rnaseq_bw_for)
GRanges object with 6 ranges and 1 metadata column:
         seqnames    ranges strand |     score
            <Rle> <IRanges>  <Rle> | <numeric>
  [1] NC_002737.2         1      + |         2
  [2] NC_002737.2       2-3      + |         4
  [3] NC_002737.2         4      + |         7
  [4] NC_002737.2       5-7      + |         9
  [5] NC_002737.2      8-20      + |        10
  [6] NC_002737.2        21      + |         9
  -------
  seqinfo: 1 sequence from an unspecified genome

Get coverage for *.bigwig files

df_coverage_bw <- bind_rows(
  get_coverage(rnaseq_bw_for, format = "BigWig", sample = "RNA"),
  get_coverage(rnaseq_bw_rev, format = "BigWig", sample = "RNA"),
)

# turn sample names into factor for same order of tracks
df_coverage_bw <- df_coverage_bw %>%
  mutate(sample_strand = factor(sample_strand, unique(sample_strand)))

# print
head(df_coverage_bw)

Plot tracks

Plot tracks

  • plot individual coverage tracks
  • first all tracks together
track1 <- track_coverage_combined(
  df = df_coverage,
  start_coord = 0,
  end_coord = 5000,
  track_color = c("#B3B3B3", "#C9C9C9", "#7570B3", "#9C97DA")
)

track1

  • then all tracks separated by sample and strand
track2 <- track_coverage_separate(
  df = df_coverage,
  start_coord = 0,
  end_coord = 5000,
  track_color = c("#B3B3B3", "#C9C9C9", "#7570B3", "#9C97DA")
)

track2

  • all tracks can be customized by changing the theme options
track3 <- track_coverage_separate(
  df = df_coverage,
  start_coord = 0,
  end_coord = 5000,
  track_color = c("#B3B3B3", "#C9C9C9", "#7570B3", "#9C97DA")
) +
  theme_bw() +
  theme(
    panel.spacing.y = unit(-0.2, "pt"),
    axis.text.y = element_blank()
  )

track3

  • test if coverage plots for Bam and BigWig files are identical
  • they are (left: Bam, right: BigWig)
cowplot::plot_grid(
  track_coverage_separate(
    df = filter(df_coverage, sample == "RNA"),
    start_coord = 0,
    end_coord = 5000,
    track_color = c("#B3B3B3", "#C9C9C9")
  ) + theme_bw(),
  track_coverage_separate(
    df = filter(df_coverage_bw, sample == "RNA"),
    start_coord = 0,
    end_coord = 5000,
    track_color = c("#B3B3B3", "#C9C9C9")
  ) + theme_bw(),
  ncol = 2, align = "h"
)

Plot genome features

  • plot genomic features such as CDSs or transcripts
  • use a prettier theme right from the start
track4 <- track_genomic_features(
  data = as_tibble(list_cds) %>%
    filter(seqnames == unique(df_coverage$seqnames)), ,
  name_id = "tx_name",
  track_name = "GENES",
  start_coord = 0,
  end_coord = 5000,
  track_color = grey(0.4)
) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.y = element_blank()
  )

track4

Combine different tracks in one figure

  • can use ggpubr::ggarrange or cowplot::plot_grid to arrange multiple plots
cowplot::plot_grid(
  track3 + theme(plot.margin = unit(c(12, 12, 0, 12), "point")),
  track4 + theme(plot.margin = unit(c(0, 12, 12, 12), "point")),
  nrow = 2, rel_heights = c(0.7, 0.3), align = "v"
)

Plot coverage with size reduction

  • sometimes coverage of large genonomic ranges can be too dense
  • file size gets large and plotting becomes slow
  • in this case get_coverage can be used with binnin = TRUE
  • data will be binned in bin_width-wide windows, and coverage calculated by function bin_fun
  • default is to use 100 nt windows and mean() for averaging
df_coverage_binned <- get_coverage(rnaseq, format = "bam", sample = "RNA", binning = TRUE)

# turn sample names into factor for same order of tracks
df_coverage_binned <- df_coverage_binned %>%
  mutate(sample_strand = factor(sample_strand, unique(sample_strand)))

cowplot::plot_grid(
  track_coverage_separate(
    df = filter(df_coverage, sample == "RNA"),
    start_coord = 0,
    end_coord = 5000,
    track_color = c("#B3B3B3", "#C9C9C9")
  ) + theme_bw(),
  track_coverage_separate(
    df = filter(df_coverage_binned, sample == "RNA"),
    start_coord = 0,
    end_coord = 5000,
    track_color = c("#B3B3B3", "#C9C9C9")
  ) + theme_bw(),
  ncol = 2, align = "h"
)

---
title: "Plot coverage tracks for genomic data"
author: Michael Jahn
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  html_notebook:
    theme: cosmo
    toc: no
    number_sections: no
  html_document:
    toc: no
    df_print: paged
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Background

The functions and code chunks collected here fullfill a very simple purpose: 
to plot read coverage or related information, as well as gene annotation as
coverate tracks on one canvas. This is often very useful for sequencing data that
is created from large throughput pipelines, and where it is not practical to
import and plot coverage etc in a genome browser such as IGV.

## Libraries and test data

### Packages

- `ggcoverage` can be installed with latest version from github

```{r, eval = FALSE}
devtools::install_github("showteeth/ggcoverage")
```

- load required libraries

```{r}
suppressPackageStartupMessages({
  library(tidyverse)
  library(GenomicFeatures)
  library(GenomicRanges)
  library(GenomicAlignments)
  library(ggcoverage)
  library(ggrepel)
  library(cowplot)
})
```

### Import utility functions

- `get_coverage` calculates coverage per nt (similar to bedgraph files) from `GAlignments` or `Granges`
- `track_coverage_combined` plots one figure with different tracks overlaid
- `track_coverage_separate` plots one figure with different tracks stacked on each other
- `track_genomic_features` plots genome features from a `Granges` or `GrangesList`
- `check_gff` checks `*.gff` files for problematic tags and replaces those


```{r}
source("../source/coverage_utils.R")
```

### Import genome annotation

- first import the `*.gff` file with gene annotation
- has to be parsed by `GenomicFeatures` or `ORFik` packages to txdb
- then CDS, transcripts etc. can be extracted as `GRanges` object


```{r}
# file path
genome_gff <- "../data/spyogenes_genome.gff"

# check gff file
check_gff(genome_gff, "Name=[a-zA-Z0-9\\s\\-\\_]*,", "Name=[a-zA-Z0-9\\s\\-\\_]*")

# import genome annotation
txdb <- makeTxDbFromGFF(genome_gff, format = "gff3")

# extract all CDSs as GRanges object
list_cds <- transcripts(txdb)
head(list_cds)
```

### Import *.bam files

- next import example `*.bam` files with read data
- one RNASeq data file, first 5000 nt of genome
- one RiboSeq data file, first 5000 nt of genome

```{r}
rnaseq <- readGAlignments("../data/spyogenes_rnaseq.bam")
riboseq <- readGAlignments("../data/spyogenes_riboseq.bam")

head(rnaseq)
```

- if read data is **paired-end** instead of single-end, use `readGAlignmentsPairs`
- note: does not work with this example data

```{r, eval = FALSE}
rnaseq <- readGAlignmentPairs("../data/spyogenes_rnaseq.bam")
riboseq <- readGAlignmentPairs("../data/spyogenes_riboseq.bam")
```

### Get coverage for *.bam files

- specify the appropriate data source with `format` (one of "bam", "BigWig")
- specify a name for the track with `sample`
- coverage is calculated per `strand`
- a new variable `sample_strand` combines the two

```{r}
df_coverage <- bind_rows(
  get_coverage(rnaseq, format = "bam", sample = "RNA"),
  get_coverage(riboseq, format = "bam", sample = "RIBO")
)

# turn sample names into factor for same order of tracks
df_coverage <- df_coverage %>%
  mutate(sample_strand = factor(sample_strand, unique(sample_strand)))

# print
head(df_coverage)
```

### Import *.bigwig files

- import `*.bigwig`/`*.bw` as an alternative to bam
- bigwig files are strand-specific binary (compressed) coverage files
- therefore need two files per sample, and specify strand when importing
- similar to `*.bedgraph` files

```{r}
rnaseq_bw_for <- rtracklayer::import(con = "../data/spyogenes_rnaseq_forward.bw", format = "BigWig")
rnaseq_bw_rev <- rtracklayer::import(con = "../data/spyogenes_rnaseq_reverse.bw", format = "BigWig")

strand(rnaseq_bw_for) <- "+"
strand(rnaseq_bw_rev) <- "-"

head(rnaseq_bw_for)
```

### Get coverage for  *.bigwig files

```{r}
df_coverage_bw <- bind_rows(
  get_coverage(rnaseq_bw_for, format = "BigWig", sample = "RNA"),
  get_coverage(rnaseq_bw_rev, format = "BigWig", sample = "RNA"),
)

# turn sample names into factor for same order of tracks
df_coverage_bw <- df_coverage_bw %>%
  mutate(sample_strand = factor(sample_strand, unique(sample_strand)))

# print
head(df_coverage_bw)
```

## Plot tracks

### Plot tracks

- plot individual coverage tracks
- first all tracks together

```{r, fig.width = 7.5, fig.height = 3.5}
track1 <- track_coverage_combined(
  df = df_coverage,
  start_coord = 0,
  end_coord = 5000,
  track_color = c("#B3B3B3", "#C9C9C9", "#7570B3", "#9C97DA")
)

track1
```

- then all tracks separated by `sample` and `strand`

```{r, fig.width = 7.5, fig.height = 3.5}
track2 <- track_coverage_separate(
  df = df_coverage,
  start_coord = 0,
  end_coord = 5000,
  track_color = c("#B3B3B3", "#C9C9C9", "#7570B3", "#9C97DA")
)

track2
```

- all tracks can be customized by changing the `theme` options

```{r, fig.width = 7.5, fig.height = 3.5}
track3 <- track_coverage_separate(
  df = df_coverage,
  start_coord = 0,
  end_coord = 5000,
  track_color = c("#B3B3B3", "#C9C9C9", "#7570B3", "#9C97DA")
) +
  theme_bw() +
  theme(
    panel.spacing.y = unit(-0.2, "pt"),
    axis.text.y = element_blank()
  )

track3
```

- test if coverage plots for Bam and BigWig files are identical
- they are (left: Bam, right: BigWig)

```{r, fig.width = 7.5, fig.height = 3.5}
cowplot::plot_grid(
  track_coverage_separate(
    df = filter(df_coverage, sample == "RNA"),
    start_coord = 0,
    end_coord = 5000,
    track_color = c("#B3B3B3", "#C9C9C9")
  ) + theme_bw(),
  track_coverage_separate(
    df = filter(df_coverage_bw, sample == "RNA"),
    start_coord = 0,
    end_coord = 5000,
    track_color = c("#B3B3B3", "#C9C9C9")
  ) + theme_bw(),
  ncol = 2, align = "h"
)
```


### Plot genome features

- plot genomic features such as CDSs or transcripts
- use a prettier theme right from the start

```{r, fig.width = 7.5, fig.height = 1.2}
track4 <- track_genomic_features(
  data = as_tibble(list_cds) %>%
    filter(seqnames == unique(df_coverage$seqnames)), ,
  name_id = "tx_name",
  track_name = "GENES",
  start_coord = 0,
  end_coord = 5000,
  track_color = grey(0.4)
) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.y = element_blank()
  )

track4
```

### Combine different tracks in one figure

- can use `ggpubr::ggarrange` or `cowplot::plot_grid` to arrange multiple plots

```{r}
cowplot::plot_grid(
  track3 + theme(plot.margin = unit(c(12, 12, 0, 12), "point")),
  track4 + theme(plot.margin = unit(c(0, 12, 12, 12), "point")),
  nrow = 2, rel_heights = c(0.7, 0.3), align = "v"
)
```


### Plot coverage with size reduction

- sometimes coverage of large genonomic ranges can be too dense
- file size gets large and plotting becomes slow
- in this case `get_coverage` can be used with `binnin = TRUE`
- data will be binned in `bin_width`-wide windows, and coverage calculated by function `bin_fun`
- default is to use 100 nt windows and `mean()` for averaging

```{r, fig.width = 7.5, fig.height = 3.5}
df_coverage_binned <- get_coverage(rnaseq, format = "bam", sample = "RNA", binning = TRUE)

# turn sample names into factor for same order of tracks
df_coverage_binned <- df_coverage_binned %>%
  mutate(sample_strand = factor(sample_strand, unique(sample_strand)))

cowplot::plot_grid(
  track_coverage_separate(
    df = filter(df_coverage, sample == "RNA"),
    start_coord = 0,
    end_coord = 5000,
    track_color = c("#B3B3B3", "#C9C9C9")
  ) + theme_bw(),
  track_coverage_separate(
    df = filter(df_coverage_binned, sample == "RNA"),
    start_coord = 0,
    end_coord = 5000,
    track_color = c("#B3B3B3", "#C9C9C9")
  ) + theme_bw(),
  ncol = 2, align = "h"
)
```
