Here is an example of mapping CHR:POS:REF:ALT columns to dbSNP build 156 RSIDs.

library(genepi.utils)

gwas <- data.table::fread(system.file("extdata", "example_gwas_sumstats.tsv", package="genepi.utils"))

head(gwas)
gwas_with_rsids <- genepi.utils::chrpos_to_rsid(gwas,
                                                chr_col = "chromosome",
                                                pos_col = "base_pair_location",
                                                ea_col  = "effect_allele",
                                                nea_col = "other_allele",
                                                build   = "b37_dbsnp156")

head(gwas_with_rsids)

#>    chromosome base_pair_location effect_allele other_allele       RSID
#>         <int>              <int>        <char>       <char>     <char>
#> 1:         10          100000625             G            A  rs7899632
#> 2:         10          100000645             C            A rs61875309
#> 3:         10          100003242             G            T rs12258651
#> 4:         10          100003785             C            T  rs1359508
#> 5:         10          100004360             A            G  rs1048754

Available builds

Currently we have the latest dbSNP build 156 available for hg37 and hg38.

Monitoring progress

The mapping can take some time, depending on the number of cores on your machine. To monitor progress we have to use the form below. We need to wrap the function in progressr::with_progress({ }).

progressr::with_progress({
  
  gwas_with_rsids <- genepi.utils::chrpos_to_rsid(gwas,
                                                  chr_col = "chromosome",
                                                  pos_col = "base_pair_location",
                                                  ea_col  = "effect_allele",
                                                  nea_col = "other_allele",
                                                  build   = "b37_dbsnp156")

})

# Chromosomes to process:  10
# |=================                                                                    |  12%

Function options

Alt RSID output

In the dbSNP database there are multiple RSID assigned to the same CHR:POS:REF:ALT combination. This function will return the first RSID encountered and filter out the rest. If you want to get the alternative RSIDs then use the alt_rsids=TRUE flag. This will change the structure of the output to a named list() of two data.table elements: 1) “data”=gwas_data and 2) “alt_rsids”=alt_rsid_data.

gwas_with_rsids <- genepi.utils::chrpos_to_rsid(gwas,
                                                chr_col = "chromosome",
                                                pos_col = "base_pair_location",
                                                ea_col  = "effect_allele",
                                                nea_col = "other_allele",
                                                build   = "b37_dbsnp156", 
                                                alt_rsids=TRUE)

str(gwas_with_rsids)

Allele specification / matching

Although matching on alleles is desirable, especially for indels, we can still match on just chromosome and position by either leaving ea_col and nea_col arguments out, or setting them to NULL.

gwas_with_rsids <- genepi.utils::chrpos_to_rsid(gwas,
                                                chr_col = "chromosome",
                                                pos_col = "base_pair_location",
                                                build   = "b37_dbsnp156")

str(gwas_with_rsids)

Alleles are not always aligned to the reference and so we allow allele flipping by default with the parameter flip="allow". If you would like to flag which variants have been assigned an rsID based on flipped alleles use flip="report"; this will add a logical column to the output called rsid_flip_match. To prevent allele flipping and only assign based on the reference allele orientation use flip="no_flip".

gwas_with_rsids <- genepi.utils::chrpos_to_rsid(gwas,
                                                chr_col = "chromosome",
                                                pos_col = "base_pair_location",
                                                ea_col  = "effect_allele",
                                                nea_col = "other_allele",
                                                flip    = "report",
                                                build   = "b37_dbsnp156")

str(gwas_with_rsids)

Coding of indels as D/I is allowed, however please check the alternate rsID output as mutliple rsIDs (indels) may be present for D/I coding at any particular base position.

gwas_id_coding <- data.table::fread(system.file("extdata", "example3_gwas_sumstats.tsv", package="genepi.utils"))

gwas_with_rsids <- genepi.utils::chrpos_to_rsid(dt      = gwas_id_coding,
                                                chr_col = "chromosome",
                                                pos_col = "base_pair_location",
                                                ea_col  = "effect_allele",
                                                nea_col = "other_allele",
                                                flip    = "allow",
                                                build   = "b37_dbsnp156", 
                                                alt_rsids=TRUE)

str(gwas_with_rsids)

Evaluation speed

The choice of parameters will have a small impact computation speed. Since the dbSNP data is stored as .fst binary files and only the desired rows / columns are ever read into memory, the more data that is requested the slower the computation. That said, it is still much faster / feasible than trying to read in the entire dbSNP database (over 1 billion rsIDs).

library(microbenchmark)
library(ggplot2)

# some GWAS data
dt <- data.table::fread( gwas_sumstats_4.3million_rows )

# benchmarking
mbm <- microbenchmark("chrpos_to_rsid: no alleles, no alt rsids" = {
                              progressr::with_progress({
                                genepi.utils::chrpos_to_rsid(dt,
                                                             chr_col="CHR",
                                                             pos_col="POS",
                                                             ea_col=NULL,
                                                             nea_col=NULL,
                                                             build="b37_dbsnp156",
                                                             alt_rsids=FALSE,
                                                             flip="report")
                              })
                        },
                      "chrpos_to_rsid: no alleles, with alt rsids" = {
                        progressr::with_progress({
                          genepi.utils::chrpos_to_rsid(dt,
                                                       chr_col="CHR",
                                                       pos_col="POS",
                                                       ea_col=NULL,
                                                       nea_col=NULL,
                                                       build="b37_dbsnp156",
                                                       alt_rsids=TRUE,
                                                       flip="report")
                        })
                      },
                      "chrpos_to_rsid: with alleles, no alt rsids" = {
                        progressr::with_progress({
                          genepi.utils::chrpos_to_rsid(dt,
                                                       chr_col="CHR",
                                                       pos_col="POS",
                                                       ea_col="EFFECT_ALLELE",
                                                       nea_col="OTHER_ALLELE",
                                                       build="b37_dbsnp156",
                                                       alt_rsids=FALSE,
                                                       flip="report")
                        })
                      },
                      "chrpos_to_rsid: with alleles, with alt rsids" = {
                        progressr::with_progress({
                          genepi.utils::chrpos_to_rsid(dt,
                                                       chr_col="CHR",
                                                       pos_col="POS",
                                                       ea_col="EFFECT_ALLELE",
                                                       nea_col="OTHER_ALLELE",
                                                       build="b37_dbsnp156",
                                                       alt_rsids=TRUE,
                                                       flip="report")
                        })
                      },
                      times = 10L)

# plot
autoplot(mbm)