CHR:POS to RSID mapping
chrpos_to_rsid.Rmd
Here is an example of mapping CHR:POS:REF:ALT columns to dbSNP build 156 RSIDs.
library(genepi.utils)
gwas <- data.table::fread(system.file("extdata", "example_gwas_sumstats.tsv", package="genepi.utils"))
head(gwas)
gwas_with_rsids <- genepi.utils::chrpos_to_rsid(gwas,
chr_col = "chromosome",
pos_col = "base_pair_location",
ea_col = "effect_allele",
nea_col = "other_allele",
build = "b37_dbsnp156")
head(gwas_with_rsids)
#> chromosome base_pair_location effect_allele other_allele RSID
#> <int> <int> <char> <char> <char>
#> 1: 10 100000625 G A rs7899632
#> 2: 10 100000645 C A rs61875309
#> 3: 10 100003242 G T rs12258651
#> 4: 10 100003785 C T rs1359508
#> 5: 10 100004360 A G rs1048754
Monitoring progress
The mapping can take some time, depending on the number of cores on
your machine. To monitor progress we have to use the form below. We need
to wrap the function in progressr::with_progress({ })
.
progressr::with_progress({
gwas_with_rsids <- genepi.utils::chrpos_to_rsid(gwas,
chr_col = "chromosome",
pos_col = "base_pair_location",
ea_col = "effect_allele",
nea_col = "other_allele",
build = "b37_dbsnp156")
})
# Chromosomes to process: 10
# |================= | 12%
Function options
Alt RSID output
In the dbSNP database there are multiple RSID assigned to the same
CHR:POS:REF:ALT combination. This function will return the first RSID
encountered and filter out the rest. If you want to get the alternative
RSIDs then use the alt_rsids=TRUE
flag. This will change
the structure of the output to a named list()
of two
data.table
elements: 1) “data”=gwas_data and 2)
“alt_rsids”=alt_rsid_data.
gwas_with_rsids <- genepi.utils::chrpos_to_rsid(gwas,
chr_col = "chromosome",
pos_col = "base_pair_location",
ea_col = "effect_allele",
nea_col = "other_allele",
build = "b37_dbsnp156",
alt_rsids=TRUE)
str(gwas_with_rsids)
Allele specification / matching
Although matching on alleles is desirable, especially for indels, we
can still match on just chromosome and position by either leaving
ea_col
and nea_col
arguments out, or setting
them to NULL
.
gwas_with_rsids <- genepi.utils::chrpos_to_rsid(gwas,
chr_col = "chromosome",
pos_col = "base_pair_location",
build = "b37_dbsnp156")
str(gwas_with_rsids)
Alleles are not always aligned to the reference and so we allow
allele flipping by default with the parameter flip="allow"
.
If you would like to flag which variants have been assigned an rsID
based on flipped alleles use flip="report"
; this will add a
logical column to the output called rsid_flip_match
. To
prevent allele flipping and only assign based on the reference allele
orientation use flip="no_flip"
.
gwas_with_rsids <- genepi.utils::chrpos_to_rsid(gwas,
chr_col = "chromosome",
pos_col = "base_pair_location",
ea_col = "effect_allele",
nea_col = "other_allele",
flip = "report",
build = "b37_dbsnp156")
str(gwas_with_rsids)
Coding of indels as D/I is allowed, however please check the alternate rsID output as mutliple rsIDs (indels) may be present for D/I coding at any particular base position.
gwas_id_coding <- data.table::fread(system.file("extdata", "example3_gwas_sumstats.tsv", package="genepi.utils"))
gwas_with_rsids <- genepi.utils::chrpos_to_rsid(dt = gwas_id_coding,
chr_col = "chromosome",
pos_col = "base_pair_location",
ea_col = "effect_allele",
nea_col = "other_allele",
flip = "allow",
build = "b37_dbsnp156",
alt_rsids=TRUE)
str(gwas_with_rsids)
Evaluation speed
The choice of parameters will have a small impact computation speed.
Since the dbSNP data is stored as .fst
binary files and
only the desired rows / columns are ever read into memory, the more data
that is requested the slower the computation. That said, it is still
much faster / feasible than trying to read in the entire dbSNP database
(over 1 billion rsIDs).
library(microbenchmark)
library(ggplot2)
# some GWAS data
dt <- data.table::fread( gwas_sumstats_4.3million_rows )
# benchmarking
mbm <- microbenchmark("chrpos_to_rsid: no alleles, no alt rsids" = {
progressr::with_progress({
genepi.utils::chrpos_to_rsid(dt,
chr_col="CHR",
pos_col="POS",
ea_col=NULL,
nea_col=NULL,
build="b37_dbsnp156",
alt_rsids=FALSE,
flip="report")
})
},
"chrpos_to_rsid: no alleles, with alt rsids" = {
progressr::with_progress({
genepi.utils::chrpos_to_rsid(dt,
chr_col="CHR",
pos_col="POS",
ea_col=NULL,
nea_col=NULL,
build="b37_dbsnp156",
alt_rsids=TRUE,
flip="report")
})
},
"chrpos_to_rsid: with alleles, no alt rsids" = {
progressr::with_progress({
genepi.utils::chrpos_to_rsid(dt,
chr_col="CHR",
pos_col="POS",
ea_col="EFFECT_ALLELE",
nea_col="OTHER_ALLELE",
build="b37_dbsnp156",
alt_rsids=FALSE,
flip="report")
})
},
"chrpos_to_rsid: with alleles, with alt rsids" = {
progressr::with_progress({
genepi.utils::chrpos_to_rsid(dt,
chr_col="CHR",
pos_col="POS",
ea_col="EFFECT_ALLELE",
nea_col="OTHER_ALLELE",
build="b37_dbsnp156",
alt_rsids=TRUE,
flip="report")
})
},
times = 10L)
# plot
autoplot(mbm)