HERMES QUALITY CONTROL
hermes_qc_script.Rmd
QC script inputs
The QC script input parameters can be seen by calling the script with
the --help
flag.
Rscript $QC_SCRIPT --help
#>
#>
[36m──
[39m
[1mQC script - active arguments
[22m
[36m────────────────────────────────────────────────
[39m
#> usage: /home/runner/work/heRmes/heRmes/scripts/gwas_qc.R [-h] [-v] [-q]
#> [-g GWAS] [-r REF]
#> [-o OUTPUT]
#> [-chr GWAS_CHR]
#> [-bp GWAS_BP]
#> [-ea GWAS_EA]
#> [-oa GWAS_OA]
#> [-eaf GWAS_EAF]
#> [-beta GWAS_BETA]
#> [-se GWAS_SE]
#> [-p GWAS_P]
#> [-n GWAS_N]
#> [-info GWAS_INFO]
#> [-r_id REF_ID]
#> [-r_chr REF_CHR]
#> [-r_bp REF_BP]
#> [-r_ea REF_EA]
#> [-r_oa REF_OA]
#> [-r_eaf REF_EAF]
#> [-adj {lambda,ldsc}]
#> [-noind]
#> [-fd FREQ_DIFF]
#> [-it INFO_THRESH]
#> [-an {AFR,AMR,CSA,EAS,EUR,MID}]
#>
#> options:
#> -h, --help show this help message and exit
#> -v, --verbose Print extra output [default]
#> -q, --quietly Print little output
#> -g GWAS, --gwas GWAS GWAS file path
#> -r REF, --ref REF Reference file path
#> -o OUTPUT, --output OUTPUT
#> Output directory path
#> -chr GWAS_CHR, --gwas_chr GWAS_CHR
#> Chromosome column name in the GWAS file
#> -bp GWAS_BP, --gwas_bp GWAS_BP
#> Base position column name in the GWAS file
#> -ea GWAS_EA, --gwas_ea GWAS_EA
#> Effect allele column name in the GWAS file
#> -oa GWAS_OA, --gwas_oa GWAS_OA
#> Other allele column name in the GWAS file
#> -eaf GWAS_EAF, --gwas_eaf GWAS_EAF
#> Effect allele frequency column name in the GWAS file
#> -beta GWAS_BETA, --gwas_beta GWAS_BETA
#> Beta column name in the GWAS file
#> -se GWAS_SE, --gwas_se GWAS_SE
#> Standard error column name in the GWAS file
#> -p GWAS_P, --gwas_p GWAS_P
#> P-value column name in the GWAS file
#> -n GWAS_N, --gwas_n GWAS_N
#> Sample size column name in the GWAS file
#> -info GWAS_INFO, --gwas_info GWAS_INFO
#> Imputation information score column name in the GWAS
#> file
#> -r_id REF_ID, --ref_id REF_ID
#> Reference rsid column name in the REF file
#> -r_chr REF_CHR, --ref_chr REF_CHR
#> Reference chromosome column name in the REF file
#> -r_bp REF_BP, --ref_bp REF_BP
#> Reference base position column name in the REF file
#> -r_ea REF_EA, --ref_ea REF_EA
#> Reference effect allele column name in the REF file
#> -r_oa REF_OA, --ref_oa REF_OA
#> Reference other allele column name in the REF file
#> -r_eaf REF_EAF, --ref_eaf REF_EAF
#> Reference effect allele frequency column name in the
#> REF file
#> -adj {lambda,ldsc}, --adjustment {lambda,ldsc}
#> Apply genomic control p-value & standard error
#> adjustment using either the GC lambda ('lambda') or
#> the LD score regression intercept ('ldsc')
#> [default=OFF]
#> -noind, --no_indel_alleles
#> Remove indel alleles [default=FALSE]
#> -fd FREQ_DIFF, --freq_diff FREQ_DIFF
#> Retain variants with GWAS:REF allele frequency
#> different less than ... [default=0.2]
#> -it INFO_THRESH, --info_thresh INFO_THRESH
#> Retain variants with info score greater than ...
#> [default=NULL (off)]
#> -an {AFR,AMR,CSA,EAS,EUR,MID}, --ancestry {AFR,AMR,CSA,EAS,EUR,MID}
#> Ancestry code [default=EUR]
Reviewer QC input controls
The main required inputs required are in the table below, we likely will need to add more.
Arg | Type | Source | Expose to UI |
---|---|---|---|
–gwas | string | uploaded gwas file | NA |
–ref | string | static ancestry-specific reference file | NA |
–out | string | output directory to write file and .pngs | NA |
–gwas_[colname] | string | standard column names from mapping | NA |
–ref_[colname] | string | standard column names from reference file | NA |
–ancestry | string | GWAS meta-data input | Meta-data UI |
–no_indel_alleles | bool | reviewer QC user interface | Yes |
–freq_diff | num | reviewer QC user interface | Yes |
–adjustment | string | reviewer QC user interface | Yes |
–info_thresh | num | reviewer QC user interface | Yes |
Example QC script call
# Rscript /Users/xx20081/git/heRmes/scripts/gwas_qc.R -h
Rscript /Users/xx20081/git/heRmes/scripts/gwas_qc.R \
--gwas '/Users/xx20081/Desktop/hermes_cohorts/discovehr_omni/discovehromni_pheno5_hrc_eur_mixed.tsv.gz' \
--gwas_chr 'chr' \
--gwas_bp 'pos_b37' \
--gwas_ea 'A_coded' \
--gwas_oa 'A_noncoded' \
--gwas_eaf 'AFreq_coded' \
--gwas_beta 'beta' \
--gwas_se 'SE' \
--gwas_p 'pval' \
--gwas_n 'n_total' \
--ref '/Users/xx20081/Documents/local_data/genome_reference/hrc_37/HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz' \
--ref_id 'ID' \
--ref_chr '#CHROM' \
--ref_bp 'POS' \
--ref_ea 'ALT' \
--ref_oa 'REF' \
--ref_eaf 'AF' \
--freq_diff 0.2 \
--out '/Users/xx20081/Desktop/qc_tests' \
--freq_diff 0.2 #\
#--adjustment 'ldsc'
Rscript /Users/xx20081/git/heRmes/scripts/gwas_qc.R \
--gwas '/Users/xx20081/Downloads/sample-gwas-expanded.csv' \
--gwas_chr 'CHR' \
--gwas_bp 'BP' \
--gwas_ea 'EA' \
--gwas_oa 'OA' \
--gwas_eaf 'EUR_EAF' \
--gwas_beta 'BETA' \
--gwas_se 'SE' \
--gwas_p 'P' \
--gwas_n 'N' \
--ref '/Users/xx20081/Documents/local_data/genome_reference/hrc_37/HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz' \
--ref_id 'ID' \
--ref_chr '#CHROM' \
--ref_bp 'POS' \
--ref_ea 'ALT' \
--ref_oa 'REF' \
--ref_eaf 'AF' \
--freq_diff 0.2 \
--out '/Users/xx20081/Desktop/qc_tests' \
--no_indel_alleles
QC script
The quality control procedure code is presented below.
#!/usr/bin/env Rscript
#
# Nick Sunderland <nicholas.sunderland@bristol.ac.uk>
#
# GWAS QC script
#
# Command line options= run this script with the -h/--help flag to see options
# Some good resources here: https://github.com/swvanderlaan/MetaGWASToolKit/tree/master/SCRIPTS
#
#=============================================================================
# required packages
#=============================================================================
suppressPackageStartupMessages(library(argparse))
suppressPackageStartupMessages(library(cli))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(viridis))
suppressPackageStartupMessages(library(stats))
# devtools::install_github("mglev1n/ldscr")
suppressPackageStartupMessages(library(ldscr))
#=============================================================================
# parse cli arguments
#=============================================================================
cli_h1("QC script - active arguments")
parser <- ArgumentParser()
# basic controls
parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]")
parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output")
# file paths and file handling
parser$add_argument("-g", "--gwas", action="store", type="character", help="GWAS file path")
parser$add_argument("-r", "--ref", action="store", type="character", help="Reference file path")
parser$add_argument("-o", "--output", action="store", type="character", help="Output directory path")
# GWAS file columns
parser$add_argument("-chr", "--gwas_chr", action="store", type="character", help="Chromosome column name in the GWAS file", default = "chr")
parser$add_argument("-bp", "--gwas_bp", action="store", type="character", help="Base position column name in the GWAS file", default = "bp")
parser$add_argument("-ea", "--gwas_ea", action="store", type="character", help="Effect allele column name in the GWAS file", default = "ea")
parser$add_argument("-oa", "--gwas_oa", action="store", type="character", help="Other allele column name in the GWAS file", default = "oa")
parser$add_argument("-eaf", "--gwas_eaf", action="store", type="character", help="Effect allele frequency column name in the GWAS file", default = "eaf")
parser$add_argument("-beta","--gwas_beta", action="store", type="character", help="Beta column name in the GWAS file", default = "beta")
parser$add_argument("-se", "--gwas_se", action="store", type="character", help="Standard error column name in the GWAS file", default = "se")
parser$add_argument("-p", "--gwas_p", action="store", type="character", help="P-value column name in the GWAS file", default = "p")
parser$add_argument("-n", "--gwas_n", action="store", type="character", help="Sample size column name in the GWAS file", default = "n")
parser$add_argument("-info","--gwas_info", action="store", type="character", help="Imputation information score column name in the GWAS file", default = NULL)
# Reference file columns
parser$add_argument("-r_id", "--ref_id", action="store", type="character", help="Reference rsid column name in the REF file", default = "chr")
parser$add_argument("-r_chr", "--ref_chr", action="store", type="character", help="Reference chromosome column name in the REF file", default = "chr")
parser$add_argument("-r_bp", "--ref_bp", action="store", type="character", help="Reference base position column name in the REF file", default = "bp")
parser$add_argument("-r_ea", "--ref_ea", action="store", type="character", help="Reference effect allele column name in the REF file", default = "ea")
parser$add_argument("-r_oa", "--ref_oa", action="store", type="character", help="Reference other allele column name in the REF file", default = "oa")
parser$add_argument("-r_eaf", "--ref_eaf", action="store", type="character", help="Reference effect allele frequency column name in the REF file", default = "eaf")
# QC parameters
parser$add_argument("-adj", "--adjustment", action="store", type="character", default=NULL, choices = c("lambda", "ldsc"), help="Apply genomic control p-value & standard error adjustment using either the GC lambda ('lambda') or the LD score regression intercept ('ldsc') [default=OFF]")
parser$add_argument("-noind", "--no_indel_alleles", action="store_true", default=FALSE, help="Remove indel alleles [default=FALSE]")
parser$add_argument("-fd", "--freq_diff", action="store", type="numeric", default=0.2, help="Retain variants with GWAS:REF allele frequency different less than ... [default=0.2]")
parser$add_argument("-it", "--info_thresh", action="store", type="numeric", default=NULL, help="Retain variants with info score greater than ... [default=NULL (off)]")
parser$add_argument("-an", "--ancestry", action="store", type="character", default="EUR", choices = c("AFR", "AMR", "CSA", "EAS", "EUR", "MID"), help="Ancestry code [default=EUR]")
# parse CLI arguments
args <- parser$parse_args()
# print out the arguments used to the console
for (i in seq_along(args)) {
cli_text("{.strong {names(args)[i]}} = {.val {args[[i]]}}")
}
#=============================================================================
# start QC run
#=============================================================================
cli_h1("Running GWAS quality control")
# summary table to be iteratively filled
summary <- data.table()
#=============================================================================
# basic check files
#=============================================================================
cli_progress_step("checking file paths and columns names")
# Check GWAS file path provided and that we can read from it
if (is.null(args$gwas) || !file.exists(args$gwas) || file.access(args$gwas, 4) != 0) {
cli_abort("GWAS file path `{.file {args$gwas}}` does not exist or is not readable")
}
# Check GWAS file column names all present
gwas_header <- fread(args$gwas, nrows = 0)
gwas_cols <- sapply(args[c("gwas_chr","gwas_bp","gwas_ea","gwas_oa","gwas_eaf","gwas_beta","gwas_se","gwas_p","gwas_n")], function(x) x %in% names(gwas_header))
if (!all(gwas_cols)) {
for (i in seq_along(gwas_cols)) {
if (gwas_cols[[i]]) {
cli_alert_success("{names(gwas_cols)[i]} = {args[names(gwas_cols)[i]]}")
} else {
cli_alert_danger("{names(gwas_cols)[i]} = {args[names(gwas_cols)[i]]}")
}
}
cli_abort("Columns not all found in GWAS file columns [{names(gwas_header)}]")
}
# Check reference file path provided and that we can read from in
if (is.null(args$ref) || !file.exists(args$ref) || file.access(args$ref, 4) != 0) {
cli_abort("Reference file path `{.file {args$reference}}` does not exist or is not readable")
}
# Check GWAS file column names all present
ref_header <- fread(args$ref, nrows = 0)
ref_cols <- sapply(args[c("ref_id", "ref_chr","ref_bp","ref_ea","ref_oa","ref_eaf")], function(x) x %in% names(ref_header))
if (!all(ref_cols)) {
for (i in seq_along(ref_cols)) {
if (ref_cols[[i]]) {
cli_alert_success("{names(ref_cols)[i]} = {args[names(ref_cols)[i]]}")
} else {
cli_alert_danger("{names(ref_cols)[i]} = {args[names(ref_cols)[i]]}")
}
}
cli_abort("Columns not all found in reference file columns [{names(ref_header)}]")
}
# Check output directory exists
if (is.null(args$output) || !dir.exists(args$output)) {
cli_abort("Output directory path `{.file {args$output}}` not provided or does not exist")
}
#=============================================================================
# read GWAS and reference data
#=============================================================================
cli_progress_step("reading GWAS")
gwas <- fread(args$gwas)
cli_progress_step("reading reference")
ref <- fread(args$ref)
cli_process_done()
#=============================================================================
# check column data missingness
#=============================================================================
cli_h1("Checking column data")
cli_progress_step("checking column data")
gwas_cols <- unlist(args[ names(args)[grepl("^gwas_", names(args))] ])
ref_cols <- unlist(args[ names(args)[grepl("^ref_", names(args))] ])
# total and NA counts
num_na_summary <- gwas[, lapply(.SD, function(col) c(num = .N, num_na = sum(is.na(col)), pct_na = sprintf("%.1f%%", 100 * (sum(is.na(col)) / .N)))), .SDcols = gwas_cols]
# add to summary table
summary <- rbind(summary,
data.table(column = gwas_cols,
std_name = names(gwas_cols),
type = sapply(gwas[, mget(gwas_cols)], typeof),
num = as.integer(sapply(num_na_summary, `[[`, 1)),
num_na = as.integer(sapply(num_na_summary, `[[`, 2)),
pct_na = sapply(num_na_summary, `[[`, 3)))
# report to console
cli_process_done()
print(summary)
#=============================================================================
# check input data validity
#=============================================================================
cli_h1("Checking data validity")
cli_progress_step("checking data validity")
# per column functions that return true if that row is valid
col_fn_list <- list(
gwas_chr = function(x) x %in% 1:26,
gwas_bp = function(x) is.integer(x) & x > 0,
gwas_ea = function(x) grepl("^[ACTG]+$|^[DI]$", x),
gwas_oa = function(x) grepl("^[ACTG]+$|^[DI]$", x),
gwas_eaf = function(x) is.numeric(x) & x >= 0 & x <= 1,
gwas_beta = function(x) is.numeric(x) & is.finite(x),
gwas_se = function(x) is.numeric(x) & x > 0 & is.finite(x),
gwas_p = function(x) is.numeric(x) & x >= 0 & x <=1,
gwas_n = function(x) is.integer(x) & x > 0,
gwas_info = function(x) is.numeric(x) & x >= 0 & x <=1,
ref_chr = function(x) x %in% 1:26,
ref_bp = function(x) is.integer(x) & x > 0,
ref_ea = function(x) grepl("^[ACTG]+$|^[DI]$", x),
ref_oa = function(x) grepl("^[ACTG]+$|^[DI]$", x),
ref_eaf = function(x) is.numeric(x) & x >= 0 & x <= 1
)
# apply the functions to the corresponding columns and create summary table
valid_summary <- gwas[, Map(function(fn, col) {
valid <- sum(fn(col), na.rm = TRUE)
valid_pct <- sprintf("%.1f%%", 100 * (valid / .N))
c(valid = valid, valid_pct = valid_pct)
}, col_fn_list[names(gwas_cols)], .SD), .SDcols = gwas_cols]
# add to main summary table
summary <- cbind(summary,
data.table(valid = as.integer(sapply(valid_summary, `[[`, 1)),
valid_pct = sapply(valid_summary, `[[`, 2)))
# print summary to console as well as the validity functions used
cli_process_done()
print(summary)
cli_div(theme = list(span.var = list(color = "gray"), span.code = list(color = "gray")))
cli_h2("Column validation functions:")
for (i in seq_along(gwas_cols)) {
cli_text("{.field {gwas_cols[i]}:} {.code {deparse(col_fn_list[[names(gwas_cols)[i]]])[2]}}")
}
#=============================================================================
# attempt recoding / data fixes
#=============================================================================
cli_h1("Formatting data")
cli_progress_step("formatting data")
# (re)code chromosome column as integer
gwas[, (args$gwas_chr) := fcase(as.integer(get(args$gwas_chr)) %in% 1:26, as.integer(get(args$gwas_chr)),
grepl("(?i)^X$", get(args$gwas_chr)), 23L,
grepl("(?i)^Y$", get(args$gwas_chr)), 24L,
grepl("(?i)^(PAR|XY|YX)$", get(args$gwas_chr)), 25L,
grepl("(?i)^MT$", get(args$gwas_chr)), 26L,
default = NA_integer_)]
# parse base position and n-sample columns to integer
integer_cols <- c(args$gwas_bp, args$gwas_n)
gwas[, (integer_cols) := lapply(.SD, as.integer), .SDcols = integer_cols]
# parse frequency, beta, se, p, info columns to numeric
numeric_cols <- c(args$gwas_eaf, args$gwas_beta, args$gwas_se, args$gwas_p, args$gwas_info)
gwas[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols = numeric_cols]
# remove allele frequencies <0 or >1
gwas[get(args$gwas_eaf) < 0 | get(args$gwas_eaf) > 1, (args$gwas_eaf) := NA_real_]
# remove infinite betas
gwas[is.infinite(get(args$gwas_beta)), (args$gwas_beta) := NA_real_]
# remove infinite, zero, or negative standard errors
gwas[is.infinite(get(args$gwas_se)) | get(args$gwas_se) <= 0, (args$gwas_se) := NA_real_]
# recode pvalue=0 to minimum machine precision and remove p>1 or p<0
gwas[, (args$gwas_p) := fcase(get(args$gwas_p) == 0, .Machine$double.xmin,
get(args$gwas_p) > 0 & get(args$gwas_p) <= 1, get(args$gwas_p),
default = NA_real_)]
# remove info scores >1 or <0
if (!is.null(args$info_thresh) & !is.null(args$gwas_info)) {
gwas[get(args$gwas_info) < 0 | get(args$gwas_info) > 1 | get(args$gwas_info) < args$info_thresh, (args$gwas_info) := NA_real_]
} else if (is.null(args$info_thresh) & !is.null(args$gwas_info)) {
cli_alert_info("info_thresh not provided, skipping info score filtering and setting info score to 1 for all variants")
gwas[, (args$gwas_info) := 1]
}
# alleles must be characters
gwas[, c(args$gwas_ea, args$gwas_oa) := lapply(.SD, as.character), .SDcols = c(args$gwas_ea, args$gwas_oa)]
# find indels and and number to summary, remove indels if flag specified
indel_idx <- which(gwas[, .(nchar(get(args$gwas_ea)) > 1 | nchar(get(args$gwas_oa)) > 1 | grepl("(?i)^[DI]$", get(args$gwas_ea)) | grepl("(?i)^[DI]$", get(args$gwas_oa)))]$V1)
if (args$no_indel_alleles) {
gwas[indel_idx, c(args$gwas_ea, args$gwas_oa) := NA_character_]
cli_alert_warning("removing {length(indel_idx)} indel alleles")
}
summary[grep("^gwas_(ea|oa)$", std_name), indels := length(indel_idx)]
# ensure alleles are upper case ACTG or D/I
gwas[, c(args$gwas_ea, args$gwas_oa) := lapply(.SD, function(x) ifelse(grepl("(?i)^[ACTG]+$|^[DI]$", x), toupper(x), NA_character_)), .SDcols = c(args$gwas_ea, args$gwas_oa)]
# summarise the counts during this recoding process
postfix_na_summary <- gwas[, lapply(.SD, function(col) c(postfix_valid = sum(!is.na(col)), pct_na = sprintf("%.1f%%", 100 * (sum(!is.na(col)) / .N)))), .SDcols = gwas_cols]
# add to overall summary table
summary <- cbind(summary,
data.table(postfix_valid = as.integer(sapply(postfix_na_summary, `[[`, 1)),
postfix_valid_pct = sapply(postfix_na_summary, `[[`, 2)))
# remove invalid rows (those containing NAs)
gwas <- gwas[ stats::complete.cases(gwas[, mget(gwas_cols)]) ]
# print updated summary to console
print(summary)
# exit if no variants remain
if (nrow(gwas) == 0) {
cli_abort("no variants remain after data validation filters - check data and summary above")
} else {
cli_process_done()
}
#=============================================================================
# formatting reference
#=============================================================================
col_fn_list <- list(
ref_id = as.character,
ref_chr = as.character,
ref_bp = as.integer,
ref_ea = as.character,
ref_oa = as.character,
ref_eaf = as.numeric
)
ref[ , (ref_cols) := Map(function(fn, col) fn(col), col_fn_list[names(ref_cols) ], .SD), .SDcols = ref_cols ]
#=============================================================================
# harmonise alleles
#=============================================================================
cli_h1("Harmonising to reference")
cli_progress_step("harmonising data")
# currently the harmonising function is in my package, but probably best to supply as a stand alone script
h <- genepi.utils::harmonise_gwas(gwas, ref, join = "chr:bp", action = 2,
gmap = c(chr = args$gwas_chr, bp = args$gwas_bp, ea = args$gwas_ea, oa = args$gwas_oa, eaf = args$gwas_eaf, beta = args$gwas_beta),
rmap = c(chr = args$ref_chr, bp = args$ref_bp, ea = args$ref_ea, oa = args$ref_oa, eaf = args$ref_eaf))
# harmonisation summary, add to main summary
summary[, `:=`(harmonised = nrow(h), harmonised_pct = sprintf("%.1f%%", 100*(nrow(h)/num[1])))]
# print updated summary to console
print(summary)
# exit if no variants remain
if (nrow(h) == 0) {
cli_warn("no variants remain after harmonisation - check data and summary above")
}
cli_process_done()
#=============================================================================
# allele frequency analysis
#=============================================================================
cli_h1("Anaylsing GWAS:REF allele frequency")
cli_progress_step("analysing allele frequency difference")
# absolute frequency difference cohort vs reference
h[, freq_diff := abs(eaf - eaf_ref)]
# add frequency difference counts to summary table
diff_col <- paste0("freq_diff_lt", args$freq_diff)
summary[, c(diff_col, paste0(diff_col, "_pct")) := .(sum(h$freq_diff < args$freq_diff), sprintf("%.1f%%", 100*(sum(h$freq_diff < args$freq_diff)/num[1])))]
# print summary to console
cli_process_done()
print(summary)
# plot the frequency differences
eaf_plot <- ggplot(h[freq_diff > args$freq_diff], aes(x = eaf_ref, y = eaf, color = freq_diff)) +
geom_point(size = 3) +
geom_point(data = h[freq_diff > args$freq_diff & strand_flip == TRUE], shape = 17, color = "red", size = 3) +
geom_abline(slope = 1, intercept = args$freq_diff, linetype = "dashed", color = "red") +
geom_abline(slope = 1, intercept = -1 * args$freq_diff, linetype = "dashed", color = "red") +
viridis::scale_colour_viridis(option = "mako") +
labs(x = "Reference allele frequency", y = "Cohort allele frequency",
caption = "*red triangles strand flip") +
lims(x = c(0,1), y = c(0,1)) +
theme_minimal(base_size = 18) +
theme(legend.position = "none")
# save plot
cli_progress_step("plotting allele frequency difference")
grDevices::png(file.path(args$output, "eaf_plot.png"), width = 600, height = 600)
print(eaf_plot)
invisible(dev.off())
cli_process_done()
# filter out the frequency differences over the provided threshold
h <- h[freq_diff < args$freq_diff]
# extract the data from the harmonisation table (don't need the reference columns now, except for the rsid)
setnames(h,
old = c(paste0(ref_cols[["ref_id"]], "_ref"), "chr","bp","ea","oa","eaf","beta", gwas_cols[["gwas_se"]], gwas_cols[["gwas_p"]], gwas_cols[["gwas_n"]]),
new = c(ref_cols[["ref_id"]],
gwas_cols[["gwas_chr"]],
gwas_cols[["gwas_bp"]],
gwas_cols[["gwas_ea"]],
gwas_cols[["gwas_oa"]],
gwas_cols[["gwas_eaf"]],
gwas_cols[["gwas_beta"]],
gwas_cols[["gwas_se"]],
gwas_cols[["gwas_p"]],
gwas_cols[["gwas_n"]]))
#=============================================================================
# PZ plot
#=============================================================================
cli_h1("Assessing for analytical issues")
# generate PZ plot
pz_plot <- h[, `:=`(observed = -log10(get(gwas_cols[["gwas_p"]])),
expected = -log10(2*pnorm(abs(get(gwas_cols[["gwas_beta"]]) / get(gwas_cols[["gwas_se"]])), lower.tail=FALSE)))] |>
ggplot(aes(x = expected, y = observed)) +
geom_point(size = 0.5, color="darkblue") +
geom_abline(slope=1, intercept=0, color="darkred", linetype = "dotted") +
labs(x = "P.ztest (-log10)",
y = "P (-log10)",
color = NULL) +
theme_minimal(base_size = 18) +
theme(aspect.ratio = 1)
# save plot
cli_progress_step("plotting PZ plot")
grDevices::png(file.path(args$output, "pz_plot.png"), width = 600, height = 600)
print(pz_plot)
invisible(dev.off())
cli_process_done()
#=============================================================================
# population stratification analysis 1
#=============================================================================
cli_h1("Assessing for population stratification issues")
cli_progress_step("running LDSC")
# get Z score and run LDSC
h[, z := get(gwas_cols[["gwas_beta"]]) / get(gwas_cols[["gwas_se"]])]
# try to run LDSC
tryCatch({
ldsc_res <- ldsc_h2(munged_sumstats = h[, list(SNP = get(ref_cols[["ref_id"]]), A1=get(gwas_cols[["gwas_ea"]]), A2=get(gwas_cols[["gwas_oa"]]), Z=z, N=get(gwas_cols[["gwas_n"]]))],
ancestry = args$ancestry)
},
error = function(e) {
cli_warn(paste0("LDSC failed: \n", e))
ldsc_res <<- list(intercept = NA_real_)
})
#=============================================================================
# population stratification analysis 2
#=============================================================================
cli_progress_step("calculating lambda-GC")
# data for the QQ plot
h[, `:=`(chisq = qchisq(log(get(gwas_cols[["gwas_p"]])), df = 1, lower.tail = FALSE, log.p = TRUE))] # https://stackoverflow.com/questions/40144267/calculating-miniscule-numbers-for-chi-squared-distribution-numerical-precisio
h[, `:=`(lambda = median(chisq) / qchisq(0.5, 1))]
h[, `:=`(adj_chisq_gc = chisq/lambda)]
h[, `:=`(adj_p_gc = pchisq(adj_chisq_gc, 1, lower.tail=FALSE))]
h[, `:=`(adj_se_gc = get(gwas_cols[["gwas_se"]]) * sqrt(lambda[1]))]
h[, `:=`(adj_chisq_ldsc = chisq/ldsc_res$intercept)]
h[, `:=`(adj_p_ldsc = pchisq(adj_chisq_ldsc, 1, lower.tail=FALSE))]
h[, `:=`(adj_se_ldsc = get(gwas_cols[["gwas_se"]]) * sqrt(ldsc_res$intercept))]
qq_data <- rbind(
# unadjusted
h[, .(stat = "Unadjusted",
value = NA_real_,
observed = -log10(sort(get(gwas_cols[["gwas_p"]]), decreasing=FALSE, na.last=TRUE)),
expected = -log10(ppoints(.N)),
clower = -log10(qbeta(p = (1 - 0.95) / 2, shape1 = 1:.N, shape2 = .N:1)),
cupper = -log10(qbeta(p = (1 + 0.95) / 2, shape1 = 1:.N, shape2 = .N:1)))],
# GC lambda adjusted
h[, .(stat = "GC lambda",
value = lambda[1],
observed = -log10(sort(adj_p_gc, decreasing=FALSE, na.last=TRUE)),
expected = -log10(ppoints(.N)),
clower = NA_real_,
cupper = NA_real_)],
# LDSC intercept adjusted
h[, .(stat = "LDSC intercept",
value = ldsc_res$intercept,
observed = -log10(sort(adj_p_ldsc, decreasing=FALSE, na.last=TRUE)),
expected = -log10(ppoints(.N)),
clower = NA_real_,
cupper = NA_real_)]
)
setorder(qq_data, expected)
# generate QQ-plot
# axis labels
log10Pe <- expression(paste("Expected -log"[10], plain(P)))
log10Po <- expression(paste("Observed -log"[10], plain(P)))
# lambda labels
labels <- qq_data[stat != "Unadjusted",
list(value = value[1],
expected = 2.0,
observed = .GRP-1 + 0.25), by = "stat"]
labels[, label := sprintf("%s = %.3f", stat, value)]
# plot
# colors
point_colors <- if (is.null(args$adjustment)) {
c(Unadjusted = "blue", `GC lambda` = "lightgray", `LDSC intercept` = "darkgray")
} else if (args$adjustment=="lambda") {
c(Unadjusted = "lightgray", `GC lambda` = "blue", `LDSC intercept` = "darkgray")
} else if (args$adjustment=="ldsc") {
c(Unadjusted = "lightgray", `GC lambda` = "darkgray", `LDSC intercept` = "blue")
}
qq_plot <- qq_data |>
ggplot(aes(x = expected, y = observed, color = stat)) +
geom_point(size = 0.5) +
geom_ribbon(aes(x = expected, ymin = clower, ymax = cupper), alpha = 0.1, color="transparent") +
geom_abline(slope=1, intercept=0, color="darkred", linetype = "dotted") +
geom_text(data = labels, aes(x=expected, y=observed, label=label), hjust = 0, color="black", show.legend = FALSE) +
scale_color_manual(values = point_colors) +
labs(x = log10Pe,
y = log10Po,
color = "Adjustment") +
theme_minimal() +
theme(aspect.ratio = 1,
legend.position = "top")
# save plot
cli_progress_step("plotting QQ plot")
grDevices::png(file.path(args$output, "qq_plot.png"), width = 600, height = 600)
print(qq_plot)
invisible(dev.off())
cli_process_done()
#=============================================================================
# extract then save the clean GWAS and summary table
#=============================================================================
# if using adjustment with lambda GC or LDSC intercept, switch the P and SE columns
if (!is.null(args$adjustment)) {
if (args$adjustment=="lambda") {
h[, c(gwas_cols[["gwas_se"]], gwas_cols[["gwas_p"]]) := .(adj_se_gc, adj_p_gc)]
} else if (args$adjustment=="ldsc") {
h[, c(gwas_cols[["gwas_se"]], gwas_cols[["gwas_p"]]) := .(adj_se_ldsc, adj_p_ldsc)]
}
}
# extract wanted columns
gwas <- h[, .SD, .SDcols = c(ref_cols[["ref_id"]], gwas_cols)]
# save gwas
sans_ext <- sub("([^.]+)\\.[[:alnum:]]+$", "\\1", sub("[.](gz|bz2|xz)$", "", basename(args$gwas)))
out_path <- file.path(args$output, paste0(sans_ext, '_clean.tsv.gz'))
cli_progress_step("saving clean GWAS file to {.file {out_path}}")
fwrite(gwas, out_path, sep = "\t")
# save summary
log_path <- file.path(args$output, paste0(sans_ext, '_log.tsv'))
cli_progress_step("saving log file to {.file {log_path}}")
fwrite(summary, log_path, sep = "\t")
#=============================================================================
# finished
#=============================================================================
cli_process_done()
#=============================================================================
# testing
#=============================================================================
if (FALSE) {
args = list()
args$gwas = '/Users/xx20081/Documents/local_data/hermes_incidence/raw/Pheno5-DCM_EUR/FORMAT-METAL_Pheno5-DCM_EUR.tsv.gz'
args$ref ='/Users/xx20081/Documents/local_data/genome_reference/hrc_37/HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz'
args$gwas_beta= "A1_beta"
args$gwas_bp= "pos_b37"
args$gwas_chr= "chr"
args$gwas_ea= "A1"
args$gwas_eaf= "A1_freq"
args$gwas_info= NULL
args$gwas_n= "N_total"
args$gwas_oa= "A2"
args$gwas_p= "pval"
args$gwas_se= "se"
args$output= "/Users/xx20081/Desktop/qc_tests"
args$overwrite= TRUE
args$ref_id= "ID"
args$ref_bp= "POS"
args$ref_chr= "#CHROM"
args$ref_ea= "ALT"
args$ref_eaf= "AF"
args$ref_oa= "REF"
args$verbose= TRUE
args$freq_diff= 0.2
args$adjustment=NULL #"ldsc"
args$no_indel_alleles= FALSE
args$info_thresh= NULL
args$ancestry="EUR"
}