HERMES3 PHENOTYPING
hermes3_ukbb_phenotyping.Rmd
Packages
Load the data
read the data
withdraw <- fread(file.path(data_dir, “withdraw81499_55_20230915.txt”)) demog <- fread(file.path(data_dir, “data_participant.tsv”)) hesin <- fread(file.path(data_dir, “data_hesin.tsv”)) diag <- fread(file.path(data_dir, “data_hesin_diag.tsv”)) oper <- fread(file.path(data_dir, “data_hesin_oper.tsv”)) self_oper <- fread(file.path(data_dir, “data_self_reported_procedures.tsv”))
read the codes
codes <- fread(system.file(“extdata”, “hermes_3_codes”,
“hermes_3_codes.tsv”, package = “heRmes”)) self_reported_codes <-
list( list(name = “Heart Failure”, code = “1076”, code_type =
“ukbb_self_reported_illness”), list(name = “Myocardial infarction”, code
= “1075”, code_type = “ukbb_self_reported_illness”), list(name =
“Hypertrophic cardiomyopathy”, code = “1588”, code_type =
“ukbb_self_reported_illness”), list(name = “Coronary artery bypass
grafting”, code = “1095”, code_type = “ukbb_self_reported_procedure”),
list(name = “Percutaneous coronary intervention”, code = “1070”,
code_type = “ukbb_self_reported_procedure”) ) codes <- rbind(codes,
data.table(Concept = paste0(sapply(self_reported_codes, function(x)
x$name), " Self Reported"),
Code = sapply(self_reported_codes,
function(x) x$code), Source = sapply(self_reported_codes,
function(x)
xname)))
codes[, :=
(code = Code, code_type = fcase(Source==“ICD10”,
“icd10”, Source==“ICD9”, “icd9”, Source==“OPCS4”, “opcs4”,
Source==“ukbb_self_reported_illness”, “ukbb_self_reported_illness”,
Source==“ukbb_self_reported_procedure”,
“ukbb_self_reported_procedure”))] codes <-
codes[!is.na(code_type)]
ethnicity codes
ethnicity_codes <- list( white = 1, british = 1001, white_black_caribbean = 2001, indian = 3001, caribbean = 4001, mixed = 2, irish = 1002, white_black_african = 2002, pakistani = 3002, african = 4002, asian_or_asian_british= 3, any_other_white = 1003, white_asian = 2003, bangladeshi = 3003, any_other_black = 4003, black_or_black_british= 4, any_other_mixed = 2004, any_other_asian = 3004, chinese = 5, other_ethnic_group = 6)
long data of codes
cohort <- demog[, list(eid = eid, age =
as.integer(21022-0.0
), sex = factor(31-0.0
,
levels = 0:1, labels = c(“female”, “male”)), ethnicity =
factor(21000-0.0
, levels = unlist(ethnicity_codes), labels
= names(ethnicity_codes)), ethnicity_group =
factor(sub(“([0-9])00[0-9]”, “\1”, 21000-0.0
), levels =
unlist(ethnicity_codes), labels = names(ethnicity_codes)), genetic_sex =
factor(22001-0.0
, levels = 0:1, labels = c(“female”,
“male”)), genetic_ethnicity = factor(22006-0.0
, levels = 1,
labels = c(“caucasian”)))] cohort[demog, paste0(“pc”, 1:12) :=
mget(paste0(“i.22009-0.”, 1:12)), on = “eid”]
check
stopifnot(“Failed to parse some date of births” = all(!is.na(cohort$dob))) stopifnot("some ages / dob indicate cohort age <37, is this right?" = all(cohort$dob <= as.Date(“1972-01-01”)))
self reported illness codes
self_rep_code_regex <- “20002-[0-9]+\.[0-9]+” self_rep_year_regex
<- “20008-[0-9]+\.[0-9]+” self_rep_code_cols <-
grep(self_rep_code_regex, names(demog), value = TRUE) self_rep_year_cols
<- grep(self_rep_year_regex, names(demog), value = TRUE) demog[,
(self_rep_code_cols) := lapply(.SD, as.character), .SDcols =
self_rep_code_cols] demog[, (self_rep_year_cols) := lapply(.SD,
as.numeric), .SDcols = self_rep_year_cols] self_rep_illness <-
data.table::melt(demog, id.vars = “eid”, measure =
patterns(self_rep_code_regex, self_rep_year_regex), variable.name =
“element”, value.name = c(“code”, “year”), na.rm = TRUE)
self_rep_illness <- self_rep_illness[year != -1 & year != -3] #
unknown / prefer not to answer self_rep_illness[, :=
(date =
lubridate::ymd(paste0(as.character(floor(year)), “-01-01”)) +
lubridate::days(as.integer(365.25 * (year - floor(year)))), year = NULL,
element = NULL, code = as.character(code), code_type =
“ukbb_self_reported_illness”)]
self reported procedure codes
self_rep_proc_code_regex <- “20004-[0-9]+\.[0-9]+”
self_rep_proc_year_regex <- “20010-[0-9]+\.[0-9]+”
self_rep_proc_code_cols <- grep(self_rep_proc_code_regex,
names(self_oper), value = TRUE) self_rep_proc_year_cols <-
grep(self_rep_proc_year_regex, names(self_oper), value = TRUE)
self_oper[, (self_rep_proc_code_cols) := lapply(.SD, as.character),
.SDcols = self_rep_proc_code_cols] self_oper[, (self_rep_proc_year_cols)
:= lapply(.SD, as.numeric), .SDcols = self_rep_proc_year_cols]
self_rep_oper <- data.table::melt(self_oper, id.vars = “eid”, measure
= patterns(self_rep_proc_code_regex, self_rep_proc_year_regex),
variable.name = “element”, value.name = c(“code”, “year”), na.rm = TRUE)
self_rep_oper <- self_rep_oper[year != -1 & year != -3] # unknown
/ prefer not to answer self_rep_oper[, :=
(date =
lubridate::ymd(paste0(as.character(floor(year)), “-01-01”)) +
lubridate::days(as.integer(365.25 * (year - floor(year)))), year = NULL,
element = NULL, code = as.character(code), code_type =
“ukbb_self_reported_procedure”)]
join self reported diseases and procedures
self_rep_illness <- rbind(self_rep_illness, self_rep_oper)
check self report illness table
stopifnot(“unable to parse dates for self-reported illness codes” = all(!is.na(self_rep_illness$date))) stopifnot("are you sure something happened before 1900?" = all(self_rep_illness$date > as.Date(“1900-01-01”)))
get the inpatient diagnosis codes
hesin[is.na(epistart) | epistart == ““, epistart := admidate]
diag[hesin, date := as.Date(i.epistart), on = c(“eid(participant -
eid)”, “ins_index”)] diag[, eid := eid(participant - eid)
]
diag[diag_icd9 == ““, diag_icd9 := NA_character_] diag[diag_icd10 == ““,
diag_icd10 := NA_character_] diag <- data.table::melt(diag, id.vars =
c(“eid”, “date”), measure.vars = c(“diag_icd9”, “diag_icd10”),
variable.name = “code_type”, value.name = “code”, na.rm = TRUE) diag[,
code_type := data.table::fcase(code_type == “diag_icd9”, “icd9”,
code_type == “diag_icd10”, “icd10”)]
get the inpatient procedure codes
oper[hesin, date := as.Date(i.epistart), on = c(“eid(participant -
eid)”, “ins_index”)] oper[, eid := eid(participant - eid)
]
oper[oper3 == ““, oper3 := NA_character_] oper[oper4 == ““, oper4 :=
NA_character_] oper <- data.table::melt(oper, id.vars = c(“eid”,
“date”), measure.vars = c(“oper3”, “oper4”), variable.name =
“code_type”, value.name = “code”, na.rm = TRUE) oper[, code_type :=
data.table::fcase(code_type == “oper3”, “opcs3”, code_type == “oper4”,
“opcs4”)]
read in the codes and annotate
combined <- codes[combined, on = c(“code” = “code”, “code_type” = “code_type”), allow.cartesian = TRUE] combined <- combined[!is.na(Concept)]
keep only unique, first occurance (in any coding system)
combined <- combined[combined[, .I[which.min(date)], by = c(“eid”, “Concept”)]$V1]
add to the cohort
concepts <- unique(codes$Concept) for (g in concepts) {
col_name <- tolower(gsub(” “,”_“, gsub(”[()]“,”“,g))) cohort[combined[Concept == g], paste0(col_name, c(““,”_first_date”)) := list(TRUE, as.Date(i.date)), on = “eid”] cohort[is.na(get(col_name)), (col_name) := FALSE]
}
add the withdrawal flags
cohort[withdraw, withdrawal := TRUE, on = c(“eid” = “V1”)] cohort[is.na(withdrawal), withdrawal := FALSE]
any ischaemic ICD codes
ischaemic_cols <- c(“myocardial_infarction”, “coronary_artery_bypass_grafting”, “percutaneous_coronary_intervention”, “thrombolysis_coronary”, “ischaemic_cardiomyopathy”) cohort[, ischaemic := rowSums(.SD) > 0, .SDcols = ischaemic_cols] cohort[, ischaemic_first_date := do.call(pmin, c(.SD, na.rm = TRUE)), .SDcols = paste0(ischaemic_cols, “_first_date”)]
combined NICM ICD codes
nicm_cols <- c(“dilated_cardiomyopathy”, “dilated_cardiomyopathy_associated_with”, “left_ventricular_systolic_dysfunction”) cohort[, nicm_comb := rowSums(.SD) > 0, .SDcols = nicm_cols] cohort[, nicm_comb_first_date := do.call(pmin, c(.SD, na.rm = TRUE)), .SDcols = paste0(nicm_cols, “_first_date”)]
any ischaemic self reported codes
self_isch_cols <- c(“myocardial_infarction_self_reported”, “coronary_artery_bypass_grafting_self_reported”, “percutaneous_coronary_intervention_self_reported”) cohort[, self_isch := rowSums(.SD) > 0, .SDcols = self_isch_cols] cohort[, self_isch_first_date := do.call(pmin, c(.SD, na.rm = TRUE)), .SDcols = paste0(self_isch_cols, “_first_date”)]
any self reported HF codes
self_hf_col = c(“heart_failure_self_reported”) cohort[, self_hf := rowSums(.SD) > 0, .SDcols = self_hf_col] cohort[, self_hf_first_date := do.call(pmin, c(.SD, na.rm = TRUE)), .SDcols = paste0(self_hf_col, “_first_date”)]
any self reported HCM codes
self_hcm_col = c(“hypertrophic_cardiomyopathy_self_reported”) cohort[, self_hcm := rowSums(.SD) > 0, .SDcols = self_hcm_col] cohort[, self_hcm_first_date := do.call(pmin, c(.SD, na.rm = TRUE)), .SDcols = paste0(self_hcm_col, “_first_date”)]
HF exclusions
cohort[, hf_exclude := withdrawal == TRUE | congenital_heart_disease==TRUE | # all congenital heart disease (heart_failure==FALSE & (self_hf==TRUE | ischaemic==TRUE | self_isch==TRUE)) | # non-HF but with ischaemic ICD history or self reported heart failure or ischaemic history (heart_failure==TRUE & (ischaemic==TRUE & ischaemic_first_date > heart_failure_first_date))] # HF but with first ischaemic event after the HF diagnosis # pheno 1 cohort[, pheno1 := congenital_heart_disease==FALSE & heart_failure==TRUE]
HF controls
cohort[, hf_control := hf_exclude==FALSE & pheno1==FALSE & pheno2==FALSE & pheno3==FALSE]
DCM exclusions
cohort[, cm_exclude := withdrawal == TRUE | congenital_heart_disease==TRUE | # all congenital heart disease hypertrophic_cardiomyopathy==TRUE | self_hcm==TRUE | # all HCM restrictive_cardiomyopathy==TRUE] # all RCM
pheno 4
cohort[, pheno4 := cm_exclude==FALSE & !(dilated_cardiomyopathy==TRUE & (ischaemic==TRUE & ischaemic_first_date <= dilated_cardiomyopathy_first_date)) & # DCM but with first ischaemic event prior to the DCM diagnosis dilated_cardiomyopathy==TRUE]
pheno 5
cohort[, pheno5 := cm_exclude==FALSE & !(nicm_comb==TRUE & (ischaemic==TRUE & ischaemic_first_date <= nicm_comb_first_date)) & # NICM but with first ischaemic event prior to the NICM diagnosis nicm_comb==TRUE]
CM controls
cohort[, cm_control := cm_exclude==FALSE & pheno4==FALSE & pheno5==FALSE & ischaemic==FALSE & self_isch==FALSE]
check HF phenotyping
base_cols <- c(“eid”, “age”, “sex”, “ethnicity”, “ethnicity_group”,“genetic_sex”, “genetic_ethnicity”, paste0(“pc”,1:12)) sum_cols <- names(cohort)[!names(cohort) %in% base_cols & !grepl(“date”, names(cohort))] summary <- data.table (name = c(“total”, sum_cols), sex = “all”, N = c(nrow(cohort), cohort[, .(sapply(.SD, sum)), .SDcols = sum_cols]$V1)) summary <- rbind(summary, data.table(name = rep(sum_cols, 2), cohort[, .(N = sapply(.SD, sum)), .SDcols = sum_cols, by = “sex”]), fill=TRUE) summary
write summary
fwrite(dcast(summary, name ~ sex, value.var = “N”), file = file.path(Sys.getenv(“DATA_DIR”), “ukbb_81499_20241114”, “hermes3_phenotype_summary.tsv”), sep = “)
cat(“Total =”, summary[name==“total”,N], “; Sum (exc/crtl/1) =”, sum(summary[name%in%c(“hf_exclude”,paste0(“pheno1”),“hf_control”), N]), “”) cat(“Total =”, summary[name==“total”,N], “; Sum (exc/crtl/2/3) =”, sum(summary[name%in%c(“hf_exclude”,paste0(“pheno”,2:3),“hf_control”), N]), “”) cat(“Total =”, summary[name==“total”,N], “; Sum (exc/crtl/5) =”, sum(summary[name%in%c(“cm_exclude”,paste0(“pheno5”),“cm_control”), N]), “”)
write out
fwrite(cohort[, mget(c(base_cols[base_cols != “eid”], paste0(“pheno”, 1:3), “hf_exclude”, “hf_control”, paste0(“pheno”, 4:5), “cm_exclude”, “cm_control”))], file = file.path(Sys.getenv(“DATA_DIR”), “ukbb_81499_20241114”, “hermes3_phenotypes.tsv.gz”), sep = “)
random test samples
set.seed(123) cohort[hf_exclude==T][sample.int(.N, 3), eid] # 5607008-Y 3463412-Y 3399308-Y set.seed(123) cohort[hf_control==T][sample.int(.N, 3), eid] # 5617489-Y 1398510-Y 3387157-Y set.seed(123) cohort[pheno1==T][sample.int(.N, 3), eid] # 5087228-Y 3558314-Y 5669956-Y set.seed(123) cohort[pheno2==T][sample.int(.N, 3), eid] # 4964821-Y 5145334-Y 4110018-Y set.seed(123) cohort[pheno3==T][sample.int(.N, 3), eid] # 1535528-Y 1660078-Y 4720288-Y set.seed(123) cohort[pheno4==T][sample.int(.N, 3), eid] # 4547830-Y 5543541-Y 4762107-Y set.seed(123) cohort[pheno5==T][sample.int(.N, 3), eid] # 2469121-Y 2659022-Y 1340307-Y set.seed(123) cohort[cm_control==T][sample.int(.N, 3), eid] # 5335780-Y 1276673-Y 3304835-Y set.seed(123) cohort[cm_exclude==T][sample.int(.N, 3), eid] # 2946309-Y 4951293-Y 4815931-Y