<- paste0("/Users/MarcChoisy/Library/CloudStorage/",
data_path "OneDrive-OxfordUniversityClinicalResearchUnit/",
"GitHub/choisy/typhoid/")
Clinical score
Synopsis
1 discretizing the continuous variables: here it’s the temperature and the heart rate. To start with, let’s binarize it (but latter on we could run a random forest algorithm in order to the look for the relationship and define an optimal number of classes to discretize this variable)
2 run an ElasticNet logistic regression: for a given discretization choice made on the continuous variables, run an ElasticNet logistic regression, tuning the penalty and mixture hyper-parameters by 10-fold cross-validation of the training data set.
Parameters
The path to the data folder:
Packages
The required packages:
<- c("dplyr", "purrr", "stringr", "rsample", "glmnet", "glmnetUtils",
required_packages "parallel", "magrittr", "tidyr")
Making sure that the required packages are installed:
<- required_packages[! required_packages %in% installed.packages()[, "Package"]]
to_inst if (length(to_inst)) install.packages(to_inst)
rm(required_packages, to_inst)
Loading some of these packages:
library(dplyr)
library(purrr)
library(rsample)
library(glmnet)
library(glmnetUtils)
library(tidyr)
Utilitary functions
A function that computes the moving average of a vector x
with a window w
:
<- function(x, w = 20) {
moving_average |>
x seq_along() |>
map(~ tail(c(0, x), -.x)) |>
head(-w + 1) |>
map(head, w) |>
map_dbl(mean)
}
A function that computes the specificity from a confusion matrix x
:
<- function(x) {
specificity 1, 1] / sum(x[1, ])
x[ }
A function that computes the sensitivity from a confusion matrix x
:
<- function(x) {
sensitivity 2, 2] / sum(x[2, ])
x[ }
A function that computes the accuracy from a confusion matrix x
:
<- function(x) {
accuracy sum(diag(x)) / sum(x)
}
A function that computes Youden’s J index from a confusion matrix x
:
<- function(x) {
j_index specificity(x) + sensitivity(x) - 1
}
A function that converts a named vector x
of coefficient values into a named vector of points of a clinical score, where n
is the number of number of bins used over the range of x
:
<- function(x, n) {
coef2scores <- min(x)
xmin <- max(x)
xmax <- min(diff(seq(xmin, xmax, le = n)))
stepv <- seq(xmin - stepv, xmax + stepv, le = n)
ubrks <- ubrks - ubrks[which.min(abs(ubrks))]
cbrks setNames(sort(c(-(1:sum(cbrks < 0)), 1:sum(cbrks > 0)))[as.integer(cut(x, cbrks))],
::str_remove(names(x), "TRUE")) |>
stringrreplace_na(0)
}
A function that computes the points from a data frame df
of binary valued variables and a vector cscore
of clinical scores. The names of the data frame and the vector should be the same, although not necessarily in the same order:
<- function(df, cscore) {
make_points colSums(t(as.matrix(df)) * cscore[names(df)])
}
A function that splits a data frame x
by rows into a list of 1-row data frames:
<- function(x) {
df2list_rowwise map(1:nrow(x), ~ slice(x, .x))
}
A function that generates the values of thresholds on a x
vector that should be tested, where by
is the step by which these values should be generated:
<- function(x, by = 1) {
make_thresholds seq(min2(x) + by, max2(x) - by, by)
}
A wrapper around rsample
’s bootstraps()
function that returns the actual data frames instead of the row indexes:
<- function(data, ...) {
bootstraps2 <- bootstraps(data, ...)
x map(x$splits, ~ data[.$in_id, ])
}
A function that converts a logical vector x
into a factor, making sure that it’s a factor of 2 levels, TRUE
and FALSE
, even if the x
contains only one of these levels:
<- function(x) {
log2fac factor(x, levels = c("FALSE", "TRUE"))
}
The following function tunes the penalty \(\lambda\) and mixture \(\alpha\) hyper-parameters of a logistic ElasticNet regression and returns the tuned values:
<- function(formula, data) {
tune_model <- glmnetUtils::cva.glmnet(formula, data, family = binomial)
model <- map(model$modlist, ~ as_tibble(.x[1:2]))
lambdas <- rep(model$alpha, map_int(lambdas, nrow))
alphas <- bind_rows(lambdas)
the_grid $alpha <- alphas
the_grid|>
the_grid filter(cvm == min(cvm)) |>
select(lambda, alpha) |>
unlist()
}
For each element of a given vector x
, the following two functions return the minimum (maximum) of this element and 0:
<- function(x) sum(map_int(x, min, 0))
min2 <- function(x) sum(map_int(x, max, 0)) max2
A tuned version of parallel::mclapply()
:
<- function(...) {
mclapply2 ::mclapply(..., mc.cores = parallel::detectCores() - 1)
parallel }
A version of bind_cols()
that works on two data frames, switching their order in the binding process:
<- function(x, y) bind_cols(y, x) bind_cols2
A tuned version of the plot()
function:
<- function(...) plot(..., col = 4) plot2
A function that adds an order
variable to a data frame x
according to a variable v
:
<- function(x) {
add_position0 |>
x arrange(desc(j)) |>
mutate(order = row_number())
}
<- function(x, v) {
add_position |>
x arrange(desc({{ v }})) |>
mutate(order = row_number())
}
Reading the clean data
The Nepal data set:
<- readRDS(paste0(data_path, "clean_data/nepal.rds")) nepal_raw
Feature engineering
Recoding durations (in number of days) into presence / absence and remove observations with missing values:
<- nepal_raw |>
nepal_recoded mutate(across(c(Cough, Diarrhoea, vomiting, Abdopain, Constipation, Headache,
~ .x > 0)) |>
Anorexia, Nausea, Typhoid_IgM), na.exclude()
The subset of symptoms variables:
<- select(nepal_recoded, BloodCSResult, Cough:Hepatomegaly) nepal_symptoms
Let’s have a look at the binary variables:
<- function(x) {
make_tables |>
x pivot_longer(everything(), names_to = "variable", values_to = "value") |>
group_by(variable) |>
group_modify(~ as_tibble(table(.x))) |>
pivot_wider(names_from = value, values_from = n)
}
|>
nepal_symptoms select(where(is.logical)) |>
make_tables() |>
arrange(`TRUE`)
# A tibble: 11 × 3
# Groups: variable [11]
variable `FALSE` `TRUE`
<chr> <int> <int>
1 Splenomegaly 570 21
2 Hepatomegaly 559 32
3 Constipation 516 75
4 Diarrhoea 450 141
5 vomiting 424 167
6 BloodCSResult 421 170
7 Abdopain 401 190
8 Cough 339 252
9 Nausea 318 273
10 Anorexia 153 438
11 Headache 80 511
An alternative way to look at them:
|>
nepal_symptoms select(where(is.logical)) |>
group_by(BloodCSResult) |>
group_modify(~ make_tables(.x)) |>
select(variable, everything()) |>
arrange(variable, BloodCSResult)
# A tibble: 20 × 4
# Groups: BloodCSResult [2]
variable BloodCSResult `FALSE` `TRUE`
<chr> <lgl> <int> <int>
1 Abdopain FALSE 298 123
2 Abdopain TRUE 103 67
3 Anorexia FALSE 115 306
4 Anorexia TRUE 38 132
5 Constipation FALSE 370 51
6 Constipation TRUE 146 24
7 Cough FALSE 228 193
8 Cough TRUE 111 59
9 Diarrhoea FALSE 341 80
10 Diarrhoea TRUE 109 61
11 Headache FALSE 66 355
12 Headache TRUE 14 156
13 Hepatomegaly FALSE 399 22
14 Hepatomegaly TRUE 160 10
15 Nausea FALSE 236 185
16 Nausea TRUE 82 88
17 Splenomegaly FALSE 406 15
18 Splenomegaly TRUE 164 6
19 vomiting FALSE 310 111
20 vomiting TRUE 114 56
A function that makes mirrored histograms to compare histograms of a continuous variable in case of blood culture positive (red) and negative (blue):
<- function(x, xlab, ymax, epsilon = .18) {
mirror_hist <- pull(nepal_symptoms, {{ x }})
variable
<- function(...) {
hist2 hist(...,
freq = FALSE,
breaks = (min(variable) - .5):(max(variable) + .5),
ylab = "density",
main = NA)
}
<- par(mfrow = 2:1)
opar <- par("plt")
plt 3:4] <- c(0, 1 - epsilon)
plt[par(plt = plt)
|>
nepal_symptoms filter(BloodCSResult) |>
pull({{ x }}) |>
hist2(ylim = c(0, ymax), xlab = NA, axes = FALSE, col = 2)
axis(2); axis(3)
mtext(xlab, line = 1.5)
3:4] <- c(epsilon, 1)
plt[par(plt = plt)
|>
nepal_symptoms filter(! BloodCSResult) |>
pull({{ x }}) |>
hist2(ylim = c(ymax, 0), xlab = xlab, col = 4)
par(opar)
}
Let’s now look at the continuous variables:
mirror_hist(Pulse, "pulse", .15)
mirror_hist(OralTemperature, "temperature", .45)
Another way to look at the effect of a continuous on the blood test result, with this function that plots a moving average of the probability of positive result of a variable x
of the the data frame data
as a function of a moving average of the BloodCSResult
variable of this same data frame, with w
the window parameter of these moving averages:
<- function(x, w, data) {
plot_smooth order(data[[x]]), ] |>
data[select({{ x }}, BloodCSResult) |>
map_dfc(moving_average, w) |>
plot2()
}
Let’s have a look at the effect of pulse and temperature:
plot_smooth("OralTemperature", 120, nepal_symptoms)
plot_smooth("Pulse", 100, nepal_symptoms)
Specific functions
This function recodes the OralTemperature
and Pulse
variable of the data set x
according to the thresholds temp
and pulse
respectively:
<- function(x, temp, pulse) {
recoding mutate(x, across(OralTemperature, ~ .x > temp),
across(Pulse, ~ .x > pulse))
}
This function (i) recodes the OralTemperature
and Pulse
variables of the data set x
according to the thresholds temp
and pulse
respectively, (ii) transforms the explanatory variables (all of them binary now) into a variable of points according to the score
(iii) converts these points into a prediction according to threshold
that (iv) is used together with the corresponding observed values to produce a confusion matrix:
<- function(data, temp, pulse, score, threshold) {
confusion_matrix tibble(obsv = log2fac(data$BloodCSResult),
pred = log2fac(make_points(
recoding(data, temp, pulse)[, -1], score) > threshold)) |>
table()
}
This function converts a vector of thresholds
values into a list of confusion matrices generated by the function confusion_matrix
on the data set data, with parameters
tempand
pulseand the score
score`:
<- function(data, temp, pulse, score, thresholds) {
matrix_from_threshold map(thresholds, confusion_matrix,
data = data, temp = temp, pulse = pulse, score = score)
}
For each value of the vector thresholds
of decision-making threshold values, this function computes the average Youden J index averaged across bootstrapped versions of the data set data
, where model predictions are made based on the temp
and pulse
threshold for data recoding and the clinical score score
:
<- function(data, temp, pulse, score, thresholds, ...) {
find_best_threshold |>
data bootstraps2(...) |>
map(matrix_from_threshold, temp = temp, pulse = pulse, score = score,
thresholds = thresholds) |>
map(map, j_index) |>
unlist() |>
matrix(length(thresholds)) |>
rowMeans() |>
setNames(thresholds)
}
For a given row rowX
of a data frame that contains values of temp
and pulse
as well corresponding tuned coefficients of the logistic ElasticNet model, this function returns the tuned n
and t
hyper-parameters as well as the corresponding value of the Youden J index. n_vals
is the vector of values of the n
parameter of the function coef2scores()
to try, by
is the parameter passed to the make_thresholds()
function to generate the decision-making thresholds values to try, and data
is the data frame used to do the computation:
<- function(rowX, n_vals, data, by = 1) {
n_t_j <- n_vals |>
j_values map(coef2scores, x = unlist(rowX[["coeffs"]])) |>
map(~ find_best_threshold(data, rowX$temp, rowX$pulse, .x,
make_thresholds(.x, by))) |>
map(~ .x[which.max(.x)])
<- which.max(j_values)
which_n tibble(n = n_vals[which_n],
t = as.integer(names(j_values[[which_n]])),
j = max(unlist(j_values)))
}
This function returns the coefficient of a tuned ElasticNet logistic regression explaining the BloodCSResult
variable of the data frame x
where the OralTemperature
and Pulse
variable are recoding using the temp
and pulse
threshold values:
<- function(x, temp, pulse) {
tuned_coefficients <- recoding(x, temp, pulse)
x_rec <- BloodCSResult ~ .
frmla <- tune_model(frmla, x_rec)
hyper <- glmnet(model.matrix(frmla, x_rec)[, -1], as.factor(x_rec$BloodCSResult),
model alpha = hyper["alpha"], lambda = hyper["lambda"])
binomial, <- coef(model)[-1, ]
out setNames(out, stringr::str_remove(names(out), "TRUE"))
}
Spliting the data
Let’s create the train and test data sets:
<- initial_split(nepal_symptoms)
data_split <- training(data_split)
train_data <- testing(data_split) test_data
Model training
In series
The pipeline of the inner loop (i.e. tuning hyper-parameters n
and t
on data frame data
). The inner loop results are added to the outer loop results x
. Tried values of n
are provided directly whereas tried values of t
and generated in the loop (as they depends on the values of n
) and the process is controled by the parameter by
.
<- function(x, n, data, by = 1) {
add_ntJ |>
x df2list_rowwise() |>
map_dfr(n_t_j, n, data, by) |>
bind_cols2(x)
}
The full pipeline with the outer loop (hyper-parameters temp
and pulse
) and the inner loop (hyper-parameters n
and t
(whose exploration is controled by by
)):
<- function(data, temp, pulse, n, by = 1) {
serial_pipeline expand.grid(temp = temp, pulse = pulse) |>
as_tibble() |>
mutate(coeffs = map2(temp, pulse, tuned_coefficients, x = data)) |>
add_ntJ(n, data, by)
}
Takes 74’:
<- serial_pipeline(train_data, seq(35, 42, .5), seq(60, 130, 10), 3:15, 1) out_serial
In parallel
<- function(data, temp, pulse, n, by = 1) {
parallel_pipeline # initial grid:
<- expand.grid(temp = temp, pulse = pulse)
grid
# adding outer loop:
<- grid |>
grid as_tibble() |>
mutate(coeffs = mclapply2(1:nrow(grid),
function(x) tuned_coefficients(data,
$temp[x],
grid$pulse[x])))
grid
# adding inner loop:
|>
grid df2list_rowwise() |>
mclapply2(n_t_j, n_vals = n, data = data, by = by) |>
bind_rows() |>
bind_cols2(grid)
}
Takes 18’:
<- parallel_pipeline(train_data,
out_parallel seq(35, 42, .5), seq(60, 130, 10), 3:15, 1)
Processing the outputs
filter(out_serial, j == max(j))
# A tibble: 1 × 6
temp pulse coeffs n t j
<dbl> <dbl> <list> <int> <int> <dbl>
1 36.5 130 <dbl [12]> 7 3 0.297
filter(out_parallel, j == max(j))
# A tibble: 1 × 6
temp pulse coeffs n t j
<dbl> <dbl> <list> <int> <int> <dbl>
1 35.5 120 <dbl [12]> 12 6 0.306
Let’s compare the first best models:
arrange(out_serial, desc(j))
# A tibble: 120 × 6
temp pulse coeffs n t j
<dbl> <dbl> <list> <int> <int> <dbl>
1 36.5 130 <dbl [12]> 7 3 0.297
2 39.5 110 <dbl [12]> 14 7 0.297
3 37.5 60 <dbl [12]> 13 2 0.296
4 42 120 <dbl [12]> 12 7 0.295
5 41 120 <dbl [12]> 12 7 0.294
6 38 110 <dbl [12]> 13 6 0.293
7 39 120 <dbl [12]> 10 8 0.291
8 36.5 70 <dbl [12]> 15 2 0.291
9 37 80 <dbl [12]> 10 0 0.291
10 39 80 <dbl [12]> 15 8 0.290
# ℹ 110 more rows
arrange(out_parallel, desc(j))
# A tibble: 120 × 6
temp pulse coeffs n t j
<dbl> <dbl> <list> <int> <int> <dbl>
1 35.5 120 <dbl [12]> 12 6 0.306
2 36 120 <dbl [12]> 13 8 0.305
3 41.5 130 <dbl [12]> 13 6 0.292
4 37.5 120 <dbl [12]> 13 6 0.292
5 40.5 70 <dbl [12]> 11 4 0.291
6 41.5 80 <dbl [12]> 14 5 0.291
7 35.5 70 <dbl [12]> 13 4 0.290
8 36 80 <dbl [12]> 14 5 0.290
9 38.5 120 <dbl [12]> 12 9 0.289
10 36 130 <dbl [12]> 7 3 0.288
# ℹ 110 more rows
<- bind_rows(mutate(add_position(out_serial, j), batch = "serial"),
out_all mutate(add_position(out_parallel, j), batch = "parallel"))
with(out_all, plot(order, j, col = c(serial = 4, parallel = 2)[batch]))
This shows that there is still a bit of variability.
Model testing
<- filter(out_serial, j == max(j))
best_model_serial <- filter(out_parallel, j == max(j)) best_model_parallel
<- function(x) {
extract_score with(x, list(tempt = temp,
pulse = pulse,
score = coef2scores(first(coeffs), n),
thres = x$t))
}
<- extract_score(best_model_serial)
model_serial <- extract_score(best_model_parallel) model_parallel
<- function(data, model) {
make_predictions |>
data recoding(model$tempt, model$pulse) |>
select(-BloodCSResult) |>
make_points(model$score) |>
::is_greater_than(model$thres)
magrittr }
make_predictions(test_data, model_serial)
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
[13] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
[25] FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
[37] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
[49] TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
[61] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
[73] FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
[85] FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE
[97] FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
[109] FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE
[121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE
[133] TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[145] TRUE TRUE TRUE TRUE
make_predictions(test_data, model_parallel)
[1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
[13] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
[25] FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
[37] FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
[49] TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
[61] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
[73] FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
[85] FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE
[97] FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
[109] FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE
[121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE
[133] TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[145] TRUE TRUE TRUE TRUE
<- function(data, model) {
confmat_test |>
data mutate(predictions = make_predictions(data, model)) |>
select(BloodCSResult, predictions) |>
mutate_all(log2fac) |>
table()
}
<- confmat_test(test_data, model_serial)
confmat_serial <- confmat_test(test_data, model_parallel) confmat_parallel
accuracy(confmat_serial)
[1] 0.6554054
sensitivity(confmat_serial)
[1] 0.5686275
specificity(confmat_serial)
[1] 0.7010309
j_index(confmat_serial)
[1] 0.2696584
accuracy(confmat_parallel)
[1] 0.6554054
sensitivity(confmat_parallel)
[1] 0.5882353
specificity(confmat_parallel)
[1] 0.6907216
j_index(confmat_parallel)
[1] 0.2789569