if (grepl("arc|hoisy", Sys.getenv("USER"))) {
path2data <- paste0("~/Library/CloudStorage/OneDrive-OxfordUniversityClinical",
"ResearchUnit/GitHub/choisy/measles serology/")
}Measles serology
1 Constants
The path to the data folder:
Some labels definitions:
titer_lab <- "titer (IU/L)"
age_lab <- "age (years)"
col_female <- rgb(247, 114, 184, max = 255)
col_male <- rgb(62, 146, 247, max = 255)2 Packages
Required packages:
required <- c("sf", "readxl", "stringr", "stringi", "purrr", "tidyr", "magrittr",
"RColorBrewer", "lubridate", "mgcv", "forcats", "dplyr")Installing those that are not installed yet:
to_install <- required[! required %in% installed.packages()[, "Package"]]
if (length(to_install)) install.packages(to_install)Loading the packages for interactive use:
invisible(sapply(required, library, character.only = TRUE))3 Functions
A function that generates names of a vector of character strings from its content:
set_names2 <- function(x) {
  # Name each element of x by the trailing part of its value: drop
  # everything up to the last "_", then up to the last "-", then the
  # ".xlsx" extension.
  # Bug fix: the dot is now escaped — ".xlsx$" (unescaped) would also
  # strip e.g. "axlsx" at the end of a name.
  setNames(x, x |>
             str_remove("^.*_") |>
             str_remove("^.*-") |>
             str_remove("\\.xlsx$"))
}
# A tuning of the text() function that includes white background to the text:
# text() with an opaque rectangle drawn behind each label so the text
# stays readable on top of busy plots.
#
# @param x,y        label positions (user coordinates)
# @param labels     text to draw
# @param cex        character expansion factor
# @param bg         background colour of the rectangle
# @param padx,pady  extra padding around the text, in user coordinates
#                   (NOTE(review): padx = 10 assumes a wide x scale such
#                   as dates — confirm before reusing on other scales)
# @param ...        forwarded to text()
text2 <- function(x, y, labels, cex = 1, bg = "white", padx = 10, pady = .1, ...) {
  w <- strwidth(labels, cex = cex)
  h <- strheight(labels, cex = cex)
  rect(x - w / 2 - padx, y - h / 2 - pady / 2, x + w / 2 + padx, y + h / 2 + pady / 2,
       col = bg, border = NA)
  text(x, y, labels, cex = cex, ...)
}Tuning of the rgb() function:
rgb2 <- function(...) rgb(..., maxColorValue = 255)Tunings of the plot() function:
# plot() without axes.
plot2 <- function(...) plot(..., axes = FALSE)
# ... additionally without axis/title annotation.
plot3 <- function(...) plot2(..., ann = FALSE)
# ... additionally without any content drawn (empty canvas).
plot4 <- function(...) plot3(..., type = "n")
# Empty plot frame for the titer density figures; uses the global
# titer_lab label defined in the constants section.
plot_titers_empty <- function(...) {
  plot2(..., type = "n", xlab = titer_lab, ylab = "density")
}A tuning of the pie() function:
pie2 <- function(x, ...) if (all(x == 0)) plot4(1) else pie(x, ...)A function that pads a data frame of coordinates:
# Pad a 2-column (x, y) data frame of density coordinates so that the
# curve closes down to y = 0 at both ends (for polygon filling).
pad <- function(x) {
  # Bug fix: the original called arrange(x, 1), which sorts by the
  # constant 1 and therefore does not sort at all; sort by the first
  # (x-coordinate) column instead.
  x <- x[order(x[[1]]), ]
  xs <- x[[1]]
  tibble(c(first(xs), xs, last(xs)), c(0, x[[2]], 0)) |>
    setNames(names(x)[1:2])
}
# Some tunings of the abline() function:
# abline() in light grey / grey, for background grid lines.
abline2 <- function(...) abline(..., col = "lightgrey")
abline3 <- function(...) abline(..., col = "grey")Some tunings of the mgcv::gam() function:
# mgcv::gam() with REML smoothness selection.
gam2 <- function(...) mgcv::gam(..., method = "REML")
# Binomial-family variant. Robustness fix: data is now passed by name so
# that it can never be positionally matched to gam()'s second (family)
# formal argument.
gam3 <- function(formula, data, ...) gam2(formula, data = data, family = binomial, ...)
# A function that retrieves the REML from a GAM model:
get_reml <- function(x) as.numeric(x$gcv.ubre)A function that retrieves the percentage of deviance explained from a GAM model:
dev_explained <- function(x) summary(x)$dev.explA tuning of the segments() function:
segments2 <- function(...) segments(..., col = 2, lwd = 4)A function that extends the tally() function by adding a column of the corresponding proportions:
# tally() plus a column of the corresponding percentages.
tally2 <- function(x) {
  x |>
    tally() |>
    mutate(prop = 100 * n / sum(n))
}A tuning of the arrows() function:
# arrows() with flat heads at both ends (error bars).
arrows2 <- function(...) arrows(..., angle = 90, code = 3)A tuning of the points() function:
# points() with filled circles.
points2 <- function(...) points(..., pch = 19)A tuning of the par() function:
# par() with the text magnification pinned to 1.
par2 <- function(...) par(..., cex = 1)A function that adds the \(y\)-axis on a log scale:
# Draw an axis on a decimal log scale: unlabeled minor ticks at
# 1..10 x 10^k for each power k in po10, and labeled major ticks at the
# powers of ten themselves.
logaxis <- function(side, po10 = 1:5) {
  axis(side, log10(unlist(map(10^(po10), ~ .x * 1:10))), NA, tcl = - .25)
  axis(side, po10, 10^po10)
}A tuning of the binom.test() function:
# binom.test() that tolerates empty strata: with n == 0 an exact test is
# impossible, so return NA placeholders for the estimate and its
# confidence interval instead of erroring.
binom_test2 <- function(x, n, ...) {
  if (n < 1) {
    return(list(estimate = NA, conf.int = rep(NA, 2)))
  }
  binom.test(x, n, ...)
}
# A tuning of the predict() function:
predict2 <- function(...) mgcv::predict.gam(..., type = "response")A tuning of the predict2() function:
# Predict on new data and bind the predictions next to the grid.
# Bug fix: the se argument was ignored — the inner call hard-coded
# se = .95; it now forwards se.
# NOTE(review): mgcv::predict.gam()'s matching parameter is se.fit, a
# logical; a numeric value such as .95 merely switches standard errors
# on — confirm the intended semantics of se.
predict4 <- function(object, newdata, se = .95, ...) {
  # Coerce list input (e.g. list(age = ...)) into a data frame.
  if (! is.data.frame(newdata)) newdata <- as_tibble(newdata)
  bind_cols(newdata, as_tibble(predict2(object, newdata, se = se, ...)))
}
# The following function takes a vector v of densities of probabilities as an input and returns a threshold value above which the values of v belong to the p confidence interval:
# Given a vector v of probability densities, return the threshold above
# which the values of v carry a fraction p of the total mass (used to
# build highest-density intervals).
ci2th <- function(v, p) {
  denominator <- sum(v)
  # Fraction of the total mass lying strictly above threshold th.
  f <- function(th) sum(v[v > th]) / denominator
  # Find the threshold whose mass-above fraction is closest to p.
  # Fix: $minimum spelled out in full ($min relied on partial matching).
  optimize(function(y) abs(f(y) - p), range(v))$minimum
}
# A tuning of the cut() function:
# cut() that tolerates all-NA input: cut() would error on it, so the
# vector is passed through untouched instead.
cut2 <- function(x, ...) {
  if (all(is.na(x))) {
    return(x)
  }
  cut(x, ...)
}
# The following function converts a vector x of densities of probabilities into a vector of quantile belonging where the quantiles are defined by the vector breaks:
# Convert a vector x of probability densities into HDI category indices:
# category i means the value lies inside the i-th confidence band, the
# band thresholds being computed from breaks via ci2th().
quantval2quantcat <- function(x, breaks = seq(.95, .05, -.05)) {
  as.numeric(cut2(x, c(0, map_dbl(breaks, ci2th, v = x), max(x))))
}A function that converts Vietnamese characters into Latin characters:
convert <- function(x) stringi::stri_trans_general(x, "Latin-ASCII")A function that converts the names of a data frame into Latin ASCII:
# Replace the Vietnamese diacritics of a data frame's column names by
# their plain ASCII equivalents.
latin_var_names <- function(df) {
  setNames(df, convert(names(df)))
}
# A tuning of the lubridate::as_date function:
as_date_excel <- function(...) lubridate::as_date(..., origin = "1899-12-30")A function that converts a date from number of days in character string into an R date object, using Excel’s origin of time:
convert_date <- function(x) x |> as.numeric() |> as_date_excel()A tuning of the readxl::read_excel() function:
read_excel2 <- function(...) readxl::read_excel(..., col_types = "text")Tunings of the min() and max() functions:
# min() / max() shorthands that ignore missing values.
min2 <- function(...) min(..., na.rm = TRUE)
max2 <- function(...) max(..., na.rm = TRUE)
# A function that replaces all the NA of a data frame by 0:
replace_na2 <- function(x) mutate(x, across(everything(), ~ replace_na(.x, 0)))A function that plots an sf object using one of its columns to define the colors:
# Plot the geometry of an sf object, coloured by one of its columns
# (col is the column name, as a string).
plot_sf2 <- function(sf, col) {
  sf |>
    st_geometry() |>
    plot(col = sf[[col]])
}A function that adds the column-wise sum of the numeric variables at the bottom of a data frame:
# Append one row holding the column sums (NA for non-numeric columns).
add_colsum <- function(df) {
  # if/else instead of ifelse(): the condition is a scalar and ifelse()
  # would needlessly vectorize and strip attributes.
  bind_rows(df, map_dfr(df, function(x) if (is.numeric(x)) sum(x) else NA))
}
# A tuning of the polygon() function for densities:
# Filled density polygon: close the curve down to y = 0 at both ends.
polygon2 <- function(x, y, col, ...) {
  polygon(c(first(x), x, last(x)), c(0, y, 0), col = col, border = col, ...)
}A tuning of the polygon() function for bands:
# Confidence band between a lower (l) and an upper (u) curve.
polygon3 <- function(x, l, u, ...) {
  polygon(c(x, rev(x)), c(l, rev(u)), border = NA, ...)
}A tuning of hcl.colors():
yellow_red <- function(n) hcl.colors(n, "YlOrRd", rev = TRUE)A tuning of the str_split() function:
str_split2 <- function(...) first(stringr::str_split(...))A tuning of the density() function:
# Kernel density of x evaluated on a fixed grid of n points from `from`
# to `to`; with fewer than 2 observations a density cannot be estimated,
# so a vector of NAs of the right length is returned instead.
density2 <- function(x, from, to, n, ...) {
  # Fix: NA_real_ keeps the return type numeric, consistent with the
  # density()$y branch (rep(NA, n) returned a logical vector).
  if (length(x) < 2) return(rep(NA_real_, n))
  density(x, from = from, to = to, n = n, ...)$y
}
# 4 Data
4.1 Functions
A hash table to recode sex:
# Lookup table mapping raw sex codes (Vietnamese and English spellings)
# to "female" / "male".
hash_sex <- c(f   = "female", nu = "female", nnu = "female", nuu = "female",
              m   = "male",   n  = "male",   nam = "male")
# A piece of code common to all the sample collections:
# Cleaning steps shared by all sample collections: drop all-NA rows,
# recode sex through hash_sex, split the ID column into sample ID /
# collection date / plate, and parse titer as numeric.
# NOTE(review): filter_out() is not part of the attached packages listed
# above — presumably defined elsewhere in the project; verify.
common_data_processing <- function(x) {
  x |>
    filter_out(if_all(everything(), is.na)) |>
    mutate(across(sex, ~ hash_sex[tolower(convert(.x))])) |>
    separate_wider_delim(ID, regex("\\s+"), names = c("sID", "cdate", "plate")) |>
    mutate(across(titer, as.double))
}A function that generates age groups from age:
make_age_group <- function(x) cut(x, c(0:20, seq(25, 70, 5), 100), right = FALSE)4.2 Data
4.2.1 South
Loading all the excel files of the data folder:
the3files <- paste0(path2data, "south") |>
list.files(full.names = TRUE) |>
set_names2() |>
map(read_excel2, "Marc")The first collection:
first_collection <- the3files$DEC2019 |>
common_data_processing() |>
mutate(date_chr = if_else( grepl("[./]", date_collection), date_collection, NA),
date_num = if_else(!grepl("[./]", date_collection), date_collection, NA),
across(date_num, ~ .x |> as.numeric() |> as_date_excel()),
across(date_chr, dmy),
collection_date = coalesce(date_num, date_chr)) |>
select(participant, hospital, collection, sample_ID, age_group, age, sex,
collection_date, plate, titer, conclusion)Combining the collections:
south <- the3files |>
extract(c("Dec2022", "Apr24")) |>
map(select, -date_collection) |>
bind_rows() |>
common_data_processing() |>
mutate(collection_date = ymd(paste(year_collection, month_collection, day_collection,
sep = "-"))) |>
select(participant, province, hospital, site, collection, sample_ID, age_group, age,
sex, collection_date, plate, titer, conclusion) |>
bind_rows(first_collection) |>
mutate(province = ifelse(is.na(province), hospital, province),
across(province, ~ recode_values(.x, "AG" ~ "An Giang",
"BD" ~ "Binh Dinh",
"DL" ~ "Dak Lak",
"DT" ~ "Dong Thap",
"HC" ~ "HCMC",
"HU" ~ "Hue",
"KG" ~ "Kien Giang",
"KH" ~ "Khanh Hoa",
"QN" ~ "Quang Ngai",
"ST" ~ "Soc Trang",
"Daklak" ~ "Dak Lak",
"HUE" ~ "Hue",
"HCM" ~ "HCMC", default = .x)),
year = year(collection_date),
time_point = recode_values(year, 2019:2020 ~ "T1",
2022:2023 ~ "T2", default = "T3"),
across(age, ~ ifelse(participant == "0158" & province == "Dak Lak" &
collection == "42", "48", .x)),
age_unit = str_remove_all(age, "\\d"),
age2 = as.numeric(str_remove_all(age, "\\D")),
age3 = as.numeric(age), # note: this generate NAs
age4 = ifelse(is.na(age3), ifelse(age_unit %in% c("M", "th"), age2 / 12,
ifelse(age_unit == "ng", age2 / 365, age2)),
age3),
age = ifelse(age4 > 100, year - age4, age4),
across(age, ~ ifelse(.x %% 1 == 0 & .x > 0, .x + .5, .x)),
age_group = make_age_group(age),
across(conclusion, ~ ifelse(.x == "pós", "Pos", .x))) |>
select(-c(year, age_unit, age2, age3, age4, hospital)) |>
arrange(time_point, province, age_group, collection_date)A hash table to generate site ID from province names:
hash_id <- south |>
filter_out(time_point == "T1") |>
filter(age > 20) |>
select(province, site) |>
unique() |>
with(setNames(site, province))Completing the site ID when missing:
south <- mutate(south, across(site, ~ ifelse(is.na(.x), hash_id[province], .x)))Which gives:
south# A tibble: 5,573 × 13
participant province site collection sample_ID age_group age sex
<chr> <chr> <chr> <chr> <chr> <fct> <dbl> <chr>
1 0033 An Giang 119 42 AG420033 [9,10) 9.5 female
2 0060 An Giang 119 42 AG420060 [9,10) 9.5 female
3 0016 An Giang 119 42 AG420016 [14,15) 14.5 male
4 0036 An Giang 119 42 AG420036 [14,15) 14.5 female
5 0040 An Giang 119 42 AG420040 [14,15) 14.5 female
6 0022 An Giang 119 42 AG420022 [15,16) 15.5 female
7 0037 An Giang 119 42 AG420037 [15,16) 15.5 female
8 0042 An Giang 119 42 AG420042 [15,16) 15.5 male
9 0009 An Giang 119 42 AG420009 [16,17) 16.5 female
10 0013 An Giang 119 42 AG420013 [16,17) 16.5 female
# ℹ 5,563 more rows
# ℹ 5 more variables: collection_date <date>, plate <chr>, titer <dbl>,
# conclusion <chr>, time_point <chr>
4.2.2 North
Hash table for conclusion:
# Lookup table normalising the serology conclusion labels (Vietnamese and
# already-normalised spellings) to Neg / Pos / Equi.
hash_conclusion <- setNames(
  c("Neg", "Pos", "Equi", "Equi", "Pos", "Neg", "Pos"),
  c("Am tinh", "Duong tinh", "Nghi ngo", "Equi", "Pos", "Neg", "pós"))
# Hash table for the variable names of the northern part of the country:
# Vietnamese -> English variable names for the northern metadata files.
hash_translation <- c("Benh vien" = "hospital",
                      "Ten benh vien" = "province",
                      "Lab ID" = "Lab ID",
                      "Ma benh vien" = "site",
                      "Dot lay mau" = "collection",
                      "Ma benh pham" = "participant",
                      "Study ID" = "sample_ID",
                      "Ten viet tat" = "initials",
                      "Do tuoi" = "age",
                      "Gioi tinh" = "sex",
                      "Khoa" = "ward",
                      "Ngay nhan mau" = "reception_date",
                      "Ngay thu mau" = "collection_date",
                      "Note" = "Note",
                      "Ghi chu" = "Note2")A function that translates the names of the variable of a data frame from Vietnamese to English using the hash_translation hash table:
# Rename the columns of a northern-metadata data frame from Vietnamese to
# English through the hash_translation lookup table.
convert_var_names <- function(df) {
  setNames(df, hash_translation[names(df)])
}
# Putting the titers data into shape:
titers <- map(c("IgG Serion 2025 Measles 3021 mau.xls",
"IgG Serion 2025 Measles 663 mau.xls"),
~ read_excel(paste0(path2data, "north/titers/", .x))) |>
bind_rows() |>
latin_var_names() |>
select(-STT, -`ID mau`) |>
rename(titer = `Nong do khang the`,
conclusion = `Ket qua`) |>
mutate(across(conclusion, ~ hash_conclusion[convert(.x)])) |>
unique()Generating the data set for the northern part of the country:
north <- c("02FL Tracking sample round 1 to 6 Final.xlsx",
"02FL Tracking sample round 7 to 9.xlsx") |>
rep(2:1) |>
map2(c(1, 2, 1), ~ read_excel2(paste0(path2data, "north/metadata/", .x), .y)) |>
bind_rows() |>
latin_var_names() |>
convert_var_names() |>
mutate(across(province, convert),
across(province, ~ .x |>
str_replace("Viet Tiep" , "Hai Phong") |>
str_replace("Thanh Nhan", "Ha Noi")),
across(collection_date, convert_date),
across(reception_date, ~ as_date(ifelse(.x == "30/10/2020", ymd(20201030),
convert_date(.x)))),
sex = hash_sex[tolower(convert(sex))],
sex = ifelse(sample_ID == "108-09-0357", "female", sex), # correction
unit = trimws(convert(str_remove_all(age, "\\d"))),
age = as.numeric(str_remove_all(age, "\\D")),
age = ifelse(unit == "ngay", age / 365,
ifelse(unit == "thang", age / 12, age)),
age = ifelse(`Lab ID` %in% paste(17401:17405), age, age + .5), # age correct
age_group = make_age_group(age)) |>
left_join(titers, y = _, "Lab ID") |>
rename(lab_ID = `Lab ID`) |>
select(participant, province, site, collection, sample_ID, age_group, age, sex,
collection_date, reception_date, lab_ID, titer, conclusion) |>
mutate(time_point = ifelse(collection == "01", "T1", "T2"))Check:
map_int(north, ~ .x |> is.na() |> sum()) participant province site collection sample_ID
0 0 0 0 0
age_group age sex collection_date reception_date
2 2 10 2 1544
lab_ID titer conclusion time_point
0 0 0 0
north |>
group_by(collection) |>
summarize(from_recep = min2(reception_date),
to_recep = max2(reception_date),
n = n(),
NA_coll = sum(is.na(collection_date)))# A tibble: 2 × 5
collection from_recep to_recep n NA_coll
<chr> <date> <date> <int> <int>
1 01 2019-12-09 2020-10-29 1773 1
2 09 2022-08-09 2022-08-09 1909 1
Checking inconsistency between dates of collection and reception:
filter(north, reception_date < collection_date)# A tibble: 239 × 14
participant province site collection sample_ID age_group age sex
<chr> <chr> <chr> <chr> <chr> <fct> <dbl> <chr>
1 1184 Phu Tho 111 01 111-01-1184 [16,17) 16.5 female
2 0606 Hai Phong 027 09 027-09-0606 [2,3) 2.5 male
3 0616 Hai Phong 027 09 027-09-0616 [4,5) 4.5 male
4 0628 Hai Phong 027 09 027-09-0628 [6,7) 6.5 male
5 0631 Hai Phong 027 09 027-09-0631 [7,8) 7.5 male
6 0637 Hai Phong 027 09 027-09-0637 [8,9) 8.5 female
7 0638 Hai Phong 027 09 027-09-0638 [8,9) 8.5 male
8 0643 Hai Phong 027 09 027-09-0643 [9,10) 9.5 male
9 0644 Hai Phong 027 09 027-09-0644 [9,10) 9.5 male
10 0645 Hai Phong 027 09 027-09-0645 [9,10) 9.5 male
# ℹ 229 more rows
# ℹ 6 more variables: collection_date <date>, reception_date <date>,
# lab_ID <chr>, titer <dbl>, conclusion <chr>, time_point <chr>
4.2.3 All
dataset <- north |>
mutate(collection_date = as_date(ifelse(is.na(collection_date),
reception_date, collection_date))) |>
select(-reception_date, -lab_ID) |>
bind_rows(select(south, -plate)) |>
filter_out(is.na(age_group)) |>
mutate(region = ifelse(province %in% unique(north$province), "north", "south"))Checking the thresholds:
dataset |>
filter(conclusion == "Neg") |>
pull(titer) |>
max()[1] 149.9753
dataset |>
filter(conclusion == "Equi") |>
pull(titer) |>
range()[1] 150.3105 199.9807
dataset |>
filter(conclusion == "Pos") |>
pull(titer) |>
min()[1] 200.1092
Number of samples per site per time point:
(nb_samples <- dataset |>
group_by(province, time_point) |>
tally() |>
ungroup() |>
pivot_wider(names_from = time_point, values_from = n) |>
replace_na2())# A tibble: 20 × 4
province T1 T2 T3
<chr> <int> <int> <int>
1 An Giang 200 200 200
2 Binh Dinh 200 200 200
3 Dak Lak 200 200 200
4 Dong Thap 200 200 200
5 HCMC 200 186 187
6 Ha Giang 200 200 0
7 Ha Noi 40 200 0
8 Ha Tinh 200 200 0
9 Hai Phong 178 168 0
10 Hue 200 200 200
11 Khanh Hoa 200 200 200
12 Kien Giang 200 0 0
13 Lang Son 192 199 0
14 Lao Cai 200 200 0
15 Phu Tho 199 200 0
16 Quang Ngai 200 200 200
17 Soc Trang 200 200 200
18 Son La 200 200 0
19 Thai Binh 163 141 0
20 Thai Nguyen 200 200 0
5 Sampling locations
Loading and modifying the sf object of the provinces of Vietnam:
col_map <- c(brewer.pal(3, "Set1"), "lightyellow")
gadm1 <- readRDS(paste0(path2data, "gadm36_VNM_1_sf.rds"))
st_crs(gadm1) <- st_crs(gadm1)
gadm1 <- gadm1 |>
rename(province = VARNAME_1) |>
select(province) |>
mutate(across(province, ~ .x |>
str_replace("Thua Thien Hue", "Hue") |>
str_replace("Ho Chi Minh", "HCMC")),
centroid = geometry |> st_centroid() |> st_coordinates(),
latitude = centroid[, "Y"]) |>
left_join(nb_samples, "province") |>
replace_na2() |>
mutate(col = col_map[(T1 < 1) + (T2 < 1) + (T3 < 1) + 1]) |>
select(province, latitude, col)Plotting the provinces from which samples where collected:
opar <- par(plt = rep(0:1, 2))
plot_sf2(gadm1, "col")
par(opar)
6 Samples metadata
6.1 Sampling intensity
The number of samples per province and time point together with latitude and color:
nb_samples2 <- gadm1 |>
st_drop_geometry() |>
left_join(nb_samples, y = _, "province") |>
arrange(desc(latitude))The order of provinces according to latitude:
province_order <- nb_samples2 |>
mutate(order = row_number()) |>
select(province, order)Number of samples per day and site:
hzl <- 1:20
opar <- par(plt = c(.105, .97, .1, .9))
dataset |>
group_by(province, collection_date) |>
tally() |>
ungroup() |>
mutate(y = max(hzl) + 1 - as.numeric(factor(province, nb_samples2$province))) |>
left_join(nb_samples2, "province") |>
slice_sample(prop = 1) |>
with(plot3(collection_date, y, cex = sqrt(n), xpd = TRUE, col = col))
lb <- 2020:2024
axis(1, ymd(paste(lb, "0101")), lb)
abline3(h = hzl)
text2(rep(ymd(20210701), length(hzl)), rev(hzl), nb_samples2$province)
par(opar)
Number of samples:
nb_samples2 |>
select(-c(latitude, col)) |>
mutate(total = T1 + T2 + T3) |>
add_colsum() |>
print(n = Inf)# A tibble: 21 × 5
province T1 T2 T3 total
<chr> <int> <int> <int> <int>
1 Ha Giang 200 200 0 400
2 Lao Cai 200 200 0 400
3 Lang Son 192 199 0 391
4 Thai Nguyen 200 200 0 400
5 Phu Tho 199 200 0 399
6 Son La 200 200 0 400
7 Ha Noi 40 200 0 240
8 Hai Phong 178 168 0 346
9 Thai Binh 163 141 0 304
10 Ha Tinh 200 200 0 400
11 Hue 200 200 200 600
12 Quang Ngai 200 200 200 600
13 Binh Dinh 200 200 200 600
14 Dak Lak 200 200 200 600
15 Khanh Hoa 200 200 200 600
16 HCMC 200 186 187 573
17 Dong Thap 200 200 200 600
18 An Giang 200 200 200 600
19 Kien Giang 200 0 0 200
20 Soc Trang 200 200 200 600
21 <NA> 3772 3694 1787 9253
An alternative representation:
opar <- par(plt = c(.0825, .9125, .15, .95))
dataset |>
group_by(collection_date) |>
tally() |>
with(plot(collection_date, n, type = "h", col = "grey", xlab = NA,
ylab = "sampling intensity (nb samples/day)"))
par(new = TRUE)
plot2(NA, xlim = range(dataset$collection_date), ylim = c(0, 600), ann = FALSE)
axis(4)
mtext("cumulative number of samples", 4, 1.5)
dataset |>
left_join(select(nb_samples2, province, col), "province") |>
group_by(province, collection_date) |>
summarise(n = n(), col = unique(col), .groups = "drop_last") |>
group_modify(~ mutate(.x, cumn = cumsum(n))) |>
group_walk(~ with(.x, lines(collection_date, cumn, type = "s", col = col)))
par(opar)
6.2 Age distributions
A tuning of the barplot() function:
# Age-distribution bar plot with two red reference segments (heights n1
# and n2, split at bar index ad); draws an empty placeholder plot when
# all counts are zero.
# NOTE(review): the defaults mn = 0, ad = 20, mx = 31 are in bar-index
# units matching make_age_group()'s 31 bins — confirm if bins change.
barplot2 <- function(x, n1 = 5, n2 = 10, ad = 20, mn = 0, mx = 31, col = 4, ...) {
  if (sum(x$Freq) < 1) {
    plot4(1)
  } else {
    with(x, barplot(Freq, space = 0, col = col, ...))
    segments2(mn, n1, ad, n1)
    segments2(ad, n2, mx, n2)
  }
}A function that plots the age distributions of one site:
# One row of the age-distribution panel: the province label, then one
# barplot per time point (axes drawn on the first panel only).
barplot_row <- function(x, ylim) {
  plot4(1); text(1, 1, x$province[1])
  x |>
    group_by(time_point) |>
    group_split() |>
    walk2(c(TRUE, FALSE, FALSE), ~ barplot2(.x, ylim = ylim, axes = .y))
}Number of sample per age class, per province, per time point:
tmp <- dataset |>
with(table(age_group, province, time_point)) |>
as.data.frame() |>
left_join(province_order, "province") |>
arrange(order, time_point, age_group)
ylim <- c(0, max(tmp$Freq))
opar <- par2(mfrow = c(20, 4), plt = c(.12, .985, .05, 1))
tmp |>
group_by(order) |>
group_split() |>
walk(~ barplot_row(.x, ylim = ylim))
par(opar)
6.3 Gender distribution
A function that plots the sex distributions of one site:
# One row of the sex-distribution panel: the province label, then one pie
# chart per time point, coloured with the female/male constants.
#pie_row <- function(x, col = c(rgb2(247, 114, 184), rgb2(62, 146, 247))) {
pie_row <- function(x, col = c(col_female, col_male)) {
  plot4(1); text(1, 1, x$province[1], xpd = TRUE)
  x |>
    group_by(time_point) |>
    group_split() |>
    walk(~ with(.x, pie2(Freq, NA, col = col)))
}Number of sample per sex, per province, per time point:
tmp <- dataset |>
with(table(sex, province, time_point)) |>
as.data.frame() |>
left_join(province_order, "province") |>
arrange(order, time_point, sex)
opar <- par(mfrow = c(20, 4), plt = c(.12, .985, .05, 1))
tmp |>
group_by(order) |>
group_split() |>
walk(pie_row)
par(opar)
7 Titers
A function that plots a 3-color distribution from a 2-column data frame df that contains the coordinates of the distribution, with \(x\)-values in the first column named x and \(y\)-values in the second column, on a decimal log scale of the \(x\) values:
# Draw a 3-colour titer density from a 2-column (x, y) data frame on a
# log10 x scale: the curve is split at the two thresholds th1 (Neg/Equi)
# and th2 (Equi/Pos) and each segment is filled with its own colour
# (2 = red, 7 = yellow, 3 = green).
# NOTE(review): lb2 appears unused — confirm before removing it.
ptiters <- function(df, main = NULL, th1 = 150, th2 = 200, lb2 = 1:5, ...) {
  plot_titers_empty(df, ...)
  logaxis(1); axis(2)
  df |>
    group_by(cut(x, c(-Inf, log10(c(th1, th2)), Inf))) |>
    group_map(~ pad(.x)) |>
    walk2(c(2, 7, 3), ~ polygon(.x, col = .y, border = NA))
  if (! is.null(main)) title(main, line = -.25, cex.main = 1)
}A function that (1) generates the 2-column input data frame of the function ptiters() from the column named titer of the data frame df and then (2) feeds it to the ptiters() function to draw the 3-color distribution:
# Kernel density of the log10 titers of df, passed on to ptiters().
plot_titers <- function(df, main = NULL, ...) {
  df |>
    pull(titer) |>
    log10() |>
    density() |>
    extract(1:2) |>  # keep only the x and y components of the density
    as_tibble() |>
    ptiters(main, ...)
}Plotting the distribution of all the titers, across the 3 time points and the 10 sites:
plot_titers(dataset)
The corresponding proportions in each of the 3 categories of titer values are the following:
dataset |>
group_by(conclusion) |>
tally2()# A tibble: 3 × 3
conclusion n prop
<chr> <int> <dbl>
1 Equi 453 4.90
2 Neg 1290 13.9
3 Pos 7510 81.2
The distributions of all the titers across the 10 sites for each time point separately:
timepoints <- c(paste("Dec", c(2019, 2022)), "Apr 2024")
opar <- par2(mfrow = c(3, 1), plt = c(.13, .97, .18, .95))
dataset |>
group_by(time_point) |>
group_split() |>
walk2(timepoints, ~ plot_titers(.x, .y, xlim = c(.5, 4.5), ylim = c(0, 1.15)))
par(opar)
The corresponding proportions in each of the 3 categories of titer values and for each time point are the following:
dataset |>
group_by(time_point, conclusion) |>
tally2() |>
pivot_wider(id_cols = -n, names_from = time_point, values_from = prop)# A tibble: 3 × 4
conclusion T1 T2 T3
<chr> <dbl> <dbl> <dbl>
1 Equi 4.83 4.47 5.93
2 Neg 11.4 15.5 16.2
3 Pos 83.8 80.1 77.8
7.1 Age profile
A function that adds the thresholds values to a plot:
# Dashed horizontal lines at the serology thresholds, on a log10 y scale.
add_thresholds <- function(ths = c(150, 200), col = 2:3) {
  abline(h = log10(ths), col = col, lty = 2, lwd = 2)
}A function that plots an image of the distribution of titers (along the \(y\) axis) for all the age classes (on the \(x\) axis):
# Heat-map of the titer distribution (y axis) per age class (x axis): one
# kernel density per age group, converted into HDI categories and drawn
# with image(). Ages of 70 and above are excluded.
image_titer_density_age <- function(data, xaxis = TRUE, xlab = age_lab, n = 1024,
                                    min_titer = NULL, max_titer = NULL, buff = TRUE,
                                    breaks = seq(.95, .05, -.05)) {
  # Default titer range from the data, optionally buffered by 4% each side.
  if (is.null(min_titer)) min_titer <- min(data$titer)
  if (is.null(max_titer)) max_titer <- max(data$titer)
  if (buff) {
    bf <- .04 * (max_titer - min_titer)
    min_titer <- min_titer - bf
    max_titer <- max_titer + bf
  }
  data |>
    filter(age < 70) |>
    mutate(across(age_group, ~ fct_recode(.x, NULL = "[70,100)"))) |>
    group_by(age_group, .drop = FALSE) |>
    group_map(~ density2(.x$titer, min_titer, max_titer, n)) |>
    # Repeat each of the 10 five-year classes 5 times so the x axis is in
    # years: 20 yearly classes + 10 x 5 = 70 columns.
    rep(c(rep(1, 20), rep(5, 10))) |> # age classes
    map(quantval2quantcat, breaks) |> # this computes the categorical HDI
    as.data.frame() |>
    as.matrix() |>
    t() |>
    image(1:70 - .5, seq(min_titer, max_titer, le = n), z = _, axes = FALSE,
          xlab = xlab, ylab = titer_lab,
          col = yellow_red(length(breaks) + 1))
  if (xaxis) axis(1)
  logaxis(2)
  add_thresholds()
  box(bty = "o")
}A function that adds an axis of the ages on the top of a figure:
# Draw a secondary age axis along the top margin of the current plot.
add_years_on_top <- function(at = 10 * 0:7, line = 1.7) {
  axis(3, at)
  mtext("age (years)", line = line)
}
# A function that plots both the raw titer data and the modelled distribution per age class:
# Two stacked panels sharing the x (age) axis: raw log10 titers as a
# scatter plot on top, the modelled per-age-class density image below.
plot_titers_age_profile <- function(df, ymin = .9, ymax = 4.3) {
  # Top half of the device for the scatter plot.
  opar <- par(plt = c(.105, .97, .5, 1 - (1 - .95) / 2 - .15 / 2))
  df |>
    filter_out(age_group == "[70,100)") |>
    with(plot(age, log10(titer), col = adjustcolor(4, .2), axes = FALSE, xaxs = "i",
              xlim = c(0, 70), ylim = c(ymin, ymax), xpd = TRUE,
              xlab = NA, ylab = titer_lab))
  logaxis(2); box(bty = "o")
  add_years_on_top()
  add_thresholds()
  # Bottom half for the density image, drawn on the same device.
  par(plt = c(.105, .97, .15 / 2 + (1 - .95) / 2, .5), new = TRUE)
  df |>
    mutate(across(titer, log10)) |>
    image_titer_density_age(min_titer = ymin, max_titer = ymax)
  par(opar)
}Let’s plot the distributions of the titers as a function of age for the whole data set:
plot_titers_age_profile(dataset)
A function that plots the density with a color code according to the high density interval (HDI):
# Plot the titer density with each highest-density-interval band filled
# in its own colour of the yellow-to-red scale.
plot_density_HDI <- function(df, breaks = seq(.95, .05, -.05),
                             n = 1024, from = .9, to = 4.3) {
  # Density coordinates, each point labelled with its HDI category (cat)
  # and with a run index (subcat) so each contiguous run of the same
  # category is drawn as its own polygon.
  df <- df |>
    with(as_tibble(density(titer, from = from, to = to, n = n)[c("x", "y")])) |>
    mutate(cat = as.numeric(cut(y, c(0, map_dbl(breaks, ci2th, v = y), max(y)))),
           subcat = cumsum(c(0, diff(cat)) != 0))
  # Fill every run of category ct with colour cl.
  add_polygons <- function(ct, cl) {
    df |>
      filter(cat == ct) |>
      group_by(subcat) |>
      group_walk(~ polygon2(.x$x, .x$y, col = cl))
  }
  with(df, plot_titers_empty(x, y))
  logaxis(1); axis(2)
  walk2(seq_along(c(0, breaks)), yellow_red(length(breaks) + 1),
        ~ add_polygons(.x, .y))
}Let’s look at the density of titers for the 10-11 year-old for example:
dataset |>
filter(age_group == "[10,11)") |>
mutate(across(titer, log10)) |>
plot_density_HDI()
The scale:
image(t(matrix(1:20)), col = yellow_red(20), axes = FALSE)
7.2 Time points
The age profiles of titer density for the north and the south and the 3 time points:
dataset |>
mutate(across(titer, log10)) |>
group_by(region, time_point) |>
group_walk(~ image_titer_density_age(.x, min_titer = .9, max_titer = 3.4))
7.3 Titers per age class and time point
A function that plots the decline of maternal antibody titers:
# Decline of maternal antibody titers in infants ([0,1) age group): GAM
# of log10 titer on age in months, drawn with its confidence band and the
# raw points coloured by sex.
plot_maternal_immunity <- function(df, remove_age0 = TRUE, k = -1, n = 512, ci = .95,
                                   xlim = c(0, 12), ylim = c(.3, 3.4)) {
  tmp <- df |>
    filter(age_group == "[0,1)") |>
    mutate(across(titer, log10),
           across(age, ~ 12 * .x), # years -> months
           across(sex, ~ as.numeric(as.factor(.x))))
  if (remove_age0) tmp <- filter_out(tmp, age == 0)
  # (A tally of the sexes was computed here but its result was silently
  # discarded; the dead pipeline has been removed.)
  preds <- tmp |>
    gam(titer ~ s(age, k = k), data = _) |>
    predict4(list(age = seq(0, 12, le = n)), se = ci)
  plot(1, type = "n", xlim = xlim, ylim = ylim, yaxt = "n",
       xlab = "age (months)", ylab = titer_lab)
  with(preds, {
    polygon3(age, ll, ul, col = "lightgrey")
    lines(age, fit)
  })
  with(tmp, points(age, titer, col = c(col_female, col_male)[sex], pch = 19))
  abline(h = log10(c(150, 200)), col = 2:3, lty = 2) # serology thresholds
  abline(v = 9, lty = 2) # NOTE(review): 9 months — presumably the vaccination age; confirm
  logaxis(2, 0:5)
}
dataset |>
filter(region == "south", time_point == "T1") |>
plot_maternal_immunity(k = 8)
dataset |>
filter(region == "south", time_point == "T2") |>
plot_maternal_immunity()
dataset |>
filter(region == "south", time_point == "T3") |>
plot_maternal_immunity()
# Age profile of log10 titers fitted with a GAM and drawn with its
# confidence ribbon plus the serology threshold lines.
plot_titers_age <- function(df, k = -1, col = "lightgrey", ylim = c(2, 3.5), n = 512) {
  df |>
    mutate(across(age, ~ ifelse(.x == 0, .5, .x)), # nudge exact-zero ages
           across(titer, log10)) |>
    gam(titer ~ s(age, k = k), data = _) |>
    predict4(list(age = seq(0, 70, le = n))) |>
    with({
      plot(age, fit, type = "n", ylim = ylim, yaxt = "n",
           xlab = age_lab, ylab = titer_lab)
      polygon3(age, ll, ul, col = col)
      lines(age, fit)
    })
  abline(h = log10(c(150, 200)), col = 2:3, lty = 2)
  logaxis(2, 0:5)
}dataset |>
filter(region == "north", time_point == "T1") |>
plot_titers_age(k = 50)
dataset |>
filter(region == "north", time_point == "T2") |>
plot_titers_age(k = 50)
dataset |>
filter(region == "south", time_point == "T1") |>
plot_titers_age(k = 50)
dataset |>
filter(region == "south", time_point == "T2") |>
plot_titers_age(k = 50)
dataset |>
filter(region == "south", time_point == "T3") |>
plot_titers_age(k = 50)
7.4 Provinces (T1)
A function that compares all pairs of provinces in the data frame df:
# For every pair of provinces in df, fit a common age smooth (m0) versus
# province-specific smooths (m1); the AIC difference between the two
# models measures how different the two provinces' age profiles are.
compare_provinces <- function(df) {
  compare_provinces_ <- function(p1, p2) {
    df2 <- df |>
      filter(province %in% c(p1, p2)) |>
      mutate(across(province, as.factor))
    list(m0 = gam2(titer ~ s(age), data = df2),
         m1 = gam2(titer ~ province + s(age, by = province), data = df2))
  }
  prov_names <- unique(df$province)
  nb_prov <- length(prov_names)
  # Fix: seq_len() instead of 1:(nb_prov - 1) — the latter yields c(1, 0)
  # when there is a single province, running the loop with invalid
  # indices; seq_len(0) correctly produces an empty result.
  i_vals <- seq_len(nb_prov - 1)
  out <- vector("list", sum(i_vals))
  k <- 1
  for (i in i_vals) {
    for (j in (i + 1):nb_prov) {
      out[[k]] <- compare_provinces_(prov_names[i], prov_names[j])
      names(out)[k] <- paste(prov_names[i], prov_names[j], sep = " - ")
      k <- k + 1
    }
  }
  out
}
# A function that agglomerates provinces according to their titer age profile:
# Recursively merge the pair of provinces with the most similar titer age
# profiles (smallest AIC difference between the common-smooth and the
# province-specific models), stopping when every remaining difference is
# at least 10 AIC units or a single group is left. Merged provinces are
# renamed "A | B".
agglomerate <- function(df) {
  Delta_AICs <- df |>
    compare_provinces() |>
    map_dbl(~ abs(diff(AIC(.x$m0, .x$m1)[, 2])))
  ind <- which.min(Delta_AICs)
  if (Delta_AICs[ind] < 10) {
    ind_name <- names(ind)
    # Split into (other provinces) / (the pair to merge), relabel the pair.
    df <- df |>
      group_by(province %in% str_split2(ind_name, " - ")) |>
      group_split(.keep = FALSE)
    df[[2]]$province <- str_replace(ind_name, " - ", " | ")
    df <- bind_rows(df)
    if (length(unique(df$province)) < 2) return(df)
    return(agglomerate(df))
  }
  return(df)
}Let’s run it on the first time point (takes about 3’00”):
datasetT1 <- dataset |>
filter(time_point == "T1") |>
mutate(across(titer, log10)) |>
agglomerate()Let’s plot the age profiles for the identified clusters of provinces:
datasetT1 |>
mutate(across(titer, log10)) |>
group_by(province) |>
group_walk(~ image_titer_density_age(.x, min_titer = .9, max_titer = 4.3, n = 2048))




data <- datasetT1 |>
mutate(across(titer, log10)) |>
filter(province == "Thai Binh | Lao Cai | Ha Noi | Ha Tinh") |>
image_titer_density_age(min_titer = .9, max_titer = 4.3, n = 2048)
The list of provinces for each of the plots above:
unique(datasetT1$province)[1] "Khanh Hoa"
[2] "Thai Binh | Lao Cai | Ha Noi | Ha Tinh"
[3] "Hai Phong | Lang Son | Thai Nguyen | Son La | Phu Tho | Ha Giang | Binh Dinh | Quang Ngai"
[4] "Dong Thap | An Giang | Hue | HCMC | Soc Trang"
[5] "Dak Lak | Kien Giang"
8 Sero-prevalence
This function takes a data frame (that contains at least 2 columns named age_group and titer) and estimates the sero-prevalence with confidence interval per age_group, where positivity is defined by a titer higher than th:
# Per age group: count positives (titer > th) and totals, then estimate
# the sero-prevalence p with its confidence interval (l, u) via an exact
# binomial test (NAs when the group is empty, thanks to binom_test2()).
prepare_data <- function(df, th) {
  df |>
    with(table(age_group, titer > th)) |>
    as.data.frame() |>
    pivot_wider(names_from = Var2, values_from = Freq) |>
    mutate(tmp = map2(`TRUE`, `TRUE` + `FALSE`, binom_test2) |>
             map(extract, c("estimate", "conf.int")) |>
             map(unlist) |>
             map(setNames, c("p", "l", "u"))) |>
    unnest_wider(tmp)
}A tuning of the plot() function:
# Empty sero-prevalence plot frame with tight (internal) axis styles.
plotn <- function(...) {
  plot(..., type = "n", xaxs = "i", yaxs = "i", ylab = "sero-prevalence")
}This function plots the output of prepare_data():
# Factory returning a plotting function for prepare_data()'s output:
# point estimates with error bars per age class, over a light grid.
# xs gives the x position (mid-point) of each age class.
make_p_age_profile <- function(xs = c(.5:19.5, seq(22.5, 67.5, 5), 75)) {
  function(x, main = NULL) {
    plotn(xs, rep(.5, length(xs)), ylim = 0:1, xlim = c(0, 80),
          xlab = "age classe (years)")
    abline2(v = seq(0, 70, 5))
    abline2(h = seq(0, 1, .1))
    with(x, {
      arrows2(xs, l, xs, u, .02, col = 4)
      points2(xs, p, col = 4, xpd = TRUE)
    })
    box(bty = "o")
    if (! is.null(main)) title(main, line = -9, cex.main = 1)
  }
}
p_age_profile <- make_p_age_profile()A wrapper around the two previous functions:
# Estimate (prepare_data) then plot (p_age_profile) the sero-prevalence
# age profile, positivity being defined by titer > th.
plot_age_profile <- function(df, th, main = NULL) {
  df |>
    prepare_data(th) |>
    p_age_profile(main)
}Sero-prevalence profile across the 3 time points and the 10 sites:
plot_age_profile(south, 200)
Sero-prevalence profiles across the 10 sites for each of the time points:
opar <- par2(mfrow = c(3, 1), plt = c(.13, .97, .18, .95))
south |>
group_by(time_point) |>
group_split() |>
walk2(timepoints, ~ plot_age_profile(.x, 200, .y))
par(opar)The following function returns a list of data frames of 3 columns too where the number of slots of the list and the number of rows of the data frames are switched.
# Transpose a list of same-shaped data frames: slot i of the result binds
# row i of every input data frame, so the list length and the data frame
# row counts are swapped.
time_change <- function(x) {
  # seq_len() guards against the 1:0 sequence produced by 1:nrow() when
  # the data frames are empty.
  map(seq_len(nrow(first(x))), function(y) map_dfr(x, ~ .x[y, ]))
}
# This function takes a 3-column (proportion estimate and confidence interval bounds) / 3-row (for 3 time points) data frame that corresponds to a given age group and plots the change in sero-prevalence as a function of time.
# Plot the change in sero-prevalence across time points for one age group.
# x has columns p (estimate), l and u (CI bounds); xs gives the x position
# of each time point.
time_trend <- function(x, xs = 1:3, col = 4) {
  plot4(xs, x$p, xlim = c(0, 4), ylim = 0:1)
  abline2(h = seq(.2, .8, .2))
  arrows2(xs, x$l, xs, x$u, .05, col = col)
  points2(xs, x$p, col = col)
  box(bty = "o")
}Showing the change in seroprevalence as a function of time for all the age groups separately:
# One small panel per age group (3 x 10 grid), dropping the open-ended last
# age group; each panel shows the sero-prevalence trend over time points.
epsl <- .065
opar <- par(mfrow = c(3, 10), plt = rep(c(epsl, 1 - epsl), 2))
south |>
filter_out(age_group == last(levels(age_group))) |>
group_by(time_point) |>
group_split() |>
map(prepare_data, 200) |>
map(~ .x[-nrow(.x), ]) |>
time_change() |>
walk(time_trend)
par(opar)8.1 GAM
8.1.1 Functions
A tuning of the predict2() function:
TO DO: rewrite it as a function of predict4()
# Predictions of a GAM model on a regular grid of ages, with a ci-level
# confidence interval (computed by predict2()). Returns a data frame with
# the grid (x) bound to the prediction columns.
# Spelled out length.out (was the partially-matched "le") for clarity.
predict3 <- function(model, x = seq(0, 70, length.out = 512), ci = .95) {
  bind_cols(tibble(x = x),
            model |>
              predict2(list(age = x), se.fit = ci) |>
              as_tibble())
}A function that adds prediction and confidence interval to a plot:
# Add a model prediction to the current plot: a shaded confidence ribbon
# (ll to ul) topped by the fitted line. pred is a data frame with columns
# x, fit, ll and ul, as returned by predict3().
add_ribbon <- function(pred, col = 4) {
  polygon(c(pred$x, rev(pred$x)), c(pred$ll, rev(pred$ul)),
          border = NA, col = adjustcolor(col, .2))
  lines(pred$x, pred$fit, col = col, lwd = 2)
}where pred is an output of the predict3() function and a 4-column data frame with names x, fit, ll and ul. This function plots the predictions of a GAM model:
# Draw the predictions of a GAM model: empty canvas, vertical age guide
# lines, then the prediction ribbon from add_ribbon(). pred is a data frame
# with columns x, fit, ll and ul.
plot_pred <- function(pred, ylim = NULL, col = 4, ...) {
  if (is.null(ylim)) {
    ylim <- 0:1
  }
  plotn(pred$x, pred$fit, ylim = ylim, xlab = age_lab, ...)
  abline2(v = c(.75, 1.5, seq(5, 65, 10)))
  abline3(v = seq(10, 60, 10))
  add_ribbon(pred, col)
  box(bty = "o")
}where pred is an output of the predict4() function. A wrapper around the predict3(), plot_pred() and add_ribbon() functions:
# Sero-prevalence ~ age GAM fits at two titer thresholds on one panel: the
# th1 fit is drawn first with plot_pred() (col 3), then the th2 fit is
# overlaid as a ribbon (col 2), with horizontal guide lines on top.
plot_seroprevalences <- function(data, bs = "tp", k = 50, th1 = 200, th2 = 150) {
  pred1 <- predict3(gam3((titer > th1) ~ s(age, bs = bs, k = k), data))
  plot_pred(pred1, col = 3)
  pred2 <- predict3(gam3((titer > th2) ~ s(age, bs = bs, k = k), data))
  add_ribbon(pred2, 2)
  abline3(h = 1:9 / 10)
}8.1.2 Analysis
A function that runs 10 different smoothing bases on a GAM model and compares them:
# Fit the same sero-positivity GAM under 10 different smoothing bases and
# collect, for each fit, the REML score, the deviance explained, the AIC
# and the basis-dimension diagnostics from mgcv::k.check() (n.rep = n).
compare_bs <- function(data, th = 200, k = 50, n = 1000) {
  bases <- tibble(
    smoothing_basis = c("thin plate", "thin plate", "Duchon", "cubic regression",
                        "cubic regression", "cyclic cubic", "B-spline", "P-spline",
                        "cyclic P-spline", "Gaussian process"),
    code = c("tp", "ts", "ds", "cr", "cs", "cc", "bs", "ps", "cp", "gp"))
  bases |>
    mutate(model = map(code, ~ gam3((titer > th) ~ s(age, bs = .x, k = k), data)),
           REML = map_dbl(model, get_reml),
           per_dev = map_dbl(model, dev_explained),
           AIC = map_dbl(model, AIC),
           k_check = map(model, ~ as_tibble(k.check(.x, n.rep = n)))) |>
    unnest_wider(k_check)
}Let’s try 10 smoothing bases to see whether one stands out as much better that the others:
# Fit all 10 candidate smoothing bases on the southern data:
models <- compare_bs(south)which gives:
select(models, -model)# A tibble: 10 × 9
smoothing_basis code REML per_dev AIC `k'` edf `k-index` `p-value`
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 thin plate tp 2595. 0.109 5152. 49 16.2 0.971 0.339
2 thin plate ts 2600. 0.109 5153. 49 15.6 0.973 0.359
3 Duchon ds 2594. 0.110 5151. 49 16.6 0.981 0.694
4 cubic regression cr 2593. 0.109 5151. 49 15.9 0.959 0.127
5 cubic regression cs 2598. 0.109 5152. 49 15.3 0.982 0.569
6 cyclic cubic cc 2599. 0.108 5161. 48 15.3 0.995 0.926
7 B-spline bs 2593. 0.109 5153. 49 15.9 0.978 0.506
8 P-spline ps 2593. 0.109 5154. 49 15.3 0.976 0.487
9 cyclic P-spline cp 2599. 0.108 5159. 49 15.4 0.975 0.397
10 Gaussian process gp 2598. 0.109 5152. 49 16.2 0.987 0.726
Let’s then stick to the thin plate smoothing. Plotting smoothed trends:
# Predictions of the thin-plate ("tp") model with horizontal guide lines:
models |>
filter(code == "tp") |>
pull(model) |>
first() |>
predict3() |>
plot_pred()
abline3(h = 1:9 / 10)
With the two thresholds together:
# Both thresholds (default 200 and 150 IU/L) on the same panel:
plot_seroprevalences(south)
And together with the distributions of titers as a function of age:
# Composite figure: titer density as a function of age (top half) above the
# sero-prevalence GAM fits (bottom half), sharing the same x axis.
opar <- par(plt = c(.105, .97, .5, 1 - (1 - .95) / 2 - .15 / 2))
south |>
mutate(across(titer, log10)) |>
image_titer_density_age(xaxis = FALSE, xlab = NA)
add_years_on_top()
par(plt = c(.105, .97, .15 / 2 + (1 - .95) / 2, .5), new = TRUE)
plot_seroprevalences(south)
par(opar)
9 TO DO
- run the gam2 model only on the range of age where we have (enough) data
# subset_vector: keep the elements of x strictly above threshold th.
# smoother: fit three GAMs of the estimate (p) and CI bounds (l, u) on age,
# returned as a list named fit / ll / ul. Uses gam2() with betar —
# presumably mgcv's beta-regression family; TODO confirm gam2()'s contract.
subset_vector <- function(x, th) x[x > th]smoother <- function(df, bs = "tp", k = -1) {
c("p", "l", "u") |>
setNames(c("fit", "ll", "ul")) |>
map(~ gam2(reformulate(paste0('s(age, bs="', bs, '", k=', k, ")"), .x), betar, df))
}gam_predictions <- function(data, th = 200, n = 512, bs = "tp", k = -1) {
# Parse the numeric bounds out of the "[l,u)" age-group labels.
data <- data |>
mutate(tmp = age_group |> str_remove("\\[") |> str_remove("\\)")) |>
separate(tmp, c("l", "u")) |>
mutate(across(c(l, u), as.integer))
# Regular grid of n ages spanning the observed age range.
ndata <- data |>
select(l, u) |>
range() |>
c(n) |>
setNames(c("from", "to", "length")) |>
as.list() |>
do.call(seq, args = _)
# Per-age-group prevalence estimates (prepare_data), smoothed on age with
# smoother(), then predicted on the grid; returns columns x, fit, ll, ul.
data |>
group_by(age_group) |>
summarise(age = mean(age)) |>
left_join(na.exclude(prepare_data(data, th)), "age_group") |>
smoother(bs, k) |>
map(predict2, newdata = list(age = ndata)) |>
as_tibble() |>
bind_cols(tibble(x = ndata)) |>
relocate(x, .before = 1)
}gam_predictions(south)# A tibble: 512 × 4
x fit ll ul
<dbl> <dbl[1d]> <dbl[1d]> <dbl[1d]>
1 0 0.707 0.609 0.792
2 0.196 0.707 0.609 0.792
3 0.391 0.707 0.609 0.791
4 0.587 0.706 0.609 0.791
5 0.783 0.706 0.610 0.790
6 0.978 0.706 0.610 0.790
7 1.17 0.706 0.610 0.789
8 1.37 0.705 0.610 0.789
9 1.57 0.705 0.610 0.788
10 1.76 0.705 0.611 0.788
# ℹ 502 more rows
# Example: GAM predictions for An Giang at T1, restricted to ages 14-70:
south |>
filter(province == "An Giang", time_point == "T1") |>
filter_out(age < 14 | age > 70) |>
gam_predictions() |>
plot_pred()Warning in family$saturated.ll(y, prior.weights, theta): saturated likelihood
may be inaccurate
Warning in family$saturated.ll(y, prior.weights, theta): saturated likelihood
may be inaccurate

9.1
- Model probability of seropositivity between the two thresholds
# Protection probability from a titer: 0 below 150, 1 above 200, and linear
# in between (the uncertain zone between the two thresholds). Then the mean
# protection across the southern samples.
titer2protection <- function(x) {
ifelse(x < 150, 0, ifelse(x > 200, 1, (x - 150) / 50))
}south |>
mutate(titer2 = titer2protection(titer)) |>
pull(titer2) |>
mean()[1] 0.8152297
?dbetalibrary(MASS)
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
# Exploratory: the body of prepare_data() inlined with th = 150.
# NOTE(review): duplicated from prepare_data(); consider calling
# prepare_data(south, 150) instead of keeping this copy in sync.
th <- 150
#prepare_data <- function(df, th) {
south |>
with(table(age_group, titer > th)) |>
as.data.frame() |>
pivot_wider(names_from = Var2, values_from = Freq) |>
mutate(tmp = map2(`TRUE`, `TRUE` + `FALSE`, binom_test2) |>
map(extract, c("estimate", "conf.int")) |>
map(unlist) |>
map(setNames, c("p", "l", "u"))) |>
unnest_wider(tmp)# A tibble: 31 × 6
age_group `FALSE` `TRUE` p l u
<fct> <int> <int> <dbl> <dbl> <dbl>
1 [0,1) 61 46 0.430 0.335 0.529
2 [1,2) 36 87 0.707 0.619 0.786
3 [2,3) 19 108 0.850 0.776 0.907
4 [3,4) 15 107 0.877 0.805 0.930
5 [4,5) 15 120 0.889 0.823 0.936
6 [5,6) 26 105 0.802 0.723 0.866
7 [6,7) 28 101 0.783 0.702 0.851
8 [7,8) 39 91 0.7 0.613 0.777
9 [8,9) 27 99 0.786 0.704 0.854
10 [9,10) 39 92 0.702 0.616 0.779
# ℹ 21 more rows
#}#south |>
# filter()model density() in 2D in order to see density of data points not only as a function of age but also as a function of titers
doing clustering on the ANCOVA
# Plot sero-prevalence ~ age GAM fits at time points T1 (col 2) and T2
# (col 3) on the same panel; if title is TRUE, print the province name
# inside the plot (at x = 35, y = .2).
seroprevT1T2 <- function(x, th = 200, bs = "tp", k = 50, title = FALSE) {
x |>
filter(time_point == "T1") |>
gam3((titer > th) ~ s(age, bs = bs, k = k), data = _) |>
predict3() |>
plot_pred(col = 2)
x |>
filter(time_point == "T2") |>
gam3((titer > th) ~ s(age, bs = bs, k = k), data = _) |>
predict3() |>
add_ribbon(3)
if (title) x |>
pull(province) |>
first() |>
text(35, .2, labels = _)
}seroprev3timepoints <- function(x, th = 200, bs = "tp", k = 50, title = FALSE) {
# Same as seroprevT1T2(), with the T3 fit (col 4) overlaid on top.
seroprevT1T2(x, th, bs, k, title)
x |>
filter(time_point == "T3") |>
gam3((titer > th) ~ s(age, bs = bs, k = k), data = _) |>
predict3() |>
add_ribbon(4)
}seroprev3timepoints(south)
seroprevT1T2(south)
# One panel per province (3 x 3 grid, Kien Giang excluded): GAM fits at the
# three time points.
opar <- par(mfrow = c(3,3), plt = c(.12, .97, .18, .95))
south |>
filter_out(province == "Kien Giang") |>
group_by(province) |>
group_split() |>
walk(seroprev3timepoints, title = TRUE)
par(opar)
# Same layout, but raw age profiles (prepare_data estimates) per province:
opar <- par(mfrow = c(3,3), plt = c(.12, .97, .18, .95))
south |>
filter_out(province == "Kien Giang") |>
group_by(province) |>
group_split() |>
walk(plot_age_profile, 200)
par(opar)
understanding the age profile of titers and seroprevalence, possibly as an effect of vaccination and the waning of vaccine-induced immunity
Looking at the 0-1 and 0-5 age ranges in order to see whether a more detailed signal appears as a function of age
test differences between sexes