Data cleaning

Parameters

The path to the data folder:

# Root folder of the project data. The trailing slash is part of the value:
# sub-folder names are appended to it directly with paste0().
data_path <- paste(
  c("/Users/MarcChoisy/Library/CloudStorage/",
    "OneDrive-OxfordUniversityClinicalResearchUnit/",
    "GitHub/choisy/typhoid/"),
  collapse = ""
)

Packages

Required packages:

# Packages this script depends on. tidyr is listed because the script calls
# tidyr::replace_na() explicitly, so it must be covered by the install check.
required_packages <- c("readxl", "dplyr", "naniar", "tidyr")

Making sure that the required packages are installed:

# Install whichever required packages are not available yet.
# system.file(package = ) returns "" for a package that cannot be found; it is
# used instead of installed.packages(), which scans every installed package and
# is not meant for checking whether a named package is present.
to_inst <- required_packages[
  !vapply(required_packages,
          \(pkg) nzchar(system.file(package = pkg)),
          logical(1))
]
if (length(to_inst) > 0) install.packages(to_inst)

Loading some of these packages:

library(readxl)
library(dplyr)

Utility function

A variant of factor() that creates ordered factors by default:

# Wrapper around factor() that builds an ordered factor by default.
#   ...     : arguments forwarded to factor() (x, levels, labels, ...).
#   ordered : whether the result is an ordered factor; defaults to TRUE.
# `ordered` is a proper argument so that callers can still pass
# `ordered = FALSE` without triggering a "matched by multiple actual
# arguments" error.
factor2 <- function(..., ordered = TRUE) factor(..., ordered = ordered)

Reading and cleaning data

The files in the raw data folder:

# Full paths of every file found in the raw data folder.
files <- list.files(path = paste0(data_path, "raw_data"), full.names = TRUE)

The names of the variables:

# Common column names given to the clinical data sets. They are assigned by
# position with setNames(), so their order must match the column order of the
# select() calls that precede it.
var_names <- c(
  "culture", "sex", "age",
  "fever", "cough", "diarrhea", "vomiting", "abdominal_pain", "constipation",
  "headache",
  "pulse", "temperature", "splenomegaly", "hepatomegaly",
  "WBC", "platelets", "ALT", "IgM", "CRP"
)

CRP levels:

# CRP categories, listed from lowest to highest; used as the levels of the
# ordered CRP factor.
crp_levels <- c(
  "<10",
  "10-40",
  "40-80",
  ">80"
)

Reading and cleaning raw data from Nepal:

# Clinical data from Nepal: read the raw Excel file, keep the variables of
# interest plus five score columns, rename them and recode each variable.
nepal <- grep("nepa", files, value = TRUE) |>
  read_excel() |>
  select(
    BloodCSResult, Sex, Age,
    Fever, Cough, Diarrhoea, `Vomiting...26`, Abdopain, Constipation, Headache,
    Pulse, OralTemperature, Splenomegaly, Hepatomegaly,
    WBC_gro, Platelets_gro, ALT, Typhoid_IgM, `CRP_mg/L`,
    scorev1, scorev2, scorev3, scorev4, Score8
  ) |>
  # names are assigned by position: var_names, then score1..score4 and score8
  setNames(c(var_names, paste0("score", c(1:4, 8)))) |>
  mutate(
    # culture is TRUE when the result is "SPA" or "ST"
    culture = culture == "SPA" | culture == "ST",
    # keep only the first character of IgM, with "N" recoded as "0"
    IgM = factor2(sub("N", "0", substring(IgM, 1, 1))),
    # drop the parenthesised part (and anything after it) before matching the
    # value against the CRP categories
    CRP = factor2(sub(" *\\(.*\\).*$", "", CRP), levels = crp_levels),
    across(
      c(age, fever, cough, diarrhea, vomiting, abdominal_pain, constipation,
        headache, pulse, platelets, ALT, starts_with("score")),
      as.integer
    ),
    # "1" and "Male" map to "male"; any other non-missing value to "female"
    sex = factor(c("female", "male")[(sex == "1" | sex == "Male") + 1]),
    across(
      c(splenomegaly, hepatomegaly),
      \(x) x == "1" | x == "TRUE" | x == "Yes"
    )
  )

Reading and cleaning raw data from Cambodia and Bangladesh:

# Clinical data from Cambodia and Bangladesh (single raw file): keep the
# variables of interest plus the site indicator, rename them and recode them.
cambodia_bangladesh <- grep("camb_", files, value = TRUE) |>
  read_excel() |>
  select(
    culture, sex, Age, feverdays, cough, diarrhoea, vomiting, abdopain,
    constipation, headache, pulse, `temp A`, spleen, hepat, wbc, plts, alt,
    `IgM life assay d0`, `CRP Group...63`, st
  ) |>
  # names are assigned by position: var_names, then "country"
  setNames(c(var_names, "country")) |>
  mutate(
    # every column that is not handled separately below becomes logical
    across(
      c(culture, cough, diarrhea, vomiting, abdominal_pain, constipation,
        headache, splenomegaly, hepatomegaly),
      as.logical
    ),
    IgM = factor2(IgM),
    # CRP is used as a 0-based index into crp_levels
    # NOTE(review): assumes the raw coding is 0..3 - confirm against the data
    CRP = factor2(crp_levels[CRP + 1], levels = crp_levels),
    across(c(fever, pulse, ALT), as.integer),
    # NOTE(review): assumes sex is coded 0 = female, 1 = male - confirm
    sex = factor(c("female", "male")[sex + 1]),
    # NOTE(review): assumes st is coded 1 = Cambodia, 2 = Bangladesh - confirm
    country = factor(c("Cambodia", "Bangladesh")[country])
  )

Reading and cleaning the blood volume data

The data from Bangladesh:

# Blood volume data from Bangladesh, reduced to age, weight, volume and
# culture result.
bv_bangladesh <- grep("bangl", files, value = TRUE) |>
  read_excel() |>
  # "nr" entries of Weight are turned into NA so the column can be made numeric
  naniar::replace_with_na(list(Weight = "nr")) |>
  mutate(
    weight  = as.numeric(Weight),
    # volume is the difference between the post and pre measurements
    volume  = BC_post - BC_pre,
    culture = BC_result == "Pos"
  ) |>
  # the age column is renamed while selecting
  select(age = `Age (years)`, weight, volume, culture)

The data from Cambodia:

# Blood volume data from Cambodia, reduced to age, weight, volume and culture
# result.
bv_cambodia <- files[grep("cambo", files)] |>
  read_excel() |>
  mutate(
    # Age in years between birth and admission. The unit is forced to days:
    # a plain `admdate - dateofbirth` returns a difftime whose unit is chosen
    # automatically from the smallest difference, so a single patient admitted
    # less than a day after birth would switch the whole column to hours,
    # minutes or seconds and make every age wrong.
    age     = as.numeric(difftime(admdate, dateofbirth, units = "days")) /
                365.25,
    # volume is the difference between the two recorded weights
    volume  = bcultwt2 - bcultwt1,
    culture = bculture == "S typhi"
  ) |>
  # a missing culture result is counted as not positive
  tidyr::replace_na(list(culture = FALSE)) |>
  select(age, weight, volume, culture)

Saving to disk:

# Write the four cleaned data sets as RDS files in the clean_data folder.
# The folder is created first when missing, because saveRDS() cannot write
# into a directory that does not exist.
dir.create(paste0(data_path, "clean_data"), showWarnings = FALSE,
           recursive = TRUE)
saveRDS(nepal, paste0(data_path, "clean_data/nepal.rds"))
saveRDS(cambodia_bangladesh, paste0(data_path, "clean_data/cambodia_bangladesh.rds"))
saveRDS(bv_bangladesh, paste0(data_path, "clean_data/bv_bangladesh.rds"))
saveRDS(bv_cambodia, paste0(data_path, "clean_data/bv_cambodia.rds"))