# Root folder of the project; it ends with a slash so that sub-paths can be
# appended directly with paste0().
data_path <- paste0("/Users/MarcChoisy/Library/CloudStorage/",
                    "OneDrive-OxfordUniversityClinicalResearchUnit/",
                    "GitHub/choisy/typhoid/")
Data cleaning
Parameters
The path to the data folder:
Packages
Required packages:
# Packages this script relies on. "tidyr" is listed explicitly because
# tidyr::replace_na() is called directly further down.
required_packages <- c("readxl", "dplyr", "naniar", "tidyr")
Making sure that the required packages are installed:
# Required packages that are not yet in the local library ...
to_inst <- required_packages[
  !required_packages %in% installed.packages()[, "Package"]
]
# ... are installed, if there are any.
if (length(to_inst) > 0) {
  install.packages(to_inst)
}
Loading some of these packages:
library(readxl)
library(dplyr)
Utility function
Tuning factor():
# Wrapper around factor() that always builds an ordered factor.
# All arguments are forwarded unchanged to factor().
factor2 <- function(...) factor(..., ordered = TRUE)
Reading and cleaning data
The files in the raw data folder:
# Full paths of all the files found in the raw data folder.
files <- dir(paste0(data_path, "raw_data"), full.names = TRUE)
The names of the variables:
# Common variable names given to the selected columns of the clinical data
# sets; the order here must match the order of the columns in each select().
var_names <- c("culture", "sex", "age", "fever", "cough", "diarrhea",
               "vomiting", "abdominal_pain", "constipation", "headache",
               "pulse", "temperature", "splenomegaly", "hepatomegaly", "WBC",
               "platelets", "ALT", "IgM", "CRP")
CRP levels:
# CRP categories, listed in increasing order.
crp_levels <- c("<10", "10-40", "40-80", ">80")
Reading and cleaning raw data from Nepal:
# Nepal data set: read the Excel file, keep the clinical variables and the five
# score columns, rename them to the common variable names and recode each one.
nepal <- files[grep("nepa", files)] |>
  read_excel() |>
  # Columns are listed in the order of `var_names`, followed by the 5 scores.
  select(BloodCSResult, Sex, Age, Fever, Cough, Diarrhoea,
         `Vomiting...26`, Abdopain, Constipation, Headache,
         Pulse, OralTemperature, Splenomegaly, Hepatomegaly,
         WBC_gro, Platelets_gro, ALT, Typhoid_IgM, `CRP_mg/L`,
         scorev1, scorev2, scorev3, scorev4, Score8) |>
  setNames(c(var_names, paste0("score", c(1:4, 8)))) |>
  mutate(
    # Culture is TRUE when the result is coded "SPA" or "ST".
    across(culture, ~ .x == "SPA" | .x == "ST"),
    # IgM: keep the first character only, with "N" recoded as "0".
    across(IgM, ~ factor2(sub("N", "0", substring(.x, 1, 1)))),
    # CRP: drop the parenthesised part (and what follows) to get the category.
    across(CRP,
           ~ factor2(sub(" *\\(.*\\).*$", "", .x), levels = crp_levels)),
    across(c(age, fever, cough, diarrhea, vomiting, abdominal_pain,
             constipation, headache, pulse, platelets, ALT,
             starts_with("score")), as.integer),
    # Sex is "male" when coded "1" or "Male", "female" otherwise.
    across(sex,
           ~ factor(c("female", "male")[(.x == "1" | .x == "Male") + 1])),
    # Clinical signs are TRUE when coded "1", "TRUE" or "Yes".
    across(c(splenomegaly, hepatomegaly),
           ~ .x == "1" | .x == "TRUE" | .x == "Yes")
  )
Reading and cleaning raw data from Cambodia and Bangladesh:
# Cambodia and Bangladesh data set: read the Excel file, keep the clinical
# variables plus the site column, rename them to the common variable names and
# recode each one.
cambodia_bangladesh <- files[grep("camb_", files)] |>
  read_excel() |>
  # Columns are listed in the order of `var_names`, followed by the site.
  select(culture, sex, Age, feverdays, cough, diarrhoea, vomiting, abdopain,
         constipation, headache, pulse, `temp A`, spleen, hepat, wbc, plts,
         alt, `IgM life assay d0`, `CRP Group...63`, st) |>
  setNames(c(var_names, "country")) |>
  mutate(
    # Every column not listed here is converted to logical.
    across(-c(sex, age, fever, pulse, temperature, WBC, platelets, ALT, IgM,
              CRP, country), as.logical),
    across(IgM, factor2),
    # CRP codes 0, 1, 2, 3 index the successive elements of `crp_levels`.
    across(CRP, ~ factor2(crp_levels[.x + 1], levels = crp_levels)),
    across(c(fever, pulse, ALT), as.integer),
    # Sex code 0 gives "female" and 1 gives "male".
    across(sex, ~ factor(c("female", "male")[.x + 1])),
    # Site code 1 gives "Cambodia" and 2 gives "Bangladesh".
    across(country, ~ factor(c("Cambodia", "Bangladesh")[.x]))
  )
Reading and cleaning the blood volume data
The data from Bangladesh:
# Blood volume data from Bangladesh: age, weight, blood culture volume and
# culture result.
bv_bangladesh <- files[grep("bangl", files)] |>
  read_excel() |>
  # Weights recorded as "nr" are turned into missing values so that the
  # conversion to numeric below does not have to deal with them.
  naniar::replace_with_na(list(Weight = "nr")) |>
  rename(age = `Age (years)`) |>
  mutate(weight = as.numeric(Weight),
         # Volume is the difference between the post and pre measurements.
         volume = BC_post - BC_pre,
         culture = BC_result == "Pos") |>
  select(age, weight, volume, culture)
The data from Cambodia:
# Blood volume data from Cambodia: age, weight, blood culture volume and
# culture result.
bv_cambodia <- files[grep("cambo", files)] |>
  read_excel() |>
  mutate(
    # Age in years. The time difference is explicitly expressed in days: with
    # the default automatic unit, a single very small difference in the data
    # would switch the whole column to hours, minutes or seconds.
    age = as.numeric(difftime(admdate, dateofbirth, units = "days")) / 365.25,
    # Volume is the difference between the two bcultwt measurements.
    volume = bcultwt2 - bcultwt1,
    culture = bculture == "S typhi"
  ) |>
  # A missing culture result is treated as a negative one.
  tidyr::replace_na(list(culture = FALSE)) |>
  select(age, weight, volume, culture)
Saving to disk:
# Write each cleaned data set as an RDS file in the clean data folder.
nepal |>
  saveRDS(paste0(data_path, "clean_data/nepal.rds"))
cambodia_bangladesh |>
  saveRDS(paste0(data_path, "clean_data/cambodia_bangladesh.rds"))
bv_bangladesh |>
  saveRDS(paste0(data_path, "clean_data/bv_bangladesh.rds"))
bv_cambodia |>
  saveRDS(paste0(data_path, "clean_data/bv_cambodia.rds"))