library(survival)
library(car)
library(grid)
library(pscl)
library(segmented)
library(readxl)
library(lubridate)
library(tidyr)
library(purrr)
library(stringr)
library(magrittr) # safer to load in penultimate position
library(dplyr) # safer to load in last position
A function that does the likelihood ratio test (LRT):
lrt <- function(...) anova(..., test = "LRT")
A tuning of the diff()
function:
diff2 <- function(x, ...) c(diff(x, ...), NA)
Social risk factors:
srf <- c("hmp", # prison
"etoh", # alcoholic
"drugs", # drug user
"nfa") # homeless
Loading the data sets from the files dataset1.xlsx
and
dataset3.xlsx
:
bmg1 <- "dataset1.xlsx" %>%
read_excel() %>%
mutate_at(srf, as.logical) # no NA is these variables
bmg3 <- "dataset3.xlsx" %>%
read_excel() %>%
mutate_at(srf, ~ .x %>%
na_if("Unknown") %>% # assumes that "Unknown" = NA
equals("Yes"))
Let’s compare bmg1
and bmg3
with some basic
checks. Checking for the presence of duplicated IDs:
any(table(bmg1$id) > 1)
## [1] FALSE
any(table(bmg3$id) > 1)
## [1] FALSE
Checking that the exact same sets of IDs are used for the 2 data sets:
identical(sort(bmg1$id), sort(bmg3$id))
## [1] TRUE
Are there any variables that have have different values in the two data sets:
intsct <- setdiff(intersect(names(bmg1), names(bmg3)), "id")
intsct_1 <- paste0(intsct, ".1")
intsct_3 <- paste0(intsct, ".3")
bmg13 <- bmg3 %>%
mutate_at(srf, replace_na, FALSE) %>%
left_join(bmg1, "id", suffix = c(".3", ".1"))
(difrt <- intsct[which(! map2_lgl(bmg13[, intsct_1], bmg13[, intsct_3], identical))])
## [1] "lineage"
Thus, the variable lineage
seems to have different
values in the two data sets:
if (length(difrt)) {
show_differences <- function(x, y) {
X <- bmg13[[x]]
Y <- bmg13[[y]]
bmg13[is.na(X - Y) | (X != Y), c("id", x, y)]
}
map2(paste0(difrt, ".1"), paste0(difrt, ".3"), show_differences)
}
## [[1]]
## # A tibble: 1 × 3
## id lineage.1 lineage.3
## <dbl> <dbl> <dbl>
## 1 5636 10 NA
From these observations, let’s merge bmg1
and
bmg3
, taking bmg3
as a reference (i.e. with a
missing lineage):
bmg <- bmg3 %>%
select(c("id", setdiff(names(bmg3), names(bmg1)))) %>%
left_join(bmg1, "id") %>%
rename(prison = hmp,
alcohol = etoh,
homeless = nfa,
post_code = pc) %>%
mutate(date = as_date(firstsample),
sex = c("female", "male")[mf],
lineage = na_if(lineage, 10),
born = ifelse(ukb == "Not known", NA, c("uk", "os1", "os2")[imstat + 1])) %>%
mutate_at(vars(index2y, cluster, pul), as.logical) %>%
mutate_at(vars(id, cluster_number, lineage, post_code), as.integer) %>%
select(id, post_code, date, sex, born, pul, prison, alcohol, homeless, drugs,
cluster, cluster_number, lineage)
Adding the variables start
and chronorder
used for generating figure 2:
bmg %<>%
group_by(cluster_number) %>%
summarise(start = min(date)) %>%
mutate(chronorder = rank(start, ties = "first")) %>%
left_join(bmg, ., "cluster_number")
Adding time
and status
for the survival
analyses:
end_of_data <- max(bmg$date)
bmg %<>%
arrange(date) %>%
group_by(cluster_number) %>%
group_map(~ mutate(.x, time = diff2(as.numeric(date))), .keep = TRUE) %>%
bind_rows() %>%
mutate(status = !is.na(time),
time = ifelse(status, time, end_of_data - date),
no_other_factor = !(prison | alcohol | homeless | drugs))
Adding the number of social risk factors:
bmg %<>% mutate(srf = prison + alcohol + homeless + drugs)
Final version of the data set that merges bmg1
and
bmg3
is thus:
bmg
## # A tibble: 1,653 × 19
## id post_code date sex born pul prison alcohol homeless drugs
## <int> <int> <date> <chr> <chr> <lgl> <lgl> <lgl> <lgl> <lgl>
## 1 234 8 2011-06-04 male os1 TRUE FALSE FALSE FALSE FALSE
## 2 560 8 2013-04-10 female os1 TRUE FALSE FALSE FALSE FALSE
## 3 253 66 2011-07-04 female os2 TRUE FALSE FALSE FALSE FALSE
## 4 290 66 2011-08-25 male os1 TRUE FALSE FALSE FALSE FALSE
## 5 3497 19 2014-03-24 male os2 TRUE FALSE FALSE FALSE FALSE
## 6 4347 19 2015-07-07 male uk TRUE FALSE FALSE FALSE FALSE
## 7 2299 11 2009-11-23 female os1 TRUE FALSE FALSE FALSE FALSE
## 8 5 11 2009-12-14 female os1 TRUE FALSE FALSE FALSE FALSE
## 9 426 28 2012-06-08 male os1 FALSE FALSE FALSE FALSE FALSE
## 10 2542 66 2010-03-25 female uk TRUE FALSE FALSE FALSE FALSE
## # ℹ 1,643 more rows
## # ℹ 9 more variables: cluster <lgl>, cluster_number <int>, lineage <int>,
## # start <date>, chronorder <int>, time <dbl>, status <lgl>,
## # no_other_factor <lgl>, srf <int>
Now we want to verify the cluster
variable. Here we
compute from scratch the set of IDs that belong to a cluster:
in_a_cluster1 <- bmg %>%
group_by(cluster_number) %>%
tally() %>%
filter(n > 1) %>%
pull(cluster_number)
This is the set of IDs reported in the data set to belong to a cluster:
in_a_cluster2 <- bmg %>%
filter(cluster) %>%
pull(cluster_number) %>%
unique()
We can verify that this 2 sets are identical:
identical(in_a_cluster1, in_a_cluster2)
## [1] TRUE
Next, we want to verify that there is only one lineage per cluster:
bmg %>%
group_by(lineage, cluster_number) %>%
tally() %>%
pivot_wider(names_from = lineage, values_from = n) %>%
mutate_all(replace_na, 0) %>%
mutate_at(vars(`1`, `2`, `3`, `4`, `NA`), as.logical) %>%
mutate(s = `1` + `2` + `3` + `4` + `NA`) %>%
filter(s > 1)
## # A tibble: 1 × 7
## cluster_number `1` `2` `3` `4` `NA` s
## <int> <lgl> <lgl> <lgl> <lgl> <lgl> <int>
## 1 27 FALSE FALSE TRUE FALSE TRUE 2
All good. Now confirming that there is one lineage missing value (ID 5636) in the cluster number 27:
bmg %>%
filter(is.na(lineage)) %>%
select(id, cluster_number, lineage)
## # A tibble: 1 × 3
## id cluster_number lineage
## <int> <int> <int>
## 1 5636 27 NA
bmg1_12 <- "dataset1_snps12.xlsx" %>%
read_excel() %>%
mutate_at(srf, as.logical) %>%
select(-`_merge`)
bmg_12 <- bmg3 %>%
select(c("id", setdiff(names(bmg3), names(bmg1_12)))) %>%
left_join(bmg1_12, "id") %>%
rename(prison = hmp,
alcohol = etoh,
homeless = nfa,
post_code = pc) %>%
mutate(date = as_date(firstsample),
sex = c("female", "male")[mf],
lineage = na_if(lineage, 10),
born = ifelse(ukb == "Not known", NA, c("uk", "os1", "os2")[imstat + 1])) %>%
mutate_at(vars(index2y, cluster, pul), as.logical) %>%
mutate_at(vars(id, cluster_number, lineage, post_code), as.integer) %>%
select(id, post_code, date, sex, born, pul, prison, alcohol, homeless, drugs,
cluster, cluster_number, lineage)
bmg_12 %<>%
group_by(cluster_number) %>%
summarise(start = min(date)) %>%
mutate(chronorder = rank(start, ties = "first")) %>%
left_join(bmg_12, ., "cluster_number")
end_of_data <- max(bmg_12$date)
bmg_12 %<>%
arrange(date) %>%
group_by(cluster_number) %>%
group_map(~ mutate(.x, time = diff2(as.numeric(date))), .keep = TRUE) %>%
bind_rows() %>%
mutate(status = !is.na(time),
time = ifelse(status, time, end_of_data - date),
no_other_factor = !(prison | alcohol | homeless | drugs))
bmg_12 %<>% mutate(srf = prison + alcohol + homeless + drugs)
bmg_12
## # A tibble: 1,653 × 19
## id post_code date sex born pul prison alcohol homeless drugs
## <int> <int> <date> <chr> <chr> <lgl> <lgl> <lgl> <lgl> <lgl>
## 1 234 8 2011-06-04 male os1 TRUE FALSE FALSE FALSE FALSE
## 2 560 8 2013-04-10 female os1 TRUE FALSE FALSE FALSE FALSE
## 3 253 66 2011-07-04 female os2 TRUE FALSE FALSE FALSE FALSE
## 4 290 66 2011-08-25 male os1 TRUE FALSE FALSE FALSE FALSE
## 5 3497 19 2014-03-24 male os2 TRUE FALSE FALSE FALSE FALSE
## 6 4347 19 2015-07-07 male uk TRUE FALSE FALSE FALSE FALSE
## 7 2299 11 2009-11-23 female os1 TRUE FALSE FALSE FALSE FALSE
## 8 5 11 2009-12-14 female os1 TRUE FALSE FALSE FALSE FALSE
## 9 426 28 2012-06-08 male os1 FALSE FALSE FALSE FALSE FALSE
## 10 2542 66 2010-03-25 female uk TRUE FALSE FALSE FALSE FALSE
## # ℹ 1,643 more rows
## # ℹ 9 more variables: cluster <lgl>, cluster_number <int>, lineage <int>,
## # start <date>, chronorder <int>, time <dbl>, status <lgl>,
## # no_other_factor <lgl>, srf <int>
Let’s compare bmg
and bmg12
:
table(bmg$cluster, bmg_12$cluster)
##
## FALSE TRUE
## FALSE 1095 47
## TRUE 0 511
Reading the new data set:
bmg1_12b <- "snp12_clusters.xlsx" %>%
read_excel() %>%
setNames(c("id", "date", "cluster_number")) %>%
mutate(id = as.integer(id),
date = as_date(date),
cluster_number = as.integer(cluster_number))
Checking consistency on the date:
bmg %>%
select(id, date) %>%
left_join(select(bmg1_12b, id, date), "id") %>%
mutate(dff = date.x - date.y) %>%
pull(dff) %>%
unique()
## [1] 0
The cluster ID that actually correspond to actual clusters:
in_cluster <- bmg1_12b %>%
group_by(cluster_number) %>%
tally() %>%
filter(n > 1) %>%
pull(cluster_number)
new_cluster <- bmg1_12b %>%
mutate(cluster2 = as.numeric(cluster_number %in% in_cluster)) %>%
select(id, cluster2)
bmg1_12b <- bmg1_12 %>%
left_join(new_cluster, "id") %>%
mutate(cluster = cluster2) %>%
select(-cluster2)
bmg_12b <- bmg3 %>%
select(c("id", setdiff(names(bmg3), names(bmg1_12b)))) %>%
left_join(bmg1_12b, "id") %>%
rename(prison = hmp,
alcohol = etoh,
homeless = nfa,
post_code = pc) %>%
mutate(date = as_date(firstsample),
sex = c("female", "male")[mf],
lineage = na_if(lineage, 10),
born = ifelse(ukb == "Not known", NA, c("uk", "os1", "os2")[imstat + 1])) %>%
mutate_at(vars(index2y, cluster, pul), as.logical) %>%
mutate_at(vars(id, cluster_number, lineage, post_code), as.integer) %>%
select(id, post_code, date, sex, born, pul, prison, alcohol, homeless, drugs,
cluster, cluster_number, lineage)
bmg_12b %<>%
group_by(cluster_number) %>%
summarise(start = min(date)) %>%
mutate(chronorder = rank(start, ties = "first")) %>%
left_join(bmg_12b, ., "cluster_number")
end_of_data <- max(bmg_12b$date)
bmg_12b %<>%
arrange(date) %>%
group_by(cluster_number) %>%
group_map(~ mutate(.x, time = diff2(as.numeric(date))), .keep = TRUE) %>%
bind_rows() %>%
mutate(status = !is.na(time),
time = ifelse(status, time, end_of_data - date),
no_other_factor = !(prison | alcohol | homeless | drugs))
bmg_12b %<>% mutate(srf = prison + alcohol + homeless + drugs)
bmg_12b
## # A tibble: 1,653 × 19
## id post_code date sex born pul prison alcohol homeless drugs
## <int> <int> <date> <chr> <chr> <lgl> <lgl> <lgl> <lgl> <lgl>
## 1 234 8 2011-06-04 male os1 TRUE FALSE FALSE FALSE FALSE
## 2 560 8 2013-04-10 female os1 TRUE FALSE FALSE FALSE FALSE
## 3 253 66 2011-07-04 female os2 TRUE FALSE FALSE FALSE FALSE
## 4 290 66 2011-08-25 male os1 TRUE FALSE FALSE FALSE FALSE
## 5 3497 19 2014-03-24 male os2 TRUE FALSE FALSE FALSE FALSE
## 6 4347 19 2015-07-07 male uk TRUE FALSE FALSE FALSE FALSE
## 7 2299 11 2009-11-23 female os1 TRUE FALSE FALSE FALSE FALSE
## 8 5 11 2009-12-14 female os1 TRUE FALSE FALSE FALSE FALSE
## 9 426 28 2012-06-08 male os1 FALSE FALSE FALSE FALSE FALSE
## 10 2542 66 2010-03-25 female uk TRUE FALSE FALSE FALSE FALSE
## # ℹ 1,643 more rows
## # ℹ 9 more variables: cluster <lgl>, cluster_number <int>, lineage <int>,
## # start <date>, chronorder <int>, time <dbl>, status <lgl>,
## # no_other_factor <lgl>, srf <int>
Let’s compare bmg12
and bmg12b
:
table(bmg_12$cluster, bmg_12b$cluster)
##
## FALSE TRUE
## FALSE 1091 4
## TRUE 2 556
Total number of patients:
(nb_patients <- nrow(bmg))
## [1] 1653
Numbers of male and female:
sex_ratio <- bmg %>%
group_by(sex) %>%
tally() %>%
mutate(perc = round(n / nb_patients, 2))
sex_ratio
## # A tibble: 2 × 3
## sex n perc
## <chr> <int> <dbl>
## 1 female 695 0.42
## 2 male 958 0.58
Number of missing data on location of birth:
(nb_missing_born <- sum(is.na(bmg$born)))
## [1] 10
Corresponding percentage:
round(nb_missing_born / nb_patients, 2)
## [1] 0.01
Number of people born in the UK:
(nb_uk <- sum(bmg$born == "uk", na.rm = TRUE))
## [1] 448
Corresponding percentage:
(perc_uk <- round(nb_uk / nb_patients, 2))
## [1] 0.27
Number of people born overseas:
nb_patients - nb_uk - nb_missing_born
## [1] 1195
Corresponding percentage:
round((nb_patients - nb_uk - nb_missing_born) / nb_patients, 2)
## [1] 0.72
Pulmonary TB:
bmg %>%
group_by(pul) %>%
tally()
## # A tibble: 2 × 2
## pul n
## <lgl> <int>
## 1 FALSE 610
## 2 TRUE 1043
Type of TB x born in the UK or not:
tbl <- bmg %>%
mutate(uk = born == "uk") %$%
table(uk, pul) %>%
addmargins()
tbl
## pul
## uk FALSE TRUE Sum
## FALSE 505 690 1195
## TRUE 99 349 448
## Sum 604 1039 1643
Corresponding percentages:
round(100 * tbl[1:2, -3] / tbl[1:2, 3])
## pul
## uk FALSE TRUE
## FALSE 42 58
## TRUE 22 78
Lineages:
bmg %>%
group_by(lineage) %>%
tally()
## # A tibble: 5 × 2
## lineage n
## <int> <int>
## 1 1 214
## 2 2 86
## 3 3 703
## 4 4 649
## 5 NA 1
Number of patients in a cluster (versus not):
nb_clst <- bmg %>%
group_by(cluster) %>%
tally()
nb_clst
## # A tibble: 2 × 2
## cluster n
## <lgl> <int>
## 1 FALSE 1142
## 2 TRUE 511
Confirming above result, sex and place of birth are independent but social risk factors have a significant effect on both of them, namely males and those born in the UK are more associated with social risk factors:
summary(glm(sex == "male" ~ born == "uk", binomial, bmg))
##
## Call:
## glm(formula = sex == "male" ~ born == "uk", family = binomial,
## data = bmg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.31556 0.05858 5.387 7.16e-08 ***
## born == "uk"TRUE 0.04545 0.11249 0.404 0.686
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2234.1 on 1642 degrees of freedom
## Residual deviance: 2233.9 on 1641 degrees of freedom
## (10 observations deleted due to missingness)
## AIC: 2237.9
##
## Number of Fisher Scoring iterations: 4
summary(glm(sex == "male" ~ prison | alcohol | homeless | drugs, binomial, bmg))
##
## Call:
## glm(formula = sex == "male" ~ prison | alcohol | homeless | drugs,
## family = binomial, data = bmg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20606 0.05191 3.969 7.21e-05
## prison | alcohol | homeless | drugsTRUE 1.57810 0.23611 6.684 2.33e-11
##
## (Intercept) ***
## prison | alcohol | homeless | drugsTRUE ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2249.5 on 1652 degrees of freedom
## Residual deviance: 2189.6 on 1651 degrees of freedom
## AIC: 2193.6
##
## Number of Fisher Scoring iterations: 3
summary(glm(born == "uk" ~ prison | alcohol | homeless | drugs, binomial, bmg))
##
## Call:
## glm(formula = born == "uk" ~ prison | alcohol | homeless | drugs,
## family = binomial, data = bmg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.19209 0.06129 -19.45 <2e-16
## prison | alcohol | homeless | drugsTRUE 1.85596 0.18135 10.23 <2e-16
##
## (Intercept) ***
## prison | alcohol | homeless | drugsTRUE ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1925.3 on 1642 degrees of freedom
## Residual deviance: 1813.5 on 1641 degrees of freedom
## (10 observations deleted due to missingness)
## AIC: 1817.5
##
## Number of Fisher Scoring iterations: 4
Sex is not associated with the likelihood of the case to be part of a cluster but social risk factor and being born in the UK both significantly increase the likelihood to be part of a cluster:
summary(glm(cluster ~ (prison | alcohol | homeless | drugs) + sex, binomial, bmg))
##
## Call:
## glm(formula = cluster ~ (prison | alcohol | homeless | drugs) +
## sex, family = binomial, data = bmg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.99858 0.08548 -11.682 < 2e-16
## prison | alcohol | homeless | drugsTRUE 1.33140 0.17796 7.482 7.35e-14
## sexmale 0.09132 0.11258 0.811 0.417
##
## (Intercept) ***
## prison | alcohol | homeless | drugsTRUE ***
## sexmale
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2044.5 on 1652 degrees of freedom
## Residual deviance: 1982.1 on 1650 degrees of freedom
## AIC: 1988.1
##
## Number of Fisher Scoring iterations: 4
summary(glm(cluster ~ (prison | alcohol | homeless | drugs) + (born == "uk"), binomial, bmg))
##
## Call:
## glm(formula = cluster ~ (prison | alcohol | homeless | drugs) +
## (born == "uk"), family = binomial, data = bmg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.36774 0.07219 -18.945 < 2e-16
## prison | alcohol | homeless | drugsTRUE 0.81466 0.18998 4.288 1.8e-05
## born == "uk"TRUE 1.51194 0.12273 12.320 < 2e-16
##
## (Intercept) ***
## prison | alcohol | homeless | drugsTRUE ***
## born == "uk"TRUE ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2035.4 on 1642 degrees of freedom
## Residual deviance: 1820.0 on 1640 degrees of freedom
## (10 observations deleted due to missingness)
## AIC: 1826
##
## Number of Fisher Scoring iterations: 4
Let’s first transform the data into monthly incidence time series. Because we will model potential seasonality with harmonic regression, we also generate cos and sine wave with the following function:
wave_trans <- function(x, f = cos) {
f(2 * pi * x / 365)
}
And now the time series data can be produced this way:
bmg_ts <- bmg %>%
mutate(year = year(date),
month = month(date)) %>%
group_by(year, month, cluster) %>%
tally() %>%
ungroup() %>%
mutate(date = ymd(paste(year, month, "1")),
date2 = as.integer(date),
cosdate = wave_trans(date2, cos),
sindate = wave_trans(date2, sin))
Which looks like:
bmg_ts
## # A tibble: 248 × 8
## year month cluster n date date2 cosdate sindate
## <dbl> <dbl> <lgl> <int> <date> <int> <dbl> <dbl>
## 1 2009 1 FALSE 13 2009-01-01 14245 0.985 0.171
## 2 2009 1 TRUE 5 2009-01-01 14245 0.985 0.171
## 3 2009 2 FALSE 17 2009-02-01 14276 0.761 0.649
## 4 2009 2 TRUE 5 2009-02-01 14276 0.761 0.649
## 5 2009 3 FALSE 16 2009-03-01 14304 0.374 0.928
## 6 2009 3 TRUE 10 2009-03-01 14304 0.374 0.928
## 7 2009 4 FALSE 16 2009-04-01 14335 -0.150 0.989
## 8 2009 4 TRUE 8 2009-04-01 14335 -0.150 0.989
## 9 2009 5 FALSE 16 2009-05-01 14365 -0.619 0.786
## 10 2009 5 TRUE 11 2009-05-01 14365 -0.619 0.786
## # ℹ 238 more rows
Let’s consider some Poisson models with various options for the interactions:
# no interactions:
mod_pois1 <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts)
# interaction between seasonality and cluster:
mod_pois2 <- update(mod_pois1, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3 <- update(mod_pois1, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4 <- update(mod_pois2, . ~ . + date2 : cluster)
Let’s compare these models:
lrt(mod_pois1, mod_pois2, mod_pois4)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate +
## date2:cluster
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 243 231.51
## 2 241 221.94 2 9.5713 0.008349 **
## 3 240 221.73 1 0.2091 0.647474
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrt(mod_pois1, mod_pois3, mod_pois4)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + date2:cluster
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate +
## date2:cluster
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 243 231.51
## 2 242 231.25 1 0.2607 0.609650
## 3 240 221.73 2 9.5198 0.008567 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
So, the best model would be mod_pois2
(i.e. interaction
between cluster and seasonality, but not between cluster and linear
trend). Next, let’s see whether the seasonality is significant for both
the cluster and non-cluster:
mod_pois2a <- update(mod_pois1, . ~ . - cluster, data = filter(bmg_ts, cluster))
mod_pois2b <- update(mod_pois2a, data = filter(bmg_ts, !cluster))
test_seasonality <- function(x) {
lrt(update(x, . ~ . - cosdate - sindate), x)
}
test_seasonality(mod_pois2a)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 121 112.147
## 2 119 91.685 2 20.462 3.604e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_seasonality(mod_pois2b)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 134.11
## 2 121 130.05 2 4.061 0.1313
Which means that seasonality is significant only for the cluster
(mod_pois2a
). Let’s thus update this model accordingly:
mod_pois2b %<>% update(. ~ . - cosdate - sindate)
Next, let’s look at segmented regressions version of these two models, with periods predefined by these dates:
date_1 <- ymd(20110101)
date_2 <- ymd(20150101)
Let’s do the test for the cluster and non cluster:
segmented2 <- function(x, y) {
segmented(x, ~ date2, y - 15, control = seg.control(it.max = 0))
}
test_segments <- function(x) {
mod1 <- segmented2(x, date_1)
mod2 <- segmented2(x, date_2)
mod3 <- segmented2(x, c(date_1, date_2))
print(lrt(x, mod1))
print(lrt(x, mod2))
lrt(mod2, mod3)
}
test_segments(mod_pois2a)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 119 91.685
## 2 118 91.601 1 0.083495 0.7726
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 119 91.685
## 2 118 90.949 1 0.73597 0.391
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate + U1.date2
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 118 90.949
## 2 117 89.970 1 0.9785 0.3226
test_segments(mod_pois2b)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 134.11
## 2 122 133.49 1 0.61578 0.4326
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 134.11
## 2 122 128.93 1 5.1811 0.02283 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 122 128.93
## 2 121 128.42 1 0.5077 0.4761
Which means that there is a change in the trend only for the
non-cluster, and only after 2015. Let’s update model
mod_pois2b
accordingly:
mod_pois2b %<>% segmented2(date_2)
Now we can make a figure. First we need a function that computes predictions with confidence intervals from a model:
predict2 <- function(object, newdata, level = .95) {
q <- (1 - level) / 2
predict(object, newdata, se.fit = TRUE) %>%
magrittr::extract(-3) %>%
as_tibble() %>%
mutate(lwr = fit + qt(q, Inf) * se.fit,
upr = fit + qt(1 - q, Inf) * se.fit) %>%
mutate_at(vars(fit, lwr, upr), object$family$linkinv) %>%
cbind(newdata, .)
}
Let’s generate some new data for the covariables used for prediction:
newdata <- tibble(date = unique(bmg_ts$date)) %>%
mutate(date2 = as.integer(date),
cosdate = wave_trans(date2, cos),
sindate = wave_trans(date2, sin),
U1.date2 = date2 - as.numeric(date_2) + 15,
U1.date2 = ifelse(U1.date2 < 0, 0, U1.date2))
Next we need a function that plots the model predictions with confidence interval:
arrows2 <- function(...) arrows(..., length = .02, angle = 90, code = 3)
plot_mod_pred <- function(x, color, lwd = 1.5) {
with(x, {
lines(date, fit, col = color, lwd = lwd)
arrows2(date, lwr, date, upr, col = color, lwd = lwd)
})
}
Now that we have these two functions, we can make the figure:
# The data of non cluster:
bmg_ts %>%
filter(! cluster) %$%
plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")
# The data of cluster:
bmg_ts %>%
filter(cluster) %$%
points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)
# The model of cluster:
mod_pois2a %>%
predict2(newdata) %>%
plot_mod_pred(2)
# The model of non cluster:
mod_pois2b %>%
predict2(newdata) %>%
plot_mod_pred(4)
# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)
# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white")
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)
On figure 1, replace incidence by incidence rate (reviewer 5).
ons <- readxl::read_excel("ONS-mid-year-population-estimates.xlsx")
bmg_ts2 <- ons %>%
select(Year, Total) %>%
left_join(bmg_ts, ., c("year" = "Year"))
# no interactions:
mod_pois1_2 <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts2, offset = log(Total))
# interaction between seasonality and cluster:
mod_pois2_2 <- update(mod_pois1_2, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3_2 <- update(mod_pois1_2, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4_2 <- update(mod_pois2_2, . ~ . + date2 : cluster)
lrt(mod_pois1_2, mod_pois2_2, mod_pois4_2)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate +
## date2:cluster
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 243 230.97
## 2 241 221.39 2 9.5721 0.008345 **
## 3 240 221.18 1 0.2098 0.646923
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod_pois2a_2 <- update(mod_pois1_2, . ~ . - cluster, data = filter(bmg_ts2, cluster))
mod_pois2b_2 <- update(mod_pois2a_2, data = filter(bmg_ts2, !cluster))
test_seasonality(mod_pois2a_2)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 121 111.625
## 2 119 91.485 2 20.14 4.233e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_seasonality(mod_pois2b_2)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 133.67
## 2 121 129.70 2 3.9706 0.1373
mod_pois2b_2 %<>% update(. ~ . - cosdate - sindate)
test_segments(mod_pois2a_2)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 119 91.485
## 2 118 91.412 1 0.073012 0.787
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 119 91.485
## 2 118 90.711 1 0.77457 0.3788
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate + U1.date2
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 118 90.711
## 2 117 89.746 1 0.96441 0.3261
test_segments(mod_pois2b_2)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 133.67
## 2 122 133.02 1 0.64837 0.4207
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 133.67
## 2 122 128.35 1 5.3165 0.02112 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 122 128.35
## 2 121 127.85 1 0.50206 0.4786
mod_pois2b_2 %<>% segmented2(date_2)
newdata2 <- tibble(date = unique(bmg_ts$date)) %>%
mutate(date2 = as.integer(date),
cosdate = wave_trans(date2, cos),
sindate = wave_trans(date2, sin),
U1.date2 = date2 - as.numeric(date_2) + 15,
U1.date2 = ifelse(U1.date2 < 0, 0, U1.date2),
Year = year(date)) %>%
left_join(select(ons, Year, Total), "Year") %>%
select(-Year)
The figure:
# The data of non cluster:
bmg_ts %>%
filter(! cluster) %$%
plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")
# The data of cluster:
bmg_ts %>%
filter(cluster) %$%
points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)
# The model of cluster:
mod_pois2a_2 %>%
predict2(newdata2) %>%
plot_mod_pred(2)
# The model of non cluster:
mod_pois2b_2 %>%
predict2(newdata2) %>%
plot_mod_pred(4)
# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)
# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white")
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)
On figure 1, replace the segmented regression by a spline regression (reviewer 3).
library(mgcv)
# The data of non cluster:
bmg_ts %>%
filter(! cluster) %$%
plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")
# The data of cluster:
bmg_ts %>%
filter(cluster) %$%
points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)
# The model of non cluster:
bmg_ts %>%
filter(! cluster) %>%
gam(n ~ s(date2), poisson, .) %>%
# gam(n ~ s(date2) + s(cosdate) + s(sindate), poisson, .) %>%
predict2(newdata) %>%
plot_mod_pred(4)
# The model of cluster:
bmg_ts %>%
filter(cluster) %>%
gam(n ~ s(date2) + s(cosdate) + s(sindate), poisson, .) %>%
predict2(newdata) %>%
plot_mod_pred(2)
# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)
# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white")
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)
Let’s look at the dispersion in the residuals
AER::dispersiontest(mod_pois2a)
##
## Overdispersion test
##
## data: mod_pois2a
## z = -2.4579, p-value = 0.993
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 0.7780348
AER::dispersiontest(mod_pois2b)
##
## Overdispersion test
##
## data: mod_pois2b
## z = -0.080627, p-value = 0.5321
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 0.9917055
Which comforts us in the choice of a Poisson model. Let’s now look at temporal autocorrelation in residuals. Below is a function that performs a few checks:
test_autocorr <- function(x) {
Residuals <- resid(x)
print(summary(lm(Residuals[-length(Residuals)] ~ Residuals[-1])))
opar <- par(mfrow = 1:2)
plot(Residuals, type = "l", col = 4)
abline(h = 0)
acf(Residuals, main = NA)
par(opar)
}
Let’s check the models for cluster and non cluster
test_autocorr(mod_pois2a)
##
## Call:
## lm(formula = Residuals[-length(Residuals)] ~ Residuals[-1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7054 -0.6757 -0.1475 0.5343 2.2863
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.06134 0.07910 -0.775 0.440
## Residuals[-1] -0.01188 0.09129 -0.130 0.897
##
## Residual standard error: 0.8716 on 120 degrees of freedom
## Multiple R-squared: 0.0001411, Adjusted R-squared: -0.008191
## F-statistic: 0.01694 on 1 and 120 DF, p-value: 0.8967
test_autocorr(mod_pois2b)
##
## Call:
## lm(formula = Residuals[-length(Residuals)] ~ Residuals[-1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1867 -0.5622 0.1478 0.7324 2.1779
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.04882 0.09167 -0.533 0.595
## Residuals[-1] -0.02381 0.08992 -0.265 0.792
##
## Residual standard error: 1.019 on 122 degrees of freedom
## Multiple R-squared: 0.0005744, Adjusted R-squared: -0.007618
## F-statistic: 0.07012 on 1 and 122 DF, p-value: 0.7916
Everything looks OK!
First we need a function that estimates the rate of decrease in % / year, with confidence interval:
est_decrease <- function(x, level = .95) {
c(coef(x)["date2"], rev(confint(x, "date2", level))) %>%
multiply_by(365) %>%
x$family$linkinv() %>%
subtract(1, .) %>%
multiply_by(100) %>%
setNames(c("est", "lwr", "upr"))
}
Let’s apply it to all the data:
glm(n ~ date2, poisson, bmg_ts) %>%
est_decrease() %>%
round(2)
## est lwr upr
## 6.70 5.19 8.19
Let’s apply it the cluster cases
mod_pois2a %>%
update(. ~ . - cosdate - sindate) %>% # we remove the seasonality
est_decrease() %>%
round(2)
## est lwr upr
## 7.40 4.67 10.08
Let’s apply it to the non cluster before 2015:
mod_pois2b %>%
update(. ~ . -U1.date2, data = filter(bmg_ts, ! cluster, date < date_2)) %>%
est_decrease() %>%
round(2)
## est lwr upr
## 10.99 7.15 14.69
And to the non cluster after 2015:
mod_pois2b %>%
update(. ~ . -U1.date2, data = filter(bmg_ts, ! cluster, date >= date_2)) %>%
est_decrease() %>%
round(2)
## est lwr upr
## 3.71 -3.71 10.62
This function plots temporal trends of proportions by quarter or month:
plot_prop_trends <- function(x, val, ylab, ylim = 0:1, dta = bmg, col = "grey",
f1 = lubridate::quarter, f2 = lubridate::yq) {
dta %>%
rename(var = x) %>%
mutate(xxxx = f1(date),
year = year(date)) %>%
group_by(year, xxxx) %>%
summarise(yes = sum(var == val),
no = sum(var != val)) %>%
ungroup() %>%
mutate(date = f2(paste(year, xxxx)),
prop = map2(yes, yes + no, prop.test),
estimate = map_dbl(prop, extract2, "estimate"),
confint = map(prop, extract2, "conf.int")) %>%
unnest_wider(confint, "") %>%
select(date, estimate, confint1, confint2) %$%
plot(date, estimate, ylim = ylim, type = "o",
xlab = NA, ylab = ylab, col = col, pch = 19)
}
The same function but by year:
plot_prop_trends_yr <- function(x, val, ylab, ylim = 0:1, dta = bmg, col = "grey") {
dta %>%
rename(var = x) %>%
mutate(year = year(date)) %>%
group_by(year) %>%
summarise(yes = sum(var == val),
no = sum(var != val)) %>%
ungroup() %>%
mutate(date = ymd(paste(year, 6, 15)),
prop = map2(yes, yes + no, prop.test),
estimate = map_dbl(prop, extract2, "estimate"),
confint = map(prop, extract2, "conf.int")) %>%
unnest_wider(confint, "") %>%
select(date, estimate, confint1, confint2) %$%
plot(date, estimate, ylim = ylim, type = "o",
xlab = NA, ylab = ylab, col = col, pch = 19)
}
A function that adds prediction and confidence interval to an existing plot:
add_pred_ci <- function(x, fit, lwr, upr, col = 4, alpha = .2) {
polygon(c(x, rev(x)), c(lwr, rev(upr)), border = NA, col = adjustcolor(col, alpha))
lines(x, fit, col = col)
}
Plotting the temporal trends by quarter (left column) and year (right column):
opar <- par(mfrow = c(4, 2), cex = 1)
# 1. Temporal trend of the proportion of males:
newdata3 <- tibble(date = seq(min(bmg$date), max(bmg$date), 1))
plot_prop_trends("sex", "male", "proportion of male")
glm(sex == "male" ~ date, binomial, bmg) %>%
predict2(newdata3) %>%
with(add_pred_ci(date, fit, lwr, upr))
abline(h = .5)
abline(h = select(filter(sex_ratio, sex == "male"), perc), lty = 2)
# 2. The same, by year:
plot_prop_trends_yr("sex", "male", "proportion of male")
glm(sex == "male" ~ date, binomial, bmg) %>%
predict2(newdata3) %>%
with(add_pred_ci(date, fit, lwr, upr))
abline(h = .5)
abline(h = select(filter(sex_ratio, sex == "male"), perc), lty = 2)
# 3. Temporal trend of the proportion born in the UK:
tmp <- mutate(bmg, born = replace_na(born, "uk"))
plot_prop_trends("born", "uk", "proportion UK born", ylim = 0:1, dta = tmp)
glm(born == "uk" ~ date, binomial, tmp) %>%
predict2(newdata3) %>%
with(add_pred_ci(date, fit, lwr, upr))
abline(h = perc_uk, lty = 2)
# 4. The same, by year:
plot_prop_trends_yr("born", "uk", "proportion UK born", ylim = 0:1,
dta = mutate(bmg, born = replace_na(born, "uk")))
glm(born == "uk" ~ date, binomial, tmp) %>%
predict2(newdata3) %>%
with(add_pred_ci(date, fit, lwr, upr))
abline(h = perc_uk, lty = 2)
# 5. Temporal trend of the proportion in clusters:
plot_prop_trends("cluster", TRUE, "proportion in cluster")
glm(cluster ~ date, binomial, bmg) %>%
predict2(newdata3) %>%
with(add_pred_ci(date, fit, lwr, upr))
abline(h = nb_clst$n[2] / sum(nb_clst$n), lty = 2)
# 6. The same, by year:
plot_prop_trends_yr("cluster", TRUE, "proportion in cluster")
glm(cluster ~ date, binomial, bmg) %>%
predict2(newdata3) %>%
with(add_pred_ci(date, fit, lwr, upr))
abline(h = nb_clst$n[2] / sum(nb_clst$n), lty = 2)
# 7. Temporal trend of the proportion with social risk factors:
tmp <- mutate(bmg, srf = prison | alcohol | homeless | drugs)
plot_prop_trends("srf", TRUE, "proportion with social risk factors", dta = tmp)
glm(srf ~ date, binomial, tmp) %>%
predict2(newdata3) %>%
with(add_pred_ci(date, fit, lwr, upr))
abline(h = sum(tmp$srf) / nrow(tmp), lty = 2)
# 8. The same, by year:
plot_prop_trends_yr("srf", TRUE, "proportion with social risk factors", dta = tmp)
glm(srf ~ date, binomial, tmp) %>%
predict2(newdata3) %>%
with(add_pred_ci(date, fit, lwr, upr))
abline(h = sum(tmp$srf) / nrow(tmp), lty = 2)
par(opar)
Let’s test these trends:
Anova(glm(drugs ~ date + prison + alcohol + homeless, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
##
## Response: drugs
## LR Chisq Df Pr(>Chisq)
## date 5.237 1 0.02211 *
## prison 97.116 1 < 2.2e-16 ***
## alcohol 15.515 1 8.187e-05 ***
## homeless 31.410 1 2.089e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(prison ~ date + drugs + alcohol + homeless, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
##
## Response: prison
## LR Chisq Df Pr(>Chisq)
## date 0.209 1 0.6473
## drugs 99.458 1 <2e-16 ***
## alcohol 2.476 1 0.1156
## homeless 4.197 1 0.0405 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(alcohol ~ date + drugs + prison + homeless, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
##
## Response: alcohol
## LR Chisq Df Pr(>Chisq)
## date 0.0094 1 0.922624
## drugs 18.4062 1 1.785e-05 ***
## prison 2.8375 1 0.092087 .
## homeless 7.8248 1 0.005153 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(homeless ~ date + drugs + prison + alcohol, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
##
## Response: homeless
## LR Chisq Df Pr(>Chisq)
## date 1.353 1 0.244696
## drugs 34.378 1 4.539e-09 ***
## prison 4.277 1 0.038641 *
## alcohol 7.480 1 0.006239 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Which means that, among these social risk factors, drug usage seems to be the main driver of the increasing trend.
summary(glm(srf ~ date, binomial, tmp))
##
## Call:
## glm(formula = srf ~ date, family = binomial, data = tmp)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.910e+00 1.224e+00 -4.830 1.36e-06 ***
## date 2.262e-04 7.557e-05 2.993 0.00276 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1019.6 on 1652 degrees of freedom
## Residual deviance: 1010.7 on 1651 degrees of freedom
## AIC: 1014.7
##
## Number of Fisher Scoring iterations: 5
bmg_ts_pul <- bmg %>%
filter(pul) %>%
mutate(year = year(date),
month = month(date)) %>%
group_by(year, month, cluster) %>%
tally() %>%
ungroup() %>%
mutate(date = ymd(paste(year, month, "1")),
date2 = as.integer(date),
cosdate = wave_trans(date2, cos),
sindate = wave_trans(date2, sin))
Let’s consider some Poisson models with various options for the interactions:
# no interactions:
mod_pois1_pul <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts_pul)
# interaction between seasonality and cluster:
mod_pois2_pul <- update(mod_pois1_pul, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3_pul <- update(mod_pois1_pul, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4_pul <- update(mod_pois2_pul, . ~ . + date2 : cluster)
Let’s compare these models:
lrt(mod_pois1_pul, mod_pois2_pul, mod_pois4_pul)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate +
## date2:cluster
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 238 187.74
## 2 236 182.56 2 5.1806 0.0750 .
## 3 235 182.04 1 0.5152 0.4729
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrt(mod_pois1_pul, mod_pois3_pul, mod_pois4_pul)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + date2:cluster
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate +
## date2:cluster
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 238 187.74
## 2 237 187.17 1 0.5727 0.44921
## 3 235 182.04 2 5.1232 0.07718 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
So, the best model would (marginally) be mod_pois2
(i.e. interaction between cluster and seasonality, but not between
cluster and linear trend). Next, let’s see whether the seasonality is
significant for both the cluster and non-cluster:
mod_pois2a_pul <- update(mod_pois1_pul, . ~ . - cluster, data = filter(bmg_ts_pul, cluster))
mod_pois2b_pul <- update(mod_pois2a_pul, data = filter(bmg_ts_pul, !cluster))
test_seasonality(mod_pois2a_pul)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 118 90.918
## 2 116 79.717 2 11.201 0.003695 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_seasonality(mod_pois2b_pul)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 121 102.67
## 2 119 102.33 2 0.34646 0.8409
Which means that seasonality is significant only for the cluster
(mod_pois2b
). Let’s thus update this model accordingly:
mod_pois2b_pul %<>% update(. ~ . - cosdate - sindate)
Next, let’s look at segmented regressions version of these two models, with periods predefined by these dates. Let’s do the test for the cluster and non cluster:
test_segments(mod_pois2a_pul)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 116 79.717
## 2 115 79.696 1 0.020579 0.8859
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 116 79.717
## 2 115 79.354 1 0.36332 0.5467
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate + U1.date2
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 115 79.354
## 2 114 79.283 1 0.070156 0.7911
test_segments(mod_pois2b_pul)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 121 102.674
## 2 120 99.242 1 3.4327 0.06392 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 121 102.674
## 2 120 99.494 1 3.1799 0.07455 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 120 99.494
## 2 119 98.536 1 0.95884 0.3275
Which means that there is a marginal change in the trend only for the
non-cluster, either after 2011 or after 2015, but not both at the same
time. Let’s thus not update model mod_pois2b
. Now we can
make the figure. Let’s generate some new data for the covariables used
for prediction:
newdata_pul <- tibble(date = unique(bmg_ts_pul$date)) %>%
mutate(date2 = as.integer(date),
cosdate = wave_trans(date2, cos),
sindate = wave_trans(date2, sin),
U1.date2 = date2 - as.numeric(date_2) + 15,
U1.date2 = ifelse(U1.date2 < 0, 0, U1.date2))
And now the figure:
# The data of non cluster:
bmg_ts_pul %>%
filter(! cluster) %$%
plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")
# The data of cluster:
bmg_ts_pul %>%
filter(cluster) %$%
points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)
# The model of cluster:
mod_pois2a_pul %>%
predict2(newdata_pul) %>%
plot_mod_pred(2)
# The model of non cluster:
mod_pois2b_pul %>%
predict2(newdata_pul) %>%
plot_mod_pred(4)
# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)
# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white")
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)
bmg_ts_nonpul <- bmg %>%
filter(!pul) %>%
mutate(year = year(date),
month = month(date)) %>%
group_by(year, month, cluster) %>%
tally() %>%
ungroup() %>%
mutate(date = ymd(paste(year, month, "1")),
date2 = as.integer(date),
cosdate = wave_trans(date2, cos),
sindate = wave_trans(date2, sin))
Let’s consider some Poisson models with various options for the interactions:
# no interactions:
mod_pois1_nonpul <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts_nonpul)
# interaction between seasonality and cluster:
mod_pois2_nonpul <- update(mod_pois1_nonpul, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3_nonpul <- update(mod_pois1_nonpul, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4_nonpul <- update(mod_pois2_nonpul, . ~ . + date2 : cluster)
Let’s compare these models:
lrt(mod_pois1_nonpul, mod_pois2_nonpul, mod_pois4_nonpul)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate +
## date2:cluster
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 187 160.11
## 2 185 160.03 2 0.08641 0.95771
## 3 184 157.06 1 2.96532 0.08507 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrt(mod_pois1_nonpul, mod_pois3_nonpul, mod_pois4_nonpul)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + date2:cluster
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate +
## date2:cluster
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 187 160.11
## 2 186 157.12 1 2.98718 0.08393 .
## 3 184 157.06 2 0.06455 0.96824
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
So, the best model would (marginally) be mod_pois3
(i.e. interaction between cluster and linear trend, but not between
cluster and seasonality). Next, let’s see whether the linear trend is
significant for both the cluster and non-cluster:
mod_pois2a_nonpul <- update(mod_pois1_nonpul, . ~ . - cluster,
data = filter(bmg_ts_nonpul, cluster))
mod_pois2b_nonpul <- update(mod_pois2a_nonpul,
data = filter(bmg_ts_nonpul, !cluster))
test_lineartrend <- function(x) {
lrt(update(x, . ~ . - date2), x)
}
test_lineartrend(mod_pois2a_nonpul)
## Analysis of Deviance Table
##
## Model 1: n ~ cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 66 28.184
## 2 65 28.183 1 0.00080014 0.9774
test_lineartrend(mod_pois2b_nonpul)
## Analysis of Deviance Table
##
## Model 1: n ~ cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 120 145.56
## 2 119 128.88 1 16.684 4.416e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Which means that linear trend is significant only for the non-cluster
(mod_pois2b_nonpul
). Let’s thus update this model
accordingly:
#mod_pois2a_nonpul %<>% update(. ~ . - date2)
But we still want to test the significance of seasonality for both clustered and non-clustered:
mod_pois2a_nonpul2 <- update(mod_pois1_nonpul, . ~ . - cluster,
data = filter(bmg_ts_nonpul, cluster))
mod_pois2b_nonpul2 <- update(mod_pois2a_nonpul2,
data = filter(bmg_ts_nonpul, !cluster))
test_seasonality(mod_pois2a_nonpul)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 67 28.841
## 2 65 28.183 2 0.65759 0.7198
test_seasonality(mod_pois2b_nonpul)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 121 132.71
## 2 119 128.88 2 3.8322 0.1472
which means that seasonality is not significant, neither for the clustered or the non-clustered. Thus, let’s update the models accordingly:
mod_pois2a_nonpul %<>% update(. ~ . - cosdate - sindate)
mod_pois2b_nonpul %<>% update(. ~ . - cosdate - sindate)
Next, let’s look at segmented regressions version of these two models, with periods predefined by these dates. Let’s do the test for the cluster and non cluster:
test_segments <- function(x) {
mod1 <- segmented2(x, date_1)
mod2 <- segmented2(x, date_2)
mod3 <- segmented2(x, c(date_1, date_2))
print(lrt(x, mod1))
print(lrt(x, mod2))
print(lrt(x, mod3))
lrt(mod2, mod3)
}
test_segments(mod_pois2a_nonpul)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 67 28.841
## 2 66 27.513 1 1.3276 0.2492
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 67 28.841
## 2 66 28.590 1 0.25116 0.6163
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 67 28.841
## 2 65 27.430 2 1.4109 0.4939
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 66 28.59
## 2 65 27.43 1 1.1598 0.2815
test_segments(mod_pois2b_nonpul)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 121 132.71
## 2 120 131.43 1 1.2807 0.2578
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 121 132.71
## 2 120 131.23 1 1.4775 0.2242
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 121 132.71
## 2 119 125.93 2 6.7745 0.0338 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 120 131.23
## 2 119 125.93 1 5.297 0.02136 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Which means that there is a significant change in the trend only for
the non-cluster, but both after 2011 and after 2015. Let’s thus update
model mod_pois2b_nonpul
accordingly:
mod_pois2b_nonpul %<>% segmented2(c(date_1, date_2))
Now we can make the figure. Let’s generate some new data for the covariables used for prediction:
newdata_nonpul <- tibble(date = unique(bmg_ts_nonpul$date)) %>%
mutate(date2 = as.integer(date),
cosdate = wave_trans(date2, cos),
sindate = wave_trans(date2, sin),
U1.date2 = date2 - as.numeric(date_1) + 11,
U1.date2 = ifelse(U1.date2 < 0, 0, U1.date2),
U2.date2 = date2 - as.numeric(date_2) + 15,
U2.date2 = ifelse(U2.date2 < 0, 0, U2.date2))
And now the figure:
# The data of non cluster:
bmg_ts_nonpul %>%
filter(! cluster) %$%
plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")
# The data of cluster:
bmg_ts_nonpul %>%
filter(cluster) %$%
points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)
# The model of cluster:
mod_pois2a_nonpul %>%
predict2(newdata_nonpul) %>%
plot_mod_pred(2)
# The model of non cluster:
mod_pois2b_nonpul %>%
predict2(newdata_nonpul) %>%
plot_mod_pred(4)
# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)
# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white")
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)
bmg_ts_12 <- bmg_12 %>%
mutate(year = year(date),
month = month(date)) %>%
group_by(year, month, cluster) %>%
tally() %>%
mutate(date = ymd(paste(year, month, "1")),
date2 = as.integer(date),
cosdate = wave_trans(date2, cos),
sindate = wave_trans(date2, sin))
Which looks like:
bmg_ts_12
## # A tibble: 248 × 8
## # Groups: year, month [126]
## year month cluster n date date2 cosdate sindate
## <dbl> <dbl> <lgl> <int> <date> <int> <dbl> <dbl>
## 1 2009 1 FALSE 11 2009-01-01 14245 0.985 0.171
## 2 2009 1 TRUE 7 2009-01-01 14245 0.985 0.171
## 3 2009 2 FALSE 17 2009-02-01 14276 0.761 0.649
## 4 2009 2 TRUE 5 2009-02-01 14276 0.761 0.649
## 5 2009 3 FALSE 16 2009-03-01 14304 0.374 0.928
## 6 2009 3 TRUE 10 2009-03-01 14304 0.374 0.928
## 7 2009 4 FALSE 15 2009-04-01 14335 -0.150 0.989
## 8 2009 4 TRUE 9 2009-04-01 14335 -0.150 0.989
## 9 2009 5 FALSE 14 2009-05-01 14365 -0.619 0.786
## 10 2009 5 TRUE 13 2009-05-01 14365 -0.619 0.786
## # ℹ 238 more rows
Let’s consider some Poisson models with various options for the interactions:
# no interactions:
mod_pois1 <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts_12)
# interaction between seasonality and cluster:
mod_pois2 <- update(mod_pois1, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3 <- update(mod_pois1, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4 <- update(mod_pois2, . ~ . + date2 : cluster)
Let’s compare these models:
lrt(mod_pois1, mod_pois2, mod_pois4)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate +
## date2:cluster
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 243 239.79
## 2 241 231.90 2 7.8964 0.01929 *
## 3 240 231.46 1 0.4349 0.50957
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrt(mod_pois1, mod_pois3, mod_pois4)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + date2:cluster
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate +
## date2:cluster
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 243 239.79
## 2 242 239.30 1 0.4987 0.48006
## 3 240 231.46 2 7.8326 0.01991 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
So, the best model would be mod_pois2
(i.e. interaction
between cluster and seasonality, but not between cluster and linear
trend). Next, let’s see whether the seasonality is significant for both
the cluster and non-cluster:
mod_pois2a <- update(mod_pois1, . ~ . - cluster, data = filter(bmg_ts_12, cluster))
mod_pois2b <- update(mod_pois2a, data = filter(bmg_ts_12, !cluster))
test_seasonality(mod_pois2a)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 121 114.698
## 2 119 95.104 2 19.594 5.563e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_seasonality(mod_pois2b)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 139.38
## 2 121 136.36 2 3.0165 0.2213
Which means that seasonality is significant only for the cluster
(mod_pois2b
). Let’s thus update this model accordingly:
mod_pois2b %<>% update(. ~ . - cosdate - sindate)
Next, let’s look at segmented regressions version of these two models, with periods predefined by these dates. Let’s do the test for the cluster and non cluster:
test_segments(mod_pois2a)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 119 95.104
## 2 118 95.088 1 0.016541 0.8977
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 119 95.104
## 2 118 93.822 1 1.2828 0.2574
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 119 95.104
## 2 117 93.364 2 1.7406 0.4188
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate + U1.date2
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 118 93.822
## 2 117 93.364 1 0.45778 0.4987
test_segments(mod_pois2b)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 139.38
## 2 122 139.10 1 0.27515 0.5999
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 139.38
## 2 122 134.83 1 4.5485 0.03295 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 139.38
## 2 121 133.97 2 5.4078 0.06694 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 122 134.83
## 2 121 133.97 1 0.85935 0.3539
Which means that there is a change in the trend only for the
non-cluster, and only after 2015. Let’s update model
mod_pois2b
accordingly:
mod_pois2b %<>% segmented2(date_2)
Now we can make a figure. First we need a function that computes predictions with confidence intervals from a model. Let’s generate some new data for the covariables used for prediction:
newdata <- tibble(date = unique(bmg_ts_12$date)) %>%
mutate(date2 = as.integer(date),
cosdate = wave_trans(date2, cos),
sindate = wave_trans(date2, sin),
U1.date2 = date2 - as.numeric(date_2) + 15,
U1.date2 = ifelse(U1.date2 < 0, 0, U1.date2))
Next we need a function that plots the model predictions with confidence interval. Now that we have these two functions, we can make the figure:
# The data of non cluster:
bmg_ts_12 %>%
filter(! cluster) %$%
plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")
# The data of cluster:
bmg_ts_12 %>%
filter(cluster) %$%
points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)
# The model of cluster:
mod_pois2a %>%
predict2(newdata) %>%
plot_mod_pred(2)
# The model of non cluster:
mod_pois2b %>%
predict2(newdata) %>%
plot_mod_pred(4)
# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)
# The legend:
lgd <- c("non cluster", "cluster")
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)
Let’s look at the dispersion in the residuals
AER::dispersiontest(mod_pois2a)
##
## Overdispersion test
##
## data: mod_pois2a
## z = -2.6588, p-value = 0.9961
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 0.7838471
AER::dispersiontest(mod_pois2b)
##
## Overdispersion test
##
## data: mod_pois2b
## z = 0.29027, p-value = 0.3858
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 1.031179
Which comforts us in the choice of a Poisson model. Let’s now look at temporal autocorrelation in residuals. Below is a function that performs a few checks. Let’s check the models for cluster and non cluster
test_autocorr(mod_pois2a)
##
## Call:
## lm(formula = Residuals[-length(Residuals)] ~ Residuals[-1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.69972 -0.74893 -0.05969 0.61864 2.09660
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.05804 0.08057 -0.720 0.473
## Residuals[-1] 0.01110 0.09132 0.122 0.903
##
## Residual standard error: 0.8875 on 120 degrees of freedom
## Multiple R-squared: 0.000123, Adjusted R-squared: -0.008209
## F-statistic: 0.01477 on 1 and 120 DF, p-value: 0.9035
test_autocorr(mod_pois2b)
##
## Call:
## lm(formula = Residuals[-length(Residuals)] ~ Residuals[-1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.38805 -0.59268 0.02363 0.70206 2.09143
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.05302 0.09387 -0.565 0.573
## Residuals[-1] -0.01531 0.09014 -0.170 0.865
##
## Residual standard error: 1.044 on 122 degrees of freedom
## Multiple R-squared: 0.0002363, Adjusted R-squared: -0.007958
## F-statistic: 0.02884 on 1 and 122 DF, p-value: 0.8654
Everything looks OK!
First we need a function that estimates the rate of decrease in % / year, with confidence interval. Let’s apply it to all the data:
glm(n ~ date2, poisson, bmg_ts_12) %>%
est_decrease() %>%
round(2)
## est lwr upr
## 6.70 5.19 8.19
Let’s apply it the cluster cases
mod_pois2a %>%
update(. ~ . - cosdate - sindate) %>% # we remove the seasonality
est_decrease() %>%
round(2)
## est lwr upr
## 7.56 4.95 10.12
Let’s apply it to the non cluster before 2015:
mod_pois2b %>%
update(. ~ . -U1.date2, data = filter(bmg_ts_12, ! cluster, date < date_2)) %>%
est_decrease() %>%
round(2)
## est lwr upr
## 10.33 6.38 14.13
And to the non cluster after 2015:
mod_pois2b %>%
update(. ~ . -U1.date2, data = filter(bmg_ts_12, ! cluster, date >= date_2)) %>%
est_decrease() %>%
round(2)
## est lwr upr
## 2.95 -4.71 10.06
bmg_ts_12b <- bmg_12b %>%
mutate(year = year(date),
month = month(date)) %>%
group_by(year, month, cluster) %>%
tally() %>%
ungroup() %>%
mutate(date = ymd(paste(year, month, "1")),
date2 = as.integer(date),
cosdate = wave_trans(date2, cos),
sindate = wave_trans(date2, sin))
Which looks like:
bmg_ts_12b
## # A tibble: 248 × 8
## year month cluster n date date2 cosdate sindate
## <dbl> <dbl> <lgl> <int> <date> <int> <dbl> <dbl>
## 1 2009 1 FALSE 11 2009-01-01 14245 0.985 0.171
## 2 2009 1 TRUE 7 2009-01-01 14245 0.985 0.171
## 3 2009 2 FALSE 17 2009-02-01 14276 0.761 0.649
## 4 2009 2 TRUE 5 2009-02-01 14276 0.761 0.649
## 5 2009 3 FALSE 16 2009-03-01 14304 0.374 0.928
## 6 2009 3 TRUE 10 2009-03-01 14304 0.374 0.928
## 7 2009 4 FALSE 14 2009-04-01 14335 -0.150 0.989
## 8 2009 4 TRUE 10 2009-04-01 14335 -0.150 0.989
## 9 2009 5 FALSE 14 2009-05-01 14365 -0.619 0.786
## 10 2009 5 TRUE 13 2009-05-01 14365 -0.619 0.786
## # ℹ 238 more rows
Let’s consider some Poisson models with various options for the interactions:
# no interactions:
mod_pois1 <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts_12b)
# interaction between seasonality and cluster:
mod_pois2 <- update(mod_pois1, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3 <- update(mod_pois1, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4 <- update(mod_pois2, . ~ . + date2 : cluster)
Let’s compare these models:
lrt(mod_pois1, mod_pois2, mod_pois4)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate +
## date2:cluster
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 243 239.74
## 2 241 231.56 2 8.1777 0.01676 *
## 3 240 230.93 1 0.6245 0.42936
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrt(mod_pois1, mod_pois3, mod_pois4)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + date2:cluster
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate +
## date2:cluster
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 243 239.74
## 2 242 239.03 1 0.7034 0.40165
## 3 240 230.93 2 8.0989 0.01743 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
So, the best model would be mod_pois2
(i.e. interaction
between cluster and seasonality, but not between cluster and linear
trend). Next, let’s see whether the seasonality is significant for both
the cluster and non-cluster:
mod_pois2a <- update(mod_pois1, . ~ . - cluster, data = filter(bmg_ts_12b, cluster))
mod_pois2b <- update(mod_pois2a, data = filter(bmg_ts_12b, !cluster))
test_seasonality(mod_pois2a)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 121 114.057
## 2 119 94.355 2 19.702 5.27e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_seasonality(mod_pois2b)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 139.74
## 2 121 136.58 2 3.1604 0.2059
Which means that seasonality is significant only for the cluster
(mod_pois2b
). Let’s thus update this model accordingly:
mod_pois2b %<>% update(. ~ . - cosdate - sindate)
Next, let’s look at segmented regressions version of these two models, with periods predefined by these dates. Let’s do the test for the cluster and non cluster:
test_segments(mod_pois2a)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 119 94.355
## 2 118 94.290 1 0.065389 0.7982
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 119 94.355
## 2 118 92.926 1 1.4295 0.2318
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 119 94.355
## 2 117 92.605 2 1.7503 0.4168
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate + U1.date2
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 118 92.926
## 2 117 92.605 1 0.32075 0.5712
test_segments(mod_pois2b)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 139.74
## 2 122 139.56 1 0.17705 0.6739
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 139.74
## 2 122 135.41 1 4.3255 0.03755 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 139.74
## 2 121 134.38 2 5.357 0.06867 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 122 135.41
## 2 121 134.38 1 1.0315 0.3098
Which means that there is a change in the trend only for the
non-cluster, and only after 2015. Let’s update model
mod_pois2b
accordingly:
mod_pois2b %<>% segmented2(date_2)
Now we can make a figure. First we need a function that computes predictions with confidence intervals from a model. Let’s generate some new data for the covariables used for prediction:
newdata <- tibble(date = unique(bmg_ts_12b$date)) %>%
mutate(date2 = as.integer(date),
cosdate = wave_trans(date2, cos),
sindate = wave_trans(date2, sin),
U1.date2 = date2 - as.numeric(date_2) + 15,
U1.date2 = ifelse(U1.date2 < 0, 0, U1.date2))
Next we need a function that plots the model predictions with confidence interval. Now that we have these two functions, we can make the figure:
# The data of non cluster:
bmg_ts_12b %>%
filter(! cluster) %$%
plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")
# The data of cluster:
bmg_ts_12b %>%
filter(cluster) %$%
points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)
# The model of cluster:
mod_pois2a %>%
predict2(newdata) %>%
plot_mod_pred(2)
# The model of non cluster:
mod_pois2b %>%
predict2(newdata) %>%
plot_mod_pred(4)
# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)
# The legend:
lgd <- c("non cluster", "cluster")
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)
On figure 1, replace incidence by incidence rate (reviewer 5).
bmg_ts2 <- ons %>%
select(Year, Total) %>%
left_join(bmg_ts_12b, ., c("year" = "Year"))
# no interactions:
mod_pois1_2 <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts2, offset = log(Total))
# interaction between seasonality and cluster:
mod_pois2_2 <- update(mod_pois1_2, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3_2 <- update(mod_pois1_2, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4_2 <- update(mod_pois2_2, . ~ . + date2 : cluster)
lrt(mod_pois1_2, mod_pois2_2, mod_pois4_2)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate +
## date2:cluster
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 243 239.19
## 2 241 231.01 2 8.1787 0.01675 *
## 3 240 230.39 1 0.6264 0.42867
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod_pois2a_2 <- update(mod_pois1_2, . ~ . - cluster, data = filter(bmg_ts2, cluster))
mod_pois2b_2 <- update(mod_pois2a_2, data = filter(bmg_ts2, !cluster))
test_seasonality(mod_pois2a_2)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 121 113.535
## 2 119 94.152 2 19.383 6.181e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_seasonality(mod_pois2b_2)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 139.30
## 2 121 136.23 2 3.0696 0.2155
mod_pois2b_2 %<>% update(. ~ . - cosdate - sindate)
test_segments(mod_pois2a_2)
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 119 94.152
## 2 118 94.076 1 0.075729 0.7832
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 119 94.152
## 2 118 92.667 1 1.4852 0.223
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 119 94.152
## 2 117 92.354 2 1.7977 0.407
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + cosdate + sindate + U1.date2
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 118 92.667
## 2 117 92.354 1 0.31247 0.5762
test_segments(mod_pois2b_2)
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 139.30
## 2 122 139.11 1 0.19442 0.6593
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 139.30
## 2 122 134.86 1 4.4468 0.03497 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
##
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 123 139.30
## 2 121 133.83 2 5.4701 0.06489 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
##
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 122 134.86
## 2 121 133.83 1 1.0233 0.3117
mod_pois2b_2 %<>% segmented2(date_2)
The figure:
# The data of non cluster:
bmg_ts %>%
filter(! cluster) %$%
plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")
# The data of cluster:
bmg_ts %>%
filter(cluster) %$%
points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)
# The model of cluster:
mod_pois2a_2 %>%
predict2(newdata2) %>%
plot_mod_pred(2)
# The model of non cluster:
mod_pois2b_2 %>%
predict2(newdata2) %>%
plot_mod_pred(4)
# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)
# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white")
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)
On figure 1, replace the segmented regression by a spline regression (reviewer 3).
library(mgcv)
# The data of non cluster:
bmg_ts_12b %>%
filter(! cluster) %$%
plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")
# The data of cluster:
bmg_ts_12b %>%
filter(cluster) %$%
points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)
# The model of non cluster:
bmg_ts_12b %>%
filter(! cluster) %>%
gam(n ~ s(date2), poisson, .) %>%
# gam(n ~ s(date2) + s(cosdate) + s(sindate), poisson, .) %>%
predict2(newdata) %>%
plot_mod_pred(4)
# The model of cluster:
bmg_ts_12b %>%
filter(cluster) %>%
gam(n ~ s(date2) + s(cosdate) + s(sindate), poisson, .) %>%
predict2(newdata) %>%
plot_mod_pred(2)
# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)
# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white")
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)
Let’s look at the dispersion in the residuals
AER::dispersiontest(mod_pois2a)
##
## Overdispersion test
##
## data: mod_pois2a
## z = -2.733, p-value = 0.9969
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 0.7781604
AER::dispersiontest(mod_pois2b)
##
## Overdispersion test
##
## data: mod_pois2b
## z = 0.36427, p-value = 0.3578
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 1.039615
Which comforts us in the choice of a Poisson model. Let’s now look at temporal autocorrelation in residuals. Below is a function that performs a few checks. Let’s check the models for cluster and non cluster
test_autocorr(mod_pois2a)
##
## Call:
## lm(formula = Residuals[-length(Residuals)] ~ Residuals[-1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.70216 -0.73952 -0.08912 0.61388 2.07158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.058091 0.080260 -0.724 0.471
## Residuals[-1] 0.001723 0.091312 0.019 0.985
##
## Residual standard error: 0.8842 on 120 degrees of freedom
## Multiple R-squared: 2.967e-06, Adjusted R-squared: -0.00833
## F-statistic: 0.0003561 on 1 and 120 DF, p-value: 0.985
test_autocorr(mod_pois2b)
##
## Call:
## lm(formula = Residuals[-length(Residuals)] ~ Residuals[-1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.36650 -0.59176 0.01941 0.70196 2.11442
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.052870 0.094085 -0.562 0.575
## Residuals[-1] -0.007139 0.090144 -0.079 0.937
##
## Residual standard error: 1.046 on 122 degrees of freedom
## Multiple R-squared: 5.141e-05, Adjusted R-squared: -0.008145
## F-statistic: 0.006272 on 1 and 122 DF, p-value: 0.937
Everything looks OK!
First we need a function that estimates the rate of decrease in % / year, with confidence interval. Let’s apply it to all the data:
glm(n ~ date2, poisson, bmg_ts_12b) %>%
est_decrease() %>%
round(2)
## est lwr upr
## 6.70 5.19 8.19
Let’s apply it the cluster cases
mod_pois2a %>%
update(. ~ . - cosdate - sindate) %>% # we remove the seasonality
est_decrease() %>%
round(2)
## est lwr upr
## 7.70 5.10 10.26
Let’s apply it to the non cluster before 2015:
mod_pois2b %>%
update(. ~ . -U1.date2, data = filter(bmg_ts_12b, ! cluster, date < date_2)) %>%
est_decrease() %>%
round(2)
## est lwr upr
## 10.16 6.21 13.97
And to the non cluster after 2015:
mod_pois2b %>%
update(. ~ . -U1.date2, data = filter(bmg_ts_12b, ! cluster, date >= date_2)) %>%
est_decrease() %>%
round(2)
## est lwr upr
## 2.95 -4.71 10.06
The number of cluster, samples and proportion of samples per cluster size:
clust_dist <- bmg %>%
group_by(cluster_number) %>%
summarise(cluster_size = n()) %>%
group_by(cluster_size) %>%
summarise(nb_cluster = n()) %>%
mutate(nb_samples = nb_cluster * cluster_size,
pc_samples = round(100 * nb_samples / nrow(bmg), 2))
clust_dist
## # A tibble: 12 × 4
## cluster_size nb_cluster nb_samples pc_samples
## <int> <int> <int> <dbl>
## 1 1 1142 1142 69.1
## 2 2 79 158 9.56
## 3 3 36 108 6.53
## 4 4 15 60 3.63
## 5 6 4 24 1.45
## 6 7 1 7 0.42
## 7 8 1 8 0.48
## 8 10 1 10 0.6
## 9 12 2 24 1.45
## 10 13 1 13 0.79
## 11 21 2 42 2.54
## 12 57 1 57 3.45
The number and proportion of samples in a cluster of at least 9 samples:
clust_dist %>%
filter(cluster_size > 9) %>%
select(nb_samples, pc_samples) %>%
colSums()
## nb_samples pc_samples
## 146.00 8.83
The following function computes the number of samples in a cluster (a cluster being defined as at least 2 samples):
nb_clustered <- function(x) {
x %>%
group_by(cluster_number) %>%
tally() %>%
group_by(n) %>%
tally() %>%
filter(n > 1) %$%
sum(n * nn)
}
Computing the total number N
of samples and the number
of n
of samples in a cluster per lineage (rows), as well as
the corresponding proportions of these samples in a cluster:
tmp <- map(c(nb_clustered, nrow), ~ map_int(group_split(bmg, lineage), .x)) %>%
as.data.frame(col.names = c("n", "N")) %>%
mutate(p = round(100 * n / N))
tmp
## n N p
## 1 30 214 14
## 2 20 86 23
## 3 200 703 28
## 4 260 649 40
## 5 0 1 0
The proportion of samples in a cluster depends on the lineage:
tmp %>%
filter(n > 0) %>%
select(-p) %>%
mutate(N = N - n) %>%
as.matrix() %>%
fisher.test()
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 2.931e-13
## alternative hypothesis: two.sided
The risk of being in a cluster (corresponds to the 2nd part of the
3rd paragraph of the Transmission
section in the ms):
Anova(glm(cluster ~ prison + alcohol + homeless + drugs, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
##
## Response: cluster
## LR Chisq Df Pr(>Chisq)
## prison 2.0202 1 0.1552219
## alcohol 11.5018 1 0.0006953 ***
## homeless 0.2765 1 0.5990232
## drugs 27.8607 1 1.304e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef(glm(cluster ~ prison + alcohol + homeless + drugs, binomial, bmg)))
## (Intercept) prisonTRUE alcoholTRUE homelessTRUE drugsTRUE
## 0.3867648 1.5281724 3.6942585 1.2237157 4.1143378
Anova(glm(cluster ~ sex + born + prison + alcohol + homeless + drugs, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
##
## Response: cluster
## LR Chisq Df Pr(>Chisq)
## sex 2.231 1 0.135260
## born 144.702 2 < 2.2e-16 ***
## prison 0.153 1 0.695425
## alcohol 6.298 1 0.012087 *
## homeless 0.180 1 0.671381
## drugs 7.984 1 0.004719 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(cluster ~ sex + born + prison + alcohol + homeless + drugs, binomial, filter(bmg, born != "os2")))
## Analysis of Deviance Table (Type II tests)
##
## Response: cluster
## LR Chisq Df Pr(>Chisq)
## sex 0.833 1 0.361410
## born 127.639 1 < 2.2e-16 ***
## prison 0.487 1 0.485177
## alcohol 6.605 1 0.010170 *
## homeless 0.191 1 0.662415
## drugs 8.048 1 0.004555 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(cluster ~ born + prison + alcohol + homeless + drugs, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
##
## Response: cluster
## LR Chisq Df Pr(>Chisq)
## born 143.330 2 < 2.2e-16 ***
## prison 0.335 1 0.562606
## alcohol 6.690 1 0.009695 **
## homeless 0.240 1 0.623998
## drugs 8.191 1 0.004210 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(cluster ~ born + prison + alcohol + homeless + drugs, binomial, filter(bmg, born != "os2")))
## Analysis of Deviance Table (Type II tests)
##
## Response: cluster
## LR Chisq Df Pr(>Chisq)
## born 127.097 1 < 2.2e-16 ***
## prison 0.669 1 0.413514
## alcohol 6.857 1 0.008828 **
## homeless 0.225 1 0.635173
## drugs 8.199 1 0.004192 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The clustered ordered from biggest to smallest (n
being
the number of samples in the cluster):
bmg %>%
group_by(cluster_number) %>%
tally() %>%
arrange(desc(n))
## # A tibble: 1,285 × 2
## cluster_number n
## <int> <int>
## 1 13 57
## 2 9 21
## 3 15 21
## 4 67 13
## 5 27 12
## 6 65 12
## 7 38 10
## 8 42 8
## 9 41 7
## 10 10 6
## # ℹ 1,275 more rows
The function below is used to generate Figure 2:
plot_clusters <- function(x, l, cex = .5, col = 4) {
x %>%
filter(!cluster) %$%
plot(date, chronorder, cex = cex, axes = FALSE, ann = FALSE)
x %>%
filter(cluster) %>%
group_split(chronorder) %>%
walk(~ lines(.x$date, .x$chronorder, col = "red", type = "o", cex = cex))
yrs <- 2009:2019
axis(1, ymd(paste(yrs, "01 01")), yrs)
text(ymd(20110101), 1000, paste("lineage", l))
}
Let’s make Figure 2:
opar <- par(mfrow = c(2, 2), cex = 1)
walk(1:4, ~ plot_clusters(bmg[bmg$lineage == .x, ], .x))
par(opar)
This is an alternative way of looking at it. Instead of computing presence / absence data per individual, we compute average rate per cluster (i.e. number of case / duration). And then we can relate this computed rate to risk factors. The advantage here is that we don’t need to exclude the sample that were reported during the last 2 years. First we need to prepare the data for the analysis:
bmg_zip <- bmg %>%
mutate(duration = as.integer(max(bmg$date) - start)) %>%
select(cluster_number, duration, lineage) %>%
unique() %>%
left_join(tally(group_by(bmg, cluster_number)), "cluster_number") %>%
mutate(n = n - 1) %>%
filter(duration > 0)
bmg_zip
## # A tibble: 1,284 × 4
## cluster_number duration lineage n
## <int> <int> <int> <dbl>
## 1 1 2931 3 1
## 2 2 2901 4 1
## 3 3 1907 4 1
## 4 4 3489 3 2
## 5 5 3367 3 1
## 6 6 3768 1 1
## 7 7 3394 4 2
## 8 8 3718 3 1
## 9 9 3682 4 20
## 10 10 3033 3 5
## # ℹ 1,274 more rows
This function will be used to generate the estimates:
estimates <- function(x) {
cbind(est = coef(x), confint(x)) %>%
as.data.frame() %>%
tibble::rownames_to_column()
}
Here is the analysis that consists in fitting a zero-inflated Poisson regression (forced to go through the origin) to the data per lineage:
rate_estimates <- map(1:4, function(x)
zeroinfl(n ~ -1 + duration | -1 + duration,
filter(bmg_zip, lineage == x))) %>%
setNames(paste("lineage", 1:4)) %>%
map_dfr(estimates) %>%
group_split(rowname, .keep = FALSE) %>%
map2(c(function(x) exp(365 * x), function(x) exp(365 * x)/(1 + exp(365 * x))), function(y, f) f(y)) %>%
setNames(c("count_duration", "zero_duration"))
This function plot the rate estimates per lineage:
plot_estimates <- function(x, ylim, ylab, y, length = .03) {
with(x, {
plot(1:4, est, xlim = c(.5, 4.5), ylim = ylim,
xlab = NA, ylab = ylab, axes = FALSE, col = 4)
arrows(1:4, `2.5 %`, 1:4, `97.5 %`, length, 90, 3, col = 4)
text(1:4, y, paste("lineage", 1:4))
axis(2)
})
}
The estimates of the rate of starting a chain of transmission (left) and rate of transmission in a chain (right), per lineage:
opar <- par(mfrow = 1:2)
plot_estimates(rate_estimates$zero_duration, c(.5, .6),
"probability of starting a chain (/year)", .5)
plot_estimates(rate_estimates$count_duration, c(.8, 1.2), "growth rate (/year)", .8)
par(opar)
To explain these differences we should look at how the lineages are different in terms of risk factor.
This is a third option of looking at the same thing. Now, instead of
looking at the presence / absence of another case within 2 years as for
the first option, we compute the time to the next case. Here a tuning of
the polygon()
that can make “staircase” style:
polygon2 <- function(x, l, u, ...) {
n <- length(x)
x <- c(x[1], rep(x[-1], each = 2))
l <- c(rep(l[-n], each = 2), l[n])
u <- c(rep(u[-n], each = 2), u[n])
polygon(c(x, rev(x)), 1 - c(l, rev(u)), border = NA, ...)
}
Here is a function to plot 1 - survival:
plot_surv <- function(x, col, ...) {
with(x, {
plot(time, 1 - surv, type = "s", xlim = c(0, 2 * 365.25), ylim = 0:1,
xlab = "time (days)", col = col, lwd = 2,
ylab = "probability", ...)
polygon2(time, lower, upper, col = adjustcolor(col, .2))
})
}
Here is the equivalent but adding a line:
lines_surv <- function(x, col) {
with(x, {
lines(time, 1 - surv, col = col, lwd = 2, type = "s")
polygon2(time, lower, upper, col = adjustcolor(col, .2))
})
}
Kaplan-Meier curves:
opar <- par(mfrow = c(2, 2), cex = 1)
# 1. Kaplan-Meier fits by social risk factor:
groups <- c("prison", "alcohol", "homeless", "drugs", "no_other_factor")
km_rf <- groups %>%
map(~ survfit(Surv(time, status) ~ 1, data = bmg[bmg[[.x]], ])) %>%
setNames(groups)
cols <- c("#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3", "#a6d854")
plot_surv(km_rf[[1]], cols[1])
walk(2:5, ~ lines_surv(km_rf[[.x]], cols[.x]))
abline(v = 365.25, lty = 2)
legend("topleft", legend = c(str_replace_all(groups, "_", " ")[-5], "none"),
lwd = 2, lty = 1, col = cols, bty = "n")
# 2. Kaplan-Meier fits by place of birth:
groups <- na.exclude(unique(bmg$born))
km_born <- groups %>%
map(~ survfit(Surv(time, status) ~ 1, data = bmg[bmg[["born"]] == .x, ])) %>%
setNames(groups)
cols <- c("#66c2a5", "#fc8d62", "#8da0cb")
plot_surv(km_born[[1]], cols[1], axes = FALSE)
walk(2:3, ~ lines_surv(km_born[[.x]], cols[.x]))
abline(v = 365.25, lty = 2)
legend("topleft", legend = c("entered UK > 2 years before",
"entered UK < 2 years before",
"UK born"), lwd = 2, lty = 1, col = cols,
box.col = "white", bg = "white")
axis(1); axis(2)
# 3. Kaplan-Meier fits by number of social risk factors:
groups <- 0:3
km_born <- groups %>%
map(~ survfit(Surv(time, status) ~ 1, data = bmg[bmg[["srf"]] == .x, ])) %>%
setNames(groups)
cols <- c("#fed98e", "#fe9929", "#d95f0e", "#993404")
plot_surv(km_born[[1]], cols[1])
walk(2:4, ~ lines_surv(km_born[[.x]], cols[.x]))
abline(v = 365.25, lty = 2)
legend("topleft", legend = paste(groups, "SRF"),
lwd = 2, lty = 1, col = cols, bty = "n")
# 4. Kaplan-Meier fits by lineage:
groups <- 1:4
km_lineage <- groups %>%
map(~ survfit(Surv(time, status) ~ 1, data = bmg[bmg[["lineage"]] == .x, ])) %>%
setNames(groups)
cols <- c("#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3")
plot_surv(km_lineage[[1]], cols[1])
walk(2:4, ~ lines_surv(km_lineage[[.x]], cols[.x]))
abline(v = 365.25, lty = 2)
legend("topleft", legend = paste("lineage", groups),
lwd = 2, lty = 1, col = cols, bty = "n")
par(opar)
tim_data <- "for_kaplanmeier.csv" %>%
read.csv() %>%
as_tibble() %>%
select(cluster_number, firstsample, imstat, srf, SR, hmp, nfa, drugs, etoh, pul) %>%
mutate_at("firstsample", dmy)
cluster_size <- tim_data %>%
group_by(cluster_number) %>%
tally()
cluster_start <- tim_data %>%
group_by(cluster_number) %>%
summarise(clusterstart = min(firstsample))
At least one social risk factor at 1 and 2 years:
map_dfr(1:2,
~ 1 - unlist(summary(survfit(Surv(time, status) ~ 1,
data = filter(bmg, ! no_other_factor)),
times = .x * 365.25)[c("lower", "surv", "upper")]))
## # A tibble: 2 × 3
## lower surv upper
## <dbl> <dbl> <dbl>
## 1 0.469 0.395 0.311
## 2 0.566 0.490 0.402
UK-born at 1 year:
1 - unlist(summary(survfit(Surv(time, status) ~ 1, data = filter(bmg, born == "uk")),
times = 365.25)[c("lower", "surv", "upper")])
## lower surv upper
## 0.3818547 0.3387870 0.2927186
Alcohol and drug at 1 year:
map_dfr(c("alcohol", "drugs"),
~ 1 - unlist(summary(survfit(Surv(time, status) ~ 1, data = bmg[bmg[[.x]], ]),
times = 365.25)[c("lower", "surv", "upper")]))
## # A tibble: 2 × 3
## lower surv upper
## <dbl> <dbl> <dbl>
## 1 0.659 0.524 0.336
## 2 0.546 0.452 0.339
Number of social risk factors:
map_dfr(0:3, ~ 1 - unlist(summary(survfit(Surv(time, status) ~ 1,
data = filter(bmg, srf == .x)),
times = 365.25)[c("lower", "surv", "upper")]))
## # A tibble: 4 × 3
## lower surv upper
## <dbl> <dbl> <dbl>
## 1 0.161 0.143 0.125
## 2 0.412 0.319 0.212
## 3 0.554 0.423 0.253
## 4 0.871 0.721 0.396
Corresponding log-rank p-values:
survdiff(Surv(time, status) ~ as.factor(srf), data = filter(bmg, srf < 4))
## Call:
## survdiff(formula = Surv(time, status) ~ as.factor(srf), data = filter(bmg,
## srf < 4))
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## as.factor(srf)=0 1500 291 339.13 6.83 93.4
## as.factor(srf)=1 86 38 16.76 26.92 28.3
## as.factor(srf)=2 43 22 7.65 26.93 27.6
## as.factor(srf)=3 20 15 2.46 64.03 64.6
##
## Chisq= 125 on 3 degrees of freedom, p= <2e-16
Lineage:
map_dfr(1:4, ~ 1 - unlist(summary(survfit(Surv(time, status) ~ 1,
data = filter(bmg, lineage == .x)),
times = 365.25)[c("lower", "surv", "upper")]))
## # A tibble: 4 × 3
## lower surv upper
## <dbl> <dbl> <dbl>
## 1 0.0829 0.0529 0.0220
## 2 0.141 0.0835 0.0223
## 3 0.160 0.135 0.109
## 4 0.280 0.247 0.212
Corresponding log-rank p-values:
survdiff(Surv(time, status) ~ as.factor(lineage),
data = filter(bmg, ! is.na(lineage)))
## Call:
## survdiff(formula = Surv(time, status) ~ as.factor(lineage), data = filter(bmg,
## !is.na(lineage)))
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## as.factor(lineage)=1 214 18 52.8 22.89 26.77
## as.factor(lineage)=2 86 12 20.2 3.33 3.53
## as.factor(lineage)=3 703 138 161.9 3.54 6.34
## as.factor(lineage)=4 649 199 132.1 33.88 53.07
##
## Chisq= 63.8 on 3 degrees of freedom, p= 9e-14
Log-rank p-values testing gender differences:
survdiff(Surv(time, status) ~ sex, data = bmg)
## Call:
## survdiff(formula = Surv(time, status) ~ sex, data = bmg)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=female 695 141 157 1.71 2.99
## sex=male 958 227 211 1.28 2.99
##
## Chisq= 3 on 1 degrees of freedom, p= 0.08
m <- coxph(Surv(time, status) ~ sex + born + pul + prison + alcohol + homeless + drugs + lineage + pul, bmg_w2y_tim)
summary(m)
## Call:
## coxph(formula = Surv(time, status) ~ sex + born + pul + prison +
## alcohol + homeless + drugs + lineage + pul, data = bmg_w2y_tim)
##
## n= 1265, number of events= 305
## (8 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## sexmale 0.12300 1.13088 0.12155 1.012 0.3116
## bornos2 -0.47166 0.62396 0.23599 -1.999 0.0456 *
## bornuk 0.98192 2.66957 0.12871 7.629 2.36e-14 ***
## pulTRUE 0.95373 2.59537 0.15684 6.081 1.19e-09 ***
## prisonTRUE 0.03711 1.03780 0.23724 0.156 0.8757
## alcoholTRUE 0.56012 1.75088 0.24897 2.250 0.0245 *
## homelessTRUE 0.09197 1.09633 0.29072 0.316 0.7517
## drugsTRUE 0.50799 1.66195 0.20631 2.462 0.0138 *
## lineage 0.32986 1.39077 0.07287 4.527 5.98e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## sexmale 1.131 0.8843 0.8911 1.4351
## bornos2 0.624 1.6027 0.3929 0.9909
## bornuk 2.670 0.3746 2.0744 3.4355
## pulTRUE 2.595 0.3853 1.9085 3.5294
## prisonTRUE 1.038 0.9636 0.6519 1.6522
## alcoholTRUE 1.751 0.5711 1.0748 2.8522
## homelessTRUE 1.096 0.9121 0.6201 1.9382
## drugsTRUE 1.662 0.6017 1.1092 2.4902
## lineage 1.391 0.7190 1.2057 1.6043
##
## Concordance= 0.741 (se = 0.013 )
## Likelihood ratio test= 248.3 on 9 df, p=<2e-16
## Wald test = 247.1 on 9 df, p=<2e-16
## Score (logrank) test = 313.2 on 9 df, p=<2e-16
summary(coxph(Surv(time, status) ~ sex + born + pul + prison + alcohol + homeless + drugs + lineage + pul, bmg_w2y_tim))
## Call:
## coxph(formula = Surv(time, status) ~ sex + born + pul + prison +
## alcohol + homeless + drugs + lineage + pul, data = bmg_w2y_tim)
##
## n= 1265, number of events= 305
## (8 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## sexmale 0.12300 1.13088 0.12155 1.012 0.3116
## bornos2 -0.47166 0.62396 0.23599 -1.999 0.0456 *
## bornuk 0.98192 2.66957 0.12871 7.629 2.36e-14 ***
## pulTRUE 0.95373 2.59537 0.15684 6.081 1.19e-09 ***
## prisonTRUE 0.03711 1.03780 0.23724 0.156 0.8757
## alcoholTRUE 0.56012 1.75088 0.24897 2.250 0.0245 *
## homelessTRUE 0.09197 1.09633 0.29072 0.316 0.7517
## drugsTRUE 0.50799 1.66195 0.20631 2.462 0.0138 *
## lineage 0.32986 1.39077 0.07287 4.527 5.98e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## sexmale 1.131 0.8843 0.8911 1.4351
## bornos2 0.624 1.6027 0.3929 0.9909
## bornuk 2.670 0.3746 2.0744 3.4355
## pulTRUE 2.595 0.3853 1.9085 3.5294
## prisonTRUE 1.038 0.9636 0.6519 1.6522
## alcoholTRUE 1.751 0.5711 1.0748 2.8522
## homelessTRUE 1.096 0.9121 0.6201 1.9382
## drugsTRUE 1.662 0.6017 1.1092 2.4902
## lineage 1.391 0.7190 1.2057 1.6043
##
## Concordance= 0.741 (se = 0.013 )
## Likelihood ratio test= 248.3 on 9 df, p=<2e-16
## Wald test = 247.1 on 9 df, p=<2e-16
## Score (logrank) test = 313.2 on 9 df, p=<2e-16
summary(coxph(Surv(time, status) ~ sex + born + pul + prison + alcohol + homeless + drugs + lineage + pul + born:lineage, bmg_w2y_tim))
## Call:
## coxph(formula = Surv(time, status) ~ sex + born + pul + prison +
## alcohol + homeless + drugs + lineage + pul + born:lineage,
## data = bmg_w2y_tim)
##
## n= 1265, number of events= 305
## (8 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## sexmale 0.12051 1.12807 0.12164 0.991 0.32185
## bornos2 -1.47934 0.22779 1.13597 -1.302 0.19282
## bornuk 1.03949 2.82778 0.51818 2.006 0.04485 *
## pulTRUE 0.94794 2.58038 0.15697 6.039 1.55e-09 ***
## prisonTRUE 0.03445 1.03505 0.23756 0.145 0.88469
## alcoholTRUE 0.56120 1.75278 0.24904 2.253 0.02423 *
## homelessTRUE 0.08904 1.09313 0.29169 0.305 0.76017
## drugsTRUE 0.51174 1.66819 0.20742 2.467 0.01362 *
## lineage 0.31709 1.37312 0.10244 3.095 0.00197 **
## bornos2:lineage 0.29761 1.34663 0.32217 0.924 0.35561
## bornuk:lineage -0.01523 0.98488 0.14979 -0.102 0.91899
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## sexmale 1.1281 0.8865 0.88878 1.432
## bornos2 0.2278 4.3901 0.02458 2.111
## bornuk 2.8278 0.3536 1.02418 7.808
## pulTRUE 2.5804 0.3875 1.89702 3.510
## prisonTRUE 1.0351 0.9661 0.64976 1.649
## alcoholTRUE 1.7528 0.5705 1.07583 2.856
## homelessTRUE 1.0931 0.9148 0.61714 1.936
## drugsTRUE 1.6682 0.5995 1.11093 2.505
## lineage 1.3731 0.7283 1.12335 1.678
## bornos2:lineage 1.3466 0.7426 0.71617 2.532
## bornuk:lineage 0.9849 1.0154 0.73432 1.321
##
## Concordance= 0.74 (se = 0.014 )
## Likelihood ratio test= 249.3 on 11 df, p=<2e-16
## Wald test = 243.3 on 11 df, p=<2e-16
## Score (logrank) test = 321.8 on 11 df, p=<2e-16
summary(coxph(Surv(time, status) ~ sex + born + pul + srf + as.factor(lineage), bmg_w2y_tim))
## Call:
## coxph(formula = Surv(time, status) ~ sex + born + pul + srf +
## as.factor(lineage), data = bmg_w2y_tim)
##
## n= 1265, number of events= 305
## (8 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## sexmale 0.10435 1.10999 0.12173 0.857 0.391333
## bornos2 -0.48404 0.61629 0.23627 -2.049 0.040496 *
## bornuk 1.01023 2.74622 0.12751 7.923 2.32e-15 ***
## pulTRUE 0.95834 2.60737 0.15679 6.112 9.82e-10 ***
## srf 0.30627 1.35834 0.07951 3.852 0.000117 ***
## as.factor(lineage)2 0.53082 1.70033 0.40935 1.297 0.194718
## as.factor(lineage)3 0.89681 2.45176 0.27382 3.275 0.001056 **
## as.factor(lineage)4 1.13194 3.10168 0.27294 4.147 3.36e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## sexmale 1.1100 0.9009 0.8744 1.4091
## bornos2 0.6163 1.6226 0.3879 0.9793
## bornuk 2.7462 0.3641 2.1389 3.5259
## pulTRUE 2.6074 0.3835 1.9175 3.5454
## srf 1.3583 0.7362 1.1623 1.5874
## as.factor(lineage)2 1.7003 0.5881 0.7622 3.7929
## as.factor(lineage)3 2.4518 0.4079 1.4335 4.1933
## as.factor(lineage)4 3.1017 0.3224 1.8167 5.2957
##
## Concordance= 0.739 (se = 0.014 )
## Likelihood ratio test= 246.6 on 8 df, p=<2e-16
## Wald test = 239.2 on 8 df, p=<2e-16
## Score (logrank) test = 298.6 on 8 df, p=<2e-16
summary(coxph(Surv(time, status) ~ sex + born + pul + srf + as.factor(lineage) + born:as.factor(lineage), bmg_w2y_tim))
## Call:
## coxph(formula = Surv(time, status) ~ sex + born + pul + srf +
## as.factor(lineage) + born:as.factor(lineage), data = bmg_w2y_tim)
##
## n= 1265, number of events= 305
## (8 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## sexmale 9.589e-02 1.101e+00 1.222e-01 0.785 0.432679
## bornos2 -1.488e+01 3.447e-07 1.224e+03 -0.012 0.990301
## bornuk 5.154e-01 1.674e+00 5.848e-01 0.881 0.378132
## pulTRUE 9.483e-01 2.581e+00 1.573e-01 6.029 1.64e-09
## srf 3.078e-01 1.360e+00 7.955e-02 3.869 0.000109
## as.factor(lineage)2 -1.095e-01 8.962e-01 6.517e-01 -0.168 0.866522
## as.factor(lineage)3 6.642e-01 1.943e+00 3.275e-01 2.028 0.042519
## as.factor(lineage)4 9.071e-01 2.477e+00 3.324e-01 2.729 0.006348
## bornos2:as.factor(lineage)2 1.508e+01 3.525e+06 1.224e+03 0.012 0.990174
## bornuk:as.factor(lineage)2 1.162e+00 3.195e+00 9.368e-01 1.240 0.214958
## bornos2:as.factor(lineage)3 1.420e+01 1.466e+06 1.224e+03 0.012 0.990746
## bornuk:as.factor(lineage)3 5.457e-01 1.726e+00 6.130e-01 0.890 0.373388
## bornos2:as.factor(lineage)4 1.459e+01 2.176e+06 1.224e+03 0.012 0.990488
## bornuk:as.factor(lineage)4 4.753e-01 1.609e+00 6.094e-01 0.780 0.435369
##
## sexmale
## bornos2
## bornuk
## pulTRUE ***
## srf ***
## as.factor(lineage)2
## as.factor(lineage)3 *
## as.factor(lineage)4 **
## bornos2:as.factor(lineage)2
## bornuk:as.factor(lineage)2
## bornos2:as.factor(lineage)3
## bornuk:as.factor(lineage)3
## bornos2:as.factor(lineage)4
## bornuk:as.factor(lineage)4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## sexmale 1.101e+00 9.086e-01 0.8662 1.399
## bornos2 3.447e-07 2.901e+06 0.0000 Inf
## bornuk 1.674e+00 5.973e-01 0.5322 5.267
## pulTRUE 2.581e+00 3.874e-01 1.8966 3.513
## srf 1.360e+00 7.351e-01 1.1640 1.590
## as.factor(lineage)2 8.962e-01 1.116e+00 0.2498 3.215
## as.factor(lineage)3 1.943e+00 5.147e-01 1.0227 3.691
## as.factor(lineage)4 2.477e+00 4.037e-01 1.2913 4.752
## bornos2:as.factor(lineage)2 3.525e+06 2.837e-07 0.0000 Inf
## bornuk:as.factor(lineage)2 3.195e+00 3.129e-01 0.5094 20.044
## bornos2:as.factor(lineage)3 1.466e+06 6.820e-07 0.0000 Inf
## bornuk:as.factor(lineage)3 1.726e+00 5.795e-01 0.5190 5.738
## bornos2:as.factor(lineage)4 2.176e+06 4.595e-07 0.0000 Inf
## bornuk:as.factor(lineage)4 1.609e+00 6.217e-01 0.4872 5.310
##
## Concordance= 0.742 (se = 0.013 )
## Likelihood ratio test= 251.3 on 14 df, p=<2e-16
## Wald test = 234.3 on 14 df, p=<2e-16
## Score (logrank) test = 311.2 on 14 df, p=<2e-16
Social risk factors, sex and place of birth
Number of patients with a social risk factor:
Number of patients in each of the social risk factors:
Number of patients per number of social risk factors:
Number of patients with a social risk factor and born in the UK:
Number of patients with a social risk factor and males:
Venn diagram of the social risk factors:
Combining social risk factors and sex and place of birth:
Only sex and place of birth are independent:
Same type of analysis but this time accounting for confoundings. First the social risk factors only:
Same thing, with
sex
anduk
in addition: