Packages and functions

library(survival)
library(car)
library(grid)
library(pscl)
library(segmented)
library(readxl)
library(lubridate)
library(tidyr)
library(purrr)
library(stringr)
library(magrittr) # safer to load in penultimate position
library(dplyr)    # safer to load in last position

A function that does the likelihood ratio test (LRT):

lrt <- function(...) anova(..., test = "LRT")

A tuning of the diff() function:

diff2 <- function(x, ...) c(diff(x, ...), NA)

Preparing the data

Data with 5 SNP

Social risk factors:

srf <- c("hmp",   # prison
         "etoh",  # alcoholic
         "drugs", # drug user
         "nfa")   # homeless

Loading the data sets from the files dataset1.xlsx and dataset3.xlsx:

bmg1 <- "dataset1.xlsx" %>%
  read_excel() %>% 
  mutate_at(srf, as.logical) # no NA is these variables

bmg3 <- "dataset3.xlsx" %>%
  read_excel() %>% 
  mutate_at(srf, ~ .x %>%
              na_if("Unknown") %>% # assumes that "Unknown" = NA
              equals("Yes")) 

Let’s compare bmg1 and bmg3 with some basic checks. Checking for the presence of duplicated IDs:

any(table(bmg1$id) > 1)
## [1] FALSE
any(table(bmg3$id) > 1)
## [1] FALSE

Checking that the exact same sets of IDs are used for the 2 data sets:

identical(sort(bmg1$id), sort(bmg3$id))
## [1] TRUE

Are there any variables that have have different values in the two data sets:

intsct <- setdiff(intersect(names(bmg1), names(bmg3)), "id")
intsct_1 <- paste0(intsct, ".1")
intsct_3 <- paste0(intsct, ".3")

bmg13 <- bmg3 %>% 
  mutate_at(srf, replace_na, FALSE) %>% 
  left_join(bmg1, "id", suffix = c(".3", ".1"))

(difrt <- intsct[which(! map2_lgl(bmg13[, intsct_1], bmg13[, intsct_3], identical))])
## [1] "lineage"

Thus, the variable lineage seems to have different values in the two data sets:

if (length(difrt)) {
  show_differences <- function(x, y) {
    X <- bmg13[[x]]
    Y <- bmg13[[y]]
    bmg13[is.na(X - Y) | (X != Y), c("id", x, y)]
  }
  map2(paste0(difrt, ".1"), paste0(difrt, ".3"), show_differences)
}
## [[1]]
## # A tibble: 1 × 3
##      id lineage.1 lineage.3
##   <dbl>     <dbl>     <dbl>
## 1  5636        10        NA

From these observations, let’s merge bmg1 and bmg3, taking bmg3 as a reference (i.e. with a missing lineage):

bmg <- bmg3 %>% 
  select(c("id", setdiff(names(bmg3), names(bmg1)))) %>% 
  left_join(bmg1, "id") %>% 
  rename(prison    = hmp,
         alcohol   = etoh,
         homeless  = nfa,
         post_code = pc) %>% 
  mutate(date      = as_date(firstsample),
         sex       = c("female", "male")[mf],
         lineage   = na_if(lineage, 10),
         born      = ifelse(ukb == "Not known", NA, c("uk", "os1", "os2")[imstat + 1])) %>% 
  mutate_at(vars(index2y, cluster, pul), as.logical) %>% 
  mutate_at(vars(id, cluster_number, lineage, post_code), as.integer) %>% 
  select(id, post_code, date, sex, born, pul, prison, alcohol, homeless, drugs,
         cluster, cluster_number, lineage)

Adding the variables start and chronorder used for generating figure 2:

bmg %<>% 
  group_by(cluster_number) %>% 
  summarise(start = min(date)) %>%
  mutate(chronorder = rank(start, ties = "first")) %>% 
  left_join(bmg, ., "cluster_number")

Adding time and status for the survival analyses:

end_of_data <- max(bmg$date)
bmg %<>% 
  arrange(date) %>% 
  group_by(cluster_number) %>% 
  group_map(~ mutate(.x, time = diff2(as.numeric(date))), .keep = TRUE) %>% 
  bind_rows() %>% 
  mutate(status          = !is.na(time),
         time            = ifelse(status, time, end_of_data - date),
         no_other_factor = !(prison | alcohol | homeless | drugs))

Adding the number of social risk factors:

bmg %<>% mutate(srf = prison + alcohol + homeless + drugs)

Final version of the data set that merges bmg1 and bmg3 is thus:

bmg
## # A tibble: 1,653 × 19
##       id post_code date       sex    born  pul   prison alcohol homeless drugs
##    <int>     <int> <date>     <chr>  <chr> <lgl> <lgl>  <lgl>   <lgl>    <lgl>
##  1   234         8 2011-06-04 male   os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  2   560         8 2013-04-10 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  3   253        66 2011-07-04 female os2   TRUE  FALSE  FALSE   FALSE    FALSE
##  4   290        66 2011-08-25 male   os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  5  3497        19 2014-03-24 male   os2   TRUE  FALSE  FALSE   FALSE    FALSE
##  6  4347        19 2015-07-07 male   uk    TRUE  FALSE  FALSE   FALSE    FALSE
##  7  2299        11 2009-11-23 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  8     5        11 2009-12-14 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  9   426        28 2012-06-08 male   os1   FALSE FALSE  FALSE   FALSE    FALSE
## 10  2542        66 2010-03-25 female uk    TRUE  FALSE  FALSE   FALSE    FALSE
## # ℹ 1,643 more rows
## # ℹ 9 more variables: cluster <lgl>, cluster_number <int>, lineage <int>,
## #   start <date>, chronorder <int>, time <dbl>, status <lgl>,
## #   no_other_factor <lgl>, srf <int>

Now we want to verify the cluster variable. Here we compute from scratch the set of IDs that belong to a cluster:

in_a_cluster1 <- bmg %>% 
  group_by(cluster_number) %>% 
  tally() %>% 
  filter(n > 1) %>% 
  pull(cluster_number)

This is the set of IDs reported in the data set to belong to a cluster:

in_a_cluster2 <- bmg %>%
  filter(cluster) %>% 
  pull(cluster_number) %>% 
  unique()

We can verify that this 2 sets are identical:

identical(in_a_cluster1, in_a_cluster2)
## [1] TRUE

Next, we want to verify that there is only one lineage per cluster:

bmg %>% 
  group_by(lineage, cluster_number) %>% 
  tally() %>% 
  pivot_wider(names_from = lineage, values_from = n) %>% 
  mutate_all(replace_na, 0) %>% 
  mutate_at(vars(`1`, `2`, `3`, `4`, `NA`), as.logical) %>% 
  mutate(s = `1` + `2` + `3` + `4` + `NA`) %>% 
  filter(s > 1)
## # A tibble: 1 × 7
##   cluster_number `1`   `2`   `3`   `4`   `NA`      s
##            <int> <lgl> <lgl> <lgl> <lgl> <lgl> <int>
## 1             27 FALSE FALSE TRUE  FALSE TRUE      2

All good. Now confirming that there is one lineage missing value (ID 5636) in the cluster number 27:

bmg %>%
  filter(is.na(lineage)) %>% 
  select(id, cluster_number, lineage)
## # A tibble: 1 × 3
##      id cluster_number lineage
##   <int>          <int>   <int>
## 1  5636             27      NA

Data with 12 SNP

bmg1_12 <- "dataset1_snps12.xlsx" %>%
  read_excel() %>% 
  mutate_at(srf, as.logical) %>% 
  select(-`_merge`)
bmg_12 <- bmg3 %>% 
  select(c("id", setdiff(names(bmg3), names(bmg1_12)))) %>% 
  left_join(bmg1_12, "id") %>% 
  rename(prison    = hmp,
         alcohol   = etoh,
         homeless  = nfa,
         post_code = pc) %>% 
  mutate(date      = as_date(firstsample),
         sex       = c("female", "male")[mf],
         lineage   = na_if(lineage, 10),
         born      = ifelse(ukb == "Not known", NA, c("uk", "os1", "os2")[imstat + 1])) %>% 
  mutate_at(vars(index2y, cluster, pul), as.logical) %>% 
  mutate_at(vars(id, cluster_number, lineage, post_code), as.integer) %>% 
  select(id, post_code, date, sex, born, pul, prison, alcohol, homeless, drugs,
         cluster, cluster_number, lineage)
bmg_12 %<>% 
  group_by(cluster_number) %>% 
  summarise(start = min(date)) %>%
  mutate(chronorder = rank(start, ties = "first")) %>% 
  left_join(bmg_12, ., "cluster_number")
end_of_data <- max(bmg_12$date)
bmg_12 %<>% 
  arrange(date) %>% 
  group_by(cluster_number) %>% 
  group_map(~ mutate(.x, time = diff2(as.numeric(date))), .keep = TRUE) %>% 
  bind_rows() %>% 
  mutate(status          = !is.na(time),
         time            = ifelse(status, time, end_of_data - date),
         no_other_factor = !(prison | alcohol | homeless | drugs))
bmg_12 %<>% mutate(srf = prison + alcohol + homeless + drugs)
bmg_12
## # A tibble: 1,653 × 19
##       id post_code date       sex    born  pul   prison alcohol homeless drugs
##    <int>     <int> <date>     <chr>  <chr> <lgl> <lgl>  <lgl>   <lgl>    <lgl>
##  1   234         8 2011-06-04 male   os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  2   560         8 2013-04-10 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  3   253        66 2011-07-04 female os2   TRUE  FALSE  FALSE   FALSE    FALSE
##  4   290        66 2011-08-25 male   os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  5  3497        19 2014-03-24 male   os2   TRUE  FALSE  FALSE   FALSE    FALSE
##  6  4347        19 2015-07-07 male   uk    TRUE  FALSE  FALSE   FALSE    FALSE
##  7  2299        11 2009-11-23 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  8     5        11 2009-12-14 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  9   426        28 2012-06-08 male   os1   FALSE FALSE  FALSE   FALSE    FALSE
## 10  2542        66 2010-03-25 female uk    TRUE  FALSE  FALSE   FALSE    FALSE
## # ℹ 1,643 more rows
## # ℹ 9 more variables: cluster <lgl>, cluster_number <int>, lineage <int>,
## #   start <date>, chronorder <int>, time <dbl>, status <lgl>,
## #   no_other_factor <lgl>, srf <int>

Let’s compare bmg and bmg12:

table(bmg$cluster, bmg_12$cluster)
##        
##         FALSE TRUE
##   FALSE  1095   47
##   TRUE      0  511

Regenerated data with 12 SNP

Reading the new data set:

bmg1_12b <- "snp12_clusters.xlsx" %>%
  read_excel() %>% 
  setNames(c("id", "date", "cluster_number")) %>% 
  mutate(id = as.integer(id),
         date = as_date(date),
         cluster_number = as.integer(cluster_number))

Checking consistency on the date:

bmg %>% 
  select(id, date) %>% 
  left_join(select(bmg1_12b, id, date), "id") %>% 
  mutate(dff = date.x - date.y) %>% 
  pull(dff) %>% 
  unique()
## [1] 0

The cluster ID that actually correspond to actual clusters:

in_cluster <- bmg1_12b %>% 
  group_by(cluster_number) %>% 
  tally() %>% 
  filter(n > 1) %>% 
  pull(cluster_number)
new_cluster <- bmg1_12b %>% 
  mutate(cluster2 = as.numeric(cluster_number %in% in_cluster)) %>% 
  select(id, cluster2)
bmg1_12b <- bmg1_12 %>% 
  left_join(new_cluster, "id") %>% 
  mutate(cluster = cluster2) %>% 
  select(-cluster2)
bmg_12b <- bmg3 %>% 
  select(c("id", setdiff(names(bmg3), names(bmg1_12b)))) %>% 
  left_join(bmg1_12b, "id") %>% 
  rename(prison    = hmp,
         alcohol   = etoh,
         homeless  = nfa,
         post_code = pc) %>% 
  mutate(date      = as_date(firstsample),
         sex       = c("female", "male")[mf],
         lineage   = na_if(lineage, 10),
         born      = ifelse(ukb == "Not known", NA, c("uk", "os1", "os2")[imstat + 1])) %>% 
  mutate_at(vars(index2y, cluster, pul), as.logical) %>% 
  mutate_at(vars(id, cluster_number, lineage, post_code), as.integer) %>% 
  select(id, post_code, date, sex, born, pul, prison, alcohol, homeless, drugs,
         cluster, cluster_number, lineage)
bmg_12b %<>% 
  group_by(cluster_number) %>% 
  summarise(start = min(date)) %>%
  mutate(chronorder = rank(start, ties = "first")) %>% 
  left_join(bmg_12b, ., "cluster_number")
end_of_data <- max(bmg_12b$date)
bmg_12b %<>% 
  arrange(date) %>% 
  group_by(cluster_number) %>% 
  group_map(~ mutate(.x, time = diff2(as.numeric(date))), .keep = TRUE) %>% 
  bind_rows() %>% 
  mutate(status          = !is.na(time),
         time            = ifelse(status, time, end_of_data - date),
         no_other_factor = !(prison | alcohol | homeless | drugs))
bmg_12b %<>% mutate(srf = prison + alcohol + homeless + drugs)
bmg_12b
## # A tibble: 1,653 × 19
##       id post_code date       sex    born  pul   prison alcohol homeless drugs
##    <int>     <int> <date>     <chr>  <chr> <lgl> <lgl>  <lgl>   <lgl>    <lgl>
##  1   234         8 2011-06-04 male   os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  2   560         8 2013-04-10 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  3   253        66 2011-07-04 female os2   TRUE  FALSE  FALSE   FALSE    FALSE
##  4   290        66 2011-08-25 male   os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  5  3497        19 2014-03-24 male   os2   TRUE  FALSE  FALSE   FALSE    FALSE
##  6  4347        19 2015-07-07 male   uk    TRUE  FALSE  FALSE   FALSE    FALSE
##  7  2299        11 2009-11-23 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  8     5        11 2009-12-14 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  9   426        28 2012-06-08 male   os1   FALSE FALSE  FALSE   FALSE    FALSE
## 10  2542        66 2010-03-25 female uk    TRUE  FALSE  FALSE   FALSE    FALSE
## # ℹ 1,643 more rows
## # ℹ 9 more variables: cluster <lgl>, cluster_number <int>, lineage <int>,
## #   start <date>, chronorder <int>, time <dbl>, status <lgl>,
## #   no_other_factor <lgl>, srf <int>

Let’s compare bmg12 and bmg12b:

table(bmg_12$cluster, bmg_12b$cluster)
##        
##         FALSE TRUE
##   FALSE  1091    4
##   TRUE      2  556

General findings

Total number of patients:

(nb_patients <- nrow(bmg))
## [1] 1653

Numbers of male and female:

sex_ratio <- bmg %>% 
  group_by(sex) %>% 
  tally() %>% 
  mutate(perc = round(n / nb_patients, 2))

sex_ratio
## # A tibble: 2 × 3
##   sex        n  perc
##   <chr>  <int> <dbl>
## 1 female   695  0.42
## 2 male     958  0.58

Number of missing data on location of birth:

(nb_missing_born <- sum(is.na(bmg$born)))
## [1] 10

Corresponding percentage:

round(nb_missing_born / nb_patients, 2)
## [1] 0.01

Number of people born in the UK:

(nb_uk <- sum(bmg$born == "uk", na.rm = TRUE))
## [1] 448

Corresponding percentage:

(perc_uk <- round(nb_uk / nb_patients, 2))
## [1] 0.27

Number of people born overseas:

nb_patients - nb_uk - nb_missing_born
## [1] 1195

Corresponding percentage:

round((nb_patients - nb_uk - nb_missing_born) / nb_patients, 2)
## [1] 0.72

Pulmonary TB:

bmg %>% 
  group_by(pul) %>% 
  tally()
## # A tibble: 2 × 2
##   pul       n
##   <lgl> <int>
## 1 FALSE   610
## 2 TRUE   1043

Type of TB x born in the UK or not:

tbl <- bmg %>% 
  mutate(uk = born == "uk") %$% 
  table(uk, pul) %>% 
  addmargins()

tbl
##        pul
## uk      FALSE TRUE  Sum
##   FALSE   505  690 1195
##   TRUE     99  349  448
##   Sum     604 1039 1643

Corresponding percentages:

round(100 * tbl[1:2, -3] / tbl[1:2, 3])
##        pul
## uk      FALSE TRUE
##   FALSE    42   58
##   TRUE     22   78

Lineages:

bmg %>% 
  group_by(lineage) %>% 
  tally()
## # A tibble: 5 × 2
##   lineage     n
##     <int> <int>
## 1       1   214
## 2       2    86
## 3       3   703
## 4       4   649
## 5      NA     1

Social risk factors, sex and place of birth

Number of patients with a social risk factor:

nb_srf <- bmg %>% 
  filter(prison | alcohol | homeless | drugs) %>% 
  nrow()

nb_srf
## [1] 153

Number of patients in each of the social risk factors:

bmg %>% 
  select(prison, alcohol, homeless, drugs) %>% 
  colSums()
##   prison  alcohol homeless    drugs 
##       74       39       42       93

Number of patients per number of social risk factors:

bmg %>% 
  select(prison, alcohol, homeless, drugs) %>% 
  rowSums() %>% 
  table()
## .
##    0    1    2    3    4 
## 1500   86   43   20    4

Number of patients with a social risk factor and born in the UK:

nb_srf_uk <- bmg %>% 
  filter((prison | alcohol | homeless | drugs) & born == "uk") %>% 
  nrow()

nb_srf_uk
## [1] 101
round(100 * nb_srf_uk / nb_srf)
## [1] 66

Number of patients with a social risk factor and males:

nb_srf_male <- bmg %>% 
  filter((prison | alcohol | homeless | drugs) & sex == "male") %>% 
  nrow()

nb_srf_male
## [1] 131
round(100 * nb_srf_male / nb_srf)
## [1] 86

Venn diagram of the social risk factors:

srf <- c("prison", "alcohol", "homeless", "drugs")

cols <- adjustcolor(RColorBrewer::brewer.pal(4, "Set3"), .5)

grid.newpage()

srf %>%
  map(~ unlist(bmg[bmg[[.x]], "id"])) %>% 
  setNames(srf) %>% 
  VennDiagram::venn.diagram(NULL, fill = cols) %>% 
  grid.draw()

Combining social risk factors and sex and place of birth:

rf <- c("sex", "born", srf)

Only sex and place of birth are independent:

nb <- length(rf)
mat <- matrix(0, nb, nb)
for (i in 1:(nb - 1)) {
  for (j in (i + 1):nb) {
    mat[i, j] <- mat[j, i] <- fisher.test(table(bmg[[rf[i]]], bmg[[rf[j]]]))$p.value
  }
}
rownames(mat) <- colnames(mat) <- rf

round(mat, 4)
##             sex   born prison alcohol homeless drugs
## sex      0.0000 0.1522      0  0.0004        0     0
## born     0.1522 0.0000      0  0.0000        0     0
## prison   0.0000 0.0000      0  0.0000        0     0
## alcohol  0.0004 0.0000      0  0.0000        0     0
## homeless 0.0000 0.0000      0  0.0000        0     0
## drugs    0.0000 0.0000      0  0.0000        0     0

Same type of analysis but this time accounting for confoundings. First the social risk factors only:

Anova(glm(drugs ~ prison + alcohol + homeless, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
## 
## Response: drugs
##          LR Chisq Df Pr(>Chisq)    
## prison     99.788  1  < 2.2e-16 ***
## alcohol    16.058  1  6.143e-05 ***
## homeless   33.267  1  8.033e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(prison ~ drugs + alcohol + homeless, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
## 
## Response: prison
##          LR Chisq Df Pr(>Chisq)    
## drugs     102.128  1    < 2e-16 ***
## alcohol     2.475  1    0.11569    
## homeless    4.475  1    0.03439 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(alcohol ~ drugs + prison + homeless, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
## 
## Response: alcohol
##          LR Chisq Df Pr(>Chisq)    
## drugs     18.7619  1  1.481e-05 ***
## prison     2.8392  1   0.091991 .  
## homeless   7.9916  1   0.004699 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(homeless ~ drugs + prison + alcohol, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
## 
## Response: homeless
##         LR Chisq Df Pr(>Chisq)    
## drugs     35.554  1  2.481e-09 ***
## prison     4.423  1   0.035458 *  
## alcohol    7.575  1   0.005919 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Same thing, with sex and uk in addition:

tmp <- mutate(bmg, uk = born == "uk")
Anova(glm(drugs ~ prison + alcohol + homeless + sex + uk, binomial, tmp))
## Analysis of Deviance Table (Type II tests)
## 
## Response: drugs
##          LR Chisq Df Pr(>Chisq)    
## prison     62.966  1  2.103e-15 ***
## alcohol     7.779  1   0.005287 ** 
## homeless   28.672  1  8.572e-08 ***
## sex         1.094  1   0.295488    
## uk         56.894  1  4.600e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(prison ~ drugs + alcohol + homeless + sex + uk, binomial, tmp))
## Analysis of Deviance Table (Type II tests)
## 
## Response: prison
##          LR Chisq Df Pr(>Chisq)    
## drugs      66.831  1  2.959e-16 ***
## alcohol     1.258  1  0.2619997    
## homeless    2.595  1  0.1072285    
## sex        36.029  1  1.944e-09 ***
## uk         11.622  1  0.0006516 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(alcohol ~ drugs + prison + homeless + sex + uk, binomial, tmp))
## Analysis of Deviance Table (Type II tests)
## 
## Response: alcohol
##          LR Chisq Df Pr(>Chisq)   
## drugs      9.5246  1   0.002027 **
## prison     1.0623  1   0.302689   
## homeless   6.7274  1   0.009494 **
## sex        5.1377  1   0.023412 * 
## uk         8.6643  1   0.003245 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(homeless ~ drugs + prison + alcohol + sex + uk, binomial, tmp))
## Analysis of Deviance Table (Type II tests)
## 
## Response: homeless
##         LR Chisq Df Pr(>Chisq)    
## drugs    29.6872  1  5.077e-08 ***
## prison    1.9940  1   0.157925    
## alcohol   6.8921  1   0.008658 ** 
## sex       9.3603  1   0.002217 ** 
## uk        0.6567  1   0.417731    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Clusters

Number of patients in a cluster (versus not):

nb_clst <- bmg %>% 
  group_by(cluster) %>% 
  tally()

nb_clst
## # A tibble: 2 × 2
##   cluster     n
##   <lgl>   <int>
## 1 FALSE    1142
## 2 TRUE      511

Confirming above result, sex and place of birth are independent but social risk factors have a significant effect on both of them, namely males and those born in the UK are more associated with social risk factors:

summary(glm(sex == "male" ~ born == "uk", binomial, bmg))
## 
## Call:
## glm(formula = sex == "male" ~ born == "uk", family = binomial, 
##     data = bmg)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       0.31556    0.05858   5.387 7.16e-08 ***
## born == "uk"TRUE  0.04545    0.11249   0.404    0.686    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2234.1  on 1642  degrees of freedom
## Residual deviance: 2233.9  on 1641  degrees of freedom
##   (10 observations deleted due to missingness)
## AIC: 2237.9
## 
## Number of Fisher Scoring iterations: 4
summary(glm(sex == "male" ~ prison | alcohol | homeless | drugs, binomial, bmg))
## 
## Call:
## glm(formula = sex == "male" ~ prison | alcohol | homeless | drugs, 
##     family = binomial, data = bmg)
## 
## Coefficients:
##                                         Estimate Std. Error z value Pr(>|z|)
## (Intercept)                              0.20606    0.05191   3.969 7.21e-05
## prison | alcohol | homeless | drugsTRUE  1.57810    0.23611   6.684 2.33e-11
##                                            
## (Intercept)                             ***
## prison | alcohol | homeless | drugsTRUE ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2249.5  on 1652  degrees of freedom
## Residual deviance: 2189.6  on 1651  degrees of freedom
## AIC: 2193.6
## 
## Number of Fisher Scoring iterations: 3
summary(glm(born == "uk" ~ prison | alcohol | homeless | drugs, binomial, bmg))
## 
## Call:
## glm(formula = born == "uk" ~ prison | alcohol | homeless | drugs, 
##     family = binomial, data = bmg)
## 
## Coefficients:
##                                         Estimate Std. Error z value Pr(>|z|)
## (Intercept)                             -1.19209    0.06129  -19.45   <2e-16
## prison | alcohol | homeless | drugsTRUE  1.85596    0.18135   10.23   <2e-16
##                                            
## (Intercept)                             ***
## prison | alcohol | homeless | drugsTRUE ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1925.3  on 1642  degrees of freedom
## Residual deviance: 1813.5  on 1641  degrees of freedom
##   (10 observations deleted due to missingness)
## AIC: 1817.5
## 
## Number of Fisher Scoring iterations: 4

Sex is not associated with the likelihood of the case to be part of a cluster but social risk factor and being born in the UK both significantly increase the likelihood to be part of a cluster:

summary(glm(cluster ~ (prison | alcohol | homeless | drugs) + sex, binomial, bmg))
## 
## Call:
## glm(formula = cluster ~ (prison | alcohol | homeless | drugs) + 
##     sex, family = binomial, data = bmg)
## 
## Coefficients:
##                                         Estimate Std. Error z value Pr(>|z|)
## (Intercept)                             -0.99858    0.08548 -11.682  < 2e-16
## prison | alcohol | homeless | drugsTRUE  1.33140    0.17796   7.482 7.35e-14
## sexmale                                  0.09132    0.11258   0.811    0.417
##                                            
## (Intercept)                             ***
## prison | alcohol | homeless | drugsTRUE ***
## sexmale                                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2044.5  on 1652  degrees of freedom
## Residual deviance: 1982.1  on 1650  degrees of freedom
## AIC: 1988.1
## 
## Number of Fisher Scoring iterations: 4
summary(glm(cluster ~ (prison | alcohol | homeless | drugs) + (born == "uk"), binomial, bmg))
## 
## Call:
## glm(formula = cluster ~ (prison | alcohol | homeless | drugs) + 
##     (born == "uk"), family = binomial, data = bmg)
## 
## Coefficients:
##                                         Estimate Std. Error z value Pr(>|z|)
## (Intercept)                             -1.36774    0.07219 -18.945  < 2e-16
## prison | alcohol | homeless | drugsTRUE  0.81466    0.18998   4.288  1.8e-05
## born == "uk"TRUE                         1.51194    0.12273  12.320  < 2e-16
##                                            
## (Intercept)                             ***
## prison | alcohol | homeless | drugsTRUE ***
## born == "uk"TRUE                        ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2035.4  on 1642  degrees of freedom
## Residual deviance: 1820.0  on 1640  degrees of freedom
##   (10 observations deleted due to missingness)
## AIC: 1826
## 
## Number of Fisher Scoring iterations: 4

Temporal trend in case load with 5 SNP

Preparing monthly incidence data

Let’s first transform the data into monthly incidence time series. Because we will model potential seasonality with harmonic regression, we also generate cos and sine wave with the following function:

wave_trans <- function(x, f = cos) {
  f(2 * pi * x / 365)
}

And now the time series data can be produced this way:

bmg_ts <- bmg %>% 
  mutate(year  = year(date),
         month = month(date)) %>% 
  group_by(year, month, cluster) %>% 
  tally() %>% 
  ungroup() %>% 
  mutate(date    = ymd(paste(year, month, "1")),
         date2   = as.integer(date),
         cosdate = wave_trans(date2, cos),
         sindate = wave_trans(date2, sin))

Which looks like:

bmg_ts
## # A tibble: 248 × 8
##     year month cluster     n date       date2 cosdate sindate
##    <dbl> <dbl> <lgl>   <int> <date>     <int>   <dbl>   <dbl>
##  1  2009     1 FALSE      13 2009-01-01 14245   0.985   0.171
##  2  2009     1 TRUE        5 2009-01-01 14245   0.985   0.171
##  3  2009     2 FALSE      17 2009-02-01 14276   0.761   0.649
##  4  2009     2 TRUE        5 2009-02-01 14276   0.761   0.649
##  5  2009     3 FALSE      16 2009-03-01 14304   0.374   0.928
##  6  2009     3 TRUE       10 2009-03-01 14304   0.374   0.928
##  7  2009     4 FALSE      16 2009-04-01 14335  -0.150   0.989
##  8  2009     4 TRUE        8 2009-04-01 14335  -0.150   0.989
##  9  2009     5 FALSE      16 2009-05-01 14365  -0.619   0.786
## 10  2009     5 TRUE       11 2009-05-01 14365  -0.619   0.786
## # ℹ 238 more rows

Poisson model

Let’s consider some Poisson models with various options for the interactions:

# no interactions:
mod_pois1 <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts)
# interaction between seasonality and cluster:
mod_pois2 <- update(mod_pois1, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3 <- update(mod_pois1, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4 <- update(mod_pois2, . ~ . + date2 : cluster)

Let’s compare these models:

lrt(mod_pois1, mod_pois2, mod_pois4)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate + 
##     date2:cluster
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1       243     231.51                        
## 2       241     221.94  2   9.5713 0.008349 **
## 3       240     221.73  1   0.2091 0.647474   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrt(mod_pois1, mod_pois3, mod_pois4)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + date2:cluster
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate + 
##     date2:cluster
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1       243     231.51                        
## 2       242     231.25  1   0.2607 0.609650   
## 3       240     221.73  2   9.5198 0.008567 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

So, the best model would be mod_pois2 (i.e. interaction between cluster and seasonality, but not between cluster and linear trend). Next, let’s see whether the seasonality is significant for both the cluster and non-cluster:

mod_pois2a <- update(mod_pois1, . ~ . - cluster, data = filter(bmg_ts,  cluster))
mod_pois2b <- update(mod_pois2a, data = filter(bmg_ts,  !cluster))

test_seasonality <- function(x) {
  lrt(update(x, . ~ . - cosdate - sindate), x)
}

test_seasonality(mod_pois2a)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       121    112.147                          
## 2       119     91.685  2   20.462 3.604e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_seasonality(mod_pois2b)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       123     134.11                     
## 2       121     130.05  2    4.061   0.1313

Which means that seasonality is significant only for the cluster (mod_pois2a). Let’s thus update this model accordingly:

mod_pois2b %<>% update(. ~ . - cosdate - sindate)

Next, let’s look at segmented regressions version of these two models, with periods predefined by these dates:

date_1 <- ymd(20110101)
date_2 <- ymd(20150101)

Let’s do the test for the cluster and non cluster:

segmented2 <- function(x, y) {
  segmented(x, ~ date2, y - 15, control = seg.control(it.max = 0))
}

test_segments <- function(x) {
  mod1 <- segmented2(x, date_1)
  mod2 <- segmented2(x, date_2)
  mod3 <- segmented2(x, c(date_1, date_2))
  print(lrt(x, mod1))
  print(lrt(x, mod2))
  lrt(mod2, mod3)
}

test_segments(mod_pois2a)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       119     91.685                     
## 2       118     91.601  1 0.083495   0.7726
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       119     91.685                     
## 2       118     90.949  1  0.73597    0.391
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate + U1.date2
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       118     90.949                     
## 2       117     89.970  1   0.9785   0.3226
test_segments(mod_pois2b)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       123     134.11                     
## 2       122     133.49  1  0.61578   0.4326
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       123     134.11                       
## 2       122     128.93  1   5.1811  0.02283 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       122     128.93                     
## 2       121     128.42  1   0.5077   0.4761

Which means that there is a change in the trend only for the non-cluster, and only after 2015. Let’s update model mod_pois2b accordingly:

mod_pois2b %<>% segmented2(date_2)

Now we can make a figure. First we need a function that computes predictions with confidence intervals from a model:

predict2 <- function(object, newdata, level = .95) {
  q <- (1 - level) / 2
  
  predict(object, newdata, se.fit = TRUE) %>% 
    magrittr::extract(-3) %>% 
    as_tibble() %>% 
    mutate(lwr = fit + qt(q, Inf) * se.fit,
           upr = fit + qt(1 - q, Inf) * se.fit) %>% 
    mutate_at(vars(fit, lwr, upr), object$family$linkinv) %>% 
    cbind(newdata, .)
}

Let’s generate some new data for the covariables used for prediction:

newdata <- tibble(date     = unique(bmg_ts$date)) %>% 
           mutate(date2    = as.integer(date),
                  cosdate  = wave_trans(date2, cos),
                  sindate  = wave_trans(date2, sin),
                  U1.date2 = date2 - as.numeric(date_2) + 15,
                  U1.date2 = ifelse(U1.date2 < 0, 0, U1.date2))

Next we need a function that plots the model predictions with confidence interval:

arrows2 <- function(...) arrows(..., length = .02, angle = 90, code = 3)

plot_mod_pred <- function(x, color, lwd = 1.5) {
  with(x, {
    lines(date, fit, col = color, lwd = lwd)
    arrows2(date, lwr, date, upr, col = color, lwd = lwd)
  })
}

Now that we have these two functions, we can make the figure:

# The data of non cluster:
bmg_ts %>% 
  filter(! cluster) %$% 
  plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
       ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")

# The data of cluster:
bmg_ts %>% 
  filter(cluster) %$% 
  points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)

# The model of cluster:
mod_pois2a %>% 
  predict2(newdata) %>% 
  plot_mod_pred(2)

# The model of non cluster:
mod_pois2b %>% 
  predict2(newdata) %>% 
  plot_mod_pred(4)

# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)

# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white") 
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)

Alternative Figure 1 addressing reviewer 5’s comment

On figure 1, replace incidence by incidence rate (reviewer 5).

ons <- readxl::read_excel("ONS-mid-year-population-estimates.xlsx")
bmg_ts2 <- ons %>% 
  select(Year, Total) %>% 
  left_join(bmg_ts, ., c("year" = "Year"))
# no interactions:
mod_pois1_2 <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts2, offset = log(Total))
# interaction between seasonality and cluster:
mod_pois2_2 <- update(mod_pois1_2, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3_2 <- update(mod_pois1_2, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4_2 <- update(mod_pois2_2, . ~ . + date2 : cluster)
lrt(mod_pois1_2, mod_pois2_2, mod_pois4_2)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate + 
##     date2:cluster
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1       243     230.97                        
## 2       241     221.39  2   9.5721 0.008345 **
## 3       240     221.18  1   0.2098 0.646923   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod_pois2a_2 <- update(mod_pois1_2, . ~ . - cluster, data = filter(bmg_ts2,  cluster))
mod_pois2b_2 <- update(mod_pois2a_2, data = filter(bmg_ts2,  !cluster))

test_seasonality(mod_pois2a_2)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       121    111.625                          
## 2       119     91.485  2    20.14 4.233e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_seasonality(mod_pois2b_2)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       123     133.67                     
## 2       121     129.70  2   3.9706   0.1373
mod_pois2b_2 %<>% update(. ~ . - cosdate - sindate)
test_segments(mod_pois2a_2)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       119     91.485                     
## 2       118     91.412  1 0.073012    0.787
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       119     91.485                     
## 2       118     90.711  1  0.77457   0.3788
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate + U1.date2
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       118     90.711                     
## 2       117     89.746  1  0.96441   0.3261
test_segments(mod_pois2b_2)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       123     133.67                     
## 2       122     133.02  1  0.64837   0.4207
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       123     133.67                       
## 2       122     128.35  1   5.3165  0.02112 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       122     128.35                     
## 2       121     127.85  1  0.50206   0.4786
mod_pois2b_2 %<>% segmented2(date_2)
newdata2 <- tibble(date     = unique(bmg_ts$date)) %>% 
            mutate(date2    = as.integer(date),
                   cosdate  = wave_trans(date2, cos),
                   sindate  = wave_trans(date2, sin),
                   U1.date2 = date2 - as.numeric(date_2) + 15,
                   U1.date2 = ifelse(U1.date2 < 0, 0, U1.date2),
                   Year     = year(date)) %>% 
            left_join(select(ons, Year, Total), "Year") %>% 
            select(-Year)

The figure:

# The data of non cluster:
bmg_ts %>% 
  filter(! cluster) %$% 
  plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
       ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")

# The data of cluster:
bmg_ts %>% 
  filter(cluster) %$% 
  points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)

# The model of cluster:
mod_pois2a_2 %>% 
  predict2(newdata2) %>% 
  plot_mod_pred(2)

# The model of non cluster:
mod_pois2b_2 %>% 
  predict2(newdata2) %>% 
  plot_mod_pred(4)

# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)

# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white") 
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)

Alternative Figure 1 addressing reviewer 3’s comment

On figure 1, replace the segmented regression by a spline regression (reviewer 3).

library(mgcv)
# The data of non cluster:
bmg_ts %>% 
  filter(! cluster) %$% 
  plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
       ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")

# The data of cluster:
bmg_ts %>% 
  filter(cluster) %$% 
  points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)

# The model of non cluster:
bmg_ts %>%
  filter(! cluster) %>% 
  gam(n ~ s(date2), poisson, .) %>% 
#  gam(n ~ s(date2) + s(cosdate) + s(sindate), poisson, .) %>% 
  predict2(newdata) %>% 
  plot_mod_pred(4)

# The model of cluster:
bmg_ts %>%
  filter(cluster) %>% 
  gam(n ~ s(date2) + s(cosdate) + s(sindate), poisson, .) %>% 
  predict2(newdata) %>% 
  plot_mod_pred(2)

# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)

# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white") 
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)

Diagnostics on the Poisson model

Let’s look at the dispersion in the residuals

AER::dispersiontest(mod_pois2a)
## 
##  Overdispersion test
## 
## data:  mod_pois2a
## z = -2.4579, p-value = 0.993
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##  0.7780348
AER::dispersiontest(mod_pois2b)
## 
##  Overdispersion test
## 
## data:  mod_pois2b
## z = -0.080627, p-value = 0.5321
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##  0.9917055

Which comforts us in the choice of a Poisson model. Let’s now look at temporal autocorrelation in residuals. Below is a function that performs a few checks:

test_autocorr <- function(x) {
  Residuals <- resid(x)
  print(summary(lm(Residuals[-length(Residuals)] ~ Residuals[-1])))
  opar <- par(mfrow = 1:2)
  plot(Residuals, type = "l", col = 4)
  abline(h = 0)
  acf(Residuals, main = NA)
  par(opar)
} 

Let’s check the models for cluster and non cluster

test_autocorr(mod_pois2a)
## 
## Call:
## lm(formula = Residuals[-length(Residuals)] ~ Residuals[-1])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7054 -0.6757 -0.1475  0.5343  2.2863 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -0.06134    0.07910  -0.775    0.440
## Residuals[-1] -0.01188    0.09129  -0.130    0.897
## 
## Residual standard error: 0.8716 on 120 degrees of freedom
## Multiple R-squared:  0.0001411,  Adjusted R-squared:  -0.008191 
## F-statistic: 0.01694 on 1 and 120 DF,  p-value: 0.8967

test_autocorr(mod_pois2b)
## 
## Call:
## lm(formula = Residuals[-length(Residuals)] ~ Residuals[-1])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1867 -0.5622  0.1478  0.7324  2.1779 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -0.04882    0.09167  -0.533    0.595
## Residuals[-1] -0.02381    0.08992  -0.265    0.792
## 
## Residual standard error: 1.019 on 122 degrees of freedom
## Multiple R-squared:  0.0005744,  Adjusted R-squared:  -0.007618 
## F-statistic: 0.07012 on 1 and 122 DF,  p-value: 0.7916

Everything looks OK!

Quantifying the decreases in incidence

First we need a function that estimates the rate of decrease in % / year, with confidence interval:

est_decrease <- function(x, level = .95) {
  c(coef(x)["date2"], rev(confint(x, "date2", level))) %>% 
    multiply_by(365) %>% 
    x$family$linkinv() %>% 
    subtract(1, .) %>% 
    multiply_by(100) %>% 
    setNames(c("est", "lwr", "upr"))
}

Let’s apply it to all the data:

glm(n ~ date2, poisson, bmg_ts) %>% 
  est_decrease() %>% 
  round(2)
##  est  lwr  upr 
## 6.70 5.19 8.19

Let’s apply it the cluster cases

mod_pois2a %>%
  update(. ~ .  - cosdate - sindate) %>% # we remove the seasonality
  est_decrease() %>% 
  round(2)
##   est   lwr   upr 
##  7.40  4.67 10.08

Let’s apply it to the non cluster before 2015:

mod_pois2b %>% 
  update(. ~ . -U1.date2, data = filter(bmg_ts, ! cluster, date < date_2)) %>% 
  est_decrease() %>% 
  round(2)
##   est   lwr   upr 
## 10.99  7.15 14.69

And to the non cluster after 2015:

mod_pois2b %>% 
  update(. ~ . -U1.date2, data = filter(bmg_ts, ! cluster, date >= date_2)) %>% 
  est_decrease() %>% 
  round(2)
##   est   lwr   upr 
##  3.71 -3.71 10.62

Risk factors

This function plots temporal trends of proportions by quarter or month:

plot_prop_trends <- function(x, val, ylab, ylim = 0:1, dta = bmg, col = "grey",
                             f1 = lubridate::quarter, f2 = lubridate::yq) {
  dta %>% 
    rename(var = x) %>% 
    mutate(xxxx  = f1(date),
           year  = year(date)) %>%
    group_by(year, xxxx) %>%
    summarise(yes = sum(var == val),
              no  = sum(var != val)) %>%
    ungroup() %>% 
    mutate(date     = f2(paste(year, xxxx)),
           prop     = map2(yes, yes + no, prop.test),
           estimate = map_dbl(prop, extract2, "estimate"),
           confint  = map(prop, extract2, "conf.int")) %>%
    unnest_wider(confint, "") %>% 
    select(date, estimate, confint1, confint2) %$% 
    plot(date, estimate, ylim = ylim, type = "o",
         xlab = NA, ylab = ylab, col = col, pch = 19)
}

The same function but by year:

plot_prop_trends_yr <- function(x, val, ylab, ylim = 0:1, dta = bmg, col = "grey") {
  dta %>% 
    rename(var = x) %>% 
    mutate(year  = year(date)) %>%
    group_by(year) %>%
    summarise(yes = sum(var == val),
              no  = sum(var != val)) %>%
    ungroup() %>% 
    mutate(date     = ymd(paste(year, 6, 15)),
           prop     = map2(yes, yes + no, prop.test),
           estimate = map_dbl(prop, extract2, "estimate"),
           confint  = map(prop, extract2, "conf.int")) %>%
    unnest_wider(confint, "") %>% 
    select(date, estimate, confint1, confint2) %$% 
    plot(date, estimate, ylim = ylim, type = "o",
         xlab = NA, ylab = ylab, col = col, pch = 19)
}

A function that adds prediction and confidence interval to an existing plot:

add_pred_ci <- function(x, fit, lwr, upr, col = 4, alpha = .2) {
  polygon(c(x, rev(x)), c(lwr, rev(upr)), border = NA, col = adjustcolor(col, alpha))
  lines(x, fit, col = col)
}

Plotting the temporal trends by quarter (left column) and year (right column):

opar <- par(mfrow = c(4, 2), cex = 1)

# 1. Temporal trend of the proportion of males:
newdata3 <- tibble(date = seq(min(bmg$date), max(bmg$date), 1))

plot_prop_trends("sex", "male", "proportion of male")

glm(sex == "male" ~ date, binomial, bmg) %>% 
  predict2(newdata3) %>% 
  with(add_pred_ci(date, fit, lwr, upr))

abline(h = .5)
abline(h = select(filter(sex_ratio, sex == "male"), perc), lty = 2)

# 2. The same, by year:
plot_prop_trends_yr("sex", "male", "proportion of male")

glm(sex == "male" ~ date, binomial, bmg) %>% 
  predict2(newdata3) %>% 
  with(add_pred_ci(date, fit, lwr, upr))

abline(h = .5)
abline(h = select(filter(sex_ratio, sex == "male"), perc), lty = 2)

# 3. Temporal trend of the proportion born in the UK:
tmp <- mutate(bmg, born = replace_na(born, "uk"))

plot_prop_trends("born", "uk", "proportion UK born", ylim = 0:1, dta = tmp)

glm(born == "uk" ~ date, binomial, tmp) %>% 
  predict2(newdata3) %>% 
  with(add_pred_ci(date, fit, lwr, upr))

abline(h = perc_uk, lty = 2)

# 4. The same, by year:
plot_prop_trends_yr("born", "uk", "proportion UK born", ylim = 0:1,
                    dta = mutate(bmg, born = replace_na(born, "uk")))

glm(born == "uk" ~ date, binomial, tmp) %>% 
  predict2(newdata3) %>% 
  with(add_pred_ci(date, fit, lwr, upr))

abline(h = perc_uk, lty = 2)

# 5. Temporal trend of the proportion in clusters:
plot_prop_trends("cluster", TRUE, "proportion in cluster")

glm(cluster ~ date, binomial, bmg) %>% 
  predict2(newdata3) %>% 
  with(add_pred_ci(date, fit, lwr, upr))

abline(h = nb_clst$n[2] / sum(nb_clst$n), lty = 2)

# 6. The same, by year:
plot_prop_trends_yr("cluster", TRUE, "proportion in cluster")

glm(cluster ~ date, binomial, bmg) %>% 
  predict2(newdata3) %>% 
  with(add_pred_ci(date, fit, lwr, upr))

abline(h = nb_clst$n[2] / sum(nb_clst$n), lty = 2)

# 7. Temporal trend of the proportion with social risk factors:
tmp <- mutate(bmg, srf = prison | alcohol | homeless | drugs)

plot_prop_trends("srf", TRUE, "proportion with social risk factors", dta = tmp)

glm(srf ~ date, binomial, tmp) %>% 
  predict2(newdata3) %>% 
  with(add_pred_ci(date, fit, lwr, upr))

abline(h = sum(tmp$srf) / nrow(tmp), lty = 2)

# 8. The same, by year:
plot_prop_trends_yr("srf", TRUE, "proportion with social risk factors", dta = tmp)

glm(srf ~ date, binomial, tmp) %>% 
  predict2(newdata3) %>% 
  with(add_pred_ci(date, fit, lwr, upr))

abline(h = sum(tmp$srf) / nrow(tmp), lty = 2)

par(opar)

Let’s test these trends:

Anova(glm(drugs ~ date + prison + alcohol + homeless, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
## 
## Response: drugs
##          LR Chisq Df Pr(>Chisq)    
## date        5.237  1    0.02211 *  
## prison     97.116  1  < 2.2e-16 ***
## alcohol    15.515  1  8.187e-05 ***
## homeless   31.410  1  2.089e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(prison ~ date + drugs + alcohol + homeless, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
## 
## Response: prison
##          LR Chisq Df Pr(>Chisq)    
## date        0.209  1     0.6473    
## drugs      99.458  1     <2e-16 ***
## alcohol     2.476  1     0.1156    
## homeless    4.197  1     0.0405 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(alcohol ~ date + drugs + prison + homeless, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
## 
## Response: alcohol
##          LR Chisq Df Pr(>Chisq)    
## date       0.0094  1   0.922624    
## drugs     18.4062  1  1.785e-05 ***
## prison     2.8375  1   0.092087 .  
## homeless   7.8248  1   0.005153 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(homeless ~ date + drugs + prison + alcohol, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
## 
## Response: homeless
##         LR Chisq Df Pr(>Chisq)    
## date       1.353  1   0.244696    
## drugs     34.378  1  4.539e-09 ***
## prison     4.277  1   0.038641 *  
## alcohol    7.480  1   0.006239 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Which means that, among these social risk factors, drug usage seems to be the main driver of the increasing trend.

summary(glm(srf ~ date, binomial, tmp))
## 
## Call:
## glm(formula = srf ~ date, family = binomial, data = tmp)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.910e+00  1.224e+00  -4.830 1.36e-06 ***
## date         2.262e-04  7.557e-05   2.993  0.00276 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1019.6  on 1652  degrees of freedom
## Residual deviance: 1010.7  on 1651  degrees of freedom
## AIC: 1014.7
## 
## Number of Fisher Scoring iterations: 5

Relationship between proportions of male and social risk factors

No relationship between being born in the UK and sex of the patient:

bmg %>% 
  transmute(imm   = born != "uk",
            sex   = sex == "male") %$%
  fisher.test(imm, sex)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  imm and sex
## p-value = 0.6947
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.7613409 1.1980703
## sample estimates:
## odds ratio 
##  0.9555805
bmg %>% 
  filter(born != "uk") %>% 
  transmute(born = born == "os1",
            sex  = sex == "male") %$% 
  fisher.test(sex, born)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  sex and born
## p-value = 0.06404
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.5511467 1.0186870
## sample estimates:
## odds ratio 
##  0.7509145
bmg %>% 
  filter(born != "uk") %>% 
  transmute(born = born == "os1",
            sex  = sex == "male") %>% 
  table() %>% 
  addmargins()
##        sex
## born    FALSE TRUE  Sum
##   FALSE    85  147  232
##   TRUE    419  544  963
##   Sum     504  691 1195

Still increase of males proportion with time, even after correcting for SRF:

mod <- bmg %>% 
  transmute(year  = year(date),
            srf   = prison | alcohol | homeless | drugs,
            sex   = sex == "male") %$% 
  glm(sex ~ srf + year, binomial)

summary(mod)
## 
## Call:
## glm(formula = sex ~ srf + year, family = binomial)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -98.01893   33.62249  -2.915  0.00355 ** 
## srfTRUE       1.54429    0.23667   6.525  6.8e-11 ***
## year          0.04879    0.01670   2.921  0.00348 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2249.5  on 1652  degrees of freedom
## Residual deviance: 2181.0  on 1650  degrees of freedom
## AIC: 2187
## 
## Number of Fisher Scoring iterations: 4
anova(mod, test = "LRT")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: sex
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                  1652     2249.5              
## srf   1   59.912      1651     2189.6 9.918e-15 ***
## year  1    8.592      1650     2181.0  0.003377 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ukentry <- readr::read_csv("ukentry_date.csv")
ukentry %>% 
  select(id, ukent) %>% 
  left_join(bmg, ., "id") %>% 
  select(id, date, ukent, sex, born) %>% 
  filter(born != "uk", is.na(ukent)) %>% 
  pull(born) %>% 
  unique()
## [1] "os1"
tmp <- ukentry %>% 
  select(id, ukent) %>% 
  left_join(bmg, ., "id") %>% 
  select(date, ukent, sex) %>% 
  filter(! is.na(ukent)) %>% 
  mutate(interval = year(date) - ukent) %>% 
  select(sex, interval)
with(tmp, plot(jitter(as.numeric(sex == "male")), interval))

with(tmp, plot(jitter(as.numeric(sex == "male")), log(interval + .1)))

hist(tmp$interval)

tmp2 <- tmp %>% 
  mutate(int2 = cut(interval, c(seq(0, 60, 5), 100), include.lowest = TRUE)) %>% 
  group_by(int2) %>% 
  summarise(n     = n(),
            males = sum(sex == "male")) %>% 
  mutate(ptest = map2(males, n, prop.test),
         prop  = map_dbl(ptest, extract2, "estimate"),
         ci    = map(ptest, extract2, "conf.int"),
         lower = map_dbl(ci, extract, 1),
         upper = map_dbl(ci, extract, 2),
         int2  = str_remove_all(int2, "\\(|\\]|\\[")) %>% 
  separate(int2, c("lb", "ub")) %>% 
  mutate_at(c("lb", "ub"), as.numeric) %>% 
  mutate(xs = (lb + ub) / 2)
with(tmp2, {
  plot(xs, prop, ylim = 0:1,
       xlab = "number of years between entry and infection",
       ylab = "proportion of males", col = 4)
  arrows(xs, lower, xs, upper, .1, 90, 3, 4)
})

abline(h = .5)

tmp3 <- tmp %>% 
  mutate(int2 = cut(interval, seq(0, 80, 5), include.lowest = TRUE)) %>% 
  group_by(int2) %>% 
  summarise(n     = n(),
            males = sum(sex == "male")) %>% 
  mutate(n     = cumsum(n),
         males = cumsum(males)) %>% 
  mutate(ptest = map2(males, n, prop.test),
         prop  = map_dbl(ptest, extract2, "estimate"),
         ci    = map(ptest, extract2, "conf.int"),
         lower = map_dbl(ci, extract, 1),
         upper = map_dbl(ci, extract, 2),
         int2  = str_remove_all(int2, "\\(|\\]|\\[")) %>% 
  separate(int2, c("lb", "ub")) %>% 
  mutate_at(c("lb", "ub"), as.numeric) %>% 
  mutate(xs = (lb + ub) / 2)
with(tmp3, {
  plot(xs, prop, ylim = 0:1,
       xlab = "number of years between entry and infection",
       ylab = "proportion of males", col = 4)
  arrows(xs, lower, xs, upper, .1, 90, 3, 4)
})

abline(h = .5)

foo <- ukentry %>% 
  select(id, ukent) %>% 
  left_join(bmg, ., "id") %>% 
  select(date, ukent, sex) %>% 
  filter(! is.na(ukent)) %>% 
  transmute(year     = year(date),
            interval = year - ukent)
with(foo, plot(jitter(year), interval))

foo %>% 
  group_by(year) %>% 
  summarise(n     = n(),
            below = sum(interval <= 2)) %>% 
  mutate(ptest = map2(below, n, prop.test),
         prop  = map_dbl(ptest, extract2, "estimate"),
         ci    = map(ptest, extract2, "conf.int"),
         lower = map_dbl(ci, extract, 1),
         upper = map_dbl(ci, extract, 2)) %>% 
  with({
    plot(year, prop, ylim = 0:1,xlab = NA,
         ylab = "proportion of recent entries (< 2 years)", col = 4)
    arrows(year, lower, year, upper, .1, 90, 3, 4)
  })

mod <- ukentry %>% 
  select(id, ukent) %>% 
  left_join(bmg, ., "id") %>% 
  select(date, ukent, sex) %>% 
  filter(! is.na(ukent)) %>% 
  mutate(year     = year(date),
         interval = year - ukent,
         recent   = interval <= 2,
         male     = sex == "male") %$%
  glm(male ~ recent + year)

summary(mod)
## 
## Call:
## glm(formula = male ~ recent + year)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -28.673893   9.746621  -2.942  0.00333 **
## recentTRUE    0.061652   0.036232   1.702  0.08911 . 
## year          0.014526   0.004842   3.000  0.00276 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.241387)
## 
##     Null deviance: 273.81  on 1124  degrees of freedom
## Residual deviance: 270.84  on 1122  degrees of freedom
## AIC: 1598.6
## 
## Number of Fisher Scoring iterations: 2
anova(mod, test = "LRT")
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: male
## 
## Terms added sequentially (first to last)
## 
## 
##        Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                    1124     273.81            
## recent  1  0.79907      1123     273.01 0.068847 . 
## year    1  2.17275      1122     270.84 0.002698 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bmg %>% 
  filter(born != "uk")
## # A tibble: 1,195 × 19
##       id post_code date       sex    born  pul   prison alcohol homeless drugs
##    <int>     <int> <date>     <chr>  <chr> <lgl> <lgl>  <lgl>   <lgl>    <lgl>
##  1   234         8 2011-06-04 male   os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  2   560         8 2013-04-10 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  3   253        66 2011-07-04 female os2   TRUE  FALSE  FALSE   FALSE    FALSE
##  4   290        66 2011-08-25 male   os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  5  3497        19 2014-03-24 male   os2   TRUE  FALSE  FALSE   FALSE    FALSE
##  6  2299        11 2009-11-23 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  7     5        11 2009-12-14 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  8   426        28 2012-06-08 male   os1   FALSE FALSE  FALSE   FALSE    FALSE
##  9    83        23 2010-06-12 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
## 10  2393        67 2009-02-17 male   os1   TRUE  FALSE  FALSE   FALSE    FALSE
## # ℹ 1,185 more rows
## # ℹ 9 more variables: cluster <lgl>, cluster_number <int>, lineage <int>,
## #   start <date>, chronorder <int>, time <dbl>, status <lgl>,
## #   no_other_factor <lgl>, srf <int>
ukentry %>% 
  select(id, ukent) %>% 
  left_join(bmg, ., "id") %>% 
  filter(born != "uk") %>% 
  filter(! is.na(ukent))
## # A tibble: 1,125 × 20
##       id post_code date       sex    born  pul   prison alcohol homeless drugs
##    <dbl>     <int> <date>     <chr>  <chr> <lgl> <lgl>  <lgl>   <lgl>    <lgl>
##  1   234         8 2011-06-04 male   os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  2   560         8 2013-04-10 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  3   253        66 2011-07-04 female os2   TRUE  FALSE  FALSE   FALSE    FALSE
##  4   290        66 2011-08-25 male   os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  5  3497        19 2014-03-24 male   os2   TRUE  FALSE  FALSE   FALSE    FALSE
##  6     5        11 2009-12-14 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  7   426        28 2012-06-08 male   os1   FALSE FALSE  FALSE   FALSE    FALSE
##  8    83        23 2010-06-12 female os1   TRUE  FALSE  FALSE   FALSE    FALSE
##  9  2393        67 2009-02-17 male   os1   TRUE  FALSE  FALSE   FALSE    FALSE
## 10   149        67 2011-01-09 male   os1   TRUE  FALSE  FALSE   FALSE    FALSE
## # ℹ 1,115 more rows
## # ℹ 10 more variables: cluster <lgl>, cluster_number <int>, lineage <int>,
## #   start <date>, chronorder <int>, time <dbl>, status <lgl>,
## #   no_other_factor <lgl>, srf <int>, ukent <dbl>

Temporal trend in case load with 5 SNP on pulmonary cases only

Preparing monthly incidence data

bmg_ts_pul <- bmg %>% 
  filter(pul) %>% 
  mutate(year  = year(date),
         month = month(date)) %>% 
  group_by(year, month, cluster) %>% 
  tally() %>% 
  ungroup() %>% 
  mutate(date    = ymd(paste(year, month, "1")),
         date2   = as.integer(date),
         cosdate = wave_trans(date2, cos),
         sindate = wave_trans(date2, sin))

Poisson model

Let’s consider some Poisson models with various options for the interactions:

# no interactions:
mod_pois1_pul <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts_pul)
# interaction between seasonality and cluster:
mod_pois2_pul <- update(mod_pois1_pul, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3_pul <- update(mod_pois1_pul, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4_pul <- update(mod_pois2_pul, . ~ . + date2 : cluster)

Let’s compare these models:

lrt(mod_pois1_pul, mod_pois2_pul, mod_pois4_pul)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate + 
##     date2:cluster
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       238     187.74                       
## 2       236     182.56  2   5.1806   0.0750 .
## 3       235     182.04  1   0.5152   0.4729  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrt(mod_pois1_pul, mod_pois3_pul, mod_pois4_pul)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + date2:cluster
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate + 
##     date2:cluster
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       238     187.74                       
## 2       237     187.17  1   0.5727  0.44921  
## 3       235     182.04  2   5.1232  0.07718 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

So, the best model would (marginally) be mod_pois2 (i.e. interaction between cluster and seasonality, but not between cluster and linear trend). Next, let’s see whether the seasonality is significant for both the cluster and non-cluster:

mod_pois2a_pul <- update(mod_pois1_pul, . ~ . - cluster, data = filter(bmg_ts_pul,  cluster))
mod_pois2b_pul <- update(mod_pois2a_pul, data = filter(bmg_ts_pul,  !cluster))

test_seasonality(mod_pois2a_pul)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1       118     90.918                        
## 2       116     79.717  2   11.201 0.003695 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_seasonality(mod_pois2b_pul)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       121     102.67                     
## 2       119     102.33  2  0.34646   0.8409

Which means that seasonality is significant only for the cluster (mod_pois2b). Let’s thus update this model accordingly:

mod_pois2b_pul %<>% update(. ~ . - cosdate - sindate)

Next, let’s look at segmented regressions version of these two models, with periods predefined by these dates. Let’s do the test for the cluster and non cluster:

test_segments(mod_pois2a_pul)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       116     79.717                     
## 2       115     79.696  1 0.020579   0.8859
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       116     79.717                     
## 2       115     79.354  1  0.36332   0.5467
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate + U1.date2
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       115     79.354                     
## 2       114     79.283  1 0.070156   0.7911
test_segments(mod_pois2b_pul)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       121    102.674                       
## 2       120     99.242  1   3.4327  0.06392 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       121    102.674                       
## 2       120     99.494  1   3.1799  0.07455 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       120     99.494                     
## 2       119     98.536  1  0.95884   0.3275

Which means that there is a marginal change in the trend only for the non-cluster, either after 2011 or after 2015, but not both at the same time. Let’s thus not update model mod_pois2b. Now we can make the figure. Let’s generate some new data for the covariables used for prediction:

newdata_pul <- tibble(date     = unique(bmg_ts_pul$date)) %>% 
           mutate(date2    = as.integer(date),
                  cosdate  = wave_trans(date2, cos),
                  sindate  = wave_trans(date2, sin),
                  U1.date2 = date2 - as.numeric(date_2) + 15,
                  U1.date2 = ifelse(U1.date2 < 0, 0, U1.date2))

And now the figure:

# The data of non cluster:
bmg_ts_pul %>% 
  filter(! cluster) %$% 
  plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
       ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")

# The data of cluster:
bmg_ts_pul %>% 
  filter(cluster) %$% 
  points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)

# The model of cluster:
mod_pois2a_pul %>% 
  predict2(newdata_pul) %>% 
  plot_mod_pred(2)

# The model of non cluster:
mod_pois2b_pul %>% 
  predict2(newdata_pul) %>% 
  plot_mod_pred(4)

# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)

# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white") 
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)

Temporal trend in case load with 5 SNP on non-pulmonary cases only

Preparing monthly incidence data

bmg_ts_nonpul <- bmg %>% 
  filter(!pul) %>% 
  mutate(year  = year(date),
         month = month(date)) %>% 
  group_by(year, month, cluster) %>% 
  tally() %>% 
  ungroup() %>% 
  mutate(date    = ymd(paste(year, month, "1")),
         date2   = as.integer(date),
         cosdate = wave_trans(date2, cos),
         sindate = wave_trans(date2, sin))

Poisson model

Let’s consider some Poisson models with various options for the interactions:

# no interactions:
mod_pois1_nonpul <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts_nonpul)
# interaction between seasonality and cluster:
mod_pois2_nonpul <- update(mod_pois1_nonpul, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3_nonpul <- update(mod_pois1_nonpul, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4_nonpul <- update(mod_pois2_nonpul, . ~ . + date2 : cluster)

Let’s compare these models:

lrt(mod_pois1_nonpul, mod_pois2_nonpul, mod_pois4_nonpul)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate + 
##     date2:cluster
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       187     160.11                       
## 2       185     160.03  2  0.08641  0.95771  
## 3       184     157.06  1  2.96532  0.08507 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrt(mod_pois1_nonpul, mod_pois3_nonpul, mod_pois4_nonpul)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + date2:cluster
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate + 
##     date2:cluster
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       187     160.11                       
## 2       186     157.12  1  2.98718  0.08393 .
## 3       184     157.06  2  0.06455  0.96824  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

So, the best model would (marginally) be mod_pois3 (i.e. interaction between cluster and linear trend, but not between cluster and seasonality). Next, let’s see whether the linear trend is significant for both the cluster and non-cluster:

mod_pois2a_nonpul <- update(mod_pois1_nonpul, . ~ . - cluster,
                            data = filter(bmg_ts_nonpul,  cluster))
mod_pois2b_nonpul <- update(mod_pois2a_nonpul,
                            data = filter(bmg_ts_nonpul,  !cluster))

test_lineartrend <- function(x) {
  lrt(update(x, . ~ . - date2), x)
}

test_lineartrend(mod_pois2a_nonpul)
## Analysis of Deviance Table
## 
## Model 1: n ~ cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df   Deviance Pr(>Chi)
## 1        66     28.184                       
## 2        65     28.183  1 0.00080014   0.9774
test_lineartrend(mod_pois2b_nonpul)
## Analysis of Deviance Table
## 
## Model 1: n ~ cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       120     145.56                          
## 2       119     128.88  1   16.684 4.416e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Which means that linear trend is significant only for the non-cluster (mod_pois2b_nonpul). Let’s thus update this model accordingly:

#mod_pois2a_nonpul %<>% update(. ~ . - date2)

But we still want to test the significance of seasonality for both clustered and non-clustered:

mod_pois2a_nonpul2 <- update(mod_pois1_nonpul, . ~ . - cluster,
                             data = filter(bmg_ts_nonpul, cluster))
mod_pois2b_nonpul2 <- update(mod_pois2a_nonpul2,
                             data = filter(bmg_ts_nonpul, !cluster))

test_seasonality(mod_pois2a_nonpul)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        67     28.841                     
## 2        65     28.183  2  0.65759   0.7198
test_seasonality(mod_pois2b_nonpul)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       121     132.71                     
## 2       119     128.88  2   3.8322   0.1472

which means that seasonality is not significant, neither for the clustered or the non-clustered. Thus, let’s update the models accordingly:

mod_pois2a_nonpul %<>% update(. ~ . - cosdate - sindate)
mod_pois2b_nonpul %<>% update(. ~ . - cosdate - sindate)

Next, let’s look at segmented regressions version of these two models, with periods predefined by these dates. Let’s do the test for the cluster and non cluster:

test_segments <- function(x) {
  mod1 <- segmented2(x, date_1)
  mod2 <- segmented2(x, date_2)
  mod3 <- segmented2(x, c(date_1, date_2))
  print(lrt(x, mod1))
  print(lrt(x, mod2))
  print(lrt(x, mod3))
  lrt(mod2, mod3)
}

test_segments(mod_pois2a_nonpul)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        67     28.841                     
## 2        66     27.513  1   1.3276   0.2492
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        67     28.841                     
## 2        66     28.590  1  0.25116   0.6163
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        67     28.841                     
## 2        65     27.430  2   1.4109   0.4939
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        66      28.59                     
## 2        65      27.43  1   1.1598   0.2815
test_segments(mod_pois2b_nonpul)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       121     132.71                     
## 2       120     131.43  1   1.2807   0.2578
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       121     132.71                     
## 2       120     131.23  1   1.4775   0.2242
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       121     132.71                       
## 2       119     125.93  2   6.7745   0.0338 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       120     131.23                       
## 2       119     125.93  1    5.297  0.02136 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Which means that there is a significant change in the trend only for the non-cluster, but both after 2011 and after 2015. Let’s thus update model mod_pois2b_nonpul accordingly:

mod_pois2b_nonpul %<>% segmented2(c(date_1, date_2))

Now we can make the figure. Let’s generate some new data for the covariables used for prediction:

newdata_nonpul <- tibble(date     = unique(bmg_ts_nonpul$date)) %>% 
           mutate(date2    = as.integer(date),
                  cosdate  = wave_trans(date2, cos),
                  sindate  = wave_trans(date2, sin),
                  U1.date2 = date2 - as.numeric(date_1) + 11,
                  U1.date2 = ifelse(U1.date2 < 0, 0, U1.date2),
                  U2.date2 = date2 - as.numeric(date_2) + 15,
                  U2.date2 = ifelse(U2.date2 < 0, 0, U2.date2))

And now the figure:

# The data of non cluster:
bmg_ts_nonpul %>% 
  filter(! cluster) %$% 
  plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
       ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")

# The data of cluster:
bmg_ts_nonpul %>% 
  filter(cluster) %$% 
  points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)

# The model of cluster:
mod_pois2a_nonpul %>% 
  predict2(newdata_nonpul) %>% 
  plot_mod_pred(2)

# The model of non cluster:
mod_pois2b_nonpul %>% 
  predict2(newdata_nonpul) %>% 
  plot_mod_pred(4)

# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)

# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white") 
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)

Temporal trend in case load with 12 SNP

Preparing monthly incidence data

bmg_ts_12 <- bmg_12 %>% 
  mutate(year  = year(date),
         month = month(date)) %>% 
  group_by(year, month, cluster) %>% 
  tally() %>% 
  mutate(date    = ymd(paste(year, month, "1")),
         date2   = as.integer(date),
         cosdate = wave_trans(date2, cos),
         sindate = wave_trans(date2, sin))

Which looks like:

bmg_ts_12
## # A tibble: 248 × 8
## # Groups:   year, month [126]
##     year month cluster     n date       date2 cosdate sindate
##    <dbl> <dbl> <lgl>   <int> <date>     <int>   <dbl>   <dbl>
##  1  2009     1 FALSE      11 2009-01-01 14245   0.985   0.171
##  2  2009     1 TRUE        7 2009-01-01 14245   0.985   0.171
##  3  2009     2 FALSE      17 2009-02-01 14276   0.761   0.649
##  4  2009     2 TRUE        5 2009-02-01 14276   0.761   0.649
##  5  2009     3 FALSE      16 2009-03-01 14304   0.374   0.928
##  6  2009     3 TRUE       10 2009-03-01 14304   0.374   0.928
##  7  2009     4 FALSE      15 2009-04-01 14335  -0.150   0.989
##  8  2009     4 TRUE        9 2009-04-01 14335  -0.150   0.989
##  9  2009     5 FALSE      14 2009-05-01 14365  -0.619   0.786
## 10  2009     5 TRUE       13 2009-05-01 14365  -0.619   0.786
## # ℹ 238 more rows

Poisson model

Let’s consider some Poisson models with various options for the interactions:

# no interactions:
mod_pois1 <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts_12)
# interaction between seasonality and cluster:
mod_pois2 <- update(mod_pois1, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3 <- update(mod_pois1, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4 <- update(mod_pois2, . ~ . + date2 : cluster)

Let’s compare these models:

lrt(mod_pois1, mod_pois2, mod_pois4)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate + 
##     date2:cluster
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       243     239.79                       
## 2       241     231.90  2   7.8964  0.01929 *
## 3       240     231.46  1   0.4349  0.50957  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrt(mod_pois1, mod_pois3, mod_pois4)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + date2:cluster
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate + 
##     date2:cluster
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       243     239.79                       
## 2       242     239.30  1   0.4987  0.48006  
## 3       240     231.46  2   7.8326  0.01991 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

So, the best model would be mod_pois2 (i.e. interaction between cluster and seasonality, but not between cluster and linear trend). Next, let’s see whether the seasonality is significant for both the cluster and non-cluster:

mod_pois2a <- update(mod_pois1, . ~ . - cluster, data = filter(bmg_ts_12,  cluster))
mod_pois2b <- update(mod_pois2a, data = filter(bmg_ts_12,  !cluster))

test_seasonality(mod_pois2a)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       121    114.698                          
## 2       119     95.104  2   19.594 5.563e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_seasonality(mod_pois2b)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       123     139.38                     
## 2       121     136.36  2   3.0165   0.2213

Which means that seasonality is significant only for the cluster (mod_pois2b). Let’s thus update this model accordingly:

mod_pois2b %<>% update(. ~ . - cosdate - sindate)

Next, let’s look at segmented regressions version of these two models, with periods predefined by these dates. Let’s do the test for the cluster and non cluster:

test_segments(mod_pois2a)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       119     95.104                     
## 2       118     95.088  1 0.016541   0.8977
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       119     95.104                     
## 2       118     93.822  1   1.2828   0.2574
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       119     95.104                     
## 2       117     93.364  2   1.7406   0.4188
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate + U1.date2
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       118     93.822                     
## 2       117     93.364  1  0.45778   0.4987
test_segments(mod_pois2b)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       123     139.38                     
## 2       122     139.10  1  0.27515   0.5999
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       123     139.38                       
## 2       122     134.83  1   4.5485  0.03295 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       123     139.38                       
## 2       121     133.97  2   5.4078  0.06694 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       122     134.83                     
## 2       121     133.97  1  0.85935   0.3539

Which means that there is a change in the trend only for the non-cluster, and only after 2015. Let’s update model mod_pois2b accordingly:

mod_pois2b %<>% segmented2(date_2)

Now we can make a figure. First we need a function that computes predictions with confidence intervals from a model. Let’s generate some new data for the covariables used for prediction:

newdata <- tibble(date     = unique(bmg_ts_12$date)) %>% 
           mutate(date2    = as.integer(date),
                  cosdate  = wave_trans(date2, cos),
                  sindate  = wave_trans(date2, sin),
                  U1.date2 = date2 - as.numeric(date_2) + 15,
                  U1.date2 = ifelse(U1.date2 < 0, 0, U1.date2))

Next we need a function that plots the model predictions with confidence interval. Now that we have these two functions, we can make the figure:

# The data of non cluster:
bmg_ts_12 %>% 
  filter(! cluster) %$% 
  plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
       ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")

# The data of cluster:
bmg_ts_12 %>% 
  filter(cluster) %$% 
  points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)

# The model of cluster:
mod_pois2a %>% 
  predict2(newdata) %>% 
  plot_mod_pred(2)

# The model of non cluster:
mod_pois2b %>% 
  predict2(newdata) %>% 
  plot_mod_pred(4)

# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)

# The legend:
lgd <- c("non cluster", "cluster")
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)

Diagnostics on the Poisson model

Let’s look at the dispersion in the residuals

AER::dispersiontest(mod_pois2a)
## 
##  Overdispersion test
## 
## data:  mod_pois2a
## z = -2.6588, p-value = 0.9961
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##  0.7838471
AER::dispersiontest(mod_pois2b)
## 
##  Overdispersion test
## 
## data:  mod_pois2b
## z = 0.29027, p-value = 0.3858
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##   1.031179

Which comforts us in the choice of a Poisson model. Let’s now look at temporal autocorrelation in residuals. Below is a function that performs a few checks. Let’s check the models for cluster and non cluster

test_autocorr(mod_pois2a)
## 
## Call:
## lm(formula = Residuals[-length(Residuals)] ~ Residuals[-1])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.69972 -0.74893 -0.05969  0.61864  2.09660 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -0.05804    0.08057  -0.720    0.473
## Residuals[-1]  0.01110    0.09132   0.122    0.903
## 
## Residual standard error: 0.8875 on 120 degrees of freedom
## Multiple R-squared:  0.000123,   Adjusted R-squared:  -0.008209 
## F-statistic: 0.01477 on 1 and 120 DF,  p-value: 0.9035

test_autocorr(mod_pois2b)
## 
## Call:
## lm(formula = Residuals[-length(Residuals)] ~ Residuals[-1])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.38805 -0.59268  0.02363  0.70206  2.09143 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -0.05302    0.09387  -0.565    0.573
## Residuals[-1] -0.01531    0.09014  -0.170    0.865
## 
## Residual standard error: 1.044 on 122 degrees of freedom
## Multiple R-squared:  0.0002363,  Adjusted R-squared:  -0.007958 
## F-statistic: 0.02884 on 1 and 122 DF,  p-value: 0.8654

Everything looks OK!

Quantifying the decreases in incidence

First we need a function that estimates the rate of decrease in % / year, with confidence interval. Let’s apply it to all the data:

glm(n ~ date2, poisson, bmg_ts_12) %>% 
  est_decrease() %>% 
  round(2)
##  est  lwr  upr 
## 6.70 5.19 8.19

Let’s apply it the cluster cases

mod_pois2a %>%
  update(. ~ .  - cosdate - sindate) %>% # we remove the seasonality
  est_decrease() %>% 
  round(2)
##   est   lwr   upr 
##  7.56  4.95 10.12

Let’s apply it to the non cluster before 2015:

mod_pois2b %>% 
  update(. ~ . -U1.date2, data = filter(bmg_ts_12, ! cluster, date < date_2)) %>% 
  est_decrease() %>% 
  round(2)
##   est   lwr   upr 
## 10.33  6.38 14.13

And to the non cluster after 2015:

mod_pois2b %>% 
  update(. ~ . -U1.date2, data = filter(bmg_ts_12, ! cluster, date >= date_2)) %>% 
  est_decrease() %>% 
  round(2)
##   est   lwr   upr 
##  2.95 -4.71 10.06

Temporal trend in case load with regenerated 12 SNP data

Preparing monthly incidence data

bmg_ts_12b <- bmg_12b %>% 
  mutate(year  = year(date),
         month = month(date)) %>% 
  group_by(year, month, cluster) %>% 
  tally() %>% 
  ungroup() %>% 
  mutate(date    = ymd(paste(year, month, "1")),
         date2   = as.integer(date),
         cosdate = wave_trans(date2, cos),
         sindate = wave_trans(date2, sin))

Which looks like:

bmg_ts_12b
## # A tibble: 248 × 8
##     year month cluster     n date       date2 cosdate sindate
##    <dbl> <dbl> <lgl>   <int> <date>     <int>   <dbl>   <dbl>
##  1  2009     1 FALSE      11 2009-01-01 14245   0.985   0.171
##  2  2009     1 TRUE        7 2009-01-01 14245   0.985   0.171
##  3  2009     2 FALSE      17 2009-02-01 14276   0.761   0.649
##  4  2009     2 TRUE        5 2009-02-01 14276   0.761   0.649
##  5  2009     3 FALSE      16 2009-03-01 14304   0.374   0.928
##  6  2009     3 TRUE       10 2009-03-01 14304   0.374   0.928
##  7  2009     4 FALSE      14 2009-04-01 14335  -0.150   0.989
##  8  2009     4 TRUE       10 2009-04-01 14335  -0.150   0.989
##  9  2009     5 FALSE      14 2009-05-01 14365  -0.619   0.786
## 10  2009     5 TRUE       13 2009-05-01 14365  -0.619   0.786
## # ℹ 238 more rows

Poisson model

Let’s consider some Poisson models with various options for the interactions:

# no interactions:
mod_pois1 <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts_12b)
# interaction between seasonality and cluster:
mod_pois2 <- update(mod_pois1, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3 <- update(mod_pois1, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4 <- update(mod_pois2, . ~ . + date2 : cluster)

Let’s compare these models:

lrt(mod_pois1, mod_pois2, mod_pois4)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate + 
##     date2:cluster
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       243     239.74                       
## 2       241     231.56  2   8.1777  0.01676 *
## 3       240     230.93  1   0.6245  0.42936  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrt(mod_pois1, mod_pois3, mod_pois4)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + date2:cluster
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate + 
##     date2:cluster
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       243     239.74                       
## 2       242     239.03  1   0.7034  0.40165  
## 3       240     230.93  2   8.0989  0.01743 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

So, the best model would be mod_pois2 (i.e. interaction between cluster and seasonality, but not between cluster and linear trend). Next, let’s see whether the seasonality is significant for both the cluster and non-cluster:

mod_pois2a <- update(mod_pois1, . ~ . - cluster, data = filter(bmg_ts_12b,  cluster))
mod_pois2b <- update(mod_pois2a, data = filter(bmg_ts_12b,  !cluster))

test_seasonality(mod_pois2a)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)    
## 1       121    114.057                         
## 2       119     94.355  2   19.702 5.27e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_seasonality(mod_pois2b)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       123     139.74                     
## 2       121     136.58  2   3.1604   0.2059

Which means that seasonality is significant only for the cluster (mod_pois2b). Let’s thus update this model accordingly:

mod_pois2b %<>% update(. ~ . - cosdate - sindate)

Next, let’s look at segmented regressions version of these two models, with periods predefined by these dates. Let’s do the test for the cluster and non cluster:

test_segments(mod_pois2a)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       119     94.355                     
## 2       118     94.290  1 0.065389   0.7982
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       119     94.355                     
## 2       118     92.926  1   1.4295   0.2318
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       119     94.355                     
## 2       117     92.605  2   1.7503   0.4168
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate + U1.date2
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       118     92.926                     
## 2       117     92.605  1  0.32075   0.5712
test_segments(mod_pois2b)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       123     139.74                     
## 2       122     139.56  1  0.17705   0.6739
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       123     139.74                       
## 2       122     135.41  1   4.3255  0.03755 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       123     139.74                       
## 2       121     134.38  2    5.357  0.06867 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       122     135.41                     
## 2       121     134.38  1   1.0315   0.3098

Which means that there is a change in the trend only for the non-cluster, and only after 2015. Let’s update model mod_pois2b accordingly:

mod_pois2b %<>% segmented2(date_2)

Now we can make a figure. First we need a function that computes predictions with confidence intervals from a model. Let’s generate some new data for the covariables used for prediction:

newdata <- tibble(date     = unique(bmg_ts_12b$date)) %>% 
           mutate(date2    = as.integer(date),
                  cosdate  = wave_trans(date2, cos),
                  sindate  = wave_trans(date2, sin),
                  U1.date2 = date2 - as.numeric(date_2) + 15,
                  U1.date2 = ifelse(U1.date2 < 0, 0, U1.date2))

Next we need a function that plots the model predictions with confidence interval. Now that we have these two functions, we can make the figure:

# The data of non cluster:
bmg_ts_12b %>% 
  filter(! cluster) %$% 
  plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
       ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")

# The data of cluster:
bmg_ts_12b %>% 
  filter(cluster) %$% 
  points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)

# The model of cluster:
mod_pois2a %>% 
  predict2(newdata) %>% 
  plot_mod_pred(2)

# The model of non cluster:
mod_pois2b %>% 
  predict2(newdata) %>% 
  plot_mod_pred(4)

# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)

# The legend:
lgd <- c("non cluster", "cluster")
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)

Alternative Figure 1 addressing reviewer 5’s comment

On figure 1, replace incidence by incidence rate (reviewer 5).

bmg_ts2 <- ons %>% 
  select(Year, Total) %>% 
  left_join(bmg_ts_12b, ., c("year" = "Year"))
# no interactions:
mod_pois1_2 <- glm(n ~ date2 + cluster + cosdate + sindate, poisson, bmg_ts2, offset = log(Total))
# interaction between seasonality and cluster:
mod_pois2_2 <- update(mod_pois1_2, . ~ . + cosdate : cluster + sindate : cluster)
# interaction between trend and cluster:
mod_pois3_2 <- update(mod_pois1_2, . ~ . + date2 : cluster)
# interactions between seasonality and cluster, as well as between trend and cluster:
mod_pois4_2 <- update(mod_pois2_2, . ~ . + date2 : cluster)
lrt(mod_pois1_2, mod_pois2_2, mod_pois4_2)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cluster + cosdate + sindate
## Model 2: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate
## Model 3: n ~ date2 + cluster + cosdate + sindate + cluster:cosdate + cluster:sindate + 
##     date2:cluster
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       243     239.19                       
## 2       241     231.01  2   8.1787  0.01675 *
## 3       240     230.39  1   0.6264  0.42867  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod_pois2a_2 <- update(mod_pois1_2, . ~ . - cluster, data = filter(bmg_ts2,  cluster))
mod_pois2b_2 <- update(mod_pois2a_2, data = filter(bmg_ts2,  !cluster))

test_seasonality(mod_pois2a_2)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       121    113.535                          
## 2       119     94.152  2   19.383 6.181e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_seasonality(mod_pois2b_2)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + cosdate + sindate
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       123     139.30                     
## 2       121     136.23  2   3.0696   0.2155
mod_pois2b_2 %<>% update(. ~ . - cosdate - sindate)
test_segments(mod_pois2a_2)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       119     94.152                     
## 2       118     94.076  1 0.075729   0.7832
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       119     94.152                     
## 2       118     92.667  1   1.4852    0.223
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       119     94.152                     
## 2       117     92.354  2   1.7977    0.407
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + cosdate + sindate + U1.date2
## Model 2: n ~ date2 + cosdate + sindate + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       118     92.667                     
## 2       117     92.354  1  0.31247   0.5762
test_segments(mod_pois2b_2)
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       123     139.30                     
## 2       122     139.11  1  0.19442   0.6593
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       123     139.30                       
## 2       122     134.86  1   4.4468  0.03497 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: n ~ date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       123     139.30                       
## 2       121     133.83  2   5.4701  0.06489 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: n ~ date2 + U1.date2
## Model 2: n ~ date2 + U1.date2 + U2.date2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       122     134.86                     
## 2       121     133.83  1   1.0233   0.3117
mod_pois2b_2 %<>% segmented2(date_2)

The figure:

# The data of non cluster:
bmg_ts %>% 
  filter(! cluster) %$% 
  plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
       ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")

# The data of cluster:
bmg_ts %>% 
  filter(cluster) %$% 
  points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)

# The model of cluster:
mod_pois2a_2 %>% 
  predict2(newdata2) %>% 
  plot_mod_pred(2)

# The model of non cluster:
mod_pois2b_2 %>% 
  predict2(newdata2) %>% 
  plot_mod_pred(4)

# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)

# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white") 
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)

Alternative Figure 1 addressing reviewer 3’s comment

On figure 1, replace the segmented regression by a spline regression (reviewer 3).

library(mgcv)
# The data of non cluster:
bmg_ts_12b %>% 
  filter(! cluster) %$% 
  plot(date, n, type = "h", lwd = 3, col = "lightgrey", lend = 1, yaxs = "i",
       ylim = c(0, 20), xlab = NA, ylab = "monthly incidence (ind.)")

# The data of cluster:
bmg_ts_12b %>% 
  filter(cluster) %$% 
  points(date, n, type = "h", lwd = 3, col = "darkgrey", lend = 1)

# The model of non cluster:
bmg_ts_12b %>%
  filter(! cluster) %>% 
  gam(n ~ s(date2), poisson, .) %>% 
#  gam(n ~ s(date2) + s(cosdate) + s(sindate), poisson, .) %>% 
  predict2(newdata) %>% 
  plot_mod_pred(4)

# The model of cluster:
bmg_ts_12b %>%
  filter(cluster) %>% 
  gam(n ~ s(date2) + s(cosdate) + s(sindate), poisson, .) %>% 
  predict2(newdata) %>% 
  plot_mod_pred(2)

# The 3 time periods:
abline(v = c(date_1, date_2) - 15, col = 3, lwd = 2, lty = 2:1)

# The legend:
lgd <- c("non cluster", "cluster")
legend2 <- function(...) legend(..., box.col = "white") 
legend2("top", paste(lgd, "(data)"), fill = c("lightgrey", "darkgrey"))
legend2("topright", paste(lgd, "(model)"), col = c(4, 2), lty = 1)

Diagnostics on the Poisson model

Let’s look at the dispersion in the residuals

AER::dispersiontest(mod_pois2a)
## 
##  Overdispersion test
## 
## data:  mod_pois2a
## z = -2.733, p-value = 0.9969
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##  0.7781604
AER::dispersiontest(mod_pois2b)
## 
##  Overdispersion test
## 
## data:  mod_pois2b
## z = 0.36427, p-value = 0.3578
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##   1.039615

Which comforts us in the choice of a Poisson model. Let’s now look at temporal autocorrelation in residuals. Below is a function that performs a few checks. Let’s check the models for cluster and non cluster

test_autocorr(mod_pois2a)
## 
## Call:
## lm(formula = Residuals[-length(Residuals)] ~ Residuals[-1])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.70216 -0.73952 -0.08912  0.61388  2.07158 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -0.058091   0.080260  -0.724    0.471
## Residuals[-1]  0.001723   0.091312   0.019    0.985
## 
## Residual standard error: 0.8842 on 120 degrees of freedom
## Multiple R-squared:  2.967e-06,  Adjusted R-squared:  -0.00833 
## F-statistic: 0.0003561 on 1 and 120 DF,  p-value: 0.985

test_autocorr(mod_pois2b)
## 
## Call:
## lm(formula = Residuals[-length(Residuals)] ~ Residuals[-1])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.36650 -0.59176  0.01941  0.70196  2.11442 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -0.052870   0.094085  -0.562    0.575
## Residuals[-1] -0.007139   0.090144  -0.079    0.937
## 
## Residual standard error: 1.046 on 122 degrees of freedom
## Multiple R-squared:  5.141e-05,  Adjusted R-squared:  -0.008145 
## F-statistic: 0.006272 on 1 and 122 DF,  p-value: 0.937

Everything looks OK!

Quantifying the decreases in incidence

First we need a function that estimates the rate of decrease in % / year, with confidence interval. Let’s apply it to all the data:

glm(n ~ date2, poisson, bmg_ts_12b) %>% 
  est_decrease() %>% 
  round(2)
##  est  lwr  upr 
## 6.70 5.19 8.19

Let’s apply it the cluster cases

mod_pois2a %>%
  update(. ~ .  - cosdate - sindate) %>% # we remove the seasonality
  est_decrease() %>% 
  round(2)
##   est   lwr   upr 
##  7.70  5.10 10.26

Let’s apply it to the non cluster before 2015:

mod_pois2b %>% 
  update(. ~ . -U1.date2, data = filter(bmg_ts_12b, ! cluster, date < date_2)) %>% 
  est_decrease() %>% 
  round(2)
##   est   lwr   upr 
## 10.16  6.21 13.97

And to the non cluster after 2015:

mod_pois2b %>% 
  update(. ~ . -U1.date2, data = filter(bmg_ts_12b, ! cluster, date >= date_2)) %>% 
  est_decrease() %>% 
  round(2)
##   est   lwr   upr 
##  2.95 -4.71 10.06

Transmission

The number of cluster, samples and proportion of samples per cluster size:

clust_dist <- bmg %>% 
  group_by(cluster_number) %>% 
  summarise(cluster_size = n()) %>% 
  group_by(cluster_size) %>% 
  summarise(nb_cluster = n()) %>% 
  mutate(nb_samples = nb_cluster * cluster_size,
         pc_samples = round(100 * nb_samples / nrow(bmg), 2))

clust_dist
## # A tibble: 12 × 4
##    cluster_size nb_cluster nb_samples pc_samples
##           <int>      <int>      <int>      <dbl>
##  1            1       1142       1142      69.1 
##  2            2         79        158       9.56
##  3            3         36        108       6.53
##  4            4         15         60       3.63
##  5            6          4         24       1.45
##  6            7          1          7       0.42
##  7            8          1          8       0.48
##  8           10          1         10       0.6 
##  9           12          2         24       1.45
## 10           13          1         13       0.79
## 11           21          2         42       2.54
## 12           57          1         57       3.45

The number and proportion of samples in a cluster of at least 9 samples:

clust_dist %>% 
  filter(cluster_size > 9) %>% 
  select(nb_samples, pc_samples) %>% 
  colSums()
## nb_samples pc_samples 
##     146.00       8.83

The following function computes the number of samples in a cluster (a cluster being defined as at least 2 samples):

nb_clustered <- function(x) {
  x %>% 
    group_by(cluster_number) %>% 
    tally() %>% 
    group_by(n) %>% 
    tally() %>% 
    filter(n > 1) %$% 
    sum(n * nn)
}

Computing the total number N of samples and the number of n of samples in a cluster per lineage (rows), as well as the corresponding proportions of these samples in a cluster:

tmp <- map(c(nb_clustered, nrow), ~ map_int(group_split(bmg, lineage), .x)) %>% 
  as.data.frame(col.names = c("n", "N")) %>% 
  mutate(p = round(100 * n / N))

tmp
##     n   N  p
## 1  30 214 14
## 2  20  86 23
## 3 200 703 28
## 4 260 649 40
## 5   0   1  0

The proportion of samples in a cluster depends on the lineage:

tmp %>% 
  filter(n > 0) %>% 
  select(-p) %>% 
  mutate(N = N - n) %>% 
  as.matrix() %>% 
  fisher.test()
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 2.931e-13
## alternative hypothesis: two.sided

Risk of being in a cluster

The risk of being in a cluster (corresponds to the 2nd part of the 3rd paragraph of the Transmission section in the ms):

Anova(glm(cluster ~ prison + alcohol + homeless + drugs, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
## 
## Response: cluster
##          LR Chisq Df Pr(>Chisq)    
## prison     2.0202  1  0.1552219    
## alcohol   11.5018  1  0.0006953 ***
## homeless   0.2765  1  0.5990232    
## drugs     27.8607  1  1.304e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef(glm(cluster ~ prison + alcohol + homeless + drugs, binomial, bmg)))
##  (Intercept)   prisonTRUE  alcoholTRUE homelessTRUE    drugsTRUE 
##    0.3867648    1.5281724    3.6942585    1.2237157    4.1143378
Anova(glm(cluster ~ sex + born + prison + alcohol + homeless + drugs, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
## 
## Response: cluster
##          LR Chisq Df Pr(>Chisq)    
## sex         2.231  1   0.135260    
## born      144.702  2  < 2.2e-16 ***
## prison      0.153  1   0.695425    
## alcohol     6.298  1   0.012087 *  
## homeless    0.180  1   0.671381    
## drugs       7.984  1   0.004719 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(cluster ~ sex + born + prison + alcohol + homeless + drugs, binomial, filter(bmg, born != "os2")))
## Analysis of Deviance Table (Type II tests)
## 
## Response: cluster
##          LR Chisq Df Pr(>Chisq)    
## sex         0.833  1   0.361410    
## born      127.639  1  < 2.2e-16 ***
## prison      0.487  1   0.485177    
## alcohol     6.605  1   0.010170 *  
## homeless    0.191  1   0.662415    
## drugs       8.048  1   0.004555 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(cluster ~ born + prison + alcohol + homeless + drugs, binomial, bmg))
## Analysis of Deviance Table (Type II tests)
## 
## Response: cluster
##          LR Chisq Df Pr(>Chisq)    
## born      143.330  2  < 2.2e-16 ***
## prison      0.335  1   0.562606    
## alcohol     6.690  1   0.009695 ** 
## homeless    0.240  1   0.623998    
## drugs       8.191  1   0.004210 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(glm(cluster ~ born + prison + alcohol + homeless + drugs, binomial, filter(bmg, born != "os2")))
## Analysis of Deviance Table (Type II tests)
## 
## Response: cluster
##          LR Chisq Df Pr(>Chisq)    
## born      127.097  1  < 2.2e-16 ***
## prison      0.669  1   0.413514    
## alcohol     6.857  1   0.008828 ** 
## homeless    0.225  1   0.635173    
## drugs       8.199  1   0.004192 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The clustered ordered from biggest to smallest (n being the number of samples in the cluster):

bmg %>% 
  group_by(cluster_number) %>% 
  tally() %>% 
  arrange(desc(n))
## # A tibble: 1,285 × 2
##    cluster_number     n
##             <int> <int>
##  1             13    57
##  2              9    21
##  3             15    21
##  4             67    13
##  5             27    12
##  6             65    12
##  7             38    10
##  8             42     8
##  9             41     7
## 10             10     6
## # ℹ 1,275 more rows

Figure 2

The function below is used to generate Figure 2:

plot_clusters <- function(x, l, cex = .5, col = 4) {
  x %>%
    filter(!cluster) %$% 
    plot(date, chronorder, cex = cex, axes = FALSE, ann = FALSE)
  
  x %>% 
    filter(cluster) %>% 
    group_split(chronorder) %>% 
    walk(~ lines(.x$date, .x$chronorder, col = "red", type = "o", cex = cex))
  
  yrs <- 2009:2019
  axis(1, ymd(paste(yrs, "01 01")), yrs)
  text(ymd(20110101), 1000, paste("lineage", l))
}

Let’s make Figure 2:

opar <- par(mfrow = c(2, 2), cex = 1)
walk(1:4, ~ plot_clusters(bmg[bmg$lineage == .x, ], .x))

par(opar)

Zero-inflated Poisson regression

This is an alternative way of looking at it. Instead of computing presence / absence data per individual, we compute average rate per cluster (i.e. number of case / duration). And then we can relate this computed rate to risk factors. The advantage here is that we don’t need to exclude the sample that were reported during the last 2 years. First we need to prepare the data for the analysis:

bmg_zip <- bmg %>% 
  mutate(duration = as.integer(max(bmg$date) - start)) %>% 
  select(cluster_number, duration, lineage) %>% 
  unique() %>% 
  left_join(tally(group_by(bmg, cluster_number)), "cluster_number") %>% 
  mutate(n = n - 1) %>% 
  filter(duration > 0)

bmg_zip
## # A tibble: 1,284 × 4
##    cluster_number duration lineage     n
##             <int>    <int>   <int> <dbl>
##  1              1     2931       3     1
##  2              2     2901       4     1
##  3              3     1907       4     1
##  4              4     3489       3     2
##  5              5     3367       3     1
##  6              6     3768       1     1
##  7              7     3394       4     2
##  8              8     3718       3     1
##  9              9     3682       4    20
## 10             10     3033       3     5
## # ℹ 1,274 more rows

This function will be used to generate the estimates:

estimates <- function(x) {
  cbind(est = coef(x), confint(x)) %>% 
    as.data.frame() %>% 
    tibble::rownames_to_column()
}

Here is the analysis that consists in fitting a zero-inflated Poisson regression (forced to go through the origin) to the data per lineage:

rate_estimates <- map(1:4, function(x)
                             zeroinfl(n ~ -1 + duration | -1 + duration,
                                      filter(bmg_zip, lineage == x))) %>% 
  setNames(paste("lineage", 1:4)) %>% 
  map_dfr(estimates) %>% 
  group_split(rowname, .keep = FALSE) %>% 
  map2(c(function(x) exp(365 * x), function(x) exp(365 * x)/(1 + exp(365 * x))), function(y, f) f(y)) %>% 
  setNames(c("count_duration", "zero_duration"))

This function plot the rate estimates per lineage:

plot_estimates <- function(x, ylim, ylab, y, length = .03) {
  with(x, {
    plot(1:4, est, xlim = c(.5, 4.5), ylim = ylim,
         xlab = NA, ylab = ylab, axes = FALSE, col = 4)
    arrows(1:4, `2.5 %`, 1:4, `97.5 %`, length, 90, 3, col = 4)
    text(1:4, y, paste("lineage", 1:4))
    axis(2)
  })
}

The estimates of the rate of starting a chain of transmission (left) and rate of transmission in a chain (right), per lineage:

opar <- par(mfrow = 1:2)
plot_estimates(rate_estimates$zero_duration, c(.5, .6),
               "probability of starting a chain (/year)", .5)
plot_estimates(rate_estimates$count_duration, c(.8, 1.2), "growth rate (/year)", .8)

par(opar)

To explain these differences we should look at how the lineages are different in terms of risk factor.

Survival analysis

Figure 3 on Kaplan-Meier estimates

This is a third option of looking at the same thing. Now, instead of looking at the presence / absence of another case within 2 years as for the first option, we compute the time to the next case. Here a tuning of the polygon() that can make “staircase” style:

polygon2 <- function(x, l, u, ...) {
  n <- length(x)
  x <- c(x[1], rep(x[-1], each = 2))
  l <- c(rep(l[-n], each = 2), l[n])
  u <- c(rep(u[-n], each = 2), u[n])
  polygon(c(x, rev(x)), 1 - c(l, rev(u)), border = NA, ...)
}

Here is a function to plot 1 - survival:

plot_surv <- function(x, col, ...) {
  with(x, {
    plot(time, 1 - surv, type = "s", xlim = c(0, 2 * 365.25), ylim = 0:1,
         xlab = "time (days)", col = col, lwd = 2,
         ylab = "probability", ...)
    polygon2(time, lower, upper, col = adjustcolor(col, .2))
  })
}

Here is the equivalent but adding a line:

lines_surv <- function(x, col) {
  with(x, {
    lines(time, 1 - surv, col = col, lwd = 2, type = "s")
    polygon2(time, lower, upper, col = adjustcolor(col, .2))
  })
}

Kaplan-Meier curves:

opar <- par(mfrow = c(2, 2), cex = 1)

# 1. Kaplan-Meier fits by social risk factor:
groups <- c("prison", "alcohol", "homeless", "drugs", "no_other_factor")
km_rf <- groups %>%
  map(~ survfit(Surv(time, status) ~ 1, data = bmg[bmg[[.x]], ])) %>% 
  setNames(groups)
cols <- c("#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3", "#a6d854")
plot_surv(km_rf[[1]], cols[1])
walk(2:5, ~ lines_surv(km_rf[[.x]], cols[.x]))
abline(v = 365.25, lty = 2)
legend("topleft", legend = c(str_replace_all(groups, "_", " ")[-5], "none"),
       lwd = 2, lty = 1, col = cols, bty = "n")

# 2. Kaplan-Meier fits by place of birth:
groups <- na.exclude(unique(bmg$born))
km_born <- groups %>%
  map(~ survfit(Surv(time, status) ~ 1, data = bmg[bmg[["born"]] == .x, ])) %>% 
  setNames(groups)
cols <- c("#66c2a5", "#fc8d62", "#8da0cb")
plot_surv(km_born[[1]], cols[1], axes = FALSE)
walk(2:3, ~ lines_surv(km_born[[.x]], cols[.x]))
abline(v = 365.25, lty = 2)
legend("topleft", legend = c("entered UK > 2 years before",
                             "entered UK < 2 years before",
                             "UK born"), lwd = 2, lty = 1, col = cols,
       box.col = "white", bg = "white")
axis(1); axis(2)

# 3. Kaplan-Meier fits by number of social risk factors:
groups <- 0:3
km_born <- groups %>%
  map(~ survfit(Surv(time, status) ~ 1, data = bmg[bmg[["srf"]] == .x, ])) %>% 
  setNames(groups)
cols <- c("#fed98e", "#fe9929", "#d95f0e", "#993404")
plot_surv(km_born[[1]], cols[1])
walk(2:4, ~ lines_surv(km_born[[.x]], cols[.x]))
abline(v = 365.25, lty = 2)
legend("topleft", legend = paste(groups, "SRF"),
       lwd = 2, lty = 1, col = cols, bty = "n")

# 4. Kaplan-Meier fits by lineage:
groups <- 1:4
km_lineage <- groups %>%
  map(~ survfit(Surv(time, status) ~ 1, data = bmg[bmg[["lineage"]] == .x, ])) %>% 
  setNames(groups)
cols <- c("#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3")
plot_surv(km_lineage[[1]], cols[1])
walk(2:4, ~ lines_surv(km_lineage[[.x]], cols[.x]))
abline(v = 365.25, lty = 2)
legend("topleft", legend = paste("lineage", groups),
       lwd = 2, lty = 1, col = cols, bty = "n")

par(opar)

With Tim’s data

tim_data <- "for_kaplanmeier.csv" %>%
  read.csv() %>% 
  as_tibble() %>% 
  select(cluster_number, firstsample, imstat, srf, SR, hmp, nfa, drugs, etoh, pul) %>% 
  mutate_at("firstsample", dmy)
cluster_size <- tim_data %>% 
  group_by(cluster_number) %>% 
  tally()
cluster_start <- tim_data %>% 
  group_by(cluster_number) %>% 
  summarise(clusterstart = min(firstsample))

Estimates and tests

At least one social risk factor at 1 and 2 years:

map_dfr(1:2,
        ~ 1 - unlist(summary(survfit(Surv(time, status) ~ 1,
                                     data = filter(bmg, ! no_other_factor)),
                             times = .x * 365.25)[c("lower", "surv", "upper")]))
## # A tibble: 2 × 3
##   lower  surv upper
##   <dbl> <dbl> <dbl>
## 1 0.469 0.395 0.311
## 2 0.566 0.490 0.402

UK-born at 1 year:

1 - unlist(summary(survfit(Surv(time, status) ~ 1, data = filter(bmg, born == "uk")),
                   times = 365.25)[c("lower", "surv", "upper")])
##     lower      surv     upper 
## 0.3818547 0.3387870 0.2927186

Alcohol and drug at 1 year:

map_dfr(c("alcohol", "drugs"),
        ~ 1 - unlist(summary(survfit(Surv(time, status) ~ 1, data = bmg[bmg[[.x]], ]),
                             times = 365.25)[c("lower", "surv", "upper")]))
## # A tibble: 2 × 3
##   lower  surv upper
##   <dbl> <dbl> <dbl>
## 1 0.659 0.524 0.336
## 2 0.546 0.452 0.339

Number of social risk factors:

map_dfr(0:3, ~ 1 - unlist(summary(survfit(Surv(time, status) ~ 1,
                                          data = filter(bmg, srf == .x)),
                   times = 365.25)[c("lower", "surv", "upper")]))
## # A tibble: 4 × 3
##   lower  surv upper
##   <dbl> <dbl> <dbl>
## 1 0.161 0.143 0.125
## 2 0.412 0.319 0.212
## 3 0.554 0.423 0.253
## 4 0.871 0.721 0.396

Corresponding log-rank p-values:

survdiff(Surv(time, status) ~ as.factor(srf), data = filter(bmg, srf < 4))
## Call:
## survdiff(formula = Surv(time, status) ~ as.factor(srf), data = filter(bmg, 
##     srf < 4))
## 
##                     N Observed Expected (O-E)^2/E (O-E)^2/V
## as.factor(srf)=0 1500      291   339.13      6.83      93.4
## as.factor(srf)=1   86       38    16.76     26.92      28.3
## as.factor(srf)=2   43       22     7.65     26.93      27.6
## as.factor(srf)=3   20       15     2.46     64.03      64.6
## 
##  Chisq= 125  on 3 degrees of freedom, p= <2e-16

Lineage:

map_dfr(1:4, ~ 1 - unlist(summary(survfit(Surv(time, status) ~ 1,
                                          data = filter(bmg, lineage == .x)),
                   times = 365.25)[c("lower", "surv", "upper")]))
## # A tibble: 4 × 3
##    lower   surv  upper
##    <dbl>  <dbl>  <dbl>
## 1 0.0829 0.0529 0.0220
## 2 0.141  0.0835 0.0223
## 3 0.160  0.135  0.109 
## 4 0.280  0.247  0.212

Corresponding log-rank p-values:

survdiff(Surv(time, status) ~ as.factor(lineage),
         data = filter(bmg, ! is.na(lineage)))
## Call:
## survdiff(formula = Surv(time, status) ~ as.factor(lineage), data = filter(bmg, 
##     !is.na(lineage)))
## 
##                        N Observed Expected (O-E)^2/E (O-E)^2/V
## as.factor(lineage)=1 214       18     52.8     22.89     26.77
## as.factor(lineage)=2  86       12     20.2      3.33      3.53
## as.factor(lineage)=3 703      138    161.9      3.54      6.34
## as.factor(lineage)=4 649      199    132.1     33.88     53.07
## 
##  Chisq= 63.8  on 3 degrees of freedom, p= 9e-14

Log-rank p-values testing gender differences:

survdiff(Surv(time, status) ~ sex, data = bmg)
## Call:
## survdiff(formula = Surv(time, status) ~ sex, data = bmg)
## 
##              N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=female 695      141      157      1.71      2.99
## sex=male   958      227      211      1.28      2.99
## 
##  Chisq= 3  on 1 degrees of freedom, p= 0.08

Cox proportional hazard model

m <- coxph(Surv(time, status) ~ sex + born + pul + prison + alcohol + homeless + drugs + lineage + pul, bmg_w2y_tim)

summary(m)
## Call:
## coxph(formula = Surv(time, status) ~ sex + born + pul + prison + 
##     alcohol + homeless + drugs + lineage + pul, data = bmg_w2y_tim)
## 
##   n= 1265, number of events= 305 
##    (8 observations deleted due to missingness)
## 
##                  coef exp(coef) se(coef)      z Pr(>|z|)    
## sexmale       0.12300   1.13088  0.12155  1.012   0.3116    
## bornos2      -0.47166   0.62396  0.23599 -1.999   0.0456 *  
## bornuk        0.98192   2.66957  0.12871  7.629 2.36e-14 ***
## pulTRUE       0.95373   2.59537  0.15684  6.081 1.19e-09 ***
## prisonTRUE    0.03711   1.03780  0.23724  0.156   0.8757    
## alcoholTRUE   0.56012   1.75088  0.24897  2.250   0.0245 *  
## homelessTRUE  0.09197   1.09633  0.29072  0.316   0.7517    
## drugsTRUE     0.50799   1.66195  0.20631  2.462   0.0138 *  
## lineage       0.32986   1.39077  0.07287  4.527 5.98e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##              exp(coef) exp(-coef) lower .95 upper .95
## sexmale          1.131     0.8843    0.8911    1.4351
## bornos2          0.624     1.6027    0.3929    0.9909
## bornuk           2.670     0.3746    2.0744    3.4355
## pulTRUE          2.595     0.3853    1.9085    3.5294
## prisonTRUE       1.038     0.9636    0.6519    1.6522
## alcoholTRUE      1.751     0.5711    1.0748    2.8522
## homelessTRUE     1.096     0.9121    0.6201    1.9382
## drugsTRUE        1.662     0.6017    1.1092    2.4902
## lineage          1.391     0.7190    1.2057    1.6043
## 
## Concordance= 0.741  (se = 0.013 )
## Likelihood ratio test= 248.3  on 9 df,   p=<2e-16
## Wald test            = 247.1  on 9 df,   p=<2e-16
## Score (logrank) test = 313.2  on 9 df,   p=<2e-16
summary(coxph(Surv(time, status) ~ sex + born + pul + prison + alcohol + homeless + drugs + lineage + pul, bmg_w2y_tim))
## Call:
## coxph(formula = Surv(time, status) ~ sex + born + pul + prison + 
##     alcohol + homeless + drugs + lineage + pul, data = bmg_w2y_tim)
## 
##   n= 1265, number of events= 305 
##    (8 observations deleted due to missingness)
## 
##                  coef exp(coef) se(coef)      z Pr(>|z|)    
## sexmale       0.12300   1.13088  0.12155  1.012   0.3116    
## bornos2      -0.47166   0.62396  0.23599 -1.999   0.0456 *  
## bornuk        0.98192   2.66957  0.12871  7.629 2.36e-14 ***
## pulTRUE       0.95373   2.59537  0.15684  6.081 1.19e-09 ***
## prisonTRUE    0.03711   1.03780  0.23724  0.156   0.8757    
## alcoholTRUE   0.56012   1.75088  0.24897  2.250   0.0245 *  
## homelessTRUE  0.09197   1.09633  0.29072  0.316   0.7517    
## drugsTRUE     0.50799   1.66195  0.20631  2.462   0.0138 *  
## lineage       0.32986   1.39077  0.07287  4.527 5.98e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##              exp(coef) exp(-coef) lower .95 upper .95
## sexmale          1.131     0.8843    0.8911    1.4351
## bornos2          0.624     1.6027    0.3929    0.9909
## bornuk           2.670     0.3746    2.0744    3.4355
## pulTRUE          2.595     0.3853    1.9085    3.5294
## prisonTRUE       1.038     0.9636    0.6519    1.6522
## alcoholTRUE      1.751     0.5711    1.0748    2.8522
## homelessTRUE     1.096     0.9121    0.6201    1.9382
## drugsTRUE        1.662     0.6017    1.1092    2.4902
## lineage          1.391     0.7190    1.2057    1.6043
## 
## Concordance= 0.741  (se = 0.013 )
## Likelihood ratio test= 248.3  on 9 df,   p=<2e-16
## Wald test            = 247.1  on 9 df,   p=<2e-16
## Score (logrank) test = 313.2  on 9 df,   p=<2e-16
summary(coxph(Surv(time, status) ~ sex + born + pul + prison + alcohol + homeless + drugs + lineage + pul + born:lineage, bmg_w2y_tim))
## Call:
## coxph(formula = Surv(time, status) ~ sex + born + pul + prison + 
##     alcohol + homeless + drugs + lineage + pul + born:lineage, 
##     data = bmg_w2y_tim)
## 
##   n= 1265, number of events= 305 
##    (8 observations deleted due to missingness)
## 
##                     coef exp(coef) se(coef)      z Pr(>|z|)    
## sexmale          0.12051   1.12807  0.12164  0.991  0.32185    
## bornos2         -1.47934   0.22779  1.13597 -1.302  0.19282    
## bornuk           1.03949   2.82778  0.51818  2.006  0.04485 *  
## pulTRUE          0.94794   2.58038  0.15697  6.039 1.55e-09 ***
## prisonTRUE       0.03445   1.03505  0.23756  0.145  0.88469    
## alcoholTRUE      0.56120   1.75278  0.24904  2.253  0.02423 *  
## homelessTRUE     0.08904   1.09313  0.29169  0.305  0.76017    
## drugsTRUE        0.51174   1.66819  0.20742  2.467  0.01362 *  
## lineage          0.31709   1.37312  0.10244  3.095  0.00197 ** 
## bornos2:lineage  0.29761   1.34663  0.32217  0.924  0.35561    
## bornuk:lineage  -0.01523   0.98488  0.14979 -0.102  0.91899    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## sexmale            1.1281     0.8865   0.88878     1.432
## bornos2            0.2278     4.3901   0.02458     2.111
## bornuk             2.8278     0.3536   1.02418     7.808
## pulTRUE            2.5804     0.3875   1.89702     3.510
## prisonTRUE         1.0351     0.9661   0.64976     1.649
## alcoholTRUE        1.7528     0.5705   1.07583     2.856
## homelessTRUE       1.0931     0.9148   0.61714     1.936
## drugsTRUE          1.6682     0.5995   1.11093     2.505
## lineage            1.3731     0.7283   1.12335     1.678
## bornos2:lineage    1.3466     0.7426   0.71617     2.532
## bornuk:lineage     0.9849     1.0154   0.73432     1.321
## 
## Concordance= 0.74  (se = 0.014 )
## Likelihood ratio test= 249.3  on 11 df,   p=<2e-16
## Wald test            = 243.3  on 11 df,   p=<2e-16
## Score (logrank) test = 321.8  on 11 df,   p=<2e-16
summary(coxph(Surv(time, status) ~ sex + born + pul + srf + as.factor(lineage), bmg_w2y_tim))
## Call:
## coxph(formula = Surv(time, status) ~ sex + born + pul + srf + 
##     as.factor(lineage), data = bmg_w2y_tim)
## 
##   n= 1265, number of events= 305 
##    (8 observations deleted due to missingness)
## 
##                         coef exp(coef) se(coef)      z Pr(>|z|)    
## sexmale              0.10435   1.10999  0.12173  0.857 0.391333    
## bornos2             -0.48404   0.61629  0.23627 -2.049 0.040496 *  
## bornuk               1.01023   2.74622  0.12751  7.923 2.32e-15 ***
## pulTRUE              0.95834   2.60737  0.15679  6.112 9.82e-10 ***
## srf                  0.30627   1.35834  0.07951  3.852 0.000117 ***
## as.factor(lineage)2  0.53082   1.70033  0.40935  1.297 0.194718    
## as.factor(lineage)3  0.89681   2.45176  0.27382  3.275 0.001056 ** 
## as.factor(lineage)4  1.13194   3.10168  0.27294  4.147 3.36e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                     exp(coef) exp(-coef) lower .95 upper .95
## sexmale                1.1100     0.9009    0.8744    1.4091
## bornos2                0.6163     1.6226    0.3879    0.9793
## bornuk                 2.7462     0.3641    2.1389    3.5259
## pulTRUE                2.6074     0.3835    1.9175    3.5454
## srf                    1.3583     0.7362    1.1623    1.5874
## as.factor(lineage)2    1.7003     0.5881    0.7622    3.7929
## as.factor(lineage)3    2.4518     0.4079    1.4335    4.1933
## as.factor(lineage)4    3.1017     0.3224    1.8167    5.2957
## 
## Concordance= 0.739  (se = 0.014 )
## Likelihood ratio test= 246.6  on 8 df,   p=<2e-16
## Wald test            = 239.2  on 8 df,   p=<2e-16
## Score (logrank) test = 298.6  on 8 df,   p=<2e-16
summary(coxph(Surv(time, status) ~ sex + born + pul + srf + as.factor(lineage) + born:as.factor(lineage), bmg_w2y_tim))
## Call:
## coxph(formula = Surv(time, status) ~ sex + born + pul + srf + 
##     as.factor(lineage) + born:as.factor(lineage), data = bmg_w2y_tim)
## 
##   n= 1265, number of events= 305 
##    (8 observations deleted due to missingness)
## 
##                                   coef  exp(coef)   se(coef)      z Pr(>|z|)
## sexmale                      9.589e-02  1.101e+00  1.222e-01  0.785 0.432679
## bornos2                     -1.488e+01  3.447e-07  1.224e+03 -0.012 0.990301
## bornuk                       5.154e-01  1.674e+00  5.848e-01  0.881 0.378132
## pulTRUE                      9.483e-01  2.581e+00  1.573e-01  6.029 1.64e-09
## srf                          3.078e-01  1.360e+00  7.955e-02  3.869 0.000109
## as.factor(lineage)2         -1.095e-01  8.962e-01  6.517e-01 -0.168 0.866522
## as.factor(lineage)3          6.642e-01  1.943e+00  3.275e-01  2.028 0.042519
## as.factor(lineage)4          9.071e-01  2.477e+00  3.324e-01  2.729 0.006348
## bornos2:as.factor(lineage)2  1.508e+01  3.525e+06  1.224e+03  0.012 0.990174
## bornuk:as.factor(lineage)2   1.162e+00  3.195e+00  9.368e-01  1.240 0.214958
## bornos2:as.factor(lineage)3  1.420e+01  1.466e+06  1.224e+03  0.012 0.990746
## bornuk:as.factor(lineage)3   5.457e-01  1.726e+00  6.130e-01  0.890 0.373388
## bornos2:as.factor(lineage)4  1.459e+01  2.176e+06  1.224e+03  0.012 0.990488
## bornuk:as.factor(lineage)4   4.753e-01  1.609e+00  6.094e-01  0.780 0.435369
##                                
## sexmale                        
## bornos2                        
## bornuk                         
## pulTRUE                     ***
## srf                         ***
## as.factor(lineage)2            
## as.factor(lineage)3         *  
## as.factor(lineage)4         ** 
## bornos2:as.factor(lineage)2    
## bornuk:as.factor(lineage)2     
## bornos2:as.factor(lineage)3    
## bornuk:as.factor(lineage)3     
## bornos2:as.factor(lineage)4    
## bornuk:as.factor(lineage)4     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                             exp(coef) exp(-coef) lower .95 upper .95
## sexmale                     1.101e+00  9.086e-01    0.8662     1.399
## bornos2                     3.447e-07  2.901e+06    0.0000       Inf
## bornuk                      1.674e+00  5.973e-01    0.5322     5.267
## pulTRUE                     2.581e+00  3.874e-01    1.8966     3.513
## srf                         1.360e+00  7.351e-01    1.1640     1.590
## as.factor(lineage)2         8.962e-01  1.116e+00    0.2498     3.215
## as.factor(lineage)3         1.943e+00  5.147e-01    1.0227     3.691
## as.factor(lineage)4         2.477e+00  4.037e-01    1.2913     4.752
## bornos2:as.factor(lineage)2 3.525e+06  2.837e-07    0.0000       Inf
## bornuk:as.factor(lineage)2  3.195e+00  3.129e-01    0.5094    20.044
## bornos2:as.factor(lineage)3 1.466e+06  6.820e-07    0.0000       Inf
## bornuk:as.factor(lineage)3  1.726e+00  5.795e-01    0.5190     5.738
## bornos2:as.factor(lineage)4 2.176e+06  4.595e-07    0.0000       Inf
## bornuk:as.factor(lineage)4  1.609e+00  6.217e-01    0.4872     5.310
## 
## Concordance= 0.742  (se = 0.013 )
## Likelihood ratio test= 251.3  on 14 df,   p=<2e-16
## Wald test            = 234.3  on 14 df,   p=<2e-16
## Score (logrank) test = 311.2  on 14 df,   p=<2e-16