Data for good for Vietnam

Packages
Downloading the data
- Data for good
- Maps from GADM
Removing whatever occurs only outside Vietnam
Fixing the grid
Looking at temporal trends
Identifying the communes from Vinh Phuc
A refinement
Plotting the links

Packages

library(magrittr)
library(readr)
library(sf)
library(purrr)
library(lubridate)
library(dplyr)

Downloading the data

Data for good

Let’s download the daily movement file (291.5 MB, takes about 1’20’’ to download):

download.file("URL here", "tet_daily_mvt.zip")

Unzip it (takes less than 10’’ and it produces a CSV file of 1.8 GB):

unzip("tet_daily_mvt.zip")

Load it (takes about 30’’ and 242.3 MB in memory):

mvts <- "Vietnam_Disease_Prevention_Map_Tet_Holiday_Dec_17_2019_Daily_Movement.csv" %>%
  read_csv(col_types = "Tccdddddddd") %>% 
  select(-z_score, -n_baseline, -n_difference) %>% 
  filter(! is.na(n_crisis)) %>% 
  mutate(n_crisis      = as.integer(n_crisis),
         utc_date_time = with_tz(utc_date_time, "Asia/Ho_Chi_Minh")) %>% 
  rename(date_time = utc_date_time)

Note that we remove the variables that we don’t need here (z_score, n_baseline and n_difference) and get rid off the records with less then 10 cases (i.e. n_crisis == NA). Let’s now remove Xã from communes names (takes less than 10’’):

mvts %<>% mutate_at(vars(ends_with("name")), sub, pattern = "Xã *", replacement = "")

Let’s have a look:

mvts

## # A tibble: 4,225,309 x 8
##    date_time           start_name end_name      start_lon start_lat end_lon end_lat n_crisis
##    <dttm>              <chr>      <chr>             <dbl>     <dbl>   <dbl>   <dbl>    <int>
##  1 2019-12-16 07:00:00 A Phon     A Phon             104.      14.5    104.    14.5      386
##  2 2019-12-16 07:00:00 A Phon     Charat             104.      14.5    104.    14.4       25
##  3 2019-12-16 07:00:00 A Phon     Nok Mueang         104.      14.5    103.    14.9       11
##  4 2019-12-16 07:00:00 A Phon     Sadao              104.      14.5    104.    14.6      126
##  5 2019-12-16 07:00:00 A Phon     Sangkha            104.      14.5    104.    14.6       13
##  6 2019-12-16 07:00:00 Akat       Akat               104.      17.6    104.    17.6     1292
##  7 2019-12-16 07:00:00 Akat       Lao Phatthana      104.      17.6    104.    17.6       12
##  8 2019-12-16 07:00:00 Akat       Na Duea            104.      17.6    104.    17.6       17
##  9 2019-12-16 07:00:00 Akat       Na Wa              104.      17.5    104.    17.5       57
## 10 2019-12-16 07:00:00 Akat       Phon Phaeng        104.      17.6    104.    17.5       17
## # … with 4,225,299 more rows

Three points to note:

utc_date_time refers to the end time;
start_name and end_name are communes names;
start_lon, start_lat, end_lon and end_lat are coordinates of (the centers of?) pixels.

Maps from GADM

Let’s now download some maps of Vietnam from GADM (takes about 40’’ for a total of 7.2 MB):

download.file("https://biogeo.ucdavis.edu/data/gadm3.6/Rsf/gadm36_VNM_0_sf.rds", 
              "gadm36_VNM_0_sf.rds")
download.file("https://biogeo.ucdavis.edu/data/gadm3.6/Rsf/gadm36_VNM_1_sf.rds", 
              "gadm36_VNM_1_sf.rds")
download.file("https://biogeo.ucdavis.edu/data/gadm3.6/Rsf/gadm36_VNM_2_sf.rds", 
              "gadm36_VNM_2_sf.rds")
download.file("https://biogeo.ucdavis.edu/data/gadm3.6/Rsf/gadm36_VNM_3_sf.rds", 
              "gadm36_VNM_3_sf.rds")

And let’s load them:

vn0 <- readRDS("gadm36_VNM_0_sf.rds") # country polygon
vn1 <- readRDS("gadm36_VNM_1_sf.rds") # provinces polygons
vn2 <- readRDS("gadm36_VNM_2_sf.rds") # districts polygons
vn3 <- readRDS("gadm36_VNM_3_sf.rds") # communes polygons

Removing whatever occurs only outside Vietnam

Let’s see the locations of the records. First let’s get the ranges of the coordinates:

(xlim <- range(range(mvts$start_lon), range(mvts$end_lon)))

## [1] 101.6455 112.0166

(ylim <- range(range(mvts$start_lat), range(mvts$end_lat)))

## [1]  8.62447 23.36242

And use these ranges to map the start records:

plot(xlim, ylim, asp = 1, xlab = "longitude", ylab = "latitude", type = "n")
maps::map(col = "grey", fill = TRUE, add = TRUE)
with(unique(select(mvts, start_lon, start_lat)), points(start_lon, start_lat, pch = ".", col = "blue"))
axis(1)
axis(2)
box(bty = "o")

And the end records:

plot(xlim, ylim, asp = 1, xlab = "longitude", ylab = "latitude", type = "n")
maps::map(col = "grey", fill = TRUE, add = TRUE)
with(unique(select(mvts, end_lon, end_lat)), points(end_lon, end_lat, pch = ".", col = "red"))
axis(1)
axis(2)
box(bty = "o")

This shows that we need to get rid off all the records that do not have either start or end in Vietnam. First we select the records that have start inside Vietnam (it takes about 30’’):

start_sel <- mvts %>% 
  st_as_sf(coords = c("start_lon", "start_lat"), crs = 4326) %>% 
  st_intersects(vn0) %>% 
  map_int(length)

Next the records that have end inside Vietnam (it takes about 30’’):

end_sel <- mvts %>% 
  st_as_sf(coords = c("end_lon", "end_lat"), crs = 4326) %>% 
  st_intersects(vn0) %>% 
  map_int(length)

And now getting rid off all the records that do not have at least start or end inside Vietnam:

mvts <- mvts[start_sel + end_sel > 0, ]

Which gives:

mvts

## # A tibble: 2,806,452 x 8
##    date_time           start_name    end_name          start_lon start_lat end_lon end_lat n_crisis
##    <dttm>              <chr>         <chr>                 <dbl>     <dbl>   <dbl>   <dbl>    <int>
##  1 2019-12-16 07:00:00 Malipo County Malipo County          105.     23.2     105.   23.2       343
##  2 2019-12-16 07:00:00 Phường 10     Phường 10              106.      9.58    106.    9.58    11492
##  3 2019-12-16 07:00:00 Phường 10     Phường 7               106.      9.58    107.   10.8        41
##  4 2019-12-16 07:00:00 Phường 10     Phường Hưng Thạnh      106.      9.58    106.   10.0       108
##  5 2019-12-16 07:00:00 Phường 10     An Thạnh Tây           106.      9.58    106.    9.67       34
##  6 2019-12-16 07:00:00 Phường 10     Hòa Tú 2               106.      9.58    106.    9.41       18
##  7 2019-12-16 07:00:00 Phường 10     Hưng Phú               106.      9.58    106.    9.67       33
##  8 2019-12-16 07:00:00 Phường 10     Kế Thành               106.      9.58    106.    9.75       31
##  9 2019-12-16 07:00:00 Phường 10     Liêu Tú                106.      9.58    106.    9.49       19
## 10 2019-12-16 07:00:00 Phường 10     Long Phú               106.      9.58    106.    9.58       38
## # … with 2,806,442 more rows

And, visually:

(xlim <- range(range(mvts$start_lon), range(mvts$end_lon)))

## [1] 102.2016 109.3799

(ylim <- range(range(mvts$start_lat), range(mvts$end_lat)))

## [1]  8.62447 23.36242

And use these ranges to map the start records:

plot(xlim, ylim, asp = 1, xlab = "longitude", ylab = "latitude", type = "n")
maps::map(col = "grey", fill = TRUE, add = TRUE)
with(unique(select(mvts, start_lon, start_lat)), points(start_lon, start_lat, pch = ".", col = "blue"))
axis(1)
axis(2)
box(bty = "o")

And the end records:

plot(xlim, ylim, asp = 1, xlab = "longitude", ylab = "latitude", type = "n")
maps::map(col = "grey", fill = TRUE, add = TRUE)
with(unique(select(mvts, end_lon, end_lat)), points(end_lon, end_lat, pch = ".", col = "red"))
axis(1)
axis(2)
box(bty = "o")

Much better!

Fixing the grid

The grid resolution seems to be 0.1 degree. Phil: is it correct?

Problem that we see a bit from the maps above is that the pixel coordinates do not seem to perfectly match a regular grid. Any idea of the reason of that? Let’s try here to fix the grid.

First, let’s identify a large enough zone where the grid seems OK:

x1 <- 105
x2 <- 106.5
y1 <- 20.5
y2 <- 21.5

Let’s for example visualize it on the start points:

plot(xlim, ylim, asp = 1, xlab = "longitude", ylab = "latitude", type = "n")
maps::map(col = "grey", fill = TRUE, add = TRUE)
with(unique(select(mvts, start_lon, start_lat)), points(start_lon, start_lat, pch = ".", col = "blue"))
polygon(c(x1, x2, x2, x1), c(y1, y1, y2, y2), border = "red")
axis(1)
axis(2)
box(bty = "o")

And let’s zoom it:

mvts %>% 
  filter(between(start_lon, 105, 106.5), between(start_lat, 20.5, 21.5)) %$%
  plot(start_lon, start_lat, pch = 3, col = "blue", xlab = "longitude", ylab = "latitute")
box(bty = "o")

Let’s look at the errors:

mvts %>% 
  mutate(difference = start_lon - round(start_lon, 2)) %>% 
  pull(difference) %>% 
  hist(n = 100, col = "grey", main = NA)

I’m not quite sure about what I’m doing wrong here… TO BE CONTINUED.

Looking at temporal trends

Almost 2 months of data:

range(mvts$date_time)

## [1] "2019-12-16 07:00:00 +07" "2020-02-12 23:00:00 +07"

Visually:

first_day <- seq(ymd("2019-11-01"), ymd("2020-05-01"), "month")
plot(first_day, rep(1, length(first_day)), axes = FALSE, ann = FALSE, type = "n", ylim = 0:1)
axis(1, first_day, paste(month.abb[month(first_day)], sub("^\\d\\d", "", year(first_day))))
fb_days <- as.numeric(as.Date(mvts$date_time))
x <- range(fb_days)
polygon(c(x, rev(x)), c(-1, -1, 2, 2), col = "pink", border = NA)
box(bty = "o")
abline(v = first_day, col = "grey", lwd = 2)

Another option:

col <- c("7" = "#e41a1c", "15" = "#377eb8", "23" = "#4daf4a")

mvts2 <- mvts %>% 
  group_by(date_time) %>% 
  tally() %>% 
  arrange(date_time)

with(mvts2, {
    plot(date_time, n, type = "l", xlab = NA, ylab = "number of people",
         ylim = c(0, 22000), axes = FALSE)
    points(date_time, n, col = col[as.character(hour(date_time))], pch = 19)
  })

with(filter(mvts2, wday(date_time) < 2), points(date_time, n, col = col[as.character(hour(date_time))], cex = 2))
  
ats <- seq.POSIXt(ymd_hms("2020-01-01 00:00:00"), ymd_hms("2020-02-01 00:00:00"), "month")
axis(1, ats, month.abb[month(ats)])
axis(2)
box(bty = "o")

d1 <- ymd_hms("2020-01-23 00:00:00")
d2 <- ymd_hms("2020-01-29 23:59:59")
polygon(c(d1, d2, d2, d1), c(-1000, -1000, 25000, 25000), col = adjustcolor("red", .1), border = NA)

opar <- par(plt = c(.2, .5, .2, .5), new = TRUE)
inside_width <- .5
pie(c(3, 3, 3), NA, clockwise = TRUE, init.angle = 285, col = col)
plotrix::draw.circle(0, 0, inside_width, 200, col = "white") %>% 
  data.frame() %>% 
  filter(y < 0) %>% 
  with(polygon(x, y, col = "black"))

par(opar)

Identifying the communes from Vinh Phuc

Let’s retrieve the names of the communes of Vinh Phuc

vp_com <- vn3 %>%
  filter(NAME_1 == "Vĩnh Phúc") %>% 
  pull(NAME_3) %>% 
  sort()

which is:

vp_com

##   [1] "An Hòa"       "An Tường"     "Bá Hiến"      "Bắc Bình"     "Bạch Lưu"     "Bàn Giản"     "Bình Định"    "Bình Dương"  
##   [9] "Bồ Lý"        "Bồ Sao"       "Cao Đại"      "Cao Minh"     "Cao Phong"    "Chấn Hưng"    "Đại Đình"     "Đại Đồng"    
##  [17] "Đại Tự"       "Đạo Đức"      "Đạo Trù"      "Đạo Tú"       "Đình Chu"     "Định Trung"   "Đôn Nhân"     "Đồng Cương"  
##  [25] "Đống Đa"      "Đồng Ích"     "Đồng Quế"     "Đồng Tâm"     "Đồng Thịnh"   "Đồng Tĩnh"    "Đồng Văn"     "Đồng Xuân"   
##  [33] "Đức Bác"      "Duy Phiên"    "Gia Khánh"    "Hải Lựu"      "Hồ Sơn"       "Hoa Sơn"      "Hoàng Đan"    "Hoàng Hoa"   
##  [41] "Hoàng Lâu"    "Hội Hợp"      "Hồng Châu"    "Hồng Phương"  "Hợp Châu"     "Hợp Hòa"      "Hợp Lý"       "Hợp Thịnh"   
##  [49] "Hùng Vương"   "Hương Canh"   "Hướng Đạo"    "Hương Sơn"    "Khai Quang"   "Kim Long"     "Kim Xá"       "Lãng Công"   
##  [57] "Lập Thạch"    "Liên Bảo"     "Liên Châu"    "Liên Hòa"     "Liễn Sơn"     "Lũng Hoà"     "Lý Nhân"      "Minh Quang"  
##  [65] "Nam Viêm"     "Nghĩa Hưng"   "Ngô Quyền"    "Ngọc Mỹ"      "Ngọc Thanh"   "Ngũ Kiên"     "Nguyệt Đức"   "Nhân Đạo"    
##  [73] "Nhạo Sơn"     "Như Thụy"     "Phú Đa"       "Phú Thịnh"    "Phú Xuân"     "Phúc Thắng"   "Phương Khoan" "Quang Sơn"   
##  [81] "Quang Yên"    "Quất Lưu"     "Sơn Đông"     "Sơn Lôi"      "Tam Đảo"      "Tam Hồng"     "Tam Hợp"      "Tam Phúc"    
##  [89] "Tam Quan"     "Tam Sơn"      "Tân Cương"    "Tân Lập"      "Tân Phong"    "Tân Tiến"     "Tề Lỗ"        "Thái Hòa"    
##  [97] "Thanh Lãng"   "Thanh Trù"    "Thanh Vân"    "Thiện Kế"     "Thổ Tang"     "Thượng Trưng" "Tích Sơn"     "Tiền Châu"   
## [105] "Tiên Lữ"      "Triệu Đề"     "Trung Hà"     "Trung Kiên"   "Trung Mỹ"     "Trung Nguyên" "Trưng Nhị"    "Trưng Trắc"  
## [113] "Tử Du"        "Tứ Trưng"     "Tứ Yên"       "Tuân Chính"   "Vân Hội"      "Văn Quán"     "Văn Tiến"     "Vân Trục"    
## [121] "Vân Xuân"     "Việt Xuân"    "Vĩnh Ninh"    "Vĩnh Sơn"     "Vĩnh Thịnh"   "Vĩnh Tường"   "Vũ Di"        "Xuân Hoà"    
## [129] "Xuân Hòa"     "Xuân Lôi"     "Yên Bình"     "Yên Đồng"     "Yên Dương"    "Yên Lạc"      "Yên Lập"      "Yên Phương"  
## [137] "Yên Thạch"

Let’s now filter the records that start from Vinh Phuc:

from_vp <- filter(mvts, start_name %in% vp_com)

which is 95090 of them and the records that end up in Vinh Phuc:

to_vp <- filter(mvts, end_name %in% vp_com)

which is 95060 of them. Let’s check that every thing looks corrects:

vp <- filter(vn1, VARNAME_1 == "Vinh Phuc")

plot(st_geometry(vp), col = "grey")
from_vp %>% 
  unique() %>% 
  st_as_sf(coords = c("start_lon", "start_lat"), crs = 4326) %>% 
  st_geometry() %>% 
  plot(add = TRUE, pch = 3, col = "red")

And:

plot(st_geometry(vp), col = "grey")
to_vp %>% 
  unique() %>% 
  st_as_sf(coords = c("end_lon", "end_lat"), crs = 4326) %>% 
  st_geometry() %>% 
  plot(add = TRUE, pch = 3, col = "blue")

Everything looks OK! But:

plot(st_geometry(vn1), col = "grey")
from_vp %>% 
  unique() %>% 
  st_as_sf(coords = c("start_lon", "start_lat"), crs = 4326) %>% 
  st_geometry() %>% 
  plot(add = TRUE, pch = 3, col = "red")

And:

plot(st_geometry(vn1), col = "grey")
to_vp %>% 
  unique() %>% 
  st_as_sf(coords = c("end_lon", "end_lat"), crs = 4326) %>% 
  st_geometry() %>% 
  plot(add = TRUE, pch = 3, col = "blue")

We cannot rely on the communes’s names and need to refine a bit…

A refinement

from_vp %<>% 
  st_as_sf(coords = c("start_lon", "start_lat"), crs = 4326) %>% 
  st_intersects(vp) %>% 
  map_lgl(~ .x %>% length() %>% as.logical()) %>% 
  which() %>% 
  slice(from_vp, .)

And:

to_vp %<>% 
  st_as_sf(coords = c("end_lon", "end_lat"), crs = 4326) %>% 
  st_intersects(vp) %>% 
  map_lgl(~ .x %>% length() %>% as.logical()) %>% 
  which() %>% 
  slice(to_vp, .)

Let’s check:

plot(st_geometry(vn1), col = "grey")
from_vp %>% 
  unique() %>% 
  st_as_sf(coords = c("start_lon", "start_lat"), crs = 4326) %>% 
  st_geometry() %>% 
  plot(add = TRUE, pch = 3, col = "red")

And:

plot(st_geometry(vn1), col = "grey")
to_vp %>% 
  unique() %>% 
  st_as_sf(coords = c("end_lon", "end_lat"), crs = 4326) %>% 
  st_geometry() %>% 
  plot(add = TRUE, pch = 3, col = "blue")

Now it looks good!

Plotting the links

plot(st_geometry(vn1), col = "grey")
with(from_vp, segments(start_lon, start_lat, end_lon, end_lat, col = "red"))

plot(st_geometry(vn1), col = "grey")
with(to_vp, segments(start_lon, start_lat, end_lon, end_lat, col = "blue"))

Let’s look another scale:

vn1 %>% 
  filter(VARNAME_1 %in% c("Ha Giang", "Ha Nam")) %>% 
  st_geometry() %>% 
  plot(border = "white")
plot(st_geometry(vn1), col = "grey", add = TRUE)
with(from_vp, segments(start_lon, start_lat, end_lon, end_lat, col = "red"))
box(bty = "o")

and:

vn1 %>% 
  filter(VARNAME_1 %in% c("Ha Giang", "Ha Nam")) %>% 
  st_geometry() %>% 
  plot(border = "white")
plot(st_geometry(vn1), col = "grey", add = TRUE)
with(to_vp, segments(start_lon, start_lat, end_lon, end_lat, col = "blue"))
box(bty = "o")

Zooming more:

to_vp2 <- to_vp %>% 
  group_by(start_name, end_name) %>% 
  summarise(start_lon = mean(start_lon),
            start_lat = mean(start_lat),
            end_lon   = mean(end_lon),
            end_lat   = mean(end_lat),
            n_crisis  = sum(n_crisis))

x <- range(to_vp2$n_crisis)
y <- c(.005, 50)
m <- lm(y ~ x)

plot(st_geometry(vp), col = "grey")
with(to_vp2, segments(start_lon, start_lat, end_lon, end_lat, lwd = predict(m, data.frame(x = n_crisis)), col = "blue"))

Let’s focus now only on those who move: