library(magrittr)
library(readr)
library(sf)
library(purrr)
library(lubridate)
library(dplyr)
Let’s download the daily movement file (291.5 MB, takes about 1’20’’ to download):
# Download the daily-movement ZIP archive (~291.5 MB); "URL here" is a
# placeholder for the actual Facebook Data-for-Good download link.
download.file("URL here", "tet_daily_mvt.zip")
Unzip it (takes less than 10’’ and it produces a CSV file of 1.8 GB):
# Extract the ~1.8 GB CSV into the working directory.
unzip("tet_daily_mvt.zip")
Load it (takes about 30’’ and 242.3 MB in memory):
# Read the daily-movement CSV (column spec: 1 datetime, 2 character and
# 8 double columns), drop the three columns we never use, discard the
# censored records (n_crisis is NA when fewer than 10 people moved),
# store the counts as integers, and express the timestamps in Vietnam
# local time under the shorter name date_time.
mvts <- read_csv(
  "Vietnam_Disease_Prevention_Map_Tet_Holiday_Dec_17_2019_Daily_Movement.csv",
  col_types = "Tccdddddddd"
) %>%
  select(-z_score, -n_baseline, -n_difference) %>%
  filter(!is.na(n_crisis)) %>%
  mutate(n_crisis = as.integer(n_crisis),
         utc_date_time = with_tz(utc_date_time, "Asia/Ho_Chi_Minh")) %>%
  rename(date_time = utc_date_time)
Note that we remove the variables that we don't need here (z_score,
n_baseline and n_difference) and get rid of the records with fewer than
10 cases (i.e. those with n_crisis == NA). Let's now remove the "Xã"
prefix from the commune names (takes less than 10''):
# Strip the administrative prefix "Xã " from both commune-name columns.
# mutate_at() is superseded; across() is the current dplyr idiom.
mvts %<>% mutate(across(ends_with("name"), ~ sub("Xã *", "", .x)))
Let’s have a look:
# Quick look at the cleaned tibble (one row per origin commune,
# destination commune and 8-hour time slot):
mvts
## # A tibble: 4,225,309 x 8
## date_time start_name end_name start_lon start_lat end_lon end_lat n_crisis
## <dttm> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 2019-12-16 07:00:00 A Phon A Phon 104. 14.5 104. 14.5 386
## 2 2019-12-16 07:00:00 A Phon Charat 104. 14.5 104. 14.4 25
## 3 2019-12-16 07:00:00 A Phon Nok Mueang 104. 14.5 103. 14.9 11
## 4 2019-12-16 07:00:00 A Phon Sadao 104. 14.5 104. 14.6 126
## 5 2019-12-16 07:00:00 A Phon Sangkha 104. 14.5 104. 14.6 13
## 6 2019-12-16 07:00:00 Akat Akat 104. 17.6 104. 17.6 1292
## 7 2019-12-16 07:00:00 Akat Lao Phatthana 104. 17.6 104. 17.6 12
## 8 2019-12-16 07:00:00 Akat Na Duea 104. 17.6 104. 17.6 17
## 9 2019-12-16 07:00:00 Akat Na Wa 104. 17.5 104. 17.5 57
## 10 2019-12-16 07:00:00 Akat Phon Phaeng 104. 17.6 104. 17.5 17
## # … with 4,225,299 more rows
Three points to note: utc_date_time refers to the end time; start_name
and end_name are commune names; start_lon, start_lat, end_lon and
end_lat are the coordinates of (the centers of?) the pixels.
Let's now download some maps of Vietnam from GADM (takes about 40'' for a total of 7.2 MB):
# Download the four GADM administrative levels for Vietnam
# (0 = country, 1 = provinces, 2 = districts, 3 = communes),
# iterating over the file names instead of repeating the same call.
gadm_files <- sprintf("gadm36_VNM_%d_sf.rds", 0:3)
walk(gadm_files, ~ download.file(
  paste0("https://biogeo.ucdavis.edu/data/gadm3.6/Rsf/", .x), .x))
And let’s load them:
# Load the four GADM administrative levels as sf objects:
vn0 <- readRDS("gadm36_VNM_0_sf.rds") # country polygon
vn1 <- readRDS("gadm36_VNM_1_sf.rds") # provinces polygons
vn2 <- readRDS("gadm36_VNM_2_sf.rds") # districts polygons
vn3 <- readRDS("gadm36_VNM_3_sf.rds") # communes polygons
Let’s see the locations of the records. First let’s get the ranges of the coordinates:
# Longitude span across both trip end points; range() accepts several
# vectors at once, so the nested range(range(a), range(b)) is redundant.
(xlim <- range(mvts$start_lon, mvts$end_lon))
## [1] 101.6455 112.0166
# Latitude span across both trip end points (same simplification).
(ylim <- range(mvts$start_lat, mvts$end_lat))
## [1] 8.62447 23.36242
And use these ranges to map the start records:
# Blank canvas at the data's full spatial extent, grey country background
# from the maps package, then one blue dot per distinct start location.
plot(xlim, ylim, asp = 1, xlab = "longitude", ylab = "latitude", type = "n")
maps::map(col = "grey", fill = TRUE, add = TRUE)
with(unique(select(mvts, start_lon, start_lat)), points(start_lon, start_lat, pch = ".", col = "blue"))
axis(1)
axis(2)
box(bty = "o")
And the end records:
# Same map for the end locations, in red.
plot(xlim, ylim, asp = 1, xlab = "longitude", ylab = "latitude", type = "n")
maps::map(col = "grey", fill = TRUE, add = TRUE)
with(unique(select(mvts, end_lon, end_lat)), points(end_lon, end_lat, pch = ".", col = "red"))
axis(1)
axis(2)
box(bty = "o")
This shows that we need to get rid of all the records that have neither their start nor their end in Vietnam. First we select the records whose start is inside Vietnam (it takes about 30''):
# For each record, does the start point fall inside the Vietnam country
# polygon?  st_intersects() returns, per point, the (possibly empty) set
# of intersected polygons; base lengths() counts them (0 = outside,
# 1 = inside), replacing the heavier purrr::map_int(length).
start_sel <- mvts %>%
  st_as_sf(coords = c("start_lon", "start_lat"), crs = 4326) %>%
  st_intersects(vn0) %>%
  lengths()
Next the records that have end inside Vietnam (it takes about 30’’):
# Same test for the end points (0 = outside Vietnam, 1 = inside),
# using base lengths() instead of purrr::map_int(length).
end_sel <- mvts %>%
  st_as_sf(coords = c("end_lon", "end_lat"), crs = 4326) %>%
  st_intersects(vn0) %>%
  lengths()
And now let's get rid of all the records that have neither their start nor their end inside Vietnam:
# Keep only the records with at least one end point (start or end) inside
# Vietnam; start_sel and end_sel are per-record intersection counts.
mvts <- filter(mvts, start_sel + end_sel > 0)
Which gives:
# The tibble after restricting to movements touching Vietnam:
mvts
## # A tibble: 2,806,452 x 8
## date_time start_name end_name start_lon start_lat end_lon end_lat n_crisis
## <dttm> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 2019-12-16 07:00:00 Malipo County Malipo County 105. 23.2 105. 23.2 343
## 2 2019-12-16 07:00:00 Phường 10 Phường 10 106. 9.58 106. 9.58 11492
## 3 2019-12-16 07:00:00 Phường 10 Phường 7 106. 9.58 107. 10.8 41
## 4 2019-12-16 07:00:00 Phường 10 Phường Hưng Thạnh 106. 9.58 106. 10.0 108
## 5 2019-12-16 07:00:00 Phường 10 An Thạnh Tây 106. 9.58 106. 9.67 34
## 6 2019-12-16 07:00:00 Phường 10 Hòa Tú 2 106. 9.58 106. 9.41 18
## 7 2019-12-16 07:00:00 Phường 10 Hưng Phú 106. 9.58 106. 9.67 33
## 8 2019-12-16 07:00:00 Phường 10 Kế Thành 106. 9.58 106. 9.75 31
## 9 2019-12-16 07:00:00 Phường 10 Liêu Tú 106. 9.58 106. 9.49 19
## 10 2019-12-16 07:00:00 Phường 10 Long Phú 106. 9.58 106. 9.58 38
## # … with 2,806,442 more rows
And, visually:
# Recompute the longitude span after the filtering; a single range() call
# on both vectors replaces the redundant nested range(range(), range()).
(xlim <- range(mvts$start_lon, mvts$end_lon))
## [1] 102.2016 109.3799
# Recompute the latitude span after the filtering (same simplification).
(ylim <- range(mvts$start_lat, mvts$end_lat))
## [1] 8.62447 23.36242
And use these ranges to map the start records:
# Start locations again, now restricted to movements touching Vietnam.
plot(xlim, ylim, asp = 1, xlab = "longitude", ylab = "latitude", type = "n")
maps::map(col = "grey", fill = TRUE, add = TRUE)
with(unique(select(mvts, start_lon, start_lat)), points(start_lon, start_lat, pch = ".", col = "blue"))
axis(1)
axis(2)
box(bty = "o")
And the end records:
# End locations, same restriction, in red.
plot(xlim, ylim, asp = 1, xlab = "longitude", ylab = "latitude", type = "n")
maps::map(col = "grey", fill = TRUE, add = TRUE)
with(unique(select(mvts, end_lon, end_lat)), points(end_lon, end_lat, pch = ".", col = "red"))
axis(1)
axis(2)
box(bty = "o")
Much better!
The grid resolution seems to be 0.1 degree. Phil: is it correct?
A problem that we can see from the maps above is that the pixel coordinates do not seem to match a regular grid perfectly. Any idea of the reason for that? Let's try here to fix the grid.
First, let’s identify a large enough zone where the grid seems OK:
# Corners of a rectangular calibration zone (Red River delta area)
# where the pixel grid looks regular:
x1 <- 105
x2 <- 106.5
y1 <- 20.5
y2 <- 21.5
Let’s for example visualize it on the start points:
# Show the calibration rectangle (red outline) on top of the start points.
plot(xlim, ylim, asp = 1, xlab = "longitude", ylab = "latitude", type = "n")
maps::map(col = "grey", fill = TRUE, add = TRUE)
with(unique(select(mvts, start_lon, start_lat)), points(start_lon, start_lat, pch = ".", col = "blue"))
polygon(c(x1, x2, x2, x1), c(y1, y1, y2, y2), border = "red")
axis(1)
axis(2)
box(bty = "o")
And let’s zoom it:
# Zoom on the calibration zone, reusing the x1/x2/y1/y2 bounds defined
# above instead of re-hard-coding the same values (also fixes the
# "latitute" typo in the axis label).
mvts %>%
  filter(between(start_lon, x1, x2), between(start_lat, y1, y2)) %$%
  plot(start_lon, start_lat, pch = 3, col = "blue",
       xlab = "longitude", ylab = "latitude")
box(bty = "o")
Let’s look at the errors:
# Distribution of the deviation of start longitudes from a hundredth-of-a-
# degree grid (difference to the value rounded to 2 decimal places).
# NOTE(review): the stated grid resolution is 0.1 degree, while round(, 2)
# snaps to 0.01 — presumably the mismatch the author is puzzled by; confirm.
mvts %>%
mutate(difference = start_lon - round(start_lon, 2)) %>%
pull(difference) %>%
hist(n = 100, col = "grey", main = NA)
I’m not quite sure about what I’m doing wrong here… TO BE CONTINUED.
Almost 2 months of data:
# Time span covered by the data (local time, Asia/Ho_Chi_Minh):
range(mvts$date_time)
## [1] "2019-12-16 07:00:00 +07" "2020-02-12 23:00:00 +07"
Visually:
# Month axis from Nov 2019 to May 2020.
first_day <- seq(ymd("2019-11-01"), ymd("2020-05-01"), "month")
# Empty strip whose only purpose is to calibrate the x axis.
plot(first_day, rep(1, length(first_day)), axes = FALSE, ann = FALSE, type = "n", ylim = 0:1)
# Tick labels like "Dec 19": month abbreviation + the year with its first
# two digits stripped by sub().
axis(1, first_day, paste(month.abb[month(first_day)], sub("^\\d\\d", "", year(first_day))))
# Days actually covered by the Facebook data, drawn as a pink band.
fb_days <- as.numeric(as.Date(mvts$date_time))
x <- range(fb_days)
polygon(c(x, rev(x)), c(-1, -1, 2, 2), col = "pink", border = NA)
box(bty = "o")
abline(v = first_day, col = "grey", lwd = 2)
Another option:
# One color per daily time slot (keyed by the hour: 07:00, 15:00, 23:00
# local time).
col <- c("7" = "#e41a1c", "15" = "#377eb8", "23" = "#4daf4a")
# Total number of recorded movers per 8-hour time slot.
mvts2 <- mvts %>%
group_by(date_time) %>%
tally() %>%
arrange(date_time)
with(mvts2, {
plot(date_time, n, type = "l", xlab = NA, ylab = "number of people",
ylim = c(0, 22000), axes = FALSE)
points(date_time, n, col = col[as.character(hour(date_time))], pch = 19)
})
# Circle the time points falling on the first day of the week
# (wday() < 2, i.e. Sundays with the default locale setting — verify).
with(filter(mvts2, wday(date_time) < 2), points(date_time, n, col = col[as.character(hour(date_time))], cex = 2))
ats <- seq.POSIXt(ymd_hms("2020-01-01 00:00:00"), ymd_hms("2020-02-01 00:00:00"), "month")
axis(1, ats, month.abb[month(ats)])
axis(2)
box(bty = "o")
# Shade the Jan 23-29, 2020 window (Tet holiday week) in translucent red.
d1 <- ymd_hms("2020-01-23 00:00:00")
d2 <- ymd_hms("2020-01-29 23:59:59")
polygon(c(d1, d2, d2, d1), c(-1000, -1000, 25000, 25000), col = adjustcolor("red", .1), border = NA)
# Inset legend: a clock-like pie chart drawn in a sub-region of the device
# (par(plt)), with the lower half of an inner circle masked in black to
# suggest night-time; par() is restored afterwards.
opar <- par(plt = c(.2, .5, .2, .5), new = TRUE)
inside_width <- .5
pie(c(3, 3, 3), NA, clockwise = TRUE, init.angle = 285, col = col)
plotrix::draw.circle(0, 0, inside_width, 200, col = "white") %>%
data.frame() %>%
filter(y < 0) %>%
with(polygon(x, y, col = "black"))
par(opar)
Let’s retrieve the names of the communes of Vinh Phuc
# Sorted names of the level-3 administrative units (communes) of the
# Vĩnh Phúc province, taken from the GADM communes layer.
vp_com <- vn3 %>%
  filter(NAME_1 == "Vĩnh Phúc") %$%
  sort(NAME_3)
which is:
# The commune names of Vĩnh Phúc:
vp_com
## [1] "An Hòa" "An Tường" "Bá Hiến" "Bắc Bình" "Bạch Lưu" "Bàn Giản" "Bình Định" "Bình Dương"
## [9] "Bồ Lý" "Bồ Sao" "Cao Đại" "Cao Minh" "Cao Phong" "Chấn Hưng" "Đại Đình" "Đại Đồng"
## [17] "Đại Tự" "Đạo Đức" "Đạo Trù" "Đạo Tú" "Đình Chu" "Định Trung" "Đôn Nhân" "Đồng Cương"
## [25] "Đống Đa" "Đồng Ích" "Đồng Quế" "Đồng Tâm" "Đồng Thịnh" "Đồng Tĩnh" "Đồng Văn" "Đồng Xuân"
## [33] "Đức Bác" "Duy Phiên" "Gia Khánh" "Hải Lựu" "Hồ Sơn" "Hoa Sơn" "Hoàng Đan" "Hoàng Hoa"
## [41] "Hoàng Lâu" "Hội Hợp" "Hồng Châu" "Hồng Phương" "Hợp Châu" "Hợp Hòa" "Hợp Lý" "Hợp Thịnh"
## [49] "Hùng Vương" "Hương Canh" "Hướng Đạo" "Hương Sơn" "Khai Quang" "Kim Long" "Kim Xá" "Lãng Công"
## [57] "Lập Thạch" "Liên Bảo" "Liên Châu" "Liên Hòa" "Liễn Sơn" "Lũng Hoà" "Lý Nhân" "Minh Quang"
## [65] "Nam Viêm" "Nghĩa Hưng" "Ngô Quyền" "Ngọc Mỹ" "Ngọc Thanh" "Ngũ Kiên" "Nguyệt Đức" "Nhân Đạo"
## [73] "Nhạo Sơn" "Như Thụy" "Phú Đa" "Phú Thịnh" "Phú Xuân" "Phúc Thắng" "Phương Khoan" "Quang Sơn"
## [81] "Quang Yên" "Quất Lưu" "Sơn Đông" "Sơn Lôi" "Tam Đảo" "Tam Hồng" "Tam Hợp" "Tam Phúc"
## [89] "Tam Quan" "Tam Sơn" "Tân Cương" "Tân Lập" "Tân Phong" "Tân Tiến" "Tề Lỗ" "Thái Hòa"
## [97] "Thanh Lãng" "Thanh Trù" "Thanh Vân" "Thiện Kế" "Thổ Tang" "Thượng Trưng" "Tích Sơn" "Tiền Châu"
## [105] "Tiên Lữ" "Triệu Đề" "Trung Hà" "Trung Kiên" "Trung Mỹ" "Trung Nguyên" "Trưng Nhị" "Trưng Trắc"
## [113] "Tử Du" "Tứ Trưng" "Tứ Yên" "Tuân Chính" "Vân Hội" "Văn Quán" "Văn Tiến" "Vân Trục"
## [121] "Vân Xuân" "Việt Xuân" "Vĩnh Ninh" "Vĩnh Sơn" "Vĩnh Thịnh" "Vĩnh Tường" "Vũ Di" "Xuân Hoà"
## [129] "Xuân Hòa" "Xuân Lôi" "Yên Bình" "Yên Đồng" "Yên Dương" "Yên Lạc" "Yên Lập" "Yên Phương"
## [137] "Yên Thạch"
Let’s now filter the records that start from Vinh Phuc:
# Movements whose origin commune name belongs to Vĩnh Phúc.
from_vp <- mvts %>% filter(start_name %in% vp_com)
which is 95090 of them and the records that end up in Vinh Phuc:
# Movements whose destination commune name belongs to Vĩnh Phúc.
to_vp <- mvts %>% filter(end_name %in% vp_com)
which is 95060 of them. Let's check that everything looks correct:
# Vĩnh Phúc province polygon (selected by its unaccented VARNAME_1), with
# the distinct start locations of from_vp overlaid as red crosses.
vp <- filter(vn1, VARNAME_1 == "Vinh Phuc")
plot(st_geometry(vp), col = "grey")
from_vp %>%
unique() %>%
st_as_sf(coords = c("start_lon", "start_lat"), crs = 4326) %>%
st_geometry() %>%
plot(add = TRUE, pch = 3, col = "red")
And:
# Same check for the end locations of to_vp, as blue crosses.
plot(st_geometry(vp), col = "grey")
to_vp %>%
unique() %>%
st_as_sf(coords = c("end_lon", "end_lat"), crs = 4326) %>%
st_geometry() %>%
plot(add = TRUE, pch = 3, col = "blue")
Everything looks OK! But:
# Same start locations plotted over the whole country: points far from
# Vĩnh Phúc reveal homonymous communes in other provinces.
plot(st_geometry(vn1), col = "grey")
from_vp %>%
unique() %>%
st_as_sf(coords = c("start_lon", "start_lat"), crs = 4326) %>%
st_geometry() %>%
plot(add = TRUE, pch = 3, col = "red")
And:
# Same country-wide check for the end locations.
plot(st_geometry(vn1), col = "grey")
to_vp %>%
unique() %>%
st_as_sf(coords = c("end_lon", "end_lat"), crs = 4326) %>%
st_geometry() %>%
plot(add = TRUE, pch = 3, col = "blue")
We cannot rely on the communes' names and need to refine a bit…
# Commune names are ambiguous across provinces (see the maps above), so
# keep only the records whose start point actually falls inside the
# Vĩnh Phúc polygon.  lengths() counts, for each point, how many polygons
# it intersects (0 = outside), replacing the convoluted
# map_lgl(as.logical) / which() / slice() chain.
keep <- from_vp %>%
  st_as_sf(coords = c("start_lon", "start_lat"), crs = 4326) %>%
  st_intersects(vp) %>%
  lengths()
from_vp <- filter(from_vp, keep > 0)
And:
# Same geometric refinement for the destinations: keep only the records
# whose end point falls inside the Vĩnh Phúc polygon.
keep <- to_vp %>%
  st_as_sf(coords = c("end_lon", "end_lat"), crs = 4326) %>%
  st_intersects(vp) %>%
  lengths()
to_vp <- filter(to_vp, keep > 0)
Let’s check:
# Re-check the refined start locations over the whole country.
plot(st_geometry(vn1), col = "grey")
from_vp %>%
unique() %>%
st_as_sf(coords = c("start_lon", "start_lat"), crs = 4326) %>%
st_geometry() %>%
plot(add = TRUE, pch = 3, col = "red")
And:
# Re-check the refined end locations over the whole country.
plot(st_geometry(vn1), col = "grey")
to_vp %>%
unique() %>%
st_as_sf(coords = c("end_lon", "end_lat"), crs = 4326) %>%
st_geometry() %>%
plot(add = TRUE, pch = 3, col = "blue")
Now it looks good!
# All movements starting from (red) and ending in (blue) Vĩnh Phúc,
# drawn as straight segments over the provinces map.
plot(st_geometry(vn1), col = "grey")
with(from_vp, segments(start_lon, start_lat, end_lon, end_lat, col = "red"))
plot(st_geometry(vn1), col = "grey")
with(to_vp, segments(start_lon, start_lat, end_lon, end_lat, col = "blue"))
Let's look at another scale:
# Zoom trick: plotting the Ha Giang + Ha Nam polygons with white (invisible)
# borders first only sets the plot extent to the northern part of the
# country; the provinces map is then overlaid within that window.
vn1 %>%
filter(VARNAME_1 %in% c("Ha Giang", "Ha Nam")) %>%
st_geometry() %>%
plot(border = "white")
plot(st_geometry(vn1), col = "grey", add = TRUE)
with(from_vp, segments(start_lon, start_lat, end_lon, end_lat, col = "red"))
box(bty = "o")
and:
# Same zoomed view for the movements ending in Vĩnh Phúc (blue).
vn1 %>%
filter(VARNAME_1 %in% c("Ha Giang", "Ha Nam")) %>%
st_geometry() %>%
plot(border = "white")
plot(st_geometry(vn1), col = "grey", add = TRUE)
with(to_vp, segments(start_lon, start_lat, end_lon, end_lat, col = "blue"))
box(bty = "o")
Zooming more:
# Aggregate the flows between (start commune, end commune) pairs: average
# the pixel coordinates and sum the movers over the whole period.
# summarise() drops only the last grouping level, so the result would stay
# grouped by start_name — ungroup() makes the tibble plain again.
to_vp2 <- to_vp %>%
  group_by(start_name, end_name) %>%
  summarise(start_lon = mean(start_lon),
            start_lat = mean(start_lat),
            end_lon = mean(end_lon),
            end_lat = mean(end_lat),
            n_crisis = sum(n_crisis)) %>%
  ungroup()
# Map the observed range of flow sizes linearly onto line widths
# between .005 and 50, via a 2-point linear model used as an interpolator.
x <- range(to_vp2$n_crisis)
y <- c(.005, 50)
m <- lm(y ~ x)
plot(st_geometry(vp), col = "grey")
with(to_vp2, segments(start_lon, start_lat, end_lon, end_lat,
                      lwd = predict(m, data.frame(x = n_crisis)), col = "blue"))
Let’s focus now only on those who move: