This is a quick blog post that tries to figure out which school boards are the “hottest zones” for covid.

In this blog post, I will do the following:

- get the shapefiles for both the school boards (commissions scolaires, or CS) and the smallest possible health districts (réseaux locaux de services, or RLS)
- find the intersection between school boards and RLS using sf::st_intersection()
- find the population of each intersection using cancensus::get_census() and tongfen::tongfen_estimate()
- assign the number of cases in each health district (RLS) to each intersection proportionally to the population using tongfen::proportional_reaggregate()
- sum the number of cases for each school board (commission scolaire).

This post is massively indebted to Jens von Bergmann (@vb_jens), who created both the {cancensus} and {tongfen} packages and also helped me get started with tongfen. Thanks Jens!
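Throughout the post I lean on the following packages. The original setup chunk is not shown, so this list is reconstructed from the function calls below; treat it as a best guess rather than the post's actual setup code.

library(dplyr)
library(tidyr)          # gather(), complete(), fill()
library(stringr)        # str_replace_all(), str_sub(), str_extract()
library(forcats)        # fct_reorder()
library(readr)          # read_csv()
library(googlesheets4)  # read_sheet(), gs4_deauth()
library(sf)             # read_sf(), st_intersection(), ...
library(cancensus)      # get_census()
library(tongfen)        # tongfen_estimate(), meta_for_additive_variables()
library(ggplot2)
library(knitr)          # kable()
library(kableExtra)     # kable_styling(), row_spec()
# {here}, {lubridate}, {purrr}, {rmapshaper} and {cowplot} are called with explicit namespaces below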

First, we download the number of covid cases at the RLS level from Claude Boucher’s Google spreadsheet and combine it with the data collected by my cronjob.

## claude data
url_rls <- "https://docs.google.com/spreadsheets/d/17eJ05pg6fkzKlwwiWn0ztdMvzBX9pGx3q6w7h3_vJw4/"
gs4_deauth()  # public sheet, no authentication needed
prout <- read_sheet(url_rls)

rls_population <- prout %>% select(RLS, Population) %>% filter(RLS != "Total")

rls_claude <- prout[1:which(prout$NoRLS == 1801), ] %>%  # keep rows up to the last RLS (1801)
  filter(!is.na(NoRLS)) %>%
  select(-No, -NoRLS, -Population) %>%
  gather(key = key, value = cumulative_cases, -RLS, -RSS) %>%  # one row per RLS per report date
  # strip thousands-separator spaces *before* converting, otherwise "1 234" becomes NA
  mutate(cumulative_cases = as.numeric(str_replace_all(as.character(cumulative_cases), "\\s+", ""))) %>%
  filter(!is.na(cumulative_cases)) %>%
  mutate(date_report = lubridate::ymd(key)) %>%
  select(-key)

# cronjob data


prepare_cronjob_data <- function(mypath = "~/cronjob/tableau-rls/"){
  pouet <- list.files(
    path = mypath,
    pattern = ".csv$",
    full.names = TRUE)

  csvs <- purrr::map(pouet, read_csv)

  # drop downloads that are identical to the previous one
  keep <- rep(TRUE, length(csvs))
  for (i in seq_along(csvs)[-1]){
    if (identical(csvs[[i]], csvs[[i - 1]])){
      keep[i] <- FALSE
    }
  }
  # filenames are assumed to end in a YYYYMMDD-HHMMSS.csv timestamp
  datetimes <- lubridate::ymd_hms(
    paste0(
      pouet %>% str_sub(start = -19, end = -12),
      pouet %>% str_sub(start = -10, end = -5)))
  z1 <- datetimes[keep]
  z3 <- csvs[keep]

  dates_fixed <- tibble(download_datetime = z1) %>%
    mutate(download_hour = lubridate::hour(download_datetime),
           download_date = lubridate::date(download_datetime),
           report_date = if_else(download_hour >= 8, download_date, download_date - 1) # in the middle of the night you are still yesterday's report
    ) %>%
    arrange(desc(download_datetime)) %>%
    mutate(report_date_lag = lag(report_date),
           # if there are two different reports on the same day, the first one is the previous day's
           report_date_fixed = if_else(report_date == report_date_lag, report_date - 1, report_date, report_date)) %>%
    arrange(download_datetime)

  # stamp each csv with its download time and its corrected report date
  gaa <- purrr::map2(z3, z1,
                     ~ .x %>% mutate(download_datetime = .y))
  gaaa <- purrr::map2(gaa, dates_fixed$report_date_fixed,
                      ~ .x %>% mutate(date_report = .y))

  gaaa %>%
    bind_rows() %>%
    rename(cumulative_cases = Cas)
}
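To sanity-check the report-date rule, here is a toy run with made-up timestamps: a download in the middle of the night is assigned to the previous day's report, while downloads after 8 a.m. keep their own date.

library(lubridate)
library(dplyr)

dl <- ymd_hms(c("2020-08-24 23:30:00", "2020-08-25 03:00:00", "2020-08-25 10:00:00"))
tibble(download_datetime = dl) %>%
  mutate(report_date = if_else(hour(download_datetime) >= 8,
                               date(download_datetime),
                               date(download_datetime) - 1))
# the 3 a.m. download lands on 2020-08-24, the previous day's report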

myrls <- prepare_cronjob_data() %>%
  filter(!is.na(NoRLS), RLS != "Total") %>%
  mutate(
    cumulative_cases = if_else(cumulative_cases != "n.d.", cumulative_cases, NA_character_),  # "n.d." marks missing counts
    cumulative_cases = as.numeric(str_replace_all(cumulative_cases, "\\s+", ""))
  ) %>%
  select(-No, -NoRLS, -Population, -Taux, -download_datetime)


# combine both sources, fill in the blanks

both <- bind_rows(
  rls_claude %>%
    mutate(source = "bouchecl") %>%
    filter(!(date_report %in% unique(myrls$date_report))),  # prefer the cronjob data when both sources cover a date
  myrls %>%
    mutate(source = "cronjob")
)


rls <- both %>%
  group_by(RSS, RLS) %>%
  arrange(RSS, RLS, desc(date_report)) %>% ## descending date to fix the cumulative series
  ## a cumulative count can't be bigger than the next day's; if it is, cap it at the next day's level
  mutate(cumulative_cases = if_else(cumulative_cases > cummin(cumulative_cases),
                                    cummin(cumulative_cases), cumulative_cases, cumulative_cases)) %>%
  arrange(RSS, RLS, date_report) %>% ## back to ascending date
  ## daily rate between consecutive reports, used to interpolate missing days
  mutate(slope = (lead(cumulative_cases) - cumulative_cases) / as.numeric(lead(date_report) - date_report)) %>%
  complete(date_report = seq.Date(min(date_report), max(date_report), by = "day")) %>%
  fill(slope, .direction = "down") %>%
  mutate(cumulative_cases = floor(first(cumulative_cases) + cumsum(slope))) %>%
  select(-slope) %>%
  mutate(cases = cumulative_cases - lag(cumulative_cases)) %>%
  ungroup() %>%
  filter(!is.na(cases)) %>%
  mutate(shortname_rls = str_replace(str_extract(str_replace(RLS, "RLS de ", "RLS "), "RLS.+"), "RLS ", "")) %>%
  left_join(rls_population, by = "RLS") %>%
  mutate(cases_per_100k = cases * 1e5 / Population,
         cases_last_14_days = (cumulative_cases - lag(cumulative_cases, 14)),
         cases_last_14_days_per_100k = cases_last_14_days * 1e5 / Population,
         RLS_code = str_extract(RLS, "^\\d+")
  )
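As an aside, the arrange(desc(date_report)) + cummin() combination above caps each day's cumulative count at the next day's value. A toy example with made-up numbers shows the idea:

x <- c(10, 12, 11, 15)  # cumulative series with an impossible dip (12 > 11)
rev(cummin(rev(x)))     # 10 11 11 15: each value is capped by every later value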




prep_data <- function(data, group, type){

  type_column <- enquo(type)         ## unquote this with !!
  type_name <- quo_name(type_column) ## it's a string, don't unquote it
  mean_name <- paste0("avg_", type_name, "_last7")
  mean_column <- sym(mean_name)
  gaa <- data %>%
    group_by( {{ group }} ) %>%
    arrange(date_report) %>%
    ## trailing 7-day average, written out as six lags
    mutate(!!mean_name := (!!type_column + lag(!!type_column, 1) + lag(!!type_column, 2) +
                             lag(!!type_column, 3) + lag(!!type_column, 4) +
                             lag(!!type_column, 5) + lag(!!type_column, 6)) / 7) %>%
    ungroup()

  gaa1 <- gaa %>%
    group_by( {{ group }} ) %>%
    summarise(total = sum(!!type_column),
              worst7 = max(!!mean_column, na.rm = TRUE),
              last7  = max(!!mean_column * (date_report == max(date_report)), na.rm = TRUE),
              ratio = last7 / worst7,
              winning = factor(
                case_when(ratio < 0.33 ~ "Winning",
                          ratio < 0.67 ~ "Nearly there",
                          TRUE ~ "Needs action"
                ),
                levels = c("Winning", "Nearly there", "Needs action"))
    ) %>%
    ungroup() %>%
    mutate(
      group = fct_reorder( {{ group }}, total, .desc = TRUE) ## factor version of the grouping column, ordered by total
    )

  gaa %>%
    left_join(gaa1)
}
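The chain of lag() calls in prep_data() is just a trailing 7-day mean. If you are willing to add {zoo} (not used in this post), the same thing is a one-liner:

x <- c(1, 2, 3, 4, 5, 6, 7, 8)
zoo::rollmeanr(x, k = 7, fill = NA)  # NA NA NA NA NA NA 4 5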

rls_cases <- prep_data(rls, shortname_rls, type = cases)
firstdate <- min(rls_cases$date_report)
lastdate  <- max(rls_cases$date_report)

We want to assign the covid cases from these “RLS” (health districts)…

rls_shp <- read_sf(here::here("content/post/data/downloads/Territoires_RLS_2020.shp")) %>%
  st_transform(crs = 4326) %>%
  rmapshaper::ms_simplify() %>%
  left_join(rls %>%
              filter(date_report == max(date_report)) %>%
              select(date_report, RLS_code, cases_last_14_days_per_100k, cumulative_cases, cases_last_14_days, Population))

ggplot(data = rls_shp) +
  geom_sf(aes(fill = cases_last_14_days)) +
  labs(title = paste0("There are ", nrow(rls_shp), " RLS"))

…to these “commissions scolaires” (school boards).

CS_FRA_SDA <- read_sf(here::here("content/post/data/downloads/CS", "CS_FRA_SDA.shp")) %>%
  rmapshaper::ms_simplify()

ggplot(data = CS_FRA_SDA) +
  geom_sf() +
  labs(title = paste0("There are ", nrow(CS_FRA_SDA), " commissions scolaires"))

First, we create the smallest common building blocks (intersections) between the two geographies using sf::st_intersection(), then keep only the polygons with st_collection_extract().

intersections <- st_intersection(
  rls_shp %>% select(RLS_code),             ## RLS = réseau local de services = health district; RLS_code = its id
  CS_FRA_SDA %>% select(CD_CS, NOM_CS)) %>% ## CS = commission scolaire = school board; CD_CS = school board id
  st_collection_extract(type = "POLYGON")   ## keep only polygons; drop stray lines left by touching borders

ggplot(data = intersections) +
  geom_sf() +
  labs(title = paste0("There are ", nrow(intersections), " intersections"))

Then, we will allocate the cases of each RLS to the lower geography level (the “intersections”) proportionally to the population of each intersection.

To do this, we must find the population of each intersection.
First, we download the census population and shapefiles using cancensus::get_census(). The table below shows the population for a sample of dissemination areas (DA).

census_data_da <- get_census(dataset = 'CA16',
                             regions = list(PR = "24"),  # Quebec
                             vectors = c("pop2016" = "v_CA16_401"),
                             level = 'DA',
                             geo_format = "sf")

head(census_data_da %>% st_drop_geometry)
##   Shape Area Type Households CD_UID Dwellings   GeoUID Population CSD_UID
## 1   40.78502   DA        213   2401       254 24010018        465 2401042
## 2    7.91099   DA        230   2401       261 24010019        496 2401023
## 3   33.38888   DA        224   2401       248 24010020        523 2401023
## 4    2.94327   DA        212   2401       235 24010021        524 2401023
## 5    3.98348   DA        251   2401       285 24010022        530 2401023
## 6    5.70935   DA        251   2401       301 24010023        542 2401023
##   CMA_UID CT_UID              Region Name Area (sq km) pop2016
## 1    <NA>   <NA>               Grosse-Île     40.78502     465
## 2    <NA>   <NA> Les Îles-de-la-Madeleine      7.91099     496
## 3    <NA>   <NA> Les Îles-de-la-Madeleine     33.38888     523
## 4    <NA>   <NA> Les Îles-de-la-Madeleine      2.94327     524
## 5    <NA>   <NA> Les Îles-de-la-Madeleine      3.98348     530
## 6    <NA>   <NA> Les Îles-de-la-Madeleine      5.70935     542

Then we estimate the population of the “intersections” using tongfen::tongfen_estimate(). The estimates are “perfect” when the borders of an intersection line up with StatCan’s dissemination areas. When they don’t, the population of a dissemination area is spread proportionally to the area covered by each intersection.
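For intuition, here is a minimal sketch of that area-weighting rule, using a made-up square dissemination area instead of real census geometries:

library(sf)

# hypothetical 1x1 dissemination area holding 100 people
da <- st_sf(Population = 100,
            geometry = st_sfc(st_polygon(list(rbind(c(0, 0), c(1, 0), c(1, 1), c(0, 1), c(0, 0))))))

# a target zone covering the left 30% of that DA
zone <- st_sfc(st_polygon(list(rbind(c(0, 0), c(0.3, 0), c(0.3, 1), c(0, 1), c(0, 0)))))

piece  <- st_intersection(da, zone)
weight <- as.numeric(st_area(piece) / st_area(da))  # 0.3
weight * da$Population                              # 30 people are allocated to the zone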

The table below shows the population for a sample of the intersections.

intersections_populations <- tongfen_estimate(
  intersections,
  census_data_da,
  meta_for_additive_variables("census_data_da", "Population")
)

head(intersections_populations)
## Simple feature collection with 6 features and 4 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -72.09877 ymin: 45.00646 xmax: -71.39747 ymax: 45.7684
## geographic CRS: WGS 84
##     Population RLS_code  CD_CS                NOM_CS
## 1 1.501631e+01     0513 751000 CSS des Hauts-Cantons
## 2 3.087784e-01     0513 751000 CSS des Hauts-Cantons
## 3 2.717933e-01     0513 751000 CSS des Hauts-Cantons
## 4 1.667248e+04     0514 751000 CSS des Hauts-Cantons
## 5 1.741468e-01     0516 751000 CSS des Hauts-Cantons
## 6 1.966473e-02     0517 751000 CSS des Hauts-Cantons
##                         geometry
## 1 MULTIPOLYGON (((-71.90947 4...
## 2 MULTIPOLYGON (((-71.92083 4...
## 3 MULTIPOLYGON (((-71.91972 4...
## 4 MULTIPOLYGON (((-71.81586 4...
## 5 MULTIPOLYGON (((-71.71563 4...
## 6 MULTIPOLYGON (((-71.47856 4...

Assign the health districts’ covid cases to the lower geography level (intersections) using proportional_reaggregate(). This allocation is population-weighted. Then, sum the cases and population of the intersections to get the number of cases and population at the school board (commission scolaire) level.
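In spirit, with made-up numbers, the population-weighted allocation does the following: an RLS with 50 cases whose population is split 80/20 between two intersections hands out 40 and 10 cases.

library(dplyr)

intersections_toy <- tibble(RLS_code = c("0601", "0601"), Population = c(8000, 2000))
rls_toy <- tibble(RLS_code = "0601", cases = 50)

intersections_toy %>%
  left_join(rls_toy, by = "RLS_code") %>%
  group_by(RLS_code) %>%
  mutate(cases_allocated = cases * Population / sum(Population)) %>%  # 40 and 10 cases
  ungroup()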

The end result is this map and table:

cases_intersections <- tongfen::proportional_reaggregate(
  data = intersections_populations,
  parent_data = rls_shp,
  geo_match = c("RLS_code" = "RLS_code"),
  categories = c("cases_last_14_days"),
  base = "Population"
)


cases_cs <- cases_intersections %>%
  group_by(CD_CS, NOM_CS) %>%
  summarise(
    cases_last_14_days = sum(cases_last_14_days, na.rm = TRUE),
    Population = sum(Population, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(
    dailycases_per_1M_avg_14_days = round(cases_last_14_days * 1e6 / 14 / Population, 1),
    cases_last_14_days_per_100k = round(cases_last_14_days * 100000 / Population, 1),
    cases_last_14_days = round(cases_last_14_days),
    Population = round(Population)
  )

ggplot(data = cases_cs) +
  geom_sf(aes(fill = dailycases_per_1M_avg_14_days)) +
  scale_fill_gradient(
    low = "white",
    high = "red",
    space = "Lab",
    na.value = "grey50",
    guide = "colourbar",
    aesthetics = "fill"
  ) +
  labs(title = paste0("Average number of new covid19 cases per million inhabitants \n over the last 14 days, by school board, as of ", lastdate)) +
  theme_bw() +
  theme(panel.grid.major = element_line(colour = "transparent"))

cases_cs %>%
  ungroup() %>%
  st_drop_geometry() %>%
  arrange(-dailycases_per_1M_avg_14_days) %>%
  mutate(rang = row_number()) %>%
  select(rang, NOM_CS, dailycases_per_1M_avg_14_days, cases_last_14_days, Population) %>%
  kable(
    caption = paste0("Average number of new covid19 cases per million inhabitants over the last 14 days, by school board, as of ", lastdate),
    col.names = c("Rank", "School board", "Average cases per million, last 14 days", "New cases, last 14 days", "2016 population")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE, position = "center") %>%
  row_spec(0, background = "#555555", color = "#fff")
Table 1: Average number of new covid19 cases per million inhabitants over the last 14 days, by school board, as of 2020-08-25

| Rank | School board | Average cases per million, last 14 days | New cases, last 14 days | 2016 population |
|-----:|:-------------|----------------------------------------:|------------------------:|----------------:|
| 1 | CSS des Samares | 31.0 | 90 | 207792 |
| 2 | CSS de la Région-de-Sherbrooke | 19.7 | 49 | 179142 |
| 3 | CSS de Laval | 16.7 | 99 | 423003 |
| 4 | CSS des Grandes-Seigneuries | 15.9 | 44 | 197898 |
| 5 | CSS de la Rivière-du-Nord | 15.4 | 41 | 190460 |
| 6 | CSS de la Seigneurie-des-Mille-Îles | 15.1 | 61 | 291498 |
| 7 | CSS au Coeur-des-Vallées | 14.4 | 11 | 53662 |
| 8 | CSS de Montréal | 14.2 | 202 | 1014236 |
| 9 | CSS des Draveurs | 14.2 | 28 | 140717 |
| 10 | CSS des Portages-de-l’Outaouais | 14.0 | 30 | 153452 |
| 11 | CSS de la Pointe-de-l’Île | 13.4 | 59 | 316733 |
| 12 | CSS Marguerite-Bourgeoys | 12.8 | 109 | 610999 |
| 13 | CSS des Affluents | 12.1 | 45 | 267561 |
| 14 | CSS de la Moyenne-Côte-Nord | 11.1 | 1 | 6424 |
| 15 | CSS du Val-des-Cerfs | 10.4 | 21 | 146620 |
| 16 | CSS des Trois-Lacs | 10.0 | 21 | 149319 |
| 17 | CSS de la Vallée-des-Tisserands | 9.5 | 12 | 86777 |
| 18 | CSS des Chênes | 9.0 | 13 | 103398 |
| 19 | CSS des Hautes-Rivières | 8.1 | 18 | 155632 |
| 20 | CSS des Patriotes | 7.8 | 29 | 270108 |
| 21 | CSS du Pays-des-Bleuets | 7.6 | 6 | 56735 |
| 22 | CSS de Portneuf | 6.7 | 5 | 53072 |
| 23 | CSS de la Riveraine | 6.6 | 4 | 43263 |
| 24 | CSS de Saint-Hyacinthe | 6.3 | 9 | 102679 |
| 25 | CSS Marie-Victorin | 6.2 | 30 | 347095 |
| 26 | CSS des Découvreurs | 5.1 | 10 | 138839 |
| 27 | CSS de Charlevoix | 5.0 | 2 | 28363 |
| 28 | CSS du Fleuve-et-des-Lacs | 4.9 | 2 | 29130 |
| 29 | CSS de la Capitale | 4.6 | 18 | 278887 |
| 30 | CSS des Laurentides | 4.5 | 6 | 92009 |
| 31 | CSS René-Lévesque | 4.4 | 3 | 48363 |
| 32 | CSS des Navigateurs | 4.3 | 11 | 183344 |
| 33 | CSS du Lac-Saint-Jean | 4.1 | 3 | 52082 |
| 34 | CSS des Hauts-Bois-de-l’Outaouais | 4.1 | 2 | 33871 |
| 35 | CSS des Premières-Seigneuries | 3.8 | 12 | 230723 |
| 36 | CSS du Fer | 3.6 | 2 | 39323 |
| 37 | CSS des Sommets | 3.5 | 4 | 84188 |
| 38 | CSS des Hauts-Cantons | 3.4 | 2 | 51887 |
| 39 | CSS de Kamouraska-Rivière-du-Loup | 2.5 | 2 | 55735 |
| 40 | CSS Pierre-Neveu | 2.0 | 1 | 35249 |
| 41 | CSS de la Côte-du-Sud | 1.9 | 2 | 70534 |
| 42 | CSS de Rouyn-Noranda | 1.7 | 1 | 41784 |
| 43 | CSS des Appalaches | 1.6 | 1 | 45953 |
| 44 | CSS de Sorel-Tracy | 1.4 | 1 | 50995 |
| 45 | CSS du Chemin-du-Roy | 1.3 | 3 | 168945 |
| 46 | CSS de la Beauce-Etchemin | 1.2 | 2 | 123825 |
| 47 | CSS des Bois-Francs | 0.8 | 1 | 94346 |
| 48 | CSS des Monts-et-Marées | 0.0 | 0 | 39227 |
| 49 | CSS des Phares | 0.0 | 0 | 74864 |
| 50 | CSS des Rives-du-Saguenay | 0.0 | 0 | 100969 |
| 51 | CSS De La Jonquière | 0.0 | 0 | 66717 |
| 52 | CSS de l’Énergie | 0.0 | 0 | 97132 |
| 53 | CSS du Lac-Témiscamingue | 0.0 | 0 | 15948 |
| 54 | CSS Harricana | 0.0 | 0 | 24652 |
| 55 | CSS de l’Or-et-des-Bois | 0.0 | 0 | 43767 |
| 56 | CSS du Lac-Abitibi | 0.0 | 0 | 20533 |
| 57 | CSS de l’Estuaire | 0.0 | 0 | 41865 |
| 58 | CSS de la Baie-James | 0.0 | 0 | 16177 |
| 59 | CSS des Îles | 0.0 | 0 | 12475 |
| 60 | CSS des Chic-Chocs | 0.0 | 0 | 29447 |
cases_cs %>%
  mutate(NOM_CS = as.factor(NOM_CS)) %>%
  ggplot(aes(x = fct_reorder(NOM_CS, dailycases_per_1M_avg_14_days), y = dailycases_per_1M_avg_14_days)) +
  geom_col() +
  coord_flip() +
  geom_hline(yintercept = 20, color = "red") +
  cowplot::theme_half_open() +
  cowplot::background_grid() +
  labs(
    title = "Number of cases per million inhabitants over the last 14 days",
    subtitle = "According to Minister Dubé, the pandemic is under control below 20 cases per million",
    caption = paste0("As of ", format(lastdate, format = "%B %d %Y")),
    x = "School board",
    y = "Number of cases per million"
  )