---
title: "diseasystore: Google Health COVID-19 Open Data"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{diseasystore: Google Health COVID-19 Open Data}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```{r setup}
library(diseasystore)
```

```{r hidden_options, include = FALSE}
# Set printing / verbosity options for the vignette.
# Without withr, the previous values are kept in `opts` so they can be restored later.
if (rlang::is_installed("withr")) {
  withr::local_options("tibble.print_min" = 5)
  withr::local_options("diseasystore.verbose" = FALSE)
} else {
  opts <- options("tibble.print_min" = 5, "diseasystore.verbose" = FALSE)
}

# We have a "hard" dependency for duckdb to render parts of this vignette
suggests_available <- rlang::is_installed("duckdb")

# Chunks that need network access are only evaluated interactively,
# or when NOT_CRAN / CI is set to "true"
not_on_cran <- interactive() ||
  identical(Sys.getenv("NOT_CRAN"), "true") ||
  identical(Sys.getenv("CI"), "true")
```

The Google COVID-19 data repository is a comprehensive open repository of COVID-19 data.

This vignette shows how to use (some of) this data through the `diseasystore` package.

First, it is a good idea to copy the relevant Google COVID-19 data files locally and store that location as an option
for the package.

`?DiseasystoreGoogleCovid19` uses only the age-stratified metrics for COVID-19, so only a subset of the repository
needs to be downloaded.

```{r download_data, eval = FALSE}
# First we set the path we want to use as an option
options(
  "diseasystore.DiseasystoreGoogleCovid19.source_conn" =
    file.path("local", "path")
)

# Ensure folder exists
source_conn <- diseasyoption("source_conn", "DiseasystoreGoogleCovid19")
if (!dir.exists(source_conn)) {
  dir.create(source_conn, recursive = TRUE, showWarnings = FALSE)
}

# Define the Google files to download
google_files <- c("by-age.csv", "demographics.csv", "index.csv", "weather.csv")

# Download each file, skipping files that already exist locally
remote_conn <- diseasyoption("remote_conn", "DiseasystoreGoogleCovid19")
purrr::walk(google_files, \(file) {
  url <- paste0(remote_conn, file)
  destfile <- file.path(source_conn, file)

  if (!file.exists(destfile)) {
    download.file(url, destfile)
  }
})
```

```{r download_data_hidden, include = FALSE, eval = not_on_cran}
# The files we need are stored remotely in Google's API
google_files <- c("by-age.csv", "demographics.csv", "index.csv", "weather.csv")
remote_conn <- diseasyoption("remote_conn", "DiseasystoreGoogleCovid19")

# In practice, it is best to make a local copy of the data which is stored in the "vignette_data" folder
# This folder can either be in the package folder (preferred, please create the folder) or in the tempdir()
local_conn <- purrr::detect("vignette_data", checkmate::test_directory_exists, .default = tempdir())

# Number of rows to keep from each file; used both for the download below and for the package option
n_max <- 1000

if (rlang::is_installed("withr")) {
  withr::local_options("diseasystore.DiseasystoreGoogleCovid19.source_conn" = local_conn)
  withr::local_options("diseasystore.DiseasystoreGoogleCovid19.n_max" = n_max)
} else {
  opts <- c(
    opts,
    options(
      "diseasystore.DiseasystoreGoogleCovid19.source_conn" = local_conn,
      "diseasystore.DiseasystoreGoogleCovid19.n_max" = n_max
    )
  )
}

# Then we download the first n rows of each data set of interest
# (best effort: a failed download is tolerated and detected by the availability check below)
try({
  purrr::discard(google_files, \(file) checkmate::test_file_exists(file.path(local_conn, file))) |>
    purrr::walk(\(file) {
      paste0(remote_conn, file) |>
        readr::read_csv(n_max = n_max, show_col_types = FALSE, progress = FALSE) |>
        readr::write_csv(file.path(local_conn, file))
    })
})

# Check that the files are available after attempting to download
data_available <- purrr::every(
  google_files,
  \(file) checkmate::test_file_exists(file.path(local_conn, file))
)
```

A `diseasystore` requires a database to store its features in.
This database should be configured before use, and the configuration can be stored in the package's options.
```{r configure_diseasystore, eval = FALSE}
# We define target_conn as a function that opens a DBI connection to the DB
target_conn <- \() DBI::dbConnect(duckdb::duckdb())
options(
  "diseasystore.DiseasystoreGoogleCovid19.target_conn" = target_conn
)
```

```{r configure_diseasystore_hidden, include = FALSE, eval = not_on_cran && suggests_available && data_available}
# Same configuration as shown above, but applied in a way that can be undone at the end of the vignette
target_conn <- \() DBI::dbConnect(duckdb::duckdb())
if (rlang::is_installed("withr")) {
  withr::local_options("diseasystore.DiseasystoreGoogleCovid19.target_conn" = target_conn)
} else {
  opts <- c(opts, options("diseasystore.DiseasystoreGoogleCovid19.target_conn" = target_conn))
}
```

Once the files are downloaded and the target DB is configured, we can initialize the `diseasystore` that uses the
Google COVID-19 data.

```{r initializing_diseasystore, eval = not_on_cran && suggests_available && data_available}
ds <- DiseasystoreGoogleCovid19$new()
```

Once configured, we can use the feature store directly to get data.

```{r using_diseasystore_example_1, eval = not_on_cran && suggests_available && data_available}
# We can see all the available features in the feature store
ds$available_features
```

```{r using_diseasystore_example_2, eval = not_on_cran && suggests_available && data_available}
# And then retrieve a feature from the feature store
ds$get_feature(
  feature = "n_hospital",
  start_date = as.Date("2020-01-01"),
  end_date = as.Date("2020-06-01")
)
```

```{r cleanup, include = FALSE}
# Drop the diseasystore (if it was created) and trigger garbage collection.
# `inherits = FALSE` keeps the check consistent with `rm()`, which only removes from the current environment.
if (exists("ds", inherits = FALSE)) rm(ds)
gc()

# Without withr, the options changed by this vignette are restored manually
if (!rlang::is_installed("withr")) {
  options(opts)
}
```