--- title: "user_sample_2" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{user_sample_2} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- Run multiple stations and models for 9 euro ticket 1. Adapt directory path! ``` r # set the dir where the data is stored data_dir <- "../../Daten/user_sample_data/" ``` 2. Adapt this part based on the effect/target/stations that is investigated ``` r sample_name <- "NeunEuroTicket" target <- "NO2" stations <- list(Luenen = "DENW006", AachenBurtscheid = "DENW094") meteo_variables <- c("TMP", "RFE", "WIG", "WIR", "LDR") application_start <- lubridate::ymd("20220301") # = start reference time date_effect_start <- lubridate::ymd_hm("20220601 00:00") application_end <- lubridate::ymd("20220831") # = end effect time buffer <- 0 # number of data points to be ignored before effect trend <- "linear" # hyperparameters can be set in params/params.yaml model_types <- c("lightgbm", "rf", "dynamic_regression", "fnn") window_size <- 14 # days of data to calculate the mean in prediction results ``` 3. Load data and train models. This part does not necessarily need to be changed. ``` r library(ubair) ``` ``` r # This might take a few seconds for large files data <- load_uba_data_from_dir(data_dir = data_dir) params <- load_params() params$target <- target params$meteo_variables <- meteo_variables ``` ``` r for (station_name in names(stations)) { station <- stations[[station_name]] predictions_all <- data.table::data.table() metrics_all <- data.table::data.table() env_data <- clean_data(data, station = station) dt_prepared <- prepare_data_for_modelling(env_data, params) dt_prepared <- dt_prepared[complete.cases(dt_prepared)] split_data <- split_data_counterfactual( dt_prepared, application_start = application_start, application_end = application_end ) for (model_type in model_types) { message(paste("start training:", station_name, station, model_type)) res <- run_counterfactual(split_data, params, detrending_function = trend, model_type = model_type, alpha = 0.9, log_transform = FALSE ) predictions <- data.table::copy(res$prediction) # plot bau_plot <- plot_counterfactual(predictions, params, window_size = window_size, date_effect_start, buffer = buffer ) # evaluation metrics <- round(calc_performance_metrics(predictions, date_effect_start, buffer = buffer ), 2) effect <- estimate_effect_size(predictions, date_effect_start, buffer = buffer, verbose = FALSE ) metrics["effect_size"] <- effect["absolute_effect"] metrics["relative_effect"] <- effect["relative_effect"] # add information for export metrics["model"] <- model_type metrics["trend"] <- trend metrics["station_name"] <- station_name metrics["station"] <- station metrics["buffer_start"] <- format( date_effect_start - as.difftime(buffer, units = "hours"), "%Y-%m-%d" ) metrics["effect_start"] <- format(date_effect_start, "%Y-%m-%d") metrics_dt <- data.table::as.data.table(t(metrics)) metrics_all <- rbind(metrics_all, metrics_dt) predictions[, station := station] predictions[, model := model_type] predictions[, trend := trend] predictions_all <- rbind(predictions_all, predictions) } # save predictions (hourly data) and metrics predictions_save <- dplyr::select( predictions_all, c( date, value, prediction, prediction_lower, prediction_upper, station, model, trend ) ) predictions_save$date <- format(predictions_save$date, "%Y-%m-%d %H:%M") } #> start training: Luenen DENW006 lightgbm #> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000319 seconds. 
#> You can set `force_row_wise=true` to remove the overhead.
#> And if memory is not enough, you can set `force_col_wise=true`.
#> [LightGBM] [Info] Total Bins 1549
#> [LightGBM] [Info] Number of data points in the train set: 60472, number of used features: 8
#> [LightGBM] [Info] Start training from score 0.000000
#> start training: Luenen DENW006 rf
#> start training: Luenen DENW006 dynamic_regression
#> Using data for dynamic regression training from 2021-01-22 01:00:00 to 2022-02-28 23:00:00. Too long training series can lead to worse performance. Adjust this via the dynamic_regression$ntrain hyperparameter.
#> start training: Luenen DENW006 fnn
#> start training: AachenBurtscheid DENW094 lightgbm
#> [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031247 seconds.
#> You can set `force_col_wise=true` to remove the overhead.
#> [LightGBM] [Info] Total Bins 1550
#> [LightGBM] [Info] Number of data points in the train set: 60039, number of used features: 8
#> [LightGBM] [Info] Start training from score -0.000000
#> start training: AachenBurtscheid DENW094 rf
#> start training: AachenBurtscheid DENW094 dynamic_regression
#> Using data for dynamic regression training from 2021-01-10 04:00:00 to 2022-02-28 23:00:00. Too long training series can lead to worse performance. Adjust this via the dynamic_regression$ntrain hyperparameter.
#> start training: AachenBurtscheid DENW094 fnn
```
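
The loop above assembles `predictions_save` and `metrics_all` "for export" but stops before writing anything to disk. Below is a minimal sketch of how the per-station results could be written out, intended for the end of the station loop body; the `output/` directory and the file name pattern are illustrative assumptions, not something ubair prescribes.

``` r
# Minimal sketch (assumed layout): write one predictions file and one metrics
# file per station at the end of the station loop. "output/" and the file
# names are assumptions for illustration only.
output_dir <- file.path("output", sample_name)
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

data.table::fwrite(
  predictions_save,
  file.path(output_dir, paste0(station_name, "_predictions.csv"))
)
data.table::fwrite(
  metrics_all,
  file.path(output_dir, paste0(station_name, "_metrics.csv"))
)
```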