content/02-model/01-train-and-deploy-model/document.qmd

---
title: "Model Step 1 - Train and Deploy Model"
date: "`r lubridate::now(tzone = 'EST')` EST"
output: html_document
---

This notebook trains a model to predict the number of bikes at a given bike docking station. The model is trained using the *bike_model_data* table from *Content DB*. The trained model is then:

- pinned to RStudio Connect
- deployed as a plumber API to RStudio Connect using vetiver.

```{r setup, include=FALSE}
# Chunk defaults for the rendered report: hide warnings and messages, and
# collapse each chunk's source and output into a single block.
knitr::opts_chunk$set(
  collapse = TRUE,
  message = FALSE,
  warning = FALSE
)

library(dplyr)
library(dbplyr)
library(glue)
library(recipes)
library(parsnip)
library(workflows)
library(vetiver)
library(rsconnect)

# Internally developed packages
library(bikeHelpR)

# The following packages are not directly called, but are requirements. They
# need to be here so that `rsconnect::writeManifest` is able to capture them
# as a dependency.
library(ranger)

```

## Get data

Connect to the database:

```{r connect_to_db}
# Open an ODBC connection using the "Content DB" data source name. The
# connection is closed in the final chunk of this notebook.
con <- DBI::dbConnect(
  drv = odbc::odbc(),
  dsn = "Content DB"
)
```

Split the data into training and test sets:

```{r train_test_split_data}
# Lazy reference to the modelling table; nothing is pulled until `collect()`.
all_days <- tbl(con, "bike_model_data")

# Get a vector that contains all of the distinct dates, newest first. The
# sort happens locally (after `collect()`), so `dates[1]` is the most recent
# date in the table.
dates <- all_days %>%
  distinct(date) %>%
  collect() %>%
  arrange(desc(date)) %>%
  pull(date) %>%
  as.Date()

# Split the data into test and train.
n_days_test <- 2
n_days_to_train <- 10

# Fail early with a clear message when there is not enough history. Without
# this check `dates[n_days_test + 1]` is NA, both filters below return zero
# rows, and the failure only surfaces later as an obscure modelling error
# (warnings are suppressed for this document).
if (length(dates) < n_days_test + 1 || is.na(dates[n_days_test + 1])) {
  stop(
    sprintf(
      paste0(
        "Need at least %d distinct non-missing dates in bike_model_data to ",
        "build the train/test split, but found %d."
      ),
      n_days_test + 1,
      sum(!is.na(dates))
    ),
    call. = FALSE
  )
}

# The `n_days_test` most recent dates are held out for testing; training ends
# on the next most recent date.
train_end_date <- dates[n_days_test + 1]
train_start_date <- train_end_date - n_days_to_train

# Training data split. Both bounds are inclusive, so the window spans
# `n_days_to_train + 1` calendar days ending on `train_end_date`.
train_data <- all_days %>%
  filter(
    date >= train_start_date,
    date <= train_end_date
  ) %>%
  distinct() %>%
  collect()

print(glue::glue(
  "The model will be trained on data from {start} to {end} ",
  "({num_obs} observations). ",
  start = min(train_data$date),
  end = max(train_data$date),
  num_obs = scales::comma(nrow(train_data))
))

# Test data split: everything strictly after the training window.
test_data <- all_days %>%
  filter(date > train_end_date) %>%
  distinct() %>%
  collect()

print(glue::glue(
  "The model will be tested on data from {start} to {end} ",
  "({num_obs} observations). ",
  start = min(test_data$date),
  end = max(test_data$date),
  num_obs = scales::comma(nrow(test_data))
))
```

## Train the model

### Data preprocessing

Define a recipe to clean the data.

```{r define_recipe}
# Build the preprocessing recipe step by step: `n_bikes` is the outcome and
# every other column in the training data is a predictor.
recipe_spec <- recipe(n_bikes ~ ., data = train_data)
recipe_spec <- step_dummy(recipe_spec, dow)
recipe_spec <- step_integer(recipe_spec, id, date)

# Preview the recipe's output: prep on the training data, then bake the
# first few training rows.
glimpse(
  bake(
    prep(recipe_spec, train_data),
    head(train_data)
  )
)
```

### Fit model

Fit a random forest model:

```{r fit_model}
# Random forest specification (no arguments passed to `rand_forest()`), set
# to regression mode and backed by the ranger engine.
model_spec <- set_engine(
  set_mode(rand_forest(), "regression"),
  "ranger"
)

# Bundle the preprocessing recipe and the model spec into one workflow so
# they are fitted (and later deployed) together.
model_workflow <- add_model(
  add_recipe(workflow(), recipe_spec),
  model_spec
)

# Fit the workflow on the training split and print the fitted object.
model_fit <- fit(model_workflow, data = train_data)
model_fit
```

## Model evaluation

```{r evaluate_model}
# Score the held-out test split with the fitted workflow.
predictions <- predict(model_fit, test_data)

# Attach the predictions to the test rows as a `preds` column.
results <- mutate(test_data, preds = predictions$.pred)

# Out-of-sample metrics (helper from bikeHelpR): actuals first, then
# predictions.
oos_metrics(results$n_bikes, results$preds)
```

## Model deployment

### `vetiver`

Create a `vetiver` model object.

```{r create_vetiver_model}
model_name <- "bike_predict_model_r"
pin_name <- glue("sam.edwardes/{model_name}")

# Return the first and last date of a vector as a two-element character
# vector.
date_span <- function(x) {
  c(as.character(min(x)), as.character(max(x)))
}

# Record the train and test date ranges. They are stored in the pin metadata
# so that other scripts can access this information.
date_metadata <- list(
  train_dates = date_span(train_data$date),
  test_dates = date_span(test_data$date)
)

print(date_metadata)

# Input prototype for the model: a single training row without the outcome
# column.
input_prototype <- select(head(train_data, 1), -n_bikes)

# Create the vetiver model.
v <- vetiver_model(
  model_fit,
  model_name,
  versioned = TRUE,
  save_ptype = input_prototype,
  metadata = date_metadata
)

v
```

### `pins`

Save the model as a *pin* to RStudio Connect:

```{r pin_model}
# Use RStudio Connect as a versioned pin board. The server URL and API key
# are read from the environment.
board <- pins::board_rsconnect(
  versioned = TRUE,
  server = Sys.getenv("CONNECT_SERVER"),
  key = Sys.getenv("CONNECT_API_KEY")
)

# Write the vetiver model to the board.
vetiver_pin_write(board, vetiver_model = v)
```

### `plumber`

Then, deploy the model as a plumber API to RStudio Connect.

```{r deploy_model_api_to_connect}
# Connect account and the local nickname for the server; both are reused by
# every call below.
connect_account <- "sam.edwardes"
connect_server <- "colorado"

# Register the Connect server under its nickname.
rsconnect::addServer(
  name = connect_server,
  url = "https://colorado.posit.co/rsc/__api__"
)

# Register the account on that server, authenticating with the API key from
# the environment.
rsconnect::connectApiUser(
  apiKey = Sys.getenv("CONNECT_API_KEY"),
  account = connect_account,
  server = connect_server
)

# Deploy the pinned model to Connect as an API, updating the existing content
# item identified by `appId`.
vetiver_deploy_rsconnect(
  board = board,
  name = pin_name,
  predict_args = list(debug = FALSE),
  appId = "11314",
  appTitle = "Bike Predict - Model - API",
  launch.browser = FALSE,
  account = connect_account,
  server = connect_server
)

```

```{r close_db_connection}
# Release the database connection opened at the top of the notebook.
DBI::dbDisconnect(conn = con)
```