The issue here is that you used `step_center()` to transform the outcome (`price_per_sqm_huf_mil`), and at prediction time there is no outcome available. You can instead specify that you want to center only the columns selected by `all_predictors() & all_numeric()`, like this:
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom     0.7.3      ✓ recipes   0.1.15
#> ✓ dials     0.0.9      ✓ rsample   0.0.8
#> ✓ dplyr     1.0.3      ✓ tibble    3.0.5
#> ✓ ggplot2   3.3.3      ✓ tidyr     1.1.2
#> ✓ infer     0.5.4      ✓ tune      0.1.2
#> ✓ modeldata 0.1.0      ✓ workflows 0.2.1
#> ✓ parsnip   0.1.5      ✓ yardstick 0.0.7
#> ✓ purrr     0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
library(dplyr)
# Small example data set, one row per listing. Columns, as named in the
# header row: a character id (`re_id`), the numeric outcome
# (`price_per_sqm_huf_mil`), a district number and a room count.
real_estate_data <-
  tibble::tribble(
    ~re_id,     ~price_per_sqm_huf_mil, ~district, ~num_room,
    "30876343", 0.534722222222222,      1,         3,
    "31914489", 0.476119402985075,      1,         1,
    "30972289", 0.507352941176471,      1,         2,
    "31739730", 0.472972972972973,      1,         3,
    "31783137", 0.49875,                2,         3,
    "31809435", 0.439705882352941,      2,         2,
    "31943408", 0.469117647058824,      2,         3,
    "31944348", 0.56231884057971,       2,         1,
    "31961146", 0.472972972972973,      3,         3,
    "24314388", 0.649550561797753,      3,         2,
    "29840270", 0.719178082191781,      3,         3,
    "29840429", 0.719178082191781,      3,         3,
    "30873484", 0.822857142857143,      4,         3,
    "30969673", 0.533802816901408,      4,         3,
    "31333120", 0.741511627906977,      4,         3,
    "31788730", 0.527142857142857,      4,         2,
    "31948441", 0.734848484848485,      5,         2,
    "31962350", 0.8,                    5,         3,
    "31962779", 0.670454545454545,      5,         3,
    "31979128", 0.689054054054054,      5,         1
  ) %>%
  # The district is entered as a number but is a label, not a quantity:
  # store it as a factor so it is handled as a categorical variable.
  mutate(district = factor(district))
# Fix the RNG seed so the random train/test partition is reproducible.
set.seed(123)

# Randomly partition the rows; 3/4 go to training, which is the default
# `prop` of initial_split(), spelled out here for readability.
re_split <- real_estate_data %>%
  initial_split(prop = 3 / 4)

# Pull the two partitions out of the split object.
re_train <- re_split %>% training()
re_test <- re_split %>% testing()
# Preprocessing recipe: model the price as a function of every other column
# of the training set.
re_rec <-
  recipe(price_per_sqm_huf_mil ~ ., data = re_train) %>%
  # `re_id` only identifies a row; give it a non-predictor role so the
  # `all_predictors()` selectors below leave it alone.
  update_role(re_id, new_role = "ID") %>%
  # Center and scale only columns that are both predictors and numeric.
  # The outcome is deliberately excluded: it does not exist in new data
  # at prediction time.
  step_center(all_predictors() & all_numeric()) %>%
  step_scale(all_predictors() & all_numeric()) %>%
  # Expand the district factor into indicator columns.
  step_dummy(district) %>%
  # Drop any predictor that has a single unique value in the training set.
  step_zv(all_predictors())

# Inspect the role assigned to each original column.
summary(re_rec)
#> # A tibble: 4 x 4
#>   variable              type    role      source
#>   <chr>                 <chr>   <chr>     <chr>
#> 1 re_id                 nominal ID        original
#> 2 district              nominal predictor original
#> 3 num_room              numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome   original
# Model specification: linear regression fitted with the "lm" engine.
lr_model <- set_engine(linear_reg(), "lm")

# Bundle the preprocessing recipe and the model specification into a single
# workflow so they are fitted and applied together.
re_wflow <- workflow() %>%
  add_model(lr_model) %>%
  add_recipe(re_rec)

# Fit the whole workflow (recipe + model) on the training partition.
re_fit <- fit(re_wflow, data = re_train)
# Predict on the held-out rows; the fitted workflow applies the recipe to
# `re_test` before calling the model.
re_fit %>%
  predict(new_data = re_test)
#> # A tibble: 5 x 1
#>   .pred
#>   <dbl>
#> 1 0.486
#> 2 0.611
#> 3 0.688
#> 4 0.688
#> 5 0.768
Created on 2021-01-25 by the reprex package (v0.3.0)
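This has tripped up more folks than just you, so we are working on adding a new set of selectors that will be merged in soon. The other option to think about, if you really do want to transform an outcome, is to look into using `skip = TRUE` on that step, so that the step is applied when the recipe is trained but skipped when new data is processed at prediction time.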