Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
358 views
in Technique[技术] by (71.8m points)

r - Error: Can't subset columns that don't exist when running prediction using {Tidymodels}

I'm trying to predict real estate prices in R with Tidymodels. I'm following this tutorial. All goes well until the very end, when I try to run prediction on my test data.

Please see the below code example and the error at the very end.

I looked at two similar questions (here and here) but it seems that I have defined variable roles and provided an unprepared recipe to my workflow.

    # libraries ---------------------------------------------------------------
    library(tidymodels)
    #> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
    #> ✓ broom     0.7.3      ✓ recipes   0.1.15
    #> ✓ dials     0.0.9      ✓ rsample   0.0.8 
    #> ✓ dplyr     1.0.3      ✓ tibble    3.0.5 
    #> ✓ ggplot2   3.3.3      ✓ tidyr     1.1.2 
    #> ✓ infer     0.5.4      ✓ tune      0.1.2 
    #> ✓ modeldata 0.1.0      ✓ workflows 0.2.1 
    #> ✓ parsnip   0.1.5      ✓ yardstick 0.0.7 
    #> ✓ purrr     0.3.4
    #> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
    #> x purrr::discard() masks scales::discard()
    #> x dplyr::filter()  masks stats::filter()
    #> x dplyr::lag()     masks stats::lag()
    #> x recipes::step()  masks stats::step()
    library(data.table)
    
    library(purrr)
    
    
    # data --------------------------------------------------------------------
    # 're' means real estate
    # I'm using data.table in general. Using tribble below for cleaner data definition.
    # Columns: re_id (identifier, entered as a character string),
    # price_per_sqm_huf_mil (the outcome of the model formula further down;
    # presumably price per square metre in million HUF -- TODO confirm units),
    # district and num_room (both entered as plain numbers).
    real_estate_data <- tibble::tribble(
        ~re_id, ~price_per_sqm_huf_mil, ~district, ~num_room,
        "30876343",      0.534722222222222,        1,         3,
        "31914489",      0.476119402985075,        1,         1,
        "30972289",      0.507352941176471,        1,         2,
        "31739730",      0.472972972972973,        1,         3,
        "31783137",                0.49875,        2,         3,
        "31809435",      0.439705882352941,        2,         2,
        "31943408",      0.469117647058824,        2,         3,
        "31944348",       0.56231884057971,        2,         1,
        "31961146",      0.472972972972973,        3,         3,
        "24314388",      0.649550561797753,        3,         2,
        "29840270",      0.719178082191781,        3,         3,
        "29840429",      0.719178082191781,        3,         3,
        "30873484",      0.822857142857143,        4,         3,
        "30969673",      0.533802816901408,        4,         3,
        "31333120",      0.741511627906977,        4,         3,
        "31788730",      0.527142857142857,        4,         2,
        "31948441",      0.734848484848485,        5,         2,
        "31962350",                    0.8,        5,         3,
        "31962779",      0.670454545454545,        5,         3,
        "31979128",      0.689054054054054,        5,         1
    )
    
    # Convert the tibble to a data.table, then replace district with a factor
    # (data.table `:=` assignment) so it is treated as categorical, not numeric.
    real_estate_data <- as.data.table(real_estate_data) %>% .[, district := factor(district)]
    
    # train/test split --------------------------------------------------------
    # Seed fixed before the split so the reprex gives the same partition each run.
    set.seed(123)
    re_split <- initial_split(real_estate_data)
    re_train <- training(re_split)
    re_test  <- testing(re_split)
    
    # workflow (w/ recipe) ----------------------------------------------------
    # The formula makes price_per_sqm_huf_mil the outcome and every other column
    # a predictor; re_id is then moved to an "ID" role (see the summary below).
    # NOTE(review): all_numeric() is not restricted to predictors, so
    # step_center() -- and presumably step_scale(), whose comma-separated
    # selectors appear to be combined -- also selects the numeric outcome.
    # TODO confirm; `all_predictors() & all_numeric()` would limit both steps to
    # numeric predictors.
    re_rec <- recipe(re_train,
                     formula = price_per_sqm_huf_mil ~ .) %>%
        update_role(re_id, new_role = "ID") %>%
        step_center(all_numeric(), - district) %>%
        step_scale(all_predictors(), all_numeric(), - district) %>%
        step_dummy(district) %>%
        step_zv(all_predictors())
    
    # Roles as recorded by the (still unprepared) recipe.
    summary(re_rec)
    #> # A tibble: 4 x 4
    #>   variable              type    role      source  
    #>   <chr>                 <chr>   <chr>     <chr>   
    #> 1 re_id                 nominal ID        original
    #> 2 district              nominal predictor original
    #> 3 num_room              numeric predictor original
    #> 4 price_per_sqm_huf_mil numeric outcome   original
    
    # Linear regression specification using the "lm" engine.
    lr_model <-
        linear_reg() %>%
        set_engine("lm")
    
    # Bundle the model specification and the recipe into a single workflow.
    re_wflow <-
        workflow() %>%
        add_model(lr_model) %>%
        add_recipe(re_rec)
    
    # model training and prediction -------------------------------------------
    # Fit the whole workflow (recipe + model) on the training set; no error is
    # reported for this step.
    re_fit <-
        re_wflow %>%
        fit(data = re_train)
    
    # Predicting on the test set is where the reprex fails. The column reported
    # as missing is the outcome named in the recipe formula.
    re_pred <- predict(re_fit, re_test)
    #> Error: Can't subset columns that don't exist.
    #> x Column `price_per_sqm_huf_mil` doesn't exist.

Created on 2021-01-25 by the reprex package (v0.3.0)

Many thanks!

question from:https://stackoverflow.com/questions/65890264/error-cant-subset-columns-that-dont-exist-when-running-prediction-using-tidy

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Answer

0 votes
by (71.8m points)

The issue here is that you used step_center() to transform the outcome (price_per_sqm_huf_mil) and at prediction time, there is no outcome available. You can instead specify that you want to center all_predictors() & all_numeric() like this:

library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom     0.7.3      ✓ recipes   0.1.15
#> ✓ dials     0.0.9      ✓ rsample   0.0.8 
#> ✓ dplyr     1.0.3      ✓ tibble    3.0.5 
#> ✓ ggplot2   3.3.3      ✓ tidyr     1.1.2 
#> ✓ infer     0.5.4      ✓ tune      0.1.2 
#> ✓ modeldata 0.1.0      ✓ workflows 0.2.1 
#> ✓ parsnip   0.1.5      ✓ yardstick 0.0.7 
#> ✓ purrr     0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter()  masks stats::filter()
#> x dplyr::lag()     masks stats::lag()
#> x recipes::step()  masks stats::step()
library(dplyr)

# Example data: one row per property, with an identifier (re_id), the outcome
# (price_per_sqm_huf_mil) and two candidate predictors (district, num_room).
real_estate_data <- tibble::tribble(
  ~re_id, ~price_per_sqm_huf_mil, ~district, ~num_room,
  "30876343",      0.534722222222222,        1,         3,
  "31914489",      0.476119402985075,        1,         1,
  "30972289",      0.507352941176471,        1,         2,
  "31739730",      0.472972972972973,        1,         3,
  "31783137",                0.49875,        2,         3,
  "31809435",      0.439705882352941,        2,         2,
  "31943408",      0.469117647058824,        2,         3,
  "31944348",       0.56231884057971,        2,         1,
  "31961146",      0.472972972972973,        3,         3,
  "24314388",      0.649550561797753,        3,         2,
  "29840270",      0.719178082191781,        3,         3,
  "29840429",      0.719178082191781,        3,         3,
  "30873484",      0.822857142857143,        4,         3,
  "30969673",      0.533802816901408,        4,         3,
  "31333120",      0.741511627906977,        4,         3,
  "31788730",      0.527142857142857,        4,         2,
  "31948441",      0.734848484848485,        5,         2,
  "31962350",                    0.8,        5,         3,
  "31962779",      0.670454545454545,        5,         3,
  "31979128",      0.689054054054054,        5,         1
)

# district is a label rather than a quantity, so store it as a factor.
real_estate_data <- mutate(real_estate_data, district = factor(district))


# Reproducible train/test split.
set.seed(123)
re_split <- initial_split(real_estate_data)
re_train <- training(re_split)
re_test <- testing(re_split)

# Preprocessing recipe. price_per_sqm_huf_mil is the outcome and every other
# column starts out as a predictor; re_id is then moved to an "ID" role.
# Centering and scaling are limited to columns that are both predictors and
# numeric, so the outcome is never transformed and does not have to be present
# at prediction time.
re_rec <-
  recipe(price_per_sqm_huf_mil ~ ., data = re_train) %>%
  update_role(re_id, new_role = "ID") %>%
  step_center(all_predictors() & all_numeric()) %>%
  step_scale(all_predictors() & all_numeric()) %>%
  step_dummy(district) %>%
  step_zv(all_predictors())

# Inspect the roles assigned to each original column.
summary(re_rec)
#> # A tibble: 4 x 4
#>   variable              type    role      source  
#>   <chr>                 <chr>   <chr>     <chr>   
#> 1 re_id                 nominal ID        original
#> 2 district              nominal predictor original
#> 3 num_room              numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome   original

# Linear regression specification fitted through the "lm" engine.
lr_model <- set_engine(linear_reg(), engine = "lm")

# Assemble the workflow: model specification first, then the unprepared recipe.
re_wflow <- workflow()
re_wflow <- add_model(re_wflow, spec = lr_model)
re_wflow <- add_recipe(re_wflow, recipe = re_rec)

# Fit recipe and model together on the training rows.
re_fit <- fit(re_wflow, data = re_train)

# Predict for the held-out rows; the fitted recipe is applied to re_test first.
predict(re_fit, new_data = re_test)
#> # A tibble: 5 x 1
#>   .pred
#>   <dbl>
#> 1 0.486
#> 2 0.611
#> 3 0.688
#> 4 0.688
#> 5 0.768

Created on 2021-01-25 by the reprex package (v0.3.0)

This has tripped up more folks than you so we are working on adding a new set of selectors that will be merged in soon. The other option to think about, if you really do want to try transforming an outcome, is to look into using skip = TRUE.


与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

2.1m questions

2.1m answers

60 comments

57.0k users

...