
R - xgboost model predictions not reproducible when model object is loaded from disk #11169

delt87 opened this issue Jan 15, 2025

I have spent quite a bit of time going through the open and closed issues here, as well as similar questions and answers on Stack Overflow, trying to figure out why I keep getting different predictions from xgboost (on the same exact data) after I save a trained xgboost model to disk and then load it back into my R session to make predictions. The reprex below should illustrate what I'm seeing.

Some of the sources I've looked at include:

https://stackoverflow.com/questions/71401330/xgboost-not-providing-reproducible-predictions-after-saving-in-bin-format
https://github.com/dmlc/xgboost/issues/160
https://github.com/tqchen/xgboost/blob/master/R-package/demo/basic_walkthrough.R#L62
https://stackoverflow.com/questions/73097036/pre-trained-xgboost-model-does-not-reproduce-results-if-loaded-from-file

I've also tried saving the model in raw format and rebuilding it from that; saving the model in .RDS format; and re-setting the model parameters after loading. The issue persists in all cases.
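For reference, the raw-format round trip I attempted looked roughly like the sketch below. It is illustrated here with the agaricus toy data bundled with the xgboost package rather than my client data, just to show the save/rebuild pattern (`xgb.save.raw()` / `xgb.load.raw()`); on this toy example the round trip does reproduce predictions for me, which is what makes the behaviour in the full reprex so puzzling.

```r
library(xgboost)

# Toy data shipped with the package, used only to illustrate the round trip.
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

bst <- xgboost(data = dtrain, nrounds = 5,
               params = list(objective = "binary:logistic"),
               verbose = 0)

# Serialize the booster to an in-memory raw vector, then rebuild it.
raw_model <- xgb.save.raw(bst)
bst_rebuilt <- xgb.load.raw(raw_model)

# Compare predictions from the original and rebuilt boosters.
identical(predict(bst, newdata = dtrain),
          predict(bst_rebuilt, newdata = dtrain))
```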

Any guidance is greatly appreciated.

fake_client_data file required for reprex: https://drive.google.com/file/d/1ie8OQQbo2gCn3uhRWknTKvU1WPTfyqEs/view?usp=drive_link

system version:

> version
               _                                
platform       x86_64-w64-mingw32               
arch           x86_64                           
os             mingw32                          
crt            ucrt                             
system         x86_64, mingw32                  
status                                          
major          4                                
minor          4.1                              
year           2024                             
month          06                               
day            14                               
svn rev        86737                            
language       R                                
version.string R version 4.4.1 (2024-06-14 ucrt)
nickname       Race for Your Life  

xgboost version:

> packageVersion("xgboost")
[1] ‘2.0.3.1’

Reprex

library(furrr)
library(purrr)
library(dplyr)
library(future)
library(xgboost)
library(lubridate)
library(data.table)

options(digits = 5)
options(scipen = 999)
options(future.rng.onMisuse = "ignore")

set.seed(1451553)
fake_client_data <- fread("fake_client_data.csv")
testDate <- ymd("2024-10-30")
response_col <- "response"
train_cols <- setdiff(names(fake_client_data), c("Date","Client","response","monthly_earnings_bucket"))

best_grid <- data.frame(
  nrounds = 55,
  max_depth = 2,
  colsample_bytree = 1,
  eta = 0.1,
  gamma = 0.01,
  min_child_weight = 1,
  subsample = 1,
  lambda = 0.01,
  alpha = 0.01
)

make_best_model <- function(client_earnings, testDate, best_grid)
{
  ## make validation df
  val_set <- fake_client_data %>%
    dplyr::filter(Date >= testDate,
                  monthly_earnings_bucket == client_earnings) %>%
    dplyr::select(all_of(response_col), all_of(train_cols)) %>%
    dplyr::mutate_all(., ~as.numeric(.)) %>%
    data.frame(.)
  
  val_set <- na.omit(val_set)
  
  val_set <- xgb.DMatrix( as.matrix(val_set[,-1]), label = as.matrix(val_set[,1]) )
  
  ## make train df
  train_set <- fake_client_data %>%
    dplyr::filter(Date < testDate,
                  monthly_earnings_bucket == client_earnings) %>%
    dplyr::select(all_of(response_col), all_of(train_cols)) %>%
    dplyr::mutate_all(., ~as.numeric(.)) %>%
    data.frame(.)
  
  train_set <- na.omit(train_set)
  
  train_set <- xgb.DMatrix( as.matrix(train_set[,-1]), label = as.matrix(train_set[,1]) )
  
  best_model_parameters <- best_grid
  
  param <- list(objective = "reg:squarederror",
                max_depth = best_model_parameters[, "max_depth"],
                colsample_bytree = best_model_parameters[, "colsample_bytree"],
                eta = best_model_parameters[, "eta"],
                gamma = best_model_parameters[, "gamma"],
                min_child_weight = best_model_parameters[, "min_child_weight"],
                subsample = best_model_parameters[, "subsample"],
                alpha = best_model_parameters[, "alpha"],
                lambda = best_model_parameters[, "lambda"],
                tree_method = 'hist',
                device = "cuda")
  
  xgb_model_best <- xgboost(data = train_set,
                            params = param,
                            nrounds = best_model_parameters[, "nrounds"],
                            verbose = 0)
  
  xgb_model_best <- xgb.Booster.complete(xgb_model_best)
  
  xgb_preds <- predict(xgb_model_best, newdata = val_set)
  
  validation_set <- fake_client_data %>%
    dplyr::filter(Date >= testDate,
                  monthly_earnings_bucket == client_earnings ) %>%
    na.omit() %>%
    dplyr::mutate(xgb_preds = xgb_preds) %>%
    dplyr::select(Date,Client,xgb_preds)
  
  final_list <- list(
    validation_set,
    xgb_model_best
  )
  
  names(final_list) <- c(
    "validation_set",
    "model"
  )
  
  return(final_list)
}

plan(multisession)

all_client_earnings <- fake_client_data %>%
  dplyr::filter(Date >= testDate) %>%
  pluck("monthly_earnings_bucket") %>%
  unique()

all_client_xgb_preds <- future_map(
  .x = all_client_earnings,
  .f = ~make_best_model(client_earnings = .x, testDate = testDate, best_grid = best_grid)
)

base::closeAllConnections()
gc()

final_preds_future_map <- rbind(
  all_client_xgb_preds[[1]]$validation_set,
  all_client_xgb_preds[[2]]$validation_set,
  all_client_xgb_preds[[3]]$validation_set,
  all_client_xgb_preds[[4]]$validation_set,
  all_client_xgb_preds[[5]]$validation_set,
  all_client_xgb_preds[[6]]$validation_set
)

xgb_model_best_future_map <- all_client_xgb_preds[[1]]$model
xgb_model_best_future_map <- xgb.Booster.complete(xgb_model_best_future_map)

## save model
xgb.save(xgb_model_best_future_map, "xgb_model_best_future_map.json")
rm(xgb_model_best_future_map,all_client_xgb_preds)
gc()

## load future_map model from disk
xgb_model_best_future_map <- xgb.load("xgb_model_best_future_map.json")

val_set <- fake_client_data %>%
  dplyr::filter(Date >= testDate) %>%
  dplyr::select(all_of(response_col), all_of(train_cols)) %>%
  dplyr::mutate_all(., ~as.numeric(.)) %>%
  data.frame(.)
val_set <- na.omit(val_set)
val_set <- xgb.DMatrix( as.matrix(val_set[,-1]), label = as.matrix(val_set[,1]) )

xgb_preds <- predict(xgb_model_best_future_map, newdata = val_set)

final_preds_future_map_disk <- fake_client_data %>%
  dplyr::filter(Date >= testDate) %>%
  dplyr::select(Date,Client) %>%
  na.omit() %>%
  dplyr::mutate(xgb_preds = xgb_preds)

## compare "live" future_map preds vs. preds made after loading best model from disk
identical(final_preds_future_map$xgb_preds, final_preds_future_map_disk$xgb_preds)  #FALSE
all.equal(final_preds_future_map$xgb_preds, final_preds_future_map_disk$xgb_preds)  #"Mean relative difference: 0.35698"

#### build model sequential to see if maybe using future_map has something to do with the differences #####
final_preds_sequential <- vector("list", length = length(all_client_earnings))
final_model_sequential <- vector("list", length = length(all_client_earnings))

for (i in seq_along(all_client_earnings))
{
  client_earnings <- all_client_earnings[i]
  
  ## make validation df
  val_set <- fake_client_data %>%
    dplyr::filter(Date >= testDate,
                  monthly_earnings_bucket == client_earnings) %>%
    dplyr::select(all_of(response_col), all_of(train_cols)) %>%
    dplyr::mutate_all(., ~as.numeric(.)) %>%
    data.frame(.)
  val_set <- na.omit(val_set)
  val_set <- xgb.DMatrix( as.matrix(val_set[,-1]), label = as.matrix(val_set[,1]) )
  
  ## make train df
  train_set <- fake_client_data %>%
    dplyr::filter(Date < testDate,
                  monthly_earnings_bucket == client_earnings) %>%
    dplyr::select(all_of(response_col), all_of(train_cols)) %>%
    dplyr::mutate_all(., ~as.numeric(.)) %>%
    data.frame(.)
  train_set <- na.omit(train_set)
  train_set <- xgb.DMatrix( as.matrix(train_set[,-1]), label = as.matrix(train_set[,1]) )
  
  param <- list(objective = "reg:squarederror",
                max_depth = best_grid$max_depth,
                colsample_bytree = best_grid$colsample_bytree,
                eta = best_grid$eta,
                gamma = best_grid$gamma,
                min_child_weight = best_grid$min_child_weight,
                subsample = best_grid$subsample,
                alpha = best_grid$alpha,
                lambda = best_grid$lambda,
                tree_method = 'hist',
                device = "cuda")
  
  xgb_model_best <- xgboost(data = train_set,
                            params = param,
                            nrounds = best_grid$nrounds,
                            verbose = 0)
  
  xgb_model_best <- xgb.Booster.complete(xgb_model_best)
  
  xgb_preds <- predict(xgb_model_best, newdata = val_set)
  
  validation_set <- fake_client_data %>%
    dplyr::filter(Date >= testDate,
                  monthly_earnings_bucket == client_earnings ) %>%
    na.omit() %>%
    dplyr::mutate(xgb_preds = xgb_preds) %>%
    dplyr::select(Date,Client,xgb_preds)
  
  final_preds_sequential[[i]] <- validation_set
  final_model_sequential[[i]] <- xgb_model_best
}

final_preds_sequential <- rbindlist(final_preds_sequential)
xgb_model_best_sequential <- final_model_sequential[[1]]

## save model
xgb.save(xgb_model_best_sequential, "xgb_model_best_sequential.json")
rm(xgb_model_best_sequential,xgb_preds,final_model_sequential)
gc()

## load sequential model from disk
xgb_model_best_sequential <- xgb.load("xgb_model_best_sequential.json")

val_set <- fake_client_data %>%
  dplyr::filter(Date >= testDate) %>%
  dplyr::select(all_of(response_col), all_of(train_cols)) %>%
  dplyr::mutate_all(., ~as.numeric(.)) %>%
  data.frame(.)
val_set <- na.omit(val_set)
val_set <- xgb.DMatrix( as.matrix(val_set[,-1]), label = as.matrix(val_set[,1]) )

xgb_preds <- predict(xgb_model_best_sequential, newdata = val_set)

final_preds_sequential_disk <- fake_client_data %>%
  dplyr::filter(Date >= testDate) %>%
  dplyr::select(Date,Client) %>%
  na.omit() %>%
  dplyr::mutate(xgb_preds = xgb_preds)

## compare "live" sequential model preds vs. sequential model preds loading from disk
identical(final_preds_sequential$xgb_preds, final_preds_sequential_disk$xgb_preds)  #FALSE
all.equal(final_preds_sequential$xgb_preds, final_preds_sequential_disk$xgb_preds)  #"Mean relative difference: 0.35698"

## now, compare "live" sequential model preds vs. "live" future_map preds
## building model sequentially vs. in parallel is not the reason why predictions are different when loading xgb model from disk
identical(final_preds_sequential$xgb_preds, final_preds_future_map$xgb_preds)  #TRUE
all.equal(final_preds_sequential$xgb_preds, final_preds_future_map$xgb_preds)  #TRUE