Simulation
ALM
R
Published: January 9, 2024

Code
source(here::here("Functions", "packages.R"))
test <- readRDS(here("data/e1_08-21-23.rds")) |>  filter(expMode2 == "Test") |>
  select(id,condit,bandInt,vb,vx,dist,sdist,bandType)

e1_vxBMM <- brm(vx ~ condit * bandInt + (1 + bandInt|id),
                        data=test,file=paste0(here::here("data/model_cache", "e1_testVxBand_RF_5k")))
new_data_grid=map_dfr(1, ~data.frame(unique(test[,c("id","condit","bandInt")])))


t1 <- system.time({
tidy_pred <- test |> add_predicted_draws(e1_vxBMM) |> mutate(.residual = vx - .prediction); 
predict_per_row <- tidy_pred |> group_by(.row) |> mean_hdi(.prediction,.residual)

})

m1 <- system.time({
  predict_vx <- test |> cbind(predict(e1_vxBMM, test)) |> mutate(resid=vx-Estimate)
}
)

cat(paste0("tidybayes preds = ",round(t1["elapsed"],2)," \n ",
           "predict.brmsfit = ",round(m1["elapsed"],2)))


predict_vx |> group_by(id,condit,bandInt) |> summarise(mean(vx),mean(Estimate),mean(resid))

tidybayes preds = 169.94
predict.brmsfit = 10.27
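Most of the gap comes from add_predicted_draws() materializing every posterior draw for every test row before summarizing, whereas predict.brmsfit() summarizes internally. If the full set of draws isn't needed, subsampling them should narrow the gap; a minimal sketch, assuming a recent tidybayes where add_predicted_draws() accepts an ndraws argument (500 is an arbitrary choice):

Code
t1b <- system.time({
  tidy_pred_sub <- test |>
    add_predicted_draws(e1_vxBMM, ndraws = 500) |>  # subsample of posterior draws
    mutate(.residual = vx - .prediction)
  predict_per_row_sub <- tidy_pred_sub |> group_by(.row) |> mean_hdi(.prediction, .residual)
})
t1b["elapsed"]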

https://cran.r-project.org/web/packages/brms/vignettes/brms_threading.html

https://discourse.mc-stan.org/t/using-the-apple-m1-gpus-question-from-a-noob/23089/39?page=2
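For reference, the threading runs below assume the cmdstanr backend is installed and its toolchain works; a sketch of that setup (one-time steps commented out, details depend on the cmdstanr/CmdStan versions described in the links above):

Code
# assumed one-time setup for within-chain threading (see the links above)
cmdstanr::check_cmdstan_toolchain(fix = TRUE)
# cmdstanr::install_cmdstan(cores = 4)   # only needed if CmdStan isn't installed yet
options(brms.backend = "cmdstanr", mc.cores = 4)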

Code
pacman::p_load(tidyverse,tidybayes,brms,broom,broom.mixed,lme4,here,knitr,gt,gghalves,patchwork,ggdist,microbenchmark)
e1 <- readRDS(here("data/e1_08-21-23.rds"))
test <- e1 |> filter(expMode2 == "Test")  

options(mc.cores = 4, brms.backend = "cmdstanr")

n_cores=4
bayes_seed <- 1234
n_iter=20000
n_threads=2


gprior<- c( prior(normal(800, 100), class = Intercept),
    prior(normal(400, 10), class = sigma)
  )


##############

tf3 <- system.time(
fit_int_norm <- brm(vx ~ 1 + condit, 
  data = test,
  family = gaussian(),
  iter=n_iter,
  prior = gprior,
  silent=2,
  cores=n_cores,
  threads = threading(n_threads),
  seed=bayes_seed
)
)
tf3

tf4 <- system.time(
fit_int_norm <- brm(vx ~ 1 + condit +(1|bandInt) + (1 + bandInt|id),
  data = test |> filter(id %in% 1:15),
  family = gaussian(),
  iter=n_iter,
  prior = gprior,
  silent=2,
  cores=n_cores,
  threads = threading(n_threads),
  seed=bayes_seed
) )
tf4


#########


tf <- system.time(
fit_int_norm <- brm(vx ~ 1 + condit, 
  data = test,
  family = gaussian(),
  iter=n_iter,
  prior = gprior,
  silent=2,
  cores=n_cores,
  seed=bayes_seed
)
)
tf

tf2 <- system.time(
fit_int_norm <- brm(vx ~ 1 + condit +(1|bandInt) + (1 + bandInt|id),
  data = test |> filter(id %in% 1:15),
  family = gaussian(),
  iter=n_iter,
  prior = gprior,
  silent=2,
  cores=n_cores,
  seed=bayes_seed
)
)
tf2


##########


cat(paste0("int only gaussian = ",round(tf["elapsed"],2)," \n ",
          # "foreach parallel time=",round(tfP["elapsed"],2)," \n ",
           "irt version = ",round(tf2["elapsed"],2), " \n ",
           "int only threading = ",round(tf3["elapsed"],2)," \n ",
          "irt  threading = ",round(tf4["elapsed"],2)," \n "
            
          ))

All times are elapsed seconds from system.time(); apart from the iteration count, thread count, and core count shown, brms defaults were used. "int only" is the simple Gaussian model (tf / tf3) and "irt" is the hierarchical model with the bandInt and id random effects (tf2 / tf4). Blank cells were not recorded for that run.

| Iterations | Threads | Cores | Notes | int only | irt | int only + threading | irt + threading |
|---|---|---|---|---|---|---|---|
| 2K | | | | 6.85 | 12.15 | | |
| 2K | | | | 7.51 | 12 | | |
| 2K | | | | 6.67 | 11.82 | 7.49 | 11.33 |
| 4K | 2 | | | 8.1 | 14.69 | 6.99 | 15.24 |
| 4K | 4 | | | 8.11 | 15.05 | 7.28 | 16.95 |
| 4K | 4 | 2 | | 11.27 | 21.72 | 8.39 | 22.67 |
| 4K | 2 | 2 | | 11.27 | 21.81 | 9.38 | 20.63 |
| 4K | 8 | 4 | | 8.27 | 15.4 | 7.69 | 21.28 |
| 4K | 8 | 4 | | 8.37 | 15.18 | 8.02 | 21.47 |
| 4K | 8 | 8 | | 8.11 | 14.88 | 7.92 | 20.72 |
| 10K | 8 | 8 | | 11.38 | 23.64 | 11.58 | 37.1 |
| 10K | 8 | 8 | | 11.53 | 23.76 | 12 | 37.45 |
| 10K | 8 | 4 | | 11.48 | 23.29 | 10.91 | 36.54 |
| 10K | 4 | 4 | | 11.99 | 24.25 | 10.13 | 29.13 |
| 10K | 4 | 1 | | 29.35 | 66.61 | 18.97 | 71.21 |
| 10K | 2 | 2 | | 17.98 | 38.31 | 13.87 | 37.78 |
| 10K | 1 | 4 | | 11.14 | 22.89 | 12.25 | 27.03 |
| 10K | 1 | 4 | | 11.35 | 22.99 | 12.28 | 27.38 |
| 10K | 2 | 4 | | 11.22 | 23.07 | 9 | 22.34 |
| 10K | 2 | 4 | | 11.31 | 23.14 | 9.49 | 22.21 |
| 10K | 3 | 4 | | 11.15 | 22.88 | 9.4 | 26.75 |
| 10K | 3 | 4 | | 11 | 22.88 | 9.5 | 25.7 |
| 20K | 1 | 4 | | 18.24 | 40.37 | 20.32 | 48.94 |
| 20K | 1 | 4 | | 17.8 | 39.26 | 20.54 | 48.14 |
| 20K | 1 | 4 | threading specification commented out | 18.12 | 40.31 | 20.6 | 40.02 |
| 20K | 2 | 4 | | 17.82 | 39.34 | 14.22 | 37.73 |
| 20K | 2 | 4 | adding condit factor | 7.78 | 45.66 | 8.57 | 45.97 |
| 20K | 3 | 4 | adding condit factor | 8.58 | 47.07 | 10.71 | 57.87 |
| 20K | 2 | 4 | condit factor and bandInt random slope | 7.85 | 80.71 | 8.69 | 88.3 |

Overall pattern: within-chain threading gives a modest speedup for the simple model, with the largest gain when physical cores are scarce (29.35 vs. 18.97 s on a single core), while for the hierarchical model it rarely helps and clearly hurts at high thread counts (e.g. 23.64 vs. 37.1 s with 8 threads and 8 cores).
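The hierarchical (irt) model often gets slower as threads are added. The brms threading vignette linked earlier suggests tuning how the likelihood is chunked for reduce_sum via the grainsize argument of threading(); a sketch, with the grainsize value a placeholder to benchmark rather than a recommendation:

Code
tf5 <- system.time(
fit_irt_grain <- brm(vx ~ 1 + condit + (1 | bandInt) + (1 + bandInt | id),
  data = test |> filter(id %in% 1:15),
  family = gaussian(),
  iter = n_iter,
  prior = gprior,
  silent = 2,
  cores = n_cores,
  threads = threading(n_threads, grainsize = 100),  # placeholder grainsize; tune per the vignette
  seed = bayes_seed
)
)
tf5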

Code
pacman::p_load(tidyverse,foreach,doParallel,future,furrr,here)
purrr::walk(c(here("Functions/alm_functions.R","Functions/Display_Functions.R")),source)

n_cores <- parallel::detectCores()

param_grid <- tibble(crossing(
  c = seq(.5,5,.25),
  lr = seq(0.01, 1,.1),
  noise_sd = c(0,.0001,0.001, 0.01),
  inNodes = c(5, 7,14,28),
  outNodes = c(16, 32,64)
))
nrow(param_grid)

gen_train <- function(trainVec=c(5,6,7),trainRep=3,noise=0){
  bandVec=c(0,100,350,600,800,1000,1200)
  if(class(trainVec)=="list"){trainVec=unlist(trainVec)}
  ts <- rep(seq(1,length(trainVec)),trainRep)
  noiseVec=rnorm(length(ts),mean=0)*noise
  if(noise==0) {noiseVec=noiseVec*0}
  tibble(trial=seq(1,length(ts)),input=trainVec[ts],vx=bandVec[trainVec[ts]]+noiseVec)
}
fit_alm <- function(data, c, lr, noise_sd, inNodes, outNodes) {
  mse_list <- replicate(5, {
    train_data <- data[, c("trial", "input", "cor")] %>% rename("vx" = cor)
    sim_result <- sim_train(
      dat = train_data,
      c = c,
      lr = lr,
      inNodes = inNodes,
      outNodes = outNodes,
      noise_sd = noise_sd
    )
    train_data$almTrain <- sim_result$almTrain
    mse <- mean((data$vx - train_data$almTrain)^2)
    mse
  })
  avg_mse <- mean(mse_list)
  return(avg_mse)
}

gt <- gen_train(trainVec=c(5,6,7),trainRep=8) %>% mutate(cor=vx,err=(800-0)*exp(-.1*seq(1,n()))+0,vx=cor-err)


furrr::furrr_options(seed = TRUE)
plan(multisession, workers = n_cores-1)
tff <- system.time({

param_grid <- param_grid %>% mutate(performance = future_map_dbl(seq_len(nrow(.)), function(idx) {
    fit_alm(gt, c = c[idx], lr = lr[idx], noise_sd = noise_sd[idx], inNodes = inNodes[idx], outNodes = outNodes[idx])
  },.options = furrr_options(seed = T)))
  best_paramsF <- param_grid %>%
    arrange((performance)) 
  bestF <- head(best_paramsF,1)
})


# cluster <- parallel::makeCluster(n_cores-1)                 
# doParallel::registerDoParallel(cluster)
# tfP <- system.time({
#   param_grid <- param_grid %>%
#     mutate(performance = foreach(idx = seq_len(nrow(.)), .combine = c) %dopar% {
#       fit_alm(gt, c = c[idx], lr = lr[idx], noise_sd = noise_sd[idx], inNodes = inNodes[idx], outNodes = outNodes[idx])
#     })
#   best_paramsP <- param_grid %>%
#     arrange((performance)) 
#   bestP <- head(best_paramsP,1)
#   })
# stopImplicitCluster()
#   tfP


tI <- system.time({
gt <- gen_train(trainVec=c(5,6,7),trainRep=8) %>% mutate(cor=vx,err=(600-0)*exp(-.1*seq(1,n()))+0,vx=cor-err)

param_grid <- param_grid %>%
  mutate(performance = map_dbl(seq_len(nrow(.)), ~ {
    fit_alm(gt, c = c[.x], lr = lr[.x], noise_sd = noise_sd[.x], inNodes = inNodes[.x], outNodes = outNodes[.x])
  }))
best_paramsI <- param_grid %>%
  arrange((performance)) 
bestI <- head(best_paramsI,1)
})

tI




cat(paste0("furr time=",round(tff["elapsed"],2)," \n ",
          # "foreach parallel time=",round(tfP["elapsed"],2)," \n ",
           " Standard Time=",round(tI["elapsed"],2)))

4.2 to 6.2 times faster with furrr on the M1.

| Runs | Machine | Cores | Method | Time (seconds) |
|---|---|---|---|---|
| 9120 | M1 | 9/10 | furrr | 9.56 |
| 9120 | M1 | 9/10 | Standard | 58.8 |
| 912 | M1 | 8/10 | Standard | 5.7 |
| 912 | M1 | 8/10 | Parallel | 1.2 |
| 912 | M1 | 9/10 | furrr | 1.3 |
| 912 | 2015 iMac | 2/4 | Standard | 26.2 |
| 912 | 2015 iMac | 2/4 | Parallel | 18.5 |
| 912 | 2015 iMac | 3/4 | furrr | 12.88 |
| 912 | GTX | 3/4 | Standard | 25.32 |
| 912 | GTX | 3/4 | furrr | 13.1 |
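As an aside, the row-index pattern used above (future_map_dbl over seq_len(nrow(.))) can be written more directly with future_pmap_dbl over the parameter columns themselves; a sketch assuming the same fit_alm() and gt objects:

Code
plan(multisession, workers = n_cores - 1)
param_grid <- param_grid %>%
  mutate(performance = future_pmap_dbl(
    list(c, lr, noise_sd, inNodes, outNodes),
    ~ fit_alm(gt, c = ..1, lr = ..2, noise_sd = ..3, inNodes = ..4, outNodes = ..5),
    .options = furrr_options(seed = TRUE)
  ))
plan(sequential)  # release the workers when finished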

The furrr and standard versions run equally fast here, tested with up to 120 simulation repetitions.

Code
library(furrr)
furrr::furrr_options(seed = TRUE)
plan(multisession, workers = parallel::detectCores())

parmVec <- tibble(crossing(c = c(0.1,.5), lr = c(0.4), noise = c(500), trainRep = c(20), lossFun = list("RMSE", "RMSE.blocked"), simNum = 1:30))
tf <- system.time(
sdpf <- parmVec %>% 
  mutate(d = future_pmap(list(c, lr, noise, trainRep), ~sim_data(c = ..1, lr = ..2, noise = ..3, trainRep = ..4),
                         .options = furrr_options(seed = T)),
                           almTrainDat = map(d, "almTrain"),
                          almTestDat = map(d, "almPred"),
                          examTestDat = map(d, "examPred"),
                          td = map(trainRep, ~gen_train(trainRep = .)),
                          fitO = map2(td, lossFun, ~wrap_optim(.x, .y)),
                          fitG = map2(td, lossFun, ~wrap_grid(.x, .y)),
                          cFitO = map_dbl(fitO, "c"),
                          lrFitO = map_dbl(fitO, "lr"),
                          optimValO = map_dbl(fitO, "Value"),
                          cFitG = map_dbl(fitG, "c"),
                          lrFitG = map_dbl(fitG, "lr"),
                          optimValG = map_dbl(fitG, "Value"))
)
tf

ts <- system.time(
sdp <- parmVec %>% mutate(d = pmap(list(c, lr, noise, trainRep), ~sim_data(c = ..1, lr = ..2, noise = ..3, trainRep = ..4)),
                          almTrainDat = map(d, "almTrain"),
                          almTestDat = map(d, "almPred"),
                          examTestDat = map(d, "examPred"),
                          td = map(trainRep, ~gen_train(trainRep = .)),
                          fitO = map2(td, lossFun, ~wrap_optim(.x, .y)),
                          fitG = map2(td, lossFun, ~wrap_grid(.x, .y)),
                          cFitO = map_dbl(fitO, "c"),
                          lrFitO = map_dbl(fitO, "lr"),
                          optimValO = map_dbl(fitO, "Value"),
                          cFitG = map_dbl(fitG, "c"),
                          lrFitG = map_dbl(fitG, "lr"),
                          optimValG = map_dbl(fitG, "Value"))
)
ts

system.time() results (seconds):

| Machine | Version | user | system | elapsed |
|---|---|---|---|---|
| M1 Max | tf (furrr) | 221.326 | 4.517 | 226.233 |
| M1 Max | ts (standard) | 221.140 | 4.288 | 225.330 |
| Mac Pro | tf (furrr) | 1125.102 | 76.859 | 1474.612 |
| Mac Pro | ts (standard) | 1132.716 | 76.260 | 1493.154 |
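When furrr shows no speedup like this, one thing worth ruling out is whether a multisession plan was actually active in the session that produced the timing; a quick diagnostic using the future package:

Code
future::plan()            # prints the currently active strategy
future::nbrOfWorkers()    # should be greater than 1 under multisession
parallelly::availableCores()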

Benchmarking ALM Model Fit Functions

Code
pacman::p_load(tidyverse,data.table,here,microbenchmark)


d <- readRDS(here('dPrune-01-19-23.rds'))

dtest <- d %>% filter(expMode %in% c("test-Nf","test-train-nf")) %>% group_by(id,lowBound) %>% 
  mutate(nBand=n(),band=bandInt,id=factor(id)) %>% group_by(id) %>% mutate(nd=n_distinct(lowBound))
# unique(dtest[dtest$nd==4,]$sbjCode) # 7 in wrong condition
dtest <- dtest %>% group_by(id,lowBound) %>% filter(nBand>=5 & nd==6)
# for any id that has at least 1 nBand >=5, remove all rows with that id. 
dtest <- dtest %>% group_by(id) %>% filter(!id %in% unique(dtest$id[dtest$nBand<5]))

dtestAgg <- dtest %>% group_by(id,condit,catOrder,feedbackType,vb,band,lowBound,highBound,input) %>% mutate(vxCapped=ifelse(vx>1600,1600,vx)) %>%
  summarise(vxMean=mean(vx),devMean=mean(dist),vxMed=median(vx),devMed=median(dist),
            vxMeanCap=mean(vxCapped),.groups = "keep")

# select first row for each id in d, then create histogram for nTrain
#  d  %>% group_by(id) %>% slice(1) %>% ggplot(aes(nTrain)) + geom_histogram() + facet_wrap(~condit)
  
ds <- d %>% filter(expMode %in% c("train","train-Nf","test-Nf","test-train-nf")) %>% 
filter(!id %in% unique(dtest$id[dtest$nBand<5])) %>% 
select(id,condit,catOrder,feedbackType,expMode,trial,gt.train,vb,band,bandInt,lowBound,highBound,input,vx,dist,vxb) 

dst <- ds %>% filter(expMode=="train",catOrder=="orig")

vTrainTrial <- dst %>% filter(condit=="Varied",gt.train<=84) %>% group_by(gt.train,vb) %>% summarise(sdVx=sd(vx),vx=mean(vx),sdDist=sd(dist),dist=mean(dist)) %>% 
  group_by(vb) %>% mutate(gt.trainBin=cut(gt.train,breaks=5,labels=c(1:5)))

binTrainTrial <- dst %>% filter(gt.train<=83) %>% group_by(gt.train,vb,condit) %>% summarise(sdVx=sd(vx),vx=mean(vx),sdDist=sd(dist),dist=mean(dist)) %>% 
  group_by(vb) %>% mutate(gt.trainBin=cut(gt.train,breaks=6,labels=c(1:6)))


tMax=84
bandVec <- rep(c(800,1000,1200),each=tMax/3)
bandVec <- bandVec[sample(1:length(bandVec),tMax,replace=FALSE)]

trainTrials <- dst %>% filter(gt.train<=tMax) %>% group_by(condit,gt.train,vb,bandInt,input) %>% summarise(vx=mean(vx)) 

input.activation<-function(x.target, c){
  return(exp(-1*c*(x.target-inputNodes)^2))
}

output.activation<-function(x.target, weights, c){
  return(weights%*%input.activation(x.target, c))
}

mean.prediction<-function(x.target, weights, c){
  probability<-output.activation(x.target, weights, c)/sum(output.activation(x.target, weights, c))
  return(outputNodes%*%probability) # weighted mean of the output nodes
}
# function to generate exam predictions
exam.prediction<-function(x.target, weights, c,trainVec){
  #trainVec = sort(unique(x.learning))
  nearestTrain = trainVec[which.min(abs(trainVec-x.target))]
  aresp = mean.prediction(nearestTrain, weights, c)
  xUnder = ifelse(min(trainVec) == nearestTrain, nearestTrain, trainVec[which(trainVec == nearestTrain) - 1])
  xOver = ifelse(max(trainVec) == nearestTrain, nearestTrain, trainVec[which(trainVec == nearestTrain) + 1])
  mUnder = mean.prediction(xUnder, weights, c)
  mOver = mean.prediction(xOver, weights, c)
  exam.output = round(aresp + ((mOver - mUnder) / (xOver - xUnder)) * (x.target - nearestTrain), 3)
  exam.output
}
  
update.weights<-function(x.new, y.new, weights, c, lr){
  y.feedback.activation<-exp(-1*c*(y.new-outputNodes)^2)
  x.feedback.activation<-output.activation(x.new, weights, c)
  return(weights+lr*(y.feedback.activation-x.feedback.activation)%*%t(input.activation(x.new, c)))
}

train.alm<-function(dat, c=0.05, lr=0.5, weights){
   alm.train<-rep(NA,nrow(dat))  
  for (i in 1:nrow(dat)){
    weights <- update.weights(dat$input[i], dat$vx[i], weights, c, lr)
    resp = mean.prediction(dat$input[i], weights, c)
    alm.train[i]=resp
    weights[weights<0]=0
  }
  alm.train
}

wrap_alm <- function(parms,dat, weights,lossFun){
    c=parms[1]; lr=parms[2]
   pred=train.alm(dat, c=c, lr=lr, weights=weights)
   #sqrt(mean((dat$vx -pred)^2))
   lossFun(dat$vx,pred)
}

wrap_optim <- function(dat,wm,lossFun){
  bounds_lower <- c(.0000001, .00001)
  bounds_upper <- c(5, 5)

 optim(c(.1, .2),
   fn = wrap_alm,
   dat = dat, weights = wm,lossFun=lossFun,
   method = "L-BFGS-B",
   lower = bounds_lower,
   upper = bounds_upper,
   control = list(maxit = 1e4, pgtol = 0, factr = 0)
 )
}

RMSE <- function(x,y){
  sqrt(mean((x-y)^2))
}

## First average observed and predicted data into blocks, then compute RMSE
RMSE.tb <- function(x,y,blocks=6){
  data.frame(x,y) %>% mutate(t=row_number(),fitBins=cut(t,breaks=blocks,labels=c(1:blocks))) %>%
    group_by(fitBins) %>% 
    summarise(predMean=mean(x),obsMean=mean(y)) %>% 
    summarise(RMSE(predMean,obsMean)) %>% as.numeric()
}

## Recode RMSE.tb using data.table functions rather than dplyr
RMSE.tb2 <- function(x,y,blocks=6){
  data.table(x=x,y=y,t=seq(1,length(x))) %>% 
    .[, `:=`(fitBins = cut(t, breaks = ..blocks, labels = c(1:..blocks)))] %>%
    .[, .(predMean = mean(x), obsMean = mean(y)), keyby = .(fitBins)] %>%
    .[, RMSE(predMean,obsMean)] %>% as.numeric()
}
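Both blocked-RMSE versions rebuild a table on every call inside the optimizer, so a dependency-free variant is also worth trying; a sketch that reuses RMSE() above and assumes equal-width trial bins like RMSE.tb/RMSE.tb2:

Code
RMSE.tb3 <- function(x,y,blocks=6){
  bins <- cut(seq_along(x), breaks=blocks, labels=FALSE)   # trial -> block index
  RMSE(tapply(x, bins, mean), tapply(y, bins, mean))       # RMSE of the block means
}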

dplyr RMSE vs data.table RMSE

Code
dpVsDt=microbenchmark(
  dplyrMethod={
  fitVaried <- tv %>% filter(condit=="Varied") %>% wrap_optim(.,wm,lossFun=RMSE.tb);
  fitConstant <- tv %>% filter(condit=="Constant") %>% wrap_optim(.,wm,lossFun=RMSE.tb)},
  dtMethod={
  fitVaried2 <- tv %>% filter(condit=="Varied") %>% wrap_optim(.,wm,lossFun=RMSE.tb2);
  fitConstant2 <- tv %>% filter(condit=="Constant") %>% wrap_optim(.,wm,lossFun=RMSE.tb2)},
  times=5
)
knitr::kable(summary(dpVsDt),format="markdown")
| expr | min | lq | mean | median | uq | max | neval | cld |
|---|---|---|---|---|---|---|---|---|
| dplyrMethod | 11.282291 | 12.996857 | 13.558623 | 13.970367 | 14.658515 | 14.885086 | 5 | a |
| dtMethod | 4.933999 | 5.306232 | 5.681235 | 5.515397 | 6.131193 | 6.519355 | 5 | b |

The data.table version seems to be consistently more than 2x faster.

Separate fits vs. nesting method vs. split method

Code
nestSplit<-microbenchmark(
separate={fitVaried <- tv %>% filter(condit=="Varied") %>% wrap_optim(.,wm,lossFun=RMSE.tb2);
fitConstant <- tv %>% filter(condit=="Constant") %>% wrap_optim(.,wm,lossFun=RMSE.tb2) },
nestGroups = tv %>% group_by(condit) %>% nest() %>% mutate(fit=map(data,~wrap_optim(.,wm,RMSE.tb2))),
splitGroups = tv %>% split(.$condit) %>% map(~wrap_optim(.,wm,RMSE.tb2)),
times=5
)
knitr::kable(summary(nestSplit),format="markdown") # 03/02/23 - Mac Pro
| expr | min | lq | mean | median | uq | max | neval | cld |
|---|---|---|---|---|---|---|---|---|
| separate | 6.394459 | 6.483235 | 6.709882 | 6.508353 | 7.059420 | 7.103941 | 5 | a |
| nestGroups | 6.253850 | 6.457956 | 6.789962 | 6.756942 | 7.044965 | 7.436094 | 5 | a |
| splitGroups | 6.117184 | 6.455866 | 6.605814 | 6.461572 | 6.640798 | 7.353648 | 5 | a |

Not much of a difference between the separate, nested, and split approaches.

Computing RMSE over raw trials vs. RMSE of blocked training performance

Code
trialBlock<-microbenchmark(
  trialFit = tv %>% split(.$condit) %>% map(~wrap_optim(.,wm,RMSE)),
  blockFit = tv %>% split(.$condit) %>% map(~wrap_optim(.,wm,RMSE.tb2)),
  times=5
)
knitr::kable(summary(trialBlock),format="markdown")
| expr | min | lq | mean | median | uq | max | neval | cld |
|---|---|---|---|---|---|---|---|---|
| trialFit | 2.665184 | 2.674186 | 2.806476 | 2.803491 | 2.843806 | 3.045712 | 5 | a |
| blockFit | 5.169167 | 5.264218 | 5.350769 | 5.285151 | 5.491828 | 5.543483 | 5 | b |

The models are fit about 2x faster when computing RMSE over raw trials. This shouldn't be surprising, since fitting to blocked data requires several additional computations per call (e.g. grouping trials and computing block means).
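To isolate how much of that difference is the loss function itself (rather than the optimizer taking a different path), the loss functions defined above can be benchmarked directly on arbitrary vectors; a sketch:

Code
xo <- rnorm(168, 800, 200); yp <- rnorm(168, 800, 200)   # arbitrary stand-ins for observed and predicted vx
microbenchmark(
  raw       = RMSE(xo, yp),
  blockedTb = RMSE.tb(xo, yp),
  blockedDT = RMSE.tb2(xo, yp),
  times = 100
)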