
Benchmarking model-based subsampling functions
Source: vignettes/Benchmark_model_based.Rmd
Benchmark_model_based.Rmd
## Error in get(paste0(generic, ".", class), envir = get_method_env()) :
## object 'type_sum.accel' not found
library(ggpubr)
library(tidyr)
library(dplyr)
# Theme for plots
Theme_special <- function() {
  # Shared ggplot2 theme tweaks used by every benchmark figure:
  # black axis/legend text, bold facet strips, and a bottom legend
  # with tightened margins.
  black_axis_text <- element_text(color = "black", size = 14)
  theme(
    legend.key.width = unit(1, "cm"),
    axis.text.x = black_axis_text,
    axis.text.y = black_axis_text,
    strip.text = element_text(colour = "black", size = 16, face = "bold"),
    panel.grid.minor.x = element_blank(),
    axis.title = element_text(color = "black", face = "bold", size = 14),
    legend.text = element_text(color = "black", size = 16),
    legend.position = "bottom",
    legend.margin = margin(0, 0, 0, 0),
    legend.box.margin = margin(-1, -2, -1, -2)
  )
}
# Load the pre-computed benchmark results (used later as
# Final_Linear_Regression, Final_Logistic_Regression and
# Final_Poisson_Regression) -- presumably produced offline by the HPC
# runs described below, so the vignette builds without re-benchmarking.
load("additionaldata/Results_Model_Based.Rdata")
Setup
The lm()
, biglm()
, glm()
and
bigglm()
functions are benchmarked against the model-based
subsampling functions. Benchmarking is conducted across all three
regression problems using a consistent setup throughout. For the big
data sizes \(N = \{10^4,5 \times 10^4,10^5,5
\times 10^5,10^6,5 \times 10^6\}\) and for five different
covariate sizes \(\{5,10,25,50,100\}\), the functions were
replicated \(50\) times. When using the
subsampling functions, the initial subsample sizes \(\{100,100,250,500,1000\}\) were selected
based on the covariate sizes, while the final subsample size was fixed
at \(2500\) to ensure successful
implementation.
# Benchmarked big-data sizes N, their display labels for the plots,
# and the covariate counts used across all regression problems.
N_size <- c(10000, 50000, 100000, 500000, 1000000, 5000000)
N_size_labels <- c("10^4", "5 x 10^4", "10^5", "5 x 10^5", "10^6", "5 x 10^6")
CS_size <- c(5, 10, 25, 50, 100)
This implementation was conducted on a high-performance computing system; however, the code for linear regression-related functions is shown below. Furthermore, it can be extended to logistic and Poisson regression by incorporating the relevant subsampling functions and model configurations.
# load the packages
library(NeEDS4BigData)
library(here)
library(biglm)
# indexes for N and covariate sizes
# Read this HPC array job's grid position from environment variables
# (set by the scheduler): indexA selects the data size, indexB the
# covariate size.
indexA <- as.numeric(Sys.getenv("indexA"))
indexB <- as.numeric(Sys.getenv("indexB"))
# set N and covariate size, and assign replicates
N_size<-c(1,5,10,50,100,500)*10000
Covariate_size<-c(5,10,25,50,100)
Replicates<-50
# set the initial and final subsample sizes,
# with the distribution parameters to generate data
# r1: initial subsample size for each covariate-size setting (100..1000);
# r2: final subsample size, fixed at 2500.
r1<-c(1,1,2.5,5,10)*100; r2<-2500;
Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5);
Family<-"linear"
# assign the indexes
N_idx<-indexA
Covariate_idx<-indexB
No_Of_Var<-Covariate_size[Covariate_idx]
N<-N_size[N_idx]
# generate the big data based on N and covariate size
# Beta: intercept -1 followed by a coefficient of 0.5 per covariate.
Beta<-c(-1,rep(0.5,Covariate_size[Covariate_idx]))
# GenGLMdata() comes from NeEDS4BigData; Complete_Data presumably holds Y
# in column 1 followed by the covariate columns (X0, X1, ...) -- confirm
# against the package documentation.
Generated_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
Full_Data<-Generated_Data$Complete_Data
cat("N size :",N_size[N_idx]," and Covariate size :",Covariate_size[Covariate_idx],"\n")
# formula for the linear regression model
# Builds "Y ~ X0 + X1 + ... + Xp", where p is the number of columns left
# after dropping columns 1 and 2 (Y and X0) from Full_Data.
lm_formula <- as.formula(
  paste("Y ~", paste0("X", 0:ncol(Full_Data[, -c(1, 2)]), collapse = " + "))
)
# benchmarking the stats::lm() function
# NOTE(review): the result is stored in a variable named `lm`, which masks
# stats::lm() in the global environment once this chunk has run; the name is
# kept because the Plot_Data column names below depend on it, and the inner
# call is namespace-qualified so the chunk also works when re-run.
# vapply() (rather than sapply()) guarantees a plain numeric vector of
# length Replicates. Each replicate fits Y ~ . - 1 on the full data (all
# columns, no added intercept), unlike biglm below which uses lm_formula.
lm <- vapply(seq_len(Replicates), function(i) {
  start_T <- Sys.time()
  suppressMessages(stats::lm(Y ~ . - 1, data = data.frame(Full_Data)))
  # wall-clock time of a single fit, in seconds
  as.numeric(difftime(Sys.time(), start_T, units = "secs"))
}, numeric(1))
cat("Benchmarked lm() function.\n")
# benchmarking the biglm::biglm() function
# NOTE(review): the result variable `biglm` masks biglm::biglm() after this
# chunk runs (name kept for the Plot_Data columns below); the explicit
# biglm:: prefix keeps the call valid even on a re-run.
# vapply() (rather than sapply()) guarantees a plain numeric vector of
# length Replicates.
biglm <- vapply(seq_len(Replicates), function(i) {
  start_T <- Sys.time()
  suppressMessages(biglm::biglm(lm_formula, data = data.frame(Full_Data)))
  # wall-clock time of a single fit, in seconds
  as.numeric(difftime(Sys.time(), start_T, units = "secs"))
}, numeric(1))
cat("Benchmarked biglm() function.\n")
# benchmarking the NeEDS4BigData::AoptimalGauLMSub() function
# NOTE(review): the result variable reuses the function name (the Plot_Data
# columns below depend on it); the NeEDS4BigData:: prefix keeps the call
# working even if this chunk is re-run after the assignment.
# vapply() (rather than sapply()) guarantees a plain numeric vector of
# length Replicates.
AoptimalGauLMSub <- vapply(seq_len(Replicates), function(i) {
  start_T <- Sys.time()
  suppressMessages(NeEDS4BigData::AoptimalGauLMSub(
    r1[Covariate_idx], r2,
    Y = as.matrix(Full_Data[, 1]),
    X = as.matrix(Full_Data[, -1]),
    N = nrow(Full_Data)
  ))
  # wall-clock time of a single subsampling run, in seconds
  as.numeric(difftime(Sys.time(), start_T, units = "secs"))
}, numeric(1))
cat("Benchmarked AoptimalGauLMSub() function.\n")
# benchmarking the NeEDS4BigData::LeverageSampling() function
# NOTE(review): the result variable reuses the function name (the Plot_Data
# columns below depend on it); the NeEDS4BigData:: prefix keeps the call
# working even if this chunk is re-run after the assignment.
# vapply() (rather than sapply()) guarantees a plain numeric vector of
# length Replicates.
LeverageSampling <- vapply(seq_len(Replicates), function(i) {
  start_T <- Sys.time()
  suppressMessages(NeEDS4BigData::LeverageSampling(
    r2,
    Y = as.matrix(Full_Data[, 1]),
    X = as.matrix(Full_Data[, -1]),
    N = nrow(Full_Data),
    S_alpha = 0.95,
    family = Family
  ))
  # wall-clock time of a single subsampling run, in seconds
  as.numeric(difftime(Sys.time(), start_T, units = "secs"))
}, numeric(1))
cat("Benchmarked LeverageSampling() function.\n")
# benchmarking the NeEDS4BigData::ALoptimalGLMSub() function
# NOTE(review): the result variable reuses the function name (the Plot_Data
# columns below depend on it); the NeEDS4BigData:: prefix keeps the call
# working even if this chunk is re-run after the assignment.
# vapply() (rather than sapply()) guarantees a plain numeric vector of
# length Replicates.
ALoptimalGLMSub <- vapply(seq_len(Replicates), function(i) {
  start_T <- Sys.time()
  suppressMessages(NeEDS4BigData::ALoptimalGLMSub(
    r1[Covariate_idx], r2,
    Y = as.matrix(Full_Data[, 1]),
    X = as.matrix(Full_Data[, -1]),
    N = nrow(Full_Data),
    family = Family
  ))
  # wall-clock time of a single subsampling run, in seconds
  as.numeric(difftime(Sys.time(), start_T, units = "secs"))
}, numeric(1))
cat("Benchmarked ALoptimalGLMSub() function.\n")
# benchmarking the NeEDS4BigData::AoptimalMCGLMSub() function
# NOTE(review): the result variable reuses the function name (the Plot_Data
# columns below depend on it); the NeEDS4BigData:: prefix keeps the call
# working even if this chunk is re-run after the assignment.
# vapply() (rather than sapply()) guarantees a plain numeric vector of
# length Replicates.
AoptimalMCGLMSub <- vapply(seq_len(Replicates), function(i) {
  start_T <- Sys.time()
  suppressMessages(NeEDS4BigData::AoptimalMCGLMSub(
    r1[Covariate_idx], r2,
    Y = as.matrix(Full_Data[, 1]),
    X = as.matrix(Full_Data[, -1]),
    N = nrow(Full_Data),
    family = Family
  ))
  # wall-clock time of a single subsampling run, in seconds
  as.numeric(difftime(Sys.time(), start_T, units = "secs"))
}, numeric(1))
cat("Benchmarked AoptimalMCGLMSub() function.\n")
# Assemble one row per replicate: the fixed (N, covariate) setting plus the
# six timing vectors recorded above. The scalar first two arguments are
# recycled to Replicates rows by cbind.data.frame().
Plot_Data<-cbind.data.frame("N size"=N_size[N_idx],
"Covariate size"=Covariate_size[Covariate_idx],"lm()"=lm,
"biglm()"=biglm,"AoptimalGauLMSub()"=AoptimalGauLMSub,
"LeverageSampling()"=LeverageSampling,
"ALoptimalGLMSub()"=ALoptimalGLMSub,
"AoptimalMCGLMSub()"=AoptimalMCGLMSub)
# Save this grid cell's results under Results/, named by the (N, covariate)
# indexes -- presumably combined offline into the Final_*_Regression
# objects loaded at the top of the vignette.
save(Plot_Data,
file=here("Results",paste0("Output_NS_",N_idx,"_CS_",Covariate_idx,".RData")))
Linear Regression
For linear regression, the following functions from the NeEDS4BigData
package: (1) AoptimalGauLMSub()
, (2)
LeverageSampling()
, (3) ALoptimalGLMSub()
, and
(4) AoptimalMCGLMSub()
are compared against
lm()
and biglm()
.
# Legend ordering for the linear-regression plot and the colour assigned to
# each method (greys for the full-data fitters, warm/green hues for the
# subsampling functions).
Methods_FCT <- c("lm()", "biglm()", "LeverageSampling()", "AoptimalMCGLMSub()",
                 "AoptimalGauLMSub()", "ALoptimalGLMSub()")
Method_Colors <- c("grey", "black", "#F76D5E", "#A50021", "#BBFFBB", "#50FF50")
# Summarise the replicate timings and plot mean time per method with
# 5%/95% quantile error bars, faceted by data size N.
Final_Linear_Regression %>%
# reshape to one row per (N, covariate size, method, replicate) timing
pivot_longer(cols = `lm()`:`AoptimalMCGLMSub()`,
names_to = "Methods",values_to = "Time") %>%
group_by(`N size`,`Covariate size`,Methods) %>%
# mean plus the 5th/95th percentiles of the replicate times
summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95),
.groups = "drop") %>%
# fix the method ordering (legend) and pretty N labels (facet strips)
mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT),
`N size`=factor(`N size`,levels = N_size,
labels=paste0("N = ",N_size_labels))) %>%
# dodge points/lines/error bars so methods do not overplot
ggplot(.,aes(x=factor(`Covariate size`),y=Mean,color=Methods,group=Methods))+
geom_point(position = position_dodge(width = 0.5))+
geom_line(position = position_dodge(width = 0.5))+
geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+
facet_wrap(~factor(`N size`),scales = "free",nrow=2)+
xlab("Number of Covariates")+ylab("Time in Seconds")+
scale_color_manual(values = Method_Colors)+
theme_bw()+ggtitle("Linear Regression")+Theme_special()

Average time for functions, with 5% and 95% percentile intervals under linear regression.
In general, our subsampling functions perform faster as the size of
the big data and the number of covariates increase, except for
LeverageSampling()
. Even when a solution for linear
regression is available in an analytical form, the subsampling functions
perform as well as biglm()
.
Logistic Regression
For logistic regression, the following functions: (1)
LCCSampling()
, (2) LeverageSampling()
, (3)
AoptimalMCGLMSub()
, and (4) ALoptimalGLMSub()
are compared against glm()
and bigglm()
.
# Legend ordering for the logistic-regression plot and the colour assigned
# to each method (greys for the full-data fitters, warm/green hues for the
# subsampling functions).
Methods_FCT <- c("glm()", "bigglm()", "LCCSampling()", "LeverageSampling()",
                 "AoptimalMCGLMSub()", "ALoptimalGLMSub()")
Method_Colors <- c("grey", "black", "#F76D5E", "#A50021", "#BBFFBB", "#50FF50")
# Summarise the replicate timings and plot mean time per method with
# 5%/95% quantile error bars, faceted by data size N.
Final_Logistic_Regression %>%
# reshape to one row per (N, covariate size, method, replicate) timing
pivot_longer(cols = `glm()`:`AoptimalMCGLMSub()`,
names_to = "Methods",values_to = "Time") %>%
group_by(`N size`,`Covariate size`,Methods) %>%
# mean plus the 5th/95th percentiles of the replicate times
summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95),
.groups = "drop") %>%
# fix the method ordering (legend) and pretty N labels (facet strips)
mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT),
`N size`=factor(`N size`,levels = N_size,
labels=paste0("N = ",N_size_labels))) %>%
# dodge points/lines/error bars so methods do not overplot
ggplot(.,aes(x=factor(`Covariate size`),y=Mean,color=Methods,group=Methods))+
geom_point(position = position_dodge(width = 0.5))+
geom_line(position = position_dodge(width = 0.5))+
geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+
facet_wrap(~factor(`N size`),scales = "free",nrow=2)+
xlab("Number of Covariates")+ylab("Time in Seconds")+
scale_color_manual(values = Method_Colors)+
theme_bw()+Theme_special()+ggtitle("Logistic Regression")

Average time for functions, with 5% and 95% percentile intervals under logistic regression.
There seems to be little difference between using glm()
and bigglm()
, while the subsampling functions perform
faster, with the performance gap increasing as the size of the big data
and the number of covariates grow.
Poisson Regression
For Poisson regression, the following functions: (1)
LeverageSampling()
, (2) AoptimalMCGLMSub()
,
and (3) ALoptimalGLMSub()
are compared against
glm()
and bigglm()
.
# Legend ordering for the Poisson-regression plot and the colour assigned
# to each method (greys for the full-data fitters, warm/green hues for the
# subsampling functions).
Methods_FCT <- c("glm()", "bigglm()", "LeverageSampling()",
                 "AoptimalMCGLMSub()", "ALoptimalGLMSub()")
Method_Colors <- c("grey", "black", "#A50021", "#BBFFBB", "#50FF50")
# Summarise the replicate timings and plot mean time per method with
# 5%/95% quantile error bars, faceted by data size N.
Final_Poisson_Regression %>%
# reshape to one row per (N, covariate size, method, replicate) timing
pivot_longer(cols = `glm()`:`AoptimalMCGLMSub()`,
names_to = "Methods",values_to = "Time") %>%
group_by(`N size`,`Covariate size`,Methods) %>%
# mean plus the 5th/95th percentiles of the replicate times
summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95),
.groups = "drop") %>%
# fix the method ordering (legend) and pretty N labels (facet strips)
mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT),
`N size`=factor(`N size`,levels = N_size,
labels=paste0("N = ",N_size_labels))) %>%
# dodge points/lines/error bars so methods do not overplot
ggplot(.,aes(x=factor(`Covariate size`),y=Mean,color=Methods,group=Methods))+
geom_point(position = position_dodge(width = 0.5))+
geom_line(position = position_dodge(width = 0.5))+
geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+
facet_wrap(~factor(`N size`),scales = "free",nrow=2)+
xlab("Number of Covariates")+ylab("Time in Seconds")+
scale_color_manual(values = Method_Colors)+
theme_bw()+Theme_special()+ggtitle("Poisson Regression")

Average time for functions, with 5% and 95% percentile intervals under Poisson regression.
Similar to logistic regression, the subsampling functions perform
faster than the glm()
and bigglm()
functions.
In summary, the subsampling functions available in this R package perform best under high-dimensional data.