Bootstrapping multiple variables in R-CodePudding

I would like to perform a bootstrap linear regression, because of concerns about normally distributed error terms. I have a data set that is big enough to ignore this fact, but I just want to double check if a linear regression model relying on analytically computed standard errors yields the same results as results obtained from a bootstrapped linear regression.

So far, I have used the follwing code:

Rel01 <- subset(
  Relevante_Variablen2,
  select = c(intr, sale_py_at_py, R_at_py,
             inflr, dt, re, txt)
)

x = as.data.frame(sapply(Rel01, as.numeric))
boxplot(x)

library(boot)

i<-nrow(x) #count number of rows for resampling 
g<-ncol(x) #count number of columns to step through with bootstrapping
boot.mean<-function(x,i){boot.mean<-mean(x[i])} #bootstrapping function to get the mean

boot_tabl <- apply(x,2,function(y){ 
  b<-boot(y,boot.mean,R=50000); 
  c(mean(b$t),boot.ci(b,type="perc", conf=0.95)$percent[4:5])
})

View(boot_tabl)

Everything seems to work (output is in the table shown below), but I do not know how to interpret the output from my approach.

^	sale_py_at_py	intr	inflr	dt	re	txt
1	0.0984438	0.01223	0.04243	200	400	1200
2	0.0974530	0.01191	0.04122	190	300	1000
3	0.0993230	0.01291	0.04256	210	405	1210

My data looks something like this:

Name	Segment	Sale	Year	Asset	Another header
A	3401	10000	2000	200000	x
A	3401	20000	2001	250000	x
B	2201	15000	2004	280000	x
B	2201	23000	2009	320000	x
B	2201	28000	2010	390000	x
C	2201	30000	2000	210000	x
C	2201	18000	2004	200000	x
D	1	28000	2000	400000	x
D	1	38000	2001	521000	x

Could anyone provide some directions on how I could bootstrap my linear regression analysis? And also tell me what the output of my regression is telling me exactly?

Edit:

fm0 <- lm(marketingspending ~ intr   inflr   sale_py_at_py  R_at_py  
            dt   re   txt , data = Relevante_V03)

boot_lm <- function(Relevante_V03, fm0,
                    type = c('ordinary', 'param'),
                    B = 1000L, seed = 1) {
  set.seed(59385)
  betas_original_model <- coef(Relevante_V03)
  len_coef <- length(betas_original_model)
  mat <- matrix(rep(0L, B * len_coef), ncol = len_coef)
  if (type %in% 'ordinary') {
    n_rows <- length(residuals(Relevante_V03))
    for (i in seq_len(B)) {
      boot_dat <- Rel01[sample(seq_len(n_rows), replace = TRUE), ]
      mat[i, ] <- coef(lm(terms(Relevante_V03), data = boot_dat))
    }
  }
  if (type %in% 'param') {
    X <- model.matrix(delete.response(terms(Relevante_V03)),
                      data = original_data)[, -1L]
    for (i in seq_len(B)) {
      mat[i, ] <- coef(lm(unlist(simulate(Relevante_V03)) ~ X,
                          data = original_data))
    }
  }
  confints <- matrix(rep(0L, 2L * len_coef), ncol = 2L)
  pvals <- numeric(len_coef)
  for (i in seq_len(len_coef)) {
    pvals[i] <- mean(abs(mat[, i] - mean(mat[, i])) > abs(betas_original_model[i]))
    confints[i, ] <- quantile(mat[, i], c(.025, 0.975))
  }
  names(pvals) <- names(betas_original_model)
  out <- data.frame(estimate = betas_original_model,
                    'lwr' = confints[, 1], 'upr' = confints[, 2],
                    p_value = pvals)
  return(out)
}

Thanks to the comment I tried the approach. Could some one tell me if I am doing everything right in this approach?

CodePudding user response：

As requested by the OP, this should be the appropriate way to proceed with in his specific case.

It is not necessary (and I would even advise against it) to change the arguments within the function itself. Instead, specify appropriate character stings and integers as arguments in a call to the function. This should be done as follows.

First we specify our linear model.

# linear model
fm0 <- lm(marketingspending ~ intr   inflr   sale_py_at_py  R_at_py  
              dt   re   txt , data = Relevante_V03)

Then, we run the function as is. For more information on the function's arguments, refer to my recent answer.

boot_lm <- function(original_data, original_model,
                    type = c('ordinary', 'param'),
                    B = 1000L, seed = 1) {
  set.seed(seed)
  betas_original_model <- coef(original_model)
  len_coef <- length(betas_original_model)
  mat <- matrix(rep(0L, B * len_coef), ncol = len_coef)
  if (type %in% 'ordinary') {
    n_rows <- length(residuals(original_model))
    for (i in seq_len(B)) {
      boot_dat <- original_data[sample(seq_len(n_rows), replace = TRUE), ]
      mat[i, ] <- coef(lm(terms(original_model), data = boot_dat))
    }
  }
  if (type %in% 'param') {
    X <- model.matrix(delete.response(terms(original_model)),
                      data = original_data)[, -1L]
    for (i in seq_len(B)) {
      mat[i, ] <- coef(lm(unlist(simulate(original_model)) ~ X,
                          data = original_data))
    }
  }
  confints <- matrix(rep(0L, 2L * len_coef), ncol = 2L)
  pvals <- numeric(len_coef)
  for (i in seq_len(len_coef)) {
    pvals[i] <- mean(abs(mat[, i] - mean(mat[, i])) > abs(betas_original_model[i]))
    confints[i, ] <- quantile(mat[, i], c(.025, 0.975))
  }
  names(pvals) <- names(betas_original_model)
  out <- data.frame(estimate = betas_original_model,
                    'lwr' = confints[, 1], 'upr' = confints[, 2],
                    p_value = pvals)
  return(out)
}

Finally, we specify character strings and integers as arguments in a call to boot_lm() to have it tailor-made for your specific case.

# non-parametric bootstrap
boot_lm(original_data = Relevante_V03, original_model = fm0,
        type = 'ordinary', B = 1e4, seed = 59385)

# parametric bootstrap
boot_lm(original_data = Relevante_V03, original_model = fm0,
        type = 'param', B = 1e4, seed = 59385)