Prediction Intervals
- Confidence intervals attempt to capture the value of a parameter.
  Example: - Prediction intervals attempt to capture the value of the random variable, using the estimated parameter value.
  Example:
### Intervals.t(..) and plot_Intervals(..) functions
################################ t interval ############
#' Make confidence and prediction intervals for mean of normal rv
#'
#' @ mu mean of X
#' @ sigma variance of X
#' @ n sample size
#' @ alpha desired confidence level (1-alpha)
#' @ num_estimates number of confidence intervals to construct
#' @ data boolean: return data or return bootstrap confidence value
#'
#' returns either data.frame: $x_bar, $CI.min, $CI.max, $PI.min, $PI.max $x, $in_PI
#' or number: bootstrap confidence level of Prediction Interval
#'
Intervals.t <- function(mu=0, sigma=1, n=100, alpha=.05, num_estimates=10000, data=FALSE) {
samples <- matrix( # generate matrix of num_samples x num_estimates
rnorm(n*num_estimates, mu, sigma),
nrow=num_estimates
)
library(matrixStats) # needed for rowSds (sample deviations of rows)
sample_means <- rowMeans(samples)
sample_sds <- rowSds(samples)
rm(samples) # free up memory
e.CI <- qt(1-alpha/2,n-1) * sample_sds * sqrt(1/n) # CI margin of error
e.PI <- qt(1-alpha/2,n-1) * sample_sds * sqrt(1+1/n) # PI margin of error
Intervals <- data.frame(
CI.min = sample_means - e.CI, # confindence interval min
CI.max = sample_means + e.CI, # max
PI.min = sample_means - e.PI, # prediction interval min
PI.max = sample_means + e.PI # max
)
rm(e.CI, e.PI) # free up memory
x <- rnorm(num_estimates, mu, sigma) # generate next value
# T/F vector telling if PI contains actual next value
in_PI <- ((x > Intervals$PI.min) & (x < Intervals$PI.max))
if (data==TRUE) { return(cbind(x_bar=sample_means, # sample mean
Intervals, # CI.min, CI.max, PI.min, PI.max
x, # sample next value
in_PI # is next value in PI?
)) }
return (sum(in_PI)/num_estimates)
}
################################
################################ plot_Intervals ####################
#' plots prediction intervals spread vertically across graph
#'
#' @ I data.frame: $x_bar, $CI.min, $CI.max, $PI.min, $PI.max, $x, $in_PI
#' @ y.max prediction intervals plotted at heights from 0 to y.max
#'
plot_Intervals <- function(I, y.max) {
n <- nrow(I)
color <- rep('red', n) # bad PI will be red and thick
width <- rep(2, n)
color[I$in_PI] <- 'black' # good PI will be black and thin
width[I$in_PI] <- 1
height <- (1:n) * y.max / n # spread PI vertically across plot
for (i in 1:n) {
lines(c(I$PI.min[i], I$PI.max[i]),
c(height[i], height[i]),
col=color[i], lwd=width[i], lty='dotted')
lines(c(I$CI.min[i], I$CI.max[i]),
c(height[i], height[i]),
col=color[i], lwd=width[i], lty='solid')
points(I$x_bar[i], height[i],
col=color[i], pch=16)
points(I$x[i], height[i],
col=color[i])
}
}
Example:
The example below generates random normal samples to make
- a filled dot marks the observed sample mean
- a solid line marks the
confidence interval for the mean - a dotted line extending outwards marks the
prediction interval for the next random value - an empty dot marks the next random normal sample value computed
If the next random sample is not included in the prediction interval, then the everything is colored red (in this case, the prediction interval failed to predict the next random value), otherwise everything is colored black. The true mean is marked with a blue vertical line and the
and quantiles for sample means are marked with dotted blue vertical lines.
Bootstrap Confidence of Prediction Intervals
Just looking at the graph above, it is difficult to tell if our prediction intervals actually have the correct confidence value. For the
Tolerance Intervals
install.packages('tolerance')
TI <- function(mu=0, sigma=1, n=100, alpha=.05, P=.99, num_estimates=10000, data=FALSE) {
samples <- matrix( # generate matrix of num_samples x num_estimates
rnorm(n*num_estimates, mu, sigma),
nrow=num_estimates
)
### the following code gives tolerance intervals which are close but not correct
library(matrixStats) # needed for rowSds (sample deviations of rows)
sample_means <- rowMeans(samples)
sample_sds <- rowSds(samples)
rm(samples) # free up memory
e.mu <- sample_sds / sqrt(n) * qt(1-alpha/2,n-1) # std error of mu approx
e.x <- sample_sds * qt(min(.87+1.15*P,.91+1.1*P,1.002+1.003*P)/2,n-1) # std error for P% of more data
# this should be something close to (1+P)/2
TI <- data.frame(
min = sample_means - e.mu - e.x,
max = sample_means + e.mu + e.x
)
rm(e.mu, e.x) # free up memory
###
### the following code relies on the 'tolerance' libarary, which datacamp isn't loading correctly
#TI <- as.data.frame(do.call(rbind,
# apply(samples, 1, function (x) normtol.int(x, alpha, P, side = 2))
# ))
#TI <- TI[c('2-sided.lower','2-sided.upper')]
#colnames(TI) <- c('min','max')
###
# T/F vector telling if PI contains correct proportion of further data
# generate 1000 comparison data for checking each PI
samples <- matrix( # generate matrix of 1000 x num_estimates
rnorm(1000*num_estimates, mu, sigma),
nrow=num_estimates
)
in_TI <- rowSums((samples > TI$min) & (samples < TI$max)) / 1000
rm(samples)
if (data==TRUE) { return(cbind(x_bar=sample_means, TI, in_TI)) }
return (sum(in_TI >= P) / num_estimates) # return bootstrap confidence
}
P <- seq(from=.5, to=.99, by=.01)
confidence <- sapply(P, function(p) TI(alpha=.05,P=p))
plot(P,confidence)