# Leverage diagnostics and prediction intervals for the Movies data.

# Base URL from which the CSV data sets are downloaded.
file_path <- "http://www.statslab.cam.ac.uk/~rds37/teaching/statistical_modelling/"
Movies <- read.csv(paste0(file_path, "Movies.csv"))

# Linear model for log total gross. The variables are looked up in Movies
# explicitly (data = Movies) instead of relying on them being visible on the
# search path.
MoviesLM <- lm(
  log(Total.Gross) ~ log(Opening) + Screens + RT + log(Budget),
  data = Movies
)

## Leverage

lev <- hatvalues(MoviesLM)
# NOTE(review): the threshold uses 3 * 4 / n, while the formula above has four
# terms plus an intercept -- confirm whether 4 or 5 is intended here.
high_lev <- which(lev > 3 * 4 / nrow(Movies))  # gives the high leverage obs
lev[high_lev]  # actually shows their leverage

# Should see that obs 99 has much higher leverage than the rest
Movies[99, ]
# Looking at the covariate values, we see that this is largely because
# the budget was so low for the film

# Now a linear model without this obs (subset = -99 drops row 99).
# NOTE(review): this formula also leaves out Screens -- confirm that is
# intentional.
MoviesLM_sub <- lm(
  log(Total.Gross) ~ log(Opening) + RT + log(Budget),
  data = Movies,
  subset = -99
)

## Prediction intervals

Movies2010 <- read.csv(paste0(file_path, "Movies2010.csv"))
# Matrix with columns fit, lwr, upr on the log scale.
pred_intervals <- predict(
  MoviesLM_sub,
  newdata = Movies2010,
  interval = "prediction"
)
# The response was modelled on the log scale, so exponentiate to get back to
# the scale of Total.Gross.
pred_intervals_trans <- exp(pred_intervals)
target <- Movies2010$Total.Gross

# (pred_intervals_trans[, "lwr"] < target) gives a logical vector.
# The & performs a componentwise AND operation.
# Applying the mean function first coerces the logical vector into a vector
# where TRUE -> 1 and FALSE -> 0, so the result is the proportion of films
# whose true gross lies inside its prediction interval.
mean(
  (pred_intervals_trans[, "lwr"] < target) &
    (pred_intervals_trans[, "upr"] > target)
)

# We can also see how well our model predicts film earnings by
# displaying the true earnings alongside the predicted earnings
cbind(target, pred_intervals_trans)