Basic Tools For Forecasting

Seasonal plots

miles <- read.csv("Revenue_miles_2.csv")
str(miles)
## 'data.frame':    192 obs. of  3 variables:
##  $ DATE            : chr  "2000-01-01" "2000-02-01" "2000-03-01" "2000-04-01" ...
##  $ Miles.Thousands.: int  49843099 49931931 61478163 58981617 61223861 65601574 67898320 67028338 56441629 58834210 ...
##  $ Miles..Billions.: num  4.98 4.99 6.15 5.9 6.12 ...
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
miles.ts <- ts(miles$Miles..Billions.,start = c(2000,1),end = c(2004,12),frequency = 12)
autoplot(miles.ts,xlab = "Month", ylab = "Miles (Billions)", main = "Miles vs Month")

seasonplot(miles.ts,xlab = "Month", ylab = "Miles (Billions)", main = "Seasonal Plot of Miles vs Month",year.labels = T,col = 1:4)
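
The forecast package also provides ggseasonplot(), a ggplot2-based version of the same display; a minimal sketch, assuming miles.ts from above:

# ggplot2 seasonal plot with one line per year
ggseasonplot(miles.ts, year.labels = TRUE)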

Transformations

Summary of Dulles Data

Dulles <- read.csv("Dulles_2.csv", header = T)
summary(Dulles)
##       Year      Passengers..000s.
##  Min.   :1963   Min.   :  641    
##  1st Qu.:1976   1st Qu.: 2083    
##  Median :1989   Median : 8947    
##  Mean   :1989   Mean   : 8456    
##  3rd Qu.:2002   3rd Qu.:14393    
##  Max.   :2015   Max.   :22129
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
Dulles.plot <- ts(data = Dulles$Passengers..000s., start = as.yearmon("1963-12"), frequency = 1)
plot(Dulles.plot, xlab = "Year", ylab = "Passengers (000s)", main = "Dulles Airport Passengers")

autoplot(Dulles.plot, xlab = "Year", ylab = "Passengers (000s)", main = "Dulles Airport Passengers")

-Differences

A simple way to view a single (or “first order”) difference is to see it as \[ Y_t - Y_{t-1} \] and, more generally, a lag-\(k\) difference as \(Y_t - Y_{t-k}\).

Time Series Plot for the First Differences of the Dulles Passenger Series.

-“diff()” function

-Arguments

x:
a numeric vector or matrix containing the values to be differenced.

lag:
an integer indicating which lag to use.

differences:
an integer indicating the order of the difference. (We won’t use this argument much.)

x <-  c(3,2,4,6,7,3,1,8)
x
## [1] 3 2 4 6 7 3 1 8
diff(x, lag = 1)
## [1] -1  2  2  1 -4 -2  7
diff(x, lag = 2)
## [1]  1  4  3 -3 -6  5
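
To connect diff() back to the formula \(Y_t - Y_{t-k}\), the lag-2 difference can also be computed by hand with vector indexing:

# Y_t - Y_{t-2}: drop the first two values, then subtract the series with the last two values dropped
x[-(1:2)] - x[-((length(x) - 1):length(x))]
## [1]  1  4  3 -3 -6  5
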
Diff_1 <- diff(Dulles.plot, differences = 1)
plot(Diff_1, xlab='Time', ylab = "Difference", main = "First order difference of passengers at Dulles Airport")

autoplot(Diff_1, xlab='Time', ylab = "Difference", main = "First order difference of passengers at Dulles Airport")

-Growth Rates

Time Series Plot of the Growth Rate for the Dulles Passenger Series. The growth rate expresses each first difference as a percentage of the series level.

Growth_1 <- 100* Diff_1 / Dulles.plot
plot(Growth_1, xlab='Time',ylab = "Growth", main= "Growth of # of Passengers at Dulles Airport")

autoplot(Growth_1, xlab='Time',ylab = "Growth", main= "Growth of # of Passengers at Dulles Airport")
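
Because arithmetic on ts objects aligns the series on their overlapping time points, the ratio above scales each difference \(Y_t - Y_{t-1}\) by the current value \(Y_t\). The more common convention scales by the previous value \(Y_{t-1}\); a minimal sketch of that version (Growth_prev is just an illustrative name), using stats::lag() to shift the time base:

# Growth rate relative to the previous period: 100 * (Y_t - Y_{t-1}) / Y_{t-1}
# stats::lag(x, -1) shifts the series so its value at time t is Y_{t-1}
Growth_prev <- 100 * Diff_1 / stats::lag(Dulles.plot, -1)
autoplot(Growth_prev, xlab = "Time", ylab = "Growth", main = "Growth of # of Passengers at Dulles Airport")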

-Logarithm

Log Transform

Log_Dulles <- log(Dulles.plot)
plot(Log_Dulles, xlab='Time', ylab = "Log # of passengers",main = "Dulles Airport Log # of Passengers")

autoplot(Log_Dulles, xlab='Time', ylab = "Log # of passengers",main = "Dulles Airport Log # of Passengers")

log_diff_1 <- diff(Log_Dulles, differences = 1)
plot(log_diff_1, xlab='Time', ylab = "Difference", main = "First order difference of log passengers at Dulles Airport")

autoplot(log_diff_1, xlab='Time', ylab = "Difference", main = "First order difference of log passengers at Dulles Airport")

Does the log-diff plot look like the growth plot?

layout(t(1:2))
plot(log_diff_1, xlab='Time', ylab = "Difference", main = "Log_Diff")
plot(Growth_1, xlab='Time',ylab = "Growth", main= "Growth")

layout(1)
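
They should look very similar: \(\log(Y_t) - \log(Y_{t-1}) = \log(1 + g_t) \approx g_t\) when the proportional change \(g_t\) is small, so the log differences track the growth rates divided by 100. A minimal numeric check, assuming the objects above are still in the workspace (the column names are just labels):

# cbind() on ts objects aligns the series on their time index
head(cbind(log_diff = log_diff_1, growth_prop = Growth_1 / 100))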

How to Evaluate Forecasting Accuracy

Repeatedly advancing the forecast origin one period at a time and comparing each forecast with the value actually observed is known as rolling origin forecasting. The one-step-ahead forecast error at time \(t+i\) may be denoted by

\[ e_{t+i} = Y_{t+i} - F_{t+i} \]

where \(Y_{t+i}\) is the observed value and \(F_{t+i}\) is its forecast.

Example:

DC_weather <- read.csv("dc_weather.csv", header = T)
colnames(DC_weather) <- c("Date", "Forecast", "Actual")
DC_weather$Date <- as.Date(DC_weather$Date, "%m/%d/%y")
str(DC_weather)
## 'data.frame':    35 obs. of  3 variables:
##  $ Date    : Date, format: "2021-06-01" "2021-06-02" ...
##  $ Forecast: int  82 82 83 86 85 85 74 80 81 94 ...
##  $ Actual  : int  NA 82 81 82 85 87 87 74 82 82 ...
#summary(DC_weather)
# Plot the forecast (blue) and actual (red) temperature series
plot(x = DC_weather$Date, y = DC_weather$Forecast, xlab = 'Time', ylab = "Temperature", type = "l", col = "blue")
lines(x = DC_weather$Date, y = DC_weather$Actual, col = "red")

# Calculate the one-step-ahead forecast errors:
# pair each forecast with the actual observed on the following day
Actual1 <- DC_weather$Actual[-1]                    # drop the first (missing) actual
Forecast1 <- DC_weather$Forecast[-nrow(DC_weather)] # drop the last forecast (no matching actual)
error <- Actual1 - Forecast1
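
To see the pairing used in the error calculation, line the shifted vectors up in a small data frame (a minimal sketch; the column names are just labels): each forecast is matched with the temperature actually recorded on the next day's row.

# Each forecast paired with the actual from the following row (day)
head(data.frame(Date = DC_weather$Date[-1], Forecast = Forecast1, Actual = Actual1, Error = error))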

Mean Error

# Mean error
ME <- mean(error)
ME
## [1] 0.3823529

Mean Percentage Error

# Mean Percentage Error
MPE <- 100 * mean(error/DC_weather$Actual[-1])
MPE
## [1] 0.3828892

Mean Absolute Error

MAE <- mean(abs(error))
MAE
## [1] 1.911765

Mean Absolute Percentage Error

# Mean absolute percentage error
MAPE <- 100 * mean(abs(error/DC_weather$Actual[-1]))
MAPE
## [1] 2.244587

Mean Square Error

MSE <- mean(error^2)
MSE
## [1] 6.970588

Root Mean Square Error

RMSE <- sqrt(MSE)
RMSE
## [1] 2.640187

Relative Absolute Error

The relative MAE compares the forecast errors with those of a naïve (no-change) forecast, whose errors are simply the first differences of the actual series.

Diff_1 <- diff(DC_weather$Actual,lag = 1,differences = 1)
RelMAE <- sum(abs(error[-1])) / sum(abs(Diff_1[-1]))
RelMAE
## [1] 0.4642857

Theil’s U

Theil’s U makes the same comparison using squared errors; values below 1 indicate the forecasts outperform the naïve no-change forecast.

Theil_U <- sqrt(sum(error[-1]^2)) / sqrt(sum(Diff_1[-1]^2))
Theil_U
## [1] 0.4422042
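
As a cross-check, the forecast package's accuracy() function reports several of these measures (ME, RMSE, MAE, MPE, MAPE) in one call when given the forecasts and the matching actuals as numeric vectors; a minimal sketch:

# accuracy(forecasts, actuals) on plain numeric vectors
library(forecast)
accuracy(Forecast1, Actual1)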

Prediction Interval

Normal Prediction Intervals

If the one-step forecast errors are approximately normally distributed, we can construct a prediction interval whose lower and upper limits are:

\[ \begin{aligned} \text{Lower} &= F_{t+1} - Z \times \text{RMSE}\\ \text{Upper} &= F_{t+1} + Z \times \text{RMSE} \end{aligned} \]

How do we find the Z-score for a given \(100(1-\alpha)\%\) level of confidence?

You can use the function qnorm() to get the Z-score.

\[ \text{qnorm}(1-\frac{\alpha}{2}) \]

# For example, for a 95% confidence level, alpha = 0.05, so 1 - alpha/2 = 0.975
qnorm(0.975)
## [1] 1.959964
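
Other confidence levels follow the same pattern:

qnorm(0.95)   # z for a 90% interval (alpha = 0.10)
## [1] 1.644854
qnorm(0.995)  # z for a 99% interval (alpha = 0.01)
## [1] 2.575829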

For example, we want a 95% prediction interval for 2021-06-06.

RMSE
## [1] 2.640187
DC_weather[6,]
##         Date Forecast Actual
## 6 2021-06-06       85     87
prediction_lower <- DC_weather$Forecast[6] - 1.96 * RMSE
prediction_upper <- DC_weather$Forecast[6] + 1.96 * RMSE
prediction_lower
## [1] 79.82523
prediction_upper
## [1] 90.17477
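
Equivalently, using the qnorm() value directly instead of the rounded 1.96 (a minimal sketch; c(-1, 1) just produces both limits in one expression):

z <- qnorm(0.975)
# Lower and upper 95% limits in one step
DC_weather$Forecast[6] + c(-1, 1) * z * RMSE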

Empirical Prediction Intervals

Instead of assuming normality, an empirical prediction interval adds quantiles of the observed forecast errors to the point forecast.

length(error)
## [1] 34
# Quantile positions based on (n + 1) * p with n = 34 errors
0.025 * 35
## [1] 0.875
0.975 * 35
## [1] 34.125
# So take the 1st and the 34th sorted error terms
sort(error)[1]
## [1] -3
sort(error)[34]
## [1] 9
# therefore the lower and upper limits are:
DC_weather$Forecast[6] + sort(error)[1]
## [1] 82
DC_weather$Forecast[6] + sort(error)[34]
## [1] 94
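
The same limits can be obtained with quantile(); type = 6 uses the (n + 1) * p positions applied above, so here it reproduces the limits computed by hand. A minimal sketch:

# Empirical 95% prediction interval: point forecast plus the 2.5% and 97.5% error quantiles
DC_weather$Forecast[6] + quantile(error, probs = c(0.025, 0.975), type = 6)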