miles <- read.csv("Revenue_miles_2.csv")
str(miles)
## 'data.frame': 192 obs. of 3 variables:
## $ DATE : chr "2000-01-01" "2000-02-01" "2000-03-01" "2000-04-01" ...
## $ Miles.Thousands.: int 49843099 49931931 61478163 58981617 61223861 65601574 67898320 67028338 56441629 58834210 ...
## $ Miles..Billions.: num 4.98 4.99 6.15 5.9 6.12 ...
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
miles.ts <- ts(miles$Miles..Billions.,start = c(2000,1),end = c(2004,12),frequency = 12)
autoplot(miles.ts,xlab = "Month", ylab = "Miles (Billions)", main = "Miles vs Month")
seasonplot(miles.ts,xlab = "Month", ylab = "Miles (Billions)", main = "Seasonal Plot of Miles vs Month",year.labels = T,col = 1:4)
Dulles <- read.csv("Dulles_2.csv", header = T)
summary(Dulles)
## Year Passengers..000s.
## Min. :1963 Min. : 641
## 1st Qu.:1976 1st Qu.: 2083
## Median :1989 Median : 8947
## Mean :1989 Mean : 8456
## 3rd Qu.:2002 3rd Qu.:14393
## Max. :2015 Max. :22129
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
Dulles.plot <- ts(data = Dulles$Passengers..000s., start = as.yearmon("1963-12"), frequency = 1)
plot(Dulles.plot, xlab = "Year", ylab = "Passengers/K",main = "Dulles Airport Passengers")
autoplot(Dulles.plot, xlab = "Year", ylab = "Passengers/K",main = "Dulles Airport Passengers")
A simple way to view a single (or “first order”) difference is to see it as \[ Y_t−Y_{t−k} \] Time Series Plot for the First Differences of the Dulles Passenger Series.
x:
a numeric vector or matrix containing the values to be
differenced.
lag:
an integer indicating which lag to use.
differences
an integer indicating the order of the difference.(We won’t use
this argument much)
x <- c(3,2,4,6,7,3,1,8)
x
## [1] 3 2 4 6 7 3 1 8
diff(x, lag = 1)
## [1] -1 2 2 1 -4 -2 7
diff(x, lag = 2)
## [1] 1 4 3 -3 -6 5
Diff_1 <- diff(Dulles.plot, differences = 1)
plot(Diff_1, xlab='Time', ylab = "Difference", main = "First order difference of passengers at Dulles Airport")
autoplot(Diff_1, xlab='Time', ylab = "Difference", main = "First order difference of passengers at Dulles Airport")
Growth_1 <- 100* Diff_1 / Dulles.plot
plot(Growth_1, xlab='Time',ylab = "Growth", main= "Growth of # of Passengers at Dulles Airport")
autoplot(Growth_1, xlab='Time',ylab = "Growth", main= "Growth of # of Passengers at Dulles Airport")
Log_Dulles <- log(Dulles.plot)
plot(Log_Dulles, xlab='Time', ylab = "Log # of panssengers",main = "Dulles Airport Log # of Passengers")
autoplot(Log_Dulles, xlab='Time', ylab = "Log # of panssengers",main = "Dulles Airport Log # of Passengers")
log_diff_1 <- diff(Log_Dulles, differences = 1)
plot(log_diff_1, xlab='Time', ylab = "Difference", main = "First order difference of log passengers at Dulles Airport")
autoplot(log_diff_1, xlab='Time', ylab = "Difference", main = "First order difference of log passengers at Dulles Airport")
Do the log_diff plot looks like the growth plot?
layout(t(1:2))
plot(log_diff_1, xlab='Time', ylab = "Difference", main = "Log_Diff")
plot(Growth_1, xlab='Time',ylab = "Growth", main= "Growth")
layout(1)
This process is known as rolling origin forecasting. The one-step-ahead forecast error at time t+i may be denoted by
\[ e_{t+i}=Y_{t+i}−F_{t+i} \]
DC_weather <- read.csv("dc_weather.csv", header = T)
colnames(DC_weather) <- c("Date", "Forecast", "Actual")
DC_weather$Date <- as.Date(DC_weather$Date, "%m/%d/%y")
str(DC_weather)
## 'data.frame': 35 obs. of 3 variables:
## $ Date : Date, format: "2021-06-01" "2021-06-02" ...
## $ Forecast: int 82 82 83 86 85 85 74 80 81 94 ...
## $ Actual : int NA 82 81 82 85 87 87 74 82 82 ...
#summary(DC_weather)
# Draw time series
plot(x = DC_weather$Date, y = DC_weather$Forecast, xlab='Time', ylab="Forecast", type="l", col="blue")
lines(x = DC_weather$Date, y = DC_weather$Actual, xlab='Time', ylab="Actual", col="red")
# Calculate the forecast error
Actual1 <- DC_weather$Actual[c(-1)]
Forecast1 <- DC_weather$Forecast[-dim(DC_weather)[1]]
error <- Actual1 - Forecast1
Mean Error
# Mean error
ME <- mean(error)
ME
## [1] 0.3823529
Mean Percentage Error
# Mean Percentage Error
MPE <- 100 * mean(error/DC_weather$Actual[-1])
MPE
## [1] 0.3828892
Mean absolute error
MAE <- mean(abs(error))
MAE
## [1] 1.911765
Mean absolute percentage error
# Mean absolute percentage error
MAPE <- 100 * mean(abs(error/DC_weather$Actual[-1]))
MAPE
## [1] 2.244587
Mean square error
MSE <- mean(error^2)
MSE
## [1] 6.970588
Root mean square error
RMSE <- sqrt(MSE)
RMSE
## [1] 2.640187
Relative absolute error
Diff_1 <- diff(DC_weather$Actual,lag = 1,differences = 1)
RelMAE <- sum(abs(error[-1])) / sum(abs(Diff_1[-1]))
RelMAE
## [1] 0.4642857
Theil’s U
Theil_U <- sqrt(sum(error[-1]^2)) / sqrt(sum(Diff_1[-1]^2))
Theil_U
## [1] 0.4422042
We can use the method of Normal Distribution to get a prediction confidence interval. We can get the lower and upper limits through the way of:
\[ \begin{aligned} \text{Lower} &= F_{t+1} - Z * \text{RMSE}\\ \text{Upper} &= F_{t+1} + Z * \text{RMSE}\\ \end{aligned} \] How to get the corresponding Z-score to a given \(100*(1-\alpha)\) level of confidence?
You can use the function qnorm() to get the Z-score.
\[ \text{qnorm}(1-\frac{\alpha}{2}) \]
# For example, you want to get the 95% confidence level. Then here the aplha = 0.05. So (1 - alpha/2) = 0.975
qnorm(0.975)
## [1] 1.959964
For example, we want to get a 95% Confidence Interval for 2022-06-06.
RMSE
## [1] 2.640187
DC_weather[6,]
## Date Forecast Actual
## 6 2021-06-06 85 87
prediction_lower <- DC_weather$Forecast[6] - 1.96 * RMSE
prediction_upper <- DC_weather$Forecast[6] + 1.96 * RMSE
prediction_lower
## [1] 79.82523
prediction_upper
## [1] 90.17477
length(error)
## [1] 34
0.025 * 35
## [1] 0.875
0.975 * 35
## [1] 34.125
# So let's take the 1st error term and the 34th error term
sort(error)[1]
## [1] -3
sort(error)[34]
## [1] 9
# therefore the lower and upper limits are:
DC_weather$Forecast[6] + sort(error)[1]
## [1] 82
DC_weather$Forecast[6] + sort(error)[34]
## [1] 94