Please refer to the dput of data . You may directly scroll down to objective and problem statement. Maybe you don't need data as you could have encountered this problem earlier.
Calling required libraries
library(zoo)
library(ggplot2)
library(scales)
library(plotly)
library(ggthemes)
library(forecast)
library(plotly)
library(DescTools)
dput of data
dput(ridership.ts)
structure(c(1709L, 1621L, 1973L, 1812L, 1975L, 1862L, 1940L,
2013L, 1596L, 1725L, 1676L, 1814L, 1615L, 1557L, 1891L, 1956L,
1885L, 1623L, 1903L, 1997L, 1704L, 1810L, 1862L, 1875L, 1705L,
1619L, 1837L, 1957L, 1917L, 1882L, 1933L, 1996L, 1673L, 1753L,
1720L, 1734L, 1563L, 1574L, 1903L, 1834L, 1831L, 1776L, 1868L,
1907L, 1686L, 1779L, 1776L, 1783L, 1548L, 1497L, 1798L, 1733L,
1772L, 1761L, 1792L, 1875L, 1571L, 1647L, 1673L, 1657L, 1382L,
1361L, 1559L, 1608L, 1697L, 1693L, 1836L, 1943L, 1551L, 1687L,
1576L, 1700L, 1397L, 1372L, 1708L, 1655L, 1763L, 1776L, 1934L,
2008L, 1616L, 1774L, 1732L, 1797L, 1570L, 1413L, 1755L, 1825L,
1843L, 1826L, 1968L, 1922L, 1670L, 1791L, 1817L, 1847L, 1599L,
1549L, 1832L, 1840L, 1846L, 1865L, 1966L, 1949L, 1607L, 1804L,
1850L, 1836L, 1542L, 1617L, 1920L, 1971L, 1992L, 2010L, 2054L,
2097L, 1824L, 1977L, 1981L, 2000L, 1683L, 1663L, 2008L, 2024L,
2047L, 2073L, 2127L, 2203L, 1708L, 1951L, 1974L, 1985L, 1760L,
1771L, 2020L, 2048L, 2069L, 1994L, 2075L, 2027L, 1734L, 1917L,
1858L, 1996L, 1778L, 1749L, 2066L, 2099L, 2105L, 2130L, 2223L,
2174L, 1931L, 2121L, 2076L, 2141L, 1832L, 1838L, 2132L), .Tsp = c(1991,
2004.16666666667, 12), class = "ts")
Creating data frame of ts object to use ggplot
tsd = data.frame(time = as.Date(ridership.ts),
value = as.matrix(ridership.ts))
Building linear model
ridership.lm <- tslm(ridership.ts ~ trend + I(trend^2))
Adding new column to existing data frame tsd
tsd$linear_fit = as.matrix(ridership.lm$fitted.values)
Defining length of validation and training period
nValid = 36
nTrain = length(ridership.ts) - nValid
Training data
train.ts = window(ridership.ts,
start = c(1991, 1),
end = c(1991, nTrain))
validation data
valid.ts = window(ridership.ts,
start = c(1991, nTrain + 1),
end = c(1991, nTrain + nValid))
Building model
ridership.lm = tslm(train.ts ~ trend + I(trend^2))
Forecasting using our build model
ridership.lm.pred = forecast(ridership.lm, h = nValid, level = 0)
Making dataframe for the fitted model values
tsd_train_model = data.frame(time = as.Date(train.ts),
lm_fit_train = as.matrix(ridership.lm$fitted.values))
Making dataframe for plotting purpose
forecast_df = data.frame(time = as.Date(valid.ts),
value = as.matrix(ridership.lm.pred$mean))
Creating plot using ggplot
p1 = ggplot(data = tsd,
aes(x = time, y = value)) +
geom_line(color = 'blue') +
ylim(1300, 2300) +
geom_line(data = tsd_train_model,
aes(x = time, y = lm_fit_train),
color = 'red')
p2 = p1 +
geom_line(data = forecast_df,
aes(x = time, y = value),
col = 'red', linetype = 'dotted') +
scale_x_date(breaks = date_breaks('1 years'),
labels = date_format('%b-%y')) +
geom_vline(xintercept = as.numeric(c(tsd_train_model[NROW(tsd_train_model), ]$time, #last date of training period
forecast_df[NROW(forecast_df), ]$time))) #last date of testing period
p3 = p2 +
annotate('text',
x = c(tsd_train_model[NROW(tsd_train_model)/2, ]$time,
forecast_df[NROW(forecast_df) / 2,]$time),
y = 2250,
label = c('Training Period', 'Validation Period'))
Objective: I want to add forecast error of 5 percentile and 95 percentile on both side of predicted line (dotted red in this figure) and shade the region.
I used quantile for producting forecast range
q = quantile(ridership.lm.pred$residuals, c(.05, .95))
percentile_5 = as.numeric(q[1])
percentile_95 = as.numeric(q[2])
Add 5 percentile and 95 percentile to the forecast data
yl = forecast_df$value + percentile_5
ym = forecast_df$value + percentile_95
Problem: If I use the below command then it is not displaying the shaded region for the complete validation period.
p3 + geom_ribbon(data = forecast_df,
aes(ymin = yl,
ymax = ym),
fill="gray30")
NROW(yl)
[1]36
sum(is.na(yl))
[1] 0
NROW(ym)
[1] 36
sum(is.na(ym))
[1] 0
Things Tried: If I replace the value of ymin and ymax by any other value for example If I use the below command then I get the figure shown just below the command
p3 + geom_ribbon(data = forecast_df,
aes(ymin = rep(1750,36),
ymax = rep(2000,36),
fill="gray30"))
My Questions:
Can anyone please tell me the reason behind the output in figure 2? Why R is giving strange output as in figure 2?
Can anyone please help me to shade the complete region using ggplot?
TLDR: remove the line ylim(1300, 2300) +
from your ggplot
code.
When you set your plot's limits using scale_x_***()
/ scale_y_***
(or equivalently xlim()
/ ylim()
), the plot will throw away all data points that fall outside this range. In the case of a geom_ribbon that requires both ymin & ymax values, when the values corresponding to ymax get removed (since they are larger than 2300), the ribbon can't be plotted with only ymin, hence the ribbon stops short before then.
If you really want to plot only for the range (1300, 2300), set the limits inside coord_cartesian()
instead. This enables the plot to zoom to the range limit, without discarding data points outside. See the documentation for more information.
Other non-essential suggestions below:
For plotting in ggplot, I'd usually try to keep everything within the same data frame, as far as possible, to utilise common variables in the aesthetic mappings. Here's how I'd do it:
Combining everything into a single data frame:
library(dplyr)
df <- left_join(tsd %>% select(time, value),
rbind(tsd_train_model %>%
rename(fit = lm_fit_train) %>%
mutate(status = "train"),
forecast_df %>%
rename(fit = value) %>%
mutate(status = "valid")))
df <- df %>%
mutate(yl = ifelse(status == "valid", fit + percentile_5, NA),
ym = ifelse(status == "valid", fit + percentile_95, NA))
> head(df)
time value fit status yl ym
1 1991-01-01 1709 1882.681 train NA NA
2 1991-02-01 1621 1876.546 train NA NA
3 1991-03-01 1973 1870.518 train NA NA
4 1991-04-01 1812 1864.597 train NA NA
5 1991-05-01 1975 1858.784 train NA NA
6 1991-06-01 1862 1853.078 train NA NA
> tail(df)
time value fit status yl ym
154 2003-10-01 2121 2190.490 valid 1934.914 2397.875
155 2003-11-01 2076 2200.756 valid 1945.179 2408.141
156 2003-12-01 2141 2211.129 valid 1955.553 2418.514
157 2004-01-01 1832 2221.609 valid 1966.033 2428.994
158 2004-02-01 1838 2232.197 valid 1976.620 2439.582
159 2004-03-01 2132 2242.891 valid 1987.315 2450.277
Plot
ggplot(data = df,
aes(x = time)) +
# place the ribbon below all other geoms for easier viewing, & increase transparency
geom_ribbon(aes(ymin = yl, ymax = ym), fill = "gray30", alpha = 0.2) +
# original values
geom_line(aes(y = value), color = "blue") +
# fitted values (line type differs by training / validation)
geom_line(aes(y = fit, linetype = status), color = "red") +
# indicates validation range
geom_vline(xintercept = c(min(df$time[df$status=="valid"]),
max(df$time[df$status=="valid"]))) +
scale_x_date(breaks = scales::date_breaks('1 year'),
labels = scales::date_format('%b-%y')) +
# hide legend for line type (comment this line out if you want to show it)
scale_linetype(guide = F) +
# limits can be tweaked here
coord_cartesian(ylim = c(1300, 2500)) +
# plain white plot background for easier viewing
theme_classic()
Edit: Alternative solution that makes legends easier:
# create long data frame where all values (original / training / validation) are
# in the same column
df2 <- rbind(tsd %>% select(time, value) %>%
mutate(status = "original"),
tsd_train_model %>%
rename(value = lm_fit_train) %>%
mutate(status = "train"),
forecast_df %>%
mutate(status = "valid")) %>%
mutate(yl = ifelse(status == "valid", value + percentile_5, NA),
ym = ifelse(status == "valid", value + percentile_95, NA))
# in the scales for colour / line type, define the same labels in order to
# combine the two legends
ggplot(data = df2,
aes(x = time)) +
geom_ribbon(data = subset(df2, !is.na(yl)),
aes(ymin = yl, ymax = ym, fill = "interval"), alpha = 0.2) +
geom_line(aes(y = value, color = status, linetype = status)) +
geom_vline(xintercept = c(min(df2$time[df$status=="valid"]),
max(df2$time[df$status=="valid"]))) +
scale_x_date(breaks = scales::date_breaks('1 year'),
labels = scales::date_format('%b-%y')) +
scale_color_manual(name = "",
values = c("original" = "blue",
"train" = "red",
"valid" = "red")) +
scale_linetype_manual(name = "",
values = c("original" = "solid",
"train" = "solid",
"valid" = "longdash")) +
scale_fill_manual(name = "",
values = c("interval" = "gray30")) +
coord_cartesian(ylim = c(1300, 2500)) +
theme_classic() +
theme(legend.position = "bottom")