Here dput of my train sample
dput(head(train,10))
train=structure(list(d_event_date = structure(c(19702, 19702, 19702,
19702, 19702, 19702, 19702, 19702, 19702, 19702), class = "Date"),
id_placement = c(20199549L, 20396189L, 20420889L, 16751495L,
17220762L, 15387714L, 20355507L, 20385438L, 17752382L, 21253771L
), id_zone = c(1632853L, 2836041L, 2919363L, 1709628L, 9937L,
1274157L, 1400727L, 2907891L, 1948324L, 1863050L), id_publisher = c(302501L,
480683L, 1204726L, 4468L, 5515L, 163955L, 176741L, 421345L,
441862L, 467632L), id_advertiser = c(914L, 914L, 914L, 914L,
914L, 914L, 914L, 914L, 914L, 914L), id_campaign = c(687314L,
687314L, 687314L, 723900L, 723900L, 723900L, 723900L, 723900L,
723900L, 723900L), id_banner = c(2032222L, 2032222L, 2032222L,
2136674L, 2136674L, 2136674L, 2136674L, 2136674L, 2136674L,
2136674L), id_landing = c(4772102L, 4772102L, 4699256L, 4821730L,
4821514L, 4821513L, 4821514L, 4823676L, 4821514L, 4821730L
), id_ad_unit = c(28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L,
28L, 28L), n_landing_pricing_type = c(3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), n_impression_qd = c(1L, 8L, 1L, 1L,
25L, 4L, 35L, 1L, 5L, 1L), n_click_qd = structure(c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0), class = "integer64"), n_conversion_qd = c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L), n_gross = c(0.0024,
0.0024, 0.0024, 0.008, 0.032499998, 0.0054, 0.064999996,
0.0076, 0.0129, 0.0239), n_net = c(0.001632, 0.00036, 0.001632,
0.0052, 0, 0.0027, 0.05135, 0.0038, 0.00774, 0.016252)), row.names = c(NA,
10L), class = "data.frame")
i create time series model using Arima. Here my code
# Function for data normalization
normalize_data <- function(data, method) {
normalized_data <- NULL
if (method == "none") {
normalized_data <- data
} else if (method == "min-max") {
min_val <- min(data)
max_val <- max(data)
normalized_data <- (data - min_val) / (max_val - min_val)
} else if (method == "z-score") {
mean_val <- mean(data)
std_val <- sd(data)
normalized_data <- (data - mean_val) / std_val
} else if (method == "cubic-root") {
normalized_data <- sign(data) * abs(data)^(1/3)
} else if (method == "logarithmic") {
normalized_data <- log(1 + data)
}
return(normalized_data)
}
# Function for calculating MAPE
calculate_mape <- function(actual, forecasted) {
mape <- mean(abs((actual - forecasted) / actual)) * 100
return(map)
}
# Normalization and model building for each variable
variables <- c("n_impression_qd", "n_conversion_qd", "n_gross")
normalization_methods <- c("none", "min-max", "z-score", "cubic-root", "logarithmic")
mape_results <- data.frame(variable = character(), normalization = character(), mape = numeric(), stringsAsFactors = FALSE)
for (variable in variables) {
for (method in normalization_methods) {
normalized_data <- normalize_data(data_filtered[[variable]], method)
# Convert to time series object
ts_data <- ts(normalized_data)
# Building an ARIMA model
model <- auto.arima(ts_data)
#Forecasting
forecast <- forecast(model, h = 10)$mean
# MAPE calculation
actual <- tail(data_filtered[[variable]], 10)
mape <- calculate_mape(actual, forecast)
# Saving results
result <- data.frame(variable = variable, normalization = method, mape = mape)
mape_results <- rbind(mape_results, result)
}
}
# Output results
print(map_results)
it works, but when i try apply model on my test sample for example like that
# Convert 'd_event_date' column to date format
test_data$d_event_date <- as.Date(test_data$d_event_date)
# Function to denormalize data
denormalize_data <- function(data, method, min_val, max_val, mean_val, std_val) {
denormalized_data <- NULL
if (method == "none") {
denormalized_data <- data
} else if (method == "min-max") {
denormalized_data <- data * (max_val - min_val) + min_val
} else if (method == "z-score") {
denormalized_data <- data * std_val + mean_val
} else if (method == "cubic-root") {
denormalized_data <- sign(data) * abs(data)^3
} else if (method == "logarithmic") {
denormalized_data <- exp(data) - 1
}
return(denormalized_data)
}
# Variables and normalization methods
variables <- c("n_impression_qd", "n_conversion_qd", "n_gross")
normalization_methods <- c("none", "min-max", "z-score", "cubic-root", "logarithmic")
# Result storage
prediction_results <- data.frame(variable = character(), actual = numeric(), predicted = numeric(), stringsAsFactors = FALSE)
# Predict and calculate MAPE for each variable and normalization method
for (variable in variables) {
for (method in normalization_methods) {
# Retrieving data and normalization parameters for a variable
data <- test_data[[variable]]
min_val <- min(data)
max_val <- max(data)
mean_val <- mean(data)
std_val <- sd(data)
# Data normalization
normalized_data <- normalize_data(data, method)
# Convert to time series object
ts_data <- ts(normalized_data)
#Forecasting
forecast <- forecast(model, h = length(ts_data))$mean
# Denormalization of predicted values
denormalized_forecast <- denormalize_data(forecast, method, min_val, max_val, mean_val, std_val)
# MAPE calculation
actual <- data
mape <- calculate_mape(actual, denormalized_forecast)
# Saving results
result <- data.frame(variable = variable, actual = actual, predicted = denormalized_forecast)
prediction_results <- rbind(prediction_results, result)
}
}
i get the error
Error in `[<-.ts`(`*tmp*`, ri, value = c(135.945603813953, 177.486351819609, :
Only element replacements are allowed
example test sample
dput(test_data)
structure(list(d_event_date = structure(c(19702L, 19702L, 19702L,
19702L, 19702L, 19702L, 19702L, 19702L, 19702L, 19702L), class = c("IDate",
"Date")), id_placement = c(20794491L, 20794491L, 17788422L, 20654348L,
21496171L, 18582636L, 19840341L, 17218292L, 18423615L, 10973945L
), id_zone = c(2859002L, 2859002L, 1781319L, 1733870L, 2096492L,
1933910L, 2717419L, 1850302L, 2245073L, 46564L), id_publisher = c(352465L,
352465L, 143258L, 304470L, 627080L, 59862L, 1052120L, 425742L,
38L, 34624L), id_advertiser = c(914L, 914L, 914L, 914L, 914L,
2834L, 2834L, 8579L, 15862L, 15862L), id_campaign = c(689656L,
689656L, 723900L, 723900L, 723900L, 873762L, 882336L, 360584L,
872210L, 872213L), id_banner = c(2038567L, 2038567L, 2136674L,
2136674L, 2136674L, 2546849L, 2568794L, 1181453L, 2543069L, 2543072L
), id_landing = c(3814050L, 4590592L, 4823675L, 4821514L, 4823676L,
4601283L, 4811884L, 4801168L, 4595695L, 4595698L), id_ad_unit = c(5L,
5L, 28L, 28L, 28L, 27L, 29L, 28L, 28L, 28L), n_landing_pricing_type = c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L), n_impression_qd = c(13L,
11L, 2L, 1L, 8L, 5L, 349L, 2L, 132L, 7L), n_click_qd = structure(c(0,
0, 0, 0, 0, 0, 1.15611361126852e-321, 0, 0, 0), class = "integer64"),
n_conversion_qd = c(1L, 2L, 1L, 1L, 3L, 2L, 5L, 1L, 3L, 1L
), n_gross = c(0.003, 0.0065, 0.0124, 0.0167, 0.090299998,
0.00054504, 0.03256, 0.0263, 0.08052, 0.00404), n_net = c(9e-04,
0.00195, 0.00868, 0.012024, 0.0686280016, 0.0003706272, 0.0221408,
0.017884, 0.032208, 0.00202)), row.names = c(NA, 10L), class = "data.frame")
why this error occured, and how can i fix it? Thank you for your help. After the forecast, I denormalize the data back to absolute values. The desired result is for each variable in the test sample the value that was and the one that was predicted for each model built using one or another normalization method
structure(list(n_impression_qd_actual = 12:16, n_impression_qd_pred = 12:16,
model.normalize_n_impression_qd = c("none", "min-max", "z-score",
"cubic-root", "logarithmic"), n_conversion_qd_actual = 12:16,
n_conversion_qd_pred = 12:16, model.normalize_n_n_conversion_qd = c("none",
"min-max", "z-score", "cubic-root", "logarithmic"), n_gross_actual = 12:16,
n_gross_pred = 12:16, n_gross_model.normalize = c("none",
"min-max", "z-score", "cubic-root", "logarithmic")), class = "data.frame", row.names = c(NA,
-5L))
This could be a comment but I have a suggestion to make on the normalization/denormalization functions and will post them as an answer.
The solution to the problem is to coerce the "ts"
object to numeric before rbind
ing with the result.
for (variable in variables) {
for (method in normalization_methods) {
# [rest of code omitted]
# Saving results
result <- data.frame(variable = variable, actual = actual, predicted = denormalized_forecast)
# include this line
result$predicted <- as.numeric(result$predicted)
#
prediction_results <- rbind(prediction_results, result)
}
}
To normalize the data raises no problems unless it is to be denormalized later in which case the denormalization code can be made simpler. If the normalization function saves the method and parameters as returned object attributes then those attributes can be used later with no need to recompute them and to have so many denormalization function arguments.
# New function for data normalization
normalize_data <- function(data, method) {
normalized_data <- NULL
params <- NULL
if (method == "none") {
normalized_data <- data
} else if (method == "min-max") {
min_val <- min(data)
max_val <- max(data)
normalized_data <- (data - min_val) / (max_val - min_val)
params <- c(min = min_val, max = max_val)
} else if (method == "z-score") {
mean_val <- mean(data)
std_val <- sd(data)
normalized_data <- (data - mean_val) / std_val
params <- c(mean = mean_val, sd = std_val)
} else if (method == "cubic-root") {
normalized_data <- sign(data) * abs(data)^(1/3)
} else if (method == "logarithmic") {
normalized_data <- log1p(data)
}
attr(normalized_data, "method") <- method
if(!is.null(params))
attr(normalized_data, "params") <- params
normalized_data
}
# New denormalization function
denormalize_data <- function(data, attrib) {
method <- attrib$method
denormalized_data <- NULL
if (method == "none") {
denormalized_data <- data
} else if (method == "min-max") {
min_val <- attrib$params["min"]
max_val <- attrib$params["max"]
denormalized_data <- data * (max_val - min_val) + min_val
} else if (method == "z-score") {
mean_val <- attrib$params["mean"]
std_val <- attrib$params["sd"]
denormalized_data <- data * std_val + mean_val
} else if (method == "cubic-root") {
denormalized_data <- sign(data) * abs(data)^3
} else if (method == "logarithmic") {
denormalized_data <- expm1(data)
}
denormalized_data
}
The call to denormalize the data is now
denormalized_forecast <- denormalize_data(forecast, attributes(normalized_data))