I want to use linear interpolation to create a finer dataset where my responses y are non-monotonic and I need to interpolate onto a fixed y-grid (this is not negotiable). I have a script that does this, I am wondering if there is an easier approach I could use.
system("Rscript -e 'q(\"no\")'") # command to restart R
gc() # should clear global variables
rm(list = ls()) # clear the workspace
library(tidyverse)
library(dplyr)
library(ggplot2)
# the original data
x = c(0,1,2,3,4,5,6,7,8)
y = c(3,6,1,2,4,6,7,5.5,3.5)
df <- data.frame(x = x, y = y)
# the y values I want to interpolate onto
ynew = seq(min(y), max(y), length.out = 21)
interpolatedX <- function(i, x, y, ynew) { # function that calculates the new X value given the points at the ends
m = (y[i+1] - y[i]) / (x[i+1] - x[i])
c = y[i] - m * x[i]
Xnew = (ynew - c) / m
return(Xnew)
}
mydata <- data.frame(X = numeric(), y_new = numeric())
for (i in 1:(length(y) - 1)) { # work through each of the y-values in the observations
ystart <- y[i]
yend <- y[i + 1]
# find location of values in ynew that are between the y-points
if (i == length(y) - 1) { # if we are at the last point
if (ystart < yend) {
Y_inbetween_start_and_end <- which(ynew >= ystart & ynew <= yend)
} else {
Y_inbetween_start_and_end <- which(ynew >= yend & ynew <= ystart)
}
} else { # if we are not at the last point
if (ystart < yend) {
Y_inbetween_start_and_end <- which(ynew >= ystart & ynew < yend)
} else {
Y_inbetween_start_and_end <- which(ynew > yend & ynew <= ystart)
}
}
for (j in 1:(length(Y_inbetween_start_and_end) - 1)) {
pos_in_ynew <- Y_inbetween_start_and_end[j]
newX <- interpolatedX(i, x, y, ynew[pos_in_ynew] )
mydata<- rbind(mydata, data.frame(X = newX, y_new = ynew[pos_in_ynew]))
}
}
dfi <- mydata %>% drop_na() %>% arrange(X)
ggplot() +
geom_point(data = df, aes(x = x, y = y), color = "blue", size = 2, stroke = NA, alpha = 0.4) +
geom_point(data = dfi, aes(x = X, y = y_new), color = "red", size = 1, stroke = NA) +
labs(x = "X", y = "Y", title = "Original vs Interpolated Data") +
theme_minimal()
I'm not sure this is much of an improvement, but it demonstrates an approach relying on some dplyr data wrangling and the base approx
function.
# prepare version of data with start and end coordinates of each segment
bind_rows(df |> mutate(segment = row_number()),
df |> mutate(segment = row_number() - 1)) |>
filter(n() == 2, .by = segment) |>
mutate(low = min(y), high = max(y), .by = segment) |>
# create a row for every target y within each segment y range
left_join(data.frame(y_pts = seq(min(y), max(y), length.out = 21)),
join_by(low <= y_pts, high >= y_pts)) |>
# use `approx` with x/y swapped since we are solving for x from y
mutate(x_fit = approx(y, x, xout = y_pts)$y, .by = segment) |>
# plot
ggplot(aes(x_fit, y_pts)) +
geom_point() +
geom_point(aes(x,y), data = df, color = "red")