rinterpolationcurve-fittingquadratic

Quadratic/parabolic interpolation


I have this curve (COVID-19 cases per 100,000 inhabitants in California between 2020-09-01 and 2021-03-01):

enter image description here

It's clear that the dip at the end of December 2020 is an artifact of testing's having gone down during the winter holidays (the nadir occurs exactly on Christmas Day) rather than a true decline in cases.

What I would like to do is impute values via some sort of quadratic or parabolic interpolation to come up with plausible values for the real case rate (per 100k) between 2020-12-12 and 2021-01-13. How can I do this?

Here's the code I used to generate the plot:

x <- seq.Date(as.Date("2020-09-01"), as.Date("2021-03-01"), by=1)
y <- c(9.36,9.16,9.05,8.88,8.76,8.65,7.94,7.81,7.65,7.5,7.47,7.5,7.52,8.19,
       8.03,8.1,8.12,8.14,8.19,8.24,8.21,8.19,8.19,8.22,8.24,8.2,8.16,8.14,
       8.16,8.14,8.25,8.19,8.3,8.36,8.45,8.43,8.42,8.44,8.51,8.63,8.62,8.63,
       8.66,8.69,8.73,8.81,8.79,8.9,9.15,9.46,9.67,9.78,10.07,10.19,10.32,10.48,
       10.52,10.69,10.93,11.27,11.68,12.4,13.45,14.66,15.92,17.09,18.15,18.85,
       19.04,19.98,20.93,21.69,22.89,24.28,25.52,26.78,29.08,31.29,33.62,35.34,
       37.11,37.95,38.35,39.59,40.82,42.06,39.44,39.63,41.69,43.73,47.78,52.64,
       57.16,65.24,70.15,72.29,73.01,76.01,78.53,81.46,84.64,87.58,89.86,90.79,
       93.81,96.47,98.05,99.48,100.07,99.73,99.65,99.36,99.52,99.32,92.84,82.53,
       84.34,86.33,89.6,92.99,96.15,99.42,101.56,102.45,102.72,103.63,102.26,101,
       104.48,112.58,109.57,106.79,100.29,94.47,88.73,83.79,79.33,76.19,74.25,
       67.69,63.86,60.59,57.27,54.07,51.8,50.69,49.49,46.01,42.54,39.79,37.28,
       36.11,35.29,33.53,31.66,30.16,28.58,27.37,26.13,25.13,23.06,21.33,19.92,
       18.65,17.51,16.71,16.13,14.63,13.89,13.03,12.27,11.56,11.1,10.79,10.63,
       10.07,9.63,9.28,8.98,8.77,8.61,8.25)

df <- data.frame(x,y)

p <- ggplot(data=df) +
  geom_line(aes(x=as.Date(x, origin=as.Date("1970-01-01")),y=y))
p

I'm not really sure where to begin, so I'd appreciate it if someone tossed me a bone, here. Thanks! :)


Solution

  • A colleague supplied this code:

        #Quadratic Interpolation 
    
    library(magrittr)
    library(dplyr)
    library(ggplot2)
    library(deSolve)
    ca_pop <- 40129160L
    
    x <- seq.Date(as.Date("2020-09-01"), as.Date("2021-03-01"), by=1)
    y <- c(9.36,9.16,9.05,8.88,8.76,8.65,7.94,7.81,7.65,7.5,7.47,7.5,7.52,8.19,
           8.03,8.1,8.12,8.14,8.19,8.24,8.21,8.19,8.19,8.22,8.24,8.2,8.16,8.14,
           8.16,8.14,8.25,8.19,8.3,8.36,8.45,8.43,8.42,8.44,8.51,8.63,8.62,8.63,
           8.66,8.69,8.73,8.81,8.79,8.9,9.15,9.46,9.67,9.78,10.07,10.19,10.32,10.48,
           10.52,10.69,10.93,11.27,11.68,12.4,13.45,14.66,15.92,17.09,18.15,18.85,
           19.04,19.98,20.93,21.69,22.89,24.28,25.52,26.78,29.08,31.29,33.62,35.34,
           37.11,37.95,38.35,39.59,40.82,42.06,39.44,39.63,41.69,43.73,47.78,52.64,
           57.16,65.24,70.15,72.29,73.01,76.01,78.53,81.46,84.64,87.58,89.86,90.79,
           93.81,96.47,98.05,99.48,100.07,99.73,99.65,99.36,99.52,99.32,92.84,82.53,
           84.34,86.33,89.6,92.99,96.15,99.42,101.56,102.45,102.72,103.63,102.26,101,
           104.48,112.58,109.57,106.79,100.29,94.47,88.73,83.79,79.33,76.19,74.25,
           67.69,63.86,60.59,57.27,54.07,51.8,50.69,49.49,46.01,42.54,39.79,37.28,
           36.11,35.29,33.53,31.66,30.16,28.58,27.37,26.13,25.13,23.06,21.33,19.92,
           18.65,17.51,16.71,16.13,14.63,13.89,13.03,12.27,11.56,11.1,10.79,10.63,
           10.07,9.63,9.28,8.98,8.77,8.61,8.25)
    
    df <- data.frame(x,y, day_num = 1:length(y))
    leave_out <- which(df$x > "2020-12-18" & df$x < "2021-01-08")
    
    #Plot curve with missing points
    df[-leave_out,] %>% filter(x > "2020-10-15") %>%
      ggplot() +
      geom_point(aes(x=x,y=y), shape=1, alpha=.7, size=.6) + 
      theme_bw()
    
    df_fit <- df %>%#[-leave_out,] %>%
      filter(x > "2020-10-15") %>%
      mutate(transform_day = 1 / day_num) 
    
    #Plot df_fit
    #ggplot(data=df_fit, aes(x=x, y=transform_day)) + geom_line()
      
    #quad_model <- lm(y ~ (poly(transform_day, 2)), data = df_fit)
    #y_fit <- predict(quad_model)
    #model_fit <- data.frame(x = df_fit$x,y_fit)
    #model_fit %>% filter(x > "2020-10-15") %>%
    #  ggplot() +
    #  geom_line(aes(x = x, y = y_fit)) +
    #  geom_point(data = df_fit, aes(x = x, y =y)) + theme_bw()
    halfway_ind <- round(mean(order(abs(y - 30))[1:2]))
    halfway_ind #116
    halfway_ind60 <- round(mean(order(abs(y - 60))[c(1,3)]))
    halfway_ind60 #118
    ##Let's say 117 for the peak
    df_fit$day_adj <- df_fit$day_num - 117
    df_fit$model <- 150*375/ (df_fit$day_adj^2 + 375)
    df_fit$cases <- df_fit$y * ca_pop / 1e5
    df_fit %>% 
      ggplot() +
      geom_line(aes(x = day_adj, y = model)) +
      geom_point(aes(x = day_adj, y =y)) + theme_bw()