Timeseries upsampling with linear interpolation in explorer

I don’t believe Explorer supports interpolation at this time. Until it does, I think you’ll need to use a workaround.

Here is an example workaround:

alias Explorer.DataFrame, as: DF
alias Explorer.Series, as: S
require DF

defmodule Interpolate do
  def evenly_spaced(x_step, df, x_col, y_col) do
    n_rows = DF.n_rows(df)
    last = DF.slice(df, n_rows - 1, 1)[[x_col, y_col]]

    DF.new(
      x1: S.head(df[x_col], n_rows - 1),
      x2: S.tail(df[x_col], n_rows - 1),
      y1: S.head(df[y_col], n_rows - 1),
      y2: S.tail(df[y_col], n_rows - 1),
    )
    |> DF.to_rows_stream()
    |> Stream.map(fn %{"x1" => x1, "y1" => y1, "x2" => x2, "y2" => y2} ->
      m = (y2 - y1) / (x2 - x1)
      x = x1..x2//x_step |> Enum.to_list() |> S.from_list()
      n = S.size(x)
      x = S.head(x, n - 1)
      y = x |> S.subtract(x1) |> S.multiply(m) |> S.add(y1)

      DF.new(%{x_col => x, y_col => y})
    end)
    |> Enum.reduce(&DF.concat_rows(&2, &1))
    |> DF.concat_rows(last)
  end
end

# Build data
data = [
  ["2023-02-13", 100],
  ["2023-02-15", 100.01],
  ["2023-02-16", 100.09],
  ["2023-02-17", 101.02],
  ["2023-02-20", 105.00],
  ["2023-02-22", 103.06]
]

df = (
  data
  |> Enum.map(fn [date, value] ->
    %{date: Date.from_iso8601!(date), value: value}
  end)
  |> DF.new()
)

# Cast date column to integer.
# (Units make working with Date/Durations a bit awkward.)
df = DF.mutate(df, date_int: cast(date, :integer))

# Run interpolation.
interp = Interpolate.evenly_spaced(1, df, "date_int", "value")

# Cast back to date.
interp = DF.mutate(interp, date: cast(date_int, :date))

# Print result.
interp[["date", "value"]] |> DF.print(limit: :infinity)

# +---------------------------------------------+
# | Explorer DataFrame: [rows: 10, columns: 2]  |
# +------------------+--------------------------+
# |       date       |          value           |
# |      <date>      |         <float>          |
# +==================+==========================+
# | 2023-02-13       | 100.0                    |
# +------------------+--------------------------+
# | 2023-02-14       | 100.005                  |
# +------------------+--------------------------+
# | 2023-02-15       | 100.01                   |
# +------------------+--------------------------+
# | 2023-02-16       | 100.09                   |
# +------------------+--------------------------+
# | 2023-02-17       | 101.02                   |
# +------------------+--------------------------+
# | 2023-02-18       | 102.34666666666666       |
# +------------------+--------------------------+
# | 2023-02-19       | 103.67333333333333       |
# +------------------+--------------------------+
# | 2023-02-20       | 105.0                    |
# +------------------+--------------------------+
# | 2023-02-21       | 104.03                   |
# +------------------+--------------------------+
# | 2023-02-22       | 103.06                   |
# +------------------+--------------------------+

This workaround will be quite slow. If you need a performant solution, you’ll need to shell out to Polars proper.

Also, note that because of the simplicity of your use case (evenly spaced dates), I was able to use a simple algorithm. In the general case is more complicated.

4 Likes