I made a workaround that seems to fix the problem that occurs when an empty tensor is generated (this happens when the audio's duration is an exact multiple of 20 seconds).
# Transcribes the audio at `path` in chunks of `chunk_time` and passes each
# chunk's start offset and text to `func`.
# (`stat`, `chunk_time`, `path` and `func` are bound outside this excerpt.)
#
# Start offsets run from 0 up to `stat.duration` in steps of `chunk_time`.
# When the duration is an exact multiple of `chunk_time`, the last offset
# equals the duration itself — presumably the chunk for which ffmpeg emits
# no samples; confirm against the ffmpeg output.
0..stat.duration//chunk_time
# Debug output of the offset range; dbg/1 returns its argument, so the
# pipeline continues unchanged.
|> dbg()
|> Task.async_stream(
fn ss ->
# ffmpeg options (per the ffmpeg CLI docs): one audio channel, 16 kHz sample
# rate, raw 32-bit little-endian float output, seek to `ss`, read for
# `chunk_time`, quiet logging, write to stdout ("-").
args = ~w(-ac 1 -ar 16k -f f32le -ss #{ss} -t #{chunk_time} -v quiet -)
# Asserts exit status 0; any other status raises a MatchError in the task.
{data, 0} = System.cmd("ffmpeg", ["-i", path] ++ args)
# Only run the serving when ffmpeg produced bytes; an empty binary is
# replaced by an empty transcript instead of being turned into a tensor.
+ if byte_size(data) > 0 do
+ {ss, Nx.Serving.batched_run(WhisperServing, Nx.from_binary(data, :f32))}
+ else
+ {ss, ""}
+ end
end,
# At most two chunks are processed concurrently; tasks never time out.
max_concurrency: 2,
timeout: :infinity
)
# Each stream element is `{:ok, {offset, result}}`. A serving result with
# exactly one `results` entry yields its text; the empty-chunk marker yields
# "". Any other shape raises FunctionClauseError.
|> Enum.map(fn
+ {:ok, {ss, %{results: [%{text: text}]}}} ->
+ func.(ss, text)
+ {:ok, {ss, ""}} ->
+ func.(ss, "")
end)