Blog Post: Voice Activity Detection in Elixir with Membrane

Great post! In case anyone else runs into the same issue: the latest `silero_vad.onnx` in their GitHub repo (version 5) expects different inputs than the version used in this post (version 4). The inputs `h` and `c` are now combined into a single state tensor, `{"state", "Float32", [2, nil, 128]}`, and the code below passes the inputs to `Ortex.run/2` as `{input, state, sr}`.
Here is the updated code:

# Starting accumulator for the listener: an all-zero tensor of shape {2, 1, 128}
# that is fed to the model as its "state" input on the first chunk.
init_state = %{state: Nx.broadcast(0.0, {2, 1, 128})}

# Subscribe to the audio input's event stream and run the model on every chunk,
# carrying the model state from one invocation to the next.
Kino.listen(Kino.Control.stream(live_audio), init_state, fn
  %{event: :audio_chunk, chunk: samples}, %{state: prev_state} ->
    # Sample rate is passed as a signed 64-bit scalar tensor.
    sample_rate = Nx.tensor(16_000, type: :s64)
    # Wrap the chunk in an outer list so the tensor gains a leading axis of size 1.
    batch = Nx.tensor([samples])

    # The model returns its output together with the state for the next call.
    {speech, next_state} = Ortex.run(model, {batch, prev_state, sample_rate})

    # Collapse the output to a plain number so it can be plotted.
    probability = Nx.to_number(Nx.squeeze(speech))

    # Plot against the current OS time, keeping only the latest 1000 points.
    Kino.VegaLite.push(chart, %{x: :os.system_time(), y: probability}, window: 1000)

    # Continue listening with the updated model state.
    {:cont, %{state: next_state}}
end)
1 Like