Supervisor is not restarting worker

Hi there,

I am struggling to find the problem with this piece of code: I start a worker (Scheduler) in a supervisor (the root supervisor of my app if that matters). For some reason, the worker is raising an error after some seconds and is dying.

I would expect the supervisor to restart the worker but it does not. I was trying for hours to find the problem. Any ideas?

defmodule MyProj.Scheduler do
  def start_link() do
    IO.puts "starting"
    Task.start(__MODULE__, :process, [])
  end

  def process() do
    # testing
    :timer.sleep(3000)
    raise "crash"
  end
end

defmodule MyProj.Application do
  use Application
  import Supervisor.Spec, warn: true
  
  def start(_type, _args) do
    # List all child processes to be supervised
    children = [
      worker(MyProj.Scheduler, []),
    ]

    opts = [strategy: :one_for_one, name: MyProj.Supervisor]
    Supervisor.start_link(children, opts)
  end
end

Task.start/3 creates a new process which is not linked, but you need to link your process to make the Supervisor be able to see it crashing. Try Task.start_link/* instead.

But that is only a quick guess, as I’m only passing by, preparing for an appointement.

5 Likes

Yeah, that helped. Thanks!!! I thought that the supervisor detected the killed process, since it was printing an error message after the process crashed.

You may actually be looking for DynamicSupervisor (or perhaps Task.Supervisor)

defmodule MyProj.Application do
  # file: lib/my_proj/application.ex

  # created with:
  # $ mix new --sup my_proj
  use Application

  def start(_type, _args) do
    children = [
      {DynamicSupervisor, [strategy: :one_for_one, name: MyProj.WorkSupervisor]}
    ]

    opts = [strategy: :one_for_one, name: MyProj.Supervisor]
    # root supervisor
    on_start = Supervisor.start_link(children, opts)

    Task.start(&do_stuff/0)
    on_start
  end

  def do_stuff do
    Process.sleep(1000)
    init_arg = []

    spec = %{
      id: MyProj.Worker,
      start: {MyProj.Worker, :start_link, [init_arg]},
      restart: :transient
    }

    # :transient is necessary so children can 
    # terminate normally without being restarted.

    {:ok, worker1} = DynamicSupervisor.start_child(MyProj.WorkSupervisor, spec)
    {:ok, worker2} = DynamicSupervisor.start_child(MyProj.WorkSupervisor, spec)
    {:ok, worker3} = DynamicSupervisor.start_child(MyProj.WorkSupervisor, spec)
    {:ok, worker4} = DynamicSupervisor.start_child(MyProj.WorkSupervisor, spec)

    Process.sleep(100)
    IO.puts("#{inspect(DynamicSupervisor.count_children(MyProj.WorkSupervisor))}")

    IO.puts("\nWorker exits (not :normal)")
    MyProj.Worker.exit(worker1)
    Process.sleep(100)
    IO.puts("#{inspect(DynamicSupervisor.count_children(MyProj.WorkSupervisor))}")

    IO.puts("\nWorker raises RuntimeError")
    MyProj.Worker.crash(worker2)
    Process.sleep(100)
    IO.puts("#{inspect(DynamicSupervisor.count_children(MyProj.WorkSupervisor))}")

    IO.puts("\nWorker is killed")
    Process.exit(worker3, :kill)
    Process.sleep(100)
    IO.puts("#{inspect(DynamicSupervisor.count_children(MyProj.WorkSupervisor))}")

    IO.puts("\nWorker terminates normally")
    MyProj.Worker.done(worker4)
    Process.sleep(100)
    IO.puts("#{inspect(DynamicSupervisor.count_children(MyProj.WorkSupervisor))}")

    Application.stop(:my_proj)
  end
end
defmodule MyProj.Worker do
  # file: lib/my_proj/worker.ex
  use GenServer

  def start_link(init_arg),
    do: GenServer.start_link(__MODULE__, init_arg)

  @impl true
  def init(init_arg) do
    IO.puts("#{inspect(self())}: init_arg #{inspect(init_arg)}")
    state = init_arg
    {:ok, state, {:continue, []}}
  end

  @impl true
  def handle_continue(_continue, state) do
    IO.puts("#{inspect(self())}: started")
    {:noreply, state}
  end

  @impl true
  def handle_cast(:crash, state) do
    raise "crash"
    {:noreply, state}
  end

  def handle_cast(:done, state) do
    {:stop, :normal, state}
  end

  def handle_cast(:exit, state) do
    Kernel.exit(:exit)
    {:noreply, state}
  end

  @impl true
  def terminate(reason, _state) do
    IO.puts("#{inspect(self())}: terminating with reason #{reason}")
    :ok
  end

  # server API
  def done(pid),
    do: GenServer.cast(pid, :done)

  def crash(pid),
    do: GenServer.cast(pid, :crash)

  def exit(pid),
    do: GenServer.cast(pid, :exit)
end
$ iex -S mix
Erlang/OTP 21 [erts-10.2.1] [source] [64-bit] [smp:8:8] [ds:8:8:10] [async-threads:1] [hipe] [dtrace]

Compiling 2 files (.ex)
Interactive Elixir (1.8.0) - press Ctrl+C to exit (type h() ENTER for help)
iex(1)> #PID<0.152.0>: init_arg []
#PID<0.152.0>: started
#PID<0.153.0>: init_arg []
#PID<0.153.0>: started
#PID<0.154.0>: init_arg []
#PID<0.154.0>: started
#PID<0.155.0>: init_arg []
#PID<0.155.0>: started
%{active: 4, specs: 4, supervisors: 0, workers: 4}

Worker exits (not :normal)
#PID<0.152.0>: terminating with reason exit

14:26:50.043 [error] GenServer #PID<0.152.0> terminating
** (stop) :exit
    (my_proj) lib/my_proj/worker.ex:32: MyProj.Worker.handle_cast/2
    (stdlib) gen_server.erl:637: :gen_server.try_dispatch/4
    (stdlib) gen_server.erl:711: :gen_server.handle_msg/6
    (stdlib) proc_lib.erl:249: :proc_lib.init_p_do_apply/3
Last message: {:"$gen_cast", :exit}
State: []
#PID<0.156.0>: init_arg []
#PID<0.156.0>: started
%{active: 4, specs: 4, supervisors: 0, workers: 4}

Worker raises RuntimeError

14:26:50.142 [error] GenServer #PID<0.153.0> terminating
** (Protocol.UndefinedError) protocol String.Chars not implemented for {%RuntimeError{message: "crash"}, [{MyProj.Worker, :handle_cast, 2, [file: 'lib/my_proj/worker.ex', line: 23]}, {:gen_server, :try_dispatch, 4, [file: 'gen_server.erl', line: 637]}, {:gen_server, :handle_msg, 6, [file: 'gen_server.erl', line: 711]}, {:proc_lib, :init_p_do_apply, 3, [file: 'proc_lib.erl', line: 249]}]}. This protocol is implemented for: Float, DateTime, Time, List, Version.Requirement, Atom, Integer, Version, Date, BitString, NaiveDateTime, URI
    (elixir) /private/tmp/elixir-20190114-83715-150g46n/elixir-1.8.0/lib/elixir/lib/string/chars.ex:3: String.Chars.impl_for!/1
    (elixir) /private/tmp/elixir-20190114-83715-150g46n/elixir-1.8.0/lib/elixir/lib/string/chars.ex:22: String.Chars.to_string/1
    (my_proj) lib/my_proj/worker.ex:47: MyProj.Worker.terminate/2
    (stdlib) gen_server.erl:673: :gen_server.try_terminate/3
    (stdlib) gen_server.erl:858: :gen_server.terminate/10
    (stdlib) proc_lib.erl:249: :proc_lib.init_p_do_apply/3
Last message: {:"$gen_cast", :crash}
State: []
#PID<0.157.0>: init_arg []
#PID<0.157.0>: started
%{active: 4, specs: 4, supervisors: 0, workers: 4}

Worker is killed
#PID<0.158.0>: init_arg []
#PID<0.158.0>: started
%{active: 4, specs: 4, supervisors: 0, workers: 4}

Worker terminates normally
#PID<0.155.0>: terminating with reason normal
%{active: 3, specs: 3, supervisors: 0, workers: 3}

14:26:50.442 [info]  Application my_proj exited: :stopped

defmodule MyProj.Application do
  # file: lib/my_proj/application.ex

  # created with:
  # $ mix new --sup my_proj
  use Application

  def start(_type, _args) do
    children = [
      {Task.Supervisor, [name: MyProj.WorkSupervisor]}
    ]

    opts = [strategy: :one_for_one, name: MyProj.Supervisor]
    # root supervisor
    on_start = Supervisor.start_link(children, opts)

    Task.start(&do_stuff/0)
    on_start
  end

  def do_stuff do
    Process.sleep(1000)
    tab = :ets.new(:noname_table, [:public])
    opts = [restart: :transient]

    {:ok, _pid} =
      Task.Supervisor.start_child(MyProj.WorkSupervisor, MyProj.Task, :run, [tab], opts)

    Process.sleep(1000)
    children = Task.Supervisor.children(MyProj.WorkSupervisor)
    IO.puts("children: #{inspect(children)}")
    Application.stop(:my_proj)
  end
end
defmodule MyProj.Task do
  # file: lib/my_proj/task.ex

  def run(tab) do
    value = get(tab)
    IO.puts("\n#{inspect(self())}: Got value: #{inspect(value)}")

    case value do
      :undefined ->
        exit_exit(tab)

      :crash ->
        raise_crash(tab)

      :kill ->
        self_kill(tab)

      _ ->
        exit_normally(tab)
    end
  end

  defp exit_exit(tab) do
    IO.puts("Task exits (not :normal)")
    put(tab, :crash)
    Kernel.exit(:exit)
  end

  defp raise_crash(tab) do
    IO.puts("Task raises RuntimeError")
    put(tab, :kill)
    raise "crash"
  end

  defp self_kill(tab) do
    IO.puts("Task is killed")
    put(tab, :normal)
    Task.start(run_kill())
    Process.sleep(5000)
  end

  defp exit_normally(tab) do
    IO.puts("Task terminates normally")
    delete(tab)
  end

  defp run_kill() do
    target = self()

    fn ->
      Process.sleep(200)
      IO.puts("KILL #{inspect(target)}")
      Process.exit(target, :kill)
    end
  end

  defp delete(tab),
    do: :ets.delete(tab, :current)

  defp get(tab) do
    case :ets.lookup(tab, :current) do
      [current: value] ->
        value

      _ ->
        :undefined
    end
  end

  defp put(tab, value),
    do: :ets.insert(tab, {:current, value})
end
$ iex -S mix
Erlang/OTP 21 [erts-10.2.3] [source] [64-bit] [smp:8:8] [ds:8:8:10] [async-threads:1] [hipe] [dtrace]

Generated my_proj app
Interactive Elixir (1.8.1) - press Ctrl+C to exit (type h() ENTER for help)
iex(1)> 
#PID<0.143.0>: Got value: :undefined
Task exits (not :normal)

17:43:08.406 [error] Task #PID<0.143.0> started from #PID<0.140.0> terminating
** (stop) :exit
    (my_proj) lib/my_proj/task.ex:26: MyProj.Task.exit_exit/1
    (elixir) lib/task/supervised.ex:90: Task.Supervised.invoke_mfa/2
    (stdlib) proc_lib.erl:249: :proc_lib.init_p_do_apply/3
Function: &MyProj.Task.run/1
    Args: [#Reference<0.4139279380.2160459782.161846>]
 
#PID<0.144.0>: Got value: :crash
Task raises RuntimeError

17:43:08.408 [error] Task #PID<0.144.0> started from #PID<0.140.0> terminating
** (RuntimeError) crash
    (my_proj) lib/my_proj/task.ex:32: MyProj.Task.raise_crash/1
    (elixir) lib/task/supervised.ex:90: Task.Supervised.invoke_mfa/2
    (stdlib) proc_lib.erl:249: :proc_lib.init_p_do_apply/3
Function: &MyProj.Task.run/1
    Args: [#Reference<0.4139279380.2160459782.161846>]
 
#PID<0.145.0>: Got value: :kill
Task is killed
KILL #PID<0.145.0>

#PID<0.147.0>: Got value: :normal
Task terminates normally
children: []

17:43:09.395 [info]  Application my_proj exited: :stopped
5 Likes