Correctness/safety question with DynamicSupervisor

James_E · July 10, 2025, 5:09pm

I have an app that has many users connecting to it.

When each user tries to connect to a room, they should be connected to a “MyApp.Room” GenServer, which is started if a room with the requested ID does not yet exist.

All of the “MyApp.Room” instances are managed by one app-wide DynamicSupervisor.

Unfortunately, DynamicSupervisor just records all children in a PID-keyed map with no method to retrieve a particular child in any way except by PID, so I wrote a function like this to try to implement that “started if…does not yet exist” logic:

  @doc """
  Exactly like [DynamicSupervisor.start_child/2](https://hexdocs.pm/elixir/1.19/DynamicSupervisor.html#start_child/2)
  except that the "id" field is not [disregarded](https://hexdocs.pm/elixir/1.19.0-rc.0/DynamicSupervisor.html#start_child/2:~:text=while%20the%20:id%20field%20is%20still%20required,the%20value%20is%20ignored),
  but instead used as a unique key.

  Only works with simple GenServer supervisees for now.

  Does NOT work with remote DynamicSupervor or distributed systems for now.

  ## Example

      iex> defmodule FooApp.Model do use GenServer; def init(_opts), do: {:ok, nil} end
      iex> children = [{DynamicSupervisor, name: FooApp.ModelInstances}]
      iex> opts = [strategy: :one_for_one, name: FooApp.Supervisor]
      iex> {:ok, _} = Supervisor.start_link(children, opts)
      iex> {:ok, child1} = get_or_start_child(
      ...>   FooApp.ModelInstances,
      ...>   %{id: "one", start: {FooApp.Model, :start_link, [[]]}})
      iex> {:ok, child2} = get_or_start_child(
      ...>   FooApp.ModelInstances,
      ...>   %{id: "two", start: {FooApp.Model, :start_link, [[]]}})
      iex> get_or_start_child(
      ...>   FooApp.ModelInstances,
      ...>   %{id: "one", start: {FooApp.Model, :start_link, [[]]}})
      {:ok, child1} # retrieved the existing child1
  """
  @spec get_or_start_child(
          dynamic_supervisor :: Supervisor.supervisor(),
          child_spec :: Supervisor.module_spec() | Supervisor.child_spec()
        ) :: DynamicSupervisor.on_start_child()
  def get_or_start_child(dynamic_supervisor, module) when is_atom(module) do
    # https://github.com/elixir-lang/elixir/blob/v1.19.0-rc.0/lib/elixir/lib/gen_server.ex#L1036
    get_or_start_child(dynamic_supervisor, module.child_spec([]))
  end
  def get_or_start_child(dynamic_supervisor, {module, arg}) when is_atom(module) do
    get_or_start_child(dynamic_supervisor, module.child_spec(arg))
  end
  def get_or_start_child(dynamic_supervisor, child_spec = %{id: id, start: {_, _, _}}) do
    with supervisor_pid when is_pid(supervisor_pid) <- GenServer.whereis(dynamic_supervisor) do
      # https://hexdocs.pm/elixir/1.19/GenServer.html#module-name-registration
      # FIXME: https://hexdocs.pm/elixir/1.19/Registry.html#module-using-in-via
      reg_name = {:global, {supervisor_pid, id}}

      # https://github.com/elixir-lang/elixir/blob/v1.19.0-rc.0/lib/elixir/lib/dynamic_supervisor.ex#L407-L490
      child_spec =
        Map.update!(child_spec, :start, fn
          {m = GenServer, f = :start_link, [m1, init_arg]} ->
            {m, f, [m1, init_arg, [name: reg_name]]}

          {m = GenServer, f = :start_link, [m1, init_arg, opts]} ->
            {m, f, [m1, init_arg, opts ++ [name: reg_name]]}

          {m, f = :start_link, [init_arg, opts]} when is_list(opts) ->
            #true = GenServer in Keyword.get(m.__info__(:attributes), :behaviour, [])
            {m, f, [init_arg, opts ++ [name: reg_name]]}

          {m, f = :start_link, [opts]} when is_list(opts) ->
            #true = GenServer in Keyword.get(m.__info__(:attributes), :behaviour, [])
            {m, f, [opts ++ [name: reg_name]]}

          _start ->
            raise ArgumentError, """
            only GenServer style start_link children supported at this time.
            \tchild_spec = #{inspect(child_spec)}\
            """
        end)

      # https://github.com/erlang/otp/blob/84adefa331c4159d432d22840663c38f155cd4c1/lib/stdlib/src/gen.erl#L71
      # https://github.com/elixir-lang/elixir/blob/v1.19.0-rc.0/lib/elixir/lib/gen_server.ex#L1050
      case DynamicSupervisor.start_child(supervisor_pid, child_spec) do
        {:error, {:already_started, pid}} -> {:ok, pid}
        result -> result
      end
    else
      {name, node} = server when is_atom(name) and is_atom(node) ->
        # https://github.com/elixir-lang/elixir/blob/v1.19.0-rc.0/lib/elixir/lib/gen_server.ex#L1324-L1326
        raise ArgumentError,
          message: """
          Only local supervisors supported at this time.
          \tserver = #{inspect(server)}\
          """

      nil ->
        # https://github.com/elixir-lang/elixir/blob/v1.19.0-rc.0/lib/elixir/lib/gen_server.ex#L1309
        {:error, :noproc}
    end
  end

However, this seems fragile; I’m wondering in particular whether, as written, it contains any race conditions, or if it’d be safe to call the current version of get_or_start_child blindly from, for example, a Phoenix LiveView mount callback.

If this approach is subject to a race condition, is there any correct way to accomplish this instead?

windexoriginal · July 10, 2025, 5:29pm

I think you should look at the registry module. It will allow you to give arbitrary identifiers to the processes you spawn dynamically.

Registry is implemented with ets which should help with concerns about race conditions.

LostKobrakai · July 10, 2025, 5:30pm

You want to decouple the supervision tree concerns from the key management and uniqueness. For the former continue to use DynamicSupervisor. For the latter you can use Registry if you only need to enforce uniqueness on a per node level.

garrison · July 10, 2025, 5:34pm

Global will synchronously ping every other node to register the name which is probably not something you want in a user path. I don’t really get the impression that it’s meant for a large number of processes, but someone should correct me if I’m wrong.

Consider using Elixir’s Registry as mentioned above or writing your own routing layer. You probably also want to maintain limits and rate limit users creating rooms.

James_E · July 10, 2025, 5:38pm

Right now, I’m definitely only doing single-node servers (hence my method just raising whenever it’s called against a remote supervisor); but I am interested in building this in a way that will not cause a giant headache if the code is ever upgraded to support multi-node servers.

The users definitely do not care about any backend node partitioning; the room IDs should be a genuinely global identifier. I absolutely must never put the users in a position where 2 different users who attempted to join the same room end up attached to different “MyApp.Room” instances. So does that mean Registry would not work for this?

garrison · July 10, 2025, 5:48pm

The Elixir Registry is local, yes. I don’t think it would be very hard to migrate off of, though.

You can write your own routing table/registry fairly easily with a GenServer and ets as primitives. Then you can use :global to ensure your routing table is registered on one node and send all requests to that node.

That will get you pretty far. If you need to scale further you can have multiple processes read from the ets table, and if you need to scale out much further you can shard your rooms across nodes and route locally on each node with a node id embedded in the room id or similar. Probably you don’t even need to think about doing this right now.

LostKobrakai · July 10, 2025, 6:36pm

As already mentioned the Registry modules works per node. But if you build your system using via tuples it should be not much work to switch to a different registry later. Generally you want to use a registry – a module allowing you to register a process/pid for a given key – no matter which specific registry you land on in the end. OTP comes with :global, elixir has Registry and there are third party ones each with different sets of tradeoffs.

josevalim · July 10, 2025, 6:45pm

We have just rewritten the Mix & OTP guides in Elixir to build something similar, you can read it in this PR. We also built something that relied on similar techniques here: Homemade analytics with Ecto and Elixir - Dashbit Blog

James_E · July 15, 2025, 4:29pm

I don’t know how long it will be until that PR makes it into hexdocs.pm, but for now it can be read at https://github.com/elixir-lang/elixir/blob/ce64fc364d368705759d0982b72a4ba98572caa6/lib/elixir/pages/mix-and-otp/agents.md#preview.

Does this seem to be a correct implementation of those ideas to implement a DynamicSupervisor with unique children, or is this still going to fail in edge cases?

defmodule MyApp.Util.DynamicSupervisor2 do
  # https://elixirforum.com/t/correctness-safety-question-with-dynamicsupervisor/71652/9
  use Agent

  @type supervisor :: Agent.agent
  @type on_terminate :: :ok | {:error, :not_found}

  def count_children(supervisor) do
    Agent.get(supervisor, fn %{supervisor: supervisor} ->
      DynamicSupervisor.count_children(supervisor)
    end)
  end

  @spec get_or_start_child(
    supervisor(), Supervisor.module_spec() | Supervisor.child_spec()
  ) :: DynamicSupervisor.on_start_child()
  def get_or_start_child(supervisor, child_spec)

  def get_or_start_child(supervisor, child_spec = %{id: id, start: {_, _, _}}) do
    Agent.get(supervisor, fn %{
      supervisor: supervisor,
      registry: registry,
      options: opts
    } ->
      case Registry.lookup(registry, id) do
        [{_, pid}] ->
          {:ok, pid}

        [] ->
          case DynamicSupervisor.start_child(supervisor, child_spec) do
            {:ok, pid} ->
              {:ok, _} = Registry.register(registry, id, pid)
              {:ok, pid}

            :ignore ->
              :ignore

            {:error, {:already_started, pid}} = result ->
              if Keyword.get(opts, :allow_foreign_child, false) do
                {:ok, _} = Registry.register(registry, id, pid)
                {:ok, pid}
              else
                result
              end

            {:error, _} = result ->
              result
          end
      end
    end)
  end

  def get_or_start_child(supervisor, module) when is_atom(module) do
    # https://github.com/elixir-lang/elixir/blob/v1.19.0-rc.0/lib/elixir/lib/gen_server.ex#L1036
    get_or_start_child(supervisor, module.child_spec([]))
  end

  def get_or_start_child(supervisor, {module, arg}) when is_atom(module) do
    get_or_start_child(supervisor, module.child_spec(arg))
  end

  def start_link(opts) do
    allow_foreign = Keyword.get(opts, :allow_foreign, false)

    {name, opts} = Keyword.pop(opts, :name, __MODULE__)
    registry_name = Module.concat(name, "registry")

    agent_opts = [name: name]
    supervisor_opts = [strategy: :one_for_one]
    registry_opts = [keys: :unique, name: registry_name]

    Agent.start_link(
      fn ->
        supervisor = case DynamicSupervisor.start_link(supervisor_opts) do
          {:ok, pid} ->
            pid

          {:error, {:already_started, pid}} when allow_foreign ->
            pid
        end

        case Registry.start_link(registry_opts) do
          {:ok, pid} ->
            pid

          {:error, {:already_started, pid}} when allow_foreign ->
            pid
        end

        %{
          supervisor: supervisor,
          registry: registry_name,
          options: opts
        }
      end,
      agent_opts
    )
  end

  @spec terminate_child(supervisor(), atom()) :: on_terminate()
  def terminate_child(supervisor, id) when is_atom(id) do
    Agent.get(supervisor, fn %{
      supervisor: supervisor,
      registry: registry
    } ->
      case Registry.lookup(registry, id) do
        [{_, pid}] ->
          :ok = DynamicSupervisor.terminate_child(supervisor, pid)
          :ok = Registry.unregister(registry, id)
          :ok

        [] ->
          {:error, :not_found}
      end
    end)
  end
end

josevalim · July 15, 2025, 4:49pm

The updated docs are here: Simple state with agents — Elixir v1.20.0-dev

We generally treat naming processes and supervising processes as separate things. You don’t need to wrap the registry and supervisor access from within the agent. The architecture built in the getting started guide should be enough and provide uniqueness (either locally or in a cluster).