Regex.replace with an accumulating value

Hey ElixirForum… working with HTML + regex right now, ran into some trouble that I’d appreciate some outside eyes on.

Here’s the code as it stands:

@doc """
Returns the sample syntax-highlighted HTML with its span text wrapped character by character.

Every run of text that sits between a `>` and a closing `</span>` (and contains neither `&`
nor `<`) is rewritten so that each character gets its own `<span id='CHAR-n'>` element. The
counter `n` keeps increasing across the whole document, so every id is unique. Text that is
not directly followed by `</span>` (for example the bare ` React`) is left untouched.
"""
@spec code() :: String.t()
def code do
    html = ~S"""
    <span class="token keyword">import</span> React<span class="token punctuation">,</span> <span class="token punctuation">{</span> useState <span class="token punctuation">}</span> <span class="token keyword">from</span> <span class="token string">'react'</span><span class="token punctuation">;</span>

    <span class="token keyword">function</span> <span class="token function">Example</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token punctuation">{</span>
    <span class="token comment">// Declare a new state variable, which we'll call "count"</span>
    <span class="token keyword">const</span> <span class="token punctuation">[</span>count<span class="token punctuation">,</span> setCount<span class="token punctuation">]</span> <span class="token operator">=</span> <span class="token function">useState</span><span class="token punctuation">(</span><span class="token number">0</span><span class="token punctuation">)</span><span class="token punctuation">;</span>

    <span class="token keyword">return</span> <span class="token punctuation">(</span>
        <span class="token tag"><span class="token tag"><span class="token punctuation">&lt;</span>div</span><span class="token punctuation">></span></span><span class="token plain-text">
        </span><span class="token tag"><span class="token tag"><span class="token punctuation">&lt;</span>p</span><span class="token punctuation">></span></span><span class="token plain-text">You clicked </span><span class="token punctuation">{</span>count<span class="token punctuation">}</span><span class="token plain-text"> times</span><span class="token tag"><span class="token tag"><span class="token punctuation">&lt;/</span>p</span><span class="token punctuation">></span></span><span class="token plain-text">
    </span><span class="token tag"><span class="token tag"><span class="token punctuation">&lt;</span>button</span> <span class="token attr-name">onClick</span><span class="token script language-javascript"><span class="token script-punctuation punctuation">=</span><span class="token punctuation">{</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token operator">=></span> <span class="token function">setCount</span><span class="token punctuation">(</span>count <span class="token operator">+</span> <span class="token number">1</span><span class="token punctuation">)</span><span class="token punctuation">}</span></span><span class="token punctuation">></span></span><span class="token plain-text">
    Click me
    </span><span class="token tag"><span class="token tag"><span class="token punctuation">&lt;/</span>button</span><span class="token punctuation">></span></span><span class="token plain-text">
    </span><span class="token tag"><span class="token tag"><span class="token punctuation">&lt;/</span>div</span><span class="token punctuation">></span></span>
    <span class="token punctuation">)</span><span class="token punctuation">;</span>
    <span class="token punctuation">}</span>

    """

    pattern = ~r/>([^&<]+)<\/span>/

    # Regex.replace/3 cannot carry state from one match to the next, so the document is split
    # into unmatched and matched pieces (`include_captures: true` keeps the matches in the
    # list) and walked with a running character offset instead.
    pattern
    |> Regex.split(html, include_captures: true)
    |> Enum.map_reduce(0, fn piece, offset ->
      case Regex.run(pattern, piece, capture: :all_but_first) do
        [text] ->
          chars = String.graphemes(text)

          spans =
            chars
            |> Enum.with_index(offset)
            |> Enum.map(fn {char, index} -> "<span id='CHAR-#{index}'>#{char}</span>" end)

          # The match consumed the closing tag, so it has to be emitted again here.
          {[">", spans, "</span>"], offset + length(chars)}

        nil ->
          # Text between matches is passed through unchanged.
          {piece, offset}
      end
    end)
    |> elem(0)
    |> IO.iodata_to_binary()
  end

  @doc """
  Wraps every character (grapheme) of `x` in its own `<span id='CHAR-n'>` element.

  `offset` is the number given to the first character and defaults to `0`; pass the running
  total of characters already wrapped to keep ids unique across several calls. An empty
  string yields an empty string.

  For example, `wrap("ab")` returns `<span id='CHAR-0'>a</span><span id='CHAR-1'>b</span>`.
  """
  def wrap(x, offset \\ 0) do
    x
    # String.split(x, "") would add a leading and a trailing "" and therefore two empty spans;
    # String.graphemes/1 returns exactly one element per character.
    |> String.graphemes()
    |> Enum.with_index(offset)
    |> Enum.map_join("", fn {char, index} -> "<span id='CHAR-#{index}'>#{char}</span>" end)
  end

The goal is to wrap every character inside HTML tags so it looks like this: <span id='char-#{CHAR_INDEX}'>CHAR_HERE</span>. The end result should hopefully look something like this: <span class="token keyword"><span id="char-0"></span><span id="char-1">i</span><span id="char-2">m</span><span id="char-3">p</span><span id="char-4">o</span><span id="char-5">r</span><span id="char-6">t</span><span id="char-7"></span>...

Right now, my wrap function gets called many times for this HTML, so my CHAR-#{index} id repeats itself quite often. What’s the recommended pattern here to combine an accumulator with a Regex replace operation?

You can use Regex.scan instead of Regex.replace. Probably not so fast, but it works :slight_smile:

# Walks every `...>text</span>` match and threads a running character count (`acc`) through
# Enum.map_reduce/3 so that each CHAR-n id is unique across the whole document.
# Note: Regex.scan/3 returns only the matches, so any text that follows the last `</span>`
# is not part of the printed output.
Regex.scan(~r/(.*?>)([^&<]+)(<\/span>)/s, html, capture: :all_but_first)
|> Enum.map_reduce(0, fn [l, c, r], acc ->
  c =
    c
    # String.split/1 would split on whitespace (giving words and dropping the whitespace);
    # String.graphemes/1 gives one element per character, which is what the ids number.
    |> String.graphemes()
    |> Enum.with_index()
    |> Enum.map(fn {char, index} -> "<span id='CHAR-#{index + acc}'>#{char}</span>" end)

  # `c` is now the list of wrapped characters, so its length is the number of ids used.
  {[l, c, r], acc + length(c)}
end)
|> elem(0)
|> IO.puts()
2 Likes

Another solution would be to use an HTML parser, like Floki…

iex> Floki.parse html
[
  {"span", [{"class", "token keyword"}], ["import"]},
  " React",
  {"span", [{"class", "token punctuation"}], [","]},
  {"span", [{"class", "token punctuation"}], ["{"]},
  " useState ",
  {"span", [{"class", "token punctuation"}], ["}"]},
  {"span", [{"class", "token keyword"}], ["from"]},
  {"span", [{"class", "token string"}], ["'react'"]},
  {"span", [{"class", "token punctuation"}], [";"]},
  {"span", [{"class", "token keyword"}], ["function"]},
  {"span", [{"class", "token function"}], ["Example"]},
  {"span", [{"class", "token punctuation"}], ["("]},
  {"span", [{"class", "token punctuation"}], [")"]},
  {"span", [{"class", "token punctuation"}], ["{"]},
  {"span", [{"class", "token comment"}],
   ["// Declare a new state variable, which we'll call \"count\""]},
  {"span", [{"class", "token keyword"}], ["const"]},
  {"span", [{"class", "token punctuation"}], ["["]},
  "count",
  {"span", [{"class", "token punctuation"}], [","]},
  " setCount",
  {"span", [{"class", "token punctuation"}], ["]"]},
  {"span", [{"class", "token operator"}], ["="]},
  {"span", [{"class", "token function"}], ["useState"]},
  {"span", [{"class", "token punctuation"}], ["("]},
  {"span", [{"class", "token number"}], ["0"]},
  {"span", [{"class", "token punctuation"}], [")"]},
  {"span", [{"class", "token punctuation"}], [";"]},
  {"span", [{"class", "token keyword"}], ["return"]},
  {"span", [{"class", "token punctuation"}], ["("]},
  {"span", [{"class", "token tag"}],
   [
     {"span", [{"class", "token tag"}],
      [{"span", [{"class", "token punctuation"}], ["<"]}, "div"]},
     {"span", [{"class", "token punctuation"}], [">"]}
   ]},
  {"span", [{"class", "token plain-text"}], []},
  {"span", [{"class", "token tag"}],
   [
     {"span", [{"class", "token tag"}],
      [{"span", [{"class", "token punctuation"}], ["<"]}, "p"]},
     {"span", [{"class", "token punctuation"}], [">"]}
   ]},
  {"span", [{"class", "token plain-text"}], ["You clicked "]},
  {"span", [{"class", "token punctuation"}], ["{"]},
  "count",
  {"span", [{"class", "token punctuation"}], ["}"]},
  {"span", [{"class", "token plain-text"}], [" times"]},
  {"span", [{"class", "token tag"}],
   [
     {"span", [{"class", "token tag"}],
      [{"span", [{"class", ...}], ["</"]}, "p"]},
     {"span", [{"class", "token punctuation"}], [">"]}
   ]},
  {"span", [{"class", "token plain-text"}], []},
  {"span", [{"class", "token tag"}],
   [
     {"span", [{"class", "token tag"}], [{"span", [...], ...}, "button"]},
     {"span", [{"class", "token attr-name"}], ["onClick"]},
     {"span", [{"class", ...}], [{...}, ...]},
     {"span", [{...}], [...]}
   ]},
  {"span", [{"class", "token plain-text"}], ["\nClick me\n"]},
  {"span", [{"class", "token tag"}],
   [{"span", [{"class", ...}], [{...}, ...]}, {"span", [{...}], [...]}]},
  {"span", [{"class", "token plain-text"}], []},
  {"span", [{"class", "token tag"}], [{"span", [...], ...}, {"span", ...}]},
  {"span", [{"class", "token punctuation"}], [")"]},
  {"span", [{"class", ...}], [";"]},
  {"span", [{...}], [...]}
]

It is probably easier to transform this list and then convert it back to HTML than to use regex. See this post :slight_smile:

https://blog.codinghorror.com/parsing-html-the-cthulhu-way/

6 Likes

One recursive example…

It is just a basic one that only modifies spans with the class token keyword… You can adapt it to whichever spans you wish to transform.

defmodule Demo.Recurs do
  @moduledoc """
  Wraps each character of `token keyword` spans in its own `<span id="CHAR-n">` element,
  numbering the characters consecutively across the document.

  Only nodes of the parsed top-level list are inspected. A node is rewritten when it is a
  `span` whose attribute list is exactly `class="token keyword"` and whose only child is a
  text node; every other node is passed through unchanged.
  """

  @doc """
  Parses `html` with Floki, rewrites the matching spans and renders the tree back to HTML.
  """
  @spec parse(String.t()) :: String.t()
  def parse(html) do
    html
    # NOTE(review): newer Floki releases deprecate `Floki.parse/1` in favour of
    # `Floki.parse_fragment/1` / `Floki.parse_document/1` - confirm against the version in use.
    |> Floki.parse()
    |> do_process([], 0)
    |> Floki.raw_html()
  end

  # Walks the node list, accumulating the (reversed) output and the next free character index.
  defp do_process([], acc, _index), do: Enum.reverse(acc)

  # The guard keeps spans whose single child is a nested element (not text) out of this
  # clause, so they fall through to the pass-through clause instead of crashing.
  defp do_process([{"span", [{"class", "token keyword"}] = attrs, [string]} | tail], acc, index)
       when is_binary(string) do
    {elems, next_index} = do_process_string(string, index)
    do_process(tail, [{"span", attrs, elems} | acc], next_index)
  end

  defp do_process([elem | tail], acc, index), do: do_process(tail, [elem | acc], index)

  # Builds one span per grapheme and returns `{spans, next_index}`. Returning the next index
  # from the same traversal keeps the id counter in step with the number of spans emitted.
  defp do_process_string(string, index) do
    string
    |> String.graphemes()
    |> Enum.map_reduce(index, fn grapheme, i ->
      {{"span", [{"id", "CHAR-#{i}"}], [grapheme]}, i + 1}
    end)
  end
end

# then

Demo.Recurs.parse(html)

The do_process returns something like…

[
  {"span", [{"class", "token keyword"}],
   [
     {"span", [{"id", "CHAR-0"}], ["i"]},
     {"span", [{"id", "CHAR-1"}], ["m"]},
     {"span", [{"id", "CHAR-2"}], ["p"]},
     {"span", [{"id", "CHAR-3"}], ["o"]},
     {"span", [{"id", "CHAR-4"}], ["r"]},
     {"span", [{"id", "CHAR-5"}], ["t"]}
   ]},
  " React",
  {"span", [{"class", "token punctuation"}], [","]},
  {"span", [{"class", "token punctuation"}], ["{"]},
  " useState ",
  {"span", [{"class", "token punctuation"}], ["}"]},
  {"span", [{"class", "token keyword"}],
   [
     {"span", [{"id", "CHAR-6"}], ["f"]},
     {"span", [{"id", "CHAR-7"}], ["r"]},
     {"span", [{"id", "CHAR-8"}], ["o"]},
     {"span", [{"id", "CHAR-9"}], ["m"]}
   ]},
...
]
6 Likes

Thx @kokolegorille @fuelen for the new techniques!

Edit: the <<c::utf8, rest::binary>> is super clever, just starting to grok that right now