Elixir big comprehension-list optimisation - takes too long

Need help! I need to optimize this.
My terminal hangs when I run the code.

I have a list of 178_000 words, i.e. 178k words.

I need pairs of words, such that when joined together, their length is exactly ten.

# this is how I get those 178k words from a file
# do I need to use streams here?
# It's just a txt file with one word per line
def all_words do
    {:ok, content} = File.read("assets/dictionary.txt")
    String.split(content, "\n", trim: true)
end

# Length of an individual word must be 3 or more
# When two words are combined, their length must be exactly 10.
# example: moto + camera = motocamera
# I am sending `all_words` here.
def foo(list) do
  for a <- list, b <- list,
      String.length(a) > 2 && String.length(b) > 2 && String.length(a <> b) == 10,
      do: a <> b
end
# do I need to add :into ?

This operation is quadratic in runtime as you have stated it. You might be able to shave a bit off if you filter out the words shorter than 3 from the start. It might also help to generate a <> b and b <> a at the same time, by not using a comprehension but a recursive function that only checks the current word against the tail of the input.

If this is an exercise on comprehensions, though, prefiltering is probably your best bet.
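
For the comprehension route, the prefiltering could look roughly like this (just a sketch, untested; comparing the sum of lengths also avoids building a <> b for pairs that can't match):

def foo(list) do
  # Filter once up front, then run the comprehension over the smaller list.
  words = Enum.filter(list, &(String.length(&1) >= 3))

  for a <- words, b <- words, String.length(a) + String.length(b) == 10, do: a <> b
end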

1 Like

The reason this is taking so long is that you are traversing the list once for every element: 178,000 passes over a 178,000-element list, which is 31,684,000,000 elements to run through.

When you have that comprehension, you are basically doing the same as

Enum.flat_map(all_words, fn a ->
  Enum.map(all_words, fn b ->
    ...
  end)
end)

So for each element in the list, you are actually traversing the entire list again.

What you may be able to do is reduce the list into a map, with the word length as the key and a list of the words of that length as the value. You might end up with something like

%{
  3 => ["and", "bat"],
  5 => ["apple"]
}

Then you could just combine the buckets whose lengths add up to 10. This should shrink your runtime considerably. There are probably better ways to do this, though.

3 Likes

Can you please elaborate a bit more in terms of code?
I tried the same code with a list of 1k words… it worked.
But with the large list, the terminal just hangs forever.

Looks helpful!

I was thinking of a map… your suggestion of using the string length as the map key sounds useful.
I will give this a try!

It’s not forever, just a very long time.

If it takes a second for a thousand-element list, it will take 4 seconds for a list of two thousand elements, 16 for 4k, and so on.
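
To put rough numbers on that growth (a back-of-the-envelope extrapolation, assuming roughly one second per million pairs checked):

Enum.each([1_000, 2_000, 4_000, 178_000], fn n ->
  seconds = n * n / 1_000_000
  IO.puts("#{n} words ≈ #{seconds} s")
end)
# 178_000 words ≈ 31_684 s, which is close to nine hours at that rate.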

And I will provide some code when I’m at my computer.

1 Like

This is how I mean it:

defmodule Glue do
  @list ~w[moto camera foo giraffe stupid thing word brat wurst]

  # Prefilter once: words shorter than 3 characters can never take part.
  def pair(list \\ @list), do: list |> Enum.filter(&(String.length(&1) >= 3)) |> pair([])

  def pair([], acc), do: acc |> Enum.into([])
  def pair(list = [word | tail], acc) do
    # Length the partner word needs to have.
    len = 10 - String.length(word)

    acc =
      list
      |> Enum.filter(&(String.length(&1) === len))
      |> Enum.flat_map(fn
        # When the partner is the word itself, emit just one combination.
        other when other === word -> [other <> other]
        # Otherwise emit both orderings at once.
        other -> [word <> other, other <> word]
      end)
      |> Stream.concat(acc)

    pair(tail, acc)
  end
end

IO.inspect(Glue.pair())

This is just a draft though, and still might have some room for optimisation, but it only compares each word against itself and the words that come after it, so it should do roughly half the work of the full n² comprehension, and it skips building strings for pairs whose lengths don't match.
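
One more possible tweak (untested): since both words must be at least 3 characters long and the pair exactly 10, only lengths 3 to 7 can ever take part, so the head clause could throw away the long words as well:

def pair(list \\ @list) do
  # Words longer than 7 characters can never pair with another word of length >= 3.
  list
  |> Enum.filter(&(String.length(&1) in 3..7))
  |> pair([])
end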

2 Likes

Here is how I generated 178_000 random “words” (just ~r/[a-zA-Z]{1,11}/) with random lengths from 1 to 11:

file = File.open!("words.txt", [:append])
list = Enum.to_list(?a..?z) ++ Enum.to_list(?A..?Z)

1..178_000 |> Flow.from_enumerable() |> Flow.each(fn _ ->
  length = Enum.random(1..11)
  data = Enum.map(1..length, fn _ -> Enum.random(list) end)
  IO.binwrite(file, data ++ '\n')
end) |> Flow.run()

File.close(file)

From this random input I got 35_349_451 combinations in 209 seconds on average.

Note: this time is inflated, as I was running my “standard” PC workload at the same time (web browser, code editor and video player). :slight_smile:

Here is my example code:

defmodule Example do
  require Integer

  @target_length 10

  def sample(path) do
    path
    |> File.stream!()
    |> Flow.from_enumerable()
    |> Flow.reduce(fn -> %{} end, &group/2)
    |> Enum.into(%{})
    |> do_sample()
  end

  defp group(line, acc) do
    word = String.trim(line)
    length = String.length(word)
    if length > @target_length, do: acc, else: Map.update(acc, length, [word], &[word | &1])
  end

  defp do_sample(acc) do
    target_words = Map.get(acc, @target_length) || []
    lengths = Map.keys(acc) -- [5, 10]
    keys = for a <- lengths, b = Enum.find(lengths, &(a + &1 == 10)), not is_nil(b), do: {a, b}
    keys_flow = keys |> Flow.from_enumerable() |> Flow.flat_map(&combine(&1, acc))
    target_words ++ combine_half(acc) ++ Enum.to_list(keys_flow)
  end

  defp combine_half(acc) when Integer.is_even(@target_length) do
    half_length = round(@target_length / 2)
    list = Map.get(acc, half_length) || []
    combine(list)
  end

  defp combine_half(_acc), do: []

  defp combine([]), do: []

  defp combine([head | tail]) do
    tail_flow = tail |> Flow.from_enumerable() |> Flow.map(&(head <> &1))
    Enum.to_list(tail_flow) ++ combine(tail)
  end

  defp combine({a, b}, acc) do
    flow1 = acc |> Map.get(a) |> Flow.from_enumerable()
    flow2 = acc |> Map.get(b) |> Flow.from_enumerable()
    flow1 |> Flow.flat_map(&do_combine(&1, flow2)) |> Enum.to_list()
  end

  defp do_combine(word, flow), do: flow |> Flow.map(&(word <> &1)) |> Enum.to_list()
end

Example.sample("words.txt")

I got a “previous clause always matches” warning, but it’s only because the @target_length module attribute is set at compile time. Feel free to modify it as you wish. I’m sure it’s possible to write better code - this is just my typical “5 min” example, as I don’t have much time right now.

Let me know what you think about it. :smiley:

2 Likes

Thank you both for the comments.

After reading all the comments, this is what I came up with:

– Iterate the words from the dictionary
– Enum.group_by them by String.length
– Next, using a comprehension, join them where the combined length == 10

– In my case I had to match the word against the number, so:
– – if the word matches the beginning of the number, the key is its length
– – else if the word matches the end, the key is -(10 - length)

The actual problem statement, for me, is: create word pairs for a given input phone number using dictionary words.
Here is some of the code.

defmodule Numbero do

  def process(input_number) do
    # words = ~w(motor truck star onto nouns struck baz motortruck foo bar)

    #words from dictionary file
    combinations =
      all_words()
      |> Stream.filter(&(String.length(&1) > 2))
      |> Enum.group_by(&number_combinations(&1, input_number))
      |> Map.delete("nomatch")

    word_pairs_from_list(combinations)
    |> List.flatten()
    |> Enum.chunk_every(2)
    |> Enum.concat(combinations[10] || [])
    # First add the paired combinations, then the words that cover all 10 digits on their own.

  end

  def number_combinations(dictionary_word, input_number) do
    word_no = word_to_no(dictionary_word)

    numb = to_string(input_number)
    cond do
      # word matches the start of the number: the key is the length of its digit string
      String.starts_with?(numb, word_no) -> String.length(word_no)
      # word matches the end: the key is -(10 - length), so a prefix keyed k pairs with a suffix keyed -k
      String.ends_with?(numb, word_no) -> -(10 - String.length(word_no))
      true -> "nomatch"
    end
  end

  defp word_to_no(word) do
    word
    |> String.upcase()
    |> String.codepoints()
    |> Enum.map_join(&letter_to_number/1)
  end

  defp letter_to_number(letter) do
    input_in_string = fn str -> String.contains?(str, letter) end

    cond do
      input_in_string.("ABC") -> 2
      input_in_string.("DEF") -> 3
      input_in_string.("GHI") -> 4
      input_in_string.("JKL") -> 5
      input_in_string.("MNO") -> 6
      input_in_string.("PQRS") -> 7
      input_in_string.("TUV") -> 8
      input_in_string.("WXYZ") -> 9
      true -> 0
    end

  end

end
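
For illustration (a hypothetical example, assuming a dictionary of just motor, truck and motortruck and the input number 6686787825), the group_by step would produce something like:

# "motor"      -> "66867"      -> matches the start -> key 5
# "truck"      -> "87825"      -> matches the end   -> key -(10 - 5) = -5
# "motortruck" -> "6686787825" -> whole number      -> key 10
%{
  5 => ["motor"],
  -5 => ["truck"],
  10 => ["motortruck"]
}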

The complete problem solution is here.

I know there is room for performance optimisation.
Right now it takes around 10 seconds to complete execution.

Ah, now I see your use case better.

Note that you could generate all the cases for 10-digit numbers at compile time (saving the results to the priv directory or to a database like :mnesia), and then it would be really easy and quick to fetch them.
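
A minimal sketch of that idea (my own, untested; it assumes the dictionary file from the first post is available at compile time, and it indexes the dictionary by its digit string rather than enumerating every possible 10-digit number):

defmodule PrecomputedIndex do
  @dictionary_path "assets/dictionary.txt"
  # Recompile the module when the dictionary changes.
  @external_resource @dictionary_path

  # Built once at compile time: maps a digit string to the words that produce it,
  # e.g. "66867" => ["motor", ...].
  @index @dictionary_path
         |> File.read!()
         |> String.split("\n", trim: true)
         |> Enum.filter(&(String.length(&1) in 3..10))
         |> Enum.group_by(fn word ->
           word
           |> String.upcase()
           |> String.to_charlist()
           |> Enum.map_join(fn
             c when c in ?A..?C -> "2"
             c when c in ?D..?F -> "3"
             c when c in ?G..?I -> "4"
             c when c in ?J..?L -> "5"
             c when c in ?M..?O -> "6"
             c when c in ?P..?S -> "7"
             c when c in ?T..?V -> "8"
             c when c in ?W..?Z -> "9"
             _ -> "0"
           end)
         end)

  def words_for(digits), do: Map.get(@index, digits, [])

  # Word pairs covering a 10-digit number: split it at positions 3..7
  # and look both halves up in the precomputed index.
  def pairs_for(number) when byte_size(number) == 10 do
    for split <- 3..7,
        {prefix, suffix} = String.split_at(number, split),
        a <- words_for(prefix),
        b <- words_for(suffix),
        do: {a, b}
  end
end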