How to group by data format

I have data following structure with csv file.

| id | name | lang |
| 1 | Bob  | Elixir |
| 1 | Bob  | Java |
| 1 | Bob  | Ruby |
| 2 | Tom  | Elixir |
| 2 | Tom  | C# |
| 3 | Dave | Python |
| 3 | Dave | Java |
| 3 | Dave | Ruby |
| 4 | Jane | Python |

I would like to group by name and make vertical lang data to horizontal like below.

| name | lang1 | lang2 | lang3 |
| Bob | Elixir | Java| Ruby |
| Tom | Elixir | C# |  |  
| Dave | Python | Java | Ruby |
| Jane | Python |  |  |

How can I make it by best way?

Parse the csv using e.g. nimble_csv, group with Enum.group_by + Enum.map to make a single row for the grouped values and encode to csv again.

2 Likes

I might not know how to use Enum.group_by in this case.

What have you tried so far?

I do assume you get your CSV parsed into a list of maps, where each maps keys is the header of the column. So Enum.group_by(the_list, fn map -> map["id"] end) will give you a map from ID to objects. You can safely Enum.map over these.

iex(1)> [%{"id" => 1, "language" => "Elixir"}, %{"id" => 2, "language" => "erlang"}, %{"id" => 1, "language" => "Gleam"}] |> Enum.group_by(&(&1["id"]))
%{
  1 => [
    %{"id" => 1, "language" => "Elixir"},
    %{"id" => 1, "language" => "Gleam"}
  ],
  2 => [%{"id" => 2, "language" => "erlang"}]
}

I have following error. I have no idea how to use group_by

  def convert do
    headers = [:id, :name, :lang]
    "data.tsv"
      |> File.stream!
      |> CSV.decode(separator: ?\t, headers: headers)
      |> Enum.group_by(fn map -> map["id"] end)
      |> Enum.take(10)
  end
** (FunctionClauseError) no function clause matching in Access.get/3    
    The following arguments were given to Access.get/3:
        # 1
        {:ok, %{id: "4", lang: "Python", name: "Jane"}}
        # 2
        "id"
        # 3
        nil
    Attempted function clauses (showing 5 out of 5):
        def get(%module{} = container, key, default)
        def get(map, key, default) when is_map(map)
        def get(list, key, default) when is_list(list) and is_atom(key)
        def get(list, key, _default) when is_list(list)
        def get(nil, _key, default)
    (elixir) lib/access.ex:320: Access.get/3
    (elixir) lib/enum.ex:1115: anonymous fn/4 in Enum.group_by/3
    (elixir) lib/enum.ex:1940: Enum."-group_by/3-lists^foldl/2-0-"/3
    (format_tsv) lib/format_tsv.ex:7: FormatTsv.convert/0

CSV.decode returns a list of {:ok, data} tuples, so you need adjust your code to extract the data out of those tuples.

1 Like

I still lost. How can I transform following structure to horizontal?
I’ve tried many patterns with Enum.map. However I can’t resolve.

%{
  "1" => [
    %{id: "1", lang: "Elixir", name: "Bob"},
    %{id: "1", lang: "Java", name: "Bob"},
    %{id: "1", lang: "Ruby", name: "Bob"}
  ],
  "2" => [
    %{id: "2", lang: "Elixir", name: "Tom"},
    %{id: "2", lang: "C#", name: "Tom"}
  ],
  "3" => [
    %{id: "3", lang: "Python", name: "Dave"},
    %{id: "3", lang: "Java", name: "Dave"},
    %{id: "3", lang: "Ruby", name: "Dave"}
  ],
  "4" => [%{id: "4", lang: "Python", name: "Jane"}]
}

to

[
%{id: 1, name: "Bob", lang1: "Elixir", lang2: "Java", lang3: "Ruby"},
%{id: 2, name: "Tom", lang1: "Elixir", lang2: "C#", lang3: ""},
%{id: 3, name: "Dave", lang1: "Python", lang2: "Java", lang3: "Ruby"},
%{id: 4, name: "Jane", lang1: "Python", lang2: "", lang3: ""}
]

Sorry for my short knowledge.

Finally I ended up with this.

defmodule FormatTsv do
  def convert do
    headers = [:id, :name, :lang]
    out_file = "out.tsv"
    result = "data.tsv"
      |> File.stream!
      |> CSV.decode(separator: ?\t, headers: headers)
      |> Enum.map(&(elem(&1, 1)))
      |> Enum.group_by(fn(x)-> x[:id] end)
      |> Enum.map(fn {id, values} -> 
                    "#{id}\t#{Enum.at(values, 0)[:name]}\t#{Enum.at(values, 0)[:lang]}\t#{Enum.at(values, 1)[:lang]}\t#{Enum.at(values, 2)[:lang]}\n"
                  end)
    File.write(out_file, result)
  end
end

Thanks.

1 Like

And if you have more than a handful of languages?

1 Like

Saw this the other day and just getting some time to respond. The structure of the data makes it a little bit of a pain to parse. I wrote some code that works assuming the ids in the tsv file are ordered. If your looking for this type of result sometimes it’s better to transform the data into another format. For this purpose I thought a struct would be the appropriate data structure to use.

defstruct [headers: [], data_rows: [], column_count: 0]

headers is a list that will accumulate all the headers.
data_rows will contain a list of maps for each row in the file.
column_count will keep track of how many header columns are needed.

So thinking in terms of rows of data the desired result is to take this input

| id | name | lang |
| 1 | Bob  | Elixir |
| 1 | Bob  | Java |
| 1 | Bob  | Ruby |
| 2 | Tom  | Elixir |
| 2 | Tom  | C# |
| 3 | Dave | Python |
| 3 | Dave | Java |
| 3 | Dave | Ruby |
| 4 | Jane | Python |

and reduce them to single rows of maps

%{id: 1, name: "Bob", languages: ["Elixir", "Java", "Ruby"]},
%{id: 2, name: "Tom, "languages: ["C#", "Elixir"]},
%{id: 3, name: "Dave,"languages: ["Ruby", "Java", "Python"]},
%{id: 4, name: "Jane", languages: ["Python"]}

This is what I came up with hope it helps.

defmodule FormatTsv do
  defstruct [headers: [], data_rows: [], column_count: 0]
  NimbleCSV.define(TSV, separator: " | ")
  @file_input "priv/data.tsv"
  @file_output "priv/output.tsv"

  def convert do
    @file_input
    |> File.stream!()
    |> TSV.parse_stream()
    |> Stream.map(fn [id, name, lang] -> new_data_row(id, name, lang) end)
    |> transform_data()
    |> write_to_file()
  end

  def transform_data(data_rows) do
    data_rows
    |> group_data_rows()
    |> add_last_row()
    |> add_headers()
    |> reverse_rows()
  end

  def add_last_row({row, struct}) do
    Map.put(struct, :data_rows, [row|struct.data_rows])
  end

  def add_headers(struct) do
    Map.put(struct, :headers, headers(struct.column_count))
  end

  def reverse_rows(struct) do
    Map.put(struct, :data_rows, Enum.reverse(struct.data_rows))
  end

  def group_data_rows(rows) do
    Enum.reduce(rows, {%{}, %FormatTsv{}}, fn(row, acc) ->
      {map, struct} = acc
      group_data(row, map, struct)
    end)
  end

  def group_data(row, map, struct) when map == %{} do
    {row, struct}
  end

  def group_data(row, map, struct) do
    case row.id == map.id do
      true ->
        [language] = row.languages
        {add_language(map, language), struct}

      false ->
        {row, add_data_row(struct, map)}
    end
  end

  def add_language(map, language) do
    Map.put(map, :languages, [language|map.languages])
  end

  def add_data_row(struct, map) do
    Map.put(struct, :data_rows, [map|struct.data_rows])
    |> update_column_count(map)
  end

  def update_column_count(struct, map) do
    header_count = Enum.count(map.languages)
    column_count = struct.column_count

    case header_count > column_count do
      true ->
        Map.put(struct, :column_count, header_count)

      false ->
        struct
    end
  end

  def headers(num) do
    ["id", "name"] ++ Enum.map(1..num, &("lang#{&1}"))
  end

  def header_row(struct) do
    "| " <> Enum.map_join(struct.headers, " | ", &(&1)) <> " |\n"
  end

  def data_row_to_string(row, lang_count, column_count) when lang_count < column_count do
    "| #{row.id} | #{row.name} |" <> " " <> Enum.join(row.languages, " | ") <> String.duplicate(" |", column_count - lang_count) <> " |\n"
  end

  def data_row_to_string(row, _lang_count, _column_count) do
    "| #{row.id} | #{row.name} |" <> " " <> Enum.join(row.languages, " | ") <> " |\n"
  end

  def write_to_file(struct) do
    File.write(@file_output, header_row(struct), [:append])

    for row <- struct.data_rows do
      lang_count = Enum.count(row.languages)
      column_count = struct.column_count
      data_row = data_row_to_string(row, lang_count, column_count)
      File.write(@file_output, data_row, [:append])
    end

  end

  def new_data_row(id, name, language) do
    %{}
    |> put_id(id)
    |> put_name(name)
    |> put_language(language)
  end

  def put_id(map, id) do
    id = String.trim_leading(id, "| ")
    Map.put(map, :id, id)
  end

  def put_name(map, name) do
    name = String.trim_trailing(name)
    Map.put(map, :name, name)
  end

  def put_language(map, language) do
    language = String.trim_trailing(language, " |")
    Map.put(map, :languages, [language])
  end
end
1 Like