Saw this the other day and just getting some time to respond. The structure of the data makes it a little bit of a pain to parse. I wrote some code that works assuming the ids in the tsv file are ordered. If your looking for this type of result sometimes it’s better to transform the data into another format. For this purpose I thought a struct would be the appropriate data structure to use.
defstruct [headers: [], data_rows: [], column_count: 0]
headers
is a list that will accumulate all the headers.
data_rows
will contain a list of maps for each row in the file.
column_count
will keep track of how many header columns are needed.
So thinking in terms of rows of data the desired result is to take this input
| id | name | lang |
| 1 | Bob | Elixir |
| 1 | Bob | Java |
| 1 | Bob | Ruby |
| 2 | Tom | Elixir |
| 2 | Tom | C# |
| 3 | Dave | Python |
| 3 | Dave | Java |
| 3 | Dave | Ruby |
| 4 | Jane | Python |
and reduce them to single rows of maps
%{id: 1, name: "Bob", languages: ["Elixir", "Java", "Ruby"]},
%{id: 2, name: "Tom, "languages: ["C#", "Elixir"]},
%{id: 3, name: "Dave,"languages: ["Ruby", "Java", "Python"]},
%{id: 4, name: "Jane", languages: ["Python"]}
This is what I came up with hope it helps.
defmodule FormatTsv do
defstruct [headers: [], data_rows: [], column_count: 0]
NimbleCSV.define(TSV, separator: " | ")
@file_input "priv/data.tsv"
@file_output "priv/output.tsv"
def convert do
@file_input
|> File.stream!()
|> TSV.parse_stream()
|> Stream.map(fn [id, name, lang] -> new_data_row(id, name, lang) end)
|> transform_data()
|> write_to_file()
end
def transform_data(data_rows) do
data_rows
|> group_data_rows()
|> add_last_row()
|> add_headers()
|> reverse_rows()
end
def add_last_row({row, struct}) do
Map.put(struct, :data_rows, [row|struct.data_rows])
end
def add_headers(struct) do
Map.put(struct, :headers, headers(struct.column_count))
end
def reverse_rows(struct) do
Map.put(struct, :data_rows, Enum.reverse(struct.data_rows))
end
def group_data_rows(rows) do
Enum.reduce(rows, {%{}, %FormatTsv{}}, fn(row, acc) ->
{map, struct} = acc
group_data(row, map, struct)
end)
end
def group_data(row, map, struct) when map == %{} do
{row, struct}
end
def group_data(row, map, struct) do
case row.id == map.id do
true ->
[language] = row.languages
{add_language(map, language), struct}
false ->
{row, add_data_row(struct, map)}
end
end
def add_language(map, language) do
Map.put(map, :languages, [language|map.languages])
end
def add_data_row(struct, map) do
Map.put(struct, :data_rows, [map|struct.data_rows])
|> update_column_count(map)
end
def update_column_count(struct, map) do
header_count = Enum.count(map.languages)
column_count = struct.column_count
case header_count > column_count do
true ->
Map.put(struct, :column_count, header_count)
false ->
struct
end
end
def headers(num) do
["id", "name"] ++ Enum.map(1..num, &("lang#{&1}"))
end
def header_row(struct) do
"| " <> Enum.map_join(struct.headers, " | ", &(&1)) <> " |\n"
end
def data_row_to_string(row, lang_count, column_count) when lang_count < column_count do
"| #{row.id} | #{row.name} |" <> " " <> Enum.join(row.languages, " | ") <> String.duplicate(" |", column_count - lang_count) <> " |\n"
end
def data_row_to_string(row, _lang_count, _column_count) do
"| #{row.id} | #{row.name} |" <> " " <> Enum.join(row.languages, " | ") <> " |\n"
end
def write_to_file(struct) do
File.write(@file_output, header_row(struct), [:append])
for row <- struct.data_rows do
lang_count = Enum.count(row.languages)
column_count = struct.column_count
data_row = data_row_to_string(row, lang_count, column_count)
File.write(@file_output, data_row, [:append])
end
end
def new_data_row(id, name, language) do
%{}
|> put_id(id)
|> put_name(name)
|> put_language(language)
end
def put_id(map, id) do
id = String.trim_leading(id, "| ")
Map.put(map, :id, id)
end
def put_name(map, name) do
name = String.trim_trailing(name)
Map.put(map, :name, name)
end
def put_language(map, language) do
language = String.trim_trailing(language, " |")
Map.put(map, :languages, [language])
end
end