Stream a large file with multiline records

kokolegorille · November 30, 2018, 3:30pm

Hello everyone,

I am trying to parse a large file containing records that may each span two lines. Here is an extract:

A02 "Bird: From Gambit, Langheld Gambit"  1.f4 e5 2.fxe5 d6 3.exd6 Nf6 *
A02 "Bird: From Gambit, 3...Bxd6"  1.f4 e5 2.fxe5 d6 3.exd6 Bxd6 *
A02 "Bird: From Gambit, Lipke"  1.f4 e5 2.fxe5 d6 3.exd6 Bxd6 4.Nf3 Nh6 5.d4 *
A02 "Bird: From Gambit, Lasker Variation"
  1.f4 e5 2.fxe5 d6 3.exd6 Bxd6 4.Nf3 g5 *
A02 "Bird: From Gambit, Lasker, 5.d4"
  1.f4 e5 2.fxe5 d6 3.exd6 Bxd6 4.Nf3 g5 5.d4 *

A record starts with a code, and ends with a star. I would like to generate a stream that would merge incomplete records with the next line, making a valid record.

I could separate lines by group with…

    # Lazily read the file line by line and group consecutive lines according
    # to whether they end with the record terminator ("*" plus a newline).
    Stream.chunk_by(File.stream!(file), fn line -> String.ends_with?(line, "*\n") end)

But somehow it starts to feel hackish…

I wonder how you would generate a stream of records that could potentially be split across multiple lines.

Thanks for taking time

kokolegorille · November 30, 2018, 4:25pm

Sorry about the question, I resolved the problem with Stream.chunk_while, it does what I need…

For future reference, here is the solution…

    # Chunk callback for Stream.chunk_while/4. `pending` holds the text of a
    # record whose terminating "*" has not been seen yet ("" when there is none).
    chunk_fun = fn line, pending ->
      case String.ends_with?(line, "*\n") do
        # The line closes a record: emit the pending text joined with this
        # line and start over with an empty accumulator.
        true -> {:cont, pending <> line, ""}
        # The record continues on the next line: keep the text, minus its
        # trailing newline, in the accumulator.
        false -> {:cont, pending <> String.trim_trailing(line, "\n")}
      end
    end

    # Final callback: emit whatever is still pending once the input ends,
    # or nothing when the accumulator is empty.
    after_fun = fn pending ->
      if pending == "" do
        {:cont, ""}
      else
        {:cont, pending, ""}
      end
    end

    # Lazily stream the file's lines and merge them into complete records
    # using the two callbacks defined above.
    Stream.chunk_while(File.stream!(file), "", chunk_fun, after_fun)

Have a nice weekend

chrismcg · November 30, 2018, 4:51pm

Having just read https://pl-rants.net/posts/leex-yecc-in-elixir/ I thought I’d have a go with leex for a bit of fun.

src/chess_parser.xrl

Definitions.

Rules.
%% Code that opens every record, e.g. "A02": one capital letter followed by
%% one or more digits.
[A-Z][0-9]+ : {token, {start, TokenLine, TokenChars}}.

%% Description string - naive and doesn't handle any sort of escaping.
%% The surrounding double quotes are part of the match and stay in TokenChars.
".*" : {token, {description, TokenLine, TokenChars}}.

%% The moves
%% Move number: one or more digits followed by a dot. Accepting several digits
%% lets lines with ten or more moves ("10.", "11.", ...) tokenize as well;
%% a single-digit pattern would leave the dot after "10" unmatched.
[0-9]+\. : {token, {move_number, TokenLine, move_number_to_integer(TokenChars)}}.
%% Move
[a-zA-Z0-9]+ : {token, {move, TokenLine, TokenChars}}.

%% the end
\* : {token, {finish}}.

%% ignore whitespace
[\s\n\r\t]+ : skip_token.

Erlang code.

%% Converts the characters of a move-number token such as "12." to the
%% integer 12. string:to_integer/1 stops at the trailing dot, which is
%% discarded together with the rest of the input.
move_number_to_integer(Move) ->
  {MoveNumber, _} = string:to_integer(Move),
  MoveNumber.

Gave me this output in tests.
test/chess_parser_test.exs

  # A complete record on one line must tokenize to the code, the quoted
  # description, the numbered moves and the closing {:finish}, all on line 1.
  test "parses single line correctly" do
    record = ~S(A02 "Bird: From Gambit, Langheld Gambit"  1.f4 e5 2.fxe5 d6 3.exd6 Nf6 *)

    {:ok, tokens, _} =
      record
      |> String.to_charlist()
      |> :chess_parser.string()

    expected = [
      {:start, 1, 'A02'},
      {:description, 1, '"Bird: From Gambit, Langheld Gambit"'},
      {:move_number, 1, 1},
      {:move, 1, 'f4'},
      {:move, 1, 'e5'},
      {:move_number, 1, 2},
      {:move, 1, 'fxe5'},
      {:move, 1, 'd6'},
      {:move_number, 1, 3},
      {:move, 1, 'exd6'},
      {:move, 1, 'Nf6'},
      {:finish}
    ]

    assert tokens == expected
  end

  # A record whose moves sit on a second line must yield a single token list:
  # the code and description carry line 1, the moves carry line 2.
  test "parses split line correctly" do
    record = """
    A02 "Bird: From Gambit, Lasker, 5.d4"
      1.f4 e5 2.fxe5 d6 3.exd6 Bxd6 4.Nf3 g5 5.d4 *
    """

    {:ok, tokens, _} =
      record
      |> String.to_charlist()
      |> :chess_parser.string()

    expected = [
      {:start, 1, 'A02'},
      {:description, 1, '"Bird: From Gambit, Lasker, 5.d4"'},
      {:move_number, 2, 1},
      {:move, 2, 'f4'},
      {:move, 2, 'e5'},
      {:move_number, 2, 2},
      {:move, 2, 'fxe5'},
      {:move, 2, 'd6'},
      {:move_number, 2, 3},
      {:move, 2, 'exd6'},
      {:move, 2, 'Bxd6'},
      {:move_number, 2, 4},
      {:move, 2, 'Nf3'},
      {:move, 2, 'g5'},
      {:move_number, 2, 5},
      {:move, 2, 'd4'},
      {:finish}
    ]

    assert tokens == expected
  end

Fun to play around with, thanks for the example!

kokolegorille · November 30, 2018, 5:02pm

Thanks for your example @chrismcg. I have been doing the parsing of pgn with leex and yecc…

% Leex lexer definition for PGN text: the Definitions section names the
% regular-expression macros, the Rules section maps each macro to a token.

%%%%%%%%%%%%%%%%%%%%
%% Definitions
%%%%%%%%%%%%%%%%%%%%

Definitions.

% A bracketed tag: "[", anything but "]", then "]".
TAG            = \[[^\]]*\]
% A move number without leading zero, followed by one to three dots.
MOVE           = [1-9][0-9]*\.(\.)?(\.)?

% There is an ambiguity on 26.Nxc8+- as the plus can be seen as check!
SAN            = ([BKNPQR])?(([a-h])?([1-8])?)(x)?([a-h])([1-8])(\s*[eE]\.?[pP]\.?\s*)?(=([BNQR]))?([\+#]-?)?
% NOTE(review): the groups are concatenated optionals rather than alternatives,
% and "(-\+)?(\+\/-)?" appears twice in a row - looks like an accidental
% duplication; confirm before relying on it.
% NOTE(review): every group is optional, so this macro (like MOVE_EVAL below)
% can match the empty string - verify how leex handles that.
POS_EVAL       = (=)?(\+=)?(=\+)?(\+\/=)?(=\/\+)?(\+-)?(-\+)?(\+\/-)?(-\+)?(\+\/-)?(-\/\+)?(\x{00B1})?(\x{2213})?

% Brace comment: "{", anything but "}", then "}".
COMMENT        = {[^}]*}
% Semicolon comment running to the end of the line.
COMMENT_EOL    = ;.*
% Optional move annotation: !, !!, ?, ??, !? or ?!.
MOVE_EVAL      = (!|!!|\?|\?\?|!\?|\?!)?

% "$" followed by digits.
NAG            = \$[0-9]*
% O-O or O-O-O, optionally followed by "+".
CASTLING       = O-O(-O)?(\+)?
% Game result: 1-0, 0-1, 1/2-1/2 or *.
RESULT         = 1-0|0-1|1\/2-1\/2|\*

WHITESPACE     = [\s\t]
TERMINATOR     = [\n\r]

%%%%%%%%%%%%%%%%%%%%
%% Rules
%%%%%%%%%%%%%%%%%%%%

Rules.

% Parentheses are emitted as tokens carrying only the line number.
\(             : {token, {'(', TokenLine}}.
\)             : {token, {')', TokenLine}}.

{POS_EVAL}     : {token, {pos_eval, TokenLine, TokenChars}}.

{TAG}          : {token, {tag, TokenLine, TokenChars}}.
{MOVE}         : {token, {move, TokenLine, TokenChars}}.
{SAN}          : {token, {san, TokenLine, TokenChars}}.
% Both comment forms produce the same comment token.
{COMMENT}      : {token, {comment, TokenLine, TokenChars}}.
{COMMENT_EOL}  : {token, {comment, TokenLine, TokenChars}}.
{MOVE_EVAL}    : {token, {move_eval, TokenLine, TokenChars}}.

{NAG}          : {token, {nag, TokenLine, TokenChars}}.
% Castling is reported with the same san token as ordinary moves.
{CASTLING}     : {token, {san, TokenLine, TokenChars}}.
{RESULT}       : {token, {result, TokenLine, TokenChars}}.

% Whitespace and line terminators produce no token.
{WHITESPACE}+  : skip_token.
{TERMINATOR}+  : skip_token.

%%%%%%%%%%%%%%%%%%%%
%% Erlang code
%%%%%%%%%%%%%%%%%%%%

Erlang code.

But here I wanted to find a solution based on Stream.

leex and yecc are really fun to use, but I guess nimble_parsec could give better results.

chrismcg · November 30, 2018, 5:51pm

I wanted to play with streams too, so I had a look inside File.Stream and hacked together this very incomplete bit of code:

defmodule TokenStream do
  @moduledoc """
  Lazily enumerates groups of tokens read from a file.

  The file at `path` is opened when enumeration starts and closed when it
  finishes or halts. Each element is the token list returned by one
  `:io.request/2` `get_until` call that drives `token_mod.tokens/3`, so
  `token_mod` must be a tokenizer usable that way (for example a lexer
  generated by leex).
  """

  defstruct path: nil, token_mod: nil

  @type t :: %__MODULE__{path: Path.t() | nil, token_mod: module() | nil}

  @doc """
  Builds a token stream for the file at `path`, tokenized by `token_mod`.

  Nothing is opened or read until the stream is enumerated.
  """
  @spec new(Path.t(), module()) :: t()
  def new(path, token_mod) do
    %TokenStream{path: path, token_mod: token_mod}
  end

  defimpl Enumerable do
    # The size and contents are unknown without reading the file, so these
    # callbacks return {:error, __MODULE__} and Enum falls back to reduce/3.
    # Without them Enum.count/1, Enum.member?/2, etc. raise
    # UndefinedFunctionError.
    def count(_stream), do: {:error, __MODULE__}
    def member?(_stream, _value), do: {:error, __MODULE__}
    def slice(_stream), do: {:error, __MODULE__}

    # Delegates to Stream.resource/3; the resource state is {io_device, line},
    # where line is the line number at which the next read starts.
    def reduce(%{path: path, token_mod: token_mod}, acc, fun) do
      start_fun = fn ->
        case :file.open(path, [:read]) do
          {:ok, file} ->
            {file, 1}

          {:error, reason} ->
            raise File.Error, reason: reason, action: "tokenstream", path: path
        end
      end

      next_fun = fn {file, line} ->
        case :io.request(file, {:get_until, '', token_mod, :tokens, [line]}) do
          {:ok, tokens, end_line} ->
            {[tokens], {file, end_line}}

          {:eof, end_line} ->
            {:halt, {file, end_line}}

          # Tokenizer errors carry the end line as a third element; without
          # this clause they would surface as a CaseClauseError.
          {:error, error_info, _end_line} ->
            raise IO.StreamError, reason: error_info

          # I/O errors reported by the device itself.
          {:error, reason} ->
            raise IO.StreamError, reason: reason
        end
      end

      # Runs on completion, halt and raise, so the file is always closed.
      after_fun = fn {file, _line} ->
        :file.close(file)
      end

      Stream.resource(start_fun, next_fun, after_fun).(acc, fun)
    end
  end
end

This works because you have a terminator character which plays nice with io:request according to the leex docs. So in my chess_parser.xrl I changed the * line to return an end_token rather than a token:

%% Returning end_token instead of token marks "*" as the last token of a
%% tokens call, so each read returns the tokens of one complete record.
\* : {end_token, {finish}}.

Then this works in the console:

iex(19)> TokenStream.new("test/file.txt", :chess_parser) |> Stream.map(& &1) |> Enum.to_list()
[
  [
    {:start, 1, 'A02'},
    {:description, 1, '"Bird: From Gambit, Langheld Gambit"'},
    {:move_number, 1, 1},
    {:move, 1, 'f4'},
    {:move, 1, 'e5'},
    {:move_number, 1, 2},
    {:move, 1, 'fxe5'},
    {:move, 1, 'd6'},
    {:move_number, 1, 3},
    {:move, 1, 'exd6'},
    {:move, 1, 'Nf6'},
    {:finish}
  ],
  [
    {:start, 2, 'A02'},
    {:description, 2, '"Bird: From Gambit, 3...Bxd6"'},
    {:move_number, 2, 1},
    {:move, 2, 'f4'},
    {:move, 2, 'e5'},
    {:move_number, 2, 2},
    {:move, 2, 'fxe5'},
    {:move, 2, 'd6'},
    {:move_number, 2, 3},
    {:move, 2, 'exd6'},
    {:move, 2, 'Bxd6'},
    {:finish}
  ],
  [
    {:start, 3, 'A02'},
    {:description, 3, '"Bird: From Gambit, Lipke"'},
    {:move_number, 3, 1},
    {:move, 3, 'f4'},
    {:move, 3, 'e5'},
    {:move_number, 3, 2},
    {:move, 3, 'fxe5'},
    {:move, 3, 'd6'},
    {:move_number, 3, 3},
    {:move, 3, 'exd6'},
    {:move, 3, 'Bxd6'},
    {:move_number, 3, 4},
    {:move, 3, 'Nf3'},
    {:move, 3, 'Nh6'},
    {:move_number, 3, 5},
    {:move, 3, 'd4'},
    {:finish}
  ],
  [
    {:start, 4, 'A02'},
    {:description, 4, '"Bird: From Gambit, Lasker Variation"'},
    {:move_number, 5, 1},
    {:move, 5, 'f4'},
    {:move, 5, 'e5'},
    {:move_number, 5, 2},
    {:move, 5, 'fxe5'},
    {:move, 5, 'd6'},
    {:move_number, 5, 3},
    {:move, 5, 'exd6'},
    {:move, 5, 'Bxd6'},
    {:move_number, 5, 4},
    {:move, 5, 'Nf3'},
    {:move, 5, 'g5'},
    {:finish}
  ],
  [
    {:start, 6, 'A02'},
    {:description, 6, '"Bird: From Gambit, Lasker, 5.d4"'},
    {:move_number, 7, 1},
    {:move, 7, 'f4'},
    {:move, 7, 'e5'},
    {:move_number, 7, 2},
    {:move, 7, 'fxe5'},
    {:move, 7, 'd6'},
    {:move_number, 7, 3},
    {:move, 7, 'exd6'},
    {:move, 7, 'Bxd6'},
    {:move_number, 7, 4},
    {:move, 7, 'Nf3'},
    {:move, 7, 'g5'},
    {:move_number, 7, 5},
    {:move, 7, 'd4'},
    {:finish}
  ]
]

kokolegorille · November 30, 2018, 5:57pm

That is really nice, thank You for the improvement

chrismcg · November 30, 2018, 6:14pm

It would probably work just as well with plain Stream.resource rather than wrapping it up in a module and implementing Enumerable. Depends on your use case. There’s no optimization of the file reading in that code either which could make a difference if the files are big.