Parser isn't reading Lexer's tokens correctly?

Hey there, so here is my small project where I’m trying to implement a simple Lexer and Parser for my programming language.

This is the syntax of it:

fu main() {
    $$ This is a comment
    let name = "Name";
    display("Hello Nerd ${name}");
}

Here are all of my files and code:

lexer.ex:

defmodule Lexer do
  def find_inok_file() do
    case File.ls!(".") do
      files ->
        Enum.find_value(files, fn file ->
          Path.extname(file) == ".inok" && file
        end)
    end
  end

  def read_inok_file(file_path) do
    case Path.extname(file_path) do
      ".inok" ->
        case File.read(file_path) do
          {:ok, file_contents} ->
            {file_path, file_contents}

          {:error, _} ->
            {file_path, ""}
        end

      _ ->
        {file_path, ""}
    end
  end

  def tokenize() do
    case find_inok_file() do
      nil ->
        {:error, "No .inok file found in directory"}

      file_path ->
        case read_inok_file(file_path) do
          {_, ""} ->
            {:error, "File is empty"}

          {_, file_contents} ->
            Regex.scan(
              ~r/let|fu|main|display|\{|\}|\(|\)|;|(?<=\$\$).*?(?=\$\$)/ms,
              file_contents
            )
            |> Enum.map(fn [token_value] -> {determine_token_type(token_value), token_value} end)
        end
    end
  end

  def determine_token_type(token_value) do
    case token_value do
      "let" -> {:let, token_value}
      "fu" -> {:function, token_value}
      "main" -> {:main, token_value}
      "display" -> {:display, token_value}
      "{" -> {:open_brace, token_value}
      "}" -> {:close_brace, token_value}
      "(" -> {:open_paren, token_value}
      ")" -> {:close_paren, token_value}
      ";" -> {:semicolon, token_value}
      "$$" -> {:comment, token_value}
      _ -> {:unknown, token_value}
    end
  end
end

parser.ex:

defmodule Parser do
  def parse(tokens) do
    IO.inspect(tokens, label: "parse tokens")
    case tokens do
      [] ->
        {:error, "Unexpected end of input"}

      [{:function, _} = function_token | rest_tokens] ->
        parse_function(rest_tokens, function_token)

      [{:let, _} = _let_token, {:unknown, variable_token}, expression_token, {:semicolon, _} | rest_tokens] ->
        statements = parse_statements(rest_tokens, [[:let, variable_token, expression_token]])
        {:ok, {:function, "main", [], Enum.reverse(statements)}}

      [{:display, _} = _display_token, {:unknown, expression_token}, {:semicolon, _} | rest_tokens] ->
        statements = parse_statements(rest_tokens, [[:display, expression_token]])
        {:ok, {:function, "main", [], Enum.reverse(statements)}}

      _ ->
        {:error, "Unexpected token"}
    end
  end

  defp parse_function(tokens, function_token) do
    case tokens do
      [] ->
        {:error, "Unexpected end of input"}

      [{:main, _} = main_token, {:open_paren, _}, {:close_paren, _}, {:open_brace, _} | rest_tokens] ->
        statements = parse_statements(rest_tokens, [])
        {:ok, {function_token, main_token, [], statements}}

      _ ->
        {:error, "Unexpected token"}
    end
  end

  defp parse_statements(tokens, acc) do
    IO.inspect(tokens, label: "parse_statements tokens")
    case tokens do
      [] ->
        {:error, "Unexpected end of input"}

      [{:close_brace, _} | rest_tokens] ->
        {:ok, Enum.reverse(acc), rest_tokens}

      [{:let, _}, {:unknown, variable_token}, {:equal, _}, expression_token, {:semicolon, _} | rest_tokens] ->
        parse_statements(rest_tokens, [[:let, variable_token, expression_token]] ++ acc)

      [{:display, _}, expression_token, {:semicolon, _} | rest_tokens] ->
        parse_statements(rest_tokens, [[:display, expression_token]] ++ acc)

      [{kind, token} | _] ->
        {:error, "Unexpected token: #{inspect(kind)} '#{inspect(token)}'"}
    end
  end
end

Here is my output (for debugging purposes):

iex(61)> tokens = Lexer.tokenize 
[
  {{:function, "fu"}, "fu"},
  {{:main, "main"}, "main"},
  {{:open_paren, "("}, "("},
  {{:close_paren, ")"}, ")"},
  {{:open_brace, "{"}, "{"},
  {{:let, "let"}, "let"},
  {{:semicolon, ";"}, ";"},
  {{:display, "display"}, "display"},
  {{:open_paren, "("}, "("},
  {{:open_brace, "{"}, "{"},
  {{:close_brace, "}"}, "}"},
  {{:close_paren, ")"}, ")"},
  {{:semicolon, ";"}, ";"},
  {{:close_brace, "}"}, "}"}
]
iex(62)> Parser.parse(tokens)
parse tokens: [
  {{:function, "fu"}, "fu"},
  {{:main, "main"}, "main"},
  {{:open_paren, "("}, "("},
  {{:close_paren, ")"}, ")"},
  {{:open_brace, "{"}, "{"},
  {{:let, "let"}, "let"},
  {{:semicolon, ";"}, ";"},
  {{:display, "display"}, "display"},
  {{:open_paren, "("}, "("},
  {{:open_brace, "{"}, "{"},
  {{:close_brace, "}"}, "}"},
  {{:close_paren, ")"}, ")"},
  {{:semicolon, ";"}, ";"},
  {{:close_brace, "}"}, "}"}
]
{:error, "Unexpected token"}

I literally don’t know why it doesn’t match and throws an error saying that there is an unexpected token :confused:


I also created a main file to produce an error message that might help, but so far it hasn’t helped.
main.ex

defmodule Main do
  def run() do
    case Lexer.tokenize() do
      {:ok, tokens} ->
        case Parser.parse(tokens) do
          {:ok, function} ->
            execute_function(function)

          {:error, message} ->
            IO.puts("Error: #{message}")
        end

      {:error, message} ->
        IO.puts("Error: #{message}")
    end
  end

  def execute_function({_, :main, _, statements}) do
    Enum.each(statements, fn statement ->
      execute_statement(statement)
    end)
  end

  def execute_statement({_, :let, variable, expression}) do
    value = evaluate_expression(expression)
    assign_variable(variable, value)
  end

  def execute_statement({_, :display, expression}) do
    value = evaluate_expression(expression)
    IO.puts(value)
  end

  def evaluate_expression(value) do
    value
  end

  def assign_variable(_variable, _value) do
    # do nothing (underscore prefixes avoid unused-variable warnings)
  end
end

=>

** (CaseClauseError) no case clause matching: [{{:function, "fu"}, "fu"}, {{:main, "main"}, "main"}, {{:open_paren, "("}, "("}, {{:close_paren, ")"}, ")"}, {{:open_brace, "{"}, "{"}, {{:let, "let"}, "let"}, {{:semicolon, ";"}, ";"}, {{:display, "display"}, "display"}, {{:open_paren, "("}, "("}, {{:open_brace, "{"}, "{"}, {{:close_brace, "}"}, "}"}, {{:close_paren, ")"}, ")"}, {{:semicolon, ";"}, ";"}, {{:close_brace, "}"}, "}"}]
    (inok 0.1.0) lib/inok.ex:3: Main.run/0
    iex:1: (file)

I hope that someone can help me out here; I’ve literally spent over three hours debugging this now with several IO.inspects and multiple rewrites of my Parser … but everything ends up at the same problem.

Thanks!


{{:function, "fu"}, "fu"} doesn’t match {:function, _} = function_token, you’ve got another layer of tuple in there
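
You can see the mismatch directly in iex (a quick sketch):

iex> match?({:function, _}, {{:function, "fu"}, "fu"})
false
iex> match?({:function, _}, {:function, "fu"})
true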

Hmm okay. But where is that coming from?

The parser is expecting {atom, value} tuples, so one of the two places that glues on token_value is not needed. Either:

            |> Enum.map(fn [token_value] -> {determine_token_type(token_value), token_value} end)
        end
    end
  end

  def determine_token_type(token_value) do
    case token_value do
      "let" -> :let
      "fu" -> :function

or alternatively:

            |> Enum.map(fn [token_value] -> determine_token_type(token_value) end)
        end
    end
  end

  def determine_token_type(token_value) do
    case token_value do
      "let" -> {:let, token_value}
      "fu" -> {:function, token_value}

Usually when this happens in my code, it means I started refactoring a function like determine_token_type but wandered off / got distracted halfway through.
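
For reference, a sketch of the first option filled in for all of the token types (untested, but it lines up with the Enum.map that pairs the returned atom with token_value):

  def determine_token_type(token_value) do
    case token_value do
      "let" -> :let
      "fu" -> :function
      "main" -> :main
      "display" -> :display
      "{" -> :open_brace
      "}" -> :close_brace
      "(" -> :open_paren
      ")" -> :close_paren
      ";" -> :semicolon
      "$$" -> :comment
      _ -> :unknown
    end
  end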


Well yeah, that’s what I tried already, multiple times actually :slight_smile:

But my output after that was:

with: |> Enum.map(fn [token_value] -> determine_token_type(token_value) end)

iex(1)> tokens = Lexer.tokenize
[
  function: "fu",
  main: "main",
  open_paren: "(",
  close_paren: ")",
  open_brace: "{",
  let: "let",
  semicolon: ";",
  display: "display",
  open_paren: "(",
  open_brace: "{",
  close_brace: "}",
  close_paren: ")",
  semicolon: ";",
  close_brace: "}"
]
iex(2)> Parser.parse(tokens)
parse tokens: [
  function: "fu",
  main: "main",
  open_paren: "(",
  close_paren: ")",
  open_brace: "{",
  let: "let",
  semicolon: ";",
  display: "display",
  open_paren: "(",
  open_brace: "{",
  close_brace: "}",
  close_paren: ")",
  semicolon: ";",
  close_brace: "}"
]
parse_statements tokens: [
  let: "let",
  semicolon: ";",
  display: "display",
  open_paren: "(",
  open_brace: "{",
  close_brace: "}",
  close_paren: ")",
  semicolon: ";",
  close_brace: "}"
]
{:ok,
 {{:function, "fu"}, {:main, "main"}, [],
  {:error, "Unexpected token: :let '\"let\"'"}}}
iex(3)> 

Your parser’s clause for let requires an unknown and an equal token:

      [{:let, _}, {:unknown, variable_token}, {:equal, _}, expression_token, {:semicolon, _} | rest_tokens] ->
        parse_statements(rest_tokens, [[:let, variable_token, expression_token]] ++ acc)

but the output from Lexer.tokenize doesn’t contain that sequence:

  let: "let",
  semicolon: ";",
  display: "display",

My guess is that the Regex.scan is skipping important bits; you might try simplifying the situation by temporarily not worrying about comments that start with $$.
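
For example, you could temporarily drop the comment lookbehind entirely (a sketch):

Regex.scan(~r/let|fu|main|display|\{|\}|\(|\)|;/, file_contents)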

I’m currently rethinking the whole thing … really painful situation.

I now have this version, but sadly it didn’t make things better :confused:

defmodule Parser do
  def parse(tokens) do
    IO.inspect(tokens, label: "parse tokens")
    case tokens do
      [] ->
        {:error, "Unexpected end of input"}

      [{:function, _} = function_token | rest_tokens] ->
        parse_function(rest_tokens, function_token)

      [{:let, _} = _let_token, {:identifier, variable_token}, expression_token, {:semicolon, _} | rest_tokens] ->
        case parse_statements(rest_tokens, [[:let, variable_token, expression_token]]) do
          {:ok, statements, _} ->
            {:ok, {:function, "main", [], Enum.reverse(statements)}}

          error ->
            error
        end

      [{:display, _} = _display_token, {:identifier, expression_token}, {:semicolon, _} | rest_tokens] ->
        case parse_statements(rest_tokens, [[:display, expression_token]]) do
          {:ok, statements, _} ->
            {:ok, {:function, "main", [], Enum.reverse(statements)}}

          error ->
            error
        end

      _ ->
        {:error, "Unexpected token"}
    end
  end

  defp parse_function(tokens, function_token) do
    case tokens do
      [] ->
        {:error, "Unexpected end of input"}

      [{:main, _} = main_token, {:open_paren, _}, {:close_paren, _}, {:open_brace, _} | rest_tokens] ->
        statements = parse_statements(rest_tokens, [])
        {:ok, {function_token, main_token, [], statements}}

      _ ->
        {:error, "Unexpected token"}
    end
  end

  defp parse_statements(tokens, acc) do
    IO.inspect(tokens, label: "parse_statements tokens")
    case tokens do
      [] ->
        {:ok, Enum.reverse(acc), []}

      [{:close_brace, _} | rest_tokens] ->
        {:ok, Enum.reverse(acc), rest_tokens}

      [{:let, _} = _let_token, {:identifier, variable_token}, {:equal, _}, expression_token, {:semicolon, _} | rest_tokens] ->
        case parse_statements(rest_tokens, [[:let, variable_token, expression_token]] ++ acc) do
          {:ok, statements, remaining_tokens} ->
            {:ok, statements, remaining_tokens}

          error ->
            error
        end

      [{:display, _} = _display_token, {:identifier, expression_token}, {:semicolon, _} | rest_tokens] ->
        case parse_statements(rest_tokens, [[:display, expression_token]] ++ acc) do
          {:ok, statements, remaining_tokens} ->
            {:ok, statements, remaining_tokens}

          error ->
            error
        end

      [{kind, token} | _] ->
        {:error, "Unexpected token: #{inspect(kind)} '#{inspect(token)}'"}
    end
  end
end

I literally can’t figure it out; my lexer works fine, but the parser is hard :expressionless:

{:ok,
 {{:function, "fu"}, {:main, "main"}, [],
  {:error, "Unexpected token: :let '\"let\"'"}}}

If your parser expects an :identifier token and your lexer is not producing one, then I would say that the problem lies within the lexer, not the parser.

Your regex is dropping the input that should be turning into :unknown etc., so the parser has no chance to work:

# s is the input from your first post
iex(48)> Regex.scan(~r/let|fu|main|display|\{|\}|\(|\)|;|(?<=\$\$).*?(?=\$\$)/ms, s)
[
  ["fu"],
  ["main"],
  ["("],
  [")"],
  ["{"],
  ["let"],
  [";"],
  ["display"],
  ["("],
  ["{"],
  ["}"],
  [")"],
  [";"],
  ["}"]
]

A better regex will help:

iex(55)> Regex.scan(~r/let|fu|main|display|\{|\}|\(|\)|;|=|\w+|"[^"]*"|\$\$.*?$/ms, s)
[
  ["fu"],
  ["main"],
  ["("],
  [")"],
  ["{"],
  ["$$ This is a comment"],
  ["let"],
  ["name"],
  ["="],
  ["\"Name\""],
  [";"],
  ["display"],
  ["("],
  ["\"Hello Nerd ${name}\""],
  [")"],
  [";"],
  ["}"]
]

This adds some clauses to match (see the sketch after this list for classifying them in determine_token_type):

  • \w+ to match identifiers. You’ll want to customize the \w part to match all characters that are valid in an identifier for your lexer.
  • "[^"]*" matches whole quoted strings, including embedded spaces. It doesn’t have any notion of “escaping”, so you’ll need to enhance it if you need strings with embedded " characters
  • \$\$.*?$ may look like a comic book character swearing, but it’s matching two literal $ characters followed by everything up to the nearest newline
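
As referenced above, here’s a sketch of how determine_token_type could classify the new matches (untested; the :string tag is just a suggestion, and the :equal and :identifier names are what your rewritten parser already pattern-matches on):

  def determine_token_type(token_value) do
    case token_value do
      "let" -> {:let, token_value}
      "fu" -> {:function, token_value}
      "main" -> {:main, token_value}
      "display" -> {:display, token_value}
      "{" -> {:open_brace, token_value}
      "}" -> {:close_brace, token_value}
      "(" -> {:open_paren, token_value}
      ")" -> {:close_paren, token_value}
      ";" -> {:semicolon, token_value}
      "=" -> {:equal, token_value}
      "$$" <> _ -> {:comment, token_value}
      "\"" <> _ -> {:string, token_value}
      other ->
        # bare \w+ matches from the regex become identifiers
        if other =~ ~r/^\w+$/, do: {:identifier, other}, else: {:unknown, other}
    end
  end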

Any other problems that you can see? :smiley:

So I played around with the regex once again, and whatever I try (including yours), I get the same result.

{:ok,
 {{:function, "fu"}, {:main, "main"}, [],
  {:error, "Unexpected token: :let '\"let\"'"}}}
iex(1)> tokens = Lexer.tokenize
[
  function: "fu",
  main: "main",
  open_paren: "(",
  close_paren: ")",
  open_brace: "{",
  let: "let",
  unknown: "name",
  unknown: "=",
  unknown: "\"BabyGirl\"",
  semicolon: ";",
  display: "display",
  open_paren: "(",
  unknown: "\"Hello Nerd ${name}\"",
  close_paren: ")",
  semicolon: ";",
  close_brace: "}"
]
iex(2)> Parser.parse(tokens)
parse tokens: [
  function: "fu",
  main: "main",
  open_paren: "(",
  close_paren: ")",
  open_brace: "{",
  let: "let",
  unknown: "name",
  unknown: "=",
  unknown: "\"BabyGirl\"",
  semicolon: ";",
  display: "display",
  open_paren: "(",
  unknown: "\"Hello Nerd ${name}\"",
  close_paren: ")",
  semicolon: ";",
  close_brace: "}"
]
parse_statements tokens: [
  let: "let",
  unknown: "name",
  unknown: "=",
  unknown: "\"BabyGirl\"",
  semicolon: ";",
  display: "display",
  open_paren: "(",
  unknown: "\"Hello Nerd ${name}\"",
  close_paren: ")",
  semicolon: ";",
  close_brace: "}"
]
I feel like it’s because it doesn’t recognize the let statement now that some parts come through as unknown.

Your determine_token_type does not appear to be producing {:equal, "="} tuples. The version you posted in the original post doesn’t have a case clause for that.
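
Concretely, your parse_statements clause for let needs the token stream to look something like this (a sketch; :string is just one way to tag quoted literals — expression_token will match any {tag, value} tuple):

[
  {:let, "let"},
  {:identifier, "name"},
  {:equal, "="},
  {:string, "\"Name\""},
  {:semicolon, ";"},
  ...
]

so the lexer has to emit :equal and :identifier tuples instead of :unknown ones.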