Recommendation for building a search parser

Well, we can make some assumptions here, I think. E.g. if a user entered “SAP BW OR Java” into a search engine, I would assume the user wanted someone with “SAP BW” OR “Java”. The way I decided to interpret unquoted searches basically boils down to treating everything between boolean operators (AND, OR, NOT, AND NOT, …) as a “keyword” term, which can be a single word or several words. One needs to be careful with quoted terms, of course (e.g. to take "Bang AND Olufsen" - quoted - as is).

EDIT: Ehh, I missed that you also want to consider words separated by spaces, e.g. Foo Bar AND Baz. That won’t work here either, I guess.

I just realized how similar such a query is to an Erlang expression, and I couldn’t resist abusing the Erlang scanner and parser for it.

# Sample query: quoted phrases (one of them containing the word "and"),
# grouping parentheses, operators in mixed casing and one bare keyword.
expr = ~S{("Modesty Blase" AND "X") Or not "Willie Garvin and" and Foo}

out =>

"(\"Modesty Blase\" AND \"X\") Or not \"Willie Garvin and\" and Foo"

# Run the query through Erlang's own scanner. erl_scan works on charlists,
# so the Elixir binary is converted first. The end location is not needed.
# NOTE(review): the scanner returns words as atoms (see the :var tokens in
# the output below), so think twice before feeding it untrusted input.
{:ok, tokens, _end_line} =
  expr
  |> to_charlist()
  |> :erl_scan.string()

tokens

out =>

[
  {:"(", 1},
  {:string, 1, ~c"Modesty Blase"},
  {:var, 1, :AND},
  {:string, 1, ~c"X"},
  {:")", 1},
  {:var, 1, :Or},
  {:not, 1},
  {:string, 1, ~c"Willie Garvin and"},
  {:and, 1},
  {:var, 1, :Foo}
]

# Normalise the raw erl_scan tokens so that the parser only ever sees
# parentheses, the boolean operators and/or/not, and string literals
# (the search keywords).
tokens =
  Enum.map(tokens, fn
    # Capitalised words are scanned as Erlang variables. AND/OR/NOT in any
    # casing become operators; every other word is a plain keyword.
    {:var, anno, var} ->
      term = String.downcase(to_string(var))

      if term in ~w(and or not) do
        # Safe: the atoms :and, :or and :not already exist (they are used below).
        {String.to_existing_atom(term), anno}
      else
        {:string, anno, to_string(var)}
      end

    # Grouping parentheses pass through untouched.
    {op, anno} when op in ~w|( )|a ->
      {op, anno}

    # A character literal such as `$a` carries its code point. Render it as
    # the character itself instead of crashing with a FunctionClauseError.
    {:char, anno, codepoint} ->
      {:string, anno, <<codepoint::utf8>>}

    # Quoted phrases, lower-case words and numbers all become keywords.
    # Floats are included so that a query containing e.g. `1.5` no longer
    # raises a FunctionClauseError.
    {type, anno, value} when type in ~w(string atom integer float)a ->
      {:string, anno, to_string(value)}

    # Lower-case and/or/not are reserved words in Erlang and already arrive
    # as operator tokens.
    {boolean_op, _anno} = token when boolean_op in ~w(and or not)a ->
      token

    # Any other reserved word or symbol is demoted to a keyword carrying
    # the token's name.
    {op, anno} ->
      {:string, anno, to_string(op)}
  end)

out =>

[
  {:"(", 1},
  {:string, 1, "Modesty Blase"},
  {:and, 1},
  {:string, 1, "X"},
  {:")", 1},
  {:or, 1},
  {:not, 1},
  {:string, 1, "Willie Garvin and"},
  {:and, 1},
  {:string, 1, "Foo"}
]

IO.puts("Expression: #{expr}")

# erl_parse expects the expression sequence to end with a dot token,
# so one is appended before parsing.
terminated_tokens = Enum.concat(tokens, dot: 1)
{:ok, abs_form} = :erl_parse.parse_exprs(terminated_tokens)
abs_form

out =>

Expression: ("Modesty Blase" AND "X") Or not "Willie Garvin and" and Foo

[
  {:op, 1, :or, {:op, 1, :and, {:string, 1, "Modesty Blase"}, {:string, 1, "X"}},
   {:op, 1, :and, {:op, 1, :not, {:string, 1, "Willie Garvin and"}}, {:string, 1, "Foo"}}}
]

You can then use the parsed abs_form term for further analysis.

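:warning: Although the hack seems to work, there is one major issue: it seems erl_scan dynamically creates atoms (e.g. for every unquoted word it scans). Atoms are never garbage-collected, so running this on untrusted search input could eventually exhaust the atom table.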