NLP in Beam VM

,

I have been searching for natural language processing packages for the BEAM VM, but didn’t find anything.
My idea was to find something for lemmatization, stop words, VADER, LDA and so on, i.e. classical methods that do not rely on DNNs.
Has there been an attempt to create such a package, or to make one accessible in Elixir or in another BEAM VM language that I can use from Elixir?

I am on the same endeavour as you, and I would like to share some of my findings on the questions asked.

{:stemmer, "~> 1.0"} is probably not maintained anymore, however, it seems to work fine in the short testing I did.

{:haystack, "~> 0.1.0"} uses the :stemmer for stemming and also provides stop_words functionality (among other things).

{:text, "~> 0.2.0"} also provides some n-gram related functionality.

With respect to LDA, I have found nothing. But then, I played a little bit with claude.ai and in the 3rd iteration I got the implementation that I will provide in the following post. At least for the simple example, it seems to work fine.

Any feedback (especially on the AI-generated code) is welcome!

defmodule LDA do
  @moduledoc """
  Implementation of Latent Dirichlet Allocation (LDA) for topic modeling in Elixir.
  Uses collapsed Gibbs sampling for inference.

  All count matrices are plain lists of rows (lists of lists), and documents are
  lists of integer word IDs as produced by `preprocess/1`.
  """

  @typedoc "A matrix stored as a list of rows."
  @type matrix :: [[number()]]

  @typedoc "Topic assigned to every token, keyed by `{document_index, word_position}`."
  @type assignments :: %{
          optional({non_neg_integer(), non_neg_integer()}) => non_neg_integer()
        }

  @typedoc """
  Sampler state: `{assignments, document_topic_counts, topic_word_counts, topic_totals}`.
  """
  @type state :: {assignments(), matrix(), matrix(), [integer()]}

  @doc """
  Runs LDA on a corpus of documents.

  Parameters:
  - documents: A list of documents, where each document is a list of word IDs
  - vocabulary_size: The size of the vocabulary
  - num_topics: The number of topics to model (a positive integer)
  - alpha: The Dirichlet prior on the per-document topic distributions
  - beta: The Dirichlet prior on the per-topic word distributions
  - iterations: Number of Gibbs sampling iterations (zero or less runs none)

  Returns a tuple of:
  - document_topic_counts: Matrix (documents x topics) of document-topic counts
  - topic_word_counts: Matrix (topics x vocabulary) of topic-word counts
  - topic_assignments: Map from `{doc_idx, word_pos}` to the topic assigned to that word
  """
  @spec run([[non_neg_integer()]], non_neg_integer(), pos_integer(), number(), number(), integer()) ::
          {matrix(), matrix(), assignments()}
  def run(documents, vocabulary_size, num_topics, alpha, beta, iterations)
      when is_integer(num_topics) and num_topics > 0 do
    num_documents = length(documents)
    document_topic_counts = initialize_matrix(num_documents, num_topics)
    topic_word_counts = initialize_matrix(num_topics, vocabulary_size)
    topic_totals = List.duplicate(0, num_topics)

    # Initialize topic assignments randomly
    initial_state =
      initialize_topics(
        documents,
        document_topic_counts,
        topic_word_counts,
        topic_totals,
        num_topics
      )

    # The explicit `//1` step matters: a bare `1..0` is a *decreasing* range that
    # would run two sweeps when zero iterations were requested.
    {topic_assignments, document_topic_counts, topic_word_counts, _topic_totals} =
      Enum.reduce(1..iterations//1, initial_state, fn _iteration, state ->
        gibbs_iteration(documents, state, alpha, beta, num_topics)
      end)

    {document_topic_counts, topic_word_counts, topic_assignments}
  end

  @doc """
  Initializes a matrix of zeros with the given dimensions.

  Returns `rows` rows of `cols` zeros each; `rows == 0` yields `[]`.
  """
  @spec initialize_matrix(non_neg_integer(), non_neg_integer()) :: matrix()
  def initialize_matrix(rows, cols) do
    # List.duplicate/2 handles a count of 0 correctly, unlike mapping over `1..rows`.
    0
    |> List.duplicate(cols)
    |> List.duplicate(rows)
  end

  @doc """
  Initializes topic assignments randomly.

  Every word of every document is assigned a topic drawn uniformly from
  `0..num_topics - 1`, and the three count structures are incremented accordingly.

  Returns `{topic_assignments, document_topic_counts, topic_word_counts, topic_totals}`.
  """
  @spec initialize_topics([[non_neg_integer()]], matrix(), matrix(), [integer()], pos_integer()) ::
          state()
  def initialize_topics(
        documents,
        document_topic_counts,
        topic_word_counts,
        topic_totals,
        num_topics
      ) do
    for {document, doc_idx} <- Enum.with_index(documents),
        {word_id, word_pos} <- Enum.with_index(document),
        reduce: {%{}, document_topic_counts, topic_word_counts, topic_totals} do
      {assignments, dt_counts, tw_counts, t_totals} ->
        # :rand.uniform/1 returns 1..num_topics; shift to a zero-based topic index
        topic = :rand.uniform(num_topics) - 1

        {
          Map.put(assignments, {doc_idx, word_pos}, topic),
          update_count(dt_counts, doc_idx, topic, 1),
          update_count(tw_counts, topic, word_id, 1),
          List.update_at(t_totals, topic, &(&1 + 1))
        }
    end
  end

  @doc """
  Performs one iteration of Gibbs sampling over all words in all documents.

  For each word, its current topic is removed from the counts, a new topic is
  drawn with `sample_topic/8` from the remaining counts, and the counts are
  updated with the new topic. Returns the updated state tuple.

  Raises `KeyError` if a word position has no entry in the assignments map.
  """
  @spec gibbs_iteration([[non_neg_integer()]], state(), number(), number(), pos_integer()) ::
          state()
  def gibbs_iteration(
        documents,
        {_topic_assignments, _document_topic_counts, _topic_word_counts, _topic_totals} = state,
        alpha,
        beta,
        num_topics
      ) do
    for {document, doc_idx} <- Enum.with_index(documents),
        {word_id, word_pos} <- Enum.with_index(document),
        reduce: state do
      {assignments, dt_counts, tw_counts, t_totals} ->
        key = {doc_idx, word_pos}
        old_topic = Map.fetch!(assignments, key)

        # Remove this word from the counts before sampling (the "collapsed" step)
        dt_counts = update_count(dt_counts, doc_idx, old_topic, -1)
        tw_counts = update_count(tw_counts, old_topic, word_id, -1)
        t_totals = List.update_at(t_totals, old_topic, &(&1 - 1))

        new_topic =
          sample_topic(
            doc_idx,
            word_id,
            dt_counts,
            tw_counts,
            t_totals,
            alpha,
            beta,
            num_topics
          )

        {
          Map.put(assignments, key, new_topic),
          update_count(dt_counts, doc_idx, new_topic, 1),
          update_count(tw_counts, new_topic, word_id, 1),
          List.update_at(t_totals, new_topic, &(&1 + 1))
        }
    end
  end

  @doc """
  Samples a new topic for a word based on the conditional distribution.

  The weight of topic `k` is

      (doc_topic_count + alpha) * (topic_word_count + beta) /
        (topic_total + beta * vocabulary_size)

  where `vocabulary_size` is the length of topic `k`'s row in `topic_word_counts`.
  Only the first `num_topics` topics are considered. A topic is then drawn with
  probability proportional to its weight; topics with zero weight are never
  drawn while any other topic has a positive weight.

  Returns the zero-based index of the chosen topic.
  """
  @spec sample_topic(
          non_neg_integer(),
          non_neg_integer(),
          matrix(),
          matrix(),
          [integer()],
          number(),
          number(),
          pos_integer()
        ) :: non_neg_integer()
  def sample_topic(
        doc_idx,
        word_id,
        doc_topic_counts,
        topic_word_counts,
        topic_totals,
        alpha,
        beta,
        num_topics
      ) do
    # Look the document row up once instead of once per topic
    doc_row = Enum.at(doc_topic_counts, doc_idx)

    topic_weights =
      [doc_row, topic_word_counts, topic_totals]
      |> Enum.zip()
      |> Enum.take(num_topics)
      |> Enum.map(fn {dt_count, word_row, topic_total} ->
        tw_count = Enum.at(word_row, word_id)
        (dt_count + alpha) * (tw_count + beta) / (topic_total + beta * length(word_row))
      end)

    # :rand.uniform/0 is in [0.0, 1.0), so the threshold lies in [0, total_weight)
    threshold = :rand.uniform() * Enum.sum(topic_weights)

    pick_topic(topic_weights, threshold, 0, 0)
  end

  # Returns the index of the first topic whose cumulative weight strictly exceeds
  # `threshold`. The strict comparison gives each topic the half-open interval
  # [previous_cumulative, cumulative), so a zero-weight topic owns an empty interval.
  # The last topic is the fallback when floating-point rounding leaves the final
  # cumulative weight at or below the threshold.
  defp pick_topic([], _threshold, _topic, _cumulative), do: 0
  defp pick_topic([_last], _threshold, topic, _cumulative), do: topic

  defp pick_topic([weight | rest], threshold, topic, cumulative) do
    cumulative = cumulative + weight

    if cumulative > threshold do
      topic
    else
      pick_topic(rest, threshold, topic + 1, cumulative)
    end
  end

  @doc """
  Updates a count in a matrix at the specified position.

  Adds `value` to the element at (`row`, `col`) and returns the new matrix.
  Both indices are truncated to integers first. An out-of-range `col` leaves
  the row unchanged (that is how `List.update_at/3` behaves).
  """
  @spec update_count(matrix(), number(), number(), number()) :: matrix()
  def update_count(matrix, row, col, value) do
    # Truncate once so the read and the write below use the same integer index
    row = trunc(row)

    updated_row =
      matrix
      |> Enum.at(row)
      |> List.update_at(trunc(col), &(&1 + value))

    List.replace_at(matrix, row, updated_row)
  end

  @doc """
  Extracts the document-topic distribution from the count matrix.

  Each row is smoothed with `alpha` and normalized so that it sums to 1.
  """
  @spec get_document_topic_distribution(matrix(), number()) :: [[float()]]
  def get_document_topic_distribution(document_topic_counts, alpha) do
    normalize_rows(document_topic_counts, alpha)
  end

  @doc """
  Extracts the topic-word distribution from the count matrix.

  Each row is smoothed with `beta` and normalized so that it sums to 1.
  """
  @spec get_topic_word_distribution(matrix(), number()) :: [[float()]]
  def get_topic_word_distribution(topic_word_counts, beta) do
    normalize_rows(topic_word_counts, beta)
  end

  # Adds `prior` to every count and divides by the smoothed row total.
  defp normalize_rows(matrix, prior) do
    Enum.map(matrix, fn row ->
      total = Enum.sum(row) + length(row) * prior
      Enum.map(row, fn count -> (count + prior) / total end)
    end)
  end

  @doc """
  Preprocesses a corpus of text documents for LDA.
  Returns a tuple of {documents, vocabulary}, where documents is a list of
  lists of word IDs, and vocabulary is a map from word to ID.

  Texts are lowercased, every character that is neither a word character nor
  whitespace is removed, and the result is split on whitespace. Word IDs are
  assigned in order of first appearance, starting at 0.
  """
  @spec preprocess([String.t()]) :: {[[non_neg_integer()]], %{String.t() => non_neg_integer()}}
  def preprocess(texts) do
    tokenized_texts =
      Enum.map(texts, fn text ->
        text
        |> String.downcase()
        # The `u` modifier makes \w and \s Unicode-aware; in byte mode every
        # non-ASCII letter would be stripped ("café" -> "caf").
        |> String.replace(~r/[^\w\s]/u, "")
        |> String.split(~r/\s+/u, trim: true)
      end)

    vocabulary =
      tokenized_texts
      |> Enum.concat()
      |> Enum.uniq()
      |> Enum.with_index()
      |> Map.new()

    # Every token is in the vocabulary by construction, so fetch! cannot fail
    documents =
      Enum.map(tokenized_texts, fn words ->
        Enum.map(words, &Map.fetch!(vocabulary, &1))
      end)

    {documents, vocabulary}
  end
end

# Example usage
defmodule Example do
  @moduledoc """
  Small demonstration of the `LDA` module on a five-document toy corpus.
  """

  # Hyperparameters are defined once so the sampler and the distribution
  # extraction below cannot drift apart.
  @num_topics 2
  @alpha 0.1
  @beta 0.01
  @iterations 100
  @top_words 5

  @doc """
  Runs LDA on the toy corpus and prints the document-topic distributions and
  the top words of every topic to standard output.

  The sampler is random, so the printed numbers differ between runs.
  """
  @spec run() :: :ok
  def run do
    # Stop words ("is", "a", "and", ...) were removed from the texts by hand.
    texts = [
      "machine learning fascinating powerful",
      "neural networks deep learning transformers",
      "elixir functional programming language",
      "phoenix web framework elixir",
      "topic modeling finds hidden themes documents"
    ]

    {documents, vocabulary} = LDA.preprocess(texts)
    vocabulary_size = map_size(vocabulary)

    # Run LDA
    {document_topic_counts, topic_word_counts, _topic_assignments} =
      LDA.run(documents, vocabulary_size, @num_topics, @alpha, @beta, @iterations)

    IO.puts("Document-Topic Distributions:")

    document_topic_counts
    |> LDA.get_document_topic_distribution(@alpha)
    |> Enum.with_index()
    |> Enum.each(fn {dist, idx} ->
      doc_topics =
        dist
        |> Enum.with_index()
        |> Enum.map_join(", ", fn {prob, topic} ->
          "Topic #{topic}: #{Float.round(prob, 3)}"
        end)

      IO.puts("Document #{idx}: #{doc_topics}")
    end)

    # Reverse vocabulary (ID -> word) for interpreting topics
    reverse_vocab = Map.new(vocabulary, fn {word, id} -> {id, word} end)

    IO.puts("\nTop Words per Topic:")

    topic_word_counts
    |> LDA.get_topic_word_distribution(@beta)
    |> Enum.with_index()
    |> Enum.each(fn {dist, topic_idx} ->
      top_words =
        dist
        |> Enum.with_index()
        # :desc is a stable sort, so words with equal probability keep ID order
        |> Enum.sort_by(fn {prob, _word_id} -> prob end, :desc)
        |> Enum.take(@top_words)
        |> Enum.map_join(", ", fn {prob, word_id} ->
          "#{Map.get(reverse_vocab, word_id)} (#{Float.round(prob, 3)})"
        end)

      IO.puts("Topic #{topic_idx}: #{top_words}")
    end)
  end
end

# Run the example
# Example.run()