diff --git a/lib/scholar/feature_extraction/count_vectorizer.ex b/lib/scholar/feature_extraction/count_vectorizer.ex
new file mode 100644
index 00000000..8dcfff88
--- /dev/null
+++ b/lib/scholar/feature_extraction/count_vectorizer.ex
@@ -0,0 +1,111 @@
+defmodule Scholar.FeatureExtraction.CountVectorizer do
+  @moduledoc """
+  A `CountVectorizer` converts an already indexed collection of text documents to a matrix of token counts.
+  """
+  import Nx.Defn
+
+  opts_schema = [
+    max_token_id: [
+      # 0 is a valid token id, so a single-token vocabulary must be accepted.
+      type: :non_neg_integer,
+      required: true,
+      doc: ~S"""
+      Maximum token id in the input tensor.
+      """
+    ]
+  ]
+
+  @opts_schema NimbleOptions.new!(opts_schema)
+
+  @doc """
+  Generates a count matrix where each row corresponds to a document in the input corpus,
+  and each column corresponds to a unique token in the vocabulary of the corpus.
+
+  The input must be a 2D tensor where:
+
+    * Each row represents a document.
+    * Each document has integer values representing tokens.
+
+  The same number represents the same token in the vocabulary. Tokens should start from 0
+  and be consecutive. Negative values are ignored, making them suitable for padding.
+
+  The result has shape `{num_documents, max_token_id + 1}`.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Examples
+
+      iex> t = Nx.tensor([[0, 1, 2], [1, 3, 4]])
+      iex> Scholar.FeatureExtraction.CountVectorizer.fit_transform(t, max_token_id: Scholar.FeatureExtraction.CountVectorizer.max_token_id(t))
+      Nx.tensor([
+        [1, 1, 1, 0, 0],
+        [0, 1, 0, 1, 1]
+      ])
+
+  With padding:
+
+      iex> t = Nx.tensor([[0, 1, -1], [1, 3, 4]])
+      iex> Scholar.FeatureExtraction.CountVectorizer.fit_transform(t, max_token_id: Scholar.FeatureExtraction.CountVectorizer.max_token_id(t))
+      Nx.tensor([
+        [1, 1, 0, 0, 0],
+        [0, 1, 0, 1, 1]
+      ])
+  """
+  deftransform fit_transform(tensor, opts \\ []) do
+    fit_transform_n(tensor, NimbleOptions.validate!(opts, @opts_schema))
+  end
+
+  @doc """
+  Computes the max_token_id option from given tensor.
+
+  This function cannot be called inside `defn` (and it will raise
+  if you try to do so).
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> Scholar.FeatureExtraction.CountVectorizer.max_token_id(t)
+      2
+  """
+  def max_token_id(tensor) do
+    tensor |> Nx.reduce_max() |> Nx.to_number()
+  end
+
+  defnp fit_transform_n(tensor, opts) do
+    check_for_rank(tensor)
+    counts = Nx.broadcast(0, {Nx.axis_size(tensor, 0), opts[:max_token_id] + 1})
+
+    {_, counts} =
+      while {{i = 0, tensor}, counts}, Nx.less(i, Nx.axis_size(tensor, 0)) do
+        {_, counts} =
+          while {{j = 0, i, tensor}, counts}, Nx.less(j, Nx.axis_size(tensor, 1)) do
+            index = tensor[i][j]
+
+            # Negative ids are padding and contribute nothing to the counts.
+            counts =
+              if index < 0,
+                do: counts,
+                else: Nx.indexed_add(counts, Nx.stack([i, index]), 1)
+
+            {{j + 1, i, tensor}, counts}
+          end
+
+        {{i + 1, tensor}, counts}
+      end
+
+    counts
+  end
+
+  # The rank is known when the defn is traced, so it is validated with plain Elixir.
+  deftransformp check_for_rank(tensor) do
+    if Nx.rank(tensor) != 2 do
+      raise ArgumentError,
+            """
+            expected tensor to have shape {num_documents, num_tokens}, \
+            got tensor with shape: #{inspect(Nx.shape(tensor))}\
+            """
+    end
+  end
+end
diff --git a/test/scholar/feature_extraction/count_vectorizer_test.exs b/test/scholar/feature_extraction/count_vectorizer_test.exs
new file mode 100644
index 00000000..64fa6480
--- /dev/null
+++ b/test/scholar/feature_extraction/count_vectorizer_test.exs
@@ -0,0 +1,62 @@
+defmodule Scholar.FeatureExtraction.CountVectorizerTest do
+  use Scholar.Case, async: true
+  alias Scholar.FeatureExtraction.CountVectorizer
+  doctest CountVectorizer
+
+  describe "fit_transform" do
+    test "fit_transform test" do
+      tensor = Nx.tensor([[2, 3, 0], [1, 4, 4]])
+
+      counts =
+        CountVectorizer.fit_transform(tensor,
+          max_token_id: CountVectorizer.max_token_id(tensor)
+        )
+
+      expected_counts = Nx.tensor([[1, 0, 1, 1, 0], [0, 1, 0, 0, 2]])
+
+      assert counts == expected_counts
+    end
+
+    test "fit_transform test - tensor with padding" do
+      tensor = Nx.tensor([[2, 3, 0], [1, 4, -1]])
+
+      counts =
+        CountVectorizer.fit_transform(tensor, max_token_id: CountVectorizer.max_token_id(tensor))
+
+      expected_counts = Nx.tensor([[1, 0, 1, 1, 0], [0, 1, 0, 0, 1]])
+
+      assert counts == expected_counts
+    end
+
+    test "fit_transform test - vocabulary with a single token" do
+      tensor = Nx.tensor([[0, 0], [0, -1]])
+
+      counts = CountVectorizer.fit_transform(tensor, max_token_id: 0)
+
+      expected_counts = Nx.tensor([[2], [1]])
+
+      assert counts == expected_counts
+    end
+  end
+
+  describe "max_token_id" do
+    test "max_token_id test" do
+      tensor = Nx.tensor([[2, 3, 0], [1, 4, 4]])
+      assert CountVectorizer.max_token_id(tensor) == 4
+    end
+
+    test "max_token_id test - tensor with padding" do
+      tensor = Nx.tensor([[2, 3, 0], [1, 4, -1]])
+      assert CountVectorizer.max_token_id(tensor) == 4
+    end
+  end
+
+  describe "errors" do
+    test "wrong input rank" do
+      assert_raise ArgumentError,
+                   "expected tensor to have shape {num_documents, num_tokens}, got tensor with shape: {3}",
+                   fn ->
+                     CountVectorizer.fit_transform(Nx.tensor([1, 2, 3]), max_token_id: 3)
+                   end
+    end
+  end
+end