Skip to content

Commit

Permalink
remove preprocessing
Browse files — browse the repository at this point in the history
  • Loading branch information
ksew1 committed Jan 11, 2025
1 parent e31f605 commit 9d8aeb4
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 200 deletions.
136 changes: 29 additions & 107 deletions lib/scholar/feature_extraction/count_vectorizer.ex
Original file line number Diff line number Diff line change
@@ -1,97 +1,43 @@
defmodule Scholar.FeatureExtraction.CountVectorizer do
@moduledoc """
A `CountVectorizer` converts a collection of text documents to a matrix of token counts.
Each row of the matrix corresponds to a document in the input corpus, and each column corresponds to a unique token from the vocabulary of the corpus.
Supports also already indexed tensors.
A `CountVectorizer` converts already indexed collection of text documents to a matrix of token counts.
"""
import Nx.Defn

@derive {Nx.Container, containers: [:counts, :vocabulary]}
defstruct [:counts, :vocabulary]

binarize_schema = [
indexed_tensor: [
type: :boolean,
default: false,
doc: ~S"""
If set to true, it assumes the input corpus is already an indexed tensor
instead of raw text strings. This skips preprocessing and vocabulary creation.
Tensors needs to be homogeneous, so you can pad them with -1, and they will be ignored.
"""
]
]

@binarize_schema NimbleOptions.new!(binarize_schema)

@doc """
Processes the input corpus and generates a count matrix and vocabulary.
Generates a count matrix where each row corresponds to a document in the input corpus, and each column corresponds to a unique token in the vocabulary of the corpus.
This function performs:
The input must be a 2D tensor where:
* Each row represents a document.
* Each document has integer values representing tokens.
* tokenization of input text (splitting by whitespace and removing punctuation)
* vocabulary construction
* creation of a count tensor
## Options
#{NimbleOptions.docs(@binarize_schema)}
The same number represents the same token in the vocabulary. Tokens should start from 0 and be consecutive. Negative values are ignored, making them suitable for padding.
## Examples
iex> corpus = ["Elixir is amazing!", "Elixir provides great tools."]
iex> Scholar.FeatureExtraction.CountVectorizer.fit_transform(corpus)
%Scholar.FeatureExtraction.CountVectorizer{
counts: Nx.tensor(
[
[1, 1, 0, 1, 0, 0],
[0, 1, 1, 0, 1, 1]
]
),
vocabulary: %{
"amazing" => Nx.tensor(0),
"elixir" => Nx.tensor(1),
"great" => Nx.tensor(2),
"is" => Nx.tensor(3),
"provides" => Nx.tensor(4),
"tools" => Nx.tensor(5)
}
}
Input can optionally be an indexed tensor to skip preprocessing and vocabulary creation:
iex> t = Nx.tensor([[0, 1, 2], [1, 3, 4]])
iex> Scholar.FeatureExtraction.CountVectorizer.fit_transform(t, indexed_tensor: true)
%Scholar.FeatureExtraction.CountVectorizer{
counts: Nx.tensor([
iex> Scholar.FeatureExtraction.CountVectorizer.fit_transform(t)
Nx.tensor([
[1, 1, 1, 0, 0],
[0, 1, 0, 1, 1]
]),
vocabulary: %{}
}
])
With padding:
iex> t = Nx.tensor([[0, 1, -1], [1, 3, 4]])
iex> Scholar.FeatureExtraction.CountVectorizer.fit_transform(t)
Nx.tensor([
[1, 1, 0, 0, 0],
[0, 1, 0, 1, 1]
])
"""
deftransform fit_transform(corpus, opts \\ []) do
{tensor, vocabulary} =
if opts[:indexed_tensor] do
{corpus, %{}}
else
preprocessed_corpus = preprocess(corpus)
vocabulary = create_vocabulary(preprocessed_corpus)
tensor = create_tensor(preprocessed_corpus, vocabulary)
{tensor, vocabulary}
end

deftransform fit_transform(tensor) do
max_index = tensor |> Nx.reduce_max() |> Nx.add(1) |> Nx.to_number()

opts =
NimbleOptions.validate!(opts, @binarize_schema) ++
[max_index: max_index, vocabulary: vocabulary]
opts = [max_index: max_index]

fit_transform_n(tensor, opts)
end

defnp fit_transform_n(tensor, opts) do
check_for_rank(tensor)
counts = Nx.broadcast(0, {Nx.axis_size(tensor, 0), opts[:max_index]})

{_, counts} =
Expand All @@ -111,40 +57,16 @@ defmodule Scholar.FeatureExtraction.CountVectorizer do
{{i + 1, tensor}, counts}
end

%__MODULE__{
counts: counts,
vocabulary: opts[:vocabulary]
}
end

deftransformp preprocess(corpus) do
corpus
|> Enum.map(&String.downcase/1)
|> Enum.map(&String.split(&1, ~r/\W+/, trim: true))
counts
end

deftransformp create_vocabulary(preprocessed_corpus) do
preprocessed_corpus
|> List.flatten()
|> Enum.uniq()
|> Enum.sort()
|> Enum.with_index()
|> Enum.into(%{})
end

deftransformp create_tensor(preprocessed_corpus, vocabulary) do
indexed_sublist =
preprocessed_corpus
|> Enum.map(fn words ->
words
|> Enum.map(&Map.get(vocabulary, &1, :nan))
end)

max_length = indexed_sublist |> Enum.map(&length/1) |> Enum.max()

indexed_sublist
|> Enum.map(&Enum.concat(&1, List.duplicate(-1, max_length - length(&1))))
|> Nx.tensor()
|> Nx.as_type({:s, 64})
# Validates that the input has shape {num_documents, num_tokens} (rank 2).
# NOTE(review): inside a `defnp`, `Nx.rank/1` is known at tracing time, so this
# raise presumably fires while the defn expression is built rather than during
# device execution — confirm against Nx.Defn semantics.
# Raises `ArgumentError` with the offending shape for any non-rank-2 tensor.
defnp check_for_rank(tensor) do
if Nx.rank(tensor) != 2 do
raise ArgumentError,
"""
expected tensor to have shape {num_documents, num_tokens}, \
got tensor with shape: #{inspect(Nx.shape(tensor))}\
"""
end
end
end
109 changes: 16 additions & 93 deletions test/scholar/feature_extraction/count_vectorizer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,107 +4,30 @@ defmodule Scholar.Preprocessing.BinarizerTest do
doctest CountVectorizer

describe "fit_transform" do
test "fit_transform test - default options" do
result = CountVectorizer.fit_transform(["i love elixir", "hello world"])
test "fit_transform test" do
counts = CountVectorizer.fit_transform(Nx.tensor([[2, 3, 0], [1, 4, 4]]))

expected_counts =
Nx.tensor([
[1, 0, 1, 1, 0],
[0, 1, 0, 0, 1]
])
expected_counts = Nx.tensor([[1, 0, 1, 1, 0], [0, 1, 0, 0, 2]])

expected_vocabulary = %{
"elixir" => Nx.tensor(0),
"hello" => Nx.tensor(1),
"i" => Nx.tensor(2),
"love" => Nx.tensor(3),
"world" => Nx.tensor(4)
}

assert result.counts == expected_counts
assert result.vocabulary == expected_vocabulary
end

test "fit_transform test - removes interpunction" do
result = CountVectorizer.fit_transform(["i love elixir.", "hello, world!"])

expected_counts =
Nx.tensor([
[1, 0, 1, 1, 0],
[0, 1, 0, 0, 1]
])

expected_vocabulary = %{
"elixir" => Nx.tensor(0),
"hello" => Nx.tensor(1),
"i" => Nx.tensor(2),
"love" => Nx.tensor(3),
"world" => Nx.tensor(4)
}

assert result.counts == expected_counts
assert result.vocabulary == expected_vocabulary
end

test "fit_transform test - ignores case" do
result = CountVectorizer.fit_transform(["i love elixir", "hello world HELLO"])

expected_counts =
Nx.tensor([
[1, 0, 1, 1, 0],
[0, 2, 0, 0, 1]
])

expected_vocabulary = %{
"elixir" => Nx.tensor(0),
"hello" => Nx.tensor(1),
"i" => Nx.tensor(2),
"love" => Nx.tensor(3),
"world" => Nx.tensor(4)
}

assert result.counts == expected_counts
assert result.vocabulary == expected_vocabulary
assert counts == expected_counts
end

test "fit_transform test - already indexed tensor" do
result =
CountVectorizer.fit_transform(
Nx.tensor([
[2, 3, 0],
[1, 4, 4]
]),
indexed_tensor: true
)
test "fit_transform test - tensor with padding" do
counts = CountVectorizer.fit_transform(Nx.tensor([[2, 3, 0], [1, 4, -1]]))

expected_counts =
Nx.tensor([
[1, 0, 1, 1, 0],
[0, 1, 0, 0, 2]
])
expected_counts = Nx.tensor([[1, 0, 1, 1, 0], [0, 1, 0, 0, 1]])

assert result.counts == expected_counts
assert result.vocabulary == %{}
assert counts == expected_counts
end
end

test "fit_transform test - already indexed tensor with padding" do
result =
CountVectorizer.fit_transform(
Nx.tensor([
[2, 3, 0],
[1, 4, -1]
]),
indexed_tensor: true
)

expected_counts =
Nx.tensor([
[1, 0, 1, 1, 0],
[0, 1, 0, 0, 1]
])

assert result.counts == expected_counts
assert result.vocabulary == %{}
# fit_transform/1 accepts only rank-2 tensors ({num_documents, num_tokens});
# a 1-D input must raise ArgumentError with the offending shape in the message.
describe "errors" do
test "wrong input rank" do
assert_raise ArgumentError,
"expected tensor to have shape {num_documents, num_tokens}, got tensor with shape: {3}",
fn ->
CountVectorizer.fit_transform(Nx.tensor([1, 2, 3]))
end
end
end
end

0 comments on commit 9d8aeb4

Please sign in to comment.