Skip to content

Commit

Permalink
remove preprocessing
Browse files — browse the repository at this point in the history
  • Loading branch information
ksew1 committed Jan 11, 2025
1 parent e31f605 commit 9d8aeb4
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 200 deletions.
136 changes: 29 additions & 107 deletions lib/scholar/feature_extraction/count_vectorizer.ex
Original file line number Diff line number Diff line change
@@ -1,97 +1,43 @@
defmodule Scholar.FeatureExtraction.CountVectorizer do
@moduledoc """
A `CountVectorizer` converts a collection of text documents to a matrix of token counts.
Each row of the matrix corresponds to a document in the input corpus, and each column corresponds to a unique token from the vocabulary of the corpus.
Supports also already indexed tensors.
A `CountVectorizer` converts already indexed collection of text documents to a matrix of token counts.
"""
import Nx.Defn

@derive {Nx.Container, containers: [:counts, :vocabulary]}
defstruct [:counts, :vocabulary]

binarize_schema = [
indexed_tensor: [
type: :boolean,
default: false,
doc: ~S"""
If set to true, it assumes the input corpus is already an indexed tensor
instead of raw text strings. This skips preprocessing and vocabulary creation.
Tensors needs to be homogeneous, so you can pad them with -1, and they will be ignored.
"""
]
]

@binarize_schema NimbleOptions.new!(binarize_schema)

@doc """
Processes the input corpus and generates a count matrix and vocabulary.
Generates a count matrix where each row corresponds to a document in the input corpus, and each column corresponds to a unique token in the vocabulary of the corpus.
This function performs:
The input must be a 2D tensor where:
* Each row represents a document.
* Each document has integer values representing tokens.
* tokenization of input text (splitting by whitespace and removing punctuation)
* vocabulary construction
* creation of a count tensor
## Options
#{NimbleOptions.docs(@binarize_schema)}
The same number represents the same token in the vocabulary. Tokens should start from 0 and be consecutive. Negative values are ignored, making them suitable for padding.
## Examples
iex> corpus = ["Elixir is amazing!", "Elixir provides great tools."]
iex> Scholar.FeatureExtraction.CountVectorizer.fit_transform(corpus)
%Scholar.FeatureExtraction.CountVectorizer{
counts: Nx.tensor(
[
[1, 1, 0, 1, 0, 0],
[0, 1, 1, 0, 1, 1]
]
),
vocabulary: %{
"amazing" => Nx.tensor(0),
"elixir" => Nx.tensor(1),
"great" => Nx.tensor(2),
"is" => Nx.tensor(3),
"provides" => Nx.tensor(4),
"tools" => Nx.tensor(5)
}
}
Input can optionally be an indexed tensor to skip preprocessing and vocabulary creation:
iex> t = Nx.tensor([[0, 1, 2], [1, 3, 4]])
iex> Scholar.FeatureExtraction.CountVectorizer.fit_transform(t, indexed_tensor: true)
%Scholar.FeatureExtraction.CountVectorizer{
counts: Nx.tensor([
iex> Scholar.FeatureExtraction.CountVectorizer.fit_transform(t)
Nx.tensor([
[1, 1, 1, 0, 0],
[0, 1, 0, 1, 1]
]),
vocabulary: %{}
}
])
With padding:
iex> t = Nx.tensor([[0, 1, -1], [1, 3, 4]])
iex> Scholar.FeatureExtraction.CountVectorizer.fit_transform(t)
Nx.tensor([
[1, 1, 0, 0, 0],
[0, 1, 0, 1, 1]
])
"""
deftransform fit_transform(corpus, opts \\ []) do
{tensor, vocabulary} =
if opts[:indexed_tensor] do
{corpus, %{}}
else
preprocessed_corpus = preprocess(corpus)
vocabulary = create_vocabulary(preprocessed_corpus)
tensor = create_tensor(preprocessed_corpus, vocabulary)
{tensor, vocabulary}
end

deftransform fit_transform(tensor) do
max_index = tensor |> Nx.reduce_max() |> Nx.add(1) |> Nx.to_number()

opts =
NimbleOptions.validate!(opts, @binarize_schema) ++
[max_index: max_index, vocabulary: vocabulary]
opts = [max_index: max_index]

fit_transform_n(tensor, opts)
end

defnp fit_transform_n(tensor, opts) do
check_for_rank(tensor)
counts = Nx.broadcast(0, {Nx.axis_size(tensor, 0), opts[:max_index]})

{_, counts} =
Expand All @@ -111,40 +57,16 @@ defmodule Scholar.FeatureExtraction.CountVectorizer do
{{i + 1, tensor}, counts}
end

%__MODULE__{
counts: counts,
vocabulary: opts[:vocabulary]
}
end

deftransformp preprocess(corpus) do
corpus
|> Enum.map(&String.downcase/1)
|> Enum.map(&String.split(&1, ~r/\W+/, trim: true))
counts
end

deftransformp create_vocabulary(preprocessed_corpus) do
preprocessed_corpus
|> List.flatten()
|> Enum.uniq()
|> Enum.sort()
|> Enum.with_index()
|> Enum.into(%{})
end

deftransformp create_tensor(preprocessed_corpus, vocabulary) do
indexed_sublist =
preprocessed_corpus
|> Enum.map(fn words ->
words
|> Enum.map(&Map.get(vocabulary, &1, :nan))
end)

max_length = indexed_sublist |> Enum.map(&length/1) |> Enum.max()

indexed_sublist
|> Enum.map(&Enum.concat(&1, List.duplicate(-1, max_length - length(&1))))
|> Nx.tensor()
|> Nx.as_type({:s, 64})
# Validates that the input has shape {num_documents, num_tokens} (rank 2).
# NOTE(review): inside a `defnp`, `Nx.rank/1` is known at tracing time, so this
# raise presumably fires while the defn expression is built rather than during
# device execution — confirm against Nx.Defn semantics.
# Raises `ArgumentError` with the offending shape for any non-rank-2 tensor.
defnp check_for_rank(tensor) do
if Nx.rank(tensor) != 2 do
raise ArgumentError,
"""
expected tensor to have shape {num_documents, num_tokens}, \
got tensor with shape: #{inspect(Nx.shape(tensor))}\
"""
end
end
end
109 changes: 16 additions & 93 deletions test/scholar/feature_extraction/count_vectorizer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,107 +4,30 @@ defmodule Scholar.Preprocessing.BinarizerTest do
doctest CountVectorizer

describe "fit_transform" do
test "fit_transform test - default options" do
result = CountVectorizer.fit_transform(["i love elixir", "hello world"])
test "fit_transform test" do
counts = CountVectorizer.fit_transform(Nx.tensor([[2, 3, 0], [1, 4, 4]]))

expected_counts =
Nx.tensor([
[1, 0, 1, 1, 0],
[0, 1, 0, 0, 1]
])
expected_counts = Nx.tensor([[1, 0, 1, 1, 0], [0, 1, 0, 0, 2]])

expected_vocabulary = %{
"elixir" => Nx.tensor(0),
"hello" => Nx.tensor(1),
"i" => Nx.tensor(2),
"love" => Nx.tensor(3),
"world" => Nx.tensor(4)
}

assert result.counts == expected_counts
assert result.vocabulary == expected_vocabulary
end

test "fit_transform test - removes interpunction" do
result = CountVectorizer.fit_transform(["i love elixir.", "hello, world!"])

expected_counts =
Nx.tensor([
[1, 0, 1, 1, 0],
[0, 1, 0, 0, 1]
])

expected_vocabulary = %{
"elixir" => Nx.tensor(0),
"hello" => Nx.tensor(1),
"i" => Nx.tensor(2),
"love" => Nx.tensor(3),
"world" => Nx.tensor(4)
}

assert result.counts == expected_counts
assert result.vocabulary == expected_vocabulary
end

test "fit_transform test - ignores case" do
result = CountVectorizer.fit_transform(["i love elixir", "hello world HELLO"])

expected_counts =
Nx.tensor([
[1, 0, 1, 1, 0],
[0, 2, 0, 0, 1]
])

expected_vocabulary = %{
"elixir" => Nx.tensor(0),
"hello" => Nx.tensor(1),
"i" => Nx.tensor(2),
"love" => Nx.tensor(3),
"world" => Nx.tensor(4)
}

assert result.counts == expected_counts
assert result.vocabulary == expected_vocabulary
assert counts == expected_counts
end

test "fit_transform test - already indexed tensor" do
result =
CountVectorizer.fit_transform(
Nx.tensor([
[2, 3, 0],
[1, 4, 4]
]),
indexed_tensor: true
)
test "fit_transform test - tensor with padding" do
counts = CountVectorizer.fit_transform(Nx.tensor([[2, 3, 0], [1, 4, -1]]))

expected_counts =
Nx.tensor([
[1, 0, 1, 1, 0],
[0, 1, 0, 0, 2]
])
expected_counts = Nx.tensor([[1, 0, 1, 1, 0], [0, 1, 0, 0, 1]])

assert result.counts == expected_counts
assert result.vocabulary == %{}
assert counts == expected_counts
end
end

test "fit_transform test - already indexed tensor with padding" do
result =
CountVectorizer.fit_transform(
Nx.tensor([
[2, 3, 0],
[1, 4, -1]
]),
indexed_tensor: true
)

expected_counts =
Nx.tensor([
[1, 0, 1, 1, 0],
[0, 1, 0, 0, 1]
])

assert result.counts == expected_counts
assert result.vocabulary == %{}
# fit_transform/1 accepts only rank-2 tensors ({num_documents, num_tokens});
# a 1-D input must raise ArgumentError with the offending shape in the message.
describe "errors" do
test "wrong input rank" do
assert_raise ArgumentError,
"expected tensor to have shape {num_documents, num_tokens}, got tensor with shape: {3}",
fn ->
CountVectorizer.fit_transform(Nx.tensor([1, 2, 3]))
end
end
end
end

0 comments on commit 9d8aeb4

Please sign in to comment.