Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RobustScaler #314

Merged
merged 1 commit into from
Dec 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions lib/scholar/options.ex
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,17 @@ defmodule Scholar.Options do
{:error, "expected 'beta' to be in the range [0, inf]"}
end
end

# Custom NimbleOptions validator for the `:quantile_range` option.
#
# Accepts a two-element tuple of numbers `{q_min, q_max}` that satisfies
# `0.0 < q_min < q_max < 100.0` and returns it wrapped in `{:ok, _}`.
# Anything else yields `{:error, message}` with the offending value inspected
# into the message.
def quantile_range({lower, upper} = value)
    when is_number(lower) and is_number(upper) and lower > 0.0 and upper > lower and
           upper < 100.0 do
  {:ok, value}
end

def quantile_range(value) do
  {:error,
   "expected :quantile_range to be a tuple {q_min, q_max} such that 0.0 < q_min < q_max < 100.0, got: #{inspect(value)}"}
end
end
149 changes: 149 additions & 0 deletions lib/scholar/preprocessing/robust_scaler.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
defmodule Scholar.Preprocessing.RobustScaler do
  @moduledoc ~S"""
  Scales features with statistics that are robust to outliers.

  The scaler subtracts the per-feature median and divides by the per-feature
  quantile range. By default that range is the IQR (interquartile range),
  i.e. the distance between the 1st quartile (25th percentile) and the
  3rd quartile (75th percentile).
  """

  import Nx.Defn

  @derive {Nx.Container, containers: [:medians, :iqr]}
  defstruct [:medians, :iqr]

  opts_schema = [
    quantile_range: [
      type: {:custom, Scholar.Options, :quantile_range, []},
      default: {25.0, 75.0},
      doc: """
      Quantile range as a tuple {q_min, q_max} defining the range of quantiles
      to include. Must satisfy 0.0 < q_min < q_max < 100.0.
      """
    ]
  ]

  @opts_schema NimbleOptions.new!(opts_schema)

  @doc """
  Computes the per-feature median and quantile range used for scaling.

  `tensor` must have shape `{num_samples, num_features}`; statistics are
  taken along the samples axis.

  ## Options

  #{NimbleOptions.docs(@opts_schema)}

  ## Return values

  Returns a struct with the following parameters:

    * `:iqr` - the range between the upper and lower quantiles of each
      feature (the interquartile range with the default options).

    * `:medians` - the median of each feature across samples.

  ## Examples

      iex> Scholar.Preprocessing.RobustScaler.fit(Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]]))
      %Scholar.Preprocessing.RobustScaler{
        medians: Nx.tensor([1, 0, 0]),
        iqr: Nx.tensor([1.0, 1.0, 1.5])
      }
  """
  deftransform fit(tensor, opts \\ []) do
    # Options are validated eagerly, before any numerical code is traced.
    validated_opts = NimbleOptions.validate!(opts, @opts_schema)
    fit_n(tensor, validated_opts)
  end

  defnp fit_n(tensor, opts) do
    check_for_rank(tensor)

    {lower_q, upper_q} = opts[:quantile_range]

    medians = Nx.median(tensor, axis: 0)

    # Sort once along the samples axis; both percentiles read from it.
    sorted = Nx.sort(tensor, axis: 0)
    spread = percentile(sorted, upper_q) - percentile(sorted, lower_q)

    %__MODULE__{medians: medians, iqr: spread}
  end

  @doc """
  Centers and scales `tensor` with the statistics of a fitted scaler.

  Each feature has its median subtracted and is then divided by its quantile
  range. Features whose range is zero are only centered.

  ## Examples

      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
      iex> scaler = Scholar.Preprocessing.RobustScaler.fit(t)
      %Scholar.Preprocessing.RobustScaler{
        medians: Nx.tensor([1, 0, 0]),
        iqr: Nx.tensor([1.0, 1.0, 1.5])
      }
      iex> Scholar.Preprocessing.RobustScaler.transform(scaler, t)
      #Nx.Tensor<
        f32[3][3]
        [
          [0.0, -1.0, 1.3333333730697632],
          [1.0, 0.0, 0.0],
          [-1.0, 1.0, -0.6666666865348816]
        ]
      >
  """
  defn transform(%__MODULE__{medians: medians, iqr: iqr}, tensor) do
    check_for_rank(tensor)
    scale(tensor, medians, iqr)
  end

  @doc """
  Fits a scaler on `tensor` and immediately applies it to the same tensor.

  Accepts the same options as `fit/2`.

  ## Examples

      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
      iex> Scholar.Preprocessing.RobustScaler.fit_transform(t)
      #Nx.Tensor<
        f32[3][3]
        [
          [0.0, -1.0, 1.3333333730697632],
          [1.0, 0.0, 0.0],
          [-1.0, 1.0, -0.6666666865348816]
        ]
      >
  """
  defn fit_transform(tensor, opts \\ []) do
    scaler = fit(tensor, opts)
    transform(scaler, tensor)
  end

  # Subtracts the medians and divides by the quantile range.
  defnp scale(tensor, medians, iqr) do
    # A zero range would divide by zero; substituting 1.0 leaves such
    # features centered but otherwise unscaled.
    divisor = Nx.select(iqr == 0, 1.0, iqr)
    centered = tensor - medians
    centered / divisor
  end

  # Returns the `p`-th percentile of every column of a tensor already sorted
  # along axis 0, linearly interpolating between the two neighbouring rows.
  defnp percentile(sorted_tensor, p) do
    last_row = Nx.axis_size(sorted_tensor, 0) - 1
    position = p / 100 * last_row

    floor_position = Nx.floor(position)
    row_below = Nx.as_type(floor_position, :s64)
    row_above = position |> Nx.ceil() |> Nx.as_type(:s64)

    values_below = Nx.take(sorted_tensor, row_below, axis: 0)
    values_above = Nx.take(sorted_tensor, row_above, axis: 0)

    # The fractional part of the position is the weight of the upper row.
    fraction = position - floor_position
    values_below * (1.0 - fraction) + values_above * fraction
  end

  # Raises unless the tensor is a rank-2 `{num_samples, num_features}` matrix.
  defnp check_for_rank(tensor) do
    if Nx.rank(tensor) != 2 do
      raise ArgumentError,
            """
            expected tensor to have shape {num_samples, num_features}, \
            got tensor with shape: #{inspect(Nx.shape(tensor))}\
            """
    end
  end
end
116 changes: 116 additions & 0 deletions test/scholar/preprocessing/robust_scaler_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
defmodule Scholar.Preprocessing.RobustScalerTest do
  # Tests for Scholar.Preprocessing.RobustScaler: numerical results of
  # fit_transform/2 on small hand-checkable inputs, plus the errors raised for
  # wrongly shaped tensors and invalid :quantile_range values.
  use Scholar.Case, async: true
  alias Scholar.Preprocessing.RobustScaler
  doctest RobustScaler

  describe "fit_transform" do
    test "applies scaling to data" do
      data = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])

      expected =
        Nx.tensor([
          [0.0, -1.0, 1.3333333333333333],
          [1.0, 0.0, 0.0],
          [-1.0, 1.0, -0.6666666666666666]
        ])

      assert_all_close(RobustScaler.fit_transform(data), expected)
    end

    test "applies scaling to data with custom quantile range" do
      data = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])

      expected =
        Nx.tensor([
          [0.0, -0.7142857142857142, 1.0],
          [0.7142857142857142, 0.0, 0.0],
          [-0.7142857142857142, 0.7142857142857142, -0.5]
        ])

      assert_all_close(
        RobustScaler.fit_transform(data, quantile_range: {10, 80}),
        expected
      )
    end

    test "handles constant data (all values the same)" do
      # Every quantile range is zero here, so the data is only centered.
      data = Nx.tensor([[5, 5, 5], [5, 5, 5], [5, 5, 5]])
      expected = Nx.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])

      assert_all_close(RobustScaler.fit_transform(data), expected)
    end

    test "handles already scaled data" do
      data = Nx.tensor([[0, -1, 1], [1, 0, 0], [-1, 1, -1]])
      expected = data

      assert_all_close(RobustScaler.fit_transform(data), expected)
    end

    test "handles single-row tensor" do
      data = Nx.tensor([[1, 2, 3]])
      expected = Nx.tensor([[0.0, 0.0, 0.0]])

      assert_all_close(RobustScaler.fit_transform(data), expected)
    end

    test "handles single-column tensor" do
      data = Nx.tensor([[1], [2], [3]])
      expected = Nx.tensor([[-1.0], [0.0], [1.0]])

      assert_all_close(RobustScaler.fit_transform(data), expected)
    end

    test "handles data with negative values only" do
      data = Nx.tensor([[-5, -10, -15], [-15, -5, -20], [-10, -15, -5]])

      expected =
        Nx.tensor([
          [1.0, 0.0, 0.0],
          [-1.0, 1.0, -0.6666666666666666],
          [0.0, -1.0, 1.3333333333333333]
        ])

      assert_all_close(RobustScaler.fit_transform(data), expected)
    end

    test "handles data with extreme outliers" do
      data = Nx.tensor([[1, 2, 3], [1000, 2000, 3000], [-1000, -2000, -3000]])

      expected =
        Nx.tensor([[0.0, 0.0, 0.0], [0.999, 0.999, 0.999], [-1.001, -1.001, -1.001]])

      assert_all_close(
        RobustScaler.fit_transform(data),
        expected
      )
    end
  end

  describe "errors" do
    test "wrong input rank for fit" do
      assert_raise ArgumentError,
                   "expected tensor to have shape {num_samples, num_features}, got tensor with shape: {1, 1, 1}",
                   fn ->
                     RobustScaler.fit(Nx.tensor([[[1]]]))
                   end
    end

    test "wrong input rank for transform" do
      assert_raise ArgumentError,
                   "expected tensor to have shape {num_samples, num_features}, got tensor with shape: {1, 1, 1}",
                   fn ->
                     RobustScaler.fit(Nx.tensor([[1]]))
                     |> RobustScaler.transform(Nx.tensor([[[1]]]))
                   end
    end

    test "wrong quantile range" do
      # The tensor is a valid rank-2 input so that the option is the only
      # thing wrong and the test does not rely on validation order.
      assert_raise NimbleOptions.ValidationError,
                   "invalid value for :quantile_range option: expected :quantile_range to be a tuple {q_min, q_max} such that 0.0 < q_min < q_max < 100.0, got: {10, 800}",
                   fn ->
                     RobustScaler.fit(Nx.tensor([[1]]), quantile_range: {10, 800})
                   end
    end

    test "rejects boundary and non-increasing quantile ranges" do
      # The bounds are strict (0 and 100 are excluded) and q_min must be
      # strictly smaller than q_max.
      for range <- [{0, 50}, {50, 100}, {75, 25}, {50, 50}] do
        assert_raise NimbleOptions.ValidationError, fn ->
          RobustScaler.fit(Nx.tensor([[1]]), quantile_range: range)
        end
      end
    end
  end
end
Loading