-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathstep_collapse_stringdist.Rd
115 lines (94 loc) · 3.79 KB
/
step_collapse_stringdist.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/collapse_stringdist.R
\name{step_collapse_stringdist}
\alias{step_collapse_stringdist}
\alias{tidy.step_collapse_stringdist}
\title{Collapse factor levels using stringdist}
\usage{
step_collapse_stringdist(
recipe,
...,
role = NA,
trained = FALSE,
distance = NULL,
method = "osa",
options = list(),
results = NULL,
columns = NULL,
skip = FALSE,
id = rand_id("collapse_stringdist")
)
}
\arguments{
\item{recipe}{A recipe object. The step will be added to the
sequence of operations for this recipe.}
\item{...}{One or more selector functions to choose which variables are
affected by the step. See \link[recipes:selections]{recipes::selections} for more details. For the \code{tidy}
method, these are not currently used.}
\item{role}{Not used by this step since no new variables are created.}
\item{trained}{A logical to indicate if the quantities for
preprocessing have been estimated.}
\item{distance}{Integer, the string distance threshold that determines which
levels are collapsed together. The threshold is inclusive, so \code{2} will
collapse levels that have a string distance between them of 2 or lower.}
\item{method}{Character, method for distance calculation. The default is
\code{"osa"}, see \link[stringdist:stringdist-metrics]{stringdist::stringdist-metrics}.}
\item{options}{List, other arguments passed to
\code{\link[stringdist:stringdist]{stringdist::stringdistmatrix()}} such as \code{weight}, \code{q}, \code{p}, and \code{bt}, that
are used for different values of \code{method}.}
\item{results}{A list denoting the way the labels should be collapsed. It is
stored here once this preprocessing step has been trained by \link[recipes:prep]{recipes::prep}.}
\item{columns}{A character string of variable names that will be populated
(eventually) by the \code{terms} argument.}
\item{skip}{A logical. Should the step be skipped when the
recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked
when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be
conducted on new data (e.g. processing the outcome variable(s)).
Care should be taken when using \code{skip = TRUE} as it may affect
the computations for subsequent operations.}
\item{id}{A character string that is unique to this step to identify it.}
}
\value{
An updated version of \code{recipe} with the new step added to the
sequence of existing steps (if any). For the \code{tidy} method, a tibble with
columns \code{terms} (the columns that will be affected), \code{from}, \code{to}, and \code{id}.
}
\description{
\code{step_collapse_stringdist()} creates a \emph{specification} of a recipe step that
will collapse factor levels that have a low stringdist between them.
}
\section{Tidying}{
When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble is returned with
columns \code{terms}, \code{from}, \code{to}, and \code{id}:
\describe{
\item{terms}{character, the selectors or variables selected}
\item{from}{character, the old levels}
\item{to}{character, the new levels}
\item{id}{character, id of this step}
}
}
\section{Case weights}{
The underlying operation does not allow for case weights.
}
\examples{
\dontshow{if (rlang::is_installed("stringdist")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
library(recipes)
library(tibble)
data0 <- tibble(
x1 = c("a", "b", "d", "e", "sfgsfgsd", "hjhgfgjgr"),
x2 = c("ak", "b", "djj", "e", "hjhgfgjgr", "hjhgfgjgr")
)
rec <- recipe(~., data = data0) \%>\%
step_collapse_stringdist(all_predictors(), distance = 1) \%>\%
prep()
rec \%>\%
bake(new_data = NULL)
tidy(rec, 1)
rec <- recipe(~., data = data0) \%>\%
step_collapse_stringdist(all_predictors(), distance = 2) \%>\%
prep()
rec \%>\%
bake(new_data = NULL)
tidy(rec, 1)
\dontshow{\}) # examplesIf}
}