-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFindEditDistanceMod.h
258 lines (230 loc) · 8.65 KB
/
FindEditDistanceMod.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
/*
* Copyright (C) 2010 University of Tartu
* Authors: Reina Käärik, Siim Orasmaa, Kristo Tammeoja, Jaak Vilo
* Contact: siim . orasmaa {at} ut . ee
*
* This file is part of Generalized Edit Distance Tool.
*
* Generalized Edit Distance Tool is free software: you can redistribute
* it and/or modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* Generalized Edit Distance Tool is distributed in the hope that it will
* be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Generalized Edit Distance Tool.
* If not, see <http://www.gnu.org/licenses/>.
*
*/
#ifndef FINDEDITDISTMOD_H
#define FINDEDITDISTMOD_H
#include "Trie.h"
#include "ARTrie.h"
#include "FileToTrie.h"
#include "Transformation.h"
#include "ShowTransformations.h"
#define min(x,y) (x > y ? y : x)
// Maximum number of possible match types
#define FP_MAX_POSITIONS 4
// Match types and corresponding indexes in the flag array
#define L_EMPTY 0
#define L_FULL 1
#define L_PREFIX 2
#define L_INFIX 3
#define L_SUFFIX 4
// Default values for add, replace and remove operations
extern double rep;
extern double rem;
extern double add;
// Tries containing generalized edit distance transformations for search
extern Trie *t;
extern ARTrie *addT;
extern ARTrie *remT;
// Tries containing generalized edit distance transformations for backtracing
extern Trie *traceT;
extern ARTrie *traceAddT;
extern ARTrie *traceRemT;
extern double lastBest;
// Mask of penalties for regular edit distance
extern double *changeSearchStringWithEd_pen;
// Mask of penalties for generalized edit distance
extern double *changeSearchStringWithGenEd_pen;
// if debug == 1, then debug will be printed
extern int debug;
/**
* Inserts \a value in \a table at position \a [row][col], but only
* if current value in given position is greater than inserted value.
*/
int addValueToTable(int cols, double table[][cols], int row, int col, double value);
/**
* Searches for transformations in "remove" trie, which can be applied
* from \a table position \c [i][j] onwards. Found transformations are
* applied, if they improve the result: make existing values in table
* smaller.
*/
int searchFromRemTrie(int cols, double table[][cols], wchar_t *string, int i, int j);
/**
* Searches for transformations in "add" trie, which can be applied
* from \a table position \c [i][j] onwards. Found transformations are
* applied, if they improve the result: make existing values in table
* smaller.
*/
int searchFromAddTrie(int cols, double table[][cols], wchar_t *string, int i , int j);
/**
* Searches for transformations in "replace" trie, which can be applied
* from \a table position \c [i][j] onwards. Found transformations are
* applied, if they improve the result: make existing values in table
* smaller.
*/
int searchFromRepTrie(int cols, double table[][cols], wchar_t *string1, wchar_t *string2, int i, int j);
/**
* Searches a replacement from list \a *n, which matches a prefix of
* \a *string. For each found match, the replacement is applied, if
* it improves the result: makes existing values in table smaller.
*/
int findReplacement(int cols, EndNode *n, double table[][cols], wchar_t *string, int i, int j, double value);
/**
* Returns a penalty value, which must be added to the cost of changing
* i-th position in the search string with regular edit distance.
*
* \param i position in search string
*/
double getPenaltOfChangingPos(int i);
/**
* Returns a penalty value, which must be added to the cost of changing
* i-th position in the search string with generalized edit distance.
*
* \param i position in search string
*/
double getPenaltOfChangingPosWithGenEd(int i);
/**
* Calculates regular edit distance between strings \a a and \a b.
* Cost of every operation is 1.0.
*
* \param a search string
* \param b text
* \param aLen length of a
* \param bLen length of b
*/
int editDistance(wchar_t* a, wchar_t* b,int aLen, int bLen);
/**
* Calculates generalized edit distance between full strings \a a
* and \a b without applying any penalties.
*
* If \a *transF \c != \c NULL , transformations are backtraced and
* stored into \a *transF .
*
* \param a search string
* \param b text
* \param aLen length of a
* \param bLen length of b
* \param transF object for storing tracebacks of the transformations
*/
double genEditDistance(wchar_t *a, wchar_t *b, int aLen, int bLen, Transformations *transF);
/**
* (A shortcut method)
* Calculates generalized edit distance between string \a a and
* a prefix of string \a b. The method also considers penalties
* of changing the search string.
*
* \param a search string
* \param b text
* \param aLen length of a
* \param bLen length of b
*/
double genEditDistance_prefix(wchar_t *a, wchar_t *b, int aLen, int bLen);
/**
* (A shortcut method)
* Calculates generalized edit distance between string \a a and
* a suffix of string \a b. The method also considers penalties
* of changing the search string.
*
* \param a search string
* \param b text
* \param aLen length of a
* \param bLen length of b
*/
double genEditDistance_suffix(wchar_t *a, wchar_t *b, int aLen, int bLen);
/**
* (A shortcut method)
* Calculates generalized edit distance between string \a a and
* an infix of string \a b. The method also considers penalties
* of changing the search string.
*
* \param a search string
* \param b text
* \param aLen length of a
* \param bLen length of b
*/
double genEditDistance_middle(wchar_t *a, wchar_t *b, int aLen, int bLen);
/**
* (A shortcut method)
* Calculates generalized edit distance between full strings \a a and
* \a b. The method is practically same as \a genEditDistance(), only
* difference is that penalties of changing the search string are also
* considered.
*
* \param a search string
* \param b text
* \param aLen length of a
* \param bLen length of b
*/
double genEditDistance_full(wchar_t *a, wchar_t *b, int aLen, int bLen);
/**
* Calculates generalized edit distance between strings \a a and \a b,
* allowing only a partial match with \a b (match can skip a prefix,
* suffix or a circumfix of \a b). For example, if \a isPrefix==1 and
* \a isSuffix==0, then match is calculated only with a prefix of
* \a b, while the suffix can be arbitary and does not effect the
* result.
* Penalties of changing the search string are also cosidered while
* calculation.
*
* For a shortcut methods, see \a genEditDistance_full(),
* \a genEditDistance_middle(), \a genEditDistance_suffix(),
* \a genEditDistance_prefix().
*
* \param a search string
* \param b text
* \param aLen length of a
* \param bLen length of b
* \param isPrefix 1 if prefix cannot be skipped, 0 if it can be skipped
* \param isSuffix 1 if suffix cannot be skipped, 0 if it can be skipped
*/
double genEditDistance_mod(wchar_t *a, wchar_t *b, int aLen, int bLen, short isPrefix, short isSuffix);
/**
*
* Calculates generalized edit distance between strings \a a and \a b,
* cosidering also penalties. Considers two kinds of penalties: penalties
* of changing the text (\a start_pen and \a end_pen) and penalties of
* changing the search string (see \a getPenaltOfChangingPos() and
* \a getPenaltOfChangingPosWithGenEd() ).
*
* Note that the term "penalties" might be misleading for \a start_pen
* and \a end_pen, as they are in this context rather "benefits". If they
* are both NULL, full (restricted) match is calculated, while if one of
* them is filled with 0.0 values, the restrictions are loosened, allowing
* to skip a prefix or a suffix of text while matching.
*
*
* \param a search string
* \param b text
* \param aLen length of a
* \param bLen length of b
* \param start_pen array of length bLen, start_pen[i] shows penalty
* for starting from text b at position i
* \param end_pen array of length bLen, end_pen[i] shows penalty for
* ending in text b at position i
*/
double genEditDistance_pens(wchar_t *a, wchar_t *b, int aLen, int bLen, double* start_pen, double* end_pen);
/**
* A debug method for printing generalized edit distance table with some additional
* information.
*/
void printTableWithChangingPenalties( wchar_t *a, wchar_t *b, int aLen, int bLen,
int rows, int cols, double table[rows][cols]);
#endif // FINDEDITDISTMOD_H