-
Notifications
You must be signed in to change notification settings - Fork 67
/
Copy pathApprox.pm
executable file
·1151 lines (825 loc) · 27.7 KB
/
Approx.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package Approx;
=head1 NAME
String::Approx - match and substitute approximately (aka fuzzy matching)
=head1 SYNOPSIS
use String::Approx qw(amatch asubstitute aregex);
=head1 DESCRIPTION
B<Approximate> is defined here as I<k-differences>. One I<difference>
is an insertion, a deletion, or a substitution of one character.
The I<k> in the I<k-differences> is the maximum number of differences.
For example I<1-difference> means that a match is found if there is
one character too many (insertion) or one character missing (deletion)
or one character changed (substitution). Those are I<exclusive or>s:
that is, I<not> one of each type of modification but I<exactly one>.
=head2 The default approximateness
The default approximateness is I<10 %> of the length of the
approximate pattern or I<at least 1>: I<0-differences> being the exact
matching which can be done very effectively using the usual Perl
function C<index()> or normal regular expression matching.
=head2 amatch
use String::Approx qw(amatch);
amatch("PATTERN");
amatch("PATTERN", @LIST);
amatch("PATTERN", [ @MODS ]);
amatch("PATTERN", [ @MODS ], @LIST);
The PATTERN is B<a string>, not a regular expression. The regular
expression metanotation (C<. ? * + {...,...} ( ) | [ ] ^ $ \w ...>)
will be understood as literal characters, that is, a C<*> means in
regex terms C<\*>, not I<"match 0 or more times">.
The LIST is the list of strings to match against the pattern.
If no LIST is given matches against C<$_>.
The MODS are the modifiers that tell how approximately to match. See
below for more detailed explanation. B<NOTE>: The syntax really is
C<[ @MODS ]>, the square brackets C<[ ]> must be in there and it is
B<not> a string, no quotes of any kind around the C<[ ]>. It is
an anonymous array, see L<perlref>. See below for MODS examples.
In scalar context C<amatch()> returns the number of successful
matches. In list context C<amatch()> returns the strings that
had matches.
Example:
use String::Approx qw(amatch);
open(WORDS, '/usr/dict/words') or die;
while (<WORDS>) {
print if amatch('perl');
}
or the same ignoring case:
use String::Approx qw(amatch);
open(WORDS, '/usr/dict/words') or die;
while (<WORDS>) {
print if amatch('perl', ['i']);
}
=head2 asubstitute
use String::Approx qw(asubstitute);
asubstitute("PATTERN", "SUBSTITUTION");
asubstitute("PATTERN", "SUBSTITUTION", @LIST);
asubstitute("PATTERN", "SUBSTITUTION", [ @MODS ]);
asubstitute("PATTERN", "SUBSTITUTION", [ @MODS ], @LIST);
The PATTERN is B<a string>, not a regular expression. The regular
expression metanotation (C<. ? * + {...,...} ( ) | [ ] ^ $ \w ...>)
will be understood as literal characters, that is, a C<*> means in
regex terms C<\*>, not I<"match 0 or more times">.
Also the SUBSTITUTION is B<a string>, not a regular expression. Well,
mostly. I<Most of the> regular expression metanotation (C<.>, C<?>,
C<*>, C<+>, ...) will be not understood as literal characters, that
is, a C<*> means in regex terms C<\*>, not I<"match 0 or more times">.
The understood notations are
=over 8
=item C<$`>
the part I<before> the approximate match
=item C<$&>
the approximately matched part
=item C<$'>
the part I<after> the approximate match
=back
The MODS are the modifiers that tell how approximately to match. See
below for more detailed explanation. B<NOTE>: Yes, the syntax is
really C<[ @MODS ]>, the square brackets C<[ ]> must be in there and
it is B<not> a string, no quotes of any kind around the C<[ ]>. It
is an anonymous array, see L<perlref>. See below for MODS examples.
The LIST is the list of strings to substitute against the pattern.
If no LIST is given substitutes against C<$_>.
In scalar context C<asubstitute()> returns the number of successful
substitutions. In list context C<asubstitute()> returns the strings
that had substitutions.
Examples:
use String::Approx qw(asubstitute);
open(WORDS, '/usr/dict/words') or die;
while (<WORDS>) {
print if asubstitute('perl', '($&)');
}
or the same ignoring case:
use String::Approx qw(asubstitute);
open(WORDS, '/usr/dict/words') or die;
while (<WORDS>) {
print if asubstitute('perl', '($&)', [ 'i' ]);
}
=head2 aregex
use String::Approx qw(aregex);
aregex("PATTERN");
aregex("PATTERN", [ @MODS ]);
B<Always returns a list>: the list of regular expressions that can be
used to approximately match the wanted pattern, given the possible
modifiers. If the pattern is long or the modifiers call for large
approximateness, the list will have more than one element. How to use
the element past the first one, is dependent on the intended
application of the regular expressions.
my (@regex) = aregex('Obi-wan Kenobi');
print $regex[0], "\n";
print $regex[1], "\n";
If the pattern is to be passed onto an external utility such as
C<grep(1)> the B<std> modifier may be useful. If that is applied, the
Perlisms C<(?:)> and <(?=)> are avoided.
my (@regex) = aregex('Obi-wan Kenobi', [ 'std' ]);
=head2 Modifiers
The MODS argument in amatch(), asubstitute(), and aregex(), is an
anonymous array (see L<perlref>) of strings that control the matching
of PATTERN. The first three modifiers, B<i>, B<g>, and B<?>, are the
usual regular expression match/substitute modifiers, the rest are
special for approximate matching/substitution.
=over 8
=item i
Match/Substitute ignoring case, case-insensitively.
=item g
Substitute I<globally>, that is, all the approximate matches, not just
the first one.
=item ?
Stingy match: instead of matching greedily, as much as possibly, match
as little as possible.
=item std
Used usually only with the C<aregex()> interface. Uses the C<()>
instead of the C<(?:)> and does not use the C<?=> in the result
pattern. Useful if the pattern is going to be fed to an external tool
that does not understand these Perl extensions. Produces shorter
regular expressions -- but they might match slower.
=item I<k>
The maximum number of differences.
For example 2.
=item II<k>
The maximum number of insertions.
For example 'I2'.
=item DI<k>
The maximum number of deletions.
For example 'D2'.
=item SI<k>
The maximum number of substitutions.
For example 'S2'.
=item I<k>%
The maximum relative number of differences.
For example '10%'.
=item II<k>%
The maximum relative number of insertions.
For example 'I5%'.
=item DI<k>%
The maximum relative number of deletions.
For example 'D5%'.
=item SI<k>%
The maximum relative number of substitutions.
For example 'S5%'.
=back
I<The regular expression modifiers> C<o m s x> I<are> B<not supported>
because their definitions for approximate matching are less than clear.
The relative number of differences is relative to the length of the
PATTERN, rounded up: if, for example, the PATTERN is C<'bouillabaise'>
and the MODS is C<['20%']> the I<k> becomes I<3>.
If you want to B<disable> a particular kind of difference you need
to explicitly set it to zero: for example C<'D0'> allows no deletions.
In case of conflicting definitions the later ones silently override,
for example:
[2, 'I3', 'I1']
equals
['I1', 'D2', 'S2']
=head1 EXAMPLES
The following examples assume the following template:
use String::Approx qw(amatch asubstitute);
open(WORDS, "/usr/dict/words") or die;
while (<WORDS>) {
# <---
}
and the following examples just replace the above 'C<# E<lt>--->' line.
=head2 Matching from the C<$_>
=over 8
=item Match 'perl' with one difference
print if amatch('perl');
The I<one difference> is automatically the result in this case because
first the rule of the I<10 %> of the length of the pattern ('C<perl>')
is used and then the I<at least 1> rule.
=item Match 'perl' with case ignored
print if amatch('perl', [ 'i' ]);
The case is ignored in matching (C<i>).
B<NOTE>: this option halves the speed.
=item Match 'perl' with one insertion
print if amatch('perl', [ '0', 'I1' ]);
The I<one insertion> is easiest achieved with first disabling any
approximateness (C<0>) and then enabling one insertion (C<I1>).
=item Match 'perl' with zero deletions
print if amatch('perl', [ 'D0' ]);
The I<zero deletion> is easily achieved with simply disabling any
deletions (C<D0>), the other types of differences, the insertions and
substitutions, are still enabled.
=item Stingy matching
print if amatch('perl', [ '?' ]);
Match stingily: as little is matched as possible, as opposed to the
default greedy matching, where as much is matched as possible.
=item Substitute 'perl' approximately with HTML emboldening
print if asubstitute('perl', '<B>$&</B>', [ 'g' ]);
All (C<g>) of the approximately matching parts of the input are
surrounded by the C<HTML> emboldening markup.
=item Stingy substitution
print if asubstitute('perl', '<B>$&</B>', [ '?' ]);
Substitution is now stingy: as little is substituted as possible,
as opposed to the default greedy substitution, where as much is
substituted as possible. When stingy the 'B<$&>' naturally
tends to be shorter than when greedy -- and the 'B<&`>' and
the 'B<$'>' respectively longer.
=back
=head2 Matching from a list
The above examples match against the default variable B<$_>.
The rest of the examples show how the match from a list.
The template is now:
use String::Approx qw(amatch asubstitute);
open(WORDS, "/usr/dict/words") or die;
@words = <words>;
# <---
and the examples still go where the 'C<# E<lt>--->' line is.
=over 8
=item Match 'perl' with one difference from a list
@matched = amatch('perl', @words);
The C<@matched> contains the elements of the C<@words> that matched
approximately.
=item Substitute 'perl' approximately with HTML emphasizing from a list
@substituted = asubstitute('perl', '<EM>$&</EM>', [ 'g' ], @words);
The C<@substituted> contains B<with all> (C<g>) B<the substitutions>
the elements of the C<@words> that matched approximately.
=back
=head1 DIAGNOSTICS
=over 8
=item amatch: $_ is undefined: what are you matching against?
=item asubstitute: $_ is undefined: what are you matching against?
These happen when you have nothing in C<$_> and try to C<amatch()> or
C<asubstitute()>. Perhaps you are using the Perl option C<-e> but you
did forget the Perl option C<-n>?
=item amatch: too long pattern.
This happens when the pattern is too long for matching.
When matching long patterns, C<String::Approx> attempts to partition
the match. In other words, it tries to do the matching incrementally
in smaller parts.
If this fails the above message is shown. Please try using shorter
match patterns.
See below for L<LIMITATIONS/Pattern length> for more detailed
explanation why this happens.
=item asubstitute: too long pattern.
This happens when the pattern is too long for substituting.
The partitioning scheme explained above that is used for matching long
patterns cannot, sadly enough, be used substituting.
Please try using shorter substitution patterns.
See below for L<LIMITATIONS/Pattern length> for more detailed
explanation why this happens.
=back
=head1 TIPS
=over 8
=item transposes
To match transposed letters (as in "trasnposed") use substitutions = 2.
=item case ignorance
Avoid this, the speed is halved.
=item stingy matching
Also this tends to be somewhat slower.
=back
=head1 VERSION
Version 2.7.
=head1 LIMITATIONS
=head2 Fixed pattern
The PATTERNs of C<amatch()> and C<asubstitute()> are fixed strings,
they are not regular expressions. The I<SUBSTITUTION> of
C<asubstitute()> is a bit more flexible than that but not by much.
=head2 Pattern length
The used approximate matching algorithm is B<very aggressive>. In
mathematical terms it is I<O(exp(n) * k**2)>. This means that
when the pattern length and/or the approximateness grows the
matching or substitution take much longer time and memory.
For C<amatch()> this can be avoided by I<partitioning> the pattern,
matching it in shorter subpatterns. This makes matching a bit slower
and a bit more fuzzier, more approximate. For C<asubstitute()> this
partitioning cannot be done, the absolute maximum for the substitution
pattern length is B<19> but sometimes, for example it the approximateness
is increased, even shorter patterns are too much. When this happens,
you must use shorter patterns.
=head2 Speed
We are still now at release 2.3 about 100 times slower than B<agrep>.
If you do not know what C<agrep> is: it is a program like the UNIX
grep for searching text from within files but it knows, among other
things, how to do approximate matching. B<NOTE>: all these speeds
were measured in one particular system using one particular set of
tests: your mileage will vary.
=head2 Incompatibilities with C<String::Approx> I<v1.*>
If you have been using regular expression modifiers (B<i>, B<g>) you
lose. Sorry about that. The syntax simply is not compatible. I had
to choose between having C<amatch()> match and C<asubstitute()>
substitute elsewhere than just in $_ I<and> the old messy way of
having an unlimited number of modifiers. The first need won.
B<There is a backward compability mode>, though, if you do not want to
change your C<amatch()> and C<asubstitute()> calls. You B<have> to
change your C<use> line, however:
use String::Approx qw(amatch compat1);
That is, you must add the C<compat1> symbol if you want to be
compatible with the C<String::Approx> version 1 call syntax.
=head1 AUTHOR
Jarkko Hietaniemi C<E<lt>[email protected]<gt>>
=head1 ACKNOWLEDGEMENTS
Alberto Fontaneda C<E<lt>[email protected]<gt>>
Dmitrij Frishman C<E<lt>[email protected]<gt>>
Lars Gregersen C<E<lt>[email protected]<gt>>
Helmut Jarausch C<E<lt>[email protected]<gt>>
Håkan Kjellerstrand C<E<lt>[email protected]<gt>>
Slaven Rezic C<E<lt>[email protected]<gt>>
Nathan Torkington C<E<lt>[email protected]<gt>>
Alberto Fontaneda and Dmitrij Frishman found a bug in long patterns,
suggested a test, and tested the patch.
Lars Gregersen saw String::Approx 2.2 and Håkan Kjellerstrand's MakeRegex
and that moment he experienced a genuine Eureka. The result: up to thirty
times faster String::Approx. (MakeRegex was used for completely other
purposes).
Helmut Jarausch noticed that 2.3 asubst() failed its test case in 5.004_50+.
Slaven Rezic found a bug in 2.5 and supplied a test case and a fix for it.
Nathan Torkington is to blame for the new API of release 2. :-)
=cut
#require 5;
use strict;
local $^W = 1;
use vars qw($PACKAGE $VERSION $compat1
@ISA @EXPORT_OK
%P @aL @dL @Pl %Pp);
$PACKAGE = 'String::Approx';
$VERSION = 2.7;
$compat1 = 0;
require Exporter;
@ISA = qw(Exporter);
@EXPORT_OK = qw(amatch asubstitute aregex);
# Catch the 'compat1' tag.
sub import {
my $this = shift;
my (@list, $sym);
foreach $sym (@_) { $sym eq 'compat1' ? $compat1 = 1 : push(@list, $sym) }
local $Exporter::ExportLevel = 1;
Exporter::import($this, @list);
}
sub _estimate {
my ($l, $m) = @_;
my $p = 5 ** ($m + 2);
# Trust me, I know what I am doing.
(3 * $p * $l ** 2 + (8 - $p) * $l - $p) / 8;
# That was before the prefix optimizer and the stinginess.
# Currently this is too aggressive (meaning too hasty
# partitioning), the aggressiveness is overestimated,
# both the optimizer and the stinginess should bring it down.
# One of these lifetimes I will derive the new correct formula.
}
sub _common_suffix_find {
my @list = @_;
my ($s, $t) = (0, -1);
my ($l, $c, $n, $min);
$min = length($list[0]);
foreach (@list[1..$#list]) {
$l = length;
$min = $l if $l < $min;
}
while ( $s < $min ) {
$c = substr($list[0], $t, 1);
# Wimp out on potential metacharacters.
last if $c =~ /[\.\)\]\?]$/;
$n = 1;
foreach (@list[1..$#list]) {
last unless substr($_, $t, 1) eq $c;
$n++;
}
last if $n < @list;
$s++;
$t--;
}
$t == -1 ? '' : substr($list[0], $t+1);
}
# The _common_prefix_optimize() borrows heavily for the
# MakeRegex module by Håkan Kjellerstrand <[email protected]>
# that from a list of words computes a fast regular expression
# that matches those words. Changed (?:...) added (as noted
# by Lars Gregersen <[email protected]>), added more
# [] instead of (), added lookahead (?=), added simple suffix
# optimization (most of this code is dead code for String::Approx,
# however, and is removed from here).
sub _common_prefix_optimize {
my ($std, $p, @list)=@_;
return "$p@list" if @list == 1;
my ( %hash, $prefix, @all );
foreach ( sort @list ) {
$prefix = substr( $_, 0, 1 );
push ( @{ $hash{ $prefix } }, length( ) > 1 ? substr( $_, 1 ) : '' );
}
my $question = 0;
foreach ( sort keys %hash ) { # Recurse.
my $comm = _common_prefix_optimize( $std, $_, @{ $hash{ $_ } } );
$question = 1 if $comm eq "";
push( @all, $comm );
}
my $paren = "";
if ( @all == 1 ) {
$paren = "@all";
} else {
my $maxlen = 0;
my $any = 0;
my $len;
foreach ( @all ) {
$len = length;
$maxlen = $len > $maxlen ? $len : $maxlen;
$any++ if /^\./;
}
my $mark = $question ? "?" : "";
my $join;
my $cando_lookahead = 0;
my $suffix = @all ? _common_suffix_find( @all ) : '';
# If we have pure-\w non-empty suffixes.
if ( @all > 1 && $maxlen > 1 && length $suffix && not $suffix =~ /\W/ ) {
my $sufflen = length $suffix;
foreach ( @all ) { # Remove the suffix.
substr( $_, -$sufflen ) = '';
}
# Very heavy manual optimization took place here.
# Two conditional branches out of three were removed.
# I may have been wrong. If that is the case,
# this will produce illegal regexps. That's life.
# The cases that were removed apply in the general
# case of doing prefix optimization, but not now.
@all = grep length, @all;
$join = "@all?$suffix";
} else {
my ( @l1, @lp );
if ( $maxlen > 1 ) {
@l1 = grep { length() == 1 } @all;
@lp = grep { length() > 1 } @all;
push( @lp, @l1 == 1 ? @l1 : "[" . join( "", @l1 ) . "]" ) if @l1;
} else {
@lp = @all;
}
$join = join( ($maxlen == 1) ? "" : "|", @lp );
$cando_lookahead = 1;
}
if ( length $join == 1 ) {
$paren = "$join$mark";
} else {
if ( $maxlen == 1 ) {
$paren = $any ? ".$mark" : "[$join]$mark";
} else {
$paren = $std ? "($join)$mark" : "(?:$join)$mark";
if ( $any == 0 && $cando_lookahead && not $std ) {
my %first;
foreach ( @all ) {
$first{ substr( $_,0,1 ) } = undef;
}
my $class = join( '', sort keys %first );
if ( length $class ) {
$class = "[$class]" if length $class > 1;
$paren = "(?=$class)$paren";
}
}
}
}
}
return "$p$paren";
}
sub _compile {
my ($pattern, $I, $D, $S, $greed, $std) = @_;
my ($j, $p, %p, %q, $l, $k, $mxm);
my @p = ();
$mxm = $I; # maximum modifier
$mxm = $D if ($D > $mxm);
$mxm = $S if ($S > $mxm);
$l = length($pattern);
# The estimated length of the resulting pattern must be less than 32767.
my $est = _estimate($l, $mxm);
if ($est > 32767) { # The magic limit of Perl.
my ( $a, $b, $i );
my ( $mp, $np );
$np = int( log( $l ) );
# compute and cache the partitions per length
unless (defined $Pl[$l][$mxm]) {
my ($sp, $fp, $gp);
$np = 2 if ($np < 2);
$sp = int($l / $np);
$fp = $l - $np * $sp;
$gp = $sp + $fp;
$mp = int($mxm / $np);
$mp = 1 if ($mp < 1);
$est = _estimate($gp, $mp);
while ($est > 32767) {
# same rule here as above about the length of the pattern.
$sp--;
$np = int($l / $sp);
$fp = $l - $np * $sp;
$gp = $sp + $fp;
$mp = int($mxm / $np);
$mp = 1 if ($mp < 1);
$est = _estimate($gp, $mp);
}
($a, $b) = (0, $sp + $fp);
push(@{$Pl[$l][$mxm]}, [$a, $b]);
$a += $fp;
$b = $sp;
for ($i = 1; $i < $np; $i++) {
$a += $sp;
push(@{$Pl[$l][$mxm]}, [$a, $b]);
}
} else {
$mp = int($mxm / $np);
}
my $pi = $I ? int($mp / $I + 0.9) : 0;
my $pd = $D ? int($mp / $D + 0.9) : 0;
my $ps = $S ? int($mp / $S + 0.9) : 0;
# compute and cache the pattern partitions
unless (defined $Pp{$pattern}[$mxm]) {
foreach $i (@{$Pl[$l][$mxm]}) {
push(@{$Pp{$pattern}[$mxm]},
[substr($pattern, $$i[0], $$i[1]), $pi, $pd, $ps]);
}
}
@p = @{$Pp{$pattern}[$mxm]};
} else {
push(@p, [$pattern, $I, $D, $S]);
}
my $i0 = 1; # The start index for the insertions.
my $pp; # The current partition.
foreach $pp (@p) { # The partition loop.
%p = ();
my ($i, $d, $s) = @$pp[1..4]; # The per-partition I, D, S.
$pp = $$pp[0]; # The partition string itself.
$p{$pp} = length($pp);
while ($i or $d or $s) {
%q = ();
# the insertions
if ($i) {
$i--;
while (($p, $l) = each %p) {
my $lp1 = $l + 1;
for ($j = $i0; $j < $l; $j++) {
$k = $p;
substr($k, $j) = '.' . substr($k, $j);
$q{$k} = $lp1;
}
}
# After the first partition we want one insertion
# before every partition - at index 0. $i0 was
# initialized before the partition loop as 1 and
# thus the first partition does not get the one insertion
# in front of it.
$i0 = 0;
}
# the deletions
if ($d) {
$d--;
while (($p, $l) = each %p) {
if ($l) {
my $lm1 = $l - 1;
for ($j = 0; $j < $l; $j++) {
$k = $p;
substr($k, $j) = substr($k, $j + 1);
$q{$k} = $lm1;
}
}
}
}
# the substitutions
if ($s) {
$s--;
while (($p, $l) = each %p) {
for ($j = 0; $j < $l; $j++) {
$k = $p;
substr($k, $j, 1) = '.';
$q{$k} = $l;
}
}
}
while (($k, $l) = each %q) { $p{$k} = $l }
}
# If stingy, clean leading and trailing ".".
unless ( $greed ) {
my (%c, $c);
foreach $c (keys %p) {
$c =~ s/^\.+//;
$c =~ s/\.+$//;
$c{$c} = undef;
}
%p = %c;
my @lit = sort { length($b) <=> length($a) } keys %p;
my ( $li, $lj, %sh );
for ( $li = 0; $li < @lit; $li++ ) {
for ( $lj = $li + 1; $lj < @lit; $lj++ ) {
if ( length $lit[ $lj ] < length $lit[ $li ]
and
$lj =~ /$li/ ) {
$sh{ $lj } = undef;
}
}
}
# Be pre-5.004-compatible.
foreach $c ( keys %sh ) {
delete $p{ $c };
}
}
# The pattern.
push(@{$P{$pattern}[$I][$D][$S][$greed][$std]},
_common_prefix_optimize($std, "", keys %p));
}
}
sub _mods {
my ($mods, $aI, $aD, $aS, $rI, $rD, $rS) = @_;
my $remods = '';
my $mod;
foreach $mod (@$mods) {
while ($mod ne '') {
if ($mod =~ s/^([IDS]?)(\d+)(%?)//) {
if ($1 ne '') {
if ($3 ne '') {
if ($1 eq 'I') { $$rI = 0.01 * $2 }
elsif ($1 eq 'D') { $$rD = 0.01 * $2 }
else { $$rS = 0.01 * $2 }
} else {
if ($1 eq 'I') { $$aI = $2 }
elsif ($1 eq 'D') { $$aD = $2 }
else { $$aS = $2 }
}
} else {
if ($3 ne '') {
$$rI = $$rD = $$rS = 0.01 * $2;
} else {
$$aI = $$aD = $$aS = $2;
}
}
} elsif ($compat1 and $mod =~ s/^([igmsxo])//) {
$remods .= $1;
} elsif ($mod =~ s/^([ig])//) {
$remods .= $1;
} elsif ($mod =~ s/^\?//) {
# Just accept it.
} elsif ($mod =~ s/^std//) {
# Just accept it.
} else {
die $PACKAGE, ": unknown modifier '$mod'\n";
}
}
}
$remods ne '' ? $remods : undef;
}
sub _mids {
my ($len, $aI, $aD, $aS, $rI, $rD, $rS) = @_;
my $r = int(0.1 * $len + 0.9);
if ( defined $rI) { $aI = int($rI * $len) }
elsif (not defined $aI) { $aI = $r }
if ( defined $rD) { $aD = int($rD * $len) }
elsif (not defined $aD) { $aD = $r }
if ( defined $rS) { $aS = int($rS * $len) }
elsif (not defined $aS) { $aS = $r }
($aI, $aD, $aS);
}
sub _amatch {
my ($pattern, $list) = @_;
my ($aI, $aD, $aS, $rI, $rD, $rS);
my $len = length($pattern);
my $remods;
my $greed = 1;
my $std = 0;
if (ref $$list[0] or $compat1) {
my $mods;
if ($compat1) {
$mods = [ $$list[0] ];
@$list = ();
} else {
$mods = shift(@$list);
}
$greed = 0 if grep { /\?/ } @$mods;
$std = 1 if grep { /std/ } @$mods;
$remods = _mods($mods, \$aI, \$aD, \$aS, \$rI, \$rD, \$rS);
($aI, $aD, $aS) = _mids($len, $aI, $aD, $aS, $rI, $rD, $rS);
} else {
$dL[$len] = int(0.1 * $len + 0.9) unless $dL[$len];
$aI = $aD = $aS = $dL[$len];
}
_compile($pattern, $aI, $aD, $aS, $greed, $std)
unless ref $P{$pattern}[$aI][$aD][$aS][$greed][$std];
( $len, $aD, $remods, @{$P{$pattern}[$aI][$aD][$aS][$greed][$std]} );
}
sub amatch {
my ( $pattern, @list ) = @_;
my ( $len, $aD, $remods, @mpat ) = _amatch( $pattern, \@list );
die "amatch: \$_ is undefined: what are you matching against?\n"
if (not defined $_ and @list == 0);
my $mpat;
@list = grep { length() > $len - $aD } @list if @list and $aD;
if (@mpat == 1) {
$mpat = $mpat[0];
$mpat = '(?' . $remods . ')' . $mpat if defined $remods;
if (@list) {
# match against the @list
my @m = eval { grep /$mpat/, @list };
die "amatch: too long pattern.\n" if ($@ =~ /regexp too big/);
return @m;
}
# match against the $_
my $matched;
eval { $matched = /$mpat/ };
die "amatch: too long pattern.\n" if ($@ =~ /regexp too big/);
return ($_) if $matched;
} else {
if (@list) {
# match against the @list