forked from samanthamccabe/didelphis-sca
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsca-manual.tex
1346 lines (1098 loc) · 75.8 KB
/
sca-manual.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
%!TEX program = xelatex
\documentclass[10pt,letterpaper]{article}
\usepackage{fontspec}
\usepackage{xunicode}
% Note: xunicode must be loaded AFTER fontspec (as it is here).
% \usepackage{pgf}
% \usepackage{tikz}
% \usetikzlibrary{arrows,automata,positioning}
\usepackage{verbatim}
\usepackage[left=1.25in,right=1.25in,top=1.25in,bottom=1.25in]{geometry}
% Dedicated face for IPA transcriptions; used via \textIPA{...}.
\newfontface\fontIPA{Gentium Plus Italic}
\newcommand\textIPA[1]{{\fontIPA #1}}
% Numbered verbatim example environment:
%   \begin{vex}{label} ... \end{vex}
% Prints "(n)" in a 3em box before the example and makes \ref{label}
% resolve to the example number.
\newcounter{excounter}
\newenvironment{vex}[1]{%
\vspace{1em}%
\refstepcounter{excounter}%
% trailing % guards suppress spurious spaces around the numbering box
\noindent\makebox[3em][l]{(\arabic{excounter}\label{#1})}%
\minipage{\textwidth}%
\verbatim
}{%
\endverbatim
\endminipage
\vspace{1em}
}
\renewcommand{\descriptionlabel}[1]{\hspace{\labelsep}\textsf{\bfseries#1}}
\title{Haedus Toolbox SCA, Manual (v0.7.0)}
\author{Samantha Fiona Morrígan McCabe}
\date{\today}
%TODO: should define a custom style for symbols to be rendered in gentium
% Exactly one \setmainfont call: a later call would override this one and
% silently drop the Gentium italic face selected via ItalicFont.
\setmainfont[Ligatures=Common,ItalicFont=Gentium Plus Italic]{Linux Libertine O}
\setsansfont{Linux Biolinum O}
\setmonofont[Scale=0.85]{DejaVu Sans Mono}
\begin{document}
\maketitle
\tableofcontents
\section*{Introduction}
\label{sec:introduction}
The Haedus Toolbox SCA is a flexible, script-driven sound-change applier implemented in Java. It supports UTF-8 natively and uses a rule syntax designed for clarity and similarity to sound change notation in linguistics.
The Haedus SCA supports capabilities like multiple rule conditions, regular expressions, metathesis, phonetic features, and scripting but also allows novice users to ignore advanced functionality they do not need.
This manual is divided into two main parts, plus an appendix. The first part provides a walkthrough of the SCA and its capabilities using a completed example rules file included with the program; the second part is a more detailed reference to the rule language and its syntax; the appendix provides details of implementation which are only likely to be of interest to more advanced users, or those interested in technical details.
% section: introduction (end)
% -----------------------------------------------------------------------------
\section{User Guide}
\label{sec:user_guide}
The following sections will guide you, as a new user, through using the SCA and creating a rules script, from prerequisites and executing provided example scripts, to writing basic rules, to using some of the more powerful supported capabilities.
% part: user_guide (end)
% -----------------------------------------------------------------------------
\subsection{Setup and Execution}
\label{sub:setup_and_execution}
This section describes the steps required to set up and run the Haedus SCA, either on Windows or Unix-like systems. The SCA is provided as an executable \texttt{.jar} file, which will require that the Java Runtime Environment (JRE 1.6 or later) be installed on your system.
% section: setup_and_execution (end)
% -----------------------------------------------------------------------------
\subsubsection{Java Runtime Environment}
\label{ssub:java_runtime_environment}
If you do not have a JRE installed, you will need to acquire an appropriate version from Oracle's website. If you have a JRE installed already, it must be Java 6 or later (because that version was released quite a few years ago at the time this manual is being prepared, it likely will be).
If everything is set up correctly, when you open a terminal window and type \texttt{java -version} you should see output like the following:
\begin{vex}{ex:javatest}
samantha@colossus: java -version
java version "1.7.0_79"
OpenJDK Runtime Environment (IcedTea 2.5.5) (7u79-2.5.5-0ubuntu0.14.04.2)
OpenJDK 64-Bit Server VM (build 24.79-b02, mixed mode)
\end{vex}
\noindent
On a Windows or Mac system, you will see something a bit different (like the JRE implementation), but the ``\texttt{java version}'' itself is what matters. If the \texttt{java} command is not recognized, your environment variables may not be set correctly, either because your \texttt{\$JAVA\_HOME} is not set correctly, or because Java is not available on your system path.
% subsection: java_runtime_environment (end)
% -----------------------------------------------------------------------------
\subsubsection{Running the Sound-Change Applier}
\label{ssub:running_sca}
While a graphical interface is in early development, the Haedus SCA is currently run best through the command line. A pair of scripts have been provided to streamline the process, \texttt{toolbox.bat} or \texttt{toolbox.sh} for Windows and Unix-like systems respectively. It is possible to run the \texttt{.jar} file directly, but because it includes the version number, this is rather more cumbersome.
Only basic knowledge of using the terminal is required, but if you are unfamiliar with it, there are ample resources provided online. If you want to avoid the terminal entirely, you should be able to (from your operating system's folder view) drag-and-drop a rule script onto the appropriate \texttt{toolbox} shell or batch script. Any errors produced will be sent to \texttt{toolbox.log}.
If you are using the terminal to run the SCA, you need to provide the paths of one or more rules files:
\begin{vex}{ex:runstandard}
toolbox.sh script1.rule script2.rule ... scriptN.rule
\end{vex}
\noindent
There are no other arguments to provide, only the script names. Rather than specifying input and output lexicons when starting the program, lexicons are read and written within the rules files themselves (section \ref{sub:scripting_capabilities}).
Logging is handled automatically; the same information is printed to the console and written to \texttt{toolbox.log} to help you debug any problems that might arise.
% subsection: running_sca (end)
% -----------------------------------------------------------------------------
\subsection{Writing Rule Scripts}
\label{sub:writing_rule_scripts}
All of the SCA's functionality is controlled using script files. In addition to rule and variable definitions, the scripts may also contain commands controlling the reading and writing of lexicons, setting normalization modes, loading feature models, and importing or executing commands from other script files. This section will introduce you to most of the supported features in the context of the provided example file \texttt{pie-pkk.rule}, which represents the rules for converting the provided Proto-Indo-European word list into a corresponding list for Kuma-Koban; the hope is that this will be a smooth way of introducing new users to the rule syntax. At this point and for the rest of the section, some of the examples will refer to the provided \texttt{pie-pkk.rule} example file.
The Haedus rule language is generally insensitive to whitespace, and while items in lists (sets, items in rule transforms, or variable definitions) are \emph{delimited} by whitespace, it is still insensitive to quantity\footnote{Specifically, lists are stripped of padding whitespace at the beginning and end, and split using the regex \texttt{\textbackslash\hspace{0pt}s+}}; this will be apparent in the examples.
Comments can always be created using the \texttt{\%} symbol, as shown below:
\begin{vex}{ex:comment}
% This is a comment (full-line)
ph th kh > f s x % this is a rule, followed by a comment (inline)
\end{vex}
\noindent
Anything following the \texttt{\%} will be ignored; there is not currently any way to create block comments, however.
% section: writing_rule_scripts (end)
%------------------------------------------------------------------------------
\subsubsection{Using Normalization}
\label{ssub:using_normalization}
Though it is by no means required that you set this yourself, the SCA's support for normalization modes can be a powerful tool. Not specifying any mode will leave all inputs untouched by default. To set a mode, use the \texttt{mode} keyword, followed by one of the four supported modes: \texttt{none}, \texttt{decomposition}, \texttt{composition}, and \texttt{intelligent}.
The \texttt{decomposition} and \texttt{composition} modes perform canonical decomposition, and canonical decomposition followed by canonical composition, respectively, according to the Unicode standard.\footnote{UAX 15: Unicode Normalization Forms ``http://unicode.org/reports/tr15/''} The short explanation is that decomposition will take composed Unicode characters with diacritics, and separate the base and diacritic characters in a consistent way, replacing a single composed character like ö with o + ◌̈. Composition does this as well, but then attempts to re-assemble them, if there are any composed characters available. For a sound-change applier, decomposition may be the most reliable, as it will ensure that you can always write rules which operate on the diacritics themselves.
% ◌
The intelligent segmentation mode is a bit different from either of these; it is superficially similar to \texttt{composition} but rather than using pre-composed Unicode characters, it uses Unicode character classes to distinguish between base and diacritic characters, and assemble strings it can treat as if they were single characters. This allows the SCA to treat \textIPA{q̇ʷʰ}, for instance, as a single character, rather than four. In brief, when it finds a non-diacritic character followed by one or more diacritics, they will be attached to the non-diacritic.
The details of this segmentation algorithm are described in appendix section \ref{sub:segmentation}, but if you develop a sense of how it works, it can make writing rules much simpler than they might otherwise be. One major advantage is that, using the \texttt{intelligent} mode, a rule which targets \textIPA{k} will not trigger on \textIPA{kʰ}. Of course, if you desire that behavior, then you can choose another normalization mode, or use none at all.
% subsection using_normalization (end)
%------------------------------------------------------------------------------
\subsubsection{Loading Lexicons}
\label{ssub:loading_lexicons}
One of the first things you will want to do when writing rules is to load a lexicon. This is done using the \texttt{open} keyword, followed by the path to a lexicon in single or double quotes, the optional\footnote{The \texttt{as} keyword was made optional because its presence can make the commands more fluent and clear to read, but it is not syntactically important, so it may be omitted for compactness} keyword \texttt{as} and a file-handle name by which you can reference the lexicon later. In the provided example file, you are loading \texttt{pie\_lexicon.txt} and binding it to the file-handle \texttt{LEXICON}, using the command
\begin{vex}{ex:openlexicon}
open 'pie_lexicon.txt' as LEXICON
\end{vex}
You can use \texttt{close} or \texttt{write} to write the lexicon's state to disk; \texttt{close} is the same as \texttt{write}, but also removes the file handle and lexicon from memory.\footnote{This should improve performance; each rule in a script is applied to each word in each lexicon open at the time of the rule's execution. Additionally, it should prevent you from accidentally writing to a lexicon you did not intend to have open.} It may be good practice to write the output commands as soon as you open a lexicon, to ensure you don't forget later. Both these commands use a similar, if reversed, syntax to loading lexicons, with the file-handle first, then the output path. You might start writing a rules file like the following:
\begin{vex}{ex:closelexicon}
open 'pie_lexicon.txt' as LEXICON
% Intervening rules go here
write LEXICON as 'late-pie_lexicon.txt'
% Intervening rules go here
close LEXICON as 'pkk_lexicon.txt'
\end{vex}
\noindent
Writing intermediate lexicons can be a useful way of debugging during development, and ensuring that your outputs look correct at intermediate stages.
Finally when opening a lexicon from disk, it will be read into memory and normalized using whatever mode was last defined. If you were to change modes \emph{after} loading a lexicon, it could lead to unexpected behavior. For example, if no mode is set when a lexicon is loaded, but intelligent segmentation is enabled prior to writing any rules, a rule containing \textIPA{pʰ} will not trigger on a word containing \textIPA{pʰ} because it will have been loaded as two separate characters \textIPA{p} and \textIPA{ʰ}.
% subsection loading_lexicons (end)
%------------------------------------------------------------------------------
\subsubsection{Reserving Characters}
\label{ssub:reserving_characters}
Though there are no uses of reserving characters in the Kuma-Koban examples, it can be useful to do, especially when you are not using intelligent segmentation (see section \ref{ssub:using_normalization}), or where your orthographic conventions may conflict in some way with it (such as if your language uses pre-aspirated stops, which you represent \textIPA{ʰt}).
You can use the \texttt{reserve} keyword to indicate which sequences of characters are intended to represent single sounds. Following the keyword, simply list the strings you wish to be treated this way, separated by whitespace:
\begin{vex}{ex:reserve}
reserve ph th kh kw kwh
\end{vex}
\noindent
This will ensure that a rule intended to target \textIPA{p} will not affect \textIPA{ph} and a rule intended to target \textIPA{kw} will not affect \textIPA{kwh}. Be mindful when writing rules; carelessly reserving sequences could lead to unexpected behavior in your outputs.
% section: reserving_characters (end)
% -----------------------------------------------------------------------------
\subsubsection{Defining Variables}
\label{ssub:defining_variables}
A common early task in the development process is defining variables you expect to use often. The assignment operator \texttt{=} binds a variable name on the left to a set of values on the right. Values can be either literal symbols, or other variables. The following is a representative example from the Kuma-Koban data and is typical of how variables are defined:
\begin{vex}{ex:basicvars}
H = x ʔ
N = m n
L = r l
R = N L
W = y w
\end{vex}
\noindent
Variables can be defined at any point in the script, and can also be re-defined at any point. Once a variable is defined a certain way, it will have that value for every subsequent reference until and unless it is changed. You can find more detailed information in section \ref{sub:variables}.
As mentioned previously, values must be either a sequence of terminals or a single variable. Any circumstance in which you might feel the need to do something like the following is probably better handled by a sequence of variables in a rule itself:
\begin{vex}{ex:badvars}
P = p pʼ
T = t tʼ
H = x ħ
Y = PH TH nT
\end{vex}
% section defining_variables (end)
% -----------------------------------------------------------------------------
\subsubsection{Writing Simple Rules}
\label{ssub:writing_simple_rules}
It is possible to write some rather complicated rules using the Haedus language, but it is generally not necessary. Most rules are fairly simple, and these are what will be discussed here.
This is the first rule you will see in the Kuma-Koban example:
\begin{vex}{ex:simplerule}
y w > i u
\end{vex}
\noindent
The intention is to change every \textIPA{y} to an \textIPA{i} and every \textIPA{w} to a \textIPA{u}, regardless of context. The first set of rules in the Kuma-Koban example are like this; simple rules for making orthographic corrections in the raw data.
Though it is used less often, deletion is a useful feature of rules. The following example shows how to do this:
\begin{vex}{ex:simplerule3}
% Delete morpheme boundary marking
- > 0
\end{vex}
\noindent
Changing anything into the literal zero character \texttt{0} will cause it to be deleted. In this case, the rule will delete all hyphens in the lexicon (\emph{i.e.} remove morpheme boundaries).
It is also possible to \texttt{insert} segments in a similar fashion:
\begin{vex}{ex:insertion}
0 > e / #_sC
\end{vex}
\noindent
There are several things to note however: when inserting segments, there can only be one \texttt{0} on the left, and one literal sequence on the right; additionally, insertion \emph{requires} a condition (see section \ref{ssub:writing_conditions} for how to do this). Failing to obey the required syntax here will produce a compilation error.
Like other SCAs, this one allows you to transform from one variable to another, provided they have the same number of elements. For instance, Kuma-Koban transforms labiovelars into plain labials using the following rule:
\begin{vex}{ex:variabletransform1}
Q = kʷʰ kʷ gʷ
P = pʰ p b
% ... intervening rules ... %
Q > P
\end{vex}
\noindent
This will change \textIPA{kʷʰ} to \textIPA{pʰ}, \textIPA{kʷ} to \textIPA{p}, and \textIPA{gʷ} to \textIPA{b}. It is also possible to write rules like:
\begin{vex}{ex:variabletransform2}
iH uH > ī ū
\end{vex}
\noindent
which will change any sequence of \textIPA{i} and any element of \texttt{H} to \textIPA{ī}, and do the same for \textIPA{u}. Changing a variable to a literal will change every element matched by the variable into that literal sequence.
% section: writing_simple_rules (end)
% -----------------------------------------------------------------------------
\subsubsection{Writing Conditions}
\label{ssub:writing_conditions}
While the rules we've discussed so far have not needed to use conditions, the majority of rules you might ever find yourself writing will. One of the simplest possible conditions is seen in the following rule:
\begin{vex}{ex:simplerule2}
% For correct handling of negation prefix
n- > nˌ / #_
\end{vex}
\noindent
If you are familiar with the standard formalisms for discussing sound change academically, most of this notation will be familiar. The forward-slash \texttt{/} indicates the start of the condition, and the underscore \texttt{\_} indicates where the ``source'' symbol (any of the elements on the left side of the \texttt{>} operator) occurs in relation to the rest of the condition.
%Good conditions are where an SCA's real power lies. In this rule language, there is a lot to know about writing conditions, and that is detailed in section \ref{sub:conditions}; however, we will discuss these in a more organic fashion, as they appear in the Kuma-Koban example, for the benefit of new users.
There is a very common rule in Indo-European languages where \textIPA{*e} is changed to \textIPA{*a} in the environment of \textIPA{*h₂} and possibly \textIPA{*h₄} if there ever was such a thing\footnote{where distinguishing between these is not possible, \textIPA{*hₐ} is used, at least in those sources which admit a \textIPA{*h₄}}. For convenience, these are simply converted to consonants \textIPA{x} and \textIPA{ʕ}.
Expressed verbally, we might say ``\textIPA{e} changes to \textIPA{a} before or after \textIPA{x} or \textIPA{ʕ}''. One way to write this is with four rules:
\begin{vex}{ex:fourlaryngealrule}
E > A / _x
E > A / _ʕ
E > A / x_
E > A / ʕ_
\end{vex}
\noindent
If this seems sub-optimal, there are two ways we can condense the number of individual rules. First, we can cut the number of rules in half by using \emph{sets}:
\begin{vex}{ex:laryngealrulesets}
E > A / _{x ʕ}
E > A / {x ʕ}_
\end{vex}
\noindent
A set is just a list of elements, separated by whitespace, and contained within a pair of curly braces \texttt{\{} and \texttt{\}}\footnote{Padding is also permitted around the set elements, so writing \texttt{\{ x ʕ \}} is legal and equivalent to \texttt{\{x ʕ\}}}. Functionally, it's nearly the same as defining these in a variable beforehand, but without the need to actually use a separate command to do it.
There is another tool we can use to combine both rules into one, and that is the \texttt{or} keyword:
\begin{vex}{ex:fourrules}
% e to a, before or after x or ʕ
E > A / {x ʕ}_ or _{x ʕ}
\end{vex}
\noindent
Using \texttt{or} allows a rule to have multiple conditions, any one of which can trigger the rule. If you find you have the same change being made under several separate conditions, you can write them as a single rule using \texttt{or}. But only do this judiciously, as it can substantially impair the legibility of your rules, especially if the conditions are complicated. Throughout the Kuma-Koban rules you will see places where this is used, as well as places it was not but could have been.
% section writing_conditions (end)
% -----------------------------------------------------------------------------
\subsubsection{Writing Advanced Conditions}
\label{ssec:writing_advanced_conditions}
If you browse through the Kuma-Koban example rules, you will see that most of them can be described using only what has been covered so far. There will inevitably be cases you cannot cover with these techniques alone however.
One of the most powerful capabilities supported by this SCA is that it allows the use of regular expressions in rule conditions. This is discussed in detail in section \ref{ssub:expressions} but can be addressed here in the context of how regular expressions can be used to accomplish particular tasks.
Some Indo-European languages exhibit a change known as Grassmann's law, where an aspirated stop is de-aspirated if another aspirate occurs after it (this is certainly true within a root or stem, but may apply to aspirates in suffixes as well). The following example is intended only to apply to aspirates which occur in adjacent syllables, or within the same syllable:
\begin{vex}{ex:grassman}
% Grassman's Law
bʰ dʰ gʰ gʷʰ > b d g gʷ / _{R W}?VV?C*{bʰ dʰ gʰ gʷʰ}
\end{vex}
\noindent
The two new symbols here are \texttt{?} and \texttt{*}. A third, \texttt{+}, is also possible. If you are familiar with regular expressions, their meanings are what you would expect:
\begin{description}
\itemsep1pt \parskip0pt \parsep0pt
\item[\texttt{?}] Matches the preceding expression zero or one times
\item[\texttt{*}] Matches the preceding expression zero or more times
\item[\texttt{+}] Matches the preceding expression one or more times
\end{description}
\noindent
These are generally called \emph{quantifiers}, as they indicate how many of a symbol can be matched.\footnote{These are always greedy, and will match the \emph{longest} possible sequence, though because they are only used for detecting conditions, this is less of a problem than it is when, for example, using regular expressions to find-and-replace.} Representing the rule in example \ref{ex:grassman} without using regular expressions would require a minimum of 12 conditions. In most cases you will only ever need to use \texttt{?} to make a symbol or variable optional. The Kuma-Koban example only uses any of these in 5 total rules.
Under some circumstances in Indo-European languages, laryngeals could cause a preceding short vowel to become long; in Kuma-Koban, this is represented using the following rule:
\begin{vex}{ex:lengthening}
VS > VL / _H{C # {I U W}?V}
\end{vex}
\noindent
There are two things here worth taking special note of: first, that you can place a word-boundary \texttt{\#} inside a set; second, that you can also place a \emph{set} within an element of a set. In fact, this rule was originally developed as two separate rules:
\begin{vex}{ex:lenthening_old}
VS > VL / _H{C #}
VS > VL / _H{I U W}?V
\end{vex}
\noindent
This may actually be slightly clearer in its intent, without having to be unpacked. Both simply express that the ``source'' should be followed by \texttt{H}, itself followed by one of several other expressions (\texttt{C}, \texttt{\#}, or \texttt{\{I U W\}?V}); this intent is matched by the semantics of sets, and the fact that one of the other expressions also contains a set is not relevant.
Apart from the familiar quantifiers, two more standard elements of regular expressions are also supported here: the dot character \texttt{.} which will match any single symbol\footnote{Note that this does not say \emph{character}; if you are using normalization, or have reserved strings, ``character'' and ``symbol'' will not have a one-to-one correspondence.} and groups enclosed by parentheses \texttt{()}. Both can be combined with the quantifiers, of course. Groups are mainly used if you wish to make a sequence of expressions repeatable or optional, and are especially useful for writing syllabification rules, or accent patterns.
% section writing_advanced_conditions (end)
% -----------------------------------------------------------------------------
% PART II =====================================================================
\section{Haedus Rule Language \& Syntax}
\label{sec:scripts_and_syntax}
This section describes the commands supported by the SCA and their syntax and semantics. It attempts to be as detailed as possible with informative examples and, in some cases, provides notes on implementation.
Scripts and lexicons are, by default, read in as UTF-8; it is not currently possible to change this. One substantial difference between this SCA and others is that these rule files are \emph{compiled} rather than merely interpreted; as script commands are read in, they are validated and parsed to objects in memory. This has several advantages, namely that because compilation happens once, rules are not repeatedly re-interpreted for each word; it also allows that errors can be caught immediately at compile time, rather than at runtime.
In this rule language generally, the contents of lists are whitespace-separated (the space character, or tab) and quantity-insensitive (one space is treated the same as two), so you can use extra spaces or tabs to make columns align, as you will see throughout the examples. This is also true of the padding around most operator symbols (\emph{viz.} \texttt{= > /} )
%While whitespace is used to separate items in lists, padding around operators and delimiters is optional. As elsewhere the quantity is not important.
Script files may contain comments, starting with \texttt{\%}, and may be placed at the start of a line, or in-line; in either case, anything to the right of the comment symbol is ignored.
The following characters have special meanings in the SCA script language and should only be used in the contexts they are expected:
\begin{vex}{ex:reserved}
% # $ * ? + ! ( ) { } [ ] 0 . _ = / >
\end{vex}
\noindent
Most of these are sensitive to context, but \texttt{=}, \texttt{/}, and \texttt{>} are restricted and using them in inappropriate ways will cause the script to fail to compile.\footnote{This is because these symbols in particular are used to identify variable definitions and transformation rules. Specifically, a line which contains \texttt{=} is assumed by the parser to be a variable definition. Likewise, a line which contains \texttt{>} is assumed to be a rule definition.} If a section indicates that a symbol on this list is allowed, then it is allowed in that context but should still be avoided in others.
The square brackets \texttt{[]} warrant some additional explanation, as they are reserved \emph{per se} but are treated specially. Anything contained between square brackets will be automatically parsed as a single sequence; that is, any time the SCA parser encounters \texttt{[} it will automatically jump to the next closing \texttt{]} and treat everything in between as a single sequence, just as it would a user-reserved sequence, or variable name. Square brackets are also used to delimit feature arrays in future versions of the SCA.
% section: scripts_and_syntax
% -----------------------------------------------------------------------------
\subsection{Scripting Capabilities}
\label{sub:scripting_capabilities}
The script syntax also allows the user to do things like read and write lexicons from disk, import or execute other script files, and set normalization and formatting modes within a rules file.
% subsection: scripting_capabilities
% -----------------------------------------------------------------------------
\subsubsection{Reading \& Writing Lexicons}
\label{ssub:reading_and_writing_lexicons}
These commands are used to read and write lexicons from disk. Once a lexicon is in memory, any sound changes run in the script will be applied to all open lexicons. There are three commands of this type:
\begin{description}
\itemsep1pt \parskip0pt \parsep0pt
\item[\texttt{open}] Reads a lexicon and binds it to a file-handle
\item[\texttt{write}] Retrieves data using a file-handle and writes it to disk.
\item[\texttt{close}] Like \texttt{write} but after writing, it will then unload the lexicon from memory and remove the file-handle it is bound to.
\end{description}
%
To open a lexicon, use the \texttt{open} command in the following way:
\begin{vex}{ex:open}
open "language.lex" as LANGUAGE
\end{vex}
\noindent
Lexicons are referenced by a file-handle, \texttt{LANGUAGE} in ex. \ref{ex:open}. The handle name must begin with a capital letter and can only contain capital letters, numbers, or the underscore.
The difference between \texttt{write} and \texttt{close} is that the former will write the lexicon, in its current state, to the specified location, but the handle will still be available and future changes will be applied; \texttt{close} will also write the lexicon to disk but remove it from memory, making the file-handle unavailable. These commands have the same syntax, simply substituting \texttt{write} for \texttt{close} in the following:
\begin{vex}{ex:close}
close LANGUAGE as "new_language.lex"
\end{vex}
\noindent
Lexicons are not automatically written or closed when the script completes, so if you open lexicons and forget to close them, their changes will be lost.
% subsubsection: reading_and_writing_lexicons (end)
% -----------------------------------------------------------------------------
\subsubsection{Import \& Execute Commands}
\label{ssub:import_and_execute_commands}
It is possible to use other script files using the \texttt{import} and \texttt{execute} commands. Using \texttt{import} will read the contents of another rule file, insert its contents into that position in the script, and compile them. Using \texttt{execute} will compile and run all these commands in the file immediately. The syntax is simple and is as follows:
\begin{vex}{ex:commands}
execute "other1.rule"
import "other2.rule"
\end{vex}
\noindent
The key difference is that \texttt{execute} will run the script separately (reading and writing lexicons, applying rules, calling other resources, and so on) while \texttt{import} places the script into your current script so that any lexicons or variables specified in the other file will be usable in the current script. Another important difference is that \texttt{import} reads in the script file at compile-time while \texttt{execute} does so only when the command is reached.
% section: import_and_execute_commands (end)
% -----------------------------------------------------------------------------
\subsubsection{Normalization \& Formatting}
\label{ssub:normalization_and_formatting}
Specifying a normalization mode allows for consistent handling of diacritics in data and rules, so it is important to consider which mode will be appropriate to your needs. Four modes are supported:
\begin{description}
\itemsep1pt \parskip0pt \parsep0pt
\item[\texttt{none}] Makes no changes to lexicon data or rules; all strings are left as-is.
\item[\texttt{composition}] Applies Canonical Composition according to the Unicode Standard Annex \#15.
\item[\texttt{decomposition}] Applies Canonical Decomposition according to the Unicode Standard Annex \#15.
\item[\texttt{intelligent}] Applies intelligent segmentation according to the method described in section \ref{sub:segmentation}.
\end{description}
To set the formatter while running in standard mode, use the keyword \texttt{mode} followed by a supported mode:
\begin{vex}{ex:normalization}
mode intelligent
\end{vex}
Because the formatting mode has an effect on the parsing of all forms of data, it is critical that you declare the format before loading any other resources or declaring any rule or variable statements.
It is possible to change the formatting mode anywhere in a rule file, but it is suggested that you not do this without an extremely good understanding of how this will impact the parsing of your data. Once a resource is loaded or statement declared in a given formatting mode, it will not be affected by future mode changes.
% subsubsection: normalization_and_formatting (end)
% -----------------------------------------------------------------------------
\subsection{Variables}
\label{sub:variables}
Variables represent ordered sets of variable and literal symbols, as they typically do in other SCAs. When defining a variable, the assignment operator \texttt{=} binds a label on the left-hand-side to a space-separated list on the right. The following example shows how variables are often defined:
\begin{vex}{ex:variables}
TH = pʰ tʰ kʰ
T = p t k
D = b d g
W = w y ɰ
N = m n
C = TH T D W N r s
\end{vex}
The definitions in example \ref{ex:variables} illustrate several things: using whitespace to align symbols into columns in a convenient and readable way, reasonably free variable label naming, and the use of variables in the definition of other variables.
Note that each item bound to a variable must be a sequence of literals, or a single variable, \emph{not} combinations of both. Assuming the definitions in example \ref{ex:variables} are already in place, a definition like \texttt{X = DW} or \texttt{TH = Tʰ} is not permitted.
There are no formal restrictions placed on variable labels, beyond requiring that they not use characters reserved by the SCA. You will notice in ex. \ref{ex:variables} that both \texttt{TH} and \texttt{T} are defined. This is possible because when SCA parses a rule or variable definition, it searches for variables by finding the \emph{longest} matching label first. If you have variables \texttt{T}, \texttt{H}, and \texttt{TH}, a rule containing the string \texttt{TH} will always be understood by the SCA to represent the variable \texttt{TH}, and not \texttt{T} followed by \texttt{H}. The best way to avoid this situation is to name variables carefully
\footnote{Though I do not see this as a problem in need of a resolution, I will note that this conflict, should it arise at all, is most likely to do so in a rule condition. In that context, it is possible to simply use the regular expression language (see section \ref{ssub:expressions}) to your advantage by wrapping one or both variables in parentheses to avoid the conflict: \texttt{(T)(H)}}
It is possible to re-assign variables at any point in the script. This includes appending or prepending values to an existing variable, or redefining it entirely:
\begin{vex}{ex:reassignvars}
C = t k
C = C q % Append
C = p C % Prepend
C = p t ts k q
\end{vex}
\noindent
This is especially helpful for variables like \texttt{C} used for consonants or plosives, because sound changes might add new consonants to the language's inventory.
In many cases, variables are used to represent natural classes of sounds, so it might be advantageous to use features instead. This is described in section \ref{ssub:feature_model_specification}.
% subsection: variables (end)
% -----------------------------------------------------------------------------
\subsection{Rules}
\label{sub:rules}
This section describes the syntax of rule definitions and basic conditions; more advanced functionality is described in section \ref{sub:conditions}. Any uncommented line containing the right angle-bracket symbol \texttt{>} will be parsed as a rule; that is, a line represents a rule definition \emph{iff} it contains the symbol \texttt{>}.
A rule consists of two principal parts, the ``transform'' and the ``condition''. Not all rules contain a condition, but all rules will contain a transform.
% subsection: rules (end)
% -----------------------------------------------------------------------------
\subsubsection{Transform}
\label{ssub:transform}
The transform consists of two lists of elements on either side of the \texttt{>} operator. The left-hand-side is called the ``source'' group and the right-hand-side is called the ``target'' group. The transform specifies that each element of the source will be changed to the corresponding element of the target if the rule's condition is satisfied. The following is an abstract representation of a rule transform:
\begin{vex}{ex:transform}
s₁ s₂ s₃ ... sₙ > t₁ t₂ t₃ ... tₙ
\end{vex}
\noindent
Each element on the left corresponds to one on the right, so that \texttt{s₁} is changed to \texttt{t₁}, \texttt{s₂} to \texttt{t₂} and so on. Each source element must have a corresponding target element, \emph{unless} the target contains exactly one element:
\begin{vex}{ex:transform_merger}
s₁ s₂ s₃ ... sₙ > t
\end{vex}
\noindent
This can be a useful way of representing mergers. Because of this, the following two statements are equivalent:
\begin{vex}{ex:convergence}
æ e o > a a a
æ e o > a
\end{vex}
Additionally, the right-hand side of the transformation may contain the literal zero \texttt{0} which represents a deleted segment. For example, the following rule will delete schwas where they occur in word-final position:
\begin{vex}{ex:deletion}
ə > 0 / _#
\end{vex}
\noindent
However, it is not necessary that \texttt{0} be the \emph{only} symbol in the target; a single rule is permitted to delete one sound while simply changing others. It is, however, necessary that \texttt{0} not be used in a string with other characters: a rule containing something like \texttt{s0} will fail to compile.
\subsubsection{Indices \& Backreferences}
\label{ssub:indices_and_backreferences}
Within the transform of a rule, it is possible to use indexing in the target to refer to symbols in the source, which can be very useful in writing commands for metathesis or total assimilation. Within the rule's target group, the dollar sign \texttt{\$} followed by a digit allows you to refer back to a variable matched within the source group. For example, the commands
\begin{vex}{ex:backreferences}
C = p t k
N = n m
CN > $2$1
\end{vex}
\noindent
allow us to easily represent metathesis, swapping \texttt{N} and \texttt{C} wherever \texttt{N} is found following \texttt{C}.
When SCA parses a rule, it keeps track of each variable in the source group and knows in the above example, that \texttt{C} is at index \texttt{1} and \texttt{N} is at index \texttt{2}. The target part of the transform lets us refer back to this using the \texttt{\$} symbol and the index of the variable we wish to refer to.
We can actually go slightly further and use the indices on a \emph{different} variable, provided they have the same number of elements. In a variation on the previous example, we can write
\begin{vex}{ex:indices}
C = p t k
G = b d g
N = n m
CN > $2$G1
\end{vex}
\noindent
which does the same as the above, but also replaces any element of \texttt{C} with the corresponding element of \texttt{G}. So, if a word is \textIPA{atna}, the rule will change it to \textIPA{anda}.
This can also be used for some kinds of assimilation and dissimilation, such as simplifying clusters of plosives by changing the second to be the same as the first:
\begin{vex}{ex:assimilation}
C = p t k
CC > $1$1
\end{vex}
\noindent
This will change a word like \textIPA{akpa} to \textIPA{akka}. To assimilate in the other direction, you can simply use \texttt{\$2\$2}.
% subsubsection: indices_and_backreferences (end)
% -----------------------------------------------------------------------------
\subsection{Conditions}
\label{sub:conditions}
The rule condition determines where in a word it will be possible for a rule to apply. Any rule without a condition specified will apply under all circumstances.
The start of the condition is signaled by the forward-slash symbol \texttt{/}. The condition necessarily consists of at least one clause. A single clause consists of the underscore character \texttt{\_} and an expression on either or both sides:
\begin{vex}{ex:clause}
Eₐ_Eₚ
\end{vex}
where \texttt{Eₐ} represents an expression to be matched before the source segment, and \texttt{Eₚ} represents an expression to be matched after the source segment.
\footnote{At the implementation-level, this is done by searching a word, start to end, for a symbol from the transform. If it is found, the symbols before and after it are checked against the condition. If both match, the changes are applied.}
Many common conditions are relatively simple, representing things like ``between consonants'', ``before nasal consonants'', or ``word-initially'', as in the following example:
\begin{vex}{ex:devoicing}
b d g > p t k / #_
\end{vex}
\noindent
which represents devoicing of plosives in word-initial position.
\subsubsection{Expressions}
\label{ssub:expressions}
While the Haedus regular expressions are not POSIX compliant, they nevertheless allow the use of metacharacters \texttt{?}, \texttt{*}, \texttt{+}, and \texttt{.} and also allow grouping with parentheses \texttt{()}.
The metacharacters have the behaviors you should expect if you are familiar with regular expression languages:
\begin{description}
\itemsep1pt \parskip0pt \parsep0pt
\item[\texttt{?}] Matches the preceding expression zero or one times
\item[\texttt{*}] Matches the preceding expression zero or more times
\item[\texttt{+}] Matches the preceding expression one or more times
\item[\texttt{.}] Matches any literal character
\end{description}
\noindent
Grouping with parentheses allows application of the quantifying metacharacters to sequences of expressions, such as:
\begin{vex}{ex:grouping}
(ab?c)+
\end{vex}
\noindent
which will match the expression \texttt{ab?c} one or more times, and thus any of the following strings \textIPA{ac}, \textIPA{abc}, \textIPA{acac}, \textIPA{abcac}, \textIPA{abcacacabc} and so on.\footnote{As a product of the syntax of groups, it is possible to nest them to any depth, so that \texttt{(a)} will match the same set of strings as \texttt{(((a)))} for example, but doing this has no benefit and unnecessary nesting merely generates more complicated machines which require more time to evaluate.}
%As in other regular expression languages, a single symbol is an expression, as is a sequence of expressions, or a group or set.
The most substantial deviation from common regular expression implementations is the use of curly braces to represent sets. Expressions enclosed in curly braces are separated by spaces, so that the following expression:
\begin{vex}{ex:sets}
{ E₁ E₂ E₃ }
\end{vex}
\noindent
will match any one of the expressions \texttt{E₁}, \texttt{E₂}, or \texttt{E₃}. The expression in \ref{ex:sets} is equivalent to \texttt{(E₁|E₂|E₃)} in standard regular expressions. Sets can be used with quantifier metacharacters, just as groups can.
\subsubsection{Negation}
\label{ssub:negation}
It is also possible to \emph{negate} an expression by prepending the exclamation-mark character \texttt{!} to it. It indicates that the expression will accept any inputs except for those accepted by the original machine (but only inputs of the same length as the original).
A simple expression \texttt{!a} will accept any single character except for the literal \textIPA{a}. A negated group \texttt{!(abc)} will accept any input of length = 3 which is not \emph{abc}. A negated set \texttt{!\{a b c\}} will accept any inputs of length = 1 which is not \textIPA{a}, not \textIPA{b}, and not \textIPA{c}.
More formally, if a state machine \texttt{E} accepts a set of inputs \texttt{I}
%(traditionally this is called the \emph{language} accepted by \texttt{E})
and \texttt{L} is the set of \emph{lengths} of the inputs in \texttt{I}, then the machine \texttt{!E} will accept \emph{all possible inputs} that have lengths \texttt{L} and are not in \texttt{I}.
The user is cautioned to use negation judiciously; the intent may often be more clear using an un-negated condition, paired with an exception (\texttt{not}) block, which is described in the following section.
% subsection: conditions
% -----------------------------------------------------------------------------
\subsubsection{Multiple Clauses \& Exceptions}
\label{ssub:multiple_clauses}
When a single transformation is triggered under multiple conditions, you may decide to represent this with two separate rules, or a single rule with multiple condition clauses. When more than one clause is present in a condition, they are separated by the \texttt{or} keyword, as in the following:
\begin{vex}{ex:multiple_clauses}
H > 0 / _{C #} or C_
\end{vex}
\noindent
Using multiple clauses can sometimes render rules more difficult to read, so using this capability is not always ideal, from a user's point of view.
It is also possible to specify condition clauses under which the rule should \emph{not} apply even though the conditions might otherwise be satisfied. This is done using the \texttt{not} keyword. The exception clauses must always occur after the main clauses, though the main clauses are allowed to be empty. All of the following are valid uses of exception clauses:
\begin{vex}{ex:exception_clauses}
a > b / not _y
a > b / not _y not x_
a > b / C_ not x_
a > b / C_ or _C not x_
a > b / C_ or _C not x_ not _y
\end{vex}
% subsubsection condition_chaining (end)
% -----------------------------------------------------------------------------
\subsection{Phonetic Features}
\label{sub:phonetic_features}
In addition to many other useful capabilities, this sound change applier is fundamentally built to support rules based on phonetic features while not requiring their use. This section will describe the features model, and the use of features in rules files.
\subsubsection{Loading a Feature Model}
\label{ssub:loading_a_feature_model}
A model is loaded into a rules file using the \texttt{load} command, followed by the model path; the paths are handled just as they are for opening lexicons or importing scripts.
A feature model is included with this distribution, \texttt{"AT\_hybrid.model"}; the details of this model are described in the appendix (\ref{sub:ATHM}). To load this model into a script, use the following command:
\begin{vex}{ex:load_feature_model}
LOAD "AT_hybrid.model"
\end{vex}
\subsubsection{Using Features}
\label{ssub:using_features}
With a feature model loaded, every segment can be referenced not just by its explicit symbol, but also by an array of feature specifications. If you look at the feature model file, you will see a block which defines the features themselves (name, alias, and type), and blocks defining symbols. The following example shows three feature definitions:
\begin{vex}{ex:feature_definitions}
consonantal con binary
sonorant son binary
continuant cnt binary
\end{vex}
\noindent
The definitions contain a feature name, alias, and type. The name and alias can both be used to reference the feature when used in a rule or variable. The type just restricts which range of values a feature can take, and prevents invalid lookups or assignments. The format of the model definitions is given in section \ref{ssub:feature_model_specification}.
In a rules script, features are used to reference a set of sounds organically, just as a variable is a reference that has been pre-defined. Rather than defining a variable for all plosives, they can be selected using the following:
\begin{vex}{ex:feature_selection}
[+consonantal, -sonorant]
% OR
[+con, -son]
\end{vex}
\noindent
Any segments matching the array will be selected; that is, any segments having the same values as the array. The current implementation does not support alpha-place or similar, so any assimilation needs to be done explicitly. A rule which changes voiceless plosives to voiced before another voiced consonant could be written like this:
\begin{vex}{ex:voicing_assimilation}
[+con, -son, -voice] > [+voice] / _[+con, +voice]
\end{vex}
When a symbol is changed in this way, the symbol which originally represented it can no longer apply. The selection of a new symbol is performed automatically: if the exact sound given by the feature array is present in the model, then its symbol is used; if it is not, the nearest symbol is selected, and an appropriate modifier will be added so that the symbol-plus-modifier represents the feature array correctly. In the event of a conflict (two symbols or modifiers representing the same feature array), precedence is given to the one defined first.
\subsubsection{Feature Model Specification}
\label{ssub:feature_model_specification}
The definition of a feature model has three parts: the feature inventory, symbols block, and the modifier block. Comments work the same way as they do as in rules: anything following the \texttt{\%} character will be ignored.
The feature definition block is started by the keyword \texttt{FEATURES}. Each defined feature will have a name, alias, and type, separated by whitespace. The types may be \texttt{binary}, \texttt{ternary}, or \texttt{numeric}.
The \texttt{SYMBOLS} and \texttt{MODIFIERS} blocks have the same structure but have slightly different uses. Each defined symbol or modifier starts with a character or sequence of characters (whatever represents a single sound), followed by a single \emph{tab character}, followed by a value for each feature, separated by tabs.
Only \texttt{FEATURES} and \texttt{SYMBOLS} are truly mandatory, but \texttt{MODIFIERS} is extremely useful.
In the symbol block, blank values are not allowed. In the modifier block, they are expected; adding a modifier to a symbol such as \textIPA{a̰} will take the feature array for \textIPA{a} and overwrite it with any values defined in the modifier block for \textIPA{◌̰}.
In the symbol and modifier block, feature values are limited and checked based on the type defined in the features block: \texttt{binary} can take values \texttt{+} or \texttt{-}; \texttt{ternary} can take values \texttt{+}, \texttt{-}, or \texttt{0}; \texttt{numeric} can take any integer value.
Two more optional blocks can be used. The \texttt{ALIASES} block allows aliases to be provided for specific feature values, which can be especially useful for numeric or ternary features, such as height values \texttt{[high]}, \texttt{[mid]}, and \texttt{[low]}.
The \texttt{CONSTRAINTS} block allows the model to place limits on value combinations. These are essentially additional rules that activate whenever a relevant feature is changed. For example, if a rule has changed a sound to be \texttt{[+nasal]} the constraint
\begin{vex}{ex:constraint}
[+nasal] > [-lateral]
\end{vex}
\noindent
will activate to ensure that the resulting sound is not also \texttt{[+lateral]}.
An example file is provided with the software and can be found at \texttt{example/ATH.model}.
% subsection phonetic_features (end)
% -----------------------------------------------------------------------------
\section{Appendix}
\label{sec:appendix}
\subsection{Intelligent Segmentation}
\label{sub:segmentation}
One significant capability of the Haedus SCA is its intelligent segmentation algorithm, which allows a sequence of characters which would represent a single sound in IPA to be treated as a single unit. The SCA can do this in part because it does not, in fact, use strings to represent data internally. When any data is read in, whether it is words in a lexicon, or parts of a rule or variable definition, the strings are split up according to the normalization mode. In modes other than intelligent, each character is then used to create a \texttt{segment}; using intelligent segmentation, a \texttt{segment} represents a phonetic segment, a single speech sound. A list of \texttt{segment}s constitutes a \texttt{sequence}, which can be a word, or part of a word. When the \texttt{reserve} command is used, this indicates that some strings will be given special treatment, and will always be represented as a single \texttt{segment} despite containing multiple characters.
The normalization mode applies canonical decomposition and then uses a series of rules based on Unicode character classes and character ranges to attach diacritics to the appropriate head character. This section describes the process.
There are two places data is stored during the segmentation process: a list whose elements will be the \texttt{segment} objects that will be created, and a buffer which contains a single \texttt{segment} as it is being built.
Each character of the input is examined in turn. If the character is not a diacritic and the buffer is empty, the character is added to the buffer; if the buffer is not empty, the contents of the buffer are added to the list, the buffer is cleared, and then the current character is added to the buffer. However, if the character is a diacritic, it is simply added to the buffer. At the end of the string, anything remaining in the buffer is added to the list.
This process is relatively simple, apart from some special rules for checking for variable names and reserved characters. What constitutes a diacritic is somewhat arbitrary, insofar as the decision is based on Unicode character classes and certain specific character or character ranges chosen by the author.
The following types of symbols are considered diacritics for the purposes of this algorithm:
\begin{itemize}
\itemsep1pt \parskip0pt \parsep0pt
\item Alphanumeric Super- or Subscripts
\item Unicode class Lm [Letter, Modifier]
\item Unicode class Mn [Mark, Non-Spacing]
\item Unicode class Sk [Symbol, Modifier]
\end{itemize}
One type of diacritic is given special treatment, and that is the double-width or \texttt{COMBINING DOUBLE} diacritics which occur between code-points \texttt{u+035C} and \texttt{u+0362}. When these are encountered, the algorithm will jump to the next character (which should always be a non-diacritic), add it to the buffer, and then continue to operate normally.
Once the list of compiled strings is complete, each \texttt{segment} is added to a \texttt{sequence} object which will represent the original input.
% \subsection{Nondeterministic Finite-State Automata}
% \label{sub:rndfa}
%
% Inside the sound-change applier's rule code, conditions are parsed into nondeterministic finite-state automata (NFA). Unlike traditional NFA implementations, this one can match sequences of both literal symbols and also segments represented as feature arrays. To the NFA (as to the rest of the sound-change system) these are one and the same, sequences of segment objects, which represent speech sounds.
%
% This appendix describes some of the implementational details of the automata
%
% \begin{figure}\label{fig:machine_optional}
% \caption{State machine for \texttt{E?}}
%
% \centering
% \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,node distance=2.5cm,semithick]
% % ------------------------------------------------------------
% \node[initial,state] (A) {$q_0$};
% \node[state, rectangle] (B) [right of=A] {$E$};
% \node[state, accepting] (C) [right of=B] {$q_1$};
% % ------------------------------------------------------------
% \path (A) edge [below] node {$\oslash$} (B)
% (B) edge [below] node {$\oslash$} (C)
% (A) edge [bend left, above] node {$\oslash$} (C);
% % ------------------------------------------------------------
% \end{tikzpicture}
% \end{figure}
%
% %
% %
% %
%
% \begin{figure}\label{fig:machine_star}
% \caption{State machine for \texttt{E*}}
% \centering
% \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,node distance=2.5cm,semithick]
% % ------------------------------------------------------------
% \node[initial,state] (A) {$q_0$};
% \node[state, rectangle] (B) [right of=A] {$E$};
% \node[state, accepting] (C) [right of=B] {$q_1$};
% % ------------------------------------------------------------
% \path (A) edge [below] node {$\oslash$} (B)
% (B) edge [bend right, above] node {$\oslash$} (A)
% (A) edge [bend right, below] node {$\oslash$} (C);
% % ------------------------------------------------------------
% \end{tikzpicture}
% \end{figure}
%
% %
% %
% %
%
% \begin{figure}\label{fig:machine_plus}
% \caption{State machine for \texttt{E+}}
% \centering
% \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,node distance=2.5cm,semithick]
% % ------------------------------------------------------------
% \node[initial,state] (A) {$q_0$};
% \node[state, rectangle] (B) [right of=A] {$E$};
% \node[state, accepting] (C) [right of=B] {$q_1$};
% % ------------------------------------------------------------
% \path (A) edge [below] node {$\oslash$} (B)
% (B) edge [below] node {$\oslash$} (C)
% (C) edge [bend right, above] node {$\oslash$} (A);
% % ------------------------------------------------------------
% \end{tikzpicture}
% \end{figure}
%
% %
% %
% %
%
% \begin{figure}\label{fig:machine_set}
% \caption{State machine for \texttt{\{E\textsubscript{1} E\textsubscript{2} E\textsubscript{3}\}}}
% \centering
% \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,node distance=2.5cm,semithick]
% % ------------------------------------------------------------
% \node[initial,state] (A) {$q_0$};
% \node[state, rectangle] (C) [right of = A] {$E_2$};
% \node[state, rectangle] (B) [above=0.25cm of C] {$E_1$};
% \node[state, rectangle] (D) [below=0.25cm of C] {$E_3$};
% \node[state, accepting] (X) [right of=C] {$q_1$};
% % ------------------------------------------------------------
% \path (A) edge [bend left, above] node {$\oslash$} (B)
% (A) edge [above] node {$\oslash$} (C)
% (A) edge [bend right, below] node {$\oslash$} (D)
%
% (B) edge [bend left, above] node {$\oslash$} (X)
% (C) edge [above] node {$\oslash$} (X)
% (D) edge [bend right, below] node {$\oslash$} (X);
% % ------------------------------------------------------------
% \end{tikzpicture}
% \end{figure}
\subsection{Articulator Theory Hybrid Model}
\label{sub:ATHM}
In order to carry out a research program which makes use of this sound-change applier, it was necessary to develop a feature model. Unlike phonemic feature models intended to explain phenomena within a single language, the new model must be \emph{phonetic}, in order to compare across languages, or over time.
The new model was based partly on Darin Flynn's \emph{Articulator Theory} (2006), a system which combines binary and unary features. However, \emph{Articulator Theory} is still intended to describe synchronic phenomena in a single language, and does not provide the level of granularity needed in the research program. It also inherits a number of flaws from \emph{Sound Pattern of English}, such as conflating breathy-voice and aspiration, and using an explicitly lingual feature like \texttt{[distributed]} to contrast labial and labio-dental consonants.
The new model would need to address the shortcomings of \emph{Articulator Theory} and improve granularity.
\subsubsection{\texttt{[consonantal]}}
\label{ssub:feature_consonantal}
\begin{samepage}
\begin{description}
\itemsep1pt \parskip0pt \parsep0pt
\item[Description] Separates consonants from vowels and semivowels
\item[Type] \emph{binary}
\item[Examples]\
\begin{itemize}
\item \texttt{[+consonantal]}\\
p̪ b̪ p b t d ʈ ɖ c ɟ k g kp gb q ɢ pf bv ts dz tɬ dɮ tʃ dʒ tɕ dʑ ʈʂ ɖʐ cç ɟʝ kx gɣ ɸ β f v θ ð ɬ ɮ s z ʃ ʒ ɕ ʑ ʂ ʐ ç ʝ x ɣ χ ʁ m ɱ n ɳ ɲ ŋ ŋm ɴ r ɽ ʀ l ɫ ɭ ʎ ʟ ħ ʕ ʔ h ɦ
\item \texttt{[−consonantal]}\\
i y ɪ ʏ ɯ u ɷ ʊ ə ɨ ʉ e ø ɛ œ ɤ o ʌ ɔ æ a ɶ ɐ ɑ ɒ ʋ j ɥ ɰ ʍ w
\end{itemize}
\item[Constraints]\
\begin{enumerate}
\itemsep1pt \parskip0pt \parsep0pt
\item Any segment that is \texttt{[−consonantal]} must be \texttt{[+sonorant]}
\item Any segment that is \texttt{[−consonantal]} must be \texttt{[+continuant]}
\item Any segment that is \texttt{[−consonantal]} must be \texttt{[−release]}
\end{enumerate}
\item[Restrictions] \emph{none}
\end{description}
\end{samepage}
% =============================================================================
\subsubsection{\texttt{[sonorant]}}
\label{ssub:feature_sonorant}
\begin{samepage}
\begin{description}
\itemsep1pt \parskip0pt \parsep0pt
\item[Description] Contrasts between sonorants and obstruents
\item[Type] \emph{binary}
\item[Examples]\
\begin{itemize}
\item \texttt{[+sonorant]}\\
m ɱ n ɳ ɲ ŋ ŋm ɴ r ɽ ʀ l ɫ ɭ ʎ ʟ i y ɪ ʏ ɯ u ɷ ʊ ə ɨ ʉ e ø ɛ œ ɤ o ʌ ɔ æ a ɶ ɐ ɑ ɒ ʋ j ɥ ɰ ʍ w
\item \texttt{[−sonorant]}\\
p̪ b̪ p b t d ʈ ɖ c ɟ k g kp gb q ɢ pf bv ts dz tɬ dɮ tʃ dʒ tɕ dʑ ʈʂ ɖʐ cç ɟʝ kx gɣ ɸ β f v θ ð ɬ ɮ s z ʃ ʒ ɕ ʑ ʂ ʐ ç ʝ x ɣ χ ʁ ħ ʕ ʔ h ɦ
\end{itemize}
\item[Constraints]\
\begin{enumerate}
\itemsep1pt \parskip0pt \parsep0pt
\item Any segment that is \texttt{[−sonorant]} must be \texttt{[+consonantal]}
\item Any segment that is \texttt{[+sonorant]} must be \texttt{[−release]}
\item Any segment that is \texttt{[+sonorant]} must be \texttt{[−ejective]}
\item Any segment that is \texttt{[−sonorant]} must be \texttt{[−atr]}
\end{enumerate}
\item[Restrictions] \emph{none}
\end{description}
\end{samepage}
% =============================================================================
\subsubsection{\texttt{[continuant]}}
\label{ssub:feature_continuant}
\begin{samepage}
\begin{description}
\itemsep1pt \parskip0pt \parsep0pt
\item[Description] Distinguishes plosives from fricatives; technically, it contrasts a complete closure (\texttt{[−continuant]}) from near-complete closure (\texttt{[+continuant]}) of the oral tract specifically, so nasal consonants are \texttt{[−continuant]} (Flynn, 2006).
\item[Type] \emph{binary}
\item[Examples]\
\begin{itemize}
\item \texttt{[+continuant]}\\
ɸ β f v θ ð ɬ ɮ s z ʃ ʒ ɕ ʑ ʂ ʐ ç ʝ x ɣ χ ʁ r ɽ ʀ l ɫ ɭ ʎ ʟ i y ɪ ʏ ɯ u ɷ ʊ ə ɨ ʉ e ø ɛ œ ɤ o ʌ ɔ æ a ɶ ɐ ɑ ɒ ʋ j ɥ ɰ ʍ w ħ ʕ h ɦ
\item \texttt{[−continuant]}\\
p̪ b̪ p b t d ʈ ɖ c ɟ k g kp gb q ɢ pf bv ts dz tɬ dɮ tʃ dʒ tɕ dʑ ʈʂ ɖʐ cç ɟʝ kx gɣ m ɱ n ɳ ɲ ŋ ŋm ɴ ʔ
\end{itemize}
\item[Constraints]\
\begin{enumerate}
\itemsep1pt \parskip0pt \parsep0pt
\item Any segment that is \texttt{[−continuant]} must be \texttt{[+consonantal]}
\item Any segment that is \texttt{[+continuant]} must be \texttt{[−release]}
\end{enumerate}
\item[Restrictions] Only applies to consonants; vowels and semivowels are \texttt{[+continuant]} by default.
\end{description}
\end{samepage}
% =============================================================================
\subsubsection{\texttt{[ejective]}}
\label{ssub:feature_ejective}
\begin{samepage}
\begin{description}
\itemsep1pt \parskip0pt \parsep0pt
\item[Description] Distinguishes ejective from pulmonic stops and affricates.
\item[Type] \emph{binary}
\item[Examples]\
\begin{itemize}
\item \texttt{[+ejective]}\\
p̪ʼ pʼ tʼ ʈʼ cʼ kʼ kpʼ qʼ ɢʼ pfʼ tsʼ tɬʼ tʃʼ tɕʼ ʈʂʼ cçʼ kxʼ
\item \texttt{[−ejective]}\\
\emph{everything else}
\end{itemize}
\item[Constraints]\
\begin{enumerate}
\item Any segment that is \texttt{[+ejective]} must be \texttt{[+consonantal, −sonorant, −continuant, −voice]}
\end{enumerate}
\item[Restrictions] only applies to stops and affricates; all other sounds are \texttt{[−ejective]} by default
\end{description}
\end{samepage}
% =============================================================================
\subsubsection{\texttt{[release]}}
\label{ssub:feature_release}
\begin{samepage}
\begin{description}
\itemsep1pt \parskip0pt \parsep0pt
\item[Description] Distinguishes plosives from affricate counterparts
\item[Type] \emph{binary}
\item[Examples]\
\begin{itemize}
\item \texttt{[+release]}\\
pf bv pɸ bβ tθ dð ts dz tɬ dɮ tʃ dʒ tɕ dʑ ʈʂ ɖʐ cç ɟʝ kx gɣ qχ ɢʁ
% \item \texttt{[−release]}\\
% ...
\end{itemize}
\item[Constraints]\