-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathdocTrainer.properties
182 lines (153 loc) · 8.53 KB
/
docTrainer.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Sample machine learning properties file
# Algorithm: choose between MAXENT and PERCEPTRON
Algorithm=MAXENT
# Iterations: presumably the number of training iterations -- confirm against
# the trainer that consumes this file.
Iterations=100
# Cutoff: presumably the minimum number of times a feature must be seen in the
# training data to be kept -- confirm.
Cutoff=5
# Threads: presumably the number of threads used during training -- confirm.
Threads=4
##################################################
#### Custom parameters added by ixa-pipe-ml ######
##################################################
# Languages supported:
Language=en
# Training and Test Corpus:
TrainSet=/home/ragerri/experiments/polarity/movies.train
TestSet=/home/ragerri/experiments/polarity/movies.test
# OutputModel name:
OutputModel=en-doc-maxent-movies.bin
# Specify if adaptive features are cleared in the training and/or evaluation data.
# Options are: 'yes', 'no', 'docstart'. The first two will reset the
# features every sentence whereas the 'docstart' option will look for -DOCSTART-
# marks in the data to clear the adaptive features.
# Crossvalidation only works if ClearTrainingFeatures is set to 'yes'.
# If commented out both values default to 'no'.
ClearTrainingFeatures=yes
ClearEvaluationFeatures=no
##################
#### FEATURES ####
##################
# BagOfWordsFeaturesRange: whether to lowercase the tokens (lower) and/or provide just the
# tokens consisting of letters only (lettersOnly).
# To cancel out an option, just write 'no,lettersOnly', 'lower,no' or 'no,no'.
# If commented out, it defaults to 'lower,wac'.
# NOTE(review): 'wac' is a TokenClassFeaturesRange option and is not one of the
# options listed above for this key, so the stated default looks copy-pasted --
# confirm the actual default for BagOfWordsFeaturesRange.
BagOfWordsFeatures=yes
BagOfWordsFeaturesRange=no,lettersOnly
# TokenClassFeatures: include token shape features (capitalization, digits,
# etc.; see the TokenClassFeatureGenerator class for details).
# TokenClassFeaturesRange: whether to lowercase the tokens and provide wordAndClass (wac)
# joint features. To cancel out an option, just write 'no,wac', 'lower,no' or 'no,no'.
# If commented out, it defaults to 'lower,wac'.
#TokenClassFeatures=yes
TokenClassFeaturesRange=lower,wac
# OutcomePriorFeatures: maps the underlying previous outcomes
#OutcomePriorFeatures=yes
# SentenceFeatures: add first and last words of sentence as features.
# Use the Begin and End options to pick and choose combinations.
#SentenceFeatures=yes
#SentenceFeaturesBegin=true
SentenceFeaturesEnd=false
# PrefixFeatures: takes the first 3 and 4 characters of the current token as features.
# Modify the values to get other prefix ranges.
#PrefixFeatures=yes
#PrefixFeaturesBegin=3
PrefixFeaturesEnd=4
# SuffixFeatures: takes last 4 characters of current token as feature.
# Modify the options to get other suffix ranges.
#SuffixFeatures=yes
#SuffixFeaturesBegin=0
SuffixFeaturesEnd=4
# NGramFeatures: NOTE(review): this feature is not documented in this file. By
# analogy with CharNgramFeaturesRange, NGramFeaturesRange presumably gives the
# min:max n-gram length -- confirm.
#NGramFeatures=yes
NGramFeaturesRange=2:5
# CharNgramFeatures: min and maximum length for character ngrams of current
# token. If value is yes, specify the desired range in CharNgramFeaturesRange.
# If Range is commented out, it defaults to 2:5 when this feature is "yes".
#CharNgramFeatures=yes
CharNgramFeaturesRange=2:5
# DictionaryFeatures: add features if some expression is found in some gazetteer. Comment
# it out to deactivate this feature. Note that every file in the directory
# provided as parameter will be taken to be a dictionary. The dictionary format
# needs to be 'sequence\tclass' and serialized using ixa-pipe-convert SerializeResources
# class functionalities.
DictionaryFeatures=/home/ragerri/experiments/absa/semeval2015/train2015.gaz
# DictionaryPolarityFeatures: add polarity features if the token is found in some gazetteer. Comment
# it out to deactivate this feature. Note that every file in the directory
# provided as parameter will be taken to be a dictionary. The dictionary format
# needs to be 'sequence\tclass' and serialized using ixa-pipe-convert SerializeResources
# class functionalities.
#DictionaryPolarityFeatures=/home/ragerri/javacode/models/opinion/en-polarity-opener.txt
# FrequentWordFeatures: add features if the token is found in some gazetteer containing the most
# frequent words in a training corpus. Comment it out to deactivate this feature.
# Note that every file in the directory provided as parameter will be taken to be a dictionary.
# The dictionary format needs to be 'sequence\tclass' and serialized using ixa-pipe-convert
# SerializeResources class functionalities.
#FrequentWordFeatures=/home/ragerri/javacode/ixa-pipe-nerc/nerc-resources/en/dictionaries
# TargetFeatures: add target features as annotated by an Opinion Target Extraction model.
# The model can be trained with full aspects (B-FOOD#QUALITY) or simply with B-TARGET.
# If the first option is the case, then there are two features parameters, coarse (FOOD)
# and fine (FOOD#QUALITY). For the second option TargetFeaturesRange should be "no".
#TargetFeatures=/home/ragerri/resources/opinion-models/en/en-ote-clusters-restaurants-2016.bin
#TargetFeaturesRange=fine
# BrownClusterFeatures: add features using Brown clusters
# Comment it out to deactivate this feature.
# NOTE: you can add multiple clustering lexicons by chaining them with a comma.
# The brown lexicon needs to be serialized using ixa-pipe-convert SerializeResources
# functionalities
#BrownClusterFeatures=/home/ragerri/resources/clusters/reuters-rcv1/brown/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt.gz
# ClarkClusterFeatures: add features using Clark (2003) clusters. If value is uncommented,
# specify the location of the clustering lexicon in Clark format.
# NOTE: you can add multiple clustering lexicons by chaining them with a comma.
# The Clark lexicon needs to be serialized using ixa-pipe-convert SerializeResources
# functionalities
#ClarkClusterFeatures=/home/ragerri/resources/clusters/wikipedia/en/clark/en-wikipedia-preclean.tok.punct.lower.600.gz
# Word2VecClusterFeatures: add features using word2vec clusters. If value is
# uncommented, specify the location of the clustering lexicon in word2vec format.
# NOTE: you can add multiple clustering lexicons by chaining them with a comma.
# The Word2vec lexicon needs to be serialized using ixa-pipe-convert SerializeResources
# functionalities
#Word2VecClusterFeatures=/home/ragerri/resources/clusters/gigaword/english-5th/word2vec/en-gigaword-s150-w5.200.gz
############################
## Morphological Features ##
############################
# POSTagModelFeatures: add a pos tagging model trained with ixa-pipe-ml
# POSTagModelFeaturesRange specifies the combination of features to be used: options are 'pos'
# and 'posclass' in that strict order. For example, if 'pos,posclass' is
# chosen then both types of features will be used. If 'pos,no' is chosen, then
# only pos tag features are active. If 'no,posclass' then pos tag class is chosen.
# If POSTagModelFeatures is commented out, none of these features are used.
#POSTagModelFeatures=/home/ragerri/javacode/models/pos/en-pos-clark-conll09.bin
POSTagModelFeaturesRange=pos,no
# POSDictionaryFeatures: add tags as features using a dictionary. The input file format
# is word\tpostag. Tabulated training data can also be used.
# The input dictionary needs to be serialized using ixa-pipe-convert SerializeResources
# functionalities.
#POSDictionaryFeatures=/home/ragerri/experiments/pos/penn3/train.tsv
# LemmaModelFeatures: add a lemmatizer model trained with ixa-pipe-ml
#LemmaModelFeatures=/home/ragerri/javacode/models/lemma/en-lemma-conll09.bin
# LemmaDictionaryFeatures: add lemma features from a dictionary.
# It is required to provide a POS model trained with ixa-pipe-ml
# and a plain text word\tlemma\tpostag dictionary.
# The Lemma Dictionary needs to be serialized using ixa-pipe-convert SerializeResources
# functionalities.
#LemmaDictionaryFeatures=/home/ragerri/javacode/models/pos/en-pos-clark-conll09.bin,/home/ragerri/resources/pos-resources/lemmatizer-dicts/languagetool/en-lemmatizer.txt
#####################################
#### CROSS VALIDATION PARAMETERS ####
#####################################
# Cross Validation Folds; if commented out it defaults to 10 cross validation
# folds.
Folds=5
# Evaluation type: choose between 'detailed' and 'error'; only for cross-validation.
# It defaults to detailed evaluation.
EvaluationType=detailed