# docTrainer.properties

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Sample machine learning properties file
# Algorithm: choose between MAXENT and PERCEPTRON.
Algorithm=MAXENT
# Iterations: presumably the number of training iterations -- TODO confirm.
Iterations=100
# Cutoff: presumably the minimum number of occurrences a feature needs in the
# training data to be kept -- TODO confirm.
Cutoff=5
# Threads: presumably the number of threads used for training -- TODO confirm.
Threads=4

##################################################
#### Custom parameters added by ixa-pipe-ml ######
##################################################

# Language: language code of the data ('en' in this sample).
# NOTE(review): the set of supported language codes is not listed in this
# file; check the project documentation.
Language=en

# Training and Test Corpus: paths to the training set and the test set.
# NOTE: the values below are absolute paths from the sample author's machine;
# replace them with the location of your own corpus files.
TrainSet=/home/ragerri/experiments/polarity/movies.train
TestSet=/home/ragerri/experiments/polarity/movies.test

# OutputModel: file name of the trained model.
OutputModel=en-doc-maxent-movies.bin

# ClearTrainingFeatures / ClearEvaluationFeatures: specify whether adaptive
# features are cleared in the training and/or evaluation data.
# Options are: 'yes', 'no', 'docstart'. The first two will reset the
# features every sentence whereas the 'docstart' option will look for -DOCSTART-
# marks in the data to clear the adaptive features.
# NOTE(review): as worded, 'yes' and 'no' are described as behaving the same;
# confirm the intended difference against the code that reads these keys.
# Cross-validation only works if ClearTrainingFeatures is set to 'yes'.
# If commented out, both values default to 'no'.
ClearTrainingFeatures=yes
ClearEvaluationFeatures=no

##################
#### FEATURES ####
##################

# BagOfWordsFeatures: set to 'yes' to use bag-of-words features.
# BagOfWordsFeaturesRange: two comma-separated options, in this order: whether
# to lowercase the tokens (lower) and whether to provide just the tokens
# consisting of letters only (lettersOnly).
# To cancel out an option, just write 'no,lettersOnly', 'lower,no' or 'no,no'.
# If commented out, it defaults to 'lower,wac'.
# NOTE(review): 'wac' is not among the options listed for this key; the stated
# default may have been copied from TokenClassFeaturesRange -- confirm.
BagOfWordsFeatures=yes
BagOfWordsFeaturesRange=no,lettersOnly

# TokenClassFeatures: include token shape features (capitalization, digits,
# etc.); see the TokenClassFeatureGenerator class for details.
# TokenClassFeaturesRange: whether to lowercase the tokens (lower) and provide
# wordAndClass (wac) joint features. To cancel out an option, just write
# 'no,wac', 'lower,no' or 'no,no'.
# If commented out, it defaults to 'lower,wac'.
# NOTE(review): TokenClassFeatures is commented out below, so the Range value
# is presumably not used in this sample -- confirm.
#TokenClassFeatures=yes
TokenClassFeaturesRange=lower,wac

# OutcomePriorFeatures: maps the underlying previous outcomes
#OutcomePriorFeatures=yes

# SentenceFeatures: add the first and last words of the sentence as features.
# Use the Begin and End options to pick and choose combinations.
# NOTE(review): SentenceFeatures itself is commented out below, so the End
# value is presumably not used in this sample -- confirm.
#SentenceFeatures=yes
#SentenceFeaturesBegin=true
SentenceFeaturesEnd=false

# PrefixFeatures: takes the first 3 and 4 characters of the current token as
# features. Modify the Begin and End values to get other prefix ranges.
# NOTE(review): PrefixFeatures itself is commented out below, so the End value
# is presumably not used in this sample -- confirm.
#PrefixFeatures=yes
#PrefixFeaturesBegin=3
PrefixFeaturesEnd=4

# SuffixFeatures: takes the last 4 characters of the current token as a feature.
# Modify the Begin and End values to get other suffix ranges.
# NOTE(review): SuffixFeatures itself is commented out below, so the End value
# is presumably not used in this sample -- confirm.
#SuffixFeatures=yes
#SuffixFeaturesBegin=0
SuffixFeaturesEnd=4

# NGramFeatures: n-gram features; uncomment to activate.
# NGramFeaturesRange: presumably 'min:max' n-gram length, by analogy with
# CharNgramFeaturesRange -- TODO confirm, including whether these are token
# n-grams.
#NGramFeatures=yes
NGramFeaturesRange=2:5

# CharNgramFeatures: character n-grams of the current token. If the value is
# 'yes', specify the minimum and maximum n-gram length as 'min:max' in
# CharNgramFeaturesRange.
# If Range is commented out, it defaults to 2:5 when this feature is 'yes'.
#CharNgramFeatures=yes
CharNgramFeaturesRange=2:5

# DictionaryFeatures: add features if some expression is found in some gazetteer.
# Comment it out to deactivate this feature. Note that every file in the directory
# provided as parameter will be taken to be a dictionary. The dictionary format
# needs to be 'sequence\tabclass' and serialized using ixa-pipe-convert SerializeResources
# class functionalities.
# NOTE(review): the sample value below looks like a single file on the author's
# machine rather than a directory -- confirm what is expected and adjust the path.
DictionaryFeatures=/home/ragerri/experiments/absa/semeval2015/train2015.gaz

# DictionaryPolarityFeatures: add polarity features if the token is found in some gazetteer.
# Comment it out to deactivate this feature. Note that every file in the directory
# provided as parameter will be taken to be a dictionary. The dictionary format
# needs to be 'sequence\tabclass' and serialized using ixa-pipe-convert SerializeResources
# class functionalities.
#DictionaryPolarityFeatures=/home/ragerri/javacode/models/opinion/en-polarity-opener.txt

# FrequentWordFeatures: add features if the token is found in some gazetteer containing the most
# frequent words in a training corpus. Comment it out to deactivate this feature.
# Note that every file in the directory provided as parameter will be taken to be a dictionary.
# The dictionary format needs to be 'sequence\tabclass' and serialized using ixa-pipe-convert
# SerializeResources class functionalities.
#FrequentWordFeatures=/home/ragerri/javacode/ixa-pipe-nerc/nerc-resources/en/dictionaries

# TargetFeatures: add target features as annotated by an Opinion Target Extraction model.
# The model can be trained with full aspects (B-FOOD#QUALITY) or simply with B-TARGET.
# If the first option is the case, then there are two features parameters, coarse (FOOD)
# and fine (FOOD#QUALITY). For the second option TargetFeaturesRange should be "no".
#TargetFeatures=/home/ragerri/resources/opinion-models/en/en-ote-clusters-restaurants-2016.bin
#TargetFeaturesRange=fine

# BrownClusterFeatures: add features using Brown clusters
# Comment it out to deactivate this feature.
# NOTE: you can add multiple clustering lexicons by chaining them with a comma.
# The brown lexicon needs to be serialized using ixa-pipe-convert SerializeResources
# functionalities
#BrownClusterFeatures=/home/ragerri/resources/clusters/reuters-rcv1/brown/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt.gz

# ClarkClusterFeatures: add features using Clark (2003) clusters. If value is uncommented,
# specify the location of the clustering lexicon in Clark format.
# NOTE: you can add multiple clustering lexicons by chaining them with a comma.
# The Clark lexicon needs to be serialized using ixa-pipe-convert SerializeResources
# functionalities
#ClarkClusterFeatures=/home/ragerri/resources/clusters/wikipedia/en/clark/en-wikipedia-preclean.tok.punct.lower.600.gz

# Word2VecClusterFeatures: add features using word2vec clusters. If value is
# uncommented, specify the location of the clustering lexicon in word2vec format.
# NOTE: you can add multiple clustering lexicons by chaining them with a comma.
# The Word2vec lexicon needs to be serialized using ixa-pipe-convert SerializeResources
# functionalities
#Word2VecClusterFeatures=/home/ragerri/resources/clusters/gigaword/english-5th/word2vec/en-gigaword-s150-w5.200.gz

############################
## Morphological Features ##
############################

# POSTagModelFeatures: path to a POS tagging model trained with ixa-pipe-ml.
# POSTagModelFeaturesRange specifies the combination of features to be used: the
# options are 'pos' and 'posclass', in that strict order. For example, if
# 'pos,posclass' is chosen, then both types of features are used. If 'pos,no' is
# chosen, then only POS tag features are active. If 'no,posclass' is chosen, then
# only POS tag class features are active.
# If POSTagModelFeatures is commented out (as it is below), none of these
# features are used.
#POSTagModelFeatures=/home/ragerri/javacode/models/pos/en-pos-clark-conll09.bin
POSTagModelFeaturesRange=pos,no

# POSDictionaryFeatures: add tags as features using a dictionary. The input file format
# contains word\tabpostag. Tabulated training data can also be used.
# The input dictionary needs to be serialized using ixa-pipe-convert SerializeResources
# functionalities
#POSDictionaryFeatures=/home/ragerri/experiments/pos/penn3/train.tsv

# LemmaModelFeatures: add a lemmatizer model trained with ixa-pipe-ml
#LemmaModelFeatures=/home/ragerri/javacode/models/lemma/en-lemma-conll09.bin

# LemmaDictionaryFeatures: add lemma features from a dictionary.
# It is required to provide a POS model trained with ixa-pipe-ml
# and a plain text word\tlemma\tpostag dictionary.
# The Lemma Dictionary needs to be serialized using ixa-pipe-convert SerializeResources
# functionalities
#LemmaDictionaryFeatures=/home/ragerri/javacode/models/pos/en-pos-clark-conll09.bin,/home/ragerri/resources/pos-resources/lemmatizer-dicts/languagetool/en-lemmatizer.txt

#####################################
#### CROSS VALIDATION PARAMETERS ####
#####################################

# Folds: number of cross-validation folds; if commented out, it defaults to
# 10 folds.
Folds=5
# EvaluationType: choose between 'detailed' and 'error'; only used for
# cross-validation. It defaults to 'detailed'.
EvaluationType=detailed