# default_docs_config.yml
# Copyright (c) 2015, Georgia Tech Research Institute
# All rights reserved.
#
# This unpublished material is the property of the Georgia Tech
# Research Institute and is protected under copyright law.
# The methods and techniques described herein are considered
# trade secrets and/or confidential. Reproduction or distribution,
# in whole or in part, is forbidden except by the express written
# permission of the Georgia Tech Research Institute.
# ---
###############################################################################
# #
# GENERAL CONFIGURATION DETAILS #
# #
# Configure DocIndexer to process documents in a directory tree. #
# #
###############################################################################
# [REQUIRED] Full path to the 'config' folder of the LEAN repository. The
# default stopword and spelling files (among other items) live in this folder,
# and everything in it is loaded automatically at startup.
config_path: '/path/to/lean/config'
# [REQUIRED] Folder that receives the Lucene index. The folder is created if
# it does not already exist.
outdir: '/path/to/index'
# [REQUIRED] Analyzer to use. An analyzer is a tokenizer followed by a chain
# of zero or more token filters; each filter transforms the tokens as they
# pass down the chain. The first four analyzers listed below are provided by
# Lucene and are not customizable; the 'custom' analyzers can be easily
# altered and recompiled.
#
#   org.apache.lucene.analysis.core.WhitespaceAnalyzer    Split text on whitespace only
#   org.apache.lucene.analysis.standard.StandardAnalyzer  Lucene's default text analyzer
#   org.apache.lucene.analysis.standard.ClassicAnalyzer   Lucene's StandardAnalyzer pre v3.1
#   org.apache.lucene.analysis.en.EnglishAnalyzer         Lucene's English-specific text analyzer
#   analyzers.FormalAnalyzer                              GTRI analyzer for formal documents
#   twitter.TwitterAnalyzer                               GTRI custom analyzer for Twitter
analyzer: 'analyzers.FormalAnalyzer'
# [REQUIRED] Full path to the root folder of the file tree to analyze.
# Starting at this root, DocIndexer recursively processes every document in a
# supported format anywhere in the tree.
indir: '/path/to/input_folder_root'
# [OPTIONAL] Specify the absolute path to a 'user' stopword file. Use this
# file to give DocIndexer additional stopwords that should be removed from the
# token stream, but that are not contained in the default stopword list
# (found in the config folder). This file can also be used to remove specific
# tokens that may not be of interest for a given data set. The file name does
# not have to be 'user_stopwords.txt'. To NOT use a user stopword file,
# comment out the following line.
#user_stopword_file: /path/to/user_stopwords.txt
# [OPTIONAL] Specify the absolute path to a 'user' spelling file. Use this
# file to give DocIndexer additional spelling corrections that should be
# performed on the token stream, but that are not contained in the default list
# of spelling corrections (found in the config folder). This file can be used
# to remove or 'normalize' domain-specific slang. The file name does not have
# to be 'user_spelling.txt'. To NOT use a user spelling file,
# comment out the following line.
#user_spelling_file: /path/to/user_spelling.txt
###############################################################################
# #
# Boolean flags (true/false) #
# #
###############################################################################
# NOTE: write these flags as the canonical YAML booleans true/false. The
# Yes/No spellings are resolved to booleans only by YAML 1.1 parsers; a
# YAML 1.2 parser reads them as plain strings.
# [OPTIONAL - CUSTOM ANALYZERS ONLY] Whether to discard all tokens tagged with
# the <NUM> tag by the tokenizer.
IGNORE_NUMERALS: false
# [OPTIONAL - CUSTOM_FORMAL ANALYZER ONLY] Whether to discard all tokens except
# those tagged with the <ALPHANUM> tag. Overrides the IGNORE_NUMERALS option.
IGNORE_ALL_BUT_ALPHANUM: false
# [OPTIONAL - CUSTOM ANALYZERS ONLY] Whether to disable stemming. Stemming
# removes characters from selected tokens, which may not be desirable in all
# scenarios.
DISABLE_STEMMING: false
# [OPTIONAL] Whether to disable all filters in the CUSTOM analyzers. If
# this option is selected, tokenization is the only operation performed on the
# data. Use this option if you want to see the full set of tokens that emerge
# from the tokenizer. This option does not apply to the Lucene-provided
# analyzers.
DISABLE_CUSTOM_FILTERS: false