-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_config.py
76 lines (68 loc) · 4.28 KB
/
create_config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from configparser import ConfigParser
# Parser that collects every section written to the generated config file.
config_object = ConfigParser()

# Settings describing the input data.
config_object["INPUT_CONFIG"] = {
    # Algorithm to use: 'LDA', 'NMF', 'Top2Vec' or 'BERTopic'.
    "algorithm": "Top2Vec",
    # Full path to the input data.
    "input": "./demo/demo_data.csv",
    # Either 'csv' or 'zip'.
    "input_format": "csv",
    # Only relevant if input_format='csv'.
    "text_column": "text",
    # Only relevant if input_format='csv'; topics over time are computed
    # if provided.
    "timestamp_column": "timestamp",
    # Only relevant if input_format='csv'.
    "delimiter": ",",
}
# Settings used when the BERTopic algorithm is selected.
config_object["BERTOPIC_CONFIG"] = {
    # Embedding model to use (any HuggingFace model); an empty value
    # results in the default model.
    "model": "",
    # 0: no reduction, otherwise the maximum number of topics allowed.
    "topic_reduction": 20,
    # Minimum number of documents per topic; must be greater than 1.
    "min_topic_size": 2,
    # 0 (no) or 1 (yes).
    "preprocess": 1,
    # 0 (no) or 1 (yes); must be 1 if 'lemmatize' is 1, otherwise an
    # error results.
    "tokenize": 1,
    # 0 (no) or 1 (yes).
    "lemmatize": 0,
    # 0 (no) or 1 (yes); removes NLTK's default stopwords, see the 'lang'
    # parameter.
    "remove_nltk_stopwords": 1,
    # If not empty: full path to a .txt file with 1 stopword per line.
    "remove_custom_stopwords": "",
    # 0 (no) or 1 (yes).
    "remove_punct": 1,
    # 0 (no) or 1 (yes).
    "lowercase": 1,
    # Upper ngram range for keywords; ngrams are created for range (1, n).
    # Must be greater than 0.
    "upper_ngram_range": 1,
    # 'dutch', 'english', 'french' or 'german'; relevant for stopwords,
    # tokenization, lemmatization and embedding model selection (if no
    # model is specified above).
    "lang": "english",
}
# Settings used when the Top2Vec algorithm is selected.
config_object["TOP2VEC_CONFIG"] = {
    # Any of the following: "doc2vec", "universal-sentence-encoder",
    # "all-MiniLM-L6-v2", "distiluse-base-multilingual-cased",
    # "paraphrase-multilingual-MiniLM-L12-v2".
    "model": "universal-sentence-encoder",
    # 0: no reduction, otherwise the maximum number of topics allowed.
    "topic_reduction": 20,
    # Minimum number of documents per topic; must be greater than 1.
    "min_topic_size": 2,
    # 0 (no) or 1 (yes).
    "preprocess": 1,
    # 0 (no) or 1 (yes); must be 1 if 'lemmatize' is 1, otherwise an
    # error results.
    "tokenize": 1,
    # 0 (no) or 1 (yes).
    "lemmatize": 0,
    # 0 (no) or 1 (yes); removes NLTK's default stopwords, see the 'lang'
    # parameter.
    "remove_nltk_stopwords": 1,
    # If not empty: full path to a .txt file with 1 stopword per line.
    "remove_custom_stopwords": "",
    # 0 (no) or 1 (yes).
    "remove_punct": 1,
    # 0 (no) or 1 (yes).
    "lowercase": 1,
    # Upper ngram range for keywords; ngrams are created for range (1, n).
    "upper_ngram_range": 1,
    # 'dutch', 'english', 'french' or 'german'; relevant for stopwords,
    # tokenization, lemmatization and embedding model selection.
    "lang": "english",
}
# Settings used when the LDA algorithm is selected.
config_object["LDA_CONFIG"] = {
    # Number of topics to detect.
    "n_components": 20,
    # 0 (no) or 1 (yes).
    "preprocess": 1,
    # 0 (no) or 1 (yes); must be 1 if 'lemmatize' is 1, otherwise an
    # error results.
    "tokenize": 1,
    # 0 (no) or 1 (yes).
    "lemmatize": 0,
    # 0 (no) or 1 (yes); removes NLTK's default stopwords, see the 'lang'
    # parameter.
    "remove_nltk_stopwords": 1,
    # If not empty: full path to a .txt file with 1 stopword per line.
    "remove_custom_stopwords": "",
    # 0 (no) or 1 (yes).
    "remove_punct": 1,
    # 0 (no) or 1 (yes).
    "lowercase": 1,
    # Upper ngram range for keywords; ngrams are created for range (1, n).
    "upper_ngram_range": 1,
    # 'dutch', 'english', 'french' or 'german'; relevant for stopwords,
    # tokenization and lemmatization.
    "lang": "english",
}
# Settings used when the NMF algorithm is selected.
config_object["NMF_CONFIG"] = {
    # Number of topics to detect.
    "n_components": 20,
    # 0 (no) or 1 (yes).
    "preprocess": 1,
    # 0 (no) or 1 (yes); must be 1 if 'lemmatize' is 1, otherwise an
    # error results.
    "tokenize": 1,
    # 0 (no) or 1 (yes).
    "lemmatize": 0,
    # 0 (no) or 1 (yes); removes NLTK's default stopwords, see the 'lang'
    # parameter.
    "remove_nltk_stopwords": 1,
    # If not empty: full path to a .txt file with 1 stopword per line.
    "remove_custom_stopwords": "",
    # 0 (no) or 1 (yes).
    "remove_punct": 1,
    # 0 (no) or 1 (yes).
    "lowercase": 1,
    # Upper ngram range for keywords; ngrams are created for range (1, n).
    "upper_ngram_range": 1,
    # 'dutch', 'english', 'french' or 'german'; relevant for stopwords,
    # tokenization and lemmatization.
    "lang": "english",
}
# Settings describing where results are written.
config_object["OUTPUT_CONFIG"] = {
    # Output directory path.
    "output_dir": "./demo/output",
    # NOTE(review): presumably 0 (no) or 1 (yes) like the other flags in
    # this file -- confirm against the code that reads this option.
    "overwrite_output_dir": 1,
}
# Write every configured section to 'config.ini' in the current working
# directory; mode 'w' replaces any existing file of that name.
# NOTE(review): no encoding is passed to open(), so the platform default
# applies -- confirm the code that reads this file uses the same one.
with open('config.ini', 'w') as conf:
    # The write call must be indented under the `with` statement; without
    # the indent the script fails with an IndentationError.
    config_object.write(conf)